diff --git a/.github/workflows/push-image-olmo.yml b/.github/workflows/push-image-olmo.yml new file mode 100644 index 000000000..28a8c3467 --- /dev/null +++ b/.github/workflows/push-image-olmo.yml @@ -0,0 +1,81 @@ +# This is an example workflow file. +# +# When you add a new image, copy this file and then change all mentions of "hello-world" with +# the name of your new image. +# +# Read through the rest of the comments in this file to figure out how it works, and what else +# you need to change. +name: build_open_instruct_olmo + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +on: + push: + # Run this workflow anytime a push updates one of the files in the image's directory + # (other than the README), and anytime there's a new release tag for this image. + paths: + - 'open_instruct/**' + - '!open_instruct/README.md' + - 'requirements-olmo.txt' + - 'Dockerfile.olmo' + - '.github/workflows/push-image-olmo.yml' + # Note, add .olmo dockerfile + requirements if adding auto build to those + branches: [main] + # pull_request: # note, comment this out for running on every push + # # Also run on PRs that update the files in the image's directory (other than README). + # branches: [main] + # paths: + # - 'open_instruct/**' + # - '!open_instruct/README.md' + # - 'requirements-olmo.txt' + # - 'Dockerfile.olmo' + workflow_dispatch: # This allows us to manually trigger a build through the GitHub UI. + +env: + DOCKER_BUILDKIT: "1" + +jobs: + build: + name: open_instruct + runs-on: ubuntu-latest + timeout-minutes: 60 + if: (github.event_name != 'workflow_run') || (github.event.workflow_run.conclusion == 'success') + steps: + - uses: actions/checkout@v3 + with: + repository: allenai/oe-eval-internal + path: './oe-eval-internal' + ssh-key: ${{ secrets.OE_EVAL_GIT_CLONE_ACCESS_PRIVATE_SSH_DEPLOY_KEY }} + + - name: Setup environment + uses: ./.github/actions/setup + with: + beaker_token: ${{ secrets.BEAKER_TOKEN }} + # ghcr_token: ${{ secrets.GHCR_TOKEN }} + # ghcr_user: ${{ secrets.GHCR_USER }} + + # big images fail, trying this + - name: Delete huge unnecessary tools folder + run: rm -rf /opt/hostedtoolcache /usr/share/dotnet "$AGENT_TOOLSDIRECTORY" + + - name: Build image + run: | + docker build \ + --build-arg BUILDKIT_INLINE_CACHE=1 \ + --build-arg CUDA=12.1.0 --build-arg \ + TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 \ + -f Dockerfile.olmo . \ + -t open_instruct_olmo + + - name: Check image + run: | + docker run --rm open_instruct_olmo + - name: Push image + # if: github.event_name != 'pull_request' + uses: ./.github/actions/push + with: + image: open_instruct_olmo # this is the tag of the image we just built in the previous step + beaker: open_instruct_olmo_auto # this is the name of the image on Beaker + latest: true # this flag says we should also push this as the 'latest' version to GHCR diff --git a/.github/workflows/push-image.yml b/.github/workflows/push-image.yml index 40e205fb3..f5a35fdc0 100644 --- a/.github/workflows/push-image.yml +++ b/.github/workflows/push-image.yml @@ -44,8 +44,6 @@ jobs: timeout-minutes: 60 if: (github.event_name != 'workflow_run') || (github.event.workflow_run.conclusion == 'success') steps: - - uses: actions/checkout@v3 - - uses: actions/checkout@v3 with: repository: allenai/oe-eval-internal @@ -69,7 +67,6 @@ jobs: --build-arg BUILDKIT_INLINE_CACHE=1 \ --build-arg CUDA=12.1.0 --build-arg \ TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 \ - --build-arg REQUIRE=requirements.txt . 
\ -t open_instruct diff --git a/Dockerfile.olmo b/Dockerfile.olmo new file mode 100644 index 000000000..40ef4377a --- /dev/null +++ b/Dockerfile.olmo @@ -0,0 +1,121 @@ +ARG CUDA +ARG DIST +ARG TARGET +FROM --platform=linux/amd64 nvidia/cuda:${CUDA}-${TARGET}-${DIST} + +ARG DEBIAN_FRONTEND="noninteractive" +ENV TZ="America/Los_Angeles" + +# Install base tools. +RUN apt-get update && apt-get install -y \ + build-essential \ + curl \ + git \ + jq \ + language-pack-en \ + make \ + sudo \ + unzip \ + vim \ + wget \ + parallel \ + iputils-ping \ + tmux + +ARG BEAKER_VERSION +RUN curl --silent \ + --connect-timeout 5 \ + --max-time 10 \ + --retry 5 \ + --retry-delay 0 \ + --retry-max-time 40 \ + --output beaker.tar.gz \ + "https://beaker.org/api/v3/release/cli?os=linux&arch=amd64&version=${BEAKER_VERSION}" \ + && tar -zxf beaker.tar.gz -C /usr/local/bin/ ./beaker \ + && rm beaker.tar.gz + +# This ensures the dynamic linker (or NVIDIA's container runtime, I'm not sure) +# puts the right NVIDIA things in the right place (that THOR requires). +ENV NVIDIA_DRIVER_CAPABILITIES=graphics,utility,compute + +# Install conda. We give anyone in the users group the ability to run +# conda commands and install packages in the base (default) environment. +# Things installed into the default environment won't persist, but we prefer +# convenience in this case and try to make sure the user is aware of this +# with a message that's printed when the session starts. +RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.1.0-1-Linux-x86_64.sh \ + && echo "32d73e1bc33fda089d7cd9ef4c1be542616bd8e437d1f77afeeaf7afdb019787 Miniconda3-py310_23.1.0-1-Linux-x86_64.sh" \ + | sha256sum --check \ + && bash Miniconda3-py310_23.1.0-1-Linux-x86_64.sh -b -p /opt/miniconda3 \ + && rm Miniconda3-py310_23.1.0-1-Linux-x86_64.sh + +ENV PATH=/opt/miniconda3/bin:/opt/miniconda3/condabin:$PATH +ENV LD_LIBRARY_PATH=/usr/local/cuda/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH + +# Install a few additional utilities via pip +RUN /opt/miniconda3/bin/pip install --no-cache-dir \ + gpustat \ + jupyter \ + beaker-gantry \ + oocmap + +# Ensure users can modify their container environment. +RUN echo '%users ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers + +# Make the base image friendlier for interactive workloads. This makes things like the man command +# work. +RUN yes | unminimize + +# Install MLNX OFED user-space drivers +# See https://docs.nvidia.com/networking/pages/releaseview.action?pageId=15049785#Howto:DeployRDMAacceleratedDockercontaineroverInfiniBandfabric.-Dockerfile +ENV MOFED_VER 5.8-1.1.2.1 +ENV OS_VER ubuntu20.04 +ENV PLATFORM x86_64 +RUN wget --quiet https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VER}/MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \ + tar -xvf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \ + MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}/mlnxofedinstall --basic --user-space-only --without-fw-update -q && \ + rm -rf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM} && \ + rm MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz + +# The -l flag makes bash act as a login shell and load /etc/profile, etc. +ENTRYPOINT ["bash", "-l"] + +WORKDIR /stage/ + +# TODO When updating flash-attn or torch in the future, make sure to update the version in the requirements.txt file. +ENV HF_HUB_ENABLE_HF_TRANSFER=1 +COPY requirements-olmo.txt . 
+RUN pip install --upgrade pip "setuptools<70.0.0" wheel +# TODO, unpin setuptools when this issue in flash attention is resolved +RUN pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121 +RUN pip install packaging +RUN pip install flash-attn==2.5.9.post1 --no-build-isolation +# for newest olmo's, move to requirements when ai2-olmo supports torch 2.4 +# core is a dependency of ai2-olmo +RUN pip install ai2-olmo-core==0.1.0 omegaconf +# RUN pip install ai2-olmo>=0.5.0 --no-deps +# TODO Update Once this is merged https://github.com/allenai/OLMo/pull/719, then next release +RUN pip install git+https://github.com/allenai/OLMo.git@47f8f5abb40eb100c6623be12e1648c841b2ab99 --no-deps +RUN pip install -r requirements-olmo.txt + +RUN pip install git+https://github.com/AkshitaB/vllm.git +RUN pip install vllm-flash-attn + + +# NLTK download +RUN python -m nltk.downloader punkt +COPY open_instruct open_instruct +COPY oe-eval-internal oe-eval-internal + +# install the package in editable mode +COPY pyproject.toml . +RUN pip install -e . +COPY .git/ ./.git/ +COPY eval eval +COPY configs configs +COPY scripts scripts +COPY mason.py mason.py +RUN chmod +x scripts/* + +# for interactive session +RUN chmod -R 777 /stage/ diff --git a/README.md b/README.md index 1e6fe9409..2eaef5b9b 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ pip install -r weight-diff-requirements.txt For a second installation strategy, if you'd like to *run experiments within a Docker environment*, you can create one using: ```bash -docker build --build-arg CUDA=12.1.0 --build-arg TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 --build-arg REQUIRE=requirements.txt . -t open_instruct +docker build --build-arg CUDA=12.1.0 --build-arg TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 . -t open_instruct # if you are interally at AI2, you can create an image like this: beaker image create open_instruct -n open_instruct -w ai2/$(whoami) diff --git a/configs/beaker_configs/default_finetune_offloading.yaml b/configs/beaker_configs/default_finetune_offloading.yaml new file mode 100644 index 000000000..4722b4e13 --- /dev/null +++ b/configs/beaker_configs/default_finetune_offloading.yaml @@ -0,0 +1,69 @@ +version: v2 +description: open-instruct-finetune +budget: ai2/oe-adapt +tasks: + - name: open-instruct-finetune + image: + beaker: nathanl/open_instruct_auto + command: [ + '/bin/sh', '-c' + ] + arguments: ['PYTHONPATH="/stage:$PYTHONPATH" accelerate launch + --mixed_precision bf16 + --num_machines 1 + --num_processes 4 + --use_deepspeed + --deepspeed_config_file configs/ds_configs/stage3_offloading_accelerate.conf + open_instruct/finetune.py + --model_name_or_path /hf_llama_models + --use_flash_attn + --tokenizer_name /hf_llama_models + --max_seq_length 2048 + --preprocessing_num_workers 16 + --per_device_train_batch_size 2 + --gradient_accumulation_steps 16 + --learning_rate 2e-5 + --lr_scheduler_type linear + --warmup_ratio 0.03 + --weight_decay 0. 
+ --num_train_epochs 2 + --output_dir /output/ + --with_tracking + --report_to tensorboard + --logging_steps 1 + '] + envVars: + - name: CUDA_DEVICE_ORDER + value: PCI_BUS_ID + - name: TRANSFORMERS_CACHE + value: ./cache/ + - name: WANDB_API_KEY + secret: WANDB_API_KEY + - name: WANDB_PROJECT + value: open-instruct + - name: WANDB_WATCH + value: false + - name: WANDB_LOG_MODEL + value: false + - name: WANDB_DISABLED + value: true + - name: HF_TOKEN + secret: HF_TOKEN + # datasets: # example for how to include datasets in mounting + # - mountPath: /data + # source: + # beaker: Yizhongw03/processed_open_instruct_data + # - mountPath: /mmlu + # source: + # beaker: Yizhongw03/mmlu + # - mountPath: /hf_llama_models + # source: + # beaker: Yizhongw03/hf_llama_model_7B + result: + path: /output + resources: + gpuCount: 4 + context: + cluster: ai2/allennlp-cirrascale + priority: high + preemptible: false \ No newline at end of file diff --git a/configs/train_configs/dpo/olmo_7b_0924.yaml b/configs/train_configs/dpo/olmo_7b_0924.yaml new file mode 100644 index 000000000..3028bfca8 --- /dev/null +++ b/configs/train_configs/dpo/olmo_7b_0924.yaml @@ -0,0 +1,29 @@ +model_name_or_path: /model +model_revision: main +use_flash_attn: true +gradient_checkpointing: true +dataset_mixer: + allenai/ultrafeedback_binarized_cleaned_train: 1.0 + ai2-adapt-dev/DaringAnteater-prefs-RM-filter: 1.0 + ai2-adapt-dev/WildChat-prefs-280824: 1.0 + allenai/tulu-3-hardcoded-preferences: 1.0 +tokenizer_name: /model +use_slow_tokenizer: true +max_seq_length: 2048 +preprocessing_num_workers: 16 +per_device_train_batch_size: 1 +gradient_accumulation_steps: 16 # designed for 8 GPUs, so batch size 128 +learning_rate: 5.0e-7 +lr_scheduler_type: linear +warmup_ratio: 0.1 +weight_decay: 0.0 +num_train_epochs: 1 +output_dir: /output +with_tracking: true +report_to: + - wandb +logging_steps: 1 +use_lora: false +dpo_loss_type: dpo_norm +dpo_beta: 5 +checkpointing_steps: 1000 \ No newline at end of file diff --git a/configs/train_configs/sft/olmo_7b_0924.yaml b/configs/train_configs/sft/olmo_7b_0924.yaml new file mode 100644 index 000000000..e8264bd0a --- /dev/null +++ b/configs/train_configs/sft/olmo_7b_0924.yaml @@ -0,0 +1,22 @@ +model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan +model_revision: main +use_flash_attn: true +tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan +use_slow_tokenizer: false # olmo models only use fast tokenizers +dataset_name: allenai/llama-3-tulu-v3.3-mix-preview +max_seq_length: 4096 +preprocessing_num_workers: 128 +per_device_train_batch_size: 1 +gradient_accumulation_steps: 8 # should run with this set to 16 for 1 node only +learning_rate: 2.0e-06 +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 3 +output_dir: /output/ +with_tracking: true +report_to: + - wandb +logging_steps: 1 +checkpointing_steps: epoch +add_bos: true \ No newline at end of file diff --git a/configs/train_configs/sft/olmo_7b_0924_fw2_permissive.yaml b/configs/train_configs/sft/olmo_7b_0924_fw2_permissive.yaml new file mode 100644 index 000000000..4539f713a --- /dev/null +++ b/configs/train_configs/sft/olmo_7b_0924_fw2_permissive.yaml @@ -0,0 +1,36 @@ +# model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan +model_name_or_path: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw2/step11931-hf +model_revision: main +use_flash_attn: true 
+# tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan +tokenizer_name: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw2/step11931-hf +use_slow_tokenizer: false # olmo models only use fast tokenizers +dataset_mixer: + ai2-adapt-dev/metamath-qa-reformat: 1.0 # MIT License + natolambert/tulu-v2-sft-mixture-flan: 1.0 # FLAN Apache 2.0 + natolambert/tulu-v2-sft-mixture-cot: 1.0 # FLAN Apache 2.0 + allenai/openassistant-guanaco-reformatted: 1.0 # Apache 2.0 + ai2-adapt-dev/codefeedback-single-turn-reformat-magicoder: 1.0 # MIT MagiCoder section of CodeFeedback + ai2-adapt-dev/aya_dataset-reformat: 1.0 # Apache 2.0 + ai2-adapt-dev/SlimOrca-reformat: 0.25 # MIT License + ai2-adapt-dev/Daring-Anteater-reformat: 1.0 # CC BY 4.0 + ai2-adapt-dev/WebInstructSub-reformat-apache: 0.1 # Apache 2.0 + ai2-adapt-dev/Table-GPT-All-train: 0.5 # MIT +max_seq_length: 4096 +preprocessing_num_workers: 128 +per_device_train_batch_size: 1 +gradient_accumulation_steps: 4 # designed for 4 nodes +# gradient_accumulation_steps: 16 # designed for 1 nodes +gradient_checkpointing: true +learning_rate: 2.0e-06 +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 3 +output_dir: /output/ +with_tracking: true +report_to: + - wandb +logging_steps: 1 +checkpointing_steps: epoch +add_bos: true \ No newline at end of file diff --git a/configs/train_configs/sft/olmo_7b_0924_fw2_tulu_v3.4.yaml b/configs/train_configs/sft/olmo_7b_0924_fw2_tulu_v3.4.yaml new file mode 100644 index 000000000..491fc4502 --- /dev/null +++ b/configs/train_configs/sft/olmo_7b_0924_fw2_tulu_v3.4.yaml @@ -0,0 +1,28 @@ +# model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan +model_name_or_path: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw2/step11931-hf +model_revision: main +use_flash_attn: true +# tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan +tokenizer_name: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw2/step11931-hf +use_slow_tokenizer: false # olmo models only use fast tokenizers +dataset_name: allenai/tulu-v3.4-mix-preview +max_seq_length: 4096 +preprocessing_num_workers: 128 +per_device_train_batch_size: 1 +# gradient_accumulation_steps: 4 # designed for 4 nodes +gradient_accumulation_steps: 8 # designed for 2 nodes +# gradient_accumulation_steps: 16 # designed for 1 nodes +gradient_checkpointing: true +learning_rate: 2.0e-06 +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 3 +output_dir: /output/ +with_tracking: true +reduce_loss: mean +report_to: + - wandb +logging_steps: 1 +checkpointing_steps: epoch +add_bos: true \ No newline at end of file diff --git a/open_instruct/dpo_tune.py b/open_instruct/dpo_tune.py index cbd95c2a3..36b643f4c 100644 --- a/open_instruct/dpo_tune.py +++ b/open_instruct/dpo_tune.py @@ -67,6 +67,7 @@ from open_instruct.model_utils import push_folder_to_hub, save_with_accelerate from open_instruct.utils import ( ArgumentParserPlus, + check_hf_olmo_availability, clean_last_n_checkpoints, get_datasets, get_last_checkpoint_path, @@ -469,6 +470,12 @@ def prepare_deepspeed(accelerator, model): def main(args: FlatArguments): + # try to import OLMo for automodel + if check_hf_olmo_availability(): + # allows AutoModel... 
to work with not in transformers olmo models + import hf_olmo # noqa + from hf_olmo import OLMoTokenizerFast + # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment @@ -673,7 +680,9 @@ def load_model(): 0, 1, ], "LlamaTokenizer should only add one special token - the pad_token, or no tokens if pad token present." - elif isinstance(tokenizer, GPTNeoXTokenizerFast): + elif isinstance(tokenizer, GPTNeoXTokenizerFast) or ( + check_hf_olmo_availability() and isinstance(tokenizer, OLMoTokenizerFast) + ): # OLMo newer models use this tokenizer if tokenizer.bos_token is None: tokenizer.bos_token = tokenizer.eos_token diff --git a/open_instruct/finetune.py b/open_instruct/finetune.py index e7cf30916..edb6da6b9 100644 --- a/open_instruct/finetune.py +++ b/open_instruct/finetune.py @@ -56,6 +56,7 @@ from open_instruct.model_utils import push_folder_to_hub, save_with_accelerate from open_instruct.utils import ( ArgumentParserPlus, + check_hf_olmo_availability, clean_last_n_checkpoints, get_datasets, get_last_checkpoint_path, @@ -453,6 +454,12 @@ def encode_sft_example(example, tokenizer, max_seq_length): def main(args: FlatArguments): + # try to import OLMo for automodel + if check_hf_olmo_availability(): + # allows AutoModel... to work with not in transformers olmo models + import hf_olmo # noqa + from hf_olmo import OLMoTokenizerFast + # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment @@ -646,7 +653,9 @@ def main(args: FlatArguments): 0, 1, ], "LlamaTokenizer should only add one special token - the pad_token, or no tokens if pad token present." - elif isinstance(tokenizer, GPTNeoXTokenizerFast): + elif isinstance(tokenizer, GPTNeoXTokenizerFast) or ( + check_hf_olmo_availability() and isinstance(tokenizer, OLMoTokenizerFast) + ): # OLMo newer models use this tokenizer if tokenizer.bos_token is None: tokenizer.bos_token = tokenizer.eos_token diff --git a/open_instruct/mix_data.py b/open_instruct/mix_data.py index 05b8113cc..ea3490b47 100644 --- a/open_instruct/mix_data.py +++ b/open_instruct/mix_data.py @@ -14,9 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from open_instruct.finetune import FlatArguments # script for mixing and saving data +from open_instruct.finetune import FlatArguments from open_instruct.utils import ArgumentParserPlus, get_datasets # Run as module for local imports, e.g.: diff --git a/open_instruct/utils.py b/open_instruct/utils.py index 120b9d087..b560478e4 100644 --- a/open_instruct/utils.py +++ b/open_instruct/utils.py @@ -14,6 +14,7 @@ import dataclasses import functools +import importlib import json import logging import os @@ -53,6 +54,40 @@ """ +# ---------------------------------------------------------------------------- +# Import utilities +def check_hf_olmo_availability(return_version: bool = False) -> Union[dict, bool]: + pkg_name = "hf_olmo" + + # Check if the package spec exists + package_exists = importlib.util.find_spec(pkg_name) is not None + package_version = "N/A" + + if package_exists: + try: + # Primary method to get the package version + package_version = importlib.metadata.version(pkg_name) + except importlib.metadata.PackageNotFoundError: + # Fallback method + try: + package = importlib.import_module(pkg_name) + package_version = getattr(package, "__version__", "N/A") + except ImportError: + package_exists = False + package_version = "N/A" + + if return_version: + return { + "available": package_exists, + "version": package_version, + "python_version": sys.version, + "os": os.name, + "platform": sys.platform, + } + else: + return package_exists + + # ---------------------------------------------------------------------------- # Dataset utilities def is_openai_format(messages: Any) -> bool: diff --git a/requirements-olmo.txt b/requirements-olmo.txt new file mode 100644 index 000000000..1ec51fb2f --- /dev/null +++ b/requirements-olmo.txt @@ -0,0 +1,46 @@ +# TODO When updating flash-attn or torch in the future, make sure to update the version in the Dockerfile +torch==2.4.0 +scipy +packaging +sentencepiece +datasets +deepspeed==0.14.4 +accelerate==0.31.0 +peft>=0.11.1 +bitsandbytes>=0.41.1 +evaluate>=0.4.0 +tokenizers==0.19.1 +protobuf +transformers==4.43.4 +openai>=1.0.0 +tiktoken +rouge_score +tensorboard +wandb +gradio>=3.50.2 +termcolor +jsonlines +unidic-lite +einops +flash-attn==2.5.9.post1 # should really only be in dockerfile. 
Local env often doesn't have GPUs +fire +alpaca-eval==0.6.2 +# for human eval web app +flask +openpyxl +# for ifeval +nltk==3.8.1 +langdetect +immutabledict +# for math evaluations +antlr4-python3-runtime==4.9.2 +mpmath==1.3.0 +sympy==1.12.0 +# for linting +black +flake8 +isort +autoflake +pytest +hf_transfer +beaker-py \ No newline at end of file diff --git a/scripts/eval/oe-eval.sh b/scripts/eval/oe-eval.sh index ed605b685..c30ce38cf 100755 --- a/scripts/eval/oe-eval.sh +++ b/scripts/eval/oe-eval.sh @@ -97,7 +97,10 @@ TASKS=( "alpaca_eval_v2::tulu" "truthfulqa::tulu" ) -MODEL_TYPE="--model-type vllm" +# For models without VLLM (experimental architectures) +# comment out the VLLM arg and set GPU_COUNT_OTHER to 1 +# also consider lowering the batch size (VLLM arg), maybe to 5, VLLM handles it differently +# MODEL_TYPE="--model-type vllm" BATCH_SIZE_VLLM=10000 BATCH_SIZE_OTHER=1 # Set GPU_COUNT and GPU_COUNT_OTHER based on NUM_GPUS diff --git a/scripts/submit_finetune_job.py b/scripts/submit_finetune_job.py index 7b7e7609f..617200b1c 100644 --- a/scripts/submit_finetune_job.py +++ b/scripts/submit_finetune_job.py @@ -25,6 +25,8 @@ def main(): parser.add_argument("--num_nodes", type=int, default=1, help="Number of nodes to use") parser.add_argument("--image", type=str, default="nathanl/open_instruct_auto", help="Beaker image to use.") parser.add_argument("--workspace", type=str, default="ai2/tulu-2-improvements", help="Beaker workspace to use.") + parser.add_argument("--mount_on_weka", type=str, default=None, help="Mount a Weka directory to the job") + parser.add_argument("--weka_mount_path", type=str, default="/adapt-data", help="Path to mount the Weka directory") # allow unknown args from CLI, use this to modify loaded config in bash scripts for sweeping # Note, can only override args in --config passed (not default FlatArguments class in open_instruct/utils.py) @@ -166,7 +168,7 @@ def parse_args(args): d['tasks'][0]['arguments'][0] = new_arguments # name and description - exp_name = f"open_instruct_finetune_{model_name}_{now}" + exp_name = f"open_instruct_finetune_{model_name}_{now}"[:128] d['description'] = exp_name d['tasks'][0]['name'] = exp_name @@ -221,6 +223,14 @@ def parse_args(args): d['tasks'][0]['envVars'].append({ 'name': 'WANDB_API_KEY', 'secret': f"{beaker_whoami}_WANDB_API_KEY" }) + + # Weka setting + if args.mount_on_weka: + if d['tasks'][0].get('datasets') is None: + d['tasks'][0]['datasets'] = [] + d['tasks'][0]['datasets'].append({ + 'mountPath': f"{args.weka_mount_path}", 'source': {'weka': f"{args.mount_on_weka}"} + }) # optionally, print to debug config print(d)