Adding support for latest OLMo architectures #331

Open · wants to merge 26 commits into main
Changes from 19 commits
81 changes: 81 additions & 0 deletions .github/workflows/push-image-olmo.yml
@@ -0,0 +1,81 @@
# This is an example workflow file.
#
# When you add a new image, copy this file and then replace all mentions of "hello-world" with
# the name of your new image.
#
# Read through the rest of the comments in this file to figure out how it works, and what else
# you need to change.
name: build_open_instruct_olmo

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

on:
  push:
    # Run this workflow anytime a push updates one of the files in the image's directory
    # (other than the README), and anytime there's a new release tag for this image.
    paths:
      - 'open_instruct/**'
      - '!open_instruct/README.md'
      - 'requirements-olmo.txt'
      - 'Dockerfile.olmo'
      - '.github/workflows/push-image-olmo.yml'
      # Note: add the .olmo Dockerfile + requirements here if adding auto-builds for those
    branches: [main]
  # pull_request: # note, comment this out for running on every push
  #   # Also run on PRs that update the files in the image's directory (other than README).
  #   branches: [main]
  #   paths:
  #     - 'open_instruct/**'
  #     - '!open_instruct/README.md'
  #     - 'requirements-olmo.txt'
  #     - 'Dockerfile.olmo'
  workflow_dispatch: # This allows us to manually trigger a build through the GitHub UI.

env:
  DOCKER_BUILDKIT: "1"

jobs:
  build:
    name: open_instruct
    runs-on: ubuntu-latest
    timeout-minutes: 60
    if: (github.event_name != 'workflow_run') || (github.event.workflow_run.conclusion == 'success')
    steps:
      - uses: actions/checkout@v3
        with:
          repository: allenai/oe-eval-internal
          path: './oe-eval-internal'
          ssh-key: ${{ secrets.OE_EVAL_GIT_CLONE_ACCESS_PRIVATE_SSH_DEPLOY_KEY }}

      - name: Setup environment
        uses: ./.github/actions/setup
        with:
          beaker_token: ${{ secrets.BEAKER_TOKEN }}
          # ghcr_token: ${{ secrets.GHCR_TOKEN }}
          # ghcr_user: ${{ secrets.GHCR_USER }}

      # Big images fail to build on the runner without freeing disk space first.
      - name: Delete huge unnecessary tools folder
        run: rm -rf /opt/hostedtoolcache /usr/share/dotnet "$AGENT_TOOLSDIRECTORY"

      - name: Build image
        run: |
          docker build \
            --build-arg BUILDKIT_INLINE_CACHE=1 \
            --build-arg CUDA=12.1.0 --build-arg \
            TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 \
            -f Dockerfile.olmo . \
            -t open_instruct_olmo

      - name: Check image
        run: |
          docker run --rm open_instruct_olmo

      - name: Push image
        # if: github.event_name != 'pull_request'
        uses: ./.github/actions/push
        with:
          image: open_instruct_olmo # this is the tag of the image we just built in the previous step
          beaker: open_instruct_olmo_auto # this is the name of the image on Beaker
          latest: true # this flag says we should also push this as the 'latest' version to GHCR
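Because the workflow exposes `workflow_dispatch`, builds can also be triggered from the command line instead of the GitHub UI. A minimal sketch, assuming the GitHub CLI (`gh`) is installed and authenticated against this repository:

```bash
# Trigger the OLMo image build manually (same effect as the workflow_dispatch button)
gh workflow run push-image-olmo.yml --ref main

# Tail the resulting run until it finishes
gh run watch
```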
3 changes: 0 additions & 3 deletions .github/workflows/push-image.yml
@@ -44,8 +44,6 @@ jobs:
     timeout-minutes: 60
     if: (github.event_name != 'workflow_run') || (github.event.workflow_run.conclusion == 'success')
     steps:
-      - uses: actions/checkout@v3
-
       - uses: actions/checkout@v3
         with:
           repository: allenai/oe-eval-internal
@@ -69,7 +67,6 @@
             --build-arg BUILDKIT_INLINE_CACHE=1 \
             --build-arg CUDA=12.1.0 --build-arg \
             TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 \
-            --build-arg REQUIRE=requirements.txt . \
             -t open_instruct
2 changes: 2 additions & 0 deletions .gitignore
@@ -4,6 +4,8 @@ rejection_sampling/shards1
 token_length.png
 *.tfevents.*

+oe-eval-internal/
+
 results
 models
 wandb
117 changes: 117 additions & 0 deletions Dockerfile.olmo
@@ -0,0 +1,117 @@
ARG CUDA
Collaborator comment on the ARG CUDA line: do we want to add documentation about building with this image or nah? We had it before.

ARG DIST
ARG TARGET
FROM --platform=linux/amd64 nvidia/cuda:${CUDA}-${TARGET}-${DIST}

ARG DEBIAN_FRONTEND="noninteractive"
ENV TZ="America/Los_Angeles"

# Install base tools.
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    git \
    jq \
    language-pack-en \
    make \
    sudo \
    unzip \
    vim \
    wget \
    parallel \
    iputils-ping \
    tmux

ARG BEAKER_VERSION
RUN curl --silent \
    --connect-timeout 5 \
    --max-time 10 \
    --retry 5 \
    --retry-delay 0 \
    --retry-max-time 40 \
    --output beaker.tar.gz \
    "https://beaker.org/api/v3/release/cli?os=linux&arch=amd64&version=${BEAKER_VERSION}" \
    && tar -zxf beaker.tar.gz -C /usr/local/bin/ ./beaker \
    && rm beaker.tar.gz

# This ensures the dynamic linker (or NVIDIA's container runtime, I'm not sure)
# puts the right NVIDIA things in the right place (that THOR requires).
ENV NVIDIA_DRIVER_CAPABILITIES=graphics,utility,compute

# Install conda. We give anyone in the users group the ability to run
# conda commands and install packages in the base (default) environment.
# Things installed into the default environment won't persist, but we prefer
# convenience in this case and try to make sure the user is aware of this
# with a message that's printed when the session starts.
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.1.0-1-Linux-x86_64.sh \
    && echo "32d73e1bc33fda089d7cd9ef4c1be542616bd8e437d1f77afeeaf7afdb019787 Miniconda3-py310_23.1.0-1-Linux-x86_64.sh" \
        | sha256sum --check \
    && bash Miniconda3-py310_23.1.0-1-Linux-x86_64.sh -b -p /opt/miniconda3 \
    && rm Miniconda3-py310_23.1.0-1-Linux-x86_64.sh

ENV PATH=/opt/miniconda3/bin:/opt/miniconda3/condabin:$PATH
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH

# Install a few additional utilities via pip
RUN /opt/miniconda3/bin/pip install --no-cache-dir \
    gpustat \
    jupyter \
    beaker-gantry \
    oocmap

# Ensure users can modify their container environment.
RUN echo '%users ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers

# Make the base image friendlier for interactive workloads. This makes things like the man command
# work.
RUN yes | unminimize

# Install MLNX OFED user-space drivers
# See https://docs.nvidia.com/networking/pages/releaseview.action?pageId=15049785#Howto:DeployRDMAacceleratedDockercontaineroverInfiniBandfabric.-Dockerfile
ENV MOFED_VER 5.8-1.1.2.1
ENV OS_VER ubuntu20.04
ENV PLATFORM x86_64
RUN wget --quiet https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VER}/MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
    tar -xvf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
    MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}/mlnxofedinstall --basic --user-space-only --without-fw-update -q && \
    rm -rf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM} && \
    rm MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz

# The -l flag makes bash act as a login shell and load /etc/profile, etc.
ENTRYPOINT ["bash", "-l"]

WORKDIR /stage/

# TODO When updating flash-attn or torch in the future, make sure to update the version in the requirements.txt file.
ENV HF_HUB_ENABLE_HF_TRANSFER=1
COPY requirements-olmo.txt .
RUN pip install --upgrade pip "setuptools<70.0.0" wheel
# TODO: unpin setuptools when the related flash-attention build issue is resolved
RUN pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121
RUN pip install packaging
RUN pip install flash-attn==2.5.9.post1 --no-build-isolation
# for the newest OLMo models; move to requirements when ai2-olmo supports torch 2.4
# ai2-olmo-core is a dependency of ai2-olmo
RUN pip install ai2-olmo-core==0.1.0 omegaconf
# RUN pip install ai2-olmo>=0.5.0 --no-deps
# TODO: update once https://github.com/allenai/OLMo/pull/719 is merged and included in the next release
RUN pip install git+https://github.com/allenai/OLMo.git@47f8f5abb40eb100c6623be12e1648c841b2ab99 --no-deps
RUN pip install -r requirements-olmo.txt

# NLTK download
RUN python -m nltk.downloader punkt
COPY open_instruct open_instruct
COPY oe-eval-internal oe-eval-internal

# install the package in editable mode
COPY pyproject.toml .
RUN pip install -e .
COPY .git/ ./.git/
COPY eval eval
COPY configs configs
COPY scripts scripts
COPY mason.py mason.py
RUN chmod +x scripts/*

# for interactive session
RUN chmod -R 777 /stage/
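To iterate on this Dockerfile locally before relying on CI, the image can be built and smoke-tested with the same arguments the workflow above uses. A sketch, assuming a local Docker install with enough disk for the CUDA base layers:

```bash
# Mirror the CI build arguments from push-image-olmo.yml
docker build \
  --build-arg CUDA=12.1.0 --build-arg TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 \
  -f Dockerfile.olmo . -t open_instruct_olmo

# Smoke test: the entrypoint is `bash -l`, so -c runs a login-shell command.
# (Assumes the pinned ai2-olmo install exposes the `olmo` module.)
docker run --rm open_instruct_olmo -c \
  "python -c 'import torch, flash_attn, olmo; print(torch.__version__, torch.cuda.is_available())'"
```

Note that `torch.cuda.is_available()` will report False without `--gpus`; on a CPU-only machine the import check is the useful part.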
2 changes: 1 addition & 1 deletion README.md
@@ -54,7 +54,7 @@ pip install -r weight-diff-requirements.txt
 For a second installation strategy, if you'd like to *run experiments within a Docker environment*, you can create one using:

 ```bash
-docker build --build-arg CUDA=12.1.0 --build-arg TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 --build-arg REQUIRE=requirements.txt . -t open_instruct
+docker build --build-arg CUDA=12.1.0 --build-arg TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 . -t open_instruct

 # if you are internally at AI2, you can create an image like this:
 beaker image create open_instruct -n open_instruct -w ai2/$(whoami)
69 changes: 69 additions & 0 deletions configs/beaker_configs/default_finetune_offloading.yaml
@@ -0,0 +1,69 @@
version: v2
description: open-instruct-finetune
budget: ai2/oe-adapt
tasks:
  - name: open-instruct-finetune
    image:
      beaker: nathanl/open_instruct_auto
    command: [
      '/bin/sh', '-c'
    ]
    arguments: ['PYTHONPATH="/stage:$PYTHONPATH" accelerate launch
      --mixed_precision bf16
      --num_machines 1
      --num_processes 4
      --use_deepspeed
      --deepspeed_config_file configs/ds_configs/stage3_offloading_accelerate.conf
      open_instruct/finetune.py
      --model_name_or_path /hf_llama_models
      --use_flash_attn
      --tokenizer_name /hf_llama_models
      --max_seq_length 2048
      --preprocessing_num_workers 16
      --per_device_train_batch_size 2
      --gradient_accumulation_steps 16
      --learning_rate 2e-5
      --lr_scheduler_type linear
      --warmup_ratio 0.03
      --weight_decay 0.
      --num_train_epochs 2
      --output_dir /output/
      --with_tracking
      --report_to tensorboard
      --logging_steps 1
    ']
    envVars:
      - name: CUDA_DEVICE_ORDER
        value: PCI_BUS_ID
      - name: TRANSFORMERS_CACHE
        value: ./cache/
      - name: WANDB_API_KEY
        secret: WANDB_API_KEY
      - name: WANDB_PROJECT
        value: open-instruct
      - name: WANDB_WATCH
        value: false
      - name: WANDB_LOG_MODEL
        value: false
      - name: WANDB_DISABLED
        value: true
      - name: HF_TOKEN
        secret: HF_TOKEN
    # datasets: # example for how to include datasets in mounting
    #   - mountPath: /data
    #     source:
    #       beaker: Yizhongw03/processed_open_instruct_data
    #   - mountPath: /mmlu
    #     source:
    #       beaker: Yizhongw03/mmlu
    #   - mountPath: /hf_llama_models
    #     source:
    #       beaker: Yizhongw03/hf_llama_model_7B
    result:
      path: /output
    resources:
      gpuCount: 4
    context:
      cluster: ai2/allennlp-cirrascale
      priority: high
      preemptible: false
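A spec like this is submitted with the Beaker CLI; a minimal sketch, assuming access to the ai2/allennlp-cirrascale cluster and a configured `beaker` login:

```bash
# Submit the finetuning job defined by the spec above
beaker experiment create configs/beaker_configs/default_finetune_offloading.yaml
```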
22 changes: 22 additions & 0 deletions configs/train_configs/sft/olmo_7b_0924.yaml
@@ -0,0 +1,22 @@
model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan
model_revision: main
use_flash_attn: true
tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan
Collaborator comment on the tokenizer_name line: I would double check with luca that this is the right tokenizer. Sometimes conversions from the olmo repo and tokenizers get jumbled...
use_slow_tokenizer: false # olmo models only use fast tokenizers
dataset_name: allenai/llama-3-tulu-v3.3-mix-preview
max_seq_length: 4096
preprocessing_num_workers: 128
per_device_train_batch_size: 1
gradient_accumulation_steps: 8 # set this to 16 when running on a single node
learning_rate: 2.0e-06
lr_scheduler_type: linear
warmup_ratio: 0.03
weight_decay: 0.0
num_train_epochs: 3
output_dir: /output/
with_tracking: true
report_to:
- wandb
logging_steps: 1
checkpointing_steps: epoch
add_bos: true
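For context, a config in this directory is consumed by the finetuning entry point. A hedged sketch of a single-node launch, assuming `open_instruct/finetune.py` accepts a YAML config path (as the `configs/train_configs` layout suggests) and a node with 8 GPUs:

```bash
# Launch SFT with the OLMo 0924 config, mirroring the accelerate invocation
# used in the Beaker spec above; num_processes should match the GPU count.
accelerate launch \
  --mixed_precision bf16 \
  --num_machines 1 \
  --num_processes 8 \
  --use_deepspeed \
  --deepspeed_config_file configs/ds_configs/stage3_offloading_accelerate.conf \
  open_instruct/finetune.py \
  configs/train_configs/sft/olmo_7b_0924.yaml
```

Per the comment on `gradient_accumulation_steps`, a single-node run should raise that value to 16 to preserve the effective batch size.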
36 changes: 36 additions & 0 deletions configs/train_configs/sft/olmo_7b_0924_fw2_permissive.yaml
@@ -0,0 +1,36 @@
# model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan
model_name_or_path: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw2/step11931-hf
model_revision: main
use_flash_attn: true
# tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan
tokenizer_name: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw2/step11931-hf
use_slow_tokenizer: false # olmo models only use fast tokenizers
dataset_mixer:
  ai2-adapt-dev/metamath-qa-reformat: 1.0 # MIT License
  natolambert/tulu-v2-sft-mixture-flan: 1.0 # FLAN Apache 2.0
  natolambert/tulu-v2-sft-mixture-cot: 1.0 # FLAN Apache 2.0
  allenai/openassistant-guanaco-reformatted: 1.0 # Apache 2.0
  ai2-adapt-dev/codefeedback-single-turn-reformat-magicoder: 1.0 # MIT, MagiCoder section of CodeFeedback
  ai2-adapt-dev/aya_dataset-reformat: 1.0 # Apache 2.0
  ai2-adapt-dev/SlimOrca-reformat: 0.25 # MIT License
  ai2-adapt-dev/Daring-Anteater-reformat: 1.0 # CC BY 4.0
  ai2-adapt-dev/WebInstructSub-reformat-apache: 0.1 # Apache 2.0
  ai2-adapt-dev/Table-GPT-All-train: 0.5 # MIT
max_seq_length: 4096
preprocessing_num_workers: 128
per_device_train_batch_size: 1
gradient_accumulation_steps: 4 # designed for 4 nodes
# gradient_accumulation_steps: 16 # designed for 1 node
gradient_checkpointing: true
learning_rate: 2.0e-06
lr_scheduler_type: linear
warmup_ratio: 0.03
weight_decay: 0.0
num_train_epochs: 3
output_dir: /output/
with_tracking: true
report_to:
- wandb
logging_steps: 1
checkpointing_steps: epoch
add_bos: true
26 changes: 26 additions & 0 deletions configs/train_configs/sft/olmo_7b_0924_fw2_tulu_v3.4.yaml
@@ -0,0 +1,26 @@
# model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan
model_name_or_path: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw2/step11931-hf
model_revision: main
use_flash_attn: true
# tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan
tokenizer_name: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw2/step11931-hf
use_slow_tokenizer: false # olmo models only use fast tokenizers
dataset_name: allenai/tulu-v3.4-mix-preview
max_seq_length: 4096
preprocessing_num_workers: 128
per_device_train_batch_size: 1
gradient_accumulation_steps: 4 # designed for 4 nodes
# gradient_accumulation_steps: 16 # designed for 1 node
gradient_checkpointing: true
learning_rate: 2.0e-06
lr_scheduler_type: linear
warmup_ratio: 0.03
weight_decay: 0.0
num_train_epochs: 3
output_dir: /output/
with_tracking: true
report_to:
- wandb
logging_steps: 1
checkpointing_steps: epoch
add_bos: true
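For reference, the two `gradient_accumulation_steps` settings in these configs keep the effective batch size constant, assuming 8 GPUs per node: 1 (per device) × 32 GPUs × 4 steps = 128 on 4 nodes, versus 1 × 8 GPUs × 16 steps = 128 on a single node.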