From 65aaaec2499e36c04602ca74a1d7d14ba371a699 Mon Sep 17 00:00:00 2001 From: Nathan Lambert Date: Thu, 5 Sep 2024 19:41:20 +0000 Subject: [PATCH 01/23] init --- .gitignore | 2 ++ Dockerfile | 4 ++++ open_instruct/dpo_tune.py | 8 +++++++- open_instruct/finetune.py | 8 +++++++- open_instruct/utils.py | 38 ++++++++++++++++++++++++++++++++++++++ requirements.txt | 2 +- 6 files changed, 59 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 55d2caedc..5952c206c 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,8 @@ rejection_sampling/shards1 token_length.png *.tfevents.* +oe-eval-internal/ + results models wandb diff --git a/Dockerfile b/Dockerfile index dd6b95a97..7579a02fc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -90,6 +90,10 @@ RUN pip install --upgrade pip "setuptools<70.0.0" wheel RUN pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121 RUN pip install packaging RUN pip install flash-attn==2.6.3 --no-build-isolation +# for newest olmo's, move to requirements when ai2-olmo supports torch 2.4 +# core is a dependency of ai2-olmo +RUN pip install ai2-olmo-core==0.1.0 +RUN pip install ai2-olmo>=0.5.0 --no-deps RUN pip install -r requirements.txt # NLTK download diff --git a/open_instruct/dpo_tune.py b/open_instruct/dpo_tune.py index 9c5cf1714..74bb522fd 100644 --- a/open_instruct/dpo_tune.py +++ b/open_instruct/dpo_tune.py @@ -17,13 +17,13 @@ DPO tuning script. Adapted from our finetuning script. """ +import json import logging import math import os import random import subprocess import time -import json from copy import deepcopy from dataclasses import dataclass, field from datetime import timedelta @@ -65,6 +65,7 @@ from open_instruct.model_utils import push_folder_to_hub, save_with_accelerate from open_instruct.utils import ( ArgumentParserPlus, + check_hf_olmo_availability, clean_last_n_checkpoints, get_datasets, get_last_checkpoint_path, @@ -499,6 +500,11 @@ def prepare_deepspeed(accelerator, model): def main(args: FlatArguments): + # try to import OLMo for automodel + if check_hf_olmo_availability(): + # allows AutoModel... to work with not in transformers olmo models + import hf_olmo # noqa + # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment diff --git a/open_instruct/finetune.py b/open_instruct/finetune.py index e8d69c08e..48ba726e1 100644 --- a/open_instruct/finetune.py +++ b/open_instruct/finetune.py @@ -14,13 +14,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json import logging import math import os import random import subprocess import time -import json from dataclasses import dataclass, field from datetime import timedelta from functools import partial @@ -55,6 +55,7 @@ from open_instruct.model_utils import push_folder_to_hub, save_with_accelerate from open_instruct.utils import ( ArgumentParserPlus, + check_hf_olmo_availability, clean_last_n_checkpoints, get_datasets, get_last_checkpoint_path, @@ -448,6 +449,11 @@ def _concat_messages(messages): def main(args: FlatArguments): + # try to import OLMo for automodel + if check_hf_olmo_availability(): + # allows AutoModel... to work with not in transformers olmo models + import hf_olmo # noqa + # Initialize the accelerator. 
We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment diff --git a/open_instruct/utils.py b/open_instruct/utils.py index 34fc96dfe..0a95b3f28 100644 --- a/open_instruct/utils.py +++ b/open_instruct/utils.py @@ -14,6 +14,7 @@ import dataclasses import functools +import importlib import json import logging import os @@ -51,6 +52,43 @@ """ +# ---------------------------------------------------------------------------- +# Import utilities +def check_hf_olmo_availability(return_version: bool = True) -> Union[dict, bool]: + pkg_name = "hf_olmo" + + # Check if the package spec exists + package_exists = importlib.util.find_spec(pkg_name) is not None + package_version = "N/A" + + if package_exists: + try: + # Primary method to get the package version + package_version = importlib.metadata.version(pkg_name) + except importlib.metadata.PackageNotFoundError: + # Fallback method + try: + package = importlib.import_module(pkg_name) + package_version = getattr(package, "__version__", "N/A") + if package_version == "N/A": + package_exists = False + except ImportError: + package_exists = False + + logger.debug(f"Detected {pkg_name} version: {package_version}") + + if return_version: + return { + "available": package_exists, + "version": package_version, + "python_version": sys.version, + "os": os.name, + "platform": sys.platform, + } + else: + return package_exists + + # ---------------------------------------------------------------------------- # Dataset utilities def is_openai_format(messages: Any) -> bool: diff --git a/requirements.txt b/requirements.txt index 39a3073d6..9b9ff095b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -44,4 +44,4 @@ isort autoflake pytest hf_transfer -beaker-py +beaker-py \ No newline at end of file From 7da49a9b5a8f1491267603f2371700d4aca5c2f7 Mon Sep 17 00:00:00 2001 From: Nathan Lambert Date: Thu, 5 Sep 2024 20:35:18 +0000 Subject: [PATCH 02/23] up --- Dockerfile | 2 +- configs/train_configs/sft/olmo_7b_0924.yaml | 22 +++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 configs/train_configs/sft/olmo_7b_0924.yaml diff --git a/Dockerfile b/Dockerfile index 7579a02fc..87181fb2f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -92,7 +92,7 @@ RUN pip install packaging RUN pip install flash-attn==2.6.3 --no-build-isolation # for newest olmo's, move to requirements when ai2-olmo supports torch 2.4 # core is a dependency of ai2-olmo -RUN pip install ai2-olmo-core==0.1.0 +RUN pip install ai2-olmo-core==0.1.0 omegaconf RUN pip install ai2-olmo>=0.5.0 --no-deps RUN pip install -r requirements.txt diff --git a/configs/train_configs/sft/olmo_7b_0924.yaml b/configs/train_configs/sft/olmo_7b_0924.yaml new file mode 100644 index 000000000..d76b2e560 --- /dev/null +++ b/configs/train_configs/sft/olmo_7b_0924.yaml @@ -0,0 +1,22 @@ +model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan +model_revision: main +use_flash_attn: true +tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan +use_slow_tokenizer: false # olmo models only use fast tokenizers +dataset_name: allenai/tulu-v2-sft-mixture-olmo-2048 +max_seq_length: 2048 +preprocessing_num_workers: 128 +per_device_train_batch_size: 1 # note, this is set up for 8 GPUs +gradient_accumulation_steps: 16 +learning_rate: 2.0e-06 +lr_scheduler_type: linear +warmup_ratio: 
0.03
+weight_decay: 0.0
+num_train_epochs: 3
+output_dir: output/olmo_instruct/
+with_tracking: true
+report_to:
+    - wandb
+logging_steps: 1
+checkpointing_steps: epoch
+add_bos: true
\ No newline at end of file

From d929a895aa80a04ad4254989e1051a5b36c76e44 Mon Sep 17 00:00:00 2001
From: Nathan Lambert
Date: Mon, 9 Sep 2024 17:23:18 +0000
Subject: [PATCH 03/23] branch dockerfile

---
 .github/workflows/push-image-olmo.yml |  82 ++++++++++++++++++
 .github/workflows/push-image.yml      |   2 -
 Dockerfile                            |   4 +-
 Dockerfile.olmo                       | 115 ++++++++++++++++++++++++++
 requirements-olmo.txt                 |  48 +++++++++++
 5 files changed, 247 insertions(+), 4 deletions(-)
 create mode 100644 .github/workflows/push-image-olmo.yml
 create mode 100644 Dockerfile.olmo
 create mode 100644 requirements-olmo.txt

diff --git a/.github/workflows/push-image-olmo.yml b/.github/workflows/push-image-olmo.yml
new file mode 100644
index 000000000..8fdafc49f
--- /dev/null
+++ b/.github/workflows/push-image-olmo.yml
@@ -0,0 +1,82 @@
+# This is an example workflow file.
#
# When you add a new image, copy this file and then replace all mentions of "hello-world" with
# the name of your new image.
#
# Read through the rest of the comments in this file to figure out how it works, and what else
# you need to change.
name: build_open_instruct_olmo

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

on:
  push:
    # Run this workflow anytime a push updates one of the files in the image's directory
    # (other than the README), and anytime there's a new release tag for this image.
    paths:
      - 'open_instruct/**'
      - '!open_instruct/README.md'
      - 'requirements-olmo.txt'
      - 'Dockerfile.olmo'
      - '.github/workflows/push-image-olmo.yml'
      # Note: add the .olmo Dockerfile + requirements paths when enabling auto builds for them
    branches: [main]
  # pull_request: # note, comment this out for running on every push
  #   # Also run on PRs that update the files in the image's directory (other than README).
  #   branches: [main]
  #   paths:
  #     - 'open_instruct/**'
  #     - '!open_instruct/README.md'
  #     - 'requirements-olmo.txt'
  #     - 'Dockerfile.olmo'
  workflow_dispatch: # This allows us to manually trigger a build through the GitHub UI.

env:
  DOCKER_BUILDKIT: "1"

jobs:
  build:
    name: open_instruct
    runs-on: ubuntu-latest
    timeout-minutes: 60
    if: (github.event_name != 'workflow_run') || (github.event.workflow_run.conclusion == 'success')
    steps:
      - uses: actions/checkout@v3
        with:
          repository: allenai/oe-eval-internal
          path: './oe-eval-internal'
          ssh-key: ${{ secrets.OE_EVAL_GIT_CLONE_ACCESS_PRIVATE_SSH_DEPLOY_KEY }}

      - name: Setup environment
        uses: ./.github/actions/setup
        with:
          beaker_token: ${{ secrets.BEAKER_TOKEN }}
          # ghcr_token: ${{ secrets.GHCR_TOKEN }}
          # ghcr_user: ${{ secrets.GHCR_USER }}

      # large images can fail to build when the runner disk fills up, so free space first
      - name: Delete huge unnecessary tools folder
        run: rm -rf /opt/hostedtoolcache /usr/share/dotnet "$AGENT_TOOLSDIRECTORY"

      - name: Build image
        run: |
          docker build \
            --build-arg BUILDKIT_INLINE_CACHE=1 \
            --build-arg CUDA=12.1.0 --build-arg \
            TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 \
            --build-arg REQUIRE=requirements.txt \
            -f Dockerfile.olmo . 
\ + -t open_instruct_olmo + + - name: Check image + run: | + docker run --rm open_instruct_olmo + - name: Push image + # if: github.event_name != 'pull_request' + uses: ./.github/actions/push + with: + image: open_instruct_olmo # this is the tag of the image we just built in the previous step + beaker: open_instruct_olmo_auto # this is the name of the image on Beaker + latest: true # this flag says we should also push this as the 'latest' version to GHCR diff --git a/.github/workflows/push-image.yml b/.github/workflows/push-image.yml index 40e205fb3..4e29f4d28 100644 --- a/.github/workflows/push-image.yml +++ b/.github/workflows/push-image.yml @@ -44,8 +44,6 @@ jobs: timeout-minutes: 60 if: (github.event_name != 'workflow_run') || (github.event.workflow_run.conclusion == 'success') steps: - - uses: actions/checkout@v3 - - uses: actions/checkout@v3 with: repository: allenai/oe-eval-internal diff --git a/Dockerfile b/Dockerfile index 87181fb2f..9f4e1fb00 100644 --- a/Dockerfile +++ b/Dockerfile @@ -92,8 +92,8 @@ RUN pip install packaging RUN pip install flash-attn==2.6.3 --no-build-isolation # for newest olmo's, move to requirements when ai2-olmo supports torch 2.4 # core is a dependency of ai2-olmo -RUN pip install ai2-olmo-core==0.1.0 omegaconf -RUN pip install ai2-olmo>=0.5.0 --no-deps +# RUN pip install ai2-olmo-core==0.1.0 omegaconf +# RUN pip install ai2-olmo>=0.5.0 --no-deps RUN pip install -r requirements.txt # NLTK download diff --git a/Dockerfile.olmo b/Dockerfile.olmo new file mode 100644 index 000000000..38f8a1db4 --- /dev/null +++ b/Dockerfile.olmo @@ -0,0 +1,115 @@ +ARG CUDA +ARG DIST +ARG TARGET +FROM --platform=linux/amd64 nvidia/cuda:${CUDA}-${TARGET}-${DIST} + +ARG DEBIAN_FRONTEND="noninteractive" +ENV TZ="America/Los_Angeles" + +# Install base tools. +RUN apt-get update && apt-get install -y \ + build-essential \ + curl \ + git \ + jq \ + language-pack-en \ + make \ + sudo \ + unzip \ + vim \ + wget \ + parallel \ + iputils-ping \ + tmux + +ARG BEAKER_VERSION +RUN curl --silent \ + --connect-timeout 5 \ + --max-time 10 \ + --retry 5 \ + --retry-delay 0 \ + --retry-max-time 40 \ + --output beaker.tar.gz \ + "https://beaker.org/api/v3/release/cli?os=linux&arch=amd64&version=${BEAKER_VERSION}" \ + && tar -zxf beaker.tar.gz -C /usr/local/bin/ ./beaker \ + && rm beaker.tar.gz + +# This ensures the dynamic linker (or NVIDIA's container runtime, I'm not sure) +# puts the right NVIDIA things in the right place (that THOR requires). +ENV NVIDIA_DRIVER_CAPABILITIES=graphics,utility,compute + +# Install conda. We give anyone in the users group the ability to run +# conda commands and install packages in the base (default) environment. +# Things installed into the default environment won't persist, but we prefer +# convenience in this case and try to make sure the user is aware of this +# with a message that's printed when the session starts. 
+RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.1.0-1-Linux-x86_64.sh \ + && echo "32d73e1bc33fda089d7cd9ef4c1be542616bd8e437d1f77afeeaf7afdb019787 Miniconda3-py310_23.1.0-1-Linux-x86_64.sh" \ + | sha256sum --check \ + && bash Miniconda3-py310_23.1.0-1-Linux-x86_64.sh -b -p /opt/miniconda3 \ + && rm Miniconda3-py310_23.1.0-1-Linux-x86_64.sh + +ENV PATH=/opt/miniconda3/bin:/opt/miniconda3/condabin:$PATH +ENV LD_LIBRARY_PATH=/usr/local/cuda/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH + +# Install a few additional utilities via pip +RUN /opt/miniconda3/bin/pip install --no-cache-dir \ + gpustat \ + jupyter \ + beaker-gantry \ + oocmap + +# Ensure users can modify their container environment. +RUN echo '%users ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers + +# Make the base image friendlier for interactive workloads. This makes things like the man command +# work. +RUN yes | unminimize + +# Install MLNX OFED user-space drivers +# See https://docs.nvidia.com/networking/pages/releaseview.action?pageId=15049785#Howto:DeployRDMAacceleratedDockercontaineroverInfiniBandfabric.-Dockerfile +ENV MOFED_VER 5.8-1.1.2.1 +ENV OS_VER ubuntu20.04 +ENV PLATFORM x86_64 +RUN wget --quiet https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VER}/MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \ + tar -xvf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \ + MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}/mlnxofedinstall --basic --user-space-only --without-fw-update -q && \ + rm -rf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM} && \ + rm MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz + +# The -l flag makes bash act as a login shell and load /etc/profile, etc. +ENTRYPOINT ["bash", "-l"] + +WORKDIR /stage/ + +# TODO When updating flash-attn or torch in the future, make sure to update the version in the requirements.txt file. +ENV HF_HUB_ENABLE_HF_TRANSFER=1 +COPY requirements.txt . +RUN pip install --upgrade pip "setuptools<70.0.0" wheel +# TODO, unpin setuptools when this issue in flash attention is resolved +RUN pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121 +RUN pip install packaging +RUN pip install flash-attn==2.5.9 --no-build-isolation +# for newest olmo's, move to requirements when ai2-olmo supports torch 2.4 +# core is a dependency of ai2-olmo +# RUN pip install ai2-olmo-core==0.1.0 omegaconf +# RUN pip install ai2-olmo>=0.5.0 --no-deps +RUN pip install -r requirements-olmo.txt + +# NLTK download +RUN python -m nltk.downloader punkt +COPY open_instruct open_instruct +COPY oe-eval-internal oe-eval-internal + +# install the package in editable mode +COPY pyproject.toml . +RUN pip install -e . 
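# Note: because ai2-olmo is installed with --no-deps above, a sanity check along
# these lines (hypothetical, not part of the image) can confirm at build time that
# importing hf_olmo still registers OLMo with transformers' Auto classes:
# RUN python -c "import hf_olmo; from transformers import AutoConfig; print('hf_olmo OK')"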
+COPY .git/ ./.git/ +COPY eval eval +COPY configs configs +COPY scripts scripts +COPY mason.py mason.py +RUN chmod +x scripts/* + +# for interactive session +RUN chmod -R 777 /stage/ diff --git a/requirements-olmo.txt b/requirements-olmo.txt new file mode 100644 index 000000000..611ebda53 --- /dev/null +++ b/requirements-olmo.txt @@ -0,0 +1,48 @@ +# TODO When updating flash-attn or torch in the future, make sure to update the version in the Dockerfile +torch==2.4.0 +ai2-olmo-core==0.1.0 +ai2-olmo>=0.5.0 +scipy +packaging +sentencepiece +datasets +deepspeed==0.14.4 +accelerate==0.31.0 +peft>=0.11.1 +bitsandbytes>=0.41.1 +evaluate>=0.4.0 +tokenizers==0.19.1 +protobuf +transformers==4.43.4 +openai>=1.0.0 +tiktoken +rouge_score +tensorboard +wandb +gradio>=3.50.2 +termcolor +jsonlines +unidic-lite +einops +flash-attn==2.5.8 # should really only be in dockerfile. Local env often doesn't have GPUs +fire +alpaca-eval==0.6.2 +# for human eval web app +flask +openpyxl +# for ifeval +nltk==3.8.1 +langdetect +immutabledict +# for math evaluations +antlr4-python3-runtime==4.11.0 +mpmath==1.3.0 +sympy==1.12.0 +# for linting +black +flake8 +isort +autoflake +pytest +hf_transfer +beaker-py \ No newline at end of file From 6ef706c10b9d6e9656db34d544cd57b5a40cb274 Mon Sep 17 00:00:00 2001 From: Nathan Lambert Date: Mon, 9 Sep 2024 17:57:40 +0000 Subject: [PATCH 04/23] update --- Dockerfile.olmo | 8 ++++---- requirements-olmo.txt | 4 +--- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/Dockerfile.olmo b/Dockerfile.olmo index 38f8a1db4..6259756ae 100644 --- a/Dockerfile.olmo +++ b/Dockerfile.olmo @@ -84,16 +84,16 @@ WORKDIR /stage/ # TODO When updating flash-attn or torch in the future, make sure to update the version in the requirements.txt file. ENV HF_HUB_ENABLE_HF_TRANSFER=1 -COPY requirements.txt . +COPY requirements-olmo.txt . RUN pip install --upgrade pip "setuptools<70.0.0" wheel # TODO, unpin setuptools when this issue in flash attention is resolved RUN pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121 RUN pip install packaging -RUN pip install flash-attn==2.5.9 --no-build-isolation +RUN pip install flash-attn==2.5.9.post1 --no-build-isolation # for newest olmo's, move to requirements when ai2-olmo supports torch 2.4 # core is a dependency of ai2-olmo -# RUN pip install ai2-olmo-core==0.1.0 omegaconf -# RUN pip install ai2-olmo>=0.5.0 --no-deps +RUN pip install ai2-olmo-core==0.1.0 omegaconf +RUN pip install ai2-olmo>=0.5.0 --no-deps RUN pip install -r requirements-olmo.txt # NLTK download diff --git a/requirements-olmo.txt b/requirements-olmo.txt index 611ebda53..93367a9e0 100644 --- a/requirements-olmo.txt +++ b/requirements-olmo.txt @@ -1,7 +1,5 @@ # TODO When updating flash-attn or torch in the future, make sure to update the version in the Dockerfile torch==2.4.0 -ai2-olmo-core==0.1.0 -ai2-olmo>=0.5.0 scipy packaging sentencepiece @@ -24,7 +22,7 @@ termcolor jsonlines unidic-lite einops -flash-attn==2.5.8 # should really only be in dockerfile. Local env often doesn't have GPUs +flash-attn==2.5.9.post1 # should really only be in dockerfile. 
Local env often doesn't have GPUs
fire
alpaca-eval==0.6.2
# for human eval web app
flask
openpyxl

From a4797fb6eda9a1ad6fe8f9d643995e7cdfbfe8b5 Mon Sep 17 00:00:00 2001
From: Nathan Lambert
Date: Mon, 9 Sep 2024 20:55:35 +0000
Subject: [PATCH 05/23] debugging and minor fixes

---
 .github/workflows/push-image-olmo.yml       | 1 -
 .github/workflows/push-image.yml            | 1 -
 README.md                                   | 2 +-
 configs/train_configs/sft/olmo_7b_0924.yaml | 2 +-
 open_instruct/dpo_tune.py                   | 3 ++-
 open_instruct/finetune.py                   | 3 ++-
 open_instruct/utils.py                      | 2 --
 requirements-olmo.txt                       | 2 +-
 8 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/push-image-olmo.yml b/.github/workflows/push-image-olmo.yml
index 8fdafc49f..28a8c3467 100644
--- a/.github/workflows/push-image-olmo.yml
+++ b/.github/workflows/push-image-olmo.yml
@@ -66,7 +66,6 @@ jobs:
           --build-arg BUILDKIT_INLINE_CACHE=1 \
           --build-arg CUDA=12.1.0 --build-arg \
           TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 \
-          --build-arg REQUIRE=requirements.txt \
           -f Dockerfile.olmo . \
           -t open_instruct_olmo

diff --git a/.github/workflows/push-image.yml b/.github/workflows/push-image.yml
index 4e29f4d28..f5a35fdc0 100644
--- a/.github/workflows/push-image.yml
+++ b/.github/workflows/push-image.yml
@@ -67,7 +67,6 @@ jobs:
           --build-arg BUILDKIT_INLINE_CACHE=1 \
           --build-arg CUDA=12.1.0 --build-arg \
           TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 \
-          --build-arg REQUIRE=requirements.txt . \
           -t open_instruct

diff --git a/README.md b/README.md
index e0ba44914..701358cd0 100644
--- a/README.md
+++ b/README.md
@@ -54,7 +54,7 @@ pip install -r weight-diff-requirements.txt
 For a second installation strategy, if you'd like to *run experiments within a Docker environment*, you can create one using:
 ```bash
-docker build --build-arg CUDA=12.1.0 --build-arg TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 --build-arg REQUIRE=requirements.txt . -t open_instruct
+docker build --build-arg CUDA=12.1.0 --build-arg TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 . -t open_instruct
 # if you are internally at AI2, you can create an image like this:
 beaker image create open_instruct -n open_instruct -w ai2/$(whoami)

diff --git a/configs/train_configs/sft/olmo_7b_0924.yaml b/configs/train_configs/sft/olmo_7b_0924.yaml
index d76b2e560..317e106d9 100644
--- a/configs/train_configs/sft/olmo_7b_0924.yaml
+++ b/configs/train_configs/sft/olmo_7b_0924.yaml
@@ -1,6 +1,6 @@
 model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan
 model_revision: main
-use_flash_attn: true
+use_flash_attn: false
 tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan
 use_slow_tokenizer: false # olmo models only use fast tokenizers
 dataset_name: allenai/tulu-v2-sft-mixture-olmo-2048
 max_seq_length: 2048

diff --git a/open_instruct/dpo_tune.py b/open_instruct/dpo_tune.py
index 74bb522fd..13b7c2f44 100644
--- a/open_instruct/dpo_tune.py
+++ b/open_instruct/dpo_tune.py
@@ -504,6 +504,7 @@ def main(args: FlatArguments):
     if check_hf_olmo_availability():
         # allows AutoModel... to work with not in transformers olmo models
         import hf_olmo  # noqa
+        from hf_olmo import OLMoTokenizerFast

     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
     # in the environment
@@ -678,7 +679,7 @@ def load_model():
         0,
         1,
     ], "LlamaTokenizer should only add one special token - the pad_token, or no tokens if pad token present." 
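The hunk continues below by widening this tokenizer branch to cover OLMo's fast tokenizer. A minimal sketch of what the branch does, assuming a GPT-NeoX-style checkpoint such as `EleutherAI/pythia-70m` (illustrative, not the training script itself):

```python
# Sketch: GPT-NeoX/OLMo fast tokenizers can ship without a BOS token,
# so the scripts backfill it from EOS before adding a pad token.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")
if tokenizer.bos_token is None:
    tokenizer.bos_token = tokenizer.eos_token  # mirrors the fix-up in the diff
num_added = tokenizer.add_special_tokens({"pad_token": "<pad>"})
assert num_added in (0, 1)  # same invariant the surrounding assertion enforces
```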
- elif isinstance(tokenizer, GPTNeoXTokenizerFast): + elif isinstance(tokenizer, GPTNeoXTokenizerFast) or isinstance(tokenizer, OLMoTokenizerFast): # OLMo newer models use this tokenizer if tokenizer.bos_token is None: tokenizer.bos_token = tokenizer.eos_token diff --git a/open_instruct/finetune.py b/open_instruct/finetune.py index 48ba726e1..211c43887 100644 --- a/open_instruct/finetune.py +++ b/open_instruct/finetune.py @@ -453,6 +453,7 @@ def main(args: FlatArguments): if check_hf_olmo_availability(): # allows AutoModel... to work with not in transformers olmo models import hf_olmo # noqa + from hf_olmo import OLMoTokenizerFast # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers @@ -649,7 +650,7 @@ def main(args: FlatArguments): 0, 1, ], "LlamaTokenizer should only add one special token - the pad_token, or no tokens if pad token present." - elif isinstance(tokenizer, GPTNeoXTokenizerFast): + elif isinstance(tokenizer, GPTNeoXTokenizerFast) or isinstance(tokenizer, OLMoTokenizerFast): # noqa # OLMo newer models use this tokenizer if tokenizer.bos_token is None: tokenizer.bos_token = tokenizer.eos_token diff --git a/open_instruct/utils.py b/open_instruct/utils.py index 0a95b3f28..08046e17e 100644 --- a/open_instruct/utils.py +++ b/open_instruct/utils.py @@ -75,8 +75,6 @@ def check_hf_olmo_availability(return_version: bool = True) -> Union[dict, bool] except ImportError: package_exists = False - logger.debug(f"Detected {pkg_name} version: {package_version}") - if return_version: return { "available": package_exists, diff --git a/requirements-olmo.txt b/requirements-olmo.txt index 93367a9e0..1ec51fb2f 100644 --- a/requirements-olmo.txt +++ b/requirements-olmo.txt @@ -33,7 +33,7 @@ nltk==3.8.1 langdetect immutabledict # for math evaluations -antlr4-python3-runtime==4.11.0 +antlr4-python3-runtime==4.9.2 mpmath==1.3.0 sympy==1.12.0 # for linting From 56d6d8644c8e66e3586350ba963394b6e89a6def Mon Sep 17 00:00:00 2001 From: Nathan Lambert Date: Mon, 9 Sep 2024 21:32:33 +0000 Subject: [PATCH 06/23] nit and style --- open_instruct/dpo_tune.py | 4 +++- open_instruct/finetune.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/open_instruct/dpo_tune.py b/open_instruct/dpo_tune.py index 13b7c2f44..cd0676648 100644 --- a/open_instruct/dpo_tune.py +++ b/open_instruct/dpo_tune.py @@ -679,7 +679,9 @@ def load_model(): 0, 1, ], "LlamaTokenizer should only add one special token - the pad_token, or no tokens if pad token present." - elif isinstance(tokenizer, GPTNeoXTokenizerFast) or isinstance(tokenizer, OLMoTokenizerFast): + elif isinstance(tokenizer, GPTNeoXTokenizerFast) or ( + check_hf_olmo_availability() and isinstance(tokenizer, OLMoTokenizerFast) + ): # OLMo newer models use this tokenizer if tokenizer.bos_token is None: tokenizer.bos_token = tokenizer.eos_token diff --git a/open_instruct/finetune.py b/open_instruct/finetune.py index 211c43887..4e917ff77 100644 --- a/open_instruct/finetune.py +++ b/open_instruct/finetune.py @@ -650,7 +650,7 @@ def main(args: FlatArguments): 0, 1, ], "LlamaTokenizer should only add one special token - the pad_token, or no tokens if pad token present." 
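Both training scripts gate the optional import behind `check_hf_olmo_availability()`. One wrinkle worth noting: at this point in the series the helper's `return_version` flag still defaults to `True`, so a bare call returns an always-truthy dict; the later "fixes" commit flips the default to `False`, which is what makes boolean guards like these behave as intended. A sketch of the two calling conventions (illustrative):

```python
# Sketch: consuming the availability helper from open_instruct.utils.
from open_instruct.utils import check_hf_olmo_availability

if check_hf_olmo_availability(return_version=False):  # plain bool
    import hf_olmo  # noqa: F401  # registers OLMo with the Auto classes

info = check_hf_olmo_availability(return_version=True)  # diagnostic dict
print(info["available"], info["version"])
```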
- elif isinstance(tokenizer, GPTNeoXTokenizerFast) or isinstance(tokenizer, OLMoTokenizerFast): # noqa + elif isinstance(tokenizer, GPTNeoXTokenizerFast) or isinstance(tokenizer, OLMoTokenizerFast): # noqa # OLMo newer models use this tokenizer if tokenizer.bos_token is None: tokenizer.bos_token = tokenizer.eos_token From 50500ea5f6f995b18e791242649cb6cb14044319 Mon Sep 17 00:00:00 2001 From: Nathan Lambert Date: Mon, 9 Sep 2024 22:13:51 +0000 Subject: [PATCH 07/23] fixes --- open_instruct/finetune.py | 4 ++-- open_instruct/utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/open_instruct/finetune.py b/open_instruct/finetune.py index 4e917ff77..da6398f94 100644 --- a/open_instruct/finetune.py +++ b/open_instruct/finetune.py @@ -650,7 +650,7 @@ def main(args: FlatArguments): 0, 1, ], "LlamaTokenizer should only add one special token - the pad_token, or no tokens if pad token present." - elif isinstance(tokenizer, GPTNeoXTokenizerFast) or isinstance(tokenizer, OLMoTokenizerFast): # noqa + elif isinstance(tokenizer, GPTNeoXTokenizerFast) or (check_hf_olmo_availability() and isinstance(tokenizer, OLMoTokenizerFast)): # OLMo newer models use this tokenizer if tokenizer.bos_token is None: tokenizer.bos_token = tokenizer.eos_token @@ -1015,7 +1015,7 @@ def main(args: FlatArguments): if is_beaker_job() and accelerator.is_main_process: # dpo script only supports these two options right now for datasets if args.dataset_mixer: - dataset_list = args.dataset_mixer.keys() + dataset_list = list(args.dataset_mixer.keys()) elif args.dataset_mixer_list: dataset_list = args.dataset_mixer_list[::2] # even indices elif args.dataset_name: diff --git a/open_instruct/utils.py b/open_instruct/utils.py index 08046e17e..c45e5873d 100644 --- a/open_instruct/utils.py +++ b/open_instruct/utils.py @@ -54,7 +54,7 @@ # ---------------------------------------------------------------------------- # Import utilities -def check_hf_olmo_availability(return_version: bool = True) -> Union[dict, bool]: +def check_hf_olmo_availability(return_version: bool = False) -> Union[dict, bool]: pkg_name = "hf_olmo" # Check if the package spec exists From 5eb61ccb38da34d43f40d02c97dcc6741e047c06 Mon Sep 17 00:00:00 2001 From: Nathan Lambert Date: Mon, 9 Sep 2024 23:16:00 +0000 Subject: [PATCH 08/23] add weka mounting --- configs/train_configs/sft/olmo_7b_0924.yaml | 10 ++++++---- scripts/submit_finetune_job.py | 12 +++++++++++- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/configs/train_configs/sft/olmo_7b_0924.yaml b/configs/train_configs/sft/olmo_7b_0924.yaml index 317e106d9..74bd932c2 100644 --- a/configs/train_configs/sft/olmo_7b_0924.yaml +++ b/configs/train_configs/sft/olmo_7b_0924.yaml @@ -1,10 +1,12 @@ -model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan +# model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan +model_name_or_path: /oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw3/step11931-hf model_revision: main use_flash_attn: false -tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan +# tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan +tokenizer_name: /oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw3/step11931-hf use_slow_tokenizer: false # olmo models only use fast tokenizers -dataset_name: 
allenai/tulu-v2-sft-mixture-olmo-2048 -max_seq_length: 2048 +dataset_name: allenai/llama-3-tulu-v3.3-mix-preview +max_seq_length: 4096 preprocessing_num_workers: 128 per_device_train_batch_size: 1 # note, this is set up for 8 GPUs gradient_accumulation_steps: 16 diff --git a/scripts/submit_finetune_job.py b/scripts/submit_finetune_job.py index ef4e87138..37df9509a 100644 --- a/scripts/submit_finetune_job.py +++ b/scripts/submit_finetune_job.py @@ -25,6 +25,8 @@ def main(): parser.add_argument("--num_nodes", type=int, default=1, help="Number of nodes to use") parser.add_argument("--image", type=str, default="nathanl/open_instruct_auto", help="Beaker image to use.") parser.add_argument("--workspace", type=str, default="ai2/tulu-2-improvements", help="Beaker workspace to use.") + parser.add_argument("--mount_on_weka", type=str, default=None, help="Mount a Weka directory to the job") + parser.add_argument("--weka_mount_path", type=str, default="/models", help="Path to mount the Weka directory") # allow unknown args from CLI, use this to modify loaded config in bash scripts for sweeping # Note, can only override args in --config passed (not default FlatArguments class in open_instruct/utils.py) @@ -166,7 +168,7 @@ def parse_args(args): d['tasks'][0]['arguments'][0] = new_arguments # name and description - exp_name = f"open_instruct_finetune_{model_name}_{now}" + exp_name = f"open_instruct_finetune_{model_name}_{now}"[:128] d['description'] = exp_name d['tasks'][0]['name'] = exp_name @@ -220,6 +222,14 @@ def parse_args(args): d['tasks'][0]['envVars'].append({ 'name': 'WANDB_API_KEY', 'secret': f"{beaker_whoami}_WANDB_API_KEY" }) + + # Weka setting + if args.mount_on_weka: + if d['tasks'][0].get('datasets') is None: + d['tasks'][0]['datasets'] = [] + d['tasks'][0]['datasets'].append({ + 'mountPath': f"{args.weka_mount_path}", 'source': {'weka': f"{args.mount_on_weka}"} + }) # optionally, print to debug config print(d) From b1344102cd0738e0579d6623c32793cb3dd02f4f Mon Sep 17 00:00:00 2001 From: Nathan Lambert Date: Tue, 10 Sep 2024 00:46:57 +0000 Subject: [PATCH 09/23] up --- configs/train_configs/sft/olmo_7b_0924.yaml | 8 +++---- .../train_configs/sft/olmo_7b_0924_fw2.yaml | 24 +++++++++++++++++++ scripts/submit_finetune_job.py | 4 ++-- 3 files changed, 30 insertions(+), 6 deletions(-) create mode 100644 configs/train_configs/sft/olmo_7b_0924_fw2.yaml diff --git a/configs/train_configs/sft/olmo_7b_0924.yaml b/configs/train_configs/sft/olmo_7b_0924.yaml index 74bd932c2..91d72a1ec 100644 --- a/configs/train_configs/sft/olmo_7b_0924.yaml +++ b/configs/train_configs/sft/olmo_7b_0924.yaml @@ -1,21 +1,21 @@ # model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan -model_name_or_path: /oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw3/step11931-hf +model_name_or_path: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw3/step11931-hf model_revision: main use_flash_attn: false # tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan -tokenizer_name: /oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw3/step11931-hf +tokenizer_name: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw3/step11931-hf use_slow_tokenizer: false # olmo models only use fast tokenizers dataset_name: allenai/llama-3-tulu-v3.3-mix-preview max_seq_length: 4096 
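# note (illustrative arithmetic): effective batch size = per_device_train_batch_size
# x data-parallel GPUs x gradient_accumulation_steps, so 1 x 8 x 16 = 128 sequences
# per update on a single 8-GPU node; the multi-node configs below use
# gradient_accumulation_steps: 4 so that 1 x 32 x 4 = 128 stays constant on 4 nodes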
preprocessing_num_workers: 128 per_device_train_batch_size: 1 # note, this is set up for 8 GPUs -gradient_accumulation_steps: 16 +gradient_accumulation_steps: 8 learning_rate: 2.0e-06 lr_scheduler_type: linear warmup_ratio: 0.03 weight_decay: 0.0 num_train_epochs: 3 -output_dir: output/olmo_instruct/ +output_dir: /output/olmo_instruct/ with_tracking: true report_to: - wandb diff --git a/configs/train_configs/sft/olmo_7b_0924_fw2.yaml b/configs/train_configs/sft/olmo_7b_0924_fw2.yaml new file mode 100644 index 000000000..ff376aebf --- /dev/null +++ b/configs/train_configs/sft/olmo_7b_0924_fw2.yaml @@ -0,0 +1,24 @@ +# model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan +model_name_or_path: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw2/step11931-hf +model_revision: main +use_flash_attn: false +# tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan +tokenizer_name: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw2/step11931-hf +use_slow_tokenizer: false # olmo models only use fast tokenizers +dataset_name: allenai/llama-3-tulu-v3.3-mix-preview +max_seq_length: 4096 +preprocessing_num_workers: 128 +per_device_train_batch_size: 1 # note, this is set up for 8 GPUs +gradient_accumulation_steps: 4 # designed for 4 nodes +learning_rate: 2.0e-06 +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 3 +output_dir: /output/olmo_instruct/ +with_tracking: true +report_to: + - wandb +logging_steps: 1 +checkpointing_steps: epoch +add_bos: true \ No newline at end of file diff --git a/scripts/submit_finetune_job.py b/scripts/submit_finetune_job.py index 37df9509a..9991003f8 100644 --- a/scripts/submit_finetune_job.py +++ b/scripts/submit_finetune_job.py @@ -26,7 +26,7 @@ def main(): parser.add_argument("--image", type=str, default="nathanl/open_instruct_auto", help="Beaker image to use.") parser.add_argument("--workspace", type=str, default="ai2/tulu-2-improvements", help="Beaker workspace to use.") parser.add_argument("--mount_on_weka", type=str, default=None, help="Mount a Weka directory to the job") - parser.add_argument("--weka_mount_path", type=str, default="/models", help="Path to mount the Weka directory") + parser.add_argument("--weka_mount_path", type=str, default="/adapt-data", help="Path to mount the Weka directory") # allow unknown args from CLI, use this to modify loaded config in bash scripts for sweeping # Note, can only override args in --config passed (not default FlatArguments class in open_instruct/utils.py) @@ -173,7 +173,7 @@ def parse_args(args): d['tasks'][0]['name'] = exp_name # add cluster-specific env vars - if args.cluster == "ai2/jupiter-cirrascale-2": + if args.cluster == "ai2/jupiter-cirrascale-2" and args.num_nodes > 1: d['tasks'][0]['envVars'] += [ { "name": "NCCL_SOCKET_IFNAME", From d007da01bd5d75489fdf92d8d73c9c792eea3dc0 Mon Sep 17 00:00:00 2001 From: Nathan Lambert Date: Tue, 10 Sep 2024 16:07:33 +0000 Subject: [PATCH 10/23] add hardcode flash_attn --- open_instruct/finetune.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/open_instruct/finetune.py b/open_instruct/finetune.py index da6398f94..f0aa3cab2 100644 --- a/open_instruct/finetune.py +++ b/open_instruct/finetune.py @@ -616,6 +616,7 @@ def main(args: FlatArguments): trust_remote_code=args.trust_remote_code, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" if args.use_flash_attn else 
"eager", + flash_attention=True if args.use_flash_attn else False, # TODO remove with ai2-olmo > 0.5.0 revision=args.model_revision, token=os.getenv("HF_TOKEN", None), ) @@ -628,6 +629,7 @@ def main(args: FlatArguments): low_cpu_mem_usage=args.low_cpu_mem_usage, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" if args.use_flash_attn else "eager", + flash_attention=True if args.use_flash_attn else False, # TODO remove with ai2-olmo > 0.5.0 revision=args.model_revision, token=os.getenv("HF_TOKEN", None), ) From 5d82ea2bd5293ab19c7c8346aba20031c01f5b6e Mon Sep 17 00:00:00 2001 From: Nathan Lambert Date: Tue, 10 Sep 2024 16:28:16 +0000 Subject: [PATCH 11/23] tweaks --- open_instruct/finetune.py | 40 +++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/open_instruct/finetune.py b/open_instruct/finetune.py index f0aa3cab2..b2e9ed283 100644 --- a/open_instruct/finetune.py +++ b/open_instruct/finetune.py @@ -453,7 +453,7 @@ def main(args: FlatArguments): if check_hf_olmo_availability(): # allows AutoModel... to work with not in transformers olmo models import hf_olmo # noqa - from hf_olmo import OLMoTokenizerFast + from hf_olmo import OLMoTokenizerFast, OLMoConfig # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers @@ -616,23 +616,35 @@ def main(args: FlatArguments): trust_remote_code=args.trust_remote_code, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" if args.use_flash_attn else "eager", - flash_attention=True if args.use_flash_attn else False, # TODO remove with ai2-olmo > 0.5.0 revision=args.model_revision, token=os.getenv("HF_TOKEN", None), ) else: - model = AutoModelForCausalLM.from_pretrained( - args.model_name_or_path, - from_tf=bool(".ckpt" in args.model_name_or_path), - config=config, - trust_remote_code=args.trust_remote_code, - low_cpu_mem_usage=args.low_cpu_mem_usage, - torch_dtype=torch.bfloat16, - attn_implementation="flash_attention_2" if args.use_flash_attn else "eager", - flash_attention=True if args.use_flash_attn else False, # TODO remove with ai2-olmo > 0.5.0 - revision=args.model_revision, - token=os.getenv("HF_TOKEN", None), - ) + if (check_hf_olmo_availability() and isinstance(config, OLMoConfig)): + # handles flash_attn in config. 
TODO remove on ai2-olmo > 0.5.0 + config.flash_attention = args.use_flash_attn + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + trust_remote_code=args.trust_remote_code, + low_cpu_mem_usage=args.low_cpu_mem_usage, + torch_dtype=torch.bfloat16, + revision=args.model_revision, + token=os.getenv("HF_TOKEN", None), + ) + else: + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + trust_remote_code=args.trust_remote_code, + low_cpu_mem_usage=args.low_cpu_mem_usage, + torch_dtype=torch.bfloat16, + attn_implementation="flash_attention_2" if args.use_flash_attn else "eager", + revision=args.model_revision, + token=os.getenv("HF_TOKEN", None), + ) else: logger.info("Training new model from scratch") model = AutoModelForCausalLM.from_config(config) From 11397bf76e67d718da72ecd77856722315db338e Mon Sep 17 00:00:00 2001 From: Nathan Lambert Date: Tue, 10 Sep 2024 17:07:05 +0000 Subject: [PATCH 12/23] making it work nicely --- configs/train_configs/sft/olmo_7b_0924.yaml | 2 +- .../train_configs/sft/olmo_7b_0924_fw2.yaml | 2 +- .../sft/olmo_7b_0924_fw2_permissive.yaml | 31 +++++++++++++++++++ open_instruct/finetune.py | 9 ++++-- open_instruct/utils.py | 3 +- 5 files changed, 40 insertions(+), 7 deletions(-) create mode 100644 configs/train_configs/sft/olmo_7b_0924_fw2_permissive.yaml diff --git a/configs/train_configs/sft/olmo_7b_0924.yaml b/configs/train_configs/sft/olmo_7b_0924.yaml index 91d72a1ec..b08a6ac2a 100644 --- a/configs/train_configs/sft/olmo_7b_0924.yaml +++ b/configs/train_configs/sft/olmo_7b_0924.yaml @@ -1,7 +1,7 @@ # model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan model_name_or_path: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw3/step11931-hf model_revision: main -use_flash_attn: false +use_flash_attn: true # tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan tokenizer_name: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw3/step11931-hf use_slow_tokenizer: false # olmo models only use fast tokenizers diff --git a/configs/train_configs/sft/olmo_7b_0924_fw2.yaml b/configs/train_configs/sft/olmo_7b_0924_fw2.yaml index ff376aebf..3a40cd915 100644 --- a/configs/train_configs/sft/olmo_7b_0924_fw2.yaml +++ b/configs/train_configs/sft/olmo_7b_0924_fw2.yaml @@ -1,7 +1,7 @@ # model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan model_name_or_path: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw2/step11931-hf model_revision: main -use_flash_attn: false +use_flash_attn: true # tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan tokenizer_name: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw2/step11931-hf use_slow_tokenizer: false # olmo models only use fast tokenizers diff --git a/configs/train_configs/sft/olmo_7b_0924_fw2_permissive.yaml b/configs/train_configs/sft/olmo_7b_0924_fw2_permissive.yaml new file mode 100644 index 000000000..72d3637b6 --- /dev/null +++ b/configs/train_configs/sft/olmo_7b_0924_fw2_permissive.yaml @@ -0,0 +1,31 @@ +# model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan +model_name_or_path: 
/adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw2/step11931-hf +model_revision: main +use_flash_attn: true +# tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan +tokenizer_name: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw2/step11931-hf +use_slow_tokenizer: false # olmo models only use fast tokenizers +dataset_mixer: + ai2-adapt-dev/metamath-qa-reformat: 1.0 # MIT License + nvidia/Daring-Anteater: 1.0 # CC BY 4.0, + natolambert/tulu-v2-sft-mixture-flan: 1.0 # FLAN Apache 2.0 + natolambert/tulu-v2-sft-mixture-cot: 1.0 # FLAN Apache 2.0 + Open-Orca/OpenOrca: .02 # MIT + allenai/openassistant-guanaco-reformatted: 1.0 # Apache 2.0 + ai2-adapt-dev/codefeedback-single-turn-reformat-magicoder: 1.0 # MIT MagiCoder section of CodeFeedback +max_seq_length: 4096 +preprocessing_num_workers: 128 +per_device_train_batch_size: 1 # note, this is set up for 8 GPUs +gradient_accumulation_steps: 4 # designed for 4 nodes +learning_rate: 2.0e-06 +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 3 +output_dir: /output/olmo_instruct/ +with_tracking: true +report_to: + - wandb +logging_steps: 1 +checkpointing_steps: epoch +add_bos: true \ No newline at end of file diff --git a/open_instruct/finetune.py b/open_instruct/finetune.py index b2e9ed283..5842e6811 100644 --- a/open_instruct/finetune.py +++ b/open_instruct/finetune.py @@ -453,7 +453,7 @@ def main(args: FlatArguments): if check_hf_olmo_availability(): # allows AutoModel... to work with not in transformers olmo models import hf_olmo # noqa - from hf_olmo import OLMoTokenizerFast, OLMoConfig + from hf_olmo import OLMoConfig, OLMoTokenizerFast # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers @@ -620,7 +620,8 @@ def main(args: FlatArguments): token=os.getenv("HF_TOKEN", None), ) else: - if (check_hf_olmo_availability() and isinstance(config, OLMoConfig)): + if check_hf_olmo_availability() and isinstance(config, OLMoConfig): + logger.info("Temporary loading for recent OLMo Models") # handles flash_attn in config. TODO remove on ai2-olmo > 0.5.0 config.flash_attention = args.use_flash_attn model = AutoModelForCausalLM.from_pretrained( @@ -664,7 +665,9 @@ def main(args: FlatArguments): 0, 1, ], "LlamaTokenizer should only add one special token - the pad_token, or no tokens if pad token present." 
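The hunk below only reformats this guard across multiple lines; its correctness rests on `and` short-circuiting, since `OLMoTokenizerFast` exists only when `hf_olmo` is importable. A standalone sketch of the pattern (illustrative; `is_neox_or_olmo_tokenizer` is a hypothetical wrapper around the real helper in `open_instruct.utils`):

```python
# Sketch: isinstance() against a class that may not be importable.
from transformers import GPTNeoXTokenizerFast

from open_instruct.utils import check_hf_olmo_availability


def is_neox_or_olmo_tokenizer(tokenizer) -> bool:
    if isinstance(tokenizer, GPTNeoXTokenizerFast):
        return True
    if check_hf_olmo_availability():
        from hf_olmo import OLMoTokenizerFast  # only reached when installed
        return isinstance(tokenizer, OLMoTokenizerFast)
    return False
```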
- elif isinstance(tokenizer, GPTNeoXTokenizerFast) or (check_hf_olmo_availability() and isinstance(tokenizer, OLMoTokenizerFast)): + elif isinstance(tokenizer, GPTNeoXTokenizerFast) or ( + check_hf_olmo_availability() and isinstance(tokenizer, OLMoTokenizerFast) + ): # OLMo newer models use this tokenizer if tokenizer.bos_token is None: tokenizer.bos_token = tokenizer.eos_token diff --git a/open_instruct/utils.py b/open_instruct/utils.py index c45e5873d..b7fd1ae1a 100644 --- a/open_instruct/utils.py +++ b/open_instruct/utils.py @@ -70,10 +70,9 @@ def check_hf_olmo_availability(return_version: bool = False) -> Union[dict, bool try: package = importlib.import_module(pkg_name) package_version = getattr(package, "__version__", "N/A") - if package_version == "N/A": - package_exists = False except ImportError: package_exists = False + package_version = "N/A" if return_version: return { From ee5c7d22cce9aebfc46f5ab35d7eaa3f47e5d944 Mon Sep 17 00:00:00 2001 From: Nathan Lambert Date: Wed, 11 Sep 2024 22:13:59 +0000 Subject: [PATCH 13/23] clean --- Dockerfile.olmo | 3 +- .../default_finetune_offloading.yaml | 69 +++++++++++++++++++ configs/train_configs/sft/olmo_7b_0924.yaml | 10 ++- .../train_configs/sft/olmo_7b_0924_fw2.yaml | 24 ------- .../sft/olmo_7b_0924_fw2_permissive.yaml | 13 ++-- open_instruct/finetune.py | 52 +++++++------- 6 files changed, 110 insertions(+), 61 deletions(-) create mode 100644 configs/beaker_configs/default_finetune_offloading.yaml delete mode 100644 configs/train_configs/sft/olmo_7b_0924_fw2.yaml diff --git a/Dockerfile.olmo b/Dockerfile.olmo index 6259756ae..1f8da727d 100644 --- a/Dockerfile.olmo +++ b/Dockerfile.olmo @@ -93,7 +93,8 @@ RUN pip install flash-attn==2.5.9.post1 --no-build-isolation # for newest olmo's, move to requirements when ai2-olmo supports torch 2.4 # core is a dependency of ai2-olmo RUN pip install ai2-olmo-core==0.1.0 omegaconf -RUN pip install ai2-olmo>=0.5.0 --no-deps +# RUN pip install ai2-olmo>=0.5.0 --no-deps +RUN pip install git+https://github.com/allenai/OLMo.git@shanea/hf-olmo-gradient-checkpointing --no-deps RUN pip install -r requirements-olmo.txt # NLTK download diff --git a/configs/beaker_configs/default_finetune_offloading.yaml b/configs/beaker_configs/default_finetune_offloading.yaml new file mode 100644 index 000000000..4722b4e13 --- /dev/null +++ b/configs/beaker_configs/default_finetune_offloading.yaml @@ -0,0 +1,69 @@ +version: v2 +description: open-instruct-finetune +budget: ai2/oe-adapt +tasks: + - name: open-instruct-finetune + image: + beaker: nathanl/open_instruct_auto + command: [ + '/bin/sh', '-c' + ] + arguments: ['PYTHONPATH="/stage:$PYTHONPATH" accelerate launch + --mixed_precision bf16 + --num_machines 1 + --num_processes 4 + --use_deepspeed + --deepspeed_config_file configs/ds_configs/stage3_offloading_accelerate.conf + open_instruct/finetune.py + --model_name_or_path /hf_llama_models + --use_flash_attn + --tokenizer_name /hf_llama_models + --max_seq_length 2048 + --preprocessing_num_workers 16 + --per_device_train_batch_size 2 + --gradient_accumulation_steps 16 + --learning_rate 2e-5 + --lr_scheduler_type linear + --warmup_ratio 0.03 + --weight_decay 0. 
+ --num_train_epochs 2 + --output_dir /output/ + --with_tracking + --report_to tensorboard + --logging_steps 1 + '] + envVars: + - name: CUDA_DEVICE_ORDER + value: PCI_BUS_ID + - name: TRANSFORMERS_CACHE + value: ./cache/ + - name: WANDB_API_KEY + secret: WANDB_API_KEY + - name: WANDB_PROJECT + value: open-instruct + - name: WANDB_WATCH + value: false + - name: WANDB_LOG_MODEL + value: false + - name: WANDB_DISABLED + value: true + - name: HF_TOKEN + secret: HF_TOKEN + # datasets: # example for how to include datasets in mounting + # - mountPath: /data + # source: + # beaker: Yizhongw03/processed_open_instruct_data + # - mountPath: /mmlu + # source: + # beaker: Yizhongw03/mmlu + # - mountPath: /hf_llama_models + # source: + # beaker: Yizhongw03/hf_llama_model_7B + result: + path: /output + resources: + gpuCount: 4 + context: + cluster: ai2/allennlp-cirrascale + priority: high + preemptible: false \ No newline at end of file diff --git a/configs/train_configs/sft/olmo_7b_0924.yaml b/configs/train_configs/sft/olmo_7b_0924.yaml index b08a6ac2a..7f62e1dca 100644 --- a/configs/train_configs/sft/olmo_7b_0924.yaml +++ b/configs/train_configs/sft/olmo_7b_0924.yaml @@ -1,15 +1,13 @@ -# model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan -model_name_or_path: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw3/step11931-hf +model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan model_revision: main use_flash_attn: true -# tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan -tokenizer_name: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw3/step11931-hf +tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan use_slow_tokenizer: false # olmo models only use fast tokenizers dataset_name: allenai/llama-3-tulu-v3.3-mix-preview max_seq_length: 4096 preprocessing_num_workers: 128 -per_device_train_batch_size: 1 # note, this is set up for 8 GPUs -gradient_accumulation_steps: 8 +per_device_train_batch_size: 1 +gradient_accumulation_steps: 8 # should run with this set to 16 for 1 node only learning_rate: 2.0e-06 lr_scheduler_type: linear warmup_ratio: 0.03 diff --git a/configs/train_configs/sft/olmo_7b_0924_fw2.yaml b/configs/train_configs/sft/olmo_7b_0924_fw2.yaml deleted file mode 100644 index 3a40cd915..000000000 --- a/configs/train_configs/sft/olmo_7b_0924_fw2.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan -model_name_or_path: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw2/step11931-hf -model_revision: main -use_flash_attn: true -# tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan -tokenizer_name: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw2/step11931-hf -use_slow_tokenizer: false # olmo models only use fast tokenizers -dataset_name: allenai/llama-3-tulu-v3.3-mix-preview -max_seq_length: 4096 -preprocessing_num_workers: 128 -per_device_train_batch_size: 1 # note, this is set up for 8 GPUs -gradient_accumulation_steps: 4 # designed for 4 nodes -learning_rate: 2.0e-06 -lr_scheduler_type: linear -warmup_ratio: 0.03 -weight_decay: 0.0 -num_train_epochs: 3 -output_dir: /output/olmo_instruct/ -with_tracking: true -report_to: - - wandb -logging_steps: 1 
-checkpointing_steps: epoch -add_bos: true \ No newline at end of file diff --git a/configs/train_configs/sft/olmo_7b_0924_fw2_permissive.yaml b/configs/train_configs/sft/olmo_7b_0924_fw2_permissive.yaml index 72d3637b6..4539f713a 100644 --- a/configs/train_configs/sft/olmo_7b_0924_fw2_permissive.yaml +++ b/configs/train_configs/sft/olmo_7b_0924_fw2_permissive.yaml @@ -7,22 +7,27 @@ tokenizer_name: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from use_slow_tokenizer: false # olmo models only use fast tokenizers dataset_mixer: ai2-adapt-dev/metamath-qa-reformat: 1.0 # MIT License - nvidia/Daring-Anteater: 1.0 # CC BY 4.0, natolambert/tulu-v2-sft-mixture-flan: 1.0 # FLAN Apache 2.0 natolambert/tulu-v2-sft-mixture-cot: 1.0 # FLAN Apache 2.0 - Open-Orca/OpenOrca: .02 # MIT allenai/openassistant-guanaco-reformatted: 1.0 # Apache 2.0 ai2-adapt-dev/codefeedback-single-turn-reformat-magicoder: 1.0 # MIT MagiCoder section of CodeFeedback + ai2-adapt-dev/aya_dataset-reformat: 1.0 # Apache 2.0 + ai2-adapt-dev/SlimOrca-reformat: 0.25 # MIT License + ai2-adapt-dev/Daring-Anteater-reformat: 1.0 # CC BY 4.0 + ai2-adapt-dev/WebInstructSub-reformat-apache: 0.1 # Apache 2.0 + ai2-adapt-dev/Table-GPT-All-train: 0.5 # MIT max_seq_length: 4096 preprocessing_num_workers: 128 -per_device_train_batch_size: 1 # note, this is set up for 8 GPUs +per_device_train_batch_size: 1 gradient_accumulation_steps: 4 # designed for 4 nodes +# gradient_accumulation_steps: 16 # designed for 1 nodes +gradient_checkpointing: true learning_rate: 2.0e-06 lr_scheduler_type: linear warmup_ratio: 0.03 weight_decay: 0.0 num_train_epochs: 3 -output_dir: /output/olmo_instruct/ +output_dir: /output/ with_tracking: true report_to: - wandb diff --git a/open_instruct/finetune.py b/open_instruct/finetune.py index 5842e6811..f771790f8 100644 --- a/open_instruct/finetune.py +++ b/open_instruct/finetune.py @@ -620,32 +620,32 @@ def main(args: FlatArguments): token=os.getenv("HF_TOKEN", None), ) else: - if check_hf_olmo_availability() and isinstance(config, OLMoConfig): - logger.info("Temporary loading for recent OLMo Models") - # handles flash_attn in config. TODO remove on ai2-olmo > 0.5.0 - config.flash_attention = args.use_flash_attn - model = AutoModelForCausalLM.from_pretrained( - args.model_name_or_path, - from_tf=bool(".ckpt" in args.model_name_or_path), - config=config, - trust_remote_code=args.trust_remote_code, - low_cpu_mem_usage=args.low_cpu_mem_usage, - torch_dtype=torch.bfloat16, - revision=args.model_revision, - token=os.getenv("HF_TOKEN", None), - ) - else: - model = AutoModelForCausalLM.from_pretrained( - args.model_name_or_path, - from_tf=bool(".ckpt" in args.model_name_or_path), - config=config, - trust_remote_code=args.trust_remote_code, - low_cpu_mem_usage=args.low_cpu_mem_usage, - torch_dtype=torch.bfloat16, - attn_implementation="flash_attention_2" if args.use_flash_attn else "eager", - revision=args.model_revision, - token=os.getenv("HF_TOKEN", None), - ) + # if check_hf_olmo_availability() and isinstance(config, OLMoConfig): + # logger.info("Temporary loading for recent OLMo Models") + # # handles flash_attn in config. 
TODO remove on ai2-olmo > 0.5.0 + # config.flash_attention = args.use_flash_attn + # model = AutoModelForCausalLM.from_pretrained( + # args.model_name_or_path, + # from_tf=bool(".ckpt" in args.model_name_or_path), + # config=config, + # trust_remote_code=args.trust_remote_code, + # low_cpu_mem_usage=args.low_cpu_mem_usage, + # torch_dtype=torch.bfloat16, + # revision=args.model_revision, + # token=os.getenv("HF_TOKEN", None), + # ) + # else: + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + trust_remote_code=args.trust_remote_code, + low_cpu_mem_usage=args.low_cpu_mem_usage, + torch_dtype=torch.bfloat16, + attn_implementation="flash_attention_2" if args.use_flash_attn else "eager", + revision=args.model_revision, + token=os.getenv("HF_TOKEN", None), + ) else: logger.info("Training new model from scratch") model = AutoModelForCausalLM.from_config(config) From fd16aeef6e97de44b938b424db5ccab460b53233 Mon Sep 17 00:00:00 2001 From: Nathan Lambert Date: Wed, 11 Sep 2024 22:14:44 +0000 Subject: [PATCH 14/23] clean --- open_instruct/finetune.py | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/open_instruct/finetune.py b/open_instruct/finetune.py index f771790f8..488d4e551 100644 --- a/open_instruct/finetune.py +++ b/open_instruct/finetune.py @@ -453,7 +453,7 @@ def main(args: FlatArguments): if check_hf_olmo_availability(): # allows AutoModel... to work with not in transformers olmo models import hf_olmo # noqa - from hf_olmo import OLMoConfig, OLMoTokenizerFast + from hf_olmo import OLMoTokenizerFast # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers @@ -620,21 +620,6 @@ def main(args: FlatArguments): token=os.getenv("HF_TOKEN", None), ) else: - # if check_hf_olmo_availability() and isinstance(config, OLMoConfig): - # logger.info("Temporary loading for recent OLMo Models") - # # handles flash_attn in config. 
From fa447f8e8c3e6ef0e6cfe6958398e6187054d203 Mon Sep 17 00:00:00 2001
From: Nathan Lambert
Date: Wed, 11 Sep 2024 22:16:51 +0000
Subject: [PATCH 15/23] clean

---
 Dockerfile | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 9f4e1fb00..dd6b95a97 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -90,10 +90,6 @@ RUN pip install --upgrade pip "setuptools<70.0.0" wheel
 RUN pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121
 RUN pip install packaging
 RUN pip install flash-attn==2.6.3 --no-build-isolation
-# for newest olmo's, move to requirements when ai2-olmo supports torch 2.4
-# core is a dependency of ai2-olmo
-# RUN pip install ai2-olmo-core==0.1.0 omegaconf
-# RUN pip install ai2-olmo>=0.5.0 --no-deps
 RUN pip install -r requirements.txt

 # NLTK download

From e02e98547eb0f929837002c7191e9ee29f581117 Mon Sep 17 00:00:00 2001
From: Nathan Lambert
Date: Wed, 11 Sep 2024 22:26:17 +0000
Subject: [PATCH 16/23] up

---
 Dockerfile.olmo | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Dockerfile.olmo b/Dockerfile.olmo
index 1f8da727d..040abad5c 100644
--- a/Dockerfile.olmo
+++ b/Dockerfile.olmo
@@ -94,6 +94,7 @@ RUN pip install flash-attn==2.5.9.post1 --no-build-isolation
 # core is a dependency of ai2-olmo
 RUN pip install ai2-olmo-core==0.1.0 omegaconf
 # RUN pip install ai2-olmo>=0.5.0 --no-deps
+# TODO: update once https://github.com/allenai/OLMo/pull/719 is merged, then pin the next release
 RUN pip install git+https://github.com/allenai/OLMo.git@shanea/hf-olmo-gradient-checkpointing --no-deps
 RUN pip install -r requirements-olmo.txt

From a0a32bf0b4f73d3756b767d38c947dbf9a422cdb Mon Sep 17 00:00:00 2001
From: Nathan Lambert
Date: Wed, 11 Sep 2024 23:15:32 +0000
Subject: [PATCH 17/23] no longer install from branch

---
 Dockerfile.olmo | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.olmo b/Dockerfile.olmo
index 040abad5c..10e2d9f12 100644
--- a/Dockerfile.olmo
+++ b/Dockerfile.olmo
@@ -95,7 +95,7 @@ RUN pip install flash-attn==2.5.9.post1 --no-build-isolation
 RUN pip install ai2-olmo-core==0.1.0 omegaconf
 # RUN pip install ai2-olmo>=0.5.0 --no-deps
 # TODO: update once https://github.com/allenai/OLMo/pull/719 is merged, then pin the next release
-RUN pip install git+https://github.com/allenai/OLMo.git@shanea/hf-olmo-gradient-checkpointing --no-deps
+RUN pip install git+https://github.com/allenai/OLMo.git@47f8f5abb40eb100c6623be12e1648c841b2ab99 --no-deps
 RUN pip install -r requirements-olmo.txt

 # NLTK download
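Pinning the OLMo install to a commit SHA with `--no-deps` keeps the image reproducible, but it is easy for the pins to drift out of sync with the rest of the environment. A small, hypothetical startup check mirroring the pins in Dockerfile.olmo (this helper is not part of the repo):

```python
# Hypothetical sanity check that pinned packages are present at runtime.
from importlib.metadata import PackageNotFoundError, version

def assert_installed(pkg, expected=None):
    try:
        found = version(pkg)
    except PackageNotFoundError as err:
        raise RuntimeError(f"{pkg} is not installed") from err
    if expected is not None and found != expected:
        raise RuntimeError(f"{pkg}=={found}, expected {expected}")

assert_installed("ai2-olmo-core", "0.1.0")
assert_installed("ai2-olmo")  # installed from a pinned git SHA; just check presence
```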
From 503e61e7d4d3b0eabd29c2015ac1fbe491281333 Mon Sep 17 00:00:00 2001
From: Nathan Lambert
Date: Mon, 16 Sep 2024 19:49:46 +0000
Subject: [PATCH 18/23] fixes

---
 configs/train_configs/sft/olmo_7b_0924.yaml |  2 +-
 .../sft/olmo_7b_0924_fw2_tulu_v3.4.yaml     | 26 +++++++++++++++++++
 open_instruct/mix_data.py                   |  6 +++--
 3 files changed, 31 insertions(+), 3 deletions(-)
 create mode 100644 configs/train_configs/sft/olmo_7b_0924_fw2_tulu_v3.4.yaml

diff --git a/configs/train_configs/sft/olmo_7b_0924.yaml b/configs/train_configs/sft/olmo_7b_0924.yaml
index 7f62e1dca..e8264bd0a 100644
--- a/configs/train_configs/sft/olmo_7b_0924.yaml
+++ b/configs/train_configs/sft/olmo_7b_0924.yaml
@@ -13,7 +13,7 @@ lr_scheduler_type: linear
 warmup_ratio: 0.03
 weight_decay: 0.0
 num_train_epochs: 3
-output_dir: /output/olmo_instruct/
+output_dir: /output/
 with_tracking: true
 report_to:
   - wandb
diff --git a/configs/train_configs/sft/olmo_7b_0924_fw2_tulu_v3.4.yaml b/configs/train_configs/sft/olmo_7b_0924_fw2_tulu_v3.4.yaml
new file mode 100644
index 000000000..3efd900c6
--- /dev/null
+++ b/configs/train_configs/sft/olmo_7b_0924_fw2_tulu_v3.4.yaml
@@ -0,0 +1,26 @@
+# model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan
+model_name_or_path: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw2/step11931-hf
+model_revision: main
+use_flash_attn: true
+# tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan
+tokenizer_name: /adapt-data/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-dclm07-fw2/step11931-hf
+use_slow_tokenizer: false # olmo models only use fast tokenizers
+dataset_name: allenai/tulu-v3.4-mix-preview
+max_seq_length: 4096
+preprocessing_num_workers: 128
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 4 # designed for 4 nodes
+# gradient_accumulation_steps: 16 # designed for 1 node
+gradient_checkpointing: true
+learning_rate: 2.0e-06
+lr_scheduler_type: linear
+warmup_ratio: 0.03
+weight_decay: 0.0
+num_train_epochs: 3
+output_dir: /output/
+with_tracking: true
+report_to:
+  - wandb
+logging_steps: 1
+checkpointing_steps: epoch
+add_bos: true
\ No newline at end of file
diff --git a/open_instruct/mix_data.py b/open_instruct/mix_data.py
index 6e7260c8f..c34bf0274 100644
--- a/open_instruct/mix_data.py
+++ b/open_instruct/mix_data.py
@@ -15,11 +15,13 @@
 # limitations under the License.

 # script for mixing and saving data
-from .utils import ArgumentParserPlus, FlatArguments, get_datasets
+from open_instruct.utils import ArgumentParserPlus, get_datasets
+from open_instruct.finetune import FlatArguments

 # Run as module for local imports, e.g.:
-# python -m open_instruct.mix_data configs/train_configs/sft/default.yaml --dataset_mix_dir=output/tmp/
+# python open_instruct/mix_data.py configs/train_configs/sft/tulu3_8b_preview_mix_v3.4.yaml --dataset_mix_dir=output/tmp/
 # can pass --save_to_hub=allenai/tulu-v3.1-mix-preview-4096-OLMoE
+# note that '=' is needed with our argparser


 def main():
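The commented `gradient_accumulation_steps` variants in the new config trade node count against accumulation so the global batch size stays fixed. Assuming 8 GPUs per node, as the earlier config comments note, the arithmetic checks out:

```python
# Global batch = per-device batch * accumulation steps * total GPUs;
# assumes 8 GPUs per node, per the comments in these configs.
def global_batch_size(per_device, accum, nodes, gpus_per_node=8):
    return per_device * accum * nodes * gpus_per_node

assert global_batch_size(1, 4, nodes=4) == 128   # 4-node setting
assert global_batch_size(1, 16, nodes=1) == 128  # 1-node setting, same global batch
```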
From 982726435268af12ade94d8713a39e747e0a413b Mon Sep 17 00:00:00 2001
From: Nathan Lambert
Date: Tue, 24 Sep 2024 00:52:04 +0000
Subject: [PATCH 19/23] dpo config

---
 configs/train_configs/dpo/olmo_7b_0924.yaml | 29 +++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 configs/train_configs/dpo/olmo_7b_0924.yaml

diff --git a/configs/train_configs/dpo/olmo_7b_0924.yaml b/configs/train_configs/dpo/olmo_7b_0924.yaml
new file mode 100644
index 000000000..3028bfca8
--- /dev/null
+++ b/configs/train_configs/dpo/olmo_7b_0924.yaml
@@ -0,0 +1,29 @@
+model_name_or_path: /model
+model_revision: main
+use_flash_attn: true
+gradient_checkpointing: true
+dataset_mixer:
+  allenai/ultrafeedback_binarized_cleaned_train: 1.0
+  ai2-adapt-dev/DaringAnteater-prefs-RM-filter: 1.0
+  ai2-adapt-dev/WildChat-prefs-280824: 1.0
+  allenai/tulu-3-hardcoded-preferences: 1.0
+tokenizer_name: /model
+use_slow_tokenizer: true
+max_seq_length: 2048
+preprocessing_num_workers: 16
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 16 # designed for 8 GPUs, so batch size 128
+learning_rate: 5.0e-7
+lr_scheduler_type: linear
+warmup_ratio: 0.1
+weight_decay: 0.0
+num_train_epochs: 1
+output_dir: /output
+with_tracking: true
+report_to:
+  - wandb
+logging_steps: 1
+use_lora: false
+dpo_loss_type: dpo_norm
+dpo_beta: 5
+checkpointing_steps: 1000
\ No newline at end of file
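`dpo_loss_type: dpo_norm` with `dpo_beta: 5` selects a normalized variant of the DPO objective. A minimal sketch of the standard DPO loss this is built on; reading `dpo_norm` as "length-normalized log-probs" is my assumption, not something this patch states:

```python
# Minimal DPO loss sketch; inputs are the summed (or, for the assumed "norm"
# variant, length-normalized) log-probs of each response under policy and reference.
import torch
import torch.nn.functional as F

def dpo_loss(pi_chosen, pi_rejected, ref_chosen, ref_rejected, beta=5.0):
    chosen_reward = beta * (pi_chosen - ref_chosen)
    rejected_reward = beta * (pi_rejected - ref_rejected)
    # push the implicit reward of the chosen response above the rejected one
    return -F.logsigmoid(chosen_reward - rejected_reward).mean()

loss = dpo_loss(torch.tensor([-1.0]), torch.tensor([-2.0]),
                torch.tensor([-1.1]), torch.tensor([-1.9]))
```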
From b175717e14ea64506485f7261cc1d02d921d5c89 Mon Sep 17 00:00:00 2001
From: Nathan Lambert
Date: Tue, 24 Sep 2024 16:30:10 +0000
Subject: [PATCH 20/23] temp olmo changes

---
 Dockerfile.olmo                                           | 4 ++++
 configs/train_configs/sft/olmo_7b_0924_fw2_tulu_v3.4.yaml | 4 +++-
 scripts/eval/oe-eval.sh                                   | 5 ++++-
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/Dockerfile.olmo b/Dockerfile.olmo
index 10e2d9f12..40ef4377a 100644
--- a/Dockerfile.olmo
+++ b/Dockerfile.olmo
@@ -98,6 +98,10 @@ RUN pip install ai2-olmo-core==0.1.0 omegaconf
 RUN pip install git+https://github.com/allenai/OLMo.git@47f8f5abb40eb100c6623be12e1648c841b2ab99 --no-deps
 RUN pip install -r requirements-olmo.txt

+RUN pip install git+https://github.com/AkshitaB/vllm.git
+RUN pip install vllm-flash-attn
+
+
 # NLTK download
 RUN python -m nltk.downloader punkt
 COPY open_instruct open_instruct
diff --git a/configs/train_configs/sft/olmo_7b_0924_fw2_tulu_v3.4.yaml b/configs/train_configs/sft/olmo_7b_0924_fw2_tulu_v3.4.yaml
index 3efd900c6..491fc4502 100644
--- a/configs/train_configs/sft/olmo_7b_0924_fw2_tulu_v3.4.yaml
+++ b/configs/train_configs/sft/olmo_7b_0924_fw2_tulu_v3.4.yaml
@@ -9,7 +9,8 @@ dataset_name: allenai/tulu-v3.4-mix-preview
 max_seq_length: 4096
 preprocessing_num_workers: 128
 per_device_train_batch_size: 1
-gradient_accumulation_steps: 4 # designed for 4 nodes
+# gradient_accumulation_steps: 4 # designed for 4 nodes
+gradient_accumulation_steps: 8 # designed for 2 nodes
 # gradient_accumulation_steps: 16 # designed for 1 node
 gradient_checkpointing: true
 learning_rate: 2.0e-06
@@ -19,6 +20,7 @@ weight_decay: 0.0
 num_train_epochs: 3
 output_dir: /output/
 with_tracking: true
+reduce_loss: mean
 report_to:
   - wandb
 logging_steps: 1
diff --git a/scripts/eval/oe-eval.sh b/scripts/eval/oe-eval.sh
index e250faaef..3cb3a68e1 100755
--- a/scripts/eval/oe-eval.sh
+++ b/scripts/eval/oe-eval.sh
@@ -92,7 +92,10 @@ TASKS=(
     "alpaca_eval_v2::tulu"
     "truthfulqa::tulu"
 )
-MODEL_TYPE="--model-type vllm"
+# For models without vLLM support (experimental architectures), comment out
+# the vLLM model type below and set GPU_COUNT_OTHER to 1; also consider
+# lowering the batch size (a vLLM arg), maybe to 5, since vLLM handles batching differently
+# MODEL_TYPE="--model-type vllm"
 BATCH_SIZE_VLLM=10000
 BATCH_SIZE_OTHER=1
 GPU_COUNT=1
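The `reduce_loss: mean` flag added above chooses how token losses are aggregated: "mean" averages over non-padding tokens, while a "sum"-style reduction weights long sequences more heavily. A hedged sketch of that distinction, under the usual transformers labeling convention; this is not the repo's exact training loop:

```python
# Sketch of mean vs. sum loss reduction over non-padding tokens;
# labels == -100 marks tokens to ignore, as in transformers.
import torch
import torch.nn.functional as F

def masked_lm_loss(logits, labels, reduce_loss="mean"):
    per_token = F.cross_entropy(
        logits.view(-1, logits.size(-1)), labels.view(-1),
        ignore_index=-100, reduction="none",
    )
    mask = (labels.view(-1) != -100).float()
    total = (per_token * mask).sum()
    return total / mask.sum() if reduce_loss == "mean" else total
```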
From 281e28eb245648dc9f3dc4b38c664d1f0f06f2f7 Mon Sep 17 00:00:00 2001
From: nouhadziri
Date: Mon, 4 Nov 2024 15:05:34 -0500
Subject: [PATCH 21/23] add olmo training

---
 .../sft/{ => olmo}/olmo_7b_0924.yaml        |  0
 .../sft/olmo/olmo_7b_0924_v3.9_safety.yaml  | 24 +++++++++++++++++++
 open_instruct/olmo/scripts/sft/olmo_test.sh |  7 ++++++
 3 files changed, 31 insertions(+)
 rename configs/train_configs/sft/{ => olmo}/olmo_7b_0924.yaml (100%)
 create mode 100644 configs/train_configs/sft/olmo/olmo_7b_0924_v3.9_safety.yaml
 create mode 100644 open_instruct/olmo/scripts/sft/olmo_test.sh

diff --git a/configs/train_configs/sft/olmo_7b_0924.yaml b/configs/train_configs/sft/olmo/olmo_7b_0924.yaml
similarity index 100%
rename from configs/train_configs/sft/olmo_7b_0924.yaml
rename to configs/train_configs/sft/olmo/olmo_7b_0924.yaml
diff --git a/configs/train_configs/sft/olmo/olmo_7b_0924_v3.9_safety.yaml b/configs/train_configs/sft/olmo/olmo_7b_0924_v3.9_safety.yaml
new file mode 100644
index 000000000..49f926e9f
--- /dev/null
+++ b/configs/train_configs/sft/olmo/olmo_7b_0924_v3.9_safety.yaml
@@ -0,0 +1,24 @@
+model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan
+model_revision: main
+use_flash_attn: true
+tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan
+use_slow_tokenizer: false # olmo models only use fast tokenizers
+dataset_mixer:
+  ai2-adapt-dev/synthetic-cot-wildguarmixtrain: 86759
+  ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_unused: 209574 # all
+max_seq_length: 4096
+preprocessing_num_workers: 128
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8 # should run with this set to 16 for 1 node only
+learning_rate: 2.0e-06
+lr_scheduler_type: linear
+warmup_ratio: 0.03
+weight_decay: 0.0
+num_train_epochs: 3
+output_dir: /output/
+with_tracking: true
+report_to:
+  - wandb
+logging_steps: 1
+checkpointing_steps: epoch
+add_bos: true
\ No newline at end of file
diff --git a/open_instruct/olmo/scripts/sft/olmo_test.sh b/open_instruct/olmo/scripts/sft/olmo_test.sh
new file mode 100644
index 000000000..f8ff7b63e
--- /dev/null
+++ b/open_instruct/olmo/scripts/sft/olmo_test.sh
@@ -0,0 +1,7 @@
+python scripts/submit_finetune_job.py \
+    --default_beaker_config configs/beaker_configs/default_finetune_offloading.yaml \
+    --config configs/train_configs/sft/olmo_7b_0924_v3.9_safety.yaml \
+    --cluster ai2/jupiter-cirrascale-2 \
+    --priority high \
+    --exp_name nd-SFT-olmo_7b_0924_v3.9_safety \
+    --num_gpus 8
\ No newline at end of file

From e8ddd567fb4f16d69341499fe81328a5d1dc6df8 Mon Sep 17 00:00:00 2001
From: nouhadziri
Date: Mon, 4 Nov 2024 15:15:03 -0500
Subject: [PATCH 22/23] fix dir in config

---
 open_instruct/olmo/scripts/sft/olmo_test.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/open_instruct/olmo/scripts/sft/olmo_test.sh b/open_instruct/olmo/scripts/sft/olmo_test.sh
index f8ff7b63e..73c4b8153 100644
--- a/open_instruct/olmo/scripts/sft/olmo_test.sh
+++ b/open_instruct/olmo/scripts/sft/olmo_test.sh
@@ -1,6 +1,6 @@
 python scripts/submit_finetune_job.py \
     --default_beaker_config configs/beaker_configs/default_finetune_offloading.yaml \
-    --config configs/train_configs/sft/olmo_7b_0924_v3.9_safety.yaml \
+    --config configs/train_configs/sft/olmo/olmo_7b_0924_v3.9_safety.yaml \
     --cluster ai2/jupiter-cirrascale-2 \
     --priority high \
     --exp_name nd-SFT-olmo_7b_0924_v3.9_safety \

From abd25bf81a5194b28cbd56ee3af8a7ffe7a5d46b Mon Sep 17 00:00:00 2001
From: nouhadziri
Date: Mon, 4 Nov 2024 16:04:59 -0500
Subject: [PATCH 23/23] rollback my changes

---
 .../sft/olmo/olmo_7b_0924_v3.9_safety.yaml  | 24 -------------------
 .../sft/{olmo => }/olmo_7b_0924.yaml        |  0
 open_instruct/olmo/scripts/sft/olmo_test.sh |  7 ------
 3 files changed, 31 deletions(-)
 delete mode 100644 configs/train_configs/sft/olmo/olmo_7b_0924_v3.9_safety.yaml
 rename configs/train_configs/sft/{olmo => }/olmo_7b_0924.yaml (100%)
 delete mode 100644 open_instruct/olmo/scripts/sft/olmo_test.sh

diff --git a/configs/train_configs/sft/olmo/olmo_7b_0924_v3.9_safety.yaml b/configs/train_configs/sft/olmo/olmo_7b_0924_v3.9_safety.yaml
deleted file mode 100644
index 49f926e9f..000000000
--- a/configs/train_configs/sft/olmo/olmo_7b_0924_v3.9_safety.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-model_name_or_path: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan
-model_revision: main
-use_flash_attn: true
-tokenizer_name: ai2-adapt-dev/OLMo-medium-peteish7-anneal-from-928646-50B-nowup-dclm07-flan
-use_slow_tokenizer: false # olmo models only use fast tokenizers
-dataset_mixer:
-  ai2-adapt-dev/synthetic-cot-wildguarmixtrain: 86759
-  ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_unused: 209574 # all
-max_seq_length: 4096
-preprocessing_num_workers: 128
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8 # should run with this set to 16 for 1 node only
-learning_rate: 2.0e-06
-lr_scheduler_type: linear
-warmup_ratio: 0.03
-weight_decay: 0.0
-num_train_epochs: 3
-output_dir: /output/
-with_tracking: true
-report_to:
-  - wandb
-logging_steps: 1
-checkpointing_steps: epoch
-add_bos: true
\ No newline at end of file
diff --git a/configs/train_configs/sft/olmo/olmo_7b_0924.yaml b/configs/train_configs/sft/olmo_7b_0924.yaml
similarity index 100%
rename from configs/train_configs/sft/olmo/olmo_7b_0924.yaml
rename to configs/train_configs/sft/olmo_7b_0924.yaml
diff --git a/open_instruct/olmo/scripts/sft/olmo_test.sh b/open_instruct/olmo/scripts/sft/olmo_test.sh
deleted file mode 100644
index 73c4b8153..000000000
--- a/open_instruct/olmo/scripts/sft/olmo_test.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-python scripts/submit_finetune_job.py \
-    --default_beaker_config configs/beaker_configs/default_finetune_offloading.yaml \
-    --config configs/train_configs/sft/olmo/olmo_7b_0924_v3.9_safety.yaml \
-    --cluster ai2/jupiter-cirrascale-2 \
-    --priority high \
-    --exp_name nd-SFT-olmo_7b_0924_v3.9_safety \
-    --num_gpus 8
\ No newline at end of file