Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add re-sft scripts #14

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions eval-scripts/eval_13b_base.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/bin/bash

# Evaluate 13B base models (exp6, exp8) with llm-jp-eval and log results to W&B.
# Expects the llm-jp-eval checkout (with its venv) next to this script.

set -euxo pipefail

SCRIPT_PATH="llm-jp-eval/"
cd "$SCRIPT_PATH"

source venv/bin/activate
# Default to GPU 0 unless the caller overrides CUDA_VISIBLE_DEVICES.
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"

# Evaluate procedure
# Arguments:
#   $1 - path to the target model (also used as the tokenizer path)
#   $2 - W&B run name
function evaluate_13b_model () {
    local target_model=$1
    local wandb_name=$2

    local wandb_entity="llm-jp"
    local wandb_project="13b-sft-and-eval"

    python scripts/evaluate_llm.py \
        model.pretrained_model_name_or_path="$target_model" \
        tokenizer.pretrained_model_name_or_path="$target_model" \
        target_dataset=all \
        wandb.run_name="$wandb_name" \
        wandb.entity="$wandb_entity" \
        wandb.project="$wandb_project" \
        wandb.log=true
}

# exp6
TARGET_MODEL="/model/13B_HF/llm-jp-13b-cc-v2-63500step.code10K_en20K_ja30K_ver2.2" # exp6
WANDB_NAME="eval-exp6-base"
evaluate_13b_model "$TARGET_MODEL" "$WANDB_NAME"

# exp8
TARGET_MODEL="/model/13B_HF/llm-jp-13b-cc-v2-beta-61000step.code20K_en40K_ja60K_ver2.2" # exp8
WANDB_NAME="eval-exp8-base"
evaluate_13b_model "$TARGET_MODEL" "$WANDB_NAME"
37 changes: 37 additions & 0 deletions eval-scripts/eval_13b_sft-full-jaster.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/bin/bash

# Evaluate 13B full-parameter-SFT (jaster) models (exp6, exp8) with llm-jp-eval
# and log results to W&B. Expects the llm-jp-eval checkout next to this script.

set -euxo pipefail

SCRIPT_PATH="llm-jp-eval/"
cd "$SCRIPT_PATH"

source venv/bin/activate
# Default to GPU 0 unless the caller overrides CUDA_VISIBLE_DEVICES.
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"

# Evaluate procedure
# Arguments:
#   $1 - path to the target model (also used as the tokenizer path)
#   $2 - W&B run name
function evaluate_13b_model () {
    local target_model=$1
    local wandb_name=$2

    local wandb_entity="llm-jp"
    local wandb_project="13b-sft-and-eval"

    python scripts/evaluate_llm.py \
        model.pretrained_model_name_or_path="$target_model" \
        tokenizer.pretrained_model_name_or_path="$target_model" \
        target_dataset=all \
        wandb.run_name="$wandb_name" \
        wandb.entity="$wandb_entity" \
        wandb.project="$wandb_project" \
        wandb.log=true
}

# exp6
TARGET_MODEL="/model/13B_HF_RE/llm-jp-13b-cc-v2-63500step.code10K_en20K_ja30K_ver2.2_full-jaster" # exp6
WANDB_NAME="eval-exp6-full-jaster"
evaluate_13b_model "$TARGET_MODEL" "$WANDB_NAME"

# exp8
TARGET_MODEL="/model/13B_HF_RE/llm-jp-13b-cc-v2-beta-61000step.code20K_en40K_ja60K_ver2.2_full-jaster" # exp8
WANDB_NAME="eval-exp8-full-jaster"
evaluate_13b_model "$TARGET_MODEL" "$WANDB_NAME"
66 changes: 66 additions & 0 deletions eval-scripts/eval_7b_base.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/bin/bash

# Evaluate 7B base models with llm-jp-eval and log results to W&B.
# Earlier experiments (exp1-exp7) are kept below, commented out, as a record.

set -euxo pipefail

# SCRIPT_PATH="llm-jp-eval/"
SCRIPT_PATH="/model/sosuke/llm-jp-eval-v1.1.0"
cd "$SCRIPT_PATH"

source venv/bin/activate
# Default to GPU 0 unless the caller overrides CUDA_VISIBLE_DEVICES.
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"

# Evaluate procedure
# Arguments:
#   $1 - path to the target model (also used as the tokenizer path)
#   $2 - W&B run name
function evaluate_7b_model () {
    local target_model=$1
    local wandb_name=$2

    local wandb_entity="llm-jp"
    local wandb_project="7b-sft-and-eval"

    python scripts/evaluate_llm.py \
        model.pretrained_model_name_or_path="$target_model" \
        tokenizer.pretrained_model_name_or_path="$target_model" \
        target_dataset=all \
        wandb.run_name="$wandb_name" \
        wandb.entity="$wandb_entity" \
        wandb.project="$wandb_project" \
        wandb.log=true
}

# # exp1
# TARGET_MODEL="/model/7B_HF/llm-jp-7b-61000step.code20K_en40K_ja60K_ver2.2" # exp1
# WANDB_NAME="eval-exp1-base"
# evaluate_7b_model "$TARGET_MODEL" "$WANDB_NAME"

# # exp2
# TARGET_MODEL="/model/7B_HF/llm-jp-7b-okazaki-lab-cc-63500step.code10K_en20K_ja30K_ver2.2" # exp2
# WANDB_NAME="eval-exp2-base"
# evaluate_7b_model "$TARGET_MODEL" "$WANDB_NAME"

# # exp3
# TARGET_MODEL="/model/7B_HF/llm-jp-7b-63500step.code10K_en20K_ja30K_ver2.2" # exp3
# WANDB_NAME="eval-exp3-base"
# evaluate_7b_model "$TARGET_MODEL" "$WANDB_NAME"

# # exp4
# TARGET_MODEL="/model/7B_HF/llm-jp-7b-cc-v2-63500step.code10K_en20K_ja30K_ver2.2" # exp4
# WANDB_NAME="eval-exp4-base"
# evaluate_7b_model "$TARGET_MODEL" "$WANDB_NAME"

# # exp4+
# TARGET_MODEL="/model/7B_HF/llm-jp-7b-cc-v2-beta-63500step.code10K_en20K_ja30K_ver2.2" # exp4+
# WANDB_NAME="eval-exp4+-base"
# evaluate_7b_model "$TARGET_MODEL" "$WANDB_NAME"

# # exp5
# # Skip for now

# # exp7
# TARGET_MODEL="/model/7B_HF/llm-jp-7b-cc-v2-beta-119000step.code10k_en20k_ja30k_ver2.2" # exp7
# WANDB_NAME="eval-exp7-base"
# evaluate_7b_model "$TARGET_MODEL" "$WANDB_NAME"

# exp9
TARGET_MODEL="/model/7B_HF/llm-jp-7b-CC_v2_100k_ver3.0" # exp9
WANDB_NAME="eval-exp9-base"
evaluate_7b_model "$TARGET_MODEL" "$WANDB_NAME"
66 changes: 66 additions & 0 deletions eval-scripts/eval_7b_sft-full-jaster.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/bin/bash

# Evaluate 7B full-parameter-SFT (jaster) models with llm-jp-eval and log
# results to W&B. Earlier experiments (exp1-exp7) are kept below, commented
# out, as a record.

set -euxo pipefail

# SCRIPT_PATH="llm-jp-eval/"
SCRIPT_PATH="/model/sosuke/llm-jp-eval-v1.1.0"
cd "$SCRIPT_PATH"

source venv/bin/activate
# Default to GPU 0 unless the caller overrides CUDA_VISIBLE_DEVICES.
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"

# Evaluate procedure
# Arguments:
#   $1 - path to the target model (also used as the tokenizer path)
#   $2 - W&B run name
function evaluate_7b_model () {
    local target_model=$1
    local wandb_name=$2

    local wandb_entity="llm-jp"
    local wandb_project="7b-sft-and-eval"

    python scripts/evaluate_llm.py \
        model.pretrained_model_name_or_path="$target_model" \
        tokenizer.pretrained_model_name_or_path="$target_model" \
        target_dataset=all \
        wandb.run_name="$wandb_name" \
        wandb.entity="$wandb_entity" \
        wandb.project="$wandb_project" \
        wandb.log=true
}

# # exp1
# TARGET_MODEL="/model/7B_HF_RE/llm-jp-7b-61000step.code20K_en40K_ja60K_ver2.2_full-jaster" # exp1
# WANDB_NAME="eval-exp1-full-jaster"
# evaluate_7b_model "$TARGET_MODEL" "$WANDB_NAME"

# # exp2
# TARGET_MODEL="/model/7B_HF_RE/llm-jp-7b-okazaki-lab-cc-63500step.code10K_en20K_ja30K_ver2.2_full-jaster" # exp2
# WANDB_NAME="eval-exp2-full-jaster"
# evaluate_7b_model "$TARGET_MODEL" "$WANDB_NAME"

# # exp3
# TARGET_MODEL="/model/7B_HF_RE/llm-jp-7b-63500step.code10K_en20K_ja30K_ver2.2_full-jaster" # exp3
# WANDB_NAME="eval-exp3-full-jaster"
# evaluate_7b_model "$TARGET_MODEL" "$WANDB_NAME"

# # exp4
# TARGET_MODEL="/model/7B_HF_RE/llm-jp-7b-cc-v2-63500step.code10K_en20K_ja30K_ver2.2_full-jaster" # exp4
# WANDB_NAME="eval-exp4-full-jaster"
# evaluate_7b_model "$TARGET_MODEL" "$WANDB_NAME"

# # exp4+
# TARGET_MODEL="/model/7B_HF_RE/llm-jp-7b-cc-v2-beta-63500step.code10K_en20K_ja30K_ver2.2_full-jaster" # exp4+
# WANDB_NAME="eval-exp4+-full-jaster"
# evaluate_7b_model "$TARGET_MODEL" "$WANDB_NAME"

# # exp5
# # Skip for now

# # exp7
# TARGET_MODEL="/model/7B_HF_RE/llm-jp-7b-cc-v2-beta-119000step.code10k_en20k_ja30k_ver2.2_full-jaster" # exp7
# WANDB_NAME="eval-exp7-full-jaster"
# evaluate_7b_model "$TARGET_MODEL" "$WANDB_NAME"

# exp9
TARGET_MODEL="/model/7B_HF_RE/llm-jp-7b-CC_v2_100k_ver3.0_full-jaster" # exp9
WANDB_NAME="eval-exp9-full-jaster"
evaluate_7b_model "$TARGET_MODEL" "$WANDB_NAME"
67 changes: 67 additions & 0 deletions sft-scripts/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# SFT scripts

## 実行環境

MDXの`llm-jp-nvlink`環境で実行

## Full parameter SFT

Full parameter SFTは以下の設定に固定して行う

```yaml
max_seq_length: 4096 # llama model
global_batch_size: 64 # = per_device_batch_size * num_devices * gradient_accumulation_steps
num_epochs: 5
learning_rate: 2e-5
lr_scheduler: cosine
warmup_ratio: 0.1
dtype: bfloat16
```

datasetは以下の2種類についてそれぞれ行う

- `jaster`: llm-jp-evalで利用するモデル向けのデータセット
- `gpt4-self-inst`: VicunaQAで利用するモデル向けのデータセット

### 7B models

上記の設定に加え、以下の設定で学習

- single node (8 GPUs => `gradient_accumulation_steps` = 8)
- ZeRO-2

### 13B models

上記の設定に加え、以下の設定で学習

- single node (8 GPU => `gradient_accumulation_steps` = 8)
- ZeRO-3
- gradient_checkpointingを有効化

## LoRA SFT

LoRA SFTは以下の設定に固定して行う(基本的な設定はFull parameter SFTと共通、LRのみ2種類用意)

```yaml
max_seq_length: 4096 # llama model
global_batch_size: 64 # = per_device_batch_size * num_devices * gradient_accumulation_steps
num_epochs: 5
learning_rate: [2e-5, 1e-4]
lr_scheduler: cosine
warmup_ratio: 0.1
dtype: bfloat16
```

datasetは以下の2種類についてそれぞれ行う(Full parameter SFTと共通)

- `jaster`: llm-jp-evalで利用するモデル向けのデータセット
- `gpt4-self-inst`: VicunaQAで利用するモデル向けのデータセット

### 7B models

すべてのモデルについてsingle GPUで学習

### 13B models

- `jaster`のモデルについてはsingle GPUで学習
- `gpt4-self-inst`のモデルについてはメモリ不足によりsingle GPUでは学習ができなかったため、single node (8 GPUs) で学習
53 changes: 53 additions & 0 deletions sft-scripts/exp1_gpt4-self-inst_full.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#!/bin/bash

# Single node full parameter sft script
# Fine-tunes the exp1 7B model on the gpt4-self-inst dataset (ZeRO-3, 8 GPUs)
# and writes the result to /model/7B_HF_RE.

set -euxo pipefail

# Move to script path
SCRIPT_PATH="llm-jp-sft/"
cd "$SCRIPT_PATH"
source venv/bin/activate

# Target Model (TODO: Change if needed)
TARGET_MODEL="/model/7B_HF/llm-jp-7b-61000step.code20K_en40K_ja60K_ver2.2" # exp1
SFT_TYPE="full-gpt4-self-inst"
OUTPUT_DIR="/model/7B_HF_RE/$(basename "${TARGET_MODEL%/}")_${SFT_TYPE}"

# wandb info
export WANDB_ENTITY="llm-jp"
export WANDB_PROJECT="7b-sft-and-eval"
export WANDB_NAME="sft-exp1-${SFT_TYPE}"

# For debugging
export NCCL_DEBUG=WARN

# Training settings
config_file="configs/accelerate_config_zero3.yaml"
dataset_path="./dataset"
dataset_sh="./mdx/dataset_gpt4_self_inst_ja.sh"
num_train_epochs=5
per_device_train_batch_size=1
gradient_accumulation_steps=8 # global_batch_size = per_device_train_batch_size * n_gpus * gradient_accumulation_steps = 64
max_seq_length=4096 # for llama model
learning_rate="2e-5"
lr_scheduler_type="cosine"
warmup_ratio=0.1

# N_gpu = 8
# NOTE: $($dataset_sh ...) is intentionally left unquoted: the helper emits a
# whitespace-separated list of data files that must word-split into separate
# arguments for --data_files.
# shellcheck disable=SC2046
accelerate launch --config_file "$config_file" \
    train.py \
    --model_name_or_path "$TARGET_MODEL" \
    --tokenizer_name_or_path "$TARGET_MODEL" \
    --num_train_epochs "$num_train_epochs" \
    --per_device_train_batch_size "$per_device_train_batch_size" \
    --gradient_accumulation_steps "$gradient_accumulation_steps" \
    --learning_rate "$learning_rate" \
    --warmup_ratio "$warmup_ratio" \
    --lr_scheduler_type "$lr_scheduler_type" \
    --bf16 \
    --max_seq_length "$max_seq_length" \
    --logging_steps 10 \
    --report_to wandb \
    --data_files $("$dataset_sh" "$dataset_path") \
    --output_dir "$OUTPUT_DIR"
54 changes: 54 additions & 0 deletions sft-scripts/exp1_gpt4-self-inst_lora_all.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/bin/bash

# Single GPU sft script
# based on https://github.com/llm-jp/llm-jp-sft/blob/main/mdx/train_peft_single_gpu.sh
# LoRA-tunes (all target modules) the exp1 7B model on the gpt4-self-inst
# dataset and writes the result to /model/7B_HF_RE.

set -euxo pipefail

# Move to script path
SCRIPT_PATH="llm-jp-sft/"
cd "$SCRIPT_PATH"
source venv/bin/activate

# Set CUDA Device (default: 0)
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"

# Target Model (TODO: Change if needed)
TARGET_MODEL="/model/7B_HF/llm-jp-7b-61000step.code20K_en40K_ja60K_ver2.2" # exp1
SFT_TYPE="lora-all-gpt4-self-inst"
OUTPUT_DIR="/model/7B_HF_RE/$(basename "${TARGET_MODEL%/}")_${SFT_TYPE}"

# wandb info
export WANDB_ENTITY="llm-jp"
export WANDB_PROJECT="7b-sft-and-eval"
export WANDB_NAME="sft-exp1-${SFT_TYPE}"

# Training settings
dataset_path="./dataset"
dataset_sh="./mdx/dataset_gpt4_self_inst_ja.sh"
num_train_epochs=5
per_device_train_batch_size=1
gradient_accumulation_steps=64 # global_batch_size = per_device_train_batch_size * gradient_accumulation_steps = 64
peft_target_model="llama-all"
max_seq_length=4096 # for llama model
learning_rate="2e-5"
lr_scheduler_type="cosine"
warmup_ratio=0.1

# NOTE: $($dataset_sh ...) is intentionally left unquoted: the helper emits a
# whitespace-separated list of data files that must word-split into separate
# arguments for --data_files.
# shellcheck disable=SC2046
python train.py \
    --model_name_or_path "$TARGET_MODEL" \
    --tokenizer_name_or_path "$TARGET_MODEL" \
    --use_peft \
    --num_train_epochs "$num_train_epochs" \
    --per_device_train_batch_size "$per_device_train_batch_size" \
    --gradient_accumulation_steps "$gradient_accumulation_steps" \
    --learning_rate "$learning_rate" \
    --warmup_ratio "$warmup_ratio" \
    --lr_scheduler_type "$lr_scheduler_type" \
    --bf16 \
    --max_seq_length "$max_seq_length" \
    --logging_steps 10 \
    --report_to wandb \
    --data_files $("$dataset_sh" "$dataset_path") \
    --output_dir "$OUTPUT_DIR" \
    --peft_target_model "$peft_target_model"
Loading