diff --git a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/data_config.yaml b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/data_config.yaml
new file mode 100644
index 0000000..b4b05a2
--- /dev/null
+++ b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/data_config.yaml
@@ -0,0 +1,23 @@
+common:
+  v3_0_info_root: "/home/shared/corpus/llm-jp-corpus/v3.0.0/training_resharded_tokenize_ver3.0/token_info"
+
+datasets:
+  code:
+    basedir: "v3_0_info_root"
+    file: "2024_0410_code.sakura_home.csv"
+    repeat: 0.1014
+  en:
+    basedir: "v3_0_info_root"
+    file: "2024_0410_en.sakura_home.csv"
+    repeat: 0.1014
+  ja_cc1:
+    file: "/home/shared/experiments/0022_v3-high-quality-cpt/corpus/llm-jp-corpus/v3.0.0/training_resharded_sorted_tokenize_ver3.0/ppl_asc/token_info.csv"
+    filter:
+      - train/ja/cc-1
+    repeat: 0.4318
+  ja_wiki:
+    basedir: "v3_0_info_root"
+    file: "2024_0410_ja.sakura_home.csv"
+    filter:
+      - train/ja/wiki
+    repeat: 2
diff --git a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-1.7b.sh b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-1.7b.sh
new file mode 100644
index 0000000..abb9821
--- /dev/null
+++ b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-1.7b.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+#SBATCH --job-name=0022_1.7b-high-quality-cpt-exp1B_cc1_asc_ppl
+#SBATCH --partition=gpu-small
+#SBATCH --nodes=4
+#SBATCH --gpus-per-node=8
+#SBATCH --ntasks-per-node=8
+#SBATCH --output=outputs/%x-%j.out
+#SBATCH --error=outputs/%x-%j.err
+
+set -eu -o pipefail
+
+# change the directory if each experiment will be handled as one experimental issue
+EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt
+CONF_DIR=exp1B_cc1_asc_ppl
+
+ENV_DIR=${EXPERIMENT_DIR}/environment
+SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt
+
+source ${ENV_DIR}/scripts/environment.sh
+source ${ENV_DIR}/venv/bin/activate
+
+export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1)
+export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000)))
+
+echo "MASTER_ADDR=${MASTER_ADDR}"
+
+NUM_NODES=$SLURM_JOB_NUM_NODES
+NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1)
+NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE))
+
+echo NUM_NODES=$NUM_NODES
+echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE
+echo NUM_GPUS=$NUM_GPUS
+
+mpirun \
+  -np $NUM_GPUS \
+  --npernode $NUM_GPUS_PER_NODE \
+  -bind-to none \
+  -map-by slot \
+  -x EXPERIMENT_DIR=$EXPERIMENT_DIR \
+  -x SCRIPT_ROOT=$SCRIPT_ROOT \
+  -x CONF_DIR=$CONF_DIR \
+  -x MASTER_ADDR=$MASTER_ADDR \
+  -x MASTER_PORT=$MASTER_PORT \
+  bash ${SCRIPT_ROOT}/train-1.7b.sh
diff --git a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-13b.sh b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-13b.sh
new file mode 100644
index 0000000..0b2e399
--- /dev/null
+++ b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-13b.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+#SBATCH --job-name=0022_13b-high-quality-cpt-exp1B_cc1_asc_ppl
+#SBATCH --partition=gpu-small
+#SBATCH --nodes=8
+#SBATCH --gpus-per-node=8
+#SBATCH --ntasks-per-node=8
+#SBATCH --output=outputs/%x-%j.out
+#SBATCH --error=outputs/%x-%j.err
+
+set -eu -o pipefail
+
+# change the directory if each experiment will be handled as one experimental issue
+EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt
+CONF_DIR=exp1B_cc1_asc_ppl
+
+ENV_DIR=${EXPERIMENT_DIR}/environment
+SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt
+
+source ${ENV_DIR}/scripts/environment.sh
+source ${ENV_DIR}/venv/bin/activate
+
+export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1)
+export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000)))
+
+echo "MASTER_ADDR=${MASTER_ADDR}"
+
+NUM_NODES=$SLURM_JOB_NUM_NODES
+NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1)
+NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE))
+
+echo NUM_NODES=$NUM_NODES
+echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE
+echo NUM_GPUS=$NUM_GPUS
+
+mpirun \
+  -np $NUM_GPUS \
+  --npernode $NUM_GPUS_PER_NODE \
+  -bind-to none \
+  -map-by slot \
+  -x EXPERIMENT_DIR=$EXPERIMENT_DIR \
+  -x SCRIPT_ROOT=$SCRIPT_ROOT \
+  -x CONF_DIR=$CONF_DIR \
+  -x MASTER_ADDR=$MASTER_ADDR \
+  -x MASTER_PORT=$MASTER_PORT \
+  bash ${SCRIPT_ROOT}/train-13b.sh
diff --git a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-3.7b.sh b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-3.7b.sh
new file mode 100644
index 0000000..501437e
--- /dev/null
+++ b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-3.7b.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+#SBATCH --job-name=0022_3.7b-high-quality-cpt-exp1B_cc1_asc_ppl
+#SBATCH --partition=gpu-small
+#SBATCH --nodes=4
+#SBATCH --gpus-per-node=8
+#SBATCH --ntasks-per-node=8
+#SBATCH --output=outputs/%x-%j.out
+#SBATCH --error=outputs/%x-%j.err
+
+set -eu -o pipefail
+
+# change the directory if each experiment will be handled as one experimental issue
+EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt
+CONF_DIR=exp1B_cc1_asc_ppl
+
+ENV_DIR=${EXPERIMENT_DIR}/environment
+SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt
+
+source ${ENV_DIR}/scripts/environment.sh
+source ${ENV_DIR}/venv/bin/activate
+
+export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1)
+export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000)))
+
+echo "MASTER_ADDR=${MASTER_ADDR}"
+
+NUM_NODES=$SLURM_JOB_NUM_NODES
+NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1)
+NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE))
+
+echo NUM_NODES=$NUM_NODES
+echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE
+echo NUM_GPUS=$NUM_GPUS
+
+mpirun \
+  -np $NUM_GPUS \
+  --npernode $NUM_GPUS_PER_NODE \
+  -bind-to none \
+  -map-by slot \
+  -x EXPERIMENT_DIR=$EXPERIMENT_DIR \
+  -x SCRIPT_ROOT=$SCRIPT_ROOT \
+  -x CONF_DIR=$CONF_DIR \
+  -x MASTER_ADDR=$MASTER_ADDR \
+  -x MASTER_PORT=$MASTER_PORT \
+  bash ${SCRIPT_ROOT}/train-3.7b.sh
diff --git a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/data_config.yaml b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/data_config.yaml
new file mode 100644
index 0000000..2129a00
--- /dev/null
+++ b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/data_config.yaml
@@ -0,0 +1,23 @@
+common:
+  v3_0_info_root: "/home/shared/corpus/llm-jp-corpus/v3.0.0/training_resharded_tokenize_ver3.0/token_info"
+
+datasets:
+  code:
+    basedir: "v3_0_info_root"
+    file: "2024_0410_code.sakura_home.csv"
+    repeat: 0.1014
+  en:
+    basedir: "v3_0_info_root"
+    file: "2024_0410_en.sakura_home.csv"
+    repeat: 0.1014
+  ja_cc1:
+    file: "/home/shared/experiments/0022_v3-high-quality-cpt/corpus/llm-jp-corpus/v3.0.0/training_resharded_sorted_tokenize_ver3.0/ppl_desc/token_info.csv"
+    filter:
+      - train/ja/cc-1
+    repeat: 0.4318
+  ja_wiki:
+    basedir: "v3_0_info_root"
+    file: "2024_0410_ja.sakura_home.csv"
+    filter:
+      - train/ja/wiki
+    repeat: 2
diff --git a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-1.7b.sh b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-1.7b.sh
new file mode 100644
index 0000000..7f9dbc9
--- /dev/null
+++ b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-1.7b.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+#SBATCH --job-name=0022_1.7b-high-quality-cpt-exp1B_cc1_desc_ppl
+#SBATCH --partition=gpu-small
+#SBATCH --nodes=4
+#SBATCH --gpus-per-node=8
+#SBATCH --ntasks-per-node=8
+#SBATCH --output=outputs/%x-%j.out
+#SBATCH --error=outputs/%x-%j.err
+
+set -eu -o pipefail
+
+# change the directory if each experiment will be handled as one experimental issue
+EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt
+CONF_DIR=exp1B_cc1_desc_ppl
+
+ENV_DIR=${EXPERIMENT_DIR}/environment
+SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt
+
+source ${ENV_DIR}/scripts/environment.sh
+source ${ENV_DIR}/venv/bin/activate
+
+export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1)
+export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000)))
+
+echo "MASTER_ADDR=${MASTER_ADDR}"
+
+NUM_NODES=$SLURM_JOB_NUM_NODES
+NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1)
+NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE))
+
+echo NUM_NODES=$NUM_NODES
+echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE
+echo NUM_GPUS=$NUM_GPUS
+
+mpirun \
+  -np $NUM_GPUS \
+  --npernode $NUM_GPUS_PER_NODE \
+  -bind-to none \
+  -map-by slot \
+  -x EXPERIMENT_DIR=$EXPERIMENT_DIR \
+  -x SCRIPT_ROOT=$SCRIPT_ROOT \
+  -x CONF_DIR=$CONF_DIR \
+  -x MASTER_ADDR=$MASTER_ADDR \
+  -x MASTER_PORT=$MASTER_PORT \
+  bash ${SCRIPT_ROOT}/train-1.7b.sh
diff --git a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-13b.sh b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-13b.sh
new file mode 100644
index 0000000..2604d40
--- /dev/null
+++ b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-13b.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+#SBATCH --job-name=0022_13b-high-quality-cpt-exp1B_cc1_desc_ppl
+#SBATCH --partition=gpu-small
+#SBATCH --nodes=8
+#SBATCH --gpus-per-node=8
+#SBATCH --ntasks-per-node=8
+#SBATCH --output=outputs/%x-%j.out
+#SBATCH --error=outputs/%x-%j.err
+
+set -eu -o pipefail
+
+# change the directory if each experiment will be handled as one experimental issue
+EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt
+CONF_DIR=exp1B_cc1_desc_ppl
+
+ENV_DIR=${EXPERIMENT_DIR}/environment
+SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt
+
+source ${ENV_DIR}/scripts/environment.sh
+source ${ENV_DIR}/venv/bin/activate
+
+export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1)
+export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000)))
+
+echo "MASTER_ADDR=${MASTER_ADDR}"
+
+NUM_NODES=$SLURM_JOB_NUM_NODES
+NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1)
+NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE))
+
+echo NUM_NODES=$NUM_NODES
+echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE
+echo NUM_GPUS=$NUM_GPUS
+
+mpirun \
+  -np $NUM_GPUS \
+  --npernode $NUM_GPUS_PER_NODE \
+  -bind-to none \
+  -map-by slot \
+  -x EXPERIMENT_DIR=$EXPERIMENT_DIR \
+  -x SCRIPT_ROOT=$SCRIPT_ROOT \
+  -x CONF_DIR=$CONF_DIR \
+  -x MASTER_ADDR=$MASTER_ADDR \
+  -x MASTER_PORT=$MASTER_PORT \
+  bash ${SCRIPT_ROOT}/train-13b.sh
diff --git a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-3.7b.sh b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-3.7b.sh
new file mode 100644
index 0000000..42a8964
--- /dev/null
+++ b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-3.7b.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+#SBATCH --job-name=0022_3.7b-high-quality-cpt-exp1B_cc1_desc_ppl
+#SBATCH --partition=gpu-small
+#SBATCH --nodes=4
+#SBATCH --gpus-per-node=8
+#SBATCH --ntasks-per-node=8
+#SBATCH --output=outputs/%x-%j.out
+#SBATCH --error=outputs/%x-%j.err
+
+set -eu -o pipefail
+
+# change the directory if each experiment will be handled as one experimental issue
+EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt
+CONF_DIR=exp1B_cc1_desc_ppl
+
+ENV_DIR=${EXPERIMENT_DIR}/environment
+SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt
+
+source ${ENV_DIR}/scripts/environment.sh
+source ${ENV_DIR}/venv/bin/activate
+
+export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1)
+export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000)))
+
+echo "MASTER_ADDR=${MASTER_ADDR}"
+
+NUM_NODES=$SLURM_JOB_NUM_NODES
+NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1)
+NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE))
+
+echo NUM_NODES=$NUM_NODES
+echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE
+echo NUM_GPUS=$NUM_GPUS
+
+mpirun \
+  -np $NUM_GPUS \
+  --npernode $NUM_GPUS_PER_NODE \
+  -bind-to none \
+  -map-by slot \
+  -x EXPERIMENT_DIR=$EXPERIMENT_DIR \
+  -x SCRIPT_ROOT=$SCRIPT_ROOT \
+  -x CONF_DIR=$CONF_DIR \
+  -x MASTER_ADDR=$MASTER_ADDR \
+  -x MASTER_PORT=$MASTER_PORT \
+  bash ${SCRIPT_ROOT}/train-3.7b.sh
diff --git a/pretrain/scripts/v3-high-quality-cpt/exp4A/data_config.yaml b/pretrain/scripts/v3-high-quality-cpt/exp4A/data_config.yaml
new file mode 100644
index 0000000..a08405b
--- /dev/null
+++ b/pretrain/scripts/v3-high-quality-cpt/exp4A/data_config.yaml
@@ -0,0 +1,30 @@
+common:
+  v3_0_info_root: "/home/shared/corpus/llm-jp-corpus/v3.0.0/training_resharded_tokenize_ver3.0/token_info"
+  v3_1_info_root: "/home/shared/corpus/llm-jp-corpus/v3.1.0/tokenize/v3.0b1/token_info"
+
+datasets:
+  en:
+    basedir: "v3_0_info_root"
+    file: "2024_0410_en.sakura_home.csv"
+    repeat: 0.1658
+  ja_v3_1_pdf00:
+    basedir: "v3_1_info_root"
+    file: "2024_0718_ja_train2.sakura_home.csv"
+    filter:
+      - "train2/ja/warp-pdf-e00"
+    repeat: 0.1043
+  ja_v3_1_pdf02:
+    basedir: "v3_1_info_root"
+    file: "2024_0718_ja_train2.sakura_home.csv"
+    filter:
+      - "train2/ja/warp-pdf-e02"
+    repeat: 0.0522
+  ja_other:
+    basedir: "v3_0_info_root"
+    file: "2024_0410_ja.sakura_home.csv"
+    filter:
+      - train/ja/cc
+      - train/ja/kaken
+      - train/ja/warp-html
+      - train/ja/wiki
+    repeat: 0.1043
diff --git a/pretrain/scripts/v3-high-quality-cpt/exp4A/sbatch-1.7b.sh b/pretrain/scripts/v3-high-quality-cpt/exp4A/sbatch-1.7b.sh
new file mode 100644
index 0000000..1f8b336
--- /dev/null
+++ b/pretrain/scripts/v3-high-quality-cpt/exp4A/sbatch-1.7b.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+#SBATCH --job-name=0022_1.7b-high-quality-cpt-exp4a
+#SBATCH --partition=gpu-small
+#SBATCH --nodes=4
+#SBATCH --gpus-per-node=8
+#SBATCH --ntasks-per-node=8
+#SBATCH --output=outputs/%x-%j.out
+#SBATCH --error=outputs/%x-%j.err
+
+set -eu -o pipefail
+
+# change the directory if each experiment will be handled as one experimental issue
+EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt
+CONF_DIR=exp4A
+
+ENV_DIR=${EXPERIMENT_DIR}/environment
+SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt/
+
+source ${ENV_DIR}/scripts/environment.sh
+source ${ENV_DIR}/venv/bin/activate
+
+export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1)
+export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000)))
+
+echo "MASTER_ADDR=${MASTER_ADDR}"
+
+NUM_NODES=$SLURM_JOB_NUM_NODES
+NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1)
+NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE))
+
+echo NUM_NODES=$NUM_NODES
+echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE
+echo NUM_GPUS=$NUM_GPUS
+
+mpirun \
+  -np $NUM_GPUS \
+  --npernode $NUM_GPUS_PER_NODE \
+  -bind-to none \
+  -map-by slot \
+  -x EXPERIMENT_DIR=$EXPERIMENT_DIR \
+  -x SCRIPT_ROOT=$SCRIPT_ROOT \
+  -x CONF_DIR=$CONF_DIR \
+  -x MASTER_ADDR=$MASTER_ADDR \
+  -x MASTER_PORT=$MASTER_PORT \
+  bash ${SCRIPT_ROOT}/train-1.7b.sh
diff --git a/pretrain/scripts/v3-high-quality-cpt/exp4A/sbatch-13b.sh b/pretrain/scripts/v3-high-quality-cpt/exp4A/sbatch-13b.sh
new file mode 100644
index 0000000..5977416
--- /dev/null
+++ b/pretrain/scripts/v3-high-quality-cpt/exp4A/sbatch-13b.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+#SBATCH --job-name=0022_13b-high-quality-cpt-exp4a
+#SBATCH --partition=gpu-small
+#SBATCH --nodes=8
+#SBATCH --gpus-per-node=8
+#SBATCH --ntasks-per-node=8
+#SBATCH --output=outputs/%x-%j.out
+#SBATCH --error=outputs/%x-%j.err
+
+set -eu -o pipefail
+
+# change the directory if each experiment will be handled as one experimental issue
+EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt
+CONF_DIR=exp4A
+
+ENV_DIR=${EXPERIMENT_DIR}/environment
+SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt/
+
+source ${ENV_DIR}/scripts/environment.sh
+source ${ENV_DIR}/venv/bin/activate
+
+export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1)
+export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000)))
+
+echo "MASTER_ADDR=${MASTER_ADDR}"
+
+NUM_NODES=$SLURM_JOB_NUM_NODES
+NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1)
+NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE))
+
+echo NUM_NODES=$NUM_NODES
+echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE
+echo NUM_GPUS=$NUM_GPUS
+
+mpirun \
+  -np $NUM_GPUS \
+  --npernode $NUM_GPUS_PER_NODE \
+  -bind-to none \
+  -map-by slot \
+  -x EXPERIMENT_DIR=$EXPERIMENT_DIR \
+  -x SCRIPT_ROOT=$SCRIPT_ROOT \
+  -x CONF_DIR=$CONF_DIR \
+  -x MASTER_ADDR=$MASTER_ADDR \
+  -x MASTER_PORT=$MASTER_PORT \
+  bash ${SCRIPT_ROOT}/train-13b.sh
diff --git a/pretrain/scripts/v3-high-quality-cpt/exp4A/sbatch-3.7b.sh b/pretrain/scripts/v3-high-quality-cpt/exp4A/sbatch-3.7b.sh
new file mode 100644
index 0000000..b012c69
--- /dev/null
+++ b/pretrain/scripts/v3-high-quality-cpt/exp4A/sbatch-3.7b.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+#SBATCH --job-name=0022_3.7b-high-quality-cpt-exp4a
+#SBATCH --partition=gpu-small
+#SBATCH --nodes=4
+#SBATCH --gpus-per-node=8
+#SBATCH --ntasks-per-node=8
+#SBATCH --output=outputs/%x-%j.out
+#SBATCH --error=outputs/%x-%j.err
+
+set -eu -o pipefail
+
+# change the directory if each experiment will be handled as one experimental issue
+EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt
+CONF_DIR=exp4A
+
+ENV_DIR=${EXPERIMENT_DIR}/environment
+SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt/
+
+source ${ENV_DIR}/scripts/environment.sh
+source ${ENV_DIR}/venv/bin/activate
+
+export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1)
+export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000)))
+
+echo "MASTER_ADDR=${MASTER_ADDR}"
+
+NUM_NODES=$SLURM_JOB_NUM_NODES
+NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1)
+NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE))
+
+echo NUM_NODES=$NUM_NODES
+echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE
+echo NUM_GPUS=$NUM_GPUS
+
+mpirun \
+  -np $NUM_GPUS \
+  --npernode $NUM_GPUS_PER_NODE \
+  -bind-to none \
+  -map-by slot \
+  -x EXPERIMENT_DIR=$EXPERIMENT_DIR \
+  -x SCRIPT_ROOT=$SCRIPT_ROOT \
+  -x CONF_DIR=$CONF_DIR \
+  -x MASTER_ADDR=$MASTER_ADDR \
+  -x MASTER_PORT=$MASTER_PORT \
+  bash ${SCRIPT_ROOT}/train-3.7b.sh
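
Each experiment directory pairs a data_config.yaml (corpus mixture and repeat weights) with per-model-size sbatch launchers that pass CONF_DIR to the shared train-*.sh scripts via mpirun. A minimal submission sketch in the same shell style, assuming the repository is deployed under ${EXPERIMENT_DIR}/scripts as the SCRIPT_ROOT paths above imply, and assuming jobs are submitted from the experiment's config directory with an outputs/ directory present for the Slurm --output/--error logs (the submission directory itself is not set by the scripts):

  # submit the 1.7B ascending-perplexity run (4 nodes x 8 GPUs)
  cd /home/shared/experiments/0022_v3-high-quality-cpt/scripts/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl
  mkdir -p outputs          # Slurm writes %x-%j.out / %x-%j.err here
  sbatch sbatch-1.7b.sh     # use sbatch-13b.sh for the 8-node 13B run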