From 400d8df46910fcbf3720b359e972995c4218f553 Mon Sep 17 00:00:00 2001 From: Yuma Tsuta Date: Mon, 2 Dec 2024 17:04:28 +0900 Subject: [PATCH 1/5] Add experiments of high quality cpt (exp1B with sorted cc1 by ppl) --- .../exp1B_cc1_asc_ppl/data_config.yaml | 23 ++++++++++ .../exp1B_cc1_asc_ppl/sbatch-1.7b.sh | 45 +++++++++++++++++++ .../exp1B_cc1_asc_ppl/sbatch-13b.sh | 45 +++++++++++++++++++ .../exp1B_cc1_asc_ppl/sbatch-3.7b.sh | 45 +++++++++++++++++++ .../exp1B_cc1_desc_ppl/data_config.yaml | 23 ++++++++++ .../exp1B_cc1_desc_ppl/sbatch-1.7b.sh | 45 +++++++++++++++++++ .../exp1B_cc1_desc_ppl/sbatch-13b.sh | 45 +++++++++++++++++++ .../exp1B_cc1_desc_ppl/sbatch-3.7b.sh | 45 +++++++++++++++++++ 8 files changed, 316 insertions(+) create mode 100644 pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/data_config.yaml create mode 100644 pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-1.7b.sh create mode 100644 pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-13b.sh create mode 100644 pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-3.7b.sh create mode 100644 pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/data_config.yaml create mode 100644 pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-1.7b.sh create mode 100644 pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-13b.sh create mode 100644 pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-3.7b.sh diff --git a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/data_config.yaml b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/data_config.yaml new file mode 100644 index 0000000..2129a00 --- /dev/null +++ b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/data_config.yaml @@ -0,0 +1,23 @@ +common: + v3_0_info_root: "/home/shared/corpus/llm-jp-corpus/v3.0.0/training_resharded_tokenize_ver3.0/token_info" + +datasets: + code: + basedir: "v3_0_info_root" + file: "2024_0410_code.sakura_home.csv" + repeat: 0.1014 + en: + basedir: "v3_0_info_root" + file: "2024_0410_en.sakura_home.csv" + repeat: 0.1014 + ja_cc1: + file: "/home/shared/experiments/0022_v3-high-quality-cpt/corpus/llm-jp-corpus/v3.0.0/training_resharded_sorted_tokenize_ver3.0/ppl_desc/token_info.csv" + filter: + - train/ja/cc-1 + repeat: 0.4318 + ja_wiki: + basedir: "v3_0_info_root" + file: "2024_0410_ja.sakura_home.csv" + filter: + - train/ja/wiki + repeat: 2 diff --git a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-1.7b.sh b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-1.7b.sh new file mode 100644 index 0000000..abb9821 --- /dev/null +++ b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-1.7b.sh @@ -0,0 +1,45 @@ +#!/bin/bash +#SBATCH --job-name=0022_1.7b-hugh-qaulity-cpt-exp1B_cc1_asc_ppl +#SBATCH --partition=gpu-small +#SBATCH --nodes=4 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --output=outputs/%x-%j.out +#SBATCH --error=outputs/%x-%j.err + +set -eu -o pipefail + +# change directory if each experiment will be handled as one experintal issue +EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt +CONF_DIR=exp1B_cc1_asc_ppl + +ENV_DIR=${EXPERIMENT_DIR}/environment +SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt + +source ${ENV_DIR}/scripts/environment.sh +source ${ENV_DIR}/venv/bin/activate + +export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1) +export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +NUM_NODES=$SLURM_JOB_NUM_NODES +NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1) +NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE)) + +echo NUM_NODES=$NUM_NODES +echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE +echo NUM_GPUS=$NUM_GPUS + +mpirun \ + -np $NUM_GPUS \ + --npernode $NUM_GPUS_PER_NODE \ + -bind-to none \ + -map-by slot \ + -x EXPERIMENT_DIR=$EXPERIMENT_DIR \ + -x SCRIPT_ROOT=$SCRIPT_ROOT \ + -x CONF_DIR=$CONF_DIR \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + bash ${SCRIPT_ROOT}/train-1.7b.sh diff --git a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-13b.sh b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-13b.sh new file mode 100644 index 0000000..2282259 --- /dev/null +++ b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-13b.sh @@ -0,0 +1,45 @@ +#!/bin/bash +#SBATCH --job-name=0022_13b-hugh-qaulity-cpt-exp1b +#SBATCH --partition=gpu-small +#SBATCH --nodes=8 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --output=outputs/%x-%j.out +#SBATCH --error=outputs/%x-%j.err + +set -eu -o pipefail + +# change directory if each experiment will be handled as one experintal issue +EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt +CONF_DIR=exp1B + +ENV_DIR=${EXPERIMENT_DIR}/environment +SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt + +source ${ENV_DIR}/scripts/environment.sh +source ${ENV_DIR}/venv/bin/activate + +export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1) +export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +NUM_NODES=$SLURM_JOB_NUM_NODES +NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1) +NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE)) + +echo NUM_NODES=$NUM_NODES +echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE +echo NUM_GPUS=$NUM_GPUS + +mpirun \ + -np $NUM_GPUS \ + --npernode $NUM_GPUS_PER_NODE \ + -bind-to none \ + -map-by slot \ + -x EXPERIMENT_DIR=$EXPERIMENT_DIR \ + -x SCRIPT_ROOT=$SCRIPT_ROOT \ + -x CONF_DIR=$CONF_DIR \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + bash ${SCRIPT_ROOT}/train-13b.sh diff --git a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-3.7b.sh b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-3.7b.sh new file mode 100644 index 0000000..4dbadf5 --- /dev/null +++ b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-3.7b.sh @@ -0,0 +1,45 @@ +#!/bin/bash +#SBATCH --job-name=0022_3.7b-hugh-qaulity-cpt-exp1b +#SBATCH --partition=gpu-small +#SBATCH --nodes=4 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --output=outputs/%x-%j.out +#SBATCH --error=outputs/%x-%j.err + +set -eu -o pipefail + +# change directory if each experiment will be handled as one experintal issue +EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt +CONF_DIR=exp1B + +ENV_DIR=${EXPERIMENT_DIR}/environment +SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt + +source ${ENV_DIR}/scripts/environment.sh +source ${ENV_DIR}/venv/bin/activate + +export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1) +export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +NUM_NODES=$SLURM_JOB_NUM_NODES +NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1) +NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE)) + +echo NUM_NODES=$NUM_NODES +echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE +echo NUM_GPUS=$NUM_GPUS + +mpirun \ + -np $NUM_GPUS \ + --npernode $NUM_GPUS_PER_NODE \ + -bind-to none \ + -map-by slot \ + -x EXPERIMENT_DIR=$EXPERIMENT_DIR \ + -x SCRIPT_ROOT=$SCRIPT_ROOT \ + -x CONF_DIR=$CONF_DIR \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + bash ${SCRIPT_ROOT}/train-3.7b.sh diff --git a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/data_config.yaml b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/data_config.yaml new file mode 100644 index 0000000..b4b05a2 --- /dev/null +++ b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/data_config.yaml @@ -0,0 +1,23 @@ +common: + v3_0_info_root: "/home/shared/corpus/llm-jp-corpus/v3.0.0/training_resharded_tokenize_ver3.0/token_info" + +datasets: + code: + basedir: "v3_0_info_root" + file: "2024_0410_code.sakura_home.csv" + repeat: 0.1014 + en: + basedir: "v3_0_info_root" + file: "2024_0410_en.sakura_home.csv" + repeat: 0.1014 + ja_cc1: + file: "/home/shared/experiments/0022_v3-high-quality-cpt/corpus/llm-jp-corpus/v3.0.0/training_resharded_sorted_tokenize_ver3.0/ppl_asc/token_info.csv" + filter: + - train/ja/cc-1 + repeat: 0.4318 + ja_wiki: + basedir: "v3_0_info_root" + file: "2024_0410_ja.sakura_home.csv" + filter: + - train/ja/wiki + repeat: 2 diff --git a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-1.7b.sh b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-1.7b.sh new file mode 100644 index 0000000..abb9821 --- /dev/null +++ b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-1.7b.sh @@ -0,0 +1,45 @@ +#!/bin/bash +#SBATCH --job-name=0022_1.7b-hugh-qaulity-cpt-exp1B_cc1_asc_ppl +#SBATCH --partition=gpu-small +#SBATCH --nodes=4 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --output=outputs/%x-%j.out +#SBATCH --error=outputs/%x-%j.err + +set -eu -o pipefail + +# change directory if each experiment will be handled as one experintal issue +EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt +CONF_DIR=exp1B_cc1_asc_ppl + +ENV_DIR=${EXPERIMENT_DIR}/environment +SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt + +source ${ENV_DIR}/scripts/environment.sh +source ${ENV_DIR}/venv/bin/activate + +export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1) +export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +NUM_NODES=$SLURM_JOB_NUM_NODES +NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1) +NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE)) + +echo NUM_NODES=$NUM_NODES +echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE +echo NUM_GPUS=$NUM_GPUS + +mpirun \ + -np $NUM_GPUS \ + --npernode $NUM_GPUS_PER_NODE \ + -bind-to none \ + -map-by slot \ + -x EXPERIMENT_DIR=$EXPERIMENT_DIR \ + -x SCRIPT_ROOT=$SCRIPT_ROOT \ + -x CONF_DIR=$CONF_DIR \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + bash ${SCRIPT_ROOT}/train-1.7b.sh diff --git a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-13b.sh b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-13b.sh new file mode 100644 index 0000000..2282259 --- /dev/null +++ b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-13b.sh @@ -0,0 +1,45 @@ +#!/bin/bash +#SBATCH --job-name=0022_13b-hugh-qaulity-cpt-exp1b +#SBATCH --partition=gpu-small +#SBATCH --nodes=8 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --output=outputs/%x-%j.out +#SBATCH --error=outputs/%x-%j.err + +set -eu -o pipefail + +# change directory if each experiment will be handled as one experintal issue +EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt +CONF_DIR=exp1B + +ENV_DIR=${EXPERIMENT_DIR}/environment +SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt + +source ${ENV_DIR}/scripts/environment.sh +source ${ENV_DIR}/venv/bin/activate + +export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1) +export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +NUM_NODES=$SLURM_JOB_NUM_NODES +NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1) +NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE)) + +echo NUM_NODES=$NUM_NODES +echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE +echo NUM_GPUS=$NUM_GPUS + +mpirun \ + -np $NUM_GPUS \ + --npernode $NUM_GPUS_PER_NODE \ + -bind-to none \ + -map-by slot \ + -x EXPERIMENT_DIR=$EXPERIMENT_DIR \ + -x SCRIPT_ROOT=$SCRIPT_ROOT \ + -x CONF_DIR=$CONF_DIR \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + bash ${SCRIPT_ROOT}/train-13b.sh diff --git a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-3.7b.sh b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-3.7b.sh new file mode 100644 index 0000000..4dbadf5 --- /dev/null +++ b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-3.7b.sh @@ -0,0 +1,45 @@ +#!/bin/bash +#SBATCH --job-name=0022_3.7b-hugh-qaulity-cpt-exp1b +#SBATCH --partition=gpu-small +#SBATCH --nodes=4 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --output=outputs/%x-%j.out +#SBATCH --error=outputs/%x-%j.err + +set -eu -o pipefail + +# change directory if each experiment will be handled as one experintal issue +EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt +CONF_DIR=exp1B + +ENV_DIR=${EXPERIMENT_DIR}/environment +SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt + +source ${ENV_DIR}/scripts/environment.sh +source ${ENV_DIR}/venv/bin/activate + +export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1) +export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +NUM_NODES=$SLURM_JOB_NUM_NODES +NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1) +NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE)) + +echo NUM_NODES=$NUM_NODES +echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE +echo NUM_GPUS=$NUM_GPUS + +mpirun \ + -np $NUM_GPUS \ + --npernode $NUM_GPUS_PER_NODE \ + -bind-to none \ + -map-by slot \ + -x EXPERIMENT_DIR=$EXPERIMENT_DIR \ + -x SCRIPT_ROOT=$SCRIPT_ROOT \ + -x CONF_DIR=$CONF_DIR \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + bash ${SCRIPT_ROOT}/train-3.7b.sh From 8dcfbd922f616bec44304e973bd4086a8cbd3447 Mon Sep 17 00:00:00 2001 From: Yuma Tsuta Date: Mon, 2 Dec 2024 17:11:14 +0900 Subject: [PATCH 2/5] Fix exp settings --- .../v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-13b.sh | 4 ++-- .../v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-3.7b.sh | 4 ++-- .../v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-1.7b.sh | 4 ++-- .../v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-13b.sh | 4 ++-- .../v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-3.7b.sh | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-13b.sh b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-13b.sh index 2282259..0b2e399 100644 --- a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-13b.sh +++ b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-13b.sh @@ -1,5 +1,5 @@ #!/bin/bash -#SBATCH --job-name=0022_13b-hugh-qaulity-cpt-exp1b +#SBATCH --job-name=0022_13b-hugh-qaulity-cpt-exp1B_cc1_asc_ppl #SBATCH --partition=gpu-small #SBATCH --nodes=8 #SBATCH --gpus-per-node=8 @@ -11,7 +11,7 @@ set -eu -o pipefail # change directory if each experiment will be handled as one experintal issue EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt -CONF_DIR=exp1B +CONF_DIR=exp1B_cc1_asc_ppl ENV_DIR=${EXPERIMENT_DIR}/environment SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt diff --git a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-3.7b.sh b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-3.7b.sh index 4dbadf5..501437e 100644 --- a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-3.7b.sh +++ b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/sbatch-3.7b.sh @@ -1,5 +1,5 @@ #!/bin/bash -#SBATCH --job-name=0022_3.7b-hugh-qaulity-cpt-exp1b +#SBATCH --job-name=0022_3.7b-hugh-qaulity-cpt-exp1B_cc1_asc_ppl #SBATCH --partition=gpu-small #SBATCH --nodes=4 #SBATCH --gpus-per-node=8 @@ -11,7 +11,7 @@ set -eu -o pipefail # change directory if each experiment will be handled as one experintal issue EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt -CONF_DIR=exp1B +CONF_DIR=exp1B_cc1_asc_ppl ENV_DIR=${EXPERIMENT_DIR}/environment SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt diff --git a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-1.7b.sh b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-1.7b.sh index abb9821..7f9dbc9 100644 --- a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-1.7b.sh +++ b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-1.7b.sh @@ -1,5 +1,5 @@ #!/bin/bash -#SBATCH --job-name=0022_1.7b-hugh-qaulity-cpt-exp1B_cc1_asc_ppl +#SBATCH --job-name=0022_1.7b-hugh-qaulity-cpt-exp1B_cc1_desc_ppl #SBATCH --partition=gpu-small #SBATCH --nodes=4 #SBATCH --gpus-per-node=8 @@ -11,7 +11,7 @@ set -eu -o pipefail # change directory if each experiment will be handled as one experintal issue EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt -CONF_DIR=exp1B_cc1_asc_ppl +CONF_DIR=exp1B_cc1_desc_ppl ENV_DIR=${EXPERIMENT_DIR}/environment SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt diff --git a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-13b.sh b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-13b.sh index 2282259..2604d40 100644 --- a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-13b.sh +++ b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-13b.sh @@ -1,5 +1,5 @@ #!/bin/bash -#SBATCH --job-name=0022_13b-hugh-qaulity-cpt-exp1b +#SBATCH --job-name=0022_13b-hugh-qaulity-cpt-exp1B_cc1_desc_ppl #SBATCH --partition=gpu-small #SBATCH --nodes=8 #SBATCH --gpus-per-node=8 @@ -11,7 +11,7 @@ set -eu -o pipefail # change directory if each experiment will be handled as one experintal issue EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt -CONF_DIR=exp1B +CONF_DIR=exp1B_cc1_desc_ppl ENV_DIR=${EXPERIMENT_DIR}/environment SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt diff --git a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-3.7b.sh b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-3.7b.sh index 4dbadf5..42a8964 100644 --- a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-3.7b.sh +++ b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/sbatch-3.7b.sh @@ -1,5 +1,5 @@ #!/bin/bash -#SBATCH --job-name=0022_3.7b-hugh-qaulity-cpt-exp1b +#SBATCH --job-name=0022_3.7b-hugh-qaulity-cpt-exp1B_cc1_desc_ppl #SBATCH --partition=gpu-small #SBATCH --nodes=4 #SBATCH --gpus-per-node=8 @@ -11,7 +11,7 @@ set -eu -o pipefail # change directory if each experiment will be handled as one experintal issue EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt -CONF_DIR=exp1B +CONF_DIR=exp1B_cc1_desc_ppl ENV_DIR=${EXPERIMENT_DIR}/environment SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt From 1d967ba1c77fdddeae5974fa701d51dd5d30306a Mon Sep 17 00:00:00 2001 From: Yuma Tsuta Date: Mon, 2 Dec 2024 17:38:54 +0900 Subject: [PATCH 3/5] Fix file path --- .../v3-high-quality-cpt/exp1B_cc1_asc_ppl/data_config.yaml | 2 +- .../v3-high-quality-cpt/exp1B_cc1_desc_ppl/data_config.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/data_config.yaml b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/data_config.yaml index 2129a00..b4b05a2 100644 --- a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/data_config.yaml +++ b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_asc_ppl/data_config.yaml @@ -11,7 +11,7 @@ datasets: file: "2024_0410_en.sakura_home.csv" repeat: 0.1014 ja_cc1: - file: "/home/shared/experiments/0022_v3-high-quality-cpt/corpus/llm-jp-corpus/v3.0.0/training_resharded_sorted_tokenize_ver3.0/ppl_desc/token_info.csv" + file: "/home/shared/experiments/0022_v3-high-quality-cpt/corpus/llm-jp-corpus/v3.0.0/training_resharded_sorted_tokenize_ver3.0/ppl_asc/token_info.csv" filter: - train/ja/cc-1 repeat: 0.4318 diff --git a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/data_config.yaml b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/data_config.yaml index b4b05a2..2129a00 100644 --- a/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/data_config.yaml +++ b/pretrain/scripts/v3-high-quality-cpt/exp1B_cc1_desc_ppl/data_config.yaml @@ -11,7 +11,7 @@ datasets: file: "2024_0410_en.sakura_home.csv" repeat: 0.1014 ja_cc1: - file: "/home/shared/experiments/0022_v3-high-quality-cpt/corpus/llm-jp-corpus/v3.0.0/training_resharded_sorted_tokenize_ver3.0/ppl_asc/token_info.csv" + file: "/home/shared/experiments/0022_v3-high-quality-cpt/corpus/llm-jp-corpus/v3.0.0/training_resharded_sorted_tokenize_ver3.0/ppl_desc/token_info.csv" filter: - train/ja/cc-1 repeat: 0.4318 From f93b0aa8e5f878e54c36198e9fd3d77a5781d606 Mon Sep 17 00:00:00 2001 From: Yuma Tsuta Date: Mon, 9 Dec 2024 16:11:55 +0900 Subject: [PATCH 4/5] Add exp4a scripts --- .../exp4A/data_config.yaml | 30 +++++++++++++ .../v3-high-quality-cpt/exp4A/sbatch-1.7b.sh | 45 +++++++++++++++++++ .../v3-high-quality-cpt/exp4A/sbatch-13b.sh | 45 +++++++++++++++++++ .../v3-high-quality-cpt/exp4A/sbatch-3.7b.sh | 45 +++++++++++++++++++ 4 files changed, 165 insertions(+) create mode 100644 pretrain/scripts/v3-high-quality-cpt/exp4A/data_config.yaml create mode 100644 pretrain/scripts/v3-high-quality-cpt/exp4A/sbatch-1.7b.sh create mode 100644 pretrain/scripts/v3-high-quality-cpt/exp4A/sbatch-13b.sh create mode 100644 pretrain/scripts/v3-high-quality-cpt/exp4A/sbatch-3.7b.sh diff --git a/pretrain/scripts/v3-high-quality-cpt/exp4A/data_config.yaml b/pretrain/scripts/v3-high-quality-cpt/exp4A/data_config.yaml new file mode 100644 index 0000000..17f9bd5 --- /dev/null +++ b/pretrain/scripts/v3-high-quality-cpt/exp4A/data_config.yaml @@ -0,0 +1,30 @@ +common: + v3_0_info_root: "/home/shared/corpus/llm-jp-corpus/v3.0.0/training_resharded_tokenize_ver3.0/token_info" + v3_1_info_root: "/home/shared/corpus/llm-jp-corpus/v3.1.0/tokenize/v3.0b1/token_info" + +datasets: + en: + basedir: "v3_0_info_root" + file: "2024_0410_en.sakura_home.csv" + repeat: 0.1658 + ja_v3_1_pdf00: + basedir: "v3_1_info_root" + file: "2024_0718_ja_train2.sakura_home.csv" + filter: + - "train2/ja/warp-pdf-e00" + repeat: 0.1043 + ja_v3_1_pdf02: + basedir: "v3_1_info_root" + file: "2024_0718_ja_train2.sakura_home.csv" + filter: + - "train2/ja/warp-pdf-e02" + repeat: 0.0522 + ja_other: + basedir: "v3_0_info_root" + file: "2024_0410_ja.sakura_home.csv" + filter: + - train/ja/cc + - train/ja/kaken + - train/ja/warp-html + - train/ja/wiki + repeat: 0.1043 \ No newline at end of file diff --git a/pretrain/scripts/v3-high-quality-cpt/exp4A/sbatch-1.7b.sh b/pretrain/scripts/v3-high-quality-cpt/exp4A/sbatch-1.7b.sh new file mode 100644 index 0000000..1f8b336 --- /dev/null +++ b/pretrain/scripts/v3-high-quality-cpt/exp4A/sbatch-1.7b.sh @@ -0,0 +1,45 @@ +#!/bin/bash +#SBATCH --job-name=0022_1.7b-hugh-qaulity-cpt-exp4a +#SBATCH --partition=gpu-small +#SBATCH --nodes=4 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --output=outputs/%x-%j.out +#SBATCH --error=outputs/%x-%j.err + +set -eu -o pipefail + +# change directory if each experiment will be handled as one experintal issue +EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt +CONF_DIR=exp4A + +ENV_DIR=${EXPERIMENT_DIR}/environment +SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt/ + +source ${ENV_DIR}/scripts/environment.sh +source ${ENV_DIR}/venv/bin/activate + +export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1) +export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +NUM_NODES=$SLURM_JOB_NUM_NODES +NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1) +NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE)) + +echo NUM_NODES=$NUM_NODES +echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE +echo NUM_GPUS=$NUM_GPUS + +mpirun \ + -np $NUM_GPUS \ + --npernode $NUM_GPUS_PER_NODE \ + -bind-to none \ + -map-by slot \ + -x EXPERIMENT_DIR=$EXPERIMENT_DIR \ + -x SCRIPT_ROOT=$SCRIPT_ROOT \ + -x CONF_DIR=$CONF_DIR \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + bash ${SCRIPT_ROOT}/train-1.7b.sh diff --git a/pretrain/scripts/v3-high-quality-cpt/exp4A/sbatch-13b.sh b/pretrain/scripts/v3-high-quality-cpt/exp4A/sbatch-13b.sh new file mode 100644 index 0000000..5977416 --- /dev/null +++ b/pretrain/scripts/v3-high-quality-cpt/exp4A/sbatch-13b.sh @@ -0,0 +1,45 @@ +#!/bin/bash +#SBATCH --job-name=0022_13b-hugh-qaulity-cpt-exp4a +#SBATCH --partition=gpu-small +#SBATCH --nodes=8 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --output=outputs/%x-%j.out +#SBATCH --error=outputs/%x-%j.err + +set -eu -o pipefail + +# change directory if each experiment will be handled as one experintal issue +EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt +CONF_DIR=exp4A + +ENV_DIR=${EXPERIMENT_DIR}/environment +SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt/ + +source ${ENV_DIR}/scripts/environment.sh +source ${ENV_DIR}/venv/bin/activate + +export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1) +export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +NUM_NODES=$SLURM_JOB_NUM_NODES +NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1) +NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE)) + +echo NUM_NODES=$NUM_NODES +echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE +echo NUM_GPUS=$NUM_GPUS + +mpirun \ + -np $NUM_GPUS \ + --npernode $NUM_GPUS_PER_NODE \ + -bind-to none \ + -map-by slot \ + -x EXPERIMENT_DIR=$EXPERIMENT_DIR \ + -x SCRIPT_ROOT=$SCRIPT_ROOT \ + -x CONF_DIR=$CONF_DIR \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + bash ${SCRIPT_ROOT}/train-13b.sh diff --git a/pretrain/scripts/v3-high-quality-cpt/exp4A/sbatch-3.7b.sh b/pretrain/scripts/v3-high-quality-cpt/exp4A/sbatch-3.7b.sh new file mode 100644 index 0000000..b012c69 --- /dev/null +++ b/pretrain/scripts/v3-high-quality-cpt/exp4A/sbatch-3.7b.sh @@ -0,0 +1,45 @@ +#!/bin/bash +#SBATCH --job-name=0022_3.7b-hugh-qaulity-cpt-exp4a +#SBATCH --partition=gpu-small +#SBATCH --nodes=4 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --output=outputs/%x-%j.out +#SBATCH --error=outputs/%x-%j.err + +set -eu -o pipefail + +# change directory if each experiment will be handled as one experintal issue +EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt +CONF_DIR=exp4A + +ENV_DIR=${EXPERIMENT_DIR}/environment +SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt/ + +source ${ENV_DIR}/scripts/environment.sh +source ${ENV_DIR}/venv/bin/activate + +export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1) +export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +NUM_NODES=$SLURM_JOB_NUM_NODES +NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1) +NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE)) + +echo NUM_NODES=$NUM_NODES +echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE +echo NUM_GPUS=$NUM_GPUS + +mpirun \ + -np $NUM_GPUS \ + --npernode $NUM_GPUS_PER_NODE \ + -bind-to none \ + -map-by slot \ + -x EXPERIMENT_DIR=$EXPERIMENT_DIR \ + -x SCRIPT_ROOT=$SCRIPT_ROOT \ + -x CONF_DIR=$CONF_DIR \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + bash ${SCRIPT_ROOT}/train-3.7b.sh From 4b60d61296f54f99a1d98d30d56de479e60ad93a Mon Sep 17 00:00:00 2001 From: Yuma Tsuta Date: Mon, 9 Dec 2024 16:14:33 +0900 Subject: [PATCH 5/5] fix --- .../scripts/v3-high-quality-cpt/exp4A/data_config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pretrain/scripts/v3-high-quality-cpt/exp4A/data_config.yaml b/pretrain/scripts/v3-high-quality-cpt/exp4A/data_config.yaml index 17f9bd5..a08405b 100644 --- a/pretrain/scripts/v3-high-quality-cpt/exp4A/data_config.yaml +++ b/pretrain/scripts/v3-high-quality-cpt/exp4A/data_config.yaml @@ -10,21 +10,21 @@ datasets: ja_v3_1_pdf00: basedir: "v3_1_info_root" file: "2024_0718_ja_train2.sakura_home.csv" - filter: + filter: - "train2/ja/warp-pdf-e00" repeat: 0.1043 ja_v3_1_pdf02: basedir: "v3_1_info_root" file: "2024_0718_ja_train2.sakura_home.csv" - filter: + filter: - "train2/ja/warp-pdf-e02" repeat: 0.0522 ja_other: basedir: "v3_0_info_root" file: "2024_0410_ja.sakura_home.csv" - filter: + filter: - train/ja/cc - train/ja/kaken - train/ja/warp-html - train/ja/wiki - repeat: 0.1043 \ No newline at end of file + repeat: 0.1043