pretrain_tdvae.sh
#!/bin/bash
#SBATCH --job-name=tdvae_pretrain
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=32 # number of cpu cores per task
#SBATCH --gres=gpu:1 # number of gpus
#SBATCH -o xxx # stdout log path (placeholder)
#SBATCH -e xxx # stderr log path (placeholder)
# SBATCH --requeue
# SBATCH --qos=preemptive
set -x -e
ulimit -s unlimited
echo "START TIME: $(date)"
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
# export MASTER_ADDR=127.0.0.1
export MASTER_PORT=$((RANDOM % 10000 + 50000)) # random port in [50000, 59999]
MICRO_BATCH_SIZE=6
ZERO_STAGE=0
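# NOTE: stage 0 keeps optimizer and gradient states unpartitioned (plain data
# parallelism). Since the config written below is handed to Lightning via
# PL_DEEPSPEED_CONFIG_PATH, this value presumably overrides the stage implied
# by the `deepspeed_stage_2` strategy name in TRAINER_ARGS (assumption, not
# verified against the installed Lightning version).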
ROOT_PATH=xxx # root output directory (placeholder); holds job_out/ and checkpoints/
config_json=${ROOT_PATH}/job_out/ds_config.json
# DeepSpeed figures out gradient accumulation steps (GAS) dynamically from the global batch size (GBS) via set_train_batch_size()
cat <<EOT > $config_json
{
  "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE},
  "steps_per_print": 100,
  "gradient_clipping": 1.0,
  "zero_optimization": {
    "stage": $ZERO_STAGE,
    "contiguous_gradients": false,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 50000000,
    "allgather_bucket_size": 500000000
  },
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 1e-5,
      "betas": [0.9, 0.95],
      "eps": 1e-8,
      "weight_decay": 1e-2
    }
  },
  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": 5e-6,
      "warmup_max_lr": 1e-5
    }
  },
  "zero_allow_untested_optimizer": false,
  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "activation_checkpointing": {
    "partition_activations": false,
    "contiguous_memory_optimization": false
  },
  "wall_clock_breakdown": false
}
EOT
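# Optional sanity check: fail fast (set -e is active) if the generated
# DeepSpeed config is not valid JSON before launching training.
python -m json.tool "$config_json" > /dev/null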
export PL_DEEPSPEED_CONFIG_PATH=$config_json
export TORCH_EXTENSIONS_DIR=/tmp
# NOTE both encoder and decoder use the same model
ENCODER_MODEL_PATH=/cognitive_comp/wanghao/models/gpt2-base
DECODER_MODEL_PATH=/cognitive_comp/wanghao/models/gpt2-base
VAE_ARGS="
--encoder_model_path $ENCODER_MODEL_PATH \
--decoder_model_path $DECODER_MODEL_PATH \
--latent_dim 256 \
--max_split_num 12 \
--beta_infer_belief 1 \
--beta_belief_predict 1e-1 \
--beta_kl_constraints 1e-2 \
--beta_n_cycles 30 \
--freebit_infer_belief 0 \
--freebit_belief_predict 0 \
--freebit_kl_constraints 1 \
"
#--checkpoint_path xxx
CHECKPOINT_SAVE_PATH=${ROOT_PATH}/checkpoints
MODEL_CHECKPOINT_ARGS="\
--monitor total_loss \
--save_top_k -1 \
--mode min \
--every_n_train_steps 5000 \
--save_weights_only True \
--dirpath $CHECKPOINT_SAVE_PATH \
--filename checkpoint-{epoch}-{step}-dim_256_sample_all_recon_beta_belief_predict_1em1 \
"
TRAINER_ARGS="
--max_epochs 10 \
--gpus 1 \
--num_nodes 1 \
--precision 16 \
--val_check_interval 2500 \
--learning_rate 1.0e-5 \
--warmup_steps 10000 \
--weight_decay 0.01 \
--default_root_dir ${ROOT_PATH} \
--log_every_n_steps 50 \
--strategy deepspeed_stage_2 \
"
# --strategy deepspeed_stage_2 \
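# With the values above the global batch size per optimizer step is
# micro_batch x gpus x num_nodes (x any gradient accumulation DeepSpeed
# derives): 6 x 1 x 1 = 6.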
DATA_ARGS="
--train_batchsize $MICRO_BATCH_SIZE \
--eval_batchsize $MICRO_BATCH_SIZE \
--test_batchsize $MICRO_BATCH_SIZE \
--num_workers 32 \
--ds_name wudao_tdvae \
"
SCRIPTS_PATH=xxxx # directory containing pretrain_tdvae.py (placeholder)
export CMD=" \
$SCRIPTS_PATH/pretrain_tdvae.py \
$TRAINER_ARGS \
$MODEL_CHECKPOINT_ARGS \
$VAE_ARGS \
$DATA_ARGS \
"
# srun python $CMD
# python -m debugpy --listen 5678 --wait-for-client $CMD
python $CMD
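# Optional: mirror the START TIME marker above so total wall-clock time is
# visible in the job log.
echo "END TIME: $(date)"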