# config.yaml (forked from benlevyx/florabert)
# Variables for train-test-split
TRAIN_SIZE: 0.83
# General parameters
max_len: 1000
num_tissues: 8
expressed_threshold: 0.1
random_seed: 766
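# Settings for building DNABERT-style inputs (k-mer size 6, sequences capped
# at 512 tokens); presumably used by the sequence-preprocessing scripts.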
dnabert:
  max_seq_len: 512
  kmer: 6
  test_size: 0.2
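# Vocabulary size for the sequence tokenizer (presumably the BPE tokenizer
# trained on the promoter sequences).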
tokenizer:
  vocab_size: 5000
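# Dataset settings: 20% of samples held out for testing and 8 expression
# labels, matching num_tissues above.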
data:
  max_seq_len: 1000
  test_size: 0.2
  num_labels: 8
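# Hugging Face-style training arguments for the two stages: masked-language-model
# pretraining and expression-prediction finetuning.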
training:
  pretrain:
    num_train_epochs: 3
    per_device_train_batch_size: 64
    per_device_eval_batch_size: 64
    fp16: true
    # fp16_full_eval: true
    # bf16: true
    logging_steps: 50
    eval_steps: 200
    save_steps: 400
    save_total_limit: 20
    gradient_accumulation_steps: 25
    learning_rate: 1.e-4
    weight_decay: 0
    adam_epsilon: 1.e-8
    max_grad_norm: 10
    warmup_steps: 50
    optimizer: "lamb"
    scheduler: "linear"
    mlm_prob: 0.15
    # predict_with_generate: true # Add this line with the desired value
    # eval_accumulation_steps: 1 # Add this line with the desired value
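  # Finetuning stage; betas/eps are optimizer hyperparameters, and the
  # commented-out keys appear to be alternative settings left from experimentation.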
  finetune:
    num_train_epochs: 3
    per_device_train_batch_size: 64
    per_device_eval_batch_size: 8
    # fp16: true
    # fp16_eval: true
    logging_steps: 50
    eval_steps: 300
    save_steps: 300
    save_total_limit: 10
    # gradient_accumulation_steps: 1
    # gradient_accumulation_steps: 10
    # eval_accumulation_steps: 64
    learning_rate: 1.e-3
    # learning_rate: 1.e-1
    # lr: 1.e-3
    betas:
      - 0.9
      - 0.999
    eps: 1.e-8
    weight_decay: 0
    adam_epsilon: 1.e-8
    max_grad_norm: 10
    warmup_steps: 200
    # num_cooldown_steps: 2000
    optimizer: "lamb"
    # optimizer: "adamw"
    # scheduler: "delay"
    scheduler: "constant"
    # num_param_groups: 0
    # param_group_size: 2 # Except for the classification head, which has param_group_size == 1
    delay_size: 0
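# Per-model architecture overrides; the RoBERTa-style and DNABERT-style
# backbones use different block_size / max_tokenized_len limits, and the
# *-pred-mean-pool entries configure the regression heads.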
models:
  roberta-base:
    num_attention_heads: 6
    num_hidden_layers: 6
    type_vocab_size: 1
    block_size: 258
    max_tokenized_len: 256
  roberta-lm: {}
  roberta-pred: {}
  roberta-pred-mean-pool:
    hidden_dropout_prob: 0.2
    output_mode: "regression"
    # For sparse (bce + mse) loss
    # output_mode: "sparse"
    threshold: 1
    alpha: 0.1
  dnabert-base:
    block_size: 512
    max_tokenized_len: 510
  dnabert-lm: {}
  dnabert-pred: {}
  dnabert-pred-mean-pool:
    hidden_dropout_prob: 0.2
    output_mode: "regression"