-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathmain.py
81 lines (69 loc) · 2.68 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import copy
import os
import resource
import pytorch_lightning as pl
import wandb
from ptunifier.config import ex
from ptunifier.datamodules.multitask_datamodule import MTDataModule
from ptunifier.models import PTUnifierTransformerSS
# Make sure this process may hold at least 4096 open file descriptors.
# `rlimit` keeps the (soft, hard) pair that was in effect before any change.
rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
_soft_nofile, _hard_nofile = rlimit
_wanted_nofile = 4096
if _hard_nofile != resource.RLIM_INFINITY:
    # A soft limit above the hard limit is rejected with ValueError, so clamp
    # the target to whatever the hard limit allows.
    _wanted_nofile = min(_wanted_nofile, _hard_nofile)
# Only ever raise the soft limit: an unlimited or already larger value is
# left untouched instead of being lowered to 4096.
if _soft_nofile != resource.RLIM_INFINITY and _soft_nofile < _wanted_nofile:
    resource.setrlimit(resource.RLIMIT_NOFILE, (_wanted_nofile, _hard_nofile))
@ex.automain
def main(_config):
    """Build the data module, model and trainer from `_config`, then fit and/or test.

    Args:
        _config: Mapping of run settings. Keys read here: "seed", "log_dir",
            "exp_name", "load_path", "num_gpus", "batch_size",
            "per_gpu_batchsize", "num_nodes", "max_steps", "max_epoch",
            "precision", "resume_from", "fast_dev_run", "val_check_interval",
            "default_root_dir" and "test_only". The whole mapping is also
            handed to the data module and the model.

    Behaviour:
        * "test_only" falsy: the model is fitted; if the experiment name
          contains "finetune" a test pass follows the fit.
        * "test_only" truthy: only a test pass on the freshly built model.
    """
    # Everything below works on a deep copy of the incoming config.
    _config = copy.deepcopy(_config)
    pl.seed_everything(_config["seed"])

    # Data modules
    dm = MTDataModule(_config, dist=True)

    # Module
    model = PTUnifierTransformerSS(_config)

    # Loggers
    os.makedirs(_config["log_dir"], exist_ok=True)
    exp_name = f'{_config["exp_name"]}'
    # Slashes in the load path are flattened so the path can be part of the run name.
    run_name = f'{exp_name}-seed{_config["seed"]}-from_{_config["load_path"].replace("/", "_")}'
    wb_logger = pl.loggers.WandbLogger(
        project="PTUnifier",
        name=run_name,
        settings=wandb.Settings(start_method="fork"),
    )
    loggers = wb_logger

    # Callback
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        save_top_k=1,
        verbose=True,
        monitor="val/the_metric",
        mode="max",
        save_last=True,
        # Fine-tuning runs store weights only; other runs store full checkpoints.
        save_weights_only="finetune" in exp_name,
    )
    lr_callback = pl.callbacks.LearningRateMonitor(logging_interval="step")
    callbacks = [checkpoint_callback, lr_callback]

    # Training Hyper-Parameters
    # "num_gpus" is either a device count or a sequence of device ids. The
    # count drives gradient accumulation; an explicit id sequence is forwarded
    # to the trainer unchanged so the requested devices are the ones used
    # (passing only its length would select the first N devices instead).
    gpu_spec = _config["num_gpus"]
    if isinstance(gpu_spec, int):
        devices = gpu_spec
        num_gpus = gpu_spec
    else:
        devices = list(gpu_spec)
        num_gpus = len(devices)
    # Accumulate gradients so that per-device batch * devices * nodes * steps
    # reaches the configured global batch size (never fewer than one step).
    samples_per_step = _config["per_gpu_batchsize"] * num_gpus * _config["num_nodes"]
    grad_steps = max(_config["batch_size"] // samples_per_step, 1)
    # A missing step budget is passed on as -1; when a step budget is given
    # the epoch cap is set to 1000 instead of the configured "max_epoch".
    max_steps = _config["max_steps"] if _config["max_steps"] is not None else -1
    max_epochs = _config["max_epoch"] if max_steps == -1 else 1000

    # Trainer
    trainer = pl.Trainer(
        accelerator="gpu",
        strategy="ddp",
        devices=devices,
        num_nodes=_config["num_nodes"],
        precision=_config["precision"],
        benchmark=True,
        deterministic=True,
        max_epochs=max_epochs,
        max_steps=max_steps,
        callbacks=callbacks,
        logger=loggers,
        replace_sampler_ddp=False,
        accumulate_grad_batches=grad_steps,
        log_every_n_steps=50,
        enable_model_summary=True,
        profiler="simple",
        resume_from_checkpoint=_config["resume_from"],
        fast_dev_run=_config["fast_dev_run"],
        val_check_interval=_config["val_check_interval"],
        default_root_dir=_config["default_root_dir"],
    )

    if not _config["test_only"]:
        trainer.fit(model, datamodule=dm)
        if "finetune" in exp_name:
            # Test on the best checkpoint, except for "irtr" experiments,
            # which pass ckpt_path=None.
            ckpt_path = None if "irtr" in _config["exp_name"] else "best"
            trainer.test(ckpt_path=ckpt_path, datamodule=dm)
    else:
        trainer.test(model, datamodule=dm)