-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain_daemon.py
43 lines (30 loc) · 999 Bytes
/
train_daemon.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import os
import time
# Directory that receives the per-attempt log files (log_<trial>.txt).
log_root = './log'
# Directory holding the latest.pth checkpoint that is passed to --resume.
checkpoint_root = './checkpoint'
def submmit_training(full_command, trial):
    """Run one training attempt through the system shell.

    Prints a banner showing the command and the attempt number, blocks until
    the command has finished, and hands back the bumped attempt counter.

    Args:
        full_command: Shell command line, passed verbatim to ``os.system``.
        trial: Counter of the current attempt; only used in the banner here.

    Returns:
        A ``(flag, trial)`` tuple. ``flag`` is whatever ``os.system`` returned
        (0 when the command succeeded) and ``trial`` is the input plus one.
    """
    print("\n"+"="*40)
    print("[Command]")
    print(f"{full_command}\n")
    print("[Infor]")
    # Flush explicitly: os.system() blocks for the whole training run, and
    # when stdout is a file or pipe the banner would otherwise sit in the
    # buffer until the child process has exited.
    print(f"Submitting a new trainning program at {trial}th trail...\n", flush=True)
    flag = os.system(full_command)
    trial += 1
    return flag, trial
# Daemon entry point: keep (re)launching ./train.py until one run returns a
# zero status. Every attempt is given --resume pointing at latest.pth and has
# its stdout redirected to a per-attempt log file.
base_command = 'python -u ./train.py'
trial = 0
flag = 1  # non-zero so the loop body runs at least once
try_interval = 0.5  # should be non-zero, OR the daemon itself couldn't be shut down by CTRL+C

# The shell redirection used below cannot create missing directories. Without
# this, a missing log directory makes every attempt fail before train.py even
# starts, and the loop would retry forever.
os.makedirs(log_root, exist_ok=True)

while flag:
    # command compile
    command_ls = [base_command, ]
    resume_command = f'--resume {checkpoint_root}/latest.pth'
    command_ls.append(resume_command)
    log_command = f'> {log_root}/log_{trial}.txt'
    command_ls.append(log_command)
    # Pieces are joined with " \<newline>" so the printed command is readable;
    # a POSIX shell treats backslash-newline as a line continuation.
    full_command = " \\\n".join(command_ls)

    # training submit
    time.sleep(try_interval)
    flag, trial = submmit_training(full_command, trial)

print("training over, shut daemon")