Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix aiaccel/hpo/apps/optimize.py. #414

Open
wants to merge 6 commits into
base: develop/v2
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 3 additions & 8 deletions aiaccel/hpo/apps/optimize.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,19 +147,16 @@ def main() -> None:

result_filename_template = "{job.cwd}/{job.job_name}_result.pkl"

finished_job_count = 0

while finished_job_count < config.n_trials:
n_running_jobs = len(jobs.get_running_jobs())
n_max_jobs = min(jobs.available_slots(), config.n_trials - finished_job_count - n_running_jobs)
while jobs.get_finished_job_count() < config.n_trials:
n_max_jobs = min(jobs.available_slots(), config.n_trials - jobs.get_submitted_job_count())
for _ in range(n_max_jobs):
trial = study.ask()

hparams = params.suggest_hparams(trial)

jobs.job_name = str(jobs.job_filename) + f"_{trial.number}"

job = jobs.submit(
job = jobs.submit_wrapper(
args=[result_filename_template] + sum([[f"--{k}", f"{v:.5f}"] for k, v in hparams.items()], []),
tag=trial,
)
Expand All @@ -172,8 +169,6 @@ def main() -> None:

study.tell(trial, y)

finished_job_count += 1


if __name__ == "__main__":
main()
19 changes: 19 additions & 0 deletions aiaccel/hpo/job_executors/base_job_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ def __init__(

self.job_list: list[Any] = []

self.submitted_job_count = 0
self.finished_job_count = 0

@abstractmethod
def submit(
self,
Expand All @@ -61,6 +64,15 @@ def submit(
"""
pass

def submit_wrapper(
    self,
    args: list[str],
    tag: Any = None,
    sleep_time: float = 5.0,
) -> Any:
    """
    Submits a job via ``submit`` and records it in ``submitted_job_count``.

    Args:
        args (list[str]): Arguments forwarded unchanged to ``submit``.
        tag (Any, optional): Forwarded unchanged to ``submit``. Defaults to None.
        sleep_time (float, optional): Forwarded unchanged to ``submit``. Defaults to 5.0.

    Returns:
        Any: Whatever ``submit`` returns, so callers can write
        ``job = executor.submit_wrapper(...)``.
    """
    job = self.submit(args, tag, sleep_time)
    # Count only after submit() has returned: if it raises, the job was never
    # submitted and must not be subtracted from the remaining trial budget.
    self.submitted_job_count += 1
    return job

def update_status_batch(self) -> None:
"""
Updates the status of a batch of jobs.
Expand All @@ -87,6 +99,7 @@ def collect_finished(self) -> list[Any]:
finished_jobs = [job for job in self.job_list if job.status >= JobStatus.FINISHED]
for job in finished_jobs:
self.job_list.remove(job)
self.finished_job_count += 1

return finished_jobs

Expand All @@ -98,3 +111,9 @@ def get_running_jobs(self) -> list[Any]:
list[JobFuture]: A list of running jobs.
"""
return [job for job in self.job_list if job.status == JobStatus.RUNNING]

def get_submitted_job_count(self) -> int:
    """
    Retrieves the submitted-job counter.

    Returns:
        int: The current value of ``self.submitted_job_count``.
    """
    n_submitted = self.submitted_job_count
    return n_submitted

def get_finished_job_count(self) -> int:
    """
    Retrieves the finished-job counter.

    Returns:
        int: The current value of ``self.finished_job_count``.
    """
    n_finished = self.finished_job_count
    return n_finished
Loading