
Commit 9af2f9d

committed
add job script generation script for parallel evaluation from pickled model checkpoints
1 parent ae8a056 commit 9af2f9d

1 file changed: +177 -0 lines changed

@@ -0,0 +1,177 @@
import os
import pathlib
import subprocess


def generate_parallel_evaluation_from_ckpt_job_scripts(
    log_n_samples: int,
    log_n_features: int,
    n_classes: int,
    n_trees: int,
    data_seed: int,
    model_seed: int,
    output_path: pathlib.Path,
    checkpoint_path: pathlib.Path,
    submit: bool = False,
) -> None:
    """
    Generate the job scripts for the inference flavor comparison experiments.

    These experiments correspond to a weak scaling experiment series with a shared global model.

    NOTE: We estimated 1500 and 450 trees to be trainable in serial within 3 days for 1M samples with 10k features and
    10M samples with 1k features, respectively, and chose the closest numbers evenly divisible by 64 as baselines.
    With number of samples n, number of features m, and number of trees t:

    Strong scaling:
    n6m4 baseline: (n, m, t) = (10^6, 10^4, 1600) and n7m3 baseline: (n, m, t) = (10^7, 10^3, 448)

    Weak scaling:
    n6m4 baseline: (n, m, t) = (10^6, 10^4, 800) and n7m3 baseline: (n, m, t) = (10^7, 10^3, 224)

    NOTE: All strong-scaling experiments used high-memory nodes, i.e., #SBATCH --mem=486400mb, except for the 64-node
    experiment, which used normal nodes because HoreKa has only 32 high-memory nodes. Since the problem size per node
    decreases with an increasing number of nodes in strong scaling, this was not a problem for strong scaling but only
    for weak scaling. That is why the base problem size of weak scaling is only half that of strong scaling.

    Parameters
    ----------
    log_n_samples : int
        The common logarithm of the number of samples to use.
    log_n_features : int
        The common logarithm of the number of features to use.
    n_classes : int
        The number of classes in the synthetic classification dataset.
    n_trees : int
        The number of trees to use in the baseline (will be scaled up with the number of nodes).
    data_seed : int
        The random state used for synthetic dataset generation and splitting.
    model_seed : int
        The (base) random state used for initializing the (distributed) model.
    output_path : pathlib.Path
        The path to save the generated job scripts.
    checkpoint_path : pathlib.Path
        The path to load the pickled model checkpoints from.
    submit : bool, optional
        Whether to submit jobs to the cluster. Default is False.
    """
    for n_nodes in [
        2,
        4,
        8,
        16,
        32,
    ]:  # Weak scaling type experiment (with shared global model)
        n_trees_global = (
            n_trees * n_nodes
        )  # Number of trees is scaled with the number of nodes.
        time = 120  # All experiments should take approx. the same time (in min).
        mem = 243200  # Use standard nodes.

        print(
            f"Current config uses {n_nodes} nodes and {n_trees_global} trees. Wall-clock time is {time / 60}h."
        )

        job_name = (
            f"n{log_n_samples}_m{log_n_features}_nodes_{n_nodes}_modelseed_{model_seed}"
        )
        job_script_name = f"{job_name}.sh"
        script_content = f"""#!/bin/bash
#SBATCH --job-name={job_name}               # Job name
#SBATCH --partition=cpuonly                 # Queue for resource allocation
#SBATCH --time={time}                       # Wall-clock time limit
#SBATCH --mem={mem}mb                       # Main memory
#SBATCH --cpus-per-task=76                  # Number of CPUs required per (MPI) task
#SBATCH --mail-type=ALL                     # Notify user by email when certain event types occur.
#SBATCH --nodes={n_nodes}                   # Number of nodes
#SBATCH --ntasks-per-node=1                 # One MPI rank per node
#SBATCH --exclude=hkn[0249-0251,0257,0259]  # Exclude broken nodes

# Overwrite base directory by running export BASE_DIR="/some/alternative/path/here" before submitting the job.
BASE_DIR=${{BASE_DIR:-/hkfs/work/workspace/scratch/ku4408-SpecialCouscous}}

export OMP_NUM_THREADS=${{SLURM_CPUS_PER_TASK}}

ml purge              # Unload all currently loaded modules.
ml load compiler/gnu  # Load required modules.
ml load mpi/openmpi/4.1
source "${{BASE_DIR}}"/special-couscous-venv-openmpi4/bin/activate  # Activate venv.

SCRIPT="special-couscous/scripts/examples/evaluate_from_checkpoint_parallel_synthetic.py"

RESDIR="${{BASE_DIR}}"/results/inference_flavor/train/n{log_n_samples}_m{log_n_features}/nodes_{n_nodes}/${{SLURM_JOB_ID}}_{data_seed}_{model_seed}/
mkdir -p "${{RESDIR}}"
cd "${{RESDIR}}" || exit

srun python -u ${{BASE_DIR}}/${{SCRIPT}} \\
    --n_samples {10**log_n_samples} \\
    --n_features {10**log_n_features} \\
    --n_classes {n_classes} \\
    --n_trees {n_trees_global} \\
    --random_state {data_seed} \\
    --random_state_model {model_seed} \\
    --checkpoint_path {checkpoint_path} \\
    --output_dir ${{RESDIR}} \\
    --output_label ${{SLURM_JOB_ID}} \\
    --detailed_evaluation
"""

        with open(output_path / job_script_name, "wt") as f:
            f.write(script_content)
        if submit:
            subprocess.run(f"sbatch {output_path}/{job_script_name}", shell=True)


if __name__ == "__main__":
    data_sets = [
        (
            6,
            4,
            800,
            [
                pathlib.Path(
                    "/hkfs/work/workspace/scratch/ku4408-SpecialCouscous/results/train/n6_m4/nodes_64/2655630_0_2"  # pragma: allowlist secret
                ),
                pathlib.Path(
                    "/hkfs/work/workspace/scratch/ku4408-SpecialCouscous/results/train/n6_m4/nodes_64/2655632_0_3"  # pragma: allowlist secret
                ),
            ],
        ),
        (
            7,
            3,
            224,
            [
                pathlib.Path(
                    "/hkfs/work/workspace/scratch/ku4408-SpecialCouscous/results/train/n7_m3/nodes_64/2655631_0_2"  # pragma: allowlist secret
                ),
                pathlib.Path(
                    "/hkfs/work/workspace/scratch/ku4408-SpecialCouscous/results/train/n7_m3/nodes_64/2655633_0_3"  # pragma: allowlist secret
                ),
            ],
        ),
    ]
    data_seed = 0
    model_seeds = [2, 3]
    n_classes = 10
    output_path = pathlib.Path("./evaluation/")
    os.makedirs(output_path, exist_ok=True)
    for data_set in data_sets:
        log_n_samples = data_set[0]
        log_n_features = data_set[1]
        n_trees = data_set[2]
        checkpoint_paths = data_set[3]
        for random_state_model, checkpoint_path in zip(model_seeds, checkpoint_paths):
            # Generate job scripts and possibly submit them to the cluster.
            generate_parallel_evaluation_from_ckpt_job_scripts(
                log_n_samples=log_n_samples,
                log_n_features=log_n_features,
                n_trees=n_trees,
                n_classes=n_classes,
                data_seed=data_seed,
                model_seed=random_state_model,
                output_path=output_path,
                checkpoint_path=checkpoint_path,
                submit=False,
            )
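For reference, a minimal illustration (not part of the commit) of what the weak-scaling loop above produces for the n6m4 configuration, i.e., a baseline of 800 trees and model seed 2; the tree counts follow from n_trees * n_nodes and the file names from the job_name f-string in the generator.

# Illustration only: configurations and script names generated for the n6m4 case.
for n_nodes in [2, 4, 8, 16, 32]:
    n_trees_global = 800 * n_nodes  # 1600, 3200, 6400, 12800, 25600 trees
    job_script = f"n6_m4_nodes_{n_nodes}_modelseed_2.sh"
    print(f"{job_script}: {n_trees_global} trees across {n_nodes} nodes")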

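Since the __main__ block calls the generator with submit=False, the job scripts land in ./evaluation/ without being queued. A minimal sketch of submitting them afterwards, assuming SLURM's sbatch is available on the submission node; the glob pattern is an assumption, not part of the commit.

# Sketch only: submit all previously generated job scripts by hand.
import pathlib
import subprocess

output_path = pathlib.Path("./evaluation/")
for job_script in sorted(output_path.glob("*.sh")):
    # Mirrors the generator's own submission call, just run after the fact.
    subprocess.run(f"sbatch {job_script}", shell=True)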