import os
import pathlib
import subprocess


def generate_parallel_evaluation_from_ckpt_job_scripts(
    log_n_samples: int,
    log_n_features: int,
    n_classes: int,
    n_trees: int,
    data_seed: int,
    model_seed: int,
    output_path: pathlib.Path,
    checkpoint_path: pathlib.Path,
    submit: bool = False,
) -> None:
    """
    Generate the job scripts for the inference flavor comparison experiments.

    These experiments correspond to a weak scaling series with a shared global model.

    NOTE: We estimated that 1500 and 450 trees can be trained serially within three days for 1M samples with 10k
    features and 10M samples with 1k features, respectively, and chose the closest numbers evenly divisible by 64 as
    baselines. With number of samples n, number of features m, and number of trees t:

    Strong scaling:
        n6m4 baseline: (n, m, t) = (10^6, 10^4, 1600); n7m3 baseline: (n, m, t) = (10^7, 10^3, 448)

    Weak scaling:
        n6m4 baseline: (n, m, t) = (10^6, 10^4, 800); n7m3 baseline: (n, m, t) = (10^7, 10^3, 224)

    NOTE: All strong-scaling experiments used high-memory nodes, i.e., #SBATCH --mem=486400mb, except for the 64-node
    experiment, which used normal nodes because HoreKa has only 32 high-memory nodes. As the problem size per node
    decreases with the number of nodes in strong scaling, the smaller memory was not a problem there, but it was for
    weak scaling. That is why the base problem size of weak scaling is only half that of strong scaling.

    Parameters
    ----------
    log_n_samples : int
        The common logarithm of the number of samples to use.
    log_n_features : int
        The common logarithm of the number of features to use.
    n_classes : int
        The number of classes in the synthetic classification dataset.
    n_trees : int
        The number of trees to use in the baseline (scaled up with the number of nodes).
    data_seed : int
        The random state used for synthetic dataset generation and splitting.
    model_seed : int
        The (base) random state used for initializing the (distributed) model.
    output_path : pathlib.Path
        The path to save the generated job scripts to.
    checkpoint_path : pathlib.Path
        The path to load the pickled model checkpoints from.
    submit : bool, optional
        Whether to submit the jobs to the cluster. Default is False.
    """
    for n_nodes in [
        2,
        4,
        8,
        16,
        32,
    ]:  # Weak scaling type experiment (with shared global model)
        n_trees_global = (
            n_trees * n_nodes
        )  # The number of trees is scaled with the number of nodes.
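        # For example, with the n6m4 weak-scaling baseline of 800 trees, the global model
        # grows from 2 * 800 = 1600 trees on 2 nodes to 32 * 800 = 25600 trees on 32 nodes,
        # keeping the per-node workload roughly constant.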
        time = 120  # All experiments should take approx. the same time (in min).
        mem = 243200  # Use standard nodes.

        print(
            f"Current config uses {n_nodes} nodes and {n_trees_global} trees. Wall-clock time is {time / 60}h."
        )

        job_name = (
            f"n{log_n_samples}_m{log_n_features}_nodes_{n_nodes}_modelseed_{model_seed}"
        )
        job_script_name = f"{job_name}.sh"
        script_content = f"""#!/bin/bash
#SBATCH --job-name={job_name}     # Job name
#SBATCH --partition=cpuonly       # Queue for resource allocation
#SBATCH --time={time}             # Wall-clock time limit
#SBATCH --mem={mem}mb             # Main memory
#SBATCH --cpus-per-task=76        # Number of CPUs required per (MPI) task
#SBATCH --mail-type=ALL           # Notify user by email when certain event types occur.
#SBATCH --nodes={n_nodes}         # Number of nodes
#SBATCH --ntasks-per-node=1       # One MPI rank per node
#SBATCH --exclude=hkn[0249-0251,0257,0259]  # Exclude broken nodes

# Overwrite the base directory by running export BASE_DIR="/some/alternative/path/here" before submitting the job.
BASE_DIR=${{BASE_DIR:-/hkfs/work/workspace/scratch/ku4408-SpecialCouscous}}

export OMP_NUM_THREADS=${{SLURM_CPUS_PER_TASK}}
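# NOTE: With one task per node (--ntasks-per-node=1), each MPI rank thus uses all
# 76 CPUs of its node for OpenMP threading.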

ml purge                 # Unload all currently loaded modules.
ml load compiler/gnu     # Load required modules.
ml load mpi/openmpi/4.1
source "${{BASE_DIR}}"/special-couscous-venv-openmpi4/bin/activate  # Activate venv.

SCRIPT="special-couscous/scripts/examples/evaluate_from_checkpoint_parallel_synthetic.py"

RESDIR="${{BASE_DIR}}"/results/inference_flavor/train/n{log_n_samples}_m{log_n_features}/nodes_{n_nodes}/${{SLURM_JOB_ID}}_{data_seed}_{model_seed}/
mkdir -p "${{RESDIR}}"
cd "${{RESDIR}}" || exit

srun python -u ${{BASE_DIR}}/${{SCRIPT}} \\
    --n_samples {10**log_n_samples} \\
    --n_features {10**log_n_features} \\
    --n_classes {n_classes} \\
    --n_trees {n_trees_global} \\
    --random_state {data_seed} \\
    --random_state_model {model_seed} \\
    --checkpoint_path {checkpoint_path} \\
    --output_dir ${{RESDIR}} \\
    --output_label ${{SLURM_JOB_ID}} \\
    --detailed_evaluation
"""

        with open(output_path / job_script_name, "wt") as f:
            f.write(script_content)
        if submit:
            subprocess.run(f"sbatch {output_path}/{job_script_name}", shell=True)
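            # NOTE: An argument list without a shell, e.g.,
            # subprocess.run(["sbatch", str(output_path / job_script_name)], check=True),
            # would avoid shell quoting and surface submission failures as exceptions.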


if __name__ == "__main__":
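    # Each entry: (log_n_samples, log_n_features, baseline n_trees, checkpoint paths to evaluate).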
    data_sets = [
        (
            6,
            4,
            800,
            [
                pathlib.Path(
                    "/hkfs/work/workspace/scratch/ku4408-SpecialCouscous/results/train/n6_m4/nodes_64/2655630_0_2"  # pragma: allowlist secret
                ),
                pathlib.Path(
                    "/hkfs/work/workspace/scratch/ku4408-SpecialCouscous/results/train/n6_m4/nodes_64/2655632_0_3"  # pragma: allowlist secret
                ),
            ],
        ),
        (
            7,
            3,
            224,
            [
                pathlib.Path(
                    "/hkfs/work/workspace/scratch/ku4408-SpecialCouscous/results/train/n7_m3/nodes_64/2655631_0_2"  # pragma: allowlist secret
                ),
                pathlib.Path(
                    "/hkfs/work/workspace/scratch/ku4408-SpecialCouscous/results/train/n7_m3/nodes_64/2655633_0_3"  # pragma: allowlist secret
                ),
            ],
        ),
    ]
    data_seed = 0
    model_seeds = [2, 3]
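    # Checkpoint directories are named <job_id>_<data_seed>_<model_seed> (cf. RESDIR in the
    # generated job script), so model seeds 2 and 3 pair with the *_0_2 and *_0_3
    # checkpoints via the zip below.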
    n_classes = 10
    output_path = pathlib.Path("./evaluation/")
    os.makedirs(output_path, exist_ok=True)
    for log_n_samples, log_n_features, n_trees, checkpoint_paths in data_sets:
        for random_state_model, checkpoint_path in zip(model_seeds, checkpoint_paths):
            # Generate the job scripts and possibly submit them to the cluster.
            generate_parallel_evaluation_from_ckpt_job_scripts(
                log_n_samples=log_n_samples,
                log_n_features=log_n_features,
                n_trees=n_trees,
                n_classes=n_classes,
                data_seed=data_seed,
                model_seed=random_state_model,
                output_path=output_path,
                checkpoint_path=checkpoint_path,
                submit=False,
            )