Commit c724407

clean up scripts and comments
1 parent a8e1abb commit c724407

6 files changed: +103 -84 lines changed

scripts/scaling_experiments/strong_scaling/n6_m4/generate_job_scripts.py

Lines changed: 29 additions & 23 deletions
@@ -1,7 +1,7 @@
 import subprocess

 limit = 60 * 24 * 3 # HoreKa wall-clock time limit in minutes
-nodes = [2, 4, 8, 16, 32, 64] # number of nodes for scaling exps
+nodes = [2, 4, 8, 16, 32, 64] # Number of nodes for scaling exps


 def main() -> None:
@@ -15,35 +15,41 @@ def main() -> None:
         job_name = f"n6_m4_strong_{num_nodes}"
         job_script_name = f"{job_name}.sh"
         scriptcontent = f"""#!/bin/bash
-#SBATCH --job-name={job_name} # Job name
-#SBATCH --partition=cpuonly # Queue for resource allocation
-#SBATCH --time={time} # Wall-clock time limit
-#SBATCH --cpus-per-task=76 # Number of CPUs required per (MPI) task
-#SBATCH --mail-type=ALL # Notify user by email when certain event types occur.
-#SBATCH --account=hk-project-test-aihero2
-#SBATCH --nodes={num_nodes} # Number of nodes
-#SBATCH --ntasks-per-node=1 # One MPI rank per node.
-
-
-export OMP_NUM_THREADS=${{SLURM_CPUS_PER_TASK}}
-export PYDIR=/hkfs/work/workspace/scratch/ku4408-special-couscous/special-couscous/specialcouscous
-
-ml purge # Unload all currently loaded modules.
-ml load compiler/gnu # Load required modules.
+#SBATCH --job-name={job_name} # Job name
+#SBATCH --partition=cpuonly # Queue for resource allocation
+#SBATCH --time={time} # Wall-clock time limit
+#SBATCH --cpus-per-task=76 # Number of CPUs required per (MPI) task
+#SBATCH --mail-type=ALL # Notify user by email when certain event types occur.
+#SBATCH --nodes={num_nodes} # Number of nodes
+#SBATCH --ntasks-per-node=1 # One MPI rank per node
+
+# Overwrite base directory by running export BASE_DIR="/some/alternative/path/here" before submitting the job.
+BASE_DIR=${{BASE_DIR:-/hkfs/work/workspace/scratch/ku4408-special-couscous/}}
+
+export OMP_NUM_THREADS=${{SLURM_CPUS_PER_TASK}} # Set number of threads to number of CPUs per task as provided by SLURM.
+export PYDIR=${{BASE_DIR}}/special-couscous/specialcouscous # Set path to Python package directory.
+
+ml purge # Unload all currently loaded modules.
+ml load compiler/gnu # Load required modules.
 ml load mpi/openmpi
-source /hkfs/work/workspace/scratch/ku4408-special-couscous/special-couscous-venv/bin/activate # Activate venv.

+source "${{BASE_DIR}}"/special-couscous-venv/bin/activate # Activate virtual environment.
+
+# Set hyperparameters of synthetic dataset and random forest model.
+# We estimated 1500 trees should be trainable in serial in 3 days
+# and chose the closest number evenly divisible by 64 as a baseline for scaling exps.
 N_SAMPLES=1000000
 N_FEATURES=10000
-N_TREES=1600 # We estimated 1500 trees should be trainable in serial in 3 d and chose the closest number evenly divisble by 64 as a baseline for scaling exps.
+N_TREES=1600

-SCRIPT="rf_scaling_synthetic.py"
+SCRIPT="scripts/examples/rf_parallel_synthetic.py"

-RESDIR=/hkfs/work/workspace/scratch/ku4408-special-couscous/results/strong_scaling/n6_m4_nodes_${{SLURM_NPROCS}}_job_${{SLURM_JOB_ID}}/
-mkdir ${{RESDIR}}
-cd ${{RESDIR}}
+RESDIR=${{BASE_DIR}}/results/strong_scaling/n6_m4_nodes_${{SLURM_NPROCS}}_job_${{SLURM_JOB_ID}}/
+mkdir "${{RESDIR}}"
+cd "${{RESDIR}}" || exit

-srun python -u ${{PYDIR}}/${{SCRIPT}} --n_samples ${{N_SAMPLES}} --n_features ${{N_FEATURES}} --n_trees ${{N_TREES}} --output_dir ${{RESDIR}} --output_label ${{SLURM_JOB_ID}} --detailed_evaluation
+# Run script
+srun python -u ${{PYDIR}}/${{SCRIPT}} --n_samples ${{N_SAMPLES}} --n_features ${{N_FEATURES}} --n_trees ${{N_TREES}} --output_dir ${{RESDIR}} --output_label ${{SLURM_JOB_ID}} --detailed_evaluation --save_model
 """

         with open(job_script_name, "wt") as f:
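
The recurring change in this commit is the overridable workspace location. As a minimal sketch of the Bash default-value expansion these scripts now rely on (the paths and directory names below are placeholders, not taken from the repo):

# ${VAR:-default} keeps an already-exported value and only falls back to the default if VAR is unset or empty.
BASE_DIR=${BASE_DIR:-/default/workspace/path/}
RESDIR=${BASE_DIR}/results/demo_run/
mkdir -p "${RESDIR}"   # quote expansions so paths with spaces still work
echo "Results will be written to ${RESDIR}"

Exporting BASE_DIR before submission therefore redirects every derived path without editing the generated job script.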

scripts/scaling_experiments/strong_scaling/n7_m3/baseline.sh

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ N_TREES=448
 SCRIPT="scripts/examples/rf_serial_synthetic.py" # Set script name.

 # Create directory to save results to.
-RESDIR=/hkfs/work/workspace/scratch/ku4408-special-couscous/results/single_node_experiments/job_${SLURM_JOB_ID}_n6_m4_baseline/
+RESDIR=${BASE_DIR}/results/single_node_experiments/job_${SLURM_JOB_ID}_n6_m4_baseline/
 mkdir "${RESDIR}"
 cd "${RESDIR}" || exit

scripts/scaling_experiments/strong_scaling/n7_m3/generate_job_scripts.py

Lines changed: 29 additions & 23 deletions
@@ -1,7 +1,7 @@
 import subprocess

 limit = 60 * 24 * 3 # HoreKa wall-clock time limit in minutes
-nodes = [2, 4, 8, 16, 32, 64] # number of nodes for scaling exps
+nodes = [2, 4, 8, 16, 32, 64] # Number of nodes for scaling exps


 def main() -> None:
@@ -15,35 +15,41 @@ def main() -> None:
         job_name = f"n7_m3_strong_{num_nodes}"
         job_script_name = f"{job_name}.sh"
         scriptcontent = f"""#!/bin/bash
-#SBATCH --job-name={job_name} # Job name
-#SBATCH --partition=cpuonly # Queue for resource allocation
-#SBATCH --time={time} # Wall-clock time limit
-#SBATCH --cpus-per-task=76 # Number of CPUs required per (MPI) task
-#SBATCH --mail-type=ALL # Notify user by email when certain event types occur.
-#SBATCH --account=hk-project-test-aihero2
-#SBATCH --nodes={num_nodes} # Number of nodes
-#SBATCH --ntasks-per-node=1 # One MPI rank per node.
-
-
-export OMP_NUM_THREADS=${{SLURM_CPUS_PER_TASK}}
-export PYDIR=/hkfs/work/workspace/scratch/ku4408-special-couscous/special-couscous/specialcouscous
-
-ml purge # Unload all currently loaded modules.
-ml load compiler/gnu # Load required modules.
+#SBATCH --job-name={job_name} # Job name
+#SBATCH --partition=cpuonly # Queue for resource allocation
+#SBATCH --time={time} # Wall-clock time limit
+#SBATCH --cpus-per-task=76 # Number of CPUs required per (MPI) task
+#SBATCH --mail-type=ALL # Notify user by email when certain event types occur.
+#SBATCH --nodes={num_nodes} # Number of nodes
+#SBATCH --ntasks-per-node=1 # One MPI rank per node
+
+# Overwrite base directory by running export BASE_DIR="/some/alternative/path/here" before submitting the job.
+BASE_DIR=${{BASE_DIR:-/hkfs/work/workspace/scratch/ku4408-special-couscous/}}
+
+export OMP_NUM_THREADS=${{SLURM_CPUS_PER_TASK}} # Set number of threads to number of CPUs per task as provided by SLURM.
+export PYDIR=${{BASE_DIR}}/special-couscous/specialcouscous # Set path to Python package directory.
+
+ml purge # Unload all currently loaded modules.
+ml load compiler/gnu # Load required modules.
 ml load mpi/openmpi
-source /hkfs/work/workspace/scratch/ku4408-special-couscous/special-couscous-venv/bin/activate # Activate venv.

+source "${{BASE_DIR}}"/special-couscous-venv/bin/activate # Activate virtual environment.
+
+# Set hyperparameters of synthetic dataset and random forest model.
+# We estimated 450 trees to be trainable in serial in 3 days
+# and chose the closest number evenly divisible by 64 as a baseline for scaling exps.
 N_SAMPLES=10000000
 N_FEATURES=1000
-N_TREES=448 # We estimated 450 trees should be trainable in serial in 3 d and chose the closest number evenly divisble by 64 as a baseline for scaling exps.
+N_TREES=448

-SCRIPT="rf_scaling_synthetic.py"
+SCRIPT="scripts/examples/rf_parallel_synthetic.py"

-RESDIR=/hkfs/work/workspace/scratch/ku4408-special-couscous/results/strong_scaling/n7_m3_nodes_${{SLURM_NPROCS}}_job_${{SLURM_JOB_ID}}/
-mkdir ${{RESDIR}}
-cd ${{RESDIR}}
+RESDIR=${{BASE_DIR}}/results/strong_scaling/n7_m3_nodes_${{SLURM_NPROCS}}_job_${{SLURM_JOB_ID}}/
+mkdir "${{RESDIR}}"
+cd "${{RESDIR}}" || exit

-srun python -u ${{PYDIR}}/${{SCRIPT}} --n_samples ${{N_SAMPLES}} --n_features ${{N_FEATURES}} --n_trees ${{N_TREES}} --output_dir ${{RESDIR}} --output_label ${{SLURM_JOB_ID}} --detailed_evaluation
+# Run script
+srun python -u ${{PYDIR}}/${{SCRIPT}} --n_samples ${{N_SAMPLES}} --n_features ${{N_FEATURES}} --n_trees ${{N_TREES}} --output_dir ${{RESDIR}} --output_label ${{SLURM_JOB_ID}} --detailed_evaluation --save_model
 """

         with open(job_script_name, "wt") as f:

scripts/scaling_experiments/weak_scaling/n6_m4/generate_job_scripts.py

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,7 @@



 def main() -> None:
-    """Generate the n6_m4 job scripts."""
+    """Generate the job scripts for the n6_m4 weak scaling experiments."""
     for num_nodes in nodes:
         num_trees = num_trees_base * num_nodes
         print(f"Current config uses {num_nodes} nodes and {num_trees} trees.")
@@ -36,7 +36,7 @@ def main() -> None:
 N_FEATURES=10000
 N_TREES={num_trees}

-SCRIPT="rf_parallel_synthetic.py"
+SCRIPT="scripts/examples/rf_parallel_synthetic.py"

 RESDIR="${{BASE_DIR}}"/results/weak_scaling/n6_m4_nodes_${{SLURM_NPROCS}}_${{SLURM_JOB_ID}}/
 mkdir "${{RESDIR}}"

Lines changed: 28 additions & 20 deletions
@@ -1,28 +1,36 @@
 #!/bin/bash
-#SBATCH --job-name=n7_m3_base # job name
-#SBATCH --partition=cpuonly # queue for resource allocation
-#SBATCH --mem=501600mb
-#SBATCH --time=3-00:00:00 # wall-clock time limit
-#SBATCH --cpus-per-task=76 # number of CPUs required per (MPI) task
-#SBATCH --mail-type=ALL # Notify user by email when certain event types occur.
-#SBATCH --account=hk-project-test-aihero2
-
-export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}
-export PYDIR=/hkfs/work/workspace/scratch/ku4408-special-couscous/special-couscous/py
-
-ml purge # Unload all currently loaded modules.
-ml load compiler/gnu # Load required modules.
+#SBATCH --job-name=n7_m3_base # Job name
+#SBATCH --partition=cpuonly # Queue for resource allocation
+#SBATCH --mem=501600mb # Memory requested per node
+#SBATCH --time=3-00:00:00 # Wall-clock time limit
+#SBATCH --cpus-per-task=76 # Number of CPUs required per (MPI) task
+#SBATCH --mail-type=ALL # Notify user by email when certain event types occur.
+
+# Overwrite base directory by running export BASE_DIR="/some/alternative/path/here" before submitting the job.
+BASE_DIR=${BASE_DIR:-/hkfs/work/workspace/scratch/ku4408-special-couscous/}
+
+export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK} # Set number of threads to number of CPUs per task as provided by SLURM.
+export PYDIR=${BASE_DIR}/special-couscous/specialcouscous # Set path to Python package directory.
+
+ml purge # Unload all currently loaded modules.
+ml load compiler/gnu # Load required modules.
 ml load mpi/openmpi
-source /hkfs/work/workspace/scratch/ku4408-special-couscous/special-couscous-venv/bin/activate # Activate venv.

+source "${BASE_DIR}"/special-couscous-venv/bin/activate # Activate virtual environment.
+
+# Set hyperparameters of synthetic dataset and random forest model.
+# We estimated 450 trees to be trainable in serial in 3 days
+# and chose the next smaller number evenly divisible by 64 (scaling experiments).
 N_SAMPLES=10000000
 N_FEATURES=1000
-N_TREES=448 # We estimated 450 trees to be trainable in serial in 3 days and chose the next smaller number evenly divisible by 64 (scaling experiments).
+N_TREES=448

-SCRIPT="RF_serial_synthetic.py"
+SCRIPT="scripts/examples/rf_serial_synthetic.py" # Set script name.

-RESDIR=/hkfs/work/workspace/scratch/ku4408-special-couscous/results/single_node_experiments/job_${SLURM_JOB_ID}_n7_m3_t2/
-mkdir ${RESDIR}
-cd ${RESDIR}
+# Create directory to save results to.
+RESDIR=/hkfs/work/workspace/scratch/ku4408-special-couscous/results/single_node_experiments/job_${SLURM_JOB_ID}_n6_m4_baseline/
+mkdir "${RESDIR}"
+cd "${RESDIR}" || exit

-python -u ${PYDIR}/${SCRIPT} --n_samples ${N_SAMPLES} --n_features ${N_FEATURES} --n_trees ${N_TREES}
+# Run script.
+python -u "${PYDIR}"/${SCRIPT} --n_samples ${N_SAMPLES} --n_features ${N_FEATURES} --n_trees ${N_TREES} --output_dir "${RESDIR}" --output_label "${SLURM_JOB_ID}" --detailed_evaluation --save_model

scripts/scaling_experiments/weak_scaling/n7_m3/generate_job_scripts.py

Lines changed: 14 additions & 15 deletions
@@ -1,49 +1,48 @@
 import subprocess

-nodes = [2, 4, 8, 16, 32, 64] # number of nodes for scaling exps
+nodes = [2, 4, 8, 16, 32, 64] # Number of nodes for scaling exps
 num_trees_base = 224


 def main() -> None:
-    """Generate the n7_m3 job scripts."""
+    """Generate the job scripts for the n7_m3 weak scaling experiments."""
     for num_nodes in nodes:
         num_trees = num_trees_base * num_nodes
         print(f"Current config uses {num_nodes} nodes and {num_trees} trees.")
         job_name = f"n7_m3_weak_{num_nodes}"
         job_script_name = f"{job_name}.sh"
         script_content = f"""#!/bin/bash
-#SBATCH --job-name={job_name} # Job name
-#SBATCH --partition=cpuonly # Queue for resource allocation
-#SBATCH --time=2-12:00:00 # Wall-clock time limit
-#SBATCH --mem=243200mb # main memory (use standard nodes)
-#SBATCH --cpus-per-task=76 # Number of CPUs required per (MPI) task
-#SBATCH --mail-type=ALL # Notify user by email when certain event types occur.
-#SBATCH --account=hk-project-test-aihero2
-#SBATCH --nodes={num_nodes} # Number of nodes
-#SBATCH --ntasks-per-node=1 # One MPI rank per node.
+#SBATCH --job-name={job_name} # Job name
+#SBATCH --partition=cpuonly # Queue for resource allocation
+#SBATCH --time=2-12:00:00 # Wall-clock time limit
+#SBATCH --mem=243200mb # Main memory (use standard nodes)
+#SBATCH --cpus-per-task=76 # Number of CPUs required per (MPI) task
+#SBATCH --mail-type=ALL # Notify user by email when certain event types occur.
+#SBATCH --nodes={num_nodes} # Number of nodes
+#SBATCH --ntasks-per-node=1 # One MPI rank per node

 # Overwrite base directory by running export BASE_DIR="/some/alternative/path/here" before submitting the job.
 BASE_DIR=${{BASE_DIR:-/hkfs/work/workspace/scratch/ku4408-special-couscous/}}

 export OMP_NUM_THREADS=${{SLURM_CPUS_PER_TASK}}
 export PYDIR=${{BASE_DIR}}/special-couscous/specialcouscous

-ml purge # Unload all currently loaded modules.
-ml load compiler/gnu # Load required modules.
+ml purge # Unload all currently loaded modules.
+ml load compiler/gnu # Load required modules.
 ml load mpi/openmpi
 source "${{BASE_DIR}}"/special-couscous-venv/bin/activate # Activate venv.

 N_SAMPLES=10000000
 N_FEATURES=1000
 N_TREES={num_trees}

-SCRIPT="rf_scaling_synthetic.py"
+SCRIPT="scripts/examples/rf_parallel_synthetic.py"

 RESDIR="${{BASE_DIR}}"/results/weak_scaling/n7_m3_nodes_${{SLURM_NPROCS}}_${{SLURM_JOB_ID}}/
 mkdir "${{RESDIR}}"
 cd "${{RESDIR}}" || exit

-srun python -u ${{PYDIR}}/${{SCRIPT}} --n_samples ${{N_SAMPLES}} --n_features ${{N_FEATURES}} --n_trees ${{N_TREES}} --output_dir ${{RESDIR}} --output_label ${{SLURM_JOB_ID}} --detailed_evaluation
+srun python -u ${{PYDIR}}/${{SCRIPT}} --n_samples ${{N_SAMPLES}} --n_features ${{N_FEATURES}} --n_trees ${{N_TREES}} --output_dir ${{RESDIR}} --output_label ${{SLURM_JOB_ID}} --detailed_evaluation --save_model
 """

         with open(job_script_name, "wt") as f:
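
For context, a hypothetical end-to-end use of one of these generators (directory, script, and job names follow the patterns above; whether the generator also submits via subprocess is not shown in this diff):

cd scripts/scaling_experiments/weak_scaling/n7_m3
export BASE_DIR="/some/alternative/path"   # optional; otherwise the HoreKa default above applies
python generate_job_scripts.py             # writes one n7_m3_weak_<num_nodes>.sh per entry in nodes
sbatch n7_m3_weak_16.sh                    # manual submission of a single configuration

Since sbatch propagates the submitter's environment by default (--export=ALL), the exported BASE_DIR reaches the ${BASE_DIR:-...} fallback inside the generated script at run time.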
