Commit c724407

clean up scripts and comments
1 parent a8e1abb commit c724407

6 files changed: +103 -84 lines changed

scripts/scaling_experiments/strong_scaling/n6_m4/generate_job_scripts.py

Lines changed: 29 additions & 23 deletions
@@ -1,7 +1,7 @@
 import subprocess

 limit = 60 * 24 * 3 # HoreKa wall-clock time limit in minutes
-nodes = [2, 4, 8, 16, 32, 64] # number of nodes for scaling exps
+nodes = [2, 4, 8, 16, 32, 64] # Number of nodes for scaling exps


 def main() -> None:
@@ -15,35 +15,41 @@ def main() -> None:
         job_name = f"n6_m4_strong_{num_nodes}"
         job_script_name = f"{job_name}.sh"
         scriptcontent = f"""#!/bin/bash
-#SBATCH --job-name={job_name} # Job name
-#SBATCH --partition=cpuonly # Queue for resource allocation
-#SBATCH --time={time} # Wall-clock time limit
-#SBATCH --cpus-per-task=76 # Number of CPUs required per (MPI) task
-#SBATCH --mail-type=ALL # Notify user by email when certain event types occur.
-#SBATCH --account=hk-project-test-aihero2
-#SBATCH --nodes={num_nodes} # Number of nodes
-#SBATCH --ntasks-per-node=1 # One MPI rank per node.
-
-
-export OMP_NUM_THREADS=${{SLURM_CPUS_PER_TASK}}
-export PYDIR=/hkfs/work/workspace/scratch/ku4408-special-couscous/special-couscous/specialcouscous
-
-ml purge # Unload all currently loaded modules.
-ml load compiler/gnu # Load required modules.
+#SBATCH --job-name={job_name} # Job name
+#SBATCH --partition=cpuonly # Queue for resource allocation
+#SBATCH --time={time} # Wall-clock time limit
+#SBATCH --cpus-per-task=76 # Number of CPUs required per (MPI) task
+#SBATCH --mail-type=ALL # Notify user by email when certain event types occur.
+#SBATCH --nodes={num_nodes} # Number of nodes
+#SBATCH --ntasks-per-node=1 # One MPI rank per node
+
+# Overwrite base directory by running export BASE_DIR="/some/alternative/path/here" before submitting the job.
+BASE_DIR=${{BASE_DIR:-/hkfs/work/workspace/scratch/ku4408-special-couscous/}}
+
+export OMP_NUM_THREADS=${{SLURM_CPUS_PER_TASK}} # Set number of threads to number of CPUs per task as provided by SLURM.
+export PYDIR=${{BASE_DIR}}/special-couscous/specialcouscous # Set path to Python package directory.
+
+ml purge # Unload all currently loaded modules.
+ml load compiler/gnu # Load required modules.
 ml load mpi/openmpi
-source /hkfs/work/workspace/scratch/ku4408-special-couscous/special-couscous-venv/bin/activate # Activate venv.

+source "${{BASE_DIR}}"/special-couscous-venv/bin/activate # Activate virtual environment.
+
+# Set hyperparameters of synthetic dataset and random forest model.
+# We estimated 1500 trees should be trainable in serial in 3 days
+# and chose the closest number evenly divisible by 64 as a baseline for scaling exps.
 N_SAMPLES=1000000
 N_FEATURES=10000
-N_TREES=1600 # We estimated 1500 trees should be trainable in serial in 3 d and chose the closest number evenly divisble by 64 as a baseline for scaling exps.
+N_TREES=1600

-SCRIPT="rf_scaling_synthetic.py"
+SCRIPT="scripts/examples/rf_parallel_synthetic.py"

-RESDIR=/hkfs/work/workspace/scratch/ku4408-special-couscous/results/strong_scaling/n6_m4_nodes_${{SLURM_NPROCS}}_job_${{SLURM_JOB_ID}}/
-mkdir ${{RESDIR}}
-cd ${{RESDIR}}
+RESDIR=${{BASE_DIR}}/results/strong_scaling/n6_m4_nodes_${{SLURM_NPROCS}}_job_${{SLURM_JOB_ID}}/
+mkdir "${{RESDIR}}"
+cd "${{RESDIR}}" || exit

-srun python -u ${{PYDIR}}/${{SCRIPT}} --n_samples ${{N_SAMPLES}} --n_features ${{N_FEATURES}} --n_trees ${{N_TREES}} --output_dir ${{RESDIR}} --output_label ${{SLURM_JOB_ID}} --detailed_evaluation
+# Run script
+srun python -u ${{PYDIR}}/${{SCRIPT}} --n_samples ${{N_SAMPLES}} --n_features ${{N_FEATURES}} --n_trees ${{N_TREES}} --output_dir ${{RESDIR}} --output_label ${{SLURM_JOB_ID}} --detailed_evaluation --save_model
 """

         with open(job_script_name, "wt") as f:
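
The recurring change in this commit is the overridable workspace location. As a minimal sketch of the Bash default-value expansion these scripts now rely on (the paths and directory names below are placeholders, not taken from the repo):

# ${VAR:-default} keeps an already-exported value and only falls back to the default if VAR is unset or empty.
BASE_DIR=${BASE_DIR:-/default/workspace/path/}
RESDIR=${BASE_DIR}/results/demo_run/
mkdir -p "${RESDIR}"   # quote expansions so paths with spaces still work
echo "Results will be written to ${RESDIR}"

Exporting BASE_DIR before submission therefore redirects every derived path without editing the generated job script.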

scripts/scaling_experiments/strong_scaling/n7_m3/baseline.sh

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ N_TREES=448
 SCRIPT="scripts/examples/rf_serial_synthetic.py" # Set script name.

 # Create directory to save results to.
-RESDIR=/hkfs/work/workspace/scratch/ku4408-special-couscous/results/single_node_experiments/job_${SLURM_JOB_ID}_n6_m4_baseline/
+RESDIR=${BASE_DIR}/results/single_node_experiments/job_${SLURM_JOB_ID}_n6_m4_baseline/
 mkdir "${RESDIR}"
 cd "${RESDIR}" || exit

scripts/scaling_experiments/strong_scaling/n7_m3/generate_job_scripts.py

Lines changed: 29 additions & 23 deletions
@@ -1,7 +1,7 @@
 import subprocess

 limit = 60 * 24 * 3 # HoreKa wall-clock time limit in minutes
-nodes = [2, 4, 8, 16, 32, 64] # number of nodes for scaling exps
+nodes = [2, 4, 8, 16, 32, 64] # Number of nodes for scaling exps


 def main() -> None:
@@ -15,35 +15,41 @@ def main() -> None:
         job_name = f"n7_m3_strong_{num_nodes}"
         job_script_name = f"{job_name}.sh"
         scriptcontent = f"""#!/bin/bash
-#SBATCH --job-name={job_name} # Job name
-#SBATCH --partition=cpuonly # Queue for resource allocation
-#SBATCH --time={time} # Wall-clock time limit
-#SBATCH --cpus-per-task=76 # Number of CPUs required per (MPI) task
-#SBATCH --mail-type=ALL # Notify user by email when certain event types occur.
-#SBATCH --account=hk-project-test-aihero2
-#SBATCH --nodes={num_nodes} # Number of nodes
-#SBATCH --ntasks-per-node=1 # One MPI rank per node.
-
-
-export OMP_NUM_THREADS=${{SLURM_CPUS_PER_TASK}}
-export PYDIR=/hkfs/work/workspace/scratch/ku4408-special-couscous/special-couscous/specialcouscous
-
-ml purge # Unload all currently loaded modules.
-ml load compiler/gnu # Load required modules.
+#SBATCH --job-name={job_name} # Job name
+#SBATCH --partition=cpuonly # Queue for resource allocation
+#SBATCH --time={time} # Wall-clock time limit
+#SBATCH --cpus-per-task=76 # Number of CPUs required per (MPI) task
+#SBATCH --mail-type=ALL # Notify user by email when certain event types occur.
+#SBATCH --nodes={num_nodes} # Number of nodes
+#SBATCH --ntasks-per-node=1 # One MPI rank per node
+
+# Overwrite base directory by running export BASE_DIR="/some/alternative/path/here" before submitting the job.
+BASE_DIR=${{BASE_DIR:-/hkfs/work/workspace/scratch/ku4408-special-couscous/}}
+
+export OMP_NUM_THREADS=${{SLURM_CPUS_PER_TASK}} # Set number of threads to number of CPUs per task as provided by SLURM.
+export PYDIR=${{BASE_DIR}}/special-couscous/specialcouscous # Set path to Python package directory.
+
+ml purge # Unload all currently loaded modules.
+ml load compiler/gnu # Load required modules.
 ml load mpi/openmpi
-source /hkfs/work/workspace/scratch/ku4408-special-couscous/special-couscous-venv/bin/activate # Activate venv.

+source "${{BASE_DIR}}"/special-couscous-venv/bin/activate # Activate virtual environment.
+
+# Set hyperparameters of synthetic dataset and random forest model.
+# We estimated 450 trees to be trainable in serial in 3 days
+# and chose the closest number evenly divisible by 64 as a baseline for scaling exps.
 N_SAMPLES=10000000
 N_FEATURES=1000
-N_TREES=448 # We estimated 450 trees should be trainable in serial in 3 d and chose the closest number evenly divisble by 64 as a baseline for scaling exps.
+N_TREES=448

-SCRIPT="rf_scaling_synthetic.py"
+SCRIPT="scripts/examples/rf_parallel_synthetic.py"

-RESDIR=/hkfs/work/workspace/scratch/ku4408-special-couscous/results/strong_scaling/n7_m3_nodes_${{SLURM_NPROCS}}_job_${{SLURM_JOB_ID}}/
-mkdir ${{RESDIR}}
-cd ${{RESDIR}}
+RESDIR=${{BASE_DIR}}/results/strong_scaling/n7_m3_nodes_${{SLURM_NPROCS}}_job_${{SLURM_JOB_ID}}/
+mkdir "${{RESDIR}}"
+cd "${{RESDIR}}" || exit

-srun python -u ${{PYDIR}}/${{SCRIPT}} --n_samples ${{N_SAMPLES}} --n_features ${{N_FEATURES}} --n_trees ${{N_TREES}} --output_dir ${{RESDIR}} --output_label ${{SLURM_JOB_ID}} --detailed_evaluation
+# Run script
+srun python -u ${{PYDIR}}/${{SCRIPT}} --n_samples ${{N_SAMPLES}} --n_features ${{N_FEATURES}} --n_trees ${{N_TREES}} --output_dir ${{RESDIR}} --output_label ${{SLURM_JOB_ID}} --detailed_evaluation --save_model
 """

         with open(job_script_name, "wt") as f:

scripts/scaling_experiments/weak_scaling/n6_m4/generate_job_scripts.py

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,7 @@



 def main() -> None:
-    """Generate the n6_m4 job scripts."""
+    """Generate the job scripts for the n6_m4 weak scaling experiments."""
     for num_nodes in nodes:
         num_trees = num_trees_base * num_nodes
         print(f"Current config uses {num_nodes} nodes and {num_trees} trees.")
@@ -36,7 +36,7 @@ def main() -> None:
 N_FEATURES=10000
 N_TREES={num_trees}

-SCRIPT="rf_parallel_synthetic.py"
+SCRIPT="scripts/examples/rf_parallel_synthetic.py"

 RESDIR="${{BASE_DIR}}"/results/weak_scaling/n6_m4_nodes_${{SLURM_NPROCS}}_${{SLURM_JOB_ID}}/
 mkdir "${{RESDIR}}"

Lines changed: 28 additions & 20 deletions
@@ -1,28 +1,36 @@
 #!/bin/bash
-#SBATCH --job-name=n7_m3_base # job name
-#SBATCH --partition=cpuonly # queue for resource allocation
-#SBATCH --mem=501600mb
-#SBATCH --time=3-00:00:00 # wall-clock time limit
-#SBATCH --cpus-per-task=76 # number of CPUs required per (MPI) task
-#SBATCH --mail-type=ALL # Notify user by email when certain event types occur.
-#SBATCH --account=hk-project-test-aihero2
-
-export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}
-export PYDIR=/hkfs/work/workspace/scratch/ku4408-special-couscous/special-couscous/py
-
-ml purge # Unload all currently loaded modules.
-ml load compiler/gnu # Load required modules.
+#SBATCH --job-name=n7_m3_base # Job name
+#SBATCH --partition=cpuonly # Queue for resource allocation
+#SBATCH --mem=501600mb # Memory requested per node
+#SBATCH --time=3-00:00:00 # Wall-clock time limit
+#SBATCH --cpus-per-task=76 # Number of CPUs required per (MPI) task
+#SBATCH --mail-type=ALL # Notify user by email when certain event types occur.
+
+# Overwrite base directory by running export BASE_DIR="/some/alternative/path/here" before submitting the job.
+BASE_DIR=${BASE_DIR:-/hkfs/work/workspace/scratch/ku4408-special-couscous/}
+
+export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK} # Set number of threads to number of CPUs per task as provided by SLURM.
+export PYDIR=${BASE_DIR}/special-couscous/specialcouscous # Set path to Python package directory.
+
+ml purge # Unload all currently loaded modules.
+ml load compiler/gnu # Load required modules.
 ml load mpi/openmpi
-source /hkfs/work/workspace/scratch/ku4408-special-couscous/special-couscous-venv/bin/activate # Activate venv.

+source "${BASE_DIR}"/special-couscous-venv/bin/activate # Activate virtual environment.
+
+# Set hyperparameters of synthetic dataset and random forest model.
+# We estimated 450 trees to be trainable in serial in 3 days
+# and chose the next smaller number evenly divisible by 64 (scaling experiments).
 N_SAMPLES=10000000
 N_FEATURES=1000
-N_TREES=448 # We estimated 450 trees to be trainable in serial in 3 days and chose the next smaller number evenly divisible by 64 (scaling experiments).
+N_TREES=448

-SCRIPT="RF_serial_synthetic.py"
+SCRIPT="scripts/examples/rf_serial_synthetic.py" # Set script name.

-RESDIR=/hkfs/work/workspace/scratch/ku4408-special-couscous/results/single_node_experiments/job_${SLURM_JOB_ID}_n7_m3_t2/
-mkdir ${RESDIR}
-cd ${RESDIR}
+# Create directory to save results to.
+RESDIR=/hkfs/work/workspace/scratch/ku4408-special-couscous/results/single_node_experiments/job_${SLURM_JOB_ID}_n6_m4_baseline/
+mkdir "${RESDIR}"
+cd "${RESDIR}" || exit

-python -u ${PYDIR}/${SCRIPT} --n_samples ${N_SAMPLES} --n_features ${N_FEATURES} --n_trees ${N_TREES}
+# Run script.
+python -u "${PYDIR}"/${SCRIPT} --n_samples ${N_SAMPLES} --n_features ${N_FEATURES} --n_trees ${N_TREES} --output_dir "${RESDIR}" --output_label "${SLURM_JOB_ID}" --detailed_evaluation --save_model

scripts/scaling_experiments/weak_scaling/n7_m3/generate_job_scripts.py

Lines changed: 14 additions & 15 deletions
@@ -1,49 +1,48 @@
 import subprocess

-nodes = [2, 4, 8, 16, 32, 64] # number of nodes for scaling exps
+nodes = [2, 4, 8, 16, 32, 64] # Number of nodes for scaling exps
 num_trees_base = 224


 def main() -> None:
-    """Generate the n7_m3 job scripts."""
+    """Generate the job scripts for the n7_m3 weak scaling experiments."""
     for num_nodes in nodes:
         num_trees = num_trees_base * num_nodes
         print(f"Current config uses {num_nodes} nodes and {num_trees} trees.")
         job_name = f"n7_m3_weak_{num_nodes}"
         job_script_name = f"{job_name}.sh"
         script_content = f"""#!/bin/bash
-#SBATCH --job-name={job_name} # Job name
-#SBATCH --partition=cpuonly # Queue for resource allocation
-#SBATCH --time=2-12:00:00 # Wall-clock time limit
-#SBATCH --mem=243200mb # main memory (use standard nodes)
-#SBATCH --cpus-per-task=76 # Number of CPUs required per (MPI) task
-#SBATCH --mail-type=ALL # Notify user by email when certain event types occur.
-#SBATCH --account=hk-project-test-aihero2
-#SBATCH --nodes={num_nodes} # Number of nodes
-#SBATCH --ntasks-per-node=1 # One MPI rank per node.
+#SBATCH --job-name={job_name} # Job name
+#SBATCH --partition=cpuonly # Queue for resource allocation
+#SBATCH --time=2-12:00:00 # Wall-clock time limit
+#SBATCH --mem=243200mb # Main memory (use standard nodes)
+#SBATCH --cpus-per-task=76 # Number of CPUs required per (MPI) task
+#SBATCH --mail-type=ALL # Notify user by email when certain event types occur.
+#SBATCH --nodes={num_nodes} # Number of nodes
+#SBATCH --ntasks-per-node=1 # One MPI rank per node

 # Overwrite base directory by running export BASE_DIR="/some/alternative/path/here" before submitting the job.
 BASE_DIR=${{BASE_DIR:-/hkfs/work/workspace/scratch/ku4408-special-couscous/}}

 export OMP_NUM_THREADS=${{SLURM_CPUS_PER_TASK}}
 export PYDIR=${{BASE_DIR}}/special-couscous/specialcouscous

-ml purge # Unload all currently loaded modules.
-ml load compiler/gnu # Load required modules.
+ml purge # Unload all currently loaded modules.
+ml load compiler/gnu # Load required modules.
 ml load mpi/openmpi
 source "${{BASE_DIR}}"/special-couscous-venv/bin/activate # Activate venv.

 N_SAMPLES=10000000
 N_FEATURES=1000
 N_TREES={num_trees}

-SCRIPT="rf_scaling_synthetic.py"
+SCRIPT="scripts/examples/rf_parallel_synthetic.py"

 RESDIR="${{BASE_DIR}}"/results/weak_scaling/n7_m3_nodes_${{SLURM_NPROCS}}_${{SLURM_JOB_ID}}/
 mkdir "${{RESDIR}}"
 cd "${{RESDIR}}" || exit

-srun python -u ${{PYDIR}}/${{SCRIPT}} --n_samples ${{N_SAMPLES}} --n_features ${{N_FEATURES}} --n_trees ${{N_TREES}} --output_dir ${{RESDIR}} --output_label ${{SLURM_JOB_ID}} --detailed_evaluation
+srun python -u ${{PYDIR}}/${{SCRIPT}} --n_samples ${{N_SAMPLES}} --n_features ${{N_FEATURES}} --n_trees ${{N_TREES}} --output_dir ${{RESDIR}} --output_label ${{SLURM_JOB_ID}} --detailed_evaluation --save_model
 """

         with open(job_script_name, "wt") as f:
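
For context, a hypothetical end-to-end use of one of these generators (directory, script, and job names follow the patterns above; whether the generator also submits via subprocess is not shown in this diff):

cd scripts/scaling_experiments/weak_scaling/n7_m3
export BASE_DIR="/some/alternative/path"   # optional; otherwise the HoreKa default above applies
python generate_job_scripts.py             # writes one n7_m3_weak_<num_nodes>.sh per entry in nodes
sbatch n7_m3_weak_16.sh                    # manual submission of a single configuration

Since sbatch propagates the submitter's environment by default (--export=ALL), the exported BASE_DIR reaches the ${BASE_DIR:-...} fallback inside the generated script at run time.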
