From 5f902befdb913e81055528e870908055d178c8d5 Mon Sep 17 00:00:00 2001 From: Ludovic Raess Date: Fri, 5 Apr 2024 16:45:57 +0300 Subject: [PATCH] Fix parallel init --- scripts_future_API/runme.sh | 12 ++- .../test_levelsets_volumefractions_mpi.jl | 97 +++++++++++-------- 2 files changed, 65 insertions(+), 44 deletions(-) diff --git a/scripts_future_API/runme.sh b/scripts_future_API/runme.sh index 3aec1999..4f92df52 100755 --- a/scripts_future_API/runme.sh +++ b/scripts_future_API/runme.sh @@ -1,10 +1,15 @@ #!/bin/bash -source /users/lurass/scratch/setenv_lumi.sh +# source /users/lurass/scratch/setenv_lumi.sh # module load LUMI/23.09 # module load partition/G # module load rocm/5.6.1 +module use /appl/local/csc/modulefiles +module load julia +module load julia-mpi +module load julia-amdgpu + # ROCm-aware MPI set to 1, else 0 export MPICH_GPU_SUPPORT_ENABLED=1 @@ -14,10 +19,11 @@ export MPICH_GPU_SUPPORT_ENABLED=1 ## optimal using only single GCD per MI250x Module # srun --cpu-bind=map_cpu:49,17,1,33 -N1 -n1 --gpus-per-node=8 runme.sh # srun --cpu-bind=map_cpu:49,17,1,33 -N4 -n16 --gpus-per-node=8 runme.sh -export ROCR_VISIBLE_DEVICES=0,2,4,6 +# export ROCR_VISIBLE_DEVICES=0,2,4,6 # julia --project benchmark_diffusion_3D.jl -julia --project --color=yes tm_stokes_mpi_wip.jl +# julia --project --color=yes tm_stokes_mpi_wip.jl +julia --project --color=yes test_levelsets_volumefractions_mpi.jl # Profiling # ENABLE_JITPROFILING=1 rocprof --hip-trace --hsa-trace -d ./prof_out${SLURM_PROCID} -o ./prof_out${SLURM_PROCID}/results${SLURM_PROCID}.csv julia --project bench3d.jl diff --git a/scripts_future_API/test_levelsets_volumefractions_mpi.jl b/scripts_future_API/test_levelsets_volumefractions_mpi.jl index 41901239..41b5d78e 100644 --- a/scripts_future_API/test_levelsets_volumefractions_mpi.jl +++ b/scripts_future_API/test_levelsets_volumefractions_mpi.jl @@ -11,13 +11,13 @@ using JLD2 using KernelAbstractions using CairoMakie # using CUDA -# using AMDGPU -# AMDGPU.allowscalar(false) +using AMDGPU +AMDGPU.allowscalar(false) # Select backend -backend = CPU() +# backend = CPU() # backend = CUDABackend() -# backend = ROCBackend() +backend = ROCBackend() using Chmy.Distributed using MPI @@ -25,41 +25,28 @@ MPI.Init() conv(nx, tx) = tx * ((nx + tx ÷ 2 -1 ) ÷ tx) -function make_synthetic(backend=CPU(); nx, ny, lx, ly, lz, amp, ω, tanβ, el, gl) - arch = Arch(backend) - grid = UniformGrid(arch; origin=(-lx/2, -ly/2), extent=(lx, ly), dims=(nx, ny)) +function make_synthetic(arch::Architecture, nx, ny, lx, ly, lz, amp, ω, tanβ, el, gl) + backend = Architectures.get_backend(arch) + topo = topology(arch) + device_id = shared_rank(topo) + 1 + + arch_2d = Arch(backend, device_id=device_id) + grid_2d = UniformGrid(arch_2d; origin=(-lx/2, -ly/2), extent=(lx, ly), dims=(nx, ny)) # type = :turtle generate_surf(x, y, gl, lx, ly) = gl * (1.0 - ((1.7 * x / lx + 0.22)^2 + (0.5 * y / ly)^2)) generate_bed(x, y, amp, ω, tanβ, el, lx, ly, lz) = lz * (amp * sin(ω * x / lx) * sin(ω * y /ly) + tanβ * x / lx + el + (1.5 * y / ly)^2) - surf = FunctionField(generate_surf, grid, Vertex(); parameters=(; gl, lx, ly)) - bed = FunctionField(generate_bed, grid, Vertex(); parameters=(; amp, ω, tanβ, el, lx, ly, lz)) - - return (; backend, bed, surf, grid, lx, ly, lz) -end - -function extract_dem(backend=CPU(); data_path::String) - data = load(data_path) - dtype = first(keys(data)) - z_surf = data[dtype].z_surf - z_bed = data[dtype].z_bed - dm = data[dtype].domain - lx, ly, lz = extents(dm) - nx, ny = size(z_surf) .- 1 - - arch = Arch(backend) - grid = UniformGrid(arch; origin=(-lx/2, -ly/2), extent=(lx, ly), dims=(nx, ny)) + surf = FunctionField(generate_surf, grid_2d, Vertex(); parameters=(; gl, lx, ly)) + bed = FunctionField(generate_bed, grid_2d, Vertex(); parameters=(; amp, ω, tanβ, el, lx, ly, lz)) - surf = Field(backend, grid, Vertex()) - bed = Field(backend, grid, Vertex()) - set!(surf, z_surf) - set!(bed, z_bed) - - return (; backend, bed, surf, grid, lx, ly, lz) + return (; arch, bed, surf, grid_2d, lx, ly, lz) end function main_synthetic(backend=CPU()) + # set-up distributed + arch = Arch(backend, MPI.COMM_WORLD, (0, 0, 1)) + # synthetic topo lx, ly, lz = 5.0, 5.0, 1.0 amp = 0.1 @@ -69,35 +56,63 @@ function main_synthetic(backend=CPU()) gl = 0.9 nx, ny = 126, 126 - nz = max(conv(ceil(Int, lz / lx * nx), 32), 32) + nz = max(conv(ceil(Int, lz / lx * nx), 30), 30) resol = (nx, ny, nz) - synthetic_elevation = make_synthetic(backend; nx, ny, lx, ly, lz, amp, ω, tanβ, el, gl) + synthetic_elevation = make_synthetic(arch, nx, ny, lx, ly, lz, amp, ω, tanβ, el, gl) run_simulation(synthetic_elevation..., resol...) return end +function extract_dem(arch::Architecture, data_path::String) + backend = Architectures.get_backend(arch) + topo = topology(arch) + device_id = shared_rank(topo) + 1 + + data = load(data_path) + dtype = first(keys(data)) + + z_surf = data[dtype].z_surf + z_bed = data[dtype].z_bed + dm = data[dtype].domain + lx, ly, lz = extents(dm) + nx, ny = size(z_surf) .- 1 + + arch_2d = Arch(backend, device_id=device_id) + grid_2d = UniformGrid(arch_2d; origin=(-lx/2, -ly/2), extent=(lx, ly), dims=(nx, ny)) + + surf = Field(backend, grid_2d, Vertex()) + bed = Field(backend, grid_2d, Vertex()) + set!(surf, z_surf) + set!(bed, z_bed) + + return (; arch, bed, surf, grid_2d, lx, ly, lz) +end + function main_vavilov(backend=CPU()) # load dem data data_path = "./vavilov_dem2.jld2" - data_elevation = extract_dem(backend; data_path) + # set-up distributed + arch = Arch(backend, MPI.COMM_WORLD, (0, 0, 0)) - nx, ny = 126, 126 - nz = max(conv(ceil(Int, data_elevation.lz / data_elevation.lx * nx), 32), 32) + data_elevation = extract_dem(arch, data_path) + + nx, ny = 254, 254 + nz = max(conv(ceil(Int, data_elevation.lz / data_elevation.lx * nx), 30), 62) resol = (nx, ny, nz) run_simulation(data_elevation..., resol...) return end -@views function run_simulation(backend, bed, surf, grid_2d, lx, ly, lz, nx, ny, nz) - arch = Arch(backend, MPI.COMM_WORLD, (0, 0, 0)) +@views function run_simulation(arch::Architecture, bed, surf, grid_2d, lx, ly, lz, nx, ny, nz) + # arch = Arch(backend, MPI.COMM_WORLD, (0, 0, 1)) topo = topology(arch) me = global_rank(topo) # geometry - @info "size = ($nx, $ny, $nz), extent = ($lx, $ly, $lz)" + (me == 0) && @info "size = ($nx, $ny, $nz), extent = ($lx, $ly, $lz)" dims_l = (nx, ny, nz) dims_g = dims_l .* dims(topo) grid = UniformGrid(arch; origin=(-lx/2, -ly/2, 0.0), extent=(lx, ly, lz), dims=dims_g) @@ -150,13 +165,13 @@ end p6 = heatmap!(axs.ax6, wt_ns_c[slx, :, :]; colormap=:turbo)) Colorbar(fig[1, 1][1, 2], plt.p1) Colorbar(fig[2, 2][1, 2], plt.p4) - display(fig) - # save("levset_$nx.png", fig) + # display(fig) + save("levset4_$nx.png", fig) end return end main_synthetic(backend) -main_vavilov(backend) +# main_vavilov(backend) MPI.Finalize()