diff --git a/CHANGELOG.md b/CHANGELOG.md index bb256e6a43bc..fbb4fe703c2c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ - [[PR 996]](https://github.com/parthenon-hpc-lab/parthenon/pull/996) Remove dynamic allocations from swarm particle creation ### Changed (changing behavior/API/variables/...) +- [[PR 973]](https://github.com/parthenon-hpc-lab/parthenon/pull/973) Multigrid performance upgrades ### Fixed (not changing behavior/API/variables/...) - [[PR1023]](https://github.com/parthenon-hpc-lab/parthenon/pull/1023) Fix broken param of a scalar bool diff --git a/example/poisson_gmg/main.cpp b/example/poisson_gmg/main.cpp index b237393b1b8f..87e53aea72e1 100644 --- a/example/poisson_gmg/main.cpp +++ b/example/poisson_gmg/main.cpp @@ -51,6 +51,8 @@ int main(int argc, char *argv[]) { if (driver_status != parthenon::DriverStatus::complete || driver.final_rms_residual > 1.e-10 || driver.final_rms_error > 1.e-12) success = false; + if (driver.final_rms_residual != driver.final_rms_residual) success = false; + if (driver.final_rms_error != driver.final_rms_error) success = false; } // call MPI_Finalize and Kokkos::finalize if necessary pman.ParthenonFinalize(); diff --git a/example/poisson_gmg/poisson_driver.cpp b/example/poisson_gmg/poisson_driver.cpp index 656dd3fcac87..0a09d0eb931d 100644 --- a/example/poisson_gmg/poisson_driver.cpp +++ b/example/poisson_gmg/poisson_driver.cpp @@ -79,7 +79,6 @@ TaskCollection PoissonDriver::MakeTaskCollection(BlockList_t &blocks) { const int num_partitions = pmesh->DefaultNumPartitions(); TaskRegion ®ion = tc.AddRegion(num_partitions); - int reg_dep_id = 0; for (int i = 0; i < num_partitions; ++i) { TaskList &tl = region[i]; auto &md = pmesh->mesh_data.GetOrAdd("base", i); @@ -100,9 +99,11 @@ TaskCollection PoissonDriver::MakeTaskCollection(BlockList_t &blocks) { auto solve = zero_u; if (solver == "BiCGSTAB") { - solve = bicgstab_solver->AddTasks(tl, zero_u, pmesh, i); + auto setup = bicgstab_solver->AddSetupTasks(tl, zero_u, i, pmesh); + solve = bicgstab_solver->AddTasks(tl, setup, pmesh, i); } else if (solver == "MG") { - solve = mg_solver->AddTasks(tl, zero_u, pmesh, i); + auto setup = mg_solver->AddSetupTasks(tl, zero_u, i, pmesh); + solve = mg_solver->AddTasks(tl, setup, pmesh, i); } else { PARTHENON_FAIL("Unknown solver type."); } diff --git a/example/poisson_gmg/poisson_driver.hpp b/example/poisson_gmg/poisson_driver.hpp index 348946ab2fb7..17a3cf6989c9 100644 --- a/example/poisson_gmg/poisson_driver.hpp +++ b/example/poisson_gmg/poisson_driver.hpp @@ -1,5 +1,5 @@ //======================================================================================== -// (C) (or copyright) 2021-2023. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2021-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC diff --git a/example/poisson_gmg/poisson_package.cpp b/example/poisson_gmg/poisson_package.cpp index a932ce4aaa5e..fddc087a4524 100644 --- a/example/poisson_gmg/poisson_package.cpp +++ b/example/poisson_gmg/poisson_package.cpp @@ -1,5 +1,5 @@ //======================================================================================== -// (C) (or copyright) 2021-2023. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2021-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. 
Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC @@ -147,6 +147,7 @@ std::shared_ptr Initialize(ParameterInput *pin) { bicgstab_params.max_iters = max_poisson_iterations; bicgstab_params.residual_tolerance = res_tol; bicgstab_params.precondition = precondition; + bicgstab_params.print_per_step = true; parthenon::solvers::BiCGSTABSolver bicg_solver( pkg.get(), bicgstab_params, eq); pkg->AddParam<>("MGBiCGSTABsolver", bicg_solver, diff --git a/src/bvals/boundary_conditions_generic.hpp b/src/bvals/boundary_conditions_generic.hpp index a79f8479d197..12a798403d57 100644 --- a/src/bvals/boundary_conditions_generic.hpp +++ b/src/bvals/boundary_conditions_generic.hpp @@ -1,5 +1,5 @@ //======================================================================================== -// (C) (or copyright) 2020-2023. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC @@ -18,6 +18,9 @@ #include #include #include +#include +#include +#include #include #include "basic_types.hpp" @@ -34,6 +37,35 @@ namespace BoundaryFunction { enum class BCSide { Inner, Outer }; enum class BCType { Outflow, Reflect, ConstantDeriv, Fixed, FixedFace }; +namespace impl { +using desc_key_t = std::tuple; +template +using map_bc_pack_descriptor_t = + std::unordered_map::Descriptor, + tuple_hash>; + +template +map_bc_pack_descriptor_t +GetPackDescriptorMap(std::shared_ptr> &rc) { + std::vector> elements{ + {TopologicalType::Cell, Metadata::Cell}, + {TopologicalType::Face, Metadata::Face}, + {TopologicalType::Edge, Metadata::Edge}, + {TopologicalType::Node, Metadata::Node}}; + map_bc_pack_descriptor_t my_map; + for (auto [tt, md] : elements) { + std::vector flags{Metadata::FillGhost}; + flags.push_back(md); + std::set opts{PDOpt::Coarse}; + my_map.emplace(std::make_pair(desc_key_t{true, tt}, + MakePackDescriptor(rc.get(), flags, opts))); + my_map.emplace(std::make_pair(desc_key_t{false, tt}, + MakePackDescriptor(rc.get(), flags))); + } + return my_map; +} +} // namespace impl + template void GenericBC(std::shared_ptr> &rc, bool coarse, TopologicalElement el, Real val) { @@ -46,17 +78,9 @@ void GenericBC(std::shared_ptr> &rc, bool coarse, constexpr bool X3 = (DIR == X3DIR); constexpr bool INNER = (SIDE == BCSide::Inner); - std::vector flags{Metadata::FillGhost}; - if (GetTopologicalType(el) == TopologicalType::Cell) flags.push_back(Metadata::Cell); - if (GetTopologicalType(el) == TopologicalType::Face) flags.push_back(Metadata::Face); - if (GetTopologicalType(el) == TopologicalType::Edge) flags.push_back(Metadata::Edge); - if (GetTopologicalType(el) == TopologicalType::Node) flags.push_back(Metadata::Node); - - std::set opts; - if (coarse) opts = {PDOpt::Coarse}; - auto desc = MakePackDescriptor( - rc->GetBlockPointer()->pmy_mesh->resolved_packages.get(), flags, opts); - auto q = desc.GetPack(rc.get()); + static auto descriptors = impl::GetPackDescriptorMap(rc); + auto q = + descriptors[impl::desc_key_t{coarse, GetTopologicalType(el)}].GetPack(rc.get()); const int b = 0; const int lstart = q.GetLowerBoundHost(b); const int lend = q.GetUpperBoundHost(b); diff --git a/src/interface/mesh_data.cpp b/src/interface/mesh_data.cpp index ac1a12e5bfaa..27a4dc520180 100644 --- a/src/interface/mesh_data.cpp 
+++ b/src/interface/mesh_data.cpp @@ -1,5 +1,5 @@ //======================================================================================== -// (C) (or copyright) 2020-2023. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC @@ -25,9 +25,19 @@ void MeshData::Initialize(const MeshData *src, pmy_mesh_ = src->GetParentPointer(); const int nblocks = src->NumBlocks(); block_data_.resize(nblocks); - for (int i = 0; i < nblocks; i++) { - block_data_[i] = pmy_mesh_->block_list[i]->meshblock_data.Add( - stage_name_, src->GetBlockData(i), names, shallow); + + grid = src->grid; + if (grid.type == GridType::two_level_composite) { + int gmg_level = src->grid.logical_level - pmy_mesh_->GetGMGMinLogicalLevel(); + for (int i = 0; i < nblocks; i++) { + block_data_[i] = pmy_mesh_->gmg_block_lists[gmg_level][i]->meshblock_data.Add( + stage_name_, src->GetBlockData(i), names, shallow); + } + } else { + for (int i = 0; i < nblocks; i++) { + block_data_[i] = pmy_mesh_->block_list[i]->meshblock_data.Add( + stage_name_, src->GetBlockData(i), names, shallow); + } } } diff --git a/src/interface/sparse_pack.hpp b/src/interface/sparse_pack.hpp index 9753c401561b..d5fdf37a2a30 100644 --- a/src/interface/sparse_pack.hpp +++ b/src/interface/sparse_pack.hpp @@ -181,7 +181,7 @@ class SparsePack : public SparsePackBase { // accessed on device via instance of types in the type list Ts... // The pack will be created and accessible on the device template - SparsePack GetPack(T *pmd, std::vector include_block = {}, + SparsePack GetPack(T *pmd, std::vector &include_block, bool only_fine_two_level_composite_blocks = true) const { // If this is a composite grid MeshData object and if // only_fine_two_level_composite_blocks is true, only @@ -189,9 +189,8 @@ class SparsePack : public SparsePackBase { if constexpr (std::is_same>::value) { if (pmd->grid.type == GridType::two_level_composite && only_fine_two_level_composite_blocks) { - if (include_block.size() != pmd->NumBlocks()) { - include_block = std::vector(pmd->NumBlocks(), true); - } + PARTHENON_REQUIRE(include_block.size() == pmd->NumBlocks(), + "Passed wrong size block include list."); int fine_level = pmd->grid.logical_level; for (int b = 0; b < pmd->NumBlocks(); ++b) include_block[b] = @@ -202,6 +201,27 @@ class SparsePack : public SparsePackBase { return SparsePack(SparsePackBase::GetPack(pmd, *this, include_block)); } + template + SparsePack GetPack(T *pmd, bool only_fine_two_level_composite_blocks = true) const { + // If this is a composite grid MeshData object, only include blocks on + // the finer level + if constexpr (std::is_same>::value) { + if (pmd->grid.type == GridType::two_level_composite && + only_fine_two_level_composite_blocks) { + auto include_block = std::vector(pmd->NumBlocks(), true); + int fine_level = pmd->grid.logical_level; + for (int b = 0; b < pmd->NumBlocks(); ++b) + include_block[b] = + include_block[b] && + (fine_level == pmd->GetBlockData(b)->GetBlockPointer()->loc.level()); + return SparsePack(SparsePackBase::GetPack(pmd, *this, include_block)); + } else { + return SparsePack(SparsePackBase::GetPack(pmd, *this, std::vector{})); + } + } + return SparsePack(SparsePackBase::GetPack(pmd, *this, std::vector{})); + } + SparsePackIdxMap GetMap() const { PARTHENON_REQUIRE(sizeof...(Ts) == 0, 
"Should not be getting an IdxMap for a type based pack"); diff --git a/src/solvers/bicgstab_solver.hpp b/src/solvers/bicgstab_solver.hpp index b82d27bd13dd..2cc6bf7f7033 100644 --- a/src/solvers/bicgstab_solver.hpp +++ b/src/solvers/bicgstab_solver.hpp @@ -1,5 +1,5 @@ //======================================================================================== -// (C) (or copyright) 2023. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2023-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC @@ -32,10 +32,12 @@ namespace parthenon { namespace solvers { struct BiCGSTABParams { + MGParams mg_params; int max_iters = 10; Real residual_tolerance = 1.e-12; Real restart_threshold = -1.0; bool precondition = true; + bool print_per_step = false; }; // The equations class must include a template method @@ -45,30 +47,36 @@ struct BiCGSTABParams { // // that takes a field associated with x_t and applies // the matrix A to it and stores the result in y_t. -template +template class BiCGSTABSolver { public: - PARTHENON_INTERNALSOLVERVARIABLE(x, rhat0); - PARTHENON_INTERNALSOLVERVARIABLE(x, v); - PARTHENON_INTERNALSOLVERVARIABLE(x, h); - PARTHENON_INTERNALSOLVERVARIABLE(x, s); - PARTHENON_INTERNALSOLVERVARIABLE(x, t); - PARTHENON_INTERNALSOLVERVARIABLE(x, r); - PARTHENON_INTERNALSOLVERVARIABLE(x, p); - PARTHENON_INTERNALSOLVERVARIABLE(x, u); + PARTHENON_INTERNALSOLVERVARIABLE(u, rhat0); + PARTHENON_INTERNALSOLVERVARIABLE(u, v); + PARTHENON_INTERNALSOLVERVARIABLE(u, h); + PARTHENON_INTERNALSOLVERVARIABLE(u, s); + PARTHENON_INTERNALSOLVERVARIABLE(u, t); + PARTHENON_INTERNALSOLVERVARIABLE(u, r); + PARTHENON_INTERNALSOLVERVARIABLE(u, p); + PARTHENON_INTERNALSOLVERVARIABLE(u, x); + + std::vector GetInternalVariableNames() const { + std::vector names{rhat0::name(), v::name(), h::name(), s::name(), + t::name(), r::name(), p::name(), x::name()}; + if (params_.precondition) { + auto pre_names = preconditioner.GetInternalVariableNames(); + names.insert(names.end(), pre_names.begin(), pre_names.end()); + } + return names; + } BiCGSTABSolver(StateDescriptor *pkg, BiCGSTABParams params_in, equations eq_in = equations(), std::vector shape = {}) - : preconditioner(pkg, MGParams(), eq_in, shape), params_(params_in), - iter_counter(0), eqs_(eq_in) { + : preconditioner(pkg, params_in.mg_params, eq_in, shape), params_(params_in), + iter_counter(0), eqs_(eq_in), presidual_tolerance(nullptr) { using namespace refinement_ops; - auto mu = Metadata({Metadata::Cell, Metadata::Independent, Metadata::FillGhost, - Metadata::WithFluxes, Metadata::GMGRestrict}, - shape); - mu.RegisterRefinementOps(); auto m_no_ghost = Metadata({Metadata::Cell, Metadata::Derived, Metadata::OneCopy}, shape); - pkg->AddField(u::name(), mu); + pkg->AddField(x::name(), m_no_ghost); pkg->AddField(rhat0::name(), m_no_ghost); pkg->AddField(v::name(), m_no_ghost); pkg->AddField(h::name(), m_no_ghost); @@ -78,11 +86,20 @@ class BiCGSTABSolver { pkg->AddField(p::name(), m_no_ghost); } + template + TaskID AddSetupTasks(TL_t &tl, TaskID dependence, int partition, Mesh *pmesh) { + return preconditioner.AddSetupTasks(tl, dependence, partition, pmesh); + } + TaskID AddTasks(TaskList &tl, TaskID dependence, Mesh *pmesh, const int partition) { using namespace utils; TaskID none; auto &md = pmesh->mesh_data.GetOrAdd("base", partition); + std::string label = 
"bicg_comm_" + std::to_string(partition); + auto &md_comm = + pmesh->mesh_data.AddShallow(label, md, std::vector{u::name()}); iter_counter = 0; + bool multilevel = pmesh->multilevel; // Initialization: x <- 0, r <- rhs, rhat0 <- rhs, // rhat0r_old <- (rhat0, r), p <- r, u <- 0 @@ -102,11 +119,12 @@ class BiCGSTABSolver { solver->ts.val = 0.0; solver->tt.val = 0.0; solver->residual.val = 0.0; + solver->iter_counter = 0; return TaskStatus::complete; }, this); tl.AddTask(TaskQualifier::once_per_region, dependence, [&]() { - if (Globals::my_rank == 0) + if (Globals::my_rank == 0 && params_.print_per_step) printf("# [0] v-cycle\n# [1] rms-residual\n# [2] rms-error\n"); return TaskStatus::complete; }); @@ -126,7 +144,8 @@ class BiCGSTABSolver { } // 2. v <- A u - auto comm = AddBoundaryExchangeTasks(precon1, itl, md, true); + auto comm = + AddBoundaryExchangeTasks(precon1, itl, md_comm, multilevel); auto get_v = eqs_.template Ax(itl, comm, md); // 3. rhat0v <- (rhat0, v) @@ -157,7 +176,7 @@ class BiCGSTABSolver { TaskQualifier::once_per_region, get_res, [&](BiCGSTABSolver *solver, Mesh *pmesh) { Real rms_res = std::sqrt(solver->residual.val / pmesh->GetTotalCells()); - if (Globals::my_rank == 0) + if (Globals::my_rank == 0 && solver->params_.print_per_step) printf("%i %e\n", solver->iter_counter * 2 + 1, rms_res); return TaskStatus::complete; }, @@ -175,7 +194,8 @@ class BiCGSTABSolver { } // 7. t <- A u - auto pre_t_comm = AddBoundaryExchangeTasks(precon2, itl, md, true); + auto pre_t_comm = + AddBoundaryExchangeTasks(precon2, itl, md_comm, multilevel); auto get_t = eqs_.template Ax(itl, pre_t_comm, md); // 8. omega <- (t,s) / (t,t) @@ -207,7 +227,7 @@ class BiCGSTABSolver { TaskQualifier::once_per_region, get_res2, [&](BiCGSTABSolver *solver, Mesh *pmesh) { Real rms_err = std::sqrt(solver->residual.val / pmesh->GetTotalCells()); - if (Globals::my_rank == 0) + if (Globals::my_rank == 0 && solver->params_.print_per_step) printf("%i %e\n", solver->iter_counter * 2 + 2, rms_err); return TaskStatus::complete; }, @@ -231,16 +251,20 @@ class BiCGSTABSolver { this, md); // 14. rhat0r_old <- rhat0r, zero all reductions + Real *ptol = presidual_tolerance == nullptr ? 
&(params_.residual_tolerance) + : presidual_tolerance; auto check = itl.AddTask( TaskQualifier::completion | TaskQualifier::once_per_region | TaskQualifier::global_sync, update_p | correct_x, - [](BiCGSTABSolver *solver, Mesh *pmesh, Real res_tol) { + [](BiCGSTABSolver *solver, Mesh *pmesh, int max_iter, Real *res_tol) { solver->iter_counter++; Real rms_res = std::sqrt(solver->residual.val / pmesh->GetTotalCells()); solver->final_residual = rms_res; solver->final_iteration = solver->iter_counter; - if (rms_res < res_tol) { + if (rms_res < *res_tol || solver->iter_counter >= max_iter) { + solver->final_residual = rms_res; + solver->final_iteration = solver->iter_counter; return TaskStatus::complete; } solver->rhat0r_old = solver->rhat0r.val; @@ -251,9 +275,9 @@ class BiCGSTABSolver { solver->residual.val = 0.0; return TaskStatus::iterate; }, - this, pmesh, params_.residual_tolerance); + this, pmesh, params_.max_iters, ptol); - return solver_id; + return tl.AddTask(solver_id, CopyData, md); } Real GetSquaredResidualSum() const { return residual.val; } @@ -262,6 +286,8 @@ class BiCGSTABSolver { Real GetFinalResidual() const { return final_residual; } int GetFinalIterations() const { return final_iteration; } + void UpdateResidualTolerance(Real *ptol) { presidual_tolerance = ptol; } + protected: MGSolver preconditioner; BiCGSTABParams params_; @@ -271,6 +297,7 @@ class BiCGSTABSolver { equations eqs_; Real final_residual; int final_iteration; + Real *presidual_tolerance; }; } // namespace solvers diff --git a/src/solvers/mg_solver.hpp b/src/solvers/mg_solver.hpp index 427b1802e7e2..153220a4a4cf 100644 --- a/src/solvers/mg_solver.hpp +++ b/src/solvers/mg_solver.hpp @@ -1,5 +1,5 @@ //======================================================================================== -// (C) (or copyright) 2023. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2023-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. 
Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC @@ -13,6 +13,8 @@ #ifndef SOLVERS_MG_SOLVER_HPP_ #define SOLVERS_MG_SOLVER_HPP_ +#include +#include #include #include #include @@ -35,6 +37,8 @@ struct MGParams { Real residual_tolerance = 1.e-12; bool do_FAS = true; std::string smoother = "SRJ2"; + bool two_by_two_diagonal = false; + int max_coarsenings = std::numeric_limits::max(); }; // The equations class must include a template method @@ -59,27 +63,37 @@ class MGSolver { PARTHENON_INTERNALSOLVERVARIABLE(u, temp); // Temporary storage PARTHENON_INTERNALSOLVERVARIABLE(u, u0); // Storage for initial solution during FAS PARTHENON_INTERNALSOLVERVARIABLE(u, D); // Storage for (approximate) diagonal + std::vector GetInternalVariableNames() const { + return {res_err::name(), temp::name(), u0::name(), D::name()}; + } MGSolver(StateDescriptor *pkg, MGParams params_in, equations eq_in = equations(), std::vector shape = {}) : params_(params_in), iter_counter(0), eqs_(eq_in) { using namespace parthenon::refinement_ops; + // The ghost cells of res_err need to be filled, but this is accomplished by + // copying res_err into u, communicating, then copying u back into res_err + // across all zones in a block auto mres_err = - Metadata({Metadata::Cell, Metadata::Independent, Metadata::FillGhost, - Metadata::GMGRestrict, Metadata::GMGProlongate, Metadata::OneCopy}, + Metadata({Metadata::Cell, Metadata::Independent, Metadata::GMGRestrict, + Metadata::GMGProlongate, Metadata::OneCopy}, shape); mres_err.RegisterRefinementOps(); pkg->AddField(res_err::name(), mres_err); - auto mtemp = Metadata({Metadata::Cell, Metadata::Independent, Metadata::FillGhost, - Metadata::WithFluxes, Metadata::OneCopy}, - shape); + auto mtemp = + Metadata({Metadata::Cell, Metadata::Independent, Metadata::OneCopy}, shape); mtemp.RegisterRefinementOps(); pkg->AddField(temp::name(), mtemp); auto mu0 = Metadata({Metadata::Cell, Metadata::Derived, Metadata::OneCopy}, shape); pkg->AddField(u0::name(), mu0); - pkg->AddField(D::name(), mu0); + auto Dshape = shape; + if (params_.two_by_two_diagonal) { + Dshape = std::vector{4}; + } + auto mD = Metadata({Metadata::Cell, Metadata::Derived, Metadata::OneCopy}, Dshape); + pkg->AddField(D::name(), mD); } TaskID AddTasks(TaskList &tl, TaskID dependence, Mesh *pmesh, const int partition) { @@ -97,7 +111,9 @@ class MGSolver { &iter_counter); auto mg_finest = AddLinearOperatorTasks(itl, none, partition, pmesh); auto &md = pmesh->mesh_data.GetOrAdd("base", partition); - auto calc_pointwise_res = eqs_.template Ax(itl, mg_finest, md); + auto comm = AddBoundaryExchangeTasks(mg_finest, itl, md, + pmesh->multilevel); + auto calc_pointwise_res = eqs_.template Ax(itl, comm, md); calc_pointwise_res = itl.AddTask( calc_pointwise_res, AddFieldsAndStoreInteriorSelect, md, 1.0, -1.0, false); @@ -126,13 +142,24 @@ class MGSolver { using namespace utils; iter_counter = 0; - int min_level = 0; + int min_level = std::max(pmesh->GetGMGMaxLevel() - params_.max_coarsenings, 0); int max_level = pmesh->GetGMGMaxLevel(); return AddMultiGridTasksPartitionLevel(tl, dependence, partition, max_level, min_level, max_level, pmesh); } + template + TaskID AddSetupTasks(TL_t &tl, TaskID dependence, int partition, Mesh *pmesh) { + using namespace utils; + + int min_level = std::max(pmesh->GetGMGMaxLevel() - params_.max_coarsenings, 0); + int max_level = pmesh->GetGMGMaxLevel(); + + return AddMultiGridSetupPartitionLevel(tl, dependence, partition, 
max_level, + min_level, max_level, pmesh); + } + Real GetSquaredResidualSum() const { return residual.val; } int GetCurrentIterations() const { return iter_counter; } Real GetFinalResidual() const { return final_residual; } @@ -148,11 +175,8 @@ class MGSolver { // These functions apparently have to be public to compile with cuda since // they contain device side lambdas public: - enum class GSType { all, red, black }; - template - static TaskStatus Jacobi(std::shared_ptr> &md, double weight, - GSType gs_type = GSType::all) { + TaskStatus Jacobi(std::shared_ptr> &md, double weight) { using namespace parthenon; const int ndim = md->GetMeshPointer()->ndim; using TE = parthenon::TopologicalElement; @@ -164,49 +188,88 @@ class MGSolver { int nblocks = md->NumBlocks(); std::vector include_block(nblocks, true); - auto desc = + static auto desc = parthenon::MakePackDescriptor(md.get()); auto pack = desc.GetPack(md.get(), include_block); - parthenon::par_for( - DEFAULT_LOOP_PATTERN, "CaclulateFluxes", DevExecSpace(), 0, pack.GetNBlocks() - 1, - kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, - KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { - const auto &coords = pack.GetCoordinates(b); - if ((i + j + k) % 2 == 1 && gs_type == GSType::red) return; - if ((i + j + k) % 2 == 0 && gs_type == GSType::black) return; - - const int nvars = - pack.GetUpperBound(b, D_t()) - pack.GetLowerBound(b, D_t()) + 1; - for (int c = 0; c < nvars; ++c) { - Real diag_elem = pack(b, te, D_t(c), k, j, i); - - // Get the off-diagonal contribution to Ax = (D + L + U)x = y - Real off_diag = pack(b, te, Axold_t(c), k, j, i) - - diag_elem * pack(b, te, xold_t(c), k, j, i); - - Real val = pack(b, te, rhs_t(c), k, j, i) - off_diag; - pack(b, te, xnew_t(c), k, j, i) = - weight * val / diag_elem + - (1.0 - weight) * pack(b, te, xold_t(c), k, j, i); - } - }); + if (params_.two_by_two_diagonal) { + parthenon::par_for( + DEFAULT_LOOP_PATTERN, "CaclulateFluxes", DevExecSpace(), 0, + pack.GetNBlocks() - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { + const auto &coords = pack.GetCoordinates(b); + + const Real D11 = pack(b, te, D_t(0), k, j, i); + const Real D22 = pack(b, te, D_t(1), k, j, i); + const Real D12 = pack(b, te, D_t(2), k, j, i); + const Real D21 = pack(b, te, D_t(3), k, j, i); + const Real det = D11 * D22 - D12 * D21; + + const Real Du0 = D11 * pack(b, te, xold_t(0), k, j, i) + + D12 * pack(b, te, xold_t(1), k, j, i); + const Real Du1 = D21 * pack(b, te, xold_t(0), k, j, i) + + D22 * pack(b, te, xold_t(1), k, j, i); + + const Real t0 = + pack(b, te, rhs_t(0), k, j, i) - pack(b, te, Axold_t(0), k, j, i) + Du0; + const Real t1 = + pack(b, te, rhs_t(1), k, j, i) - pack(b, te, Axold_t(1), k, j, i) + Du1; + + const Real v0 = (D22 * t0 - D12 * t1) / det; + const Real v1 = (-D21 * t0 + D11 * t1) / det; + + pack(b, te, xnew_t(0), k, j, i) = + weight * v0 + (1.0 - weight) * pack(b, te, xold_t(0), k, j, i); + pack(b, te, xnew_t(1), k, j, i) = + weight * v1 + (1.0 - weight) * pack(b, te, xold_t(1), k, j, i); + }); + } else { + const int scratch_size = 0; + const int scratch_level = 0; + const int nghost = Globals::nghost; + parthenon::par_for_outer( + DEFAULT_OUTER_LOOP_PATTERN, "Jacobi", DevExecSpace(), scratch_size, + scratch_level, 0, pack.GetNBlocks() - 1, kb.s, kb.e, + KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b, const int k) { + const int nvars = + pack.GetUpperBound(b, xnew_t()) - pack.GetLowerBound(b, xnew_t()) + 1; + for (int c = 0; c < nvars; 
++c) { + Real *Ax = &pack(b, te, Axold_t(c), k, jb.s, ib.s); + Real *diag = &pack(b, te, D_t(c), k, jb.s, ib.s); + Real *prhs = &pack(b, te, rhs_t(c), k, jb.s, ib.s); + Real *xo = &pack(b, te, xold_t(c), k, jb.s, ib.s); + Real *xn = &pack(b, te, xnew_t(c), k, jb.s, ib.s); + // Use ptr arithmetic to get the number of points we need to go over + // (including ghost zones) to get from (k, jb.s, ib.s) to (k, jb.e, ib.e) + const int npoints = &pack(b, te, Axold_t(c), k, jb.e, ib.e) - Ax + 1; + parthenon::par_for_inner( + DEFAULT_INNER_LOOP_PATTERN, member, 0, npoints - 1, [&](const int idx) { + const Real off_diag = Ax[idx] - diag[idx] * xo[idx]; + const Real val = prhs[idx] - off_diag; + xn[idx] = weight * val / diag[idx] + (1.0 - weight) * xo[idx]; + }); + } + }); + } return TaskStatus::complete; } template TaskID AddJacobiIteration(TL_t &tl, TaskID depends_on, bool multilevel, Real omega, - std::shared_ptr> &md) { + std::shared_ptr> &md, + std::shared_ptr> &md_comm) { using namespace utils; - auto comm = AddBoundaryExchangeTasks(depends_on, tl, md, multilevel); + auto comm = + AddBoundaryExchangeTasks(depends_on, tl, md_comm, multilevel); auto mat_mult = eqs_.template Ax(tl, comm, md); - return tl.AddTask(mat_mult, Jacobi, md, omega, - GSType::all); + return tl.AddTask(mat_mult, &MGSolver::Jacobi, this, md, + omega); } template TaskID AddSRJIteration(TL_t &tl, TaskID depends_on, int stages, bool multilevel, - std::shared_ptr> &md) { + std::shared_ptr> &md, + std::shared_ptr> &md_comm) { using namespace utils; int ndim = md->GetParentPointer()->ndim; @@ -225,19 +288,47 @@ class MGSolver { // This copy is to set the coarse blocks in temp to the values in u so that // fine-coarse boundaries of temp are correctly updated during communication depends_on = tl.AddTask(depends_on, CopyData, md); - auto jacobi1 = AddJacobiIteration(tl, depends_on, multilevel, - omega[ndim - 1][0], md); - if (stages < 2) { - return tl.AddTask(jacobi1, CopyData, md); - } - auto jacobi2 = AddJacobiIteration(tl, jacobi1, multilevel, - omega[ndim - 1][1], md); - if (stages < 3) return jacobi2; - auto jacobi3 = AddJacobiIteration(tl, jacobi2, multilevel, - omega[ndim - 1][2], md); + auto jacobi1 = AddJacobiIteration( + tl, depends_on, multilevel, omega[ndim - 1][0], md, md_comm); + auto copy1 = tl.AddTask(jacobi1, CopyData, md); + if (stages < 2) return copy1; + auto jacobi2 = AddJacobiIteration( + tl, copy1, multilevel, omega[ndim - 1][1], md, md_comm); + auto copy2 = tl.AddTask(jacobi2, CopyData, md); + if (stages < 3) return copy2; + auto jacobi3 = AddJacobiIteration( + tl, copy2, multilevel, omega[ndim - 1][2], md, md_comm); return tl.AddTask(jacobi3, CopyData, md); } + template + TaskID AddMultiGridSetupPartitionLevel(TL_t &tl, TaskID dependence, int partition, + int level, int min_level, int max_level, + Mesh *pmesh) { + using namespace utils; + + bool multilevel = (level != min_level); + + auto &md = pmesh->gmg_mesh_data[level].GetOrAdd(level, "base", partition); + + auto task_out = dependence; + if (level < max_level) { + task_out = + tl.AddTask(task_out, ReceiveBoundBufs, md); + task_out = tl.AddTask(task_out, SetBounds, md); + } + + // If we are finer than the coarsest level: + if (level > min_level) { + task_out = tl.AddTask(task_out, SendBoundBufs, md); + task_out = AddMultiGridSetupPartitionLevel(tl, task_out, partition, level - 1, + min_level, max_level, pmesh); + } + + // The boundaries are not up to date on return + return task_out; + } + TaskID AddMultiGridTasksPartitionLevel(TaskList &tl, TaskID 
dependence, int partition, int level, int min_level, int max_level, Mesh *pmesh) { @@ -264,25 +355,32 @@ class MGSolver { bool multilevel = (level != min_level); auto &md = pmesh->gmg_mesh_data[level].GetOrAdd(level, "base", partition); + std::string label = "comm_" + std::to_string(level) + "_" + std::to_string(partition); + auto &md_comm = pmesh->gmg_mesh_data[level].AddShallow( + label, md, std::vector{u::name(), res_err::name()}); // 0. Receive residual from coarser level if there is one auto set_from_finer = dependence; if (level < max_level) { // Fill fields with restricted values - auto recv_from_finer = - tl.AddTask(dependence, ReceiveBoundBufs, md); + auto recv_from_finer = tl.AddTask( + dependence, ReceiveBoundBufs, md_comm); set_from_finer = tl.AddTask( // TaskQualifier::local_sync, // is this required? - recv_from_finer, SetBounds, md); + recv_from_finer, SetBounds, md_comm); // 1. Copy residual from dual purpose communication field to the rhs, should be // actual RHS for finest level - auto copy_u = tl.AddTask(set_from_finer, CopyData, md); if (!do_FAS) { - auto zero_u = tl.AddTask(copy_u, SetToZero, md); + auto zero_u = tl.AddTask(set_from_finer, SetToZero, md); auto copy_rhs = tl.AddTask(set_from_finer, CopyData, md); - set_from_finer = zero_u | copy_u | copy_rhs; + set_from_finer = zero_u | copy_rhs; } else { + // TODO(LFR): Determine if this boundary exchange task is required, I think it is + // to make sure that the boundaries of the restricted u are up to date before + // calling Ax. That being said, at least in one case commenting this line out + // didn't seem to impact the solution. set_from_finer = AddBoundaryExchangeTasks( - set_from_finer, tl, md, multilevel); + set_from_finer, tl, md_comm, multilevel); + set_from_finer = tl.AddTask(set_from_finer, CopyData, md); // This should set the rhs only in blocks that correspond to interior nodes, the // RHS of leaf blocks that are on this GMG level should have already been set on // entry into multigrid @@ -290,7 +388,6 @@ class MGSolver { set_from_finer = tl.AddTask( set_from_finer, AddFieldsAndStoreInteriorSelect, md, 1.0, 1.0, true); - set_from_finer = set_from_finer | copy_u; } } else { set_from_finer = tl.AddTask(set_from_finer, CopyData, md); @@ -299,14 +396,14 @@ class MGSolver { // 2. Do pre-smooth and fill solution on this level set_from_finer = tl.AddTask(set_from_finer, &equations::template SetDiagonal, &eqs_, md); - auto pre_smooth = AddSRJIteration(tl, set_from_finer, - pre_stages, multilevel, md); + auto pre_smooth = AddSRJIteration( + tl, set_from_finer, pre_stages, multilevel, md, md_comm); // If we are finer than the coarsest level: auto post_smooth = pre_smooth; if (level > min_level) { // 3. Communicate same level boundaries so that u is up to date everywhere - auto comm_u = AddBoundaryExchangeTasks(pre_smooth, tl, md, - multilevel); + auto comm_u = AddBoundaryExchangeTasks(pre_smooth, tl, + md_comm, multilevel); // 4. Caclulate residual and store in communication field auto residual = eqs_.template Ax(tl, comm_u, md); @@ -316,18 +413,18 @@ class MGSolver { // 5. Restrict communication field and send to next level auto communicate_to_coarse = - tl.AddTask(residual, SendBoundBufs, md); + tl.AddTask(residual, SendBoundBufs, md_comm); auto coarser = AddMultiGridTasksPartitionLevel( tl, communicate_to_coarse, partition, level - 1, min_level, max_level, pmesh); // 6. 
Receive error field into communication field and prolongate - auto recv_from_coarser = - tl.AddTask(coarser, ReceiveBoundBufs, md); - auto set_from_coarser = - tl.AddTask(recv_from_coarser, SetBounds, md); + auto recv_from_coarser = tl.AddTask( + coarser, ReceiveBoundBufs, md_comm); + auto set_from_coarser = tl.AddTask( + recv_from_coarser, SetBounds, md_comm); auto prolongate = tl.AddTask( // TaskQualifier::local_sync, // is this required? - set_from_coarser, ProlongateBounds, md); + set_from_coarser, ProlongateBounds, md_comm); // 7. Correct solution on this level with res_err field and store in // communication field @@ -336,14 +433,14 @@ class MGSolver { // 8. Post smooth using communication field and stored RHS post_smooth = AddSRJIteration(tl, update_sol, post_stages, - multilevel, md); + multilevel, md, md_comm); } else { post_smooth = tl.AddTask(pre_smooth, CopyData, md); } // 9. Send communication field to next finer level (should be error field for that // level) - TaskID last_task; + TaskID last_task = post_smooth; if (level < max_level) { auto copy_over = post_smooth; if (!do_FAS) { @@ -353,14 +450,18 @@ class MGSolver { md, 1.0, -1.0); copy_over = calc_err; } + // This is required to make sure boundaries of res_err are up to date before + // prolongation + copy_over = tl.AddTask(copy_over, CopyData, md); + copy_over = tl.AddTask(copy_over, CopyData, md); auto boundary = AddBoundaryExchangeTasks(copy_over, tl, md, multilevel); + auto copy_back = tl.AddTask(boundary, CopyData, md); + copy_back = tl.AddTask(copy_back, CopyData, md); last_task = - tl.AddTask(boundary, SendBoundBufs, md); - } else { - last_task = AddBoundaryExchangeTasks(post_smooth, tl, md, - multilevel); + tl.AddTask(copy_back, SendBoundBufs, md); } + // The boundaries are not up to date on return return last_task; } }; diff --git a/src/solvers/solver_utils.hpp b/src/solvers/solver_utils.hpp index 76f77ec7a298..871462d11f31 100644 --- a/src/solvers/solver_utils.hpp +++ b/src/solvers/solver_utils.hpp @@ -1,5 +1,5 @@ //======================================================================================== -// (C) (or copyright) 2021. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2021-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC @@ -13,6 +13,8 @@ #ifndef SOLVERS_SOLVER_UTILS_HPP_ #define SOLVERS_SOLVER_UTILS_HPP_ +#include +#include #include #include #include @@ -146,7 +148,7 @@ struct Stencil { }; namespace utils { -template +template TaskStatus CopyData(const std::shared_ptr> &md) { using TE = parthenon::TopologicalElement; TE te = TE::CC; @@ -154,26 +156,32 @@ TaskStatus CopyData(const std::shared_ptr> &md) { IndexRange jb = md->GetBoundsJ(IndexDomain::entire, te); IndexRange kb = md->GetBoundsK(IndexDomain::entire, te); - auto desc = parthenon::MakePackDescriptor(md.get()); - auto pack = desc.GetPack(md.get(), {}, only_fine_on_composite); - parthenon::par_for( - DEFAULT_LOOP_PATTERN, "CopyData", DevExecSpace(), 0, pack.GetNBlocks() - 1, kb.s, - kb.e, jb.s, jb.e, ib.s, ib.e, - KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { - // TODO(LFR): If this becomes a bottleneck, exploit hierarchical parallelism and - // pull the loop over vars outside of the innermost loop to promote - // vectorization. 
- const int nvars = pack.GetUpperBound(b, in()) - pack.GetLowerBound(b, in()) + 1; - for (int c = 0; c < nvars; ++c) - pack(b, te, out(c), k, j, i) = pack(b, te, in(c), k, j, i); + static auto desc = parthenon::MakePackDescriptor(md.get()); + auto pack = desc.GetPack(md.get(), only_fine_on_composite); + const int scratch_size = 0; + const int scratch_level = 0; + // Warning: This inner loop strategy only works because we are using IndexDomain::entire + const int npoints_inner = (kb.e - kb.s + 1) * (jb.e - jb.s + 1) * (ib.e - ib.s + 1); + parthenon::par_for_outer( + DEFAULT_OUTER_LOOP_PATTERN, "CopyData", DevExecSpace(), scratch_size, scratch_level, + 0, pack.GetNBlocks() - 1, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b) { + const int nvars = + pack.GetUpperBound(b, in_t()) - pack.GetLowerBound(b, in_t()) + 1; + for (int c = 0; c < nvars; ++c) { + Real *in = &pack(b, te, in_t(c), kb.s, jb.s, ib.s); + Real *out = &pack(b, te, out_t(c), kb.s, jb.s, ib.s); + parthenon::par_for_inner(DEFAULT_INNER_LOOP_PATTERN, member, 0, + npoints_inner - 1, + [&](const int idx) { out[idx] = in[idx]; }); + } }); return TaskStatus::complete; } -template +template TaskStatus AddFieldsAndStoreInteriorSelect(const std::shared_ptr> &md, Real wa = 1.0, Real wb = 1.0, - bool only_interior = false) { + bool only_interior_blocks = false) { using TE = parthenon::TopologicalElement; TE te = TE::CC; IndexRange ib = md->GetBoundsI(IndexDomain::entire, te); @@ -182,25 +190,30 @@ TaskStatus AddFieldsAndStoreInteriorSelect(const std::shared_ptr> int nblocks = md->NumBlocks(); std::vector include_block(nblocks, true); - if (only_interior) { + if (only_interior_blocks) { // The neighbors array will only be set for a block if its a leaf block for (int b = 0; b < nblocks; ++b) include_block[b] = md->GetBlockData(b)->GetBlockPointer()->neighbors.size() == 0; } - auto desc = parthenon::MakePackDescriptor(md.get()); + static auto desc = parthenon::MakePackDescriptor(md.get()); auto pack = desc.GetPack(md.get(), include_block, only_fine_on_composite); - parthenon::par_for( - DEFAULT_LOOP_PATTERN, "AddFieldsAndStore", DevExecSpace(), 0, pack.GetNBlocks() - 1, - kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, - KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { - // TODO(LFR): If this becomes a bottleneck, exploit hierarchical parallelism and - // pull the loop over vars outside of the innermost loop to promote - // vectorization. 
+ const int scratch_size = 0; + const int scratch_level = 0; + // Warning: This inner loop strategy only works because we are using IndexDomain::entire + const int npoints_inner = (kb.e - kb.s + 1) * (jb.e - jb.s + 1) * (ib.e - ib.s + 1); + parthenon::par_for_outer( + DEFAULT_OUTER_LOOP_PATTERN, "AddFieldsAndStore", DevExecSpace(), scratch_size, + scratch_level, 0, pack.GetNBlocks() - 1, + KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b) { const int nvars = pack.GetUpperBound(b, a_t()) - pack.GetLowerBound(b, a_t()) + 1; for (int c = 0; c < nvars; ++c) { - pack(b, te, out(c), k, j, i) = - wa * pack(b, te, a_t(c), k, j, i) + wb * pack(b, te, b_t(c), k, j, i); + Real *avar = &pack(b, te, a_t(c), kb.s, jb.s, ib.s); + Real *bvar = &pack(b, te, b_t(c), kb.s, jb.s, ib.s); + Real *out = &pack(b, te, out_t(c), kb.s, jb.s, ib.s); + parthenon::par_for_inner( + DEFAULT_INNER_LOOP_PATTERN, member, 0, npoints_inner - 1, + [&](const int idx) { out[idx] = wa * avar[idx] + wb * bvar[idx]; }); } }); return TaskStatus::complete; @@ -218,9 +231,8 @@ TaskStatus SetToZero(const std::shared_ptr> &md) { int nblocks = md->NumBlocks(); using TE = parthenon::TopologicalElement; TE te = TE::CC; - std::vector include_block(nblocks, true); - auto desc = parthenon::MakePackDescriptor(md.get()); - auto pack = desc.GetPack(md.get(), include_block, only_fine_on_composite); + static auto desc = parthenon::MakePackDescriptor(md.get()); + auto pack = desc.GetPack(md.get(), only_fine_on_composite); const size_t scratch_size_in_bytes = 0; const int scratch_level = 1; const int ng = parthenon::Globals::nghost; @@ -253,7 +265,7 @@ TaskStatus DotProductLocal(const std::shared_ptr> &md, IndexRange jb = md->GetBoundsJ(IndexDomain::interior, te); IndexRange kb = md->GetBoundsK(IndexDomain::interior, te); - auto desc = parthenon::MakePackDescriptor(md.get()); + static auto desc = parthenon::MakePackDescriptor(md.get()); auto pack = desc.GetPack(md.get()); Real gsum(0); parthenon::par_reduce( @@ -293,6 +305,53 @@ TaskID DotProduct(TaskID dependency_in, TaskList &tl, AllReduce *adotb, return finish_global_adotb; } +template +TaskStatus GlobalMinLocal(const std::shared_ptr> &md, + AllReduce *amin) { + using TE = parthenon::TopologicalElement; + TE te = TE::CC; + IndexRange ib = md->GetBoundsI(IndexDomain::interior, te); + IndexRange jb = md->GetBoundsJ(IndexDomain::interior, te); + IndexRange kb = md->GetBoundsK(IndexDomain::interior, te); + + static auto desc = parthenon::MakePackDescriptor(md.get()); + auto pack = desc.GetPack(md.get()); + Real gmin(0); + parthenon::par_reduce( + parthenon::loop_pattern_mdrange_tag, "DotProduct", DevExecSpace(), 0, + pack.GetNBlocks() - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + KOKKOS_LAMBDA(const int b, const int k, const int j, const int i, Real &lmin) { + const int nvars = pack.GetUpperBound(b, a_t()) - pack.GetLowerBound(b, a_t()) + 1; + // TODO(LFR): If this becomes a bottleneck, exploit hierarchical parallelism and + // pull the loop over vars outside of the innermost loop to promote + // vectorization. 
+ for (int c = 0; c < nvars; ++c) + lmin = std::min(lmin, pack(b, te, a_t(c), k, j, i)); + }, + Kokkos::Min(gmin)); + amin->val = std::min(gmin, amin->val); + return TaskStatus::complete; +} + +template +TaskID GlobalMin(TaskID dependency_in, TaskList &tl, AllReduce *amin, + const std::shared_ptr> &md) { + using namespace impl; + auto max_amin = tl.AddTask( + TaskQualifier::once_per_region | TaskQualifier::local_sync, dependency_in, + [](AllReduce *r) { + r->val = std::numeric_limits::max(); + return TaskStatus::complete; + }, + amin); + auto get_amin = + tl.AddTask(TaskQualifier::local_sync, max_amin, GlobalMinLocal, md, amin); + auto start_global_amin = tl.AddTask(TaskQualifier::once_per_region, get_amin, + &AllReduce::StartReduce, amin, MPI_MIN); + return tl.AddTask(TaskQualifier::once_per_region | TaskQualifier::local_sync, + start_global_amin, &AllReduce::CheckReduce, amin); +} + } // namespace utils } // namespace solvers
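Note on the NaN guard added in example/poisson_gmg/main.cpp: the checks of the form `driver.final_rms_residual != driver.final_rms_residual` work because, under IEEE 754, a floating-point value compares unequal to itself only when it is NaN, so a diverged solve that produces NaN (rather than a merely large residual) is now also counted as a failure. Below is a minimal standalone sketch of the idiom; `failed_residual_check` and its arguments are illustrative names, not part of the example driver.

```cpp
#include <cstdio>
#include <limits>

// Returns true if the solver result should be rejected.
// `x != x` is true only for NaN under IEEE 754, mirroring the driver's
// self-comparison guard; std::isnan(x) is the equivalent library call.
bool failed_residual_check(double rms_residual, double tolerance) {
  if (rms_residual != rms_residual) return true; // NaN -> reject
  return rms_residual > tolerance;               // too large -> reject
}

int main() {
  std::printf("%d\n", failed_residual_check(1.0e-12, 1.0e-10)); // 0: passes
  std::printf("%d\n", failed_residual_check(
                          std::numeric_limits<double>::quiet_NaN(), 1.0e-10)); // 1: NaN caught
}
```

Self-comparison relies on strict IEEE comparison semantics; under fast-math style compiler flags it can be folded away, in which case `std::isnan` is the safer spelling.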
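Note on the cached pack descriptors in src/bvals/boundary_conditions_generic.hpp: `GenericBC` used to assemble a fresh pack descriptor (metadata flags plus the optional `PDOpt::Coarse` option) on every boundary call; the patch instead builds all eight descriptors once, stores them in a function-local `static` `std::unordered_map` keyed by `(coarse, TopologicalType)` via `tuple_hash`, and does only a map lookup per call. The sketch below shows the same caching pattern in isolation; `Descriptor`, `build_descriptor`, and `get_descriptor` are hypothetical stand-ins, not Parthenon API.

```cpp
#include <cstddef>
#include <cstdio>
#include <string>
#include <tuple>
#include <unordered_map>

// Stand-in for an expensive-to-build pack descriptor.
struct Descriptor {
  std::string tag;
};

// Hash for a std::tuple key, analogous in spirit to Parthenon's tuple_hash.
struct TupleHash {
  std::size_t operator()(const std::tuple<bool, int> &t) const {
    std::size_t h1 = std::hash<bool>{}(std::get<0>(t));
    std::size_t h2 = std::hash<int>{}(std::get<1>(t));
    return h1 ^ (h2 + 0x9e3779b9 + (h1 << 6) + (h1 >> 2));
  }
};

Descriptor build_descriptor(bool coarse, int topological_type) {
  // Pretend this is expensive (flag lookups, allocation, ...).
  std::printf("building descriptor coarse=%d type=%d\n", coarse, topological_type);
  return {std::string("desc_") + (coarse ? "c" : "f") + std::to_string(topological_type)};
}

const Descriptor &get_descriptor(bool coarse, int topological_type) {
  // Built once on first use and cached for all later calls, mirroring the
  // `static auto descriptors = impl::GetPackDescriptorMap(rc);` pattern in the patch.
  static std::unordered_map<std::tuple<bool, int>, Descriptor, TupleHash> cache = [] {
    std::unordered_map<std::tuple<bool, int>, Descriptor, TupleHash> m;
    for (bool c : {false, true})
      for (int tt : {0, 1, 2, 3}) // Cell, Face, Edge, Node
        m.emplace(std::make_tuple(c, tt), build_descriptor(c, tt));
    return m;
  }();
  return cache.at(std::make_tuple(coarse, topological_type));
}

int main() {
  get_descriptor(true, 0);  // all entries built on first call
  get_descriptor(false, 2); // subsequent calls are lookups only
  std::printf("%s\n", get_descriptor(false, 2).tag.c_str());
}
```

The pattern is only safe when the cached objects do not depend on per-call state; here that corresponds to the field and metadata configuration being fixed after initialization.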
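Note on the flattened inner loops in src/solvers/mg_solver.hpp and src/solvers/solver_utils.hpp: `Jacobi`, `CopyData`, and `AddFieldsAndStoreInteriorSelect` are rewritten from a per-cell `par_for` into a team-based `par_for_outer` over blocks (and, for the smoother, k-planes) with a linear `par_for_inner` that walks raw pointers starting at `(k, jb.s, ib.s)`. As the in-code warning notes, this is valid only because the loops cover `IndexDomain::entire`, so the padded extent of each plane (or whole block) is one contiguous run of memory. A plain C++ sketch of the idea on a ghost-padded 2D array follows (no Kokkos; the array shape and names are illustrative):

```cpp
#include <cstdio>
#include <vector>

int main() {
  // A single "plane" at fixed k: interior cells plus nghost ghosts per side.
  const int nghost = 2, nj_int = 8, ni_int = 8;
  const int nj = nj_int + 2 * nghost, ni = ni_int + 2 * nghost;
  std::vector<double> in(nj * ni, 1.0), out(nj * ni, 0.0);

  // Per-cell form: two nested loops, with index arithmetic in every iteration.
  for (int j = 0; j < nj; ++j)
    for (int i = 0; i < ni; ++i)
      out[j * ni + i] = in[j * ni + i];

  // Flattened form, as in the patch: when the loop covers the *entire* padded
  // extent, the (j, i) pairs map to one contiguous span of memory, so a single
  // linear loop from the first to the last point suffices. The inclusive point
  // count is obtained by pointer difference, as in the Jacobi rewrite.
  double *pin = &in[0];
  double *pout = &out[0];
  const int npoints = &in[(nj - 1) * ni + (ni - 1)] - pin + 1;
  for (int idx = 0; idx < npoints; ++idx) pout[idx] = pin[idx];

  std::printf("copied %d points\n", npoints);
}
```

Collapsing the two indices into one contiguous, unit-stride range removes the per-cell index arithmetic and gives the compiler or device backend a simple loop to vectorize, which is the performance point of this part of the patch.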