From d682ec81c065140de14feb05cfb41b28d626b32c Mon Sep 17 00:00:00 2001 From: Nai-Yuan Chiang Date: Mon, 13 Mar 2023 20:16:04 -0700 Subject: [PATCH] Sparse solver on GPU (#600) * add sparse raja example * update cmake file * add driver * clean the code --- src/Drivers/MDS/NlpMdsRajaEx1.hpp | 4 +- src/Drivers/Sparse/CMakeLists.txt | 19 + src/Drivers/Sparse/NlpSparseEx2.cpp | 3 +- src/Drivers/Sparse/NlpSparseRajaEx2.cpp | 484 ++++++++++++++++++ src/Drivers/Sparse/NlpSparseRajaEx2.hpp | 309 +++++++++++ src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp | 318 ++++++++++++ src/LinAlg/hiopLinSolver.cpp | 4 +- src/LinAlg/hiopLinSolverSparseCUSOLVER.cpp | 85 +++ src/LinAlg/hiopLinSolverSparseCUSOLVER.hpp | 27 +- src/LinAlg/hiopMatrixRajaSparseTriplet.hpp | 4 +- .../hiopMatrixRajaSparseTripletImpl.hpp | 14 +- src/Optimization/hiopKKTLinSysSparse.cpp | 45 +- .../hiopKKTLinSysSparseCondensed.hpp | 1 - 13 files changed, 1286 insertions(+), 31 deletions(-) create mode 100644 src/Drivers/Sparse/NlpSparseRajaEx2.cpp create mode 100644 src/Drivers/Sparse/NlpSparseRajaEx2.hpp create mode 100644 src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp diff --git a/src/Drivers/MDS/NlpMdsRajaEx1.hpp b/src/Drivers/MDS/NlpMdsRajaEx1.hpp index c43346ccd..fef3747d6 100644 --- a/src/Drivers/MDS/NlpMdsRajaEx1.hpp +++ b/src/Drivers/MDS/NlpMdsRajaEx1.hpp @@ -170,7 +170,7 @@ class MdsEx1 : public hiop::hiopInterfaceMDS * Returns the sizes and number of nonzeros of the sparse and dense blocks within MDS * * @param[out] nx_sparse number of sparse variables - * @param[out] nx_ense number of dense variables + * @param[out] nx_dense number of dense variables * @param[out] nnz_sparse_Jace number of nonzeros in the Jacobian of the equalities w.r.t. * sparse variables * @param[out] nnz_sparse_Jaci number of nonzeros in the Jacobian of the inequalities w.r.t. 
@@ -233,7 +233,7 @@ class MdsEx1 : public hiop::hiopInterfaceMDS
    * @param[in] x array with the optimization variables or point at which to evaluate
    * (managed by Umpire)
    * @param[in] new_x indicates whether any of the other eval functions have been evaluated
-   * previously (false) or not (true) at x 
+   * previously (false) or not (true) at x
    * @param[out] gradf array with the values of the gradient (managed by Umpire)
    */
   bool eval_grad_f(const size_type& n, const double* x, bool new_x, double* gradf);
diff --git a/src/Drivers/Sparse/CMakeLists.txt b/src/Drivers/Sparse/CMakeLists.txt
index 646c6e43b..7c96910e4 100644
--- a/src/Drivers/Sparse/CMakeLists.txt
+++ b/src/Drivers/Sparse/CMakeLists.txt
@@ -7,6 +7,20 @@ target_link_libraries(NlpSparseEx2.exe HiOp::HiOp)
 add_executable(NlpSparseEx3.exe NlpSparseEx3.cpp NlpSparseEx3Driver.cpp)
 target_link_libraries(NlpSparseEx3.exe HiOp::HiOp)
 
+if(HIOP_USE_RAJA)
+  if(HIOP_USE_GPU AND HIOP_USE_CUDA)
+    set_source_files_properties(
+      NlpSparseRajaEx2.cpp
+      NlpSparseRajaEx2Driver.cpp
+      PROPERTIES LANGUAGE CUDA
+    )
+
+    add_executable(NlpSparseRajaEx2.exe NlpSparseRajaEx2Driver.cpp NlpSparseRajaEx2.cpp)
+    target_link_libraries(NlpSparseRajaEx2.exe HiOp::HiOp)
+    install(TARGETS NlpSparseRajaEx2.exe DESTINATION bin)
+  endif()
+endif()
+
 if(HIOP_BUILD_SHARED)
   add_executable(NlpSparseCEx1.exe NlpSparseCEx1.c)
   target_link_libraries(NlpSparseCEx1.exe HiOp::HiOp)
@@ -52,6 +66,11 @@ if(HIOP_USE_GINKGO)
     add_test(NAME NlpSparse2_6 COMMAND ${RUNCMD} "$<TARGET_FILE:NlpSparseEx2.exe>" "500" "-ginkgo_hip" "-inertiafree" "-selfcheck")
   endif(HIOP_USE_HIP)
 endif(HIOP_USE_GINKGO)
+
+if(HIOP_USE_RAJA AND HIOP_USE_GPU AND HIOP_USE_CUDA)
+  add_test(NAME NlpSparseRaja2_1 COMMAND ${RUNCMD} bash -c "$<TARGET_FILE:NlpSparseRajaEx2.exe> 500 -selfcheck ")
+endif()
+
 add_test(NAME NlpSparse3_1 COMMAND ${RUNCMD} "$<TARGET_FILE:NlpSparseEx3.exe>" "500" "-selfcheck")
 if(HIOP_BUILD_SHARED AND NOT HIOP_USE_GPU)
   add_test(NAME NlpSparseCinterface COMMAND ${RUNCMD} "$<TARGET_FILE:NlpSparseCEx1.exe>")
diff --git a/src/Drivers/Sparse/NlpSparseEx2.cpp b/src/Drivers/Sparse/NlpSparseEx2.cpp
index 72073e914..ce4a0615e 100644
--- a/src/Drivers/Sparse/NlpSparseEx2.cpp
+++ b/src/Drivers/Sparse/NlpSparseEx2.cpp
@@ -153,7 +153,6 @@ bool SparseEx2::eval_cons(const size_type& n,
   return false;
 }
 
-/* Four constraints no matter how large n is */
 bool SparseEx2::eval_cons(const size_type& n, const size_type& m, const double* x, bool new_x, double* cons)
 {
   assert(n==n_vars); assert(m==n_cons);
@@ -170,7 +169,7 @@ bool SparseEx2::eval_cons(const size_type& n, const size_type& m, const double*
   // --- constraint 2 body ---> 2*x_1 + x_3
   cons[conidx++] += 2*x[0] + 1*x[2];
 
-  // --- constraint 3 body ---> 2*x_1 + 0.5*x_i, for i>=4 
+  // --- constraint 3 body ---> 2*x_1 + 0.5*x_i, for i>=4
   for(auto i=3; i<n; i++) {
     cons[conidx++] += 2*x[0] + 0.5*x[i];
   }
diff --git a/src/Drivers/Sparse/NlpSparseRajaEx2.cpp b/src/Drivers/Sparse/NlpSparseRajaEx2.cpp
new file mode 100644
--- /dev/null
+++ b/src/Drivers/Sparse/NlpSparseRajaEx2.cpp
@@ -0,0 +1,484 @@
+// Copyright (c) 2017, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory (LLNL).
+// LLNL-CODE-742473. All rights reserved.
+//
+// This file is part of HiOp. For details, see https://github.com/LLNL/hiop. HiOp
+// is released under the BSD 3-clause license (https://opensource.org/licenses/BSD-3-Clause).
+// Please also read "Additional BSD Notice" below.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+// i. Redistributions of source code must retain the above copyright notice, this list
+// of conditions and the disclaimer below.
+// ii. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the disclaimer (as noted below) in the documentation and/or
+// other materials provided with the distribution.
+// iii. Neither the name of the LLNS/LLNL nor the names of its contributors may be used to
+// endorse or promote products derived from this software without specific prior written
+// permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
+// SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, THE U.S. DEPARTMENT OF ENERGY OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Additional BSD Notice
+// 1. This notice is required to be provided under our contract with the U.S. Department
+// of Energy (DOE). This work was produced at Lawrence Livermore National Laboratory under
+// Contract No. DE-AC52-07NA27344 with the DOE.
+// 2. Neither the United States Government nor Lawrence Livermore National Security, LLC
+// nor any of their employees, makes any warranty, express or implied, or assumes any
+// liability or responsibility for the accuracy, completeness, or usefulness of any
+// information, apparatus, product, or process disclosed, or represents that its use would
+// not infringe privately-owned rights.
+// 3. Also, reference herein to any specific commercial products, process, or services by
+// trade name, trademark, manufacturer or otherwise does not necessarily constitute or
+// imply its endorsement, recommendation, or favoring by the United States Government or
+// Lawrence Livermore National Security, LLC. The views and opinions of authors expressed
+// herein do not necessarily state or reflect those of the United States Government or
+// Lawrence Livermore National Security, LLC, and shall not be used for advertising or
+// product endorsement purposes.
+
+/**
+ * @file NlpSparseRajaEx2.cpp
+ *
+ * @author Nai-Yuan Chiang, LLNL
+ *
+ */
+
+#include "NlpSparseRajaEx2.hpp"
+
+#include <umpire/Allocator.hpp>
+#include <umpire/ResourceManager.hpp>
+#include <RAJA/RAJA.hpp>
+
+//#include
+
+#include <cassert>
+#include <cstring> //for memcpy
+#include <cstdio>
+
+
+//TODO: A good idea to not use the internal HiOp RAJA policies here and, instead, give self-contained
+// definitions of the policies here so that the user gets a better grasp of the concept and does not
+// rely on the internals of HiOp.
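+// Only three pieces are needed: an execution policy, a matching reduction policy,
+// and a device lambda annotation for the chosen backend.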
For example: +// #define RAJA_LAMBDA [=] __device__ +// using ex2_raja_exec = RAJA::cuda_exec<128>; +// more defs here + + +#if defined(HIOP_USE_CUDA) +#include "ExecPoliciesRajaCudaImpl.hpp" +using ex2_raja_exec = hiop::ExecRajaPoliciesBackend::hiop_raja_exec; +using ex2_raja_reduce = hiop::ExecRajaPoliciesBackend::hiop_raja_reduce; +//using hiopMatrixRajaSparse = hiop::hiopMatrixRajaSparseTriplet; +#elif defined(HIOP_USE_HIP) +#include +using ex2_raja_exec = hiop::ExecRajaPoliciesBackend::hiop_raja_exec; +using ex2_raja_reduce = hiop::ExecRajaPoliciesBackend::hiop_raja_reduce; +//using hiopMatrixRajaSparse = hiop::hiopMatrixRajaSparseTriplet; +#else +//#if !defined(HIOP_USE_CUDA) && !defined(HIOP_USE_HIP) +#include +using ex2_raja_exec = hiop::ExecRajaPoliciesBackend::hiop_raja_exec; +using ex2_raja_reduce = hiop::ExecRajaPoliciesBackend::hiop_raja_reduce; +//using hiopMatrixRajaSparse = hiop::hiopMatrixRajaSparseTriplet; +#endif + + +/** Nonlinear *highly nonconvex* and *rank deficient* problem test for the Filter IPM + * Newton of HiOp. It uses a Sparse NLP formulation. The problem is based on SparseEx1. + * + * min (2*convex_obj-1)*scal*sum 1/4* { (x_{i}-1)^4 : i=1,...,n} + 0.5x^Tx + * s.t. + * 4*x_1 + 2*x_2 == 10 + * 5<= 2*x_1 + x_3 + * 1<= 2*x_1 + 0.5*x_i <= 2*n, for i=4,...,n + * x_1 free + * 0.0 <= x_2 + * 1.0 <= x_3 <= 10 + * x_i >=0.5, i=4,...,n + * + * Optionally, one can add the following constraints to obtain a rank-deficient Jacobian + * + * s.t. [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ] (rnkdef-con1) + * 4*x_1 + 2*x_2 == 10 (rnkdef-con2) + * + * other parameters are: + * convex_obj: set to 1 to have a convex problem, otherwise set it to 0. + * scale_quartic_obj_term: scaling factor for the quartic term in the objective (1.0 by default). + * + */ +SparseRajaEx2::SparseRajaEx2(std::string mem_space, + int n, + bool convex_obj, + bool rankdefic_Jac_eq, + bool rankdefic_Jac_ineq, + double scal_neg_obj) + : mem_space_{mem_space}, + convex_obj_{convex_obj}, + rankdefic_eq_{rankdefic_Jac_eq}, + rankdefic_ineq_{rankdefic_Jac_ineq}, + n_vars_{n}, + scal_neg_obj_{scal_neg_obj}, + n_cons_{2} +{ + // Make sure mem_space_ is uppercase + transform(mem_space_.begin(), mem_space_.end(), mem_space_.begin(), ::toupper); + + assert(n>=3 && "number of variables should be greater than 3 for this example"); + if(n>3) { + n_cons_ += n-3; + } + n_cons_ += rankdefic_eq_ + rankdefic_ineq_; +} + +SparseRajaEx2::~SparseRajaEx2() +{ +} + +bool SparseRajaEx2::get_prob_sizes(size_type& n, size_type& m) +{ + n = n_vars_; + m = n_cons_; + return true; +} + +bool SparseRajaEx2::get_vars_info(const size_type& n, double *xlow, double* xupp, NonlinearityType* type) +{ + assert(n==n_vars_); + + RAJA::forall(RAJA::RangeSegment(0, 1), + RAJA_LAMBDA(RAJA::Index_type i) + { + xlow[0] = -1e20; + xupp[0] = 1e20; +// type[0] = hiopNonlinear; + xlow[1] = 0.0; + xupp[1] = 1e20; +// type[1] = hiopNonlinear; + xlow[2] = 1.0; + xupp[2] = 10.0; +// type[2] = hiopNonlinear; + }); + + if(n>3) { + RAJA::forall(RAJA::RangeSegment(3, n), + RAJA_LAMBDA(RAJA::Index_type i) + { + xlow[i] = 0.5; + xupp[i] = 1e20; +// type[i] = hiopNonlinear; + }); + } + + // Use a sequential policy for host computations for now + RAJA::forall(RAJA::RangeSegment(0, n), + [=] (RAJA::Index_type i) + { + type[i] = hiopNonlinear; + }); + + return true; +} + +bool SparseRajaEx2::get_cons_info(const size_type& m, double* clow, double* cupp, NonlinearityType* type) +{ + assert(m==n_cons_); + size_type n = n_vars_; + assert(m-1 == n-1+rankdefic_ineq_); + + // RAJA 
doesn't like member objects + bool rankdefic_eq = rankdefic_eq_; + bool rankdefic_ineq = rankdefic_ineq_; + + // serial part + RAJA::forall(RAJA::RangeSegment(0, 1), + RAJA_LAMBDA(RAJA::Index_type i) + { + clow[0] = 10.0; + cupp[0] = 10.0; +// type[0] = hiopInterfaceBase::hiopNonlinear; + clow[1] = 5.0; + cupp[1] = 1e20; +// type[1] = hiopInterfaceBase::hiopNonlinear; + + if(rankdefic_ineq) { + // [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ] + clow[n-1] = -1e+20; + cupp[n-1] = 19.; +// type[n-1] = hiopInterfaceBase::hiopNonlinear; + } + + if(rankdefic_eq) { + // 4*x_1 + 2*x_2 == 10 + clow[m-1] = 10; + cupp[m-1] = 10; +// type[m-1] = hiopInterfaceBase::hiopNonlinear; + } + }); + + if(n>3) { + RAJA::forall(RAJA::RangeSegment(2, n-1), + RAJA_LAMBDA(RAJA::Index_type conidx) + { + clow[conidx] = 1.0; + cupp[conidx] = 2*n; +// type[conidx] = hiopInterfaceBase::hiopNonlinear; + }); + } + + // Must be a sequential host policy for now + RAJA::forall(RAJA::RangeSegment(0, m), + [=] (RAJA::Index_type i) + { + type[i] = hiopNonlinear; + }); + + return true; +} + +bool SparseRajaEx2::get_sparse_blocks_info(int& nx, + int& nnz_sparse_Jaceq, + int& nnz_sparse_Jacineq, + int& nnz_sparse_Hess_Lagr) +{ + nx = n_vars_;; + nnz_sparse_Jaceq = 2 + 2*rankdefic_eq_; + nnz_sparse_Jacineq = 2 + 2*(n_vars_-3) + 2*rankdefic_ineq_; + nnz_sparse_Hess_Lagr = n_vars_; + return true; +} + +bool SparseRajaEx2::eval_f(const size_type& n, const double* x, bool new_x, double& obj_value) +{ + assert(n==n_vars_); + obj_value=0.; + { + int convex_obj = (int) convex_obj_; + double scal_neg_obj = scal_neg_obj_; + + RAJA::ReduceSum aux(0); + RAJA::forall(RAJA::RangeSegment(0, n), + RAJA_LAMBDA(RAJA::Index_type i) + { + aux += (2*convex_obj-1) * scal_neg_obj * 0.25 * std::pow(x[i]-1., 4) + 0.5 * std::pow(x[i], 2); + }); + obj_value += aux.get(); + } + return true; +} + +bool SparseRajaEx2::eval_grad_f(const size_type& n, const double* x, bool new_x, double* gradf) +{ + assert(n==n_vars_); + { + int convex_obj = (int) convex_obj_; + double scal_neg_obj = scal_neg_obj_; + + RAJA::forall(RAJA::RangeSegment(0, n), + RAJA_LAMBDA(RAJA::Index_type i) + { + gradf[i] = (2*convex_obj-1) * scal_neg_obj * std::pow(x[i]-1.,3) + x[i]; + }); + } + return true; +} + +bool SparseRajaEx2::eval_cons(const size_type& n, const size_type& m, const double* x, bool new_x, double* cons) +{ + assert(n==n_vars_); + assert(m==n_cons_); + assert(n_cons_==2+n-3+rankdefic_eq_+rankdefic_ineq_); + + // RAJA doesn't like member objects + bool rankdefic_eq = rankdefic_eq_; + bool rankdefic_ineq = rankdefic_ineq_; + + // serial part + RAJA::forall(RAJA::RangeSegment(0, 1), + RAJA_LAMBDA(RAJA::Index_type i) + { + // --- constraint 1 body ---> 4*x_1 + 2*x_2 == 10 + cons[0] = 4*x[0] + 2*x[1]; + // --- constraint 2 body ---> 2*x_1 + x_3 + cons[1] = 2*x[0] + 1*x[2]; + + if(rankdefic_ineq) { + // [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ] + cons[n-1] = 4*x[0] + 2*x[2]; + } + + if(rankdefic_eq) { + // 4*x_1 + 2*x_2 == 10 + cons[m-1] = 4*x[0] + 2*x[1]; + } + }); + + RAJA::forall(RAJA::RangeSegment(2, n-1), + RAJA_LAMBDA(RAJA::Index_type i) + { + // --- constraint 3 body ---> 2*x_1 + 0.5*x_i, for i>=4 + cons[i] = 2*x[0] + 0.5*x[i+1]; + }); + + return true; +} + +bool SparseRajaEx2::eval_Jac_cons(const size_type& n, + const size_type& m, + const double* x, + bool new_x, + const int& nnzJacS, + index_type* iJacS, + index_type* jJacS, + double* MJacS) +{ + assert(n==n_vars_); assert(m==n_cons_); + assert(n>=3); + + assert(nnzJacS == 4 + 2*(n-3) + 2*rankdefic_eq_ + 2*rankdefic_ineq_); + + // RAJA 
doesn't like member objects + bool rankdefic_eq = rankdefic_eq_; + bool rankdefic_ineq = rankdefic_ineq_; + + if(iJacS !=nullptr && jJacS != nullptr) { + // serial part + RAJA::forall(RAJA::RangeSegment(0, 1), + RAJA_LAMBDA(RAJA::Index_type itrow) + { + // --- constraint 1 body ---> 4*x_1 + 2*x_2 == 10 + iJacS[0] = 0; + jJacS[0] = 0; + iJacS[1] = 0; + jJacS[1] = 1; + // --- constraint 2 body ---> 2*x_1 + x_3 + iJacS[2] = 1; + jJacS[2] = 0; + iJacS[3] = 1; + jJacS[3] = 2; + + if(rankdefic_ineq) { + // [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ] + iJacS[2*n-2] = n-1; + jJacS[2*n-2] = 0; + iJacS[2*n-1] = n-1; + jJacS[2*n-1] = 2; + } + + if(rankdefic_eq) { + // 4*x_1 + 2*x_2 == 10 + iJacS[2*m-2] = m-1; + jJacS[2*m-2] = 0; + iJacS[2*m-1] = m-1; + jJacS[2*m-1] = 1; + } + }); + + RAJA::forall(RAJA::RangeSegment(2, n-1), + RAJA_LAMBDA(RAJA::Index_type itrow) + { + // --- constraint 3 body ---> 2*x_1 + 0.5*x_i, for i>=4 + iJacS[2*itrow] = itrow; + jJacS[2*itrow] = 0; + iJacS[2*itrow+1] = itrow; + jJacS[2*itrow+1] = itrow+1; + }); + + } + + if(MJacS != nullptr) { + // serial part + RAJA::forall(RAJA::RangeSegment(0, 1), + RAJA_LAMBDA(RAJA::Index_type itrow) + { + // --- constraint 1 body ---> 4*x_1 + 2*x_2 == 10 + MJacS[0] = 4.0; + MJacS[1] = 2.0; + // --- constraint 2 body ---> 2*x_1 + x_3 + MJacS[2] = 2.0; + MJacS[3] = 1.0; + + if(rankdefic_ineq) { + // [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ] + MJacS[2*n-2] = 4.0; + MJacS[2*n-1] = 2.0; + } + + if(rankdefic_eq) { + // 4*x_1 + 2*x_2 == 10 + MJacS[2*m-2] = 4.0; + MJacS[2*m-1] = 2.0; + } + }); + + RAJA::forall(RAJA::RangeSegment(2, n-1), + RAJA_LAMBDA(RAJA::Index_type itrow) + { + // --- constraint 3 body ---> 2*x_1 + 0.5*x_i, for i>=4 + MJacS[2*itrow] = 2.0; + MJacS[2*itrow+1] = 0.5; + }); + + } + + return true; +} + +bool SparseRajaEx2::eval_Hess_Lagr(const size_type& n, + const size_type& m, + const double* x, + bool new_x, + const double& obj_factor, + const double* lambda, + bool new_lambda, + const size_type& nnzHSS, + index_type* iHSS, + index_type* jHSS, + double* MHSS) +{ + //Note: lambda is not used since all the constraints are linear and, therefore, do + //not contribute to the Hessian of the Lagrangian + assert(nnzHSS == n); + + if(iHSS!=nullptr && jHSS!=nullptr) { + RAJA::forall(RAJA::RangeSegment(0, n), + RAJA_LAMBDA(RAJA::Index_type i) + { + iHSS[i] = i; + jHSS[i] = i; + }); + } + + int convex_obj = (int) convex_obj_; + double scal_neg_obj = scal_neg_obj_; + if(MHSS!=nullptr) { + RAJA::forall(RAJA::RangeSegment(0, n), + RAJA_LAMBDA(RAJA::Index_type i) + { + MHSS[i] = obj_factor * ( (2*convex_obj-1) * scal_neg_obj * 3 * std::pow(x[i]-1., 2) + 1); + }); + } + return true; +} + + + + +bool SparseRajaEx2::get_starting_point(const size_type& n, double* x0) +{ + assert(n==n_vars_); + RAJA::forall(RAJA::RangeSegment(0, n), + RAJA_LAMBDA(RAJA::Index_type i) + { + x0[i] = 0.0; + }); + return true; +} diff --git a/src/Drivers/Sparse/NlpSparseRajaEx2.hpp b/src/Drivers/Sparse/NlpSparseRajaEx2.hpp new file mode 100644 index 000000000..acec2cb13 --- /dev/null +++ b/src/Drivers/Sparse/NlpSparseRajaEx2.hpp @@ -0,0 +1,309 @@ +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory (LLNL). +// LLNL-CODE-742473. All rights reserved. +// +// This file is part of HiOp. For details, see https://github.com/LLNL/hiop. HiOp +// is released under the BSD 3-clause license (https://opensource.org/licenses/BSD-3-Clause). +// Please also read "Additional BSD Notice" below. 
+// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// i. Redistributions of source code must retain the above copyright notice, this list +// of conditions and the disclaimer below. +// ii. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the disclaimer (as noted below) in the documentation and/or +// other materials provided with the distribution. +// iii. Neither the name of the LLNS/LLNL nor the names of its contributors may be used to +// endorse or promote products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +// SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, THE U.S. DEPARTMENT OF ENERGY OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED +// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Additional BSD Notice +// 1. This notice is required to be provided under our contract with the U.S. Department +// of Energy (DOE). This work was produced at Lawrence Livermore National Laboratory under +// Contract No. DE-AC52-07NA27344 with the DOE. +// 2. Neither the United States Government nor Lawrence Livermore National Security, LLC +// nor any of their employees, makes any warranty, express or implied, or assumes any +// liability or responsibility for the accuracy, completeness, or usefulness of any +// information, apparatus, product, or process disclosed, or represents that its use would +// not infringe privately-owned rights. +// 3. Also, reference herein to any specific commercial products, process, or services by +// trade name, trademark, manufacturer or otherwise does not necessarily constitute or +// imply its endorsement, recommendation, or favoring by the United States Government or +// Lawrence Livermore National Security, LLC. The views and opinions of authors expressed +// herein do not necessarily state or reflect those of the United States Government or +// Lawrence Livermore National Security, LLC, and shall not be used for advertising or +// product endorsement purposes. + +/** + * @file NlpSparseRajaEx2.hpp + * + * @author Nai-Yuan Chiang , LNNL + * + */ + +#ifndef HIOP_EXAMPLE_SPARSE_RAJA_EX2 +#define HIOP_EXAMPLE_SPARSE_RAJA_EX2 + +#include "hiopInterface.hpp" +#include "LinAlgFactory.hpp" + +#ifdef HIOP_USE_MPI +#include "mpi.h" +#else +#define MPI_COMM_WORLD 0 +#define MPI_Comm int +#endif + +#include +#include //for memcpy +#include +#include + +using size_type = hiop::size_type; +using index_type = hiop::index_type; + +/** Nonlinear *highly nonconvex* and *rank deficient* problem test for the Filter IPM + * Newton of HiOp. It uses a Sparse NLP formulation. The problem is based on SparseEx2. + * + * min (2*convex_obj-1)*scal*sum 1/4* { (x_{i}-1)^4 : i=1,...,n} + 0.5x^Tx + * s.t. 
+ * 4*x_1 + 2*x_2 == 10 + * 5<= 2*x_1 + x_3 + * 1<= 2*x_1 + 0.5*x_i <= 2*n, for i=4,...,n + * x_1 free + * 0.0 <= x_2 + * 1.0 <= x_3 <= 10 + * x_i >=0.5, i=4,...,n + * + * Optionally, one can add the following constraints to obtain a rank-deficient Jacobian + * + * s.t. [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ] (rnkdef-con1 --- ineq con) + * 4*x_1 + 2*x_2 == 10 (rnkdef-con2 --- eq con) + * + * other parameters are: + * convex_obj: set to 1 to have a convex problem, otherwise set it to 0 + * scale_quartic_obj_term: scaling factor for the quartic term in the objective (1.0 by default). + * + * @note All pointers marked as "managed by Umpire" are allocated by HiOp using the + * Umpire's API. They all are addresses in the same memory space; however, the memory + * space can be host (typically CPU), device (typically GPU), or unified memory (um) + * spaces as per Umpire specification. The selection of the memory space is done via + * the option "mem_space" of HiOp. It is the responsibility of the implementers of + * the HiOp's interfaces (such as the hiop::hiopInterfaceMDS used in this example) to + * work with the "managed by Umpire" pointers in the same memory space as the one + * specified by the "mem_space" option. + * + */ +class SparseRajaEx2 : public hiop::hiopInterfaceSparse +{ +public: + SparseRajaEx2(std::string mem_space, + int n, + bool convex_obj, + bool rankdefic_Jac_eq, + bool rankdefic_Jac_ineq, + double scal_neg_obj = 1.0); + virtual ~SparseRajaEx2(); + + /** + * @brief Number of variables and constraints. + */ + virtual bool get_prob_sizes(size_type& n, size_type& m); + + /** + * @brief Get types and bounds on the variables. + * + * @param[in] n number of variables + * @param[out] ixlow array with lower bounds (managed by Umpire) + * @param[out] ixupp array with upper bounds (managed by Umpire) + * @param[out] type array with the variable types (on host) + */ + virtual bool get_vars_info(const size_type& n, double *xlow, double* xupp, NonlinearityType* type); + + /** + * Get types and bounds corresponding to constraints. An equality constraint is specified + * by setting the lower and upper bounds equal. + * + * @param[in] m Number of constraints + * @param[out] iclow array with lower bounds (managed by Umpire) + * @param[out] icupp array with upper bounds (managed by Umpire) + * @param[out] type array with the variable types (on host) + */ + virtual bool get_cons_info(const size_type& m, double* clow, double* cupp, NonlinearityType* type); + + /** + * Returns the sizes and number of nonzeros of the sparse blocks + * + * @param[out] nx number of variables + * @param[out] nnz_sparse_Jace number of nonzeros in the Jacobian of the equalities w.r.t. + * sparse variables + * @param[out] nnz_sparse_Jaci number of nonzeros in the Jacobian of the inequalities w.r.t. + * sparse variables + * @param[out] nnz_sparse_Hess_Lagr number of nonzeros in the (sparse) Hessian + */ + virtual bool get_sparse_blocks_info(index_type& nx, + index_type& nnz_sparse_Jace, + index_type& nnz_sparse_Jaci, + index_type& nnz_sparse_Hess_Lagr); + + /** + * Evaluate objective. + * + * @param[in] n number of variables + * @param[in] x array with the optimization variables or point at which to evaluate + * (managed by Umpire) + * @param[in] new_x indicates whether any of the other eval functions have been + * evaluated previously (false) or not (true) at x + * @param[out] obj_value the objective function value. 
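+   *
+   * @note On device the objective is accumulated with a RAJA reduction over the
+   * Umpire-managed array x. A sketch of the quadratic term only, using the
+   * execution/reduce policies defined in the implementation file, is:
+   * @code
+   * RAJA::ReduceSum<ex2_raja_reduce, double> aux(0);
+   * RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, n),
+   *                             RAJA_LAMBDA(RAJA::Index_type i) { aux += 0.5*x[i]*x[i]; });
+   * obj_value = aux.get();
+   * @endcode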
+ */ + virtual bool eval_f(const size_type& n, const double* x, bool new_x, double& obj_value); + + /** + * Evaluate a subset of the constraints specified by idx_cons + * + * @param[in] num_cons number of constraints to evaluate (size of idx_cons array) + * @param[in] idx_cons indexes of the constraints to evaluate (managed by Umpire) + * @param[in] x the point at which to evaluate (managed by Umpire) + * @param[in] new_x indicates whether any of the other eval functions have been evaluated + * previously (false) or not (true) at x + * @param[out] cons array with values of the constraints (managed by Umpire, size num_cons) + */ + virtual bool eval_cons(const size_type& n, + const size_type& m, + const size_type& num_cons, + const index_type* idx_cons, + const double* x, + bool new_x, + double* cons) + { + //return false so that HiOp will rely on the constraint evaluator defined below + return false; + } + + virtual bool eval_cons(const size_type& n, + const size_type& m, + const double* x, + bool new_x, + double* cons); + + /** + * Evaluation of the gradient of the objective. + * + * @param[in] n number of variables + * @param[in] x array with the optimization variables or point at which to evaluate + * (managed by Umpire) + * @param[in] new_x indicates whether any of the other eval functions have been evaluated + * previously (false) or not (true) at x + * @param[out] gradf array with the values of the gradient (managed by Umpire) + */ + virtual bool eval_grad_f(const size_type& n, + const double* x, + bool new_x, + double* gradf); + + /** + * Evaluates the Jacobian of the constraints. Please check the user manual and the + * documentation of hiop::hiopInterfaceMDS for a detailed discussion of how the last + * four arguments are expected to behave. + * + * @param[in] n number of variables + * @param[in] m Number of constraints + * @param[in] num_cons number of constraints to evaluate (size of idx_cons array) + * @param[in] idx_cons indexes of the constraints to evaluate (managed by Umpire) + * @param[in] x the point at which to evaluate (managed by Umpire) + * @param[in] new_x indicates whether any of the other eval functions have been evaluated + * previously (false) or not (true) at x + * @param[in] nnzJacS number of nonzeros in the sparse Jacobian + * @param[out] iJacS array of row indexes in the sparse Jacobian (managed by Umpire) + * @param[out] jJacS array of column indexes in the sparse Jacobian (managed by Umpire) + * @param[out] MJacS array of nonzero values in the sparse Jacobian (managed by Umpire) + */ + virtual bool eval_Jac_cons(const size_type& n, + const size_type& m, + const size_type& num_cons, + const index_type* idx_cons, + const double* x, + bool new_x, + const size_type& nnzJacS, + index_type* iJacS, + index_type* jJacS, + double* MJacS) + { + //return false so that HiOp will rely on the Jacobian evaluator defined below + return false; + } + + /// Similar to the above, but not used in this example. + virtual bool eval_Jac_cons(const size_type& n, + const size_type& m, + const double* x, + bool new_x, + const size_type& nnzJacS, + index_type* iJacS, + index_type* jJacS, + double* MJacS); + + + /** + * Evaluate the Hessian of the Lagrangian function. Please consult the user manual for a + * detailed discussion of the form the Lagrangian function takes. 
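+   * For this example all constraints are linear, so lambda does not enter the
+   * Hessian; it reduces to the diagonal Hessian of the objective scaled by obj_factor.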
+ * + * @param[in] n number of variables + * @param[in] m Number of constraints + * @param[in] x the point at which to evaluate (managed by Umpire) + * @param[in] new_x indicates whether any of the other eval functions have been evaluated + * previously (false) or not (true) at x + * @param[in] obj_factor scalar that multiplies the objective term in the Lagrangian function + * @param[in] lambda array with values of the multipliers used by the Lagrangian function + * @param[in] new_lambda indicates whether lambda values changed since last call + * @param[in] nnzHSS number of nonzeros in the (sparse) Hessian w.r.t. sparse variables + * @param[out] iHSS array of row indexes in the Hessian w.r.t. sparse variables + * (managed by Umpire) + * @param[out] jHSS array of column indexes in the Hessian w.r.t. sparse variables + * (managed by Umpire) + * @param[out] MHSS array of nonzero values in the Hessian w.r.t. sparse variables + * (managed by Umpire) + */ + virtual bool eval_Hess_Lagr(const size_type& n, + const size_type& m, + const double* x, + bool new_x, + const double& obj_factor, + const double* lambda, + bool new_lambda, + const size_type& nnzHSS, + index_type* iHSS, + index_type* jHSS, + double* MHSS); + + /** + * Implementation of the primal starting point specification + * + * @param[in] n number of variables + * @param[in] x0 the primal starting point(managed by Umpire) + */ + virtual bool get_starting_point(const size_type&n, double* x0); + +private: + int n_vars_; + int n_cons_; + bool convex_obj_; + bool rankdefic_eq_; + bool rankdefic_ineq_; + double scal_neg_obj_; + + std::string mem_space_; +}; + +#endif diff --git a/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp b/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp new file mode 100644 index 000000000..4df767d99 --- /dev/null +++ b/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp @@ -0,0 +1,318 @@ +#include "NlpSparseRajaEx2.hpp" +#include "hiopNlpFormulation.hpp" +#include "hiopAlgFilterIPM.hpp" + +#include +#include + +#include +#include +#include + +using namespace hiop; + +static bool self_check(size_type n, double obj_value, const bool inertia_free); + +static bool parse_arguments(int argc, + char **argv, + size_type& n, + bool& self_check, + bool& inertia_free, + bool& use_cusolver, + bool& use_cusolver_lu, + bool& use_ginkgo, + bool& use_ginkgo_cuda, + bool& use_ginkgo_hip) +{ + self_check = false; + n = 3; + inertia_free = false; + use_cusolver = false; + use_cusolver_lu = false; + use_ginkgo = false; + use_ginkgo_cuda = false; + use_ginkgo_cuda = false; + switch(argc) { + case 1: + //no arguments + return true; + break; + case 5: //4 arguments + { + if(std::string(argv[4]) == "-selfcheck") { + self_check = true; + } else if(std::string(argv[4]) == "-inertiafree") { + inertia_free = true; + } else if(std::string(argv[4]) == "-cusolver") { + use_cusolver = true; + } else if(std::string(argv[4]) == "-ginkgo"){ + use_ginkgo = true; + } else if(std::string(argv[4]) == "-ginkgo_cuda"){ + use_ginkgo = true; + use_ginkgo_cuda = true; + } else if(std::string(argv[4]) == "-ginkgo_hip"){ + use_ginkgo = true; + use_ginkgo_hip = true; + } else { + n = std::atoi(argv[4]); + if(n<=0) { + return false; + } + } + } + case 4: //3 arguments + { + if(std::string(argv[3]) == "-selfcheck") { + self_check = true; + } else if(std::string(argv[3]) == "-inertiafree") { + inertia_free = true; + } else if(std::string(argv[3]) == "-cusolver") { + use_cusolver = true; + } else if(std::string(argv[3]) == "-ginkgo"){ + use_ginkgo = true; + } else 
if(std::string(argv[3]) == "-ginkgo_cuda"){ + use_ginkgo = true; + use_ginkgo_cuda = true; + } else if(std::string(argv[3]) == "-ginkgo_hip"){ + use_ginkgo = true; + use_ginkgo_hip = true; + } else { + n = std::atoi(argv[3]); + if(n<=0) { + return false; + } + } + } + case 3: //2 arguments + { + if(std::string(argv[2]) == "-selfcheck") { + self_check = true; + } else if(std::string(argv[2]) == "-inertiafree") { + inertia_free = true; + } else if(std::string(argv[2]) == "-cusolver") { + use_cusolver = true; + } else if(std::string(argv[2]) == "-ginkgo"){ + use_ginkgo = true; + } else if(std::string(argv[2]) == "-ginkgo_cuda"){ + use_ginkgo = true; + use_ginkgo_cuda = true; + } else if(std::string(argv[2]) == "-ginkgo_hip"){ + use_ginkgo = true; + use_ginkgo_hip = true; + } else { + n = std::atoi(argv[2]); + if(n<=0) { + return false; + } + } + } + case 2: //1 argument + { + if(std::string(argv[1]) == "-selfcheck") { + self_check = true; + } else if(std::string(argv[1]) == "-inertiafree") { + inertia_free = true; + } else if(std::string(argv[1]) == "-cusolver") { + use_cusolver = true; + } else if(std::string(argv[1]) == "-ginkgo"){ + use_ginkgo = true; + } else if(std::string(argv[1]) == "-ginkgo_cuda"){ + use_ginkgo = true; + use_ginkgo_cuda = true; + } else if(std::string(argv[1]) == "-ginkgo_hip"){ + use_ginkgo = true; + use_ginkgo_hip = true; + } else { + n = std::atoi(argv[1]); + if(n<=0) { + return false; + } + } + } + break; + default: + return false; // 4 or more arguments + } + +// If CUDA is not available, de-select cuSOLVER +#ifndef HIOP_USE_CUDA + if(use_cusolver) { + printf("HiOp built without CUDA support. "); + printf("Using default instead of cuSOLVER ...\n"); + use_cusolver = false; + } +#endif + +// Use cuSOLVER's LU factorization, if it was configured +#ifdef HIOP_USE_CUSOLVER_LU + if(use_cusolver) { + use_cusolver_lu = true; + } +#endif + + // If cuSOLVER was selected, but inertia free approach was not, add inertia-free + if(use_cusolver && !(inertia_free)) { + inertia_free = true; + printf("LU solver from cuSOLVER library requires inertia free approach. "); + printf("Enabling now ...\n"); + } + +// If Ginkgo is not available, de-select it. +#ifndef HIOP_USE_GINKGO + if(use_ginkgo) { + printf("HiOp not built with GINKGO support, using default linear solver ...\n"); + use_ginkgo = false; + } +#endif + + // If Ginkgo was selected, but inertia free approach was not, add inertia-free + if(use_ginkgo && !(inertia_free)) { + inertia_free = true; + printf("LU solver from GINKGO library requires inertia free approach. "); + printf("Enabling now ...\n"); + } + + return true; +}; + +static void usage(const char* exeName) +{ + printf("hiOp driver %s that solves a synthetic convex problem of variable size.\n", exeName); + printf("Usage: \n"); + printf(" '$ %s problem_size -inertiafree -selfcheck'\n", exeName); + printf("Arguments:\n"); + printf(" 'problem_size': number of decision variables [optional, default is 50]\n"); + printf(" '-inertiafree': indicate if inertia free approach should be used [optional]\n"); + printf(" '-selfcheck': compares the optimal objective with a previously saved value for the " + "problem specified by 'problem_size'. 
[optional]\n"); + printf(" '-cusolver': use cuSOLVER linear solver [optional]\n"); + printf(" '-ginkgo': use GINKGO linear solver [optional]\n"); +} + + +int main(int argc, char **argv) +{ + int rank=0; +#ifdef HIOP_USE_MPI + MPI_Init(&argc, &argv); + int comm_size; + int ierr = MPI_Comm_size(MPI_COMM_WORLD, &comm_size); assert(MPI_SUCCESS==ierr); + if(comm_size != 1) { + printf("[error] driver detected more than one rank but the driver should be run " + "in serial only; will exit\n"); + MPI_Finalize(); + return 1; + } +#endif + + // Set memory space where to create models and perform NLP solve +#ifdef HIOP_USE_GPU + std::string mem_space = "device"; +#else + std::string mem_space = "host"; +#endif + + bool selfCheck = false; + size_type n = 50; + bool inertia_free = false; + bool use_cusolver = false; + bool use_cusolver_lu = false; + bool use_ginkgo = false; + bool use_ginkgo_cuda = false; + bool use_ginkgo_hip = false; + if(!parse_arguments(argc, argv, n, selfCheck, inertia_free, use_cusolver, use_cusolver_lu, use_ginkgo, use_ginkgo_cuda, use_ginkgo_hip)) { + usage(argv[0]); +#ifdef HIOP_USE_MPI + MPI_Finalize(); +#endif + return 1; + } + + bool convex_obj = false; + bool rankdefic_Jac_eq = true; + bool rankdefic_Jac_ineq = true; + double scal_neg_obj = 0.1; + + //first test + { + SparseRajaEx2 nlp_interface(mem_space, n, convex_obj, rankdefic_Jac_eq, rankdefic_Jac_ineq, scal_neg_obj); + hiopNlpSparse nlp(nlp_interface); + nlp.options->SetStringValue("compute_mode", "gpu"); + nlp.options->SetStringValue("KKTLinsys", "xdycyd"); + + // only support cusolverLU right now, 2023.02.28 + //lsq initialization of the duals fails for this example since the Jacobian is rank deficient + //use zero initialization + nlp.options->SetStringValue("duals_init", "zero"); + nlp.options->SetStringValue("mem_space", "device"); + nlp.options->SetStringValue("fact_acceptor", "inertia_free"); + nlp.options->SetStringValue("linsol_mode", "speculative"); + + hiopAlgFilterIPMNewton solver(&nlp); + hiopSolveStatus status = solver.run(); + + double obj_value = solver.getObjective(); + + if(status<0) { + if(rank==0) { + printf("solver returned negative solve status: %d (with objective is %18.12e)\n", status, obj_value); + } +#ifdef HIOP_USE_MPI + MPI_Finalize(); +#endif + return -1; + } + + //this is used for "regression" testing when the driver is called with -selfcheck + if(selfCheck) { + if(!self_check(n, obj_value, inertia_free)) { +#ifdef HIOP_USE_MPI + MPI_Finalize(); +#endif + return -1; + } + } else { + if(rank==0) { + printf("Optimal objective: %22.14e. Solver status: %d\n", obj_value, status); + } + } + } + + +#ifdef HIOP_USE_MPI + MPI_Finalize(); +#endif + + return 0; +} + + +static bool self_check(size_type n, double objval, const bool inertia_free) +{ +#define num_n_saved 3 //keep this is sync with n_saved and objval_saved + const size_type n_saved[] = {50, 500, 10000}; + const double objval_saved[] = { 8.7754974e+00, 6.4322371e+01, 1.2369786e+03}; + +#define relerr 1e-6 + bool found=false; + for(int it=0; it relerr) { + printf("selfcheck failure. Objective (%18.12e) does not agree (%d digits) with the saved value (%18.12e) for n=%d.\n", + objval, -(int)log10(relerr), objval_saved[it], n); + return false; + } else { + printf("selfcheck success (%d digits)\n", -(int)log10(relerr)); + } + break; + } + } + + if(!found) { + printf("selfcheck: driver does not have the objective for n=%d saved. 
BTW, obj=%18.12e was obtained for this n.\n", n, objval); + return false; + } + + return true; +} diff --git a/src/LinAlg/hiopLinSolver.cpp b/src/LinAlg/hiopLinSolver.cpp index 0c3760dcc..868981175 100644 --- a/src/LinAlg/hiopLinSolver.cpp +++ b/src/LinAlg/hiopLinSolver.cpp @@ -99,7 +99,7 @@ namespace hiop { //we default to triplet matrix for now; derived classes using CSR matrices will not call //this constructor (will call the 1-parameter constructor below) so they avoid creating //the triplet matrix - M_ = new hiopMatrixSparseTriplet(n, n, nnz); + M_ = LinearAlgebraFactory::create_matrix_sparse(nlp->options->GetString("mem_space"), n, n, nnz); //this class will own `M_` sys_mat_owned_ = true; nlp_ = nlp; @@ -124,7 +124,7 @@ namespace hiop { hiopLinSolverNonSymSparse::hiopLinSolverNonSymSparse(int n, int nnz, hiopNlpFormulation* nlp) { - M_ = new hiopMatrixSparseTriplet(n, n, nnz); + M_ = LinearAlgebraFactory::create_matrix_sparse(nlp->options->GetString("mem_space"), n, n, nnz); sys_mat_owned_ = false; nlp_ = nlp; perf_report_ = "on"==hiop::tolower(nlp->options->GetString("time_kkt")); diff --git a/src/LinAlg/hiopLinSolverSparseCUSOLVER.cpp b/src/LinAlg/hiopLinSolverSparseCUSOLVER.cpp index 957b31f33..b50537324 100644 --- a/src/LinAlg/hiopLinSolverSparseCUSOLVER.cpp +++ b/src/LinAlg/hiopLinSolverSparseCUSOLVER.cpp @@ -1619,4 +1619,89 @@ namespace hiop break; } // switch } // GramSchmidt + + + + hiopLinSolverSymSparseCUSOLVERGPU::hiopLinSolverSymSparseCUSOLVERGPU(const int& n, + const int& nnz, + hiopNlpFormulation* nlp) + : hiopLinSolverSymSparseCUSOLVER(n, nnz, nlp), + rhs_host_{nullptr}, + M_host_{nullptr} + { + rhs_host_ = LinearAlgebraFactory::create_vector("default", n); + M_host_ = LinearAlgebraFactory::create_matrix_sparse("default", n, n, nnz); + } + + hiopLinSolverSymSparseCUSOLVERGPU::~hiopLinSolverSymSparseCUSOLVERGPU() + { + delete rhs_host_; + delete M_host_; + } + + int hiopLinSolverSymSparseCUSOLVERGPU::matrixChanged() + { + size_type nnz = M_->numberOfNonzeros(); + double* mval_dev = M_->M(); + double* mval_host= M_host_->M(); + + index_type* irow_dev = M_->i_row(); + index_type* irow_host= M_host_->i_row(); + + index_type* jcol_dev = M_->j_col(); + index_type* jcol_host= M_host_->j_col(); + + if("device" == nlp_->options->GetString("mem_space")) { + checkCudaErrors(cudaMemcpy(mval_host, mval_dev, sizeof(double) * nnz, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(irow_host, irow_dev, sizeof(index_type) * nnz, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(jcol_host, jcol_dev, sizeof(index_type) * nnz, cudaMemcpyDeviceToHost)); + } else { + checkCudaErrors(cudaMemcpy(mval_host, mval_dev, sizeof(double) * nnz, cudaMemcpyHostToHost)); + checkCudaErrors(cudaMemcpy(irow_host, irow_dev, sizeof(index_type) * nnz, cudaMemcpyHostToHost)); + checkCudaErrors(cudaMemcpy(jcol_host, jcol_dev, sizeof(index_type) * nnz, cudaMemcpyHostToHost)); + } + + hiopMatrixSparse* swap_ptr = M_; + M_ = M_host_; + M_host_ = swap_ptr; + + int vret = hiopLinSolverSymSparseCUSOLVER::matrixChanged(); + + swap_ptr = M_; + M_ = M_host_; + M_host_ = swap_ptr; + + return vret; + } + + bool hiopLinSolverSymSparseCUSOLVERGPU::solve(hiopVector& x) + { + double* mval_dev = x.local_data(); + double* mval_host= rhs_host_->local_data(); + + if("device" == nlp_->options->GetString("mem_space")) { + checkCudaErrors(cudaMemcpy(mval_host, mval_dev, sizeof(double) * n_, cudaMemcpyDeviceToHost)); + } else { + checkCudaErrors(cudaMemcpy(mval_host, mval_dev, sizeof(double) * n_, cudaMemcpyHostToHost)); + 
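+      // when mem_space is host-resident the same staging path is kept; the copy is host-to-host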
} + + hiopMatrixSparse* swap_ptr = M_; + M_ = M_host_; + M_host_ = swap_ptr; + + bool bret = hiopLinSolverSymSparseCUSOLVER::solve(*rhs_host_); + + swap_ptr = M_; + M_ = M_host_; + M_host_ = swap_ptr; + + if("device" == nlp_->options->GetString("mem_space")) { + checkCudaErrors(cudaMemcpy(mval_dev, mval_host, sizeof(double) * n_, cudaMemcpyHostToDevice)); + } else { + checkCudaErrors(cudaMemcpy(mval_dev, mval_host, sizeof(double) * n_, cudaMemcpyHostToHost)); + } + + return bret; + } + } // namespace hiop diff --git a/src/LinAlg/hiopLinSolverSparseCUSOLVER.hpp b/src/LinAlg/hiopLinSolverSparseCUSOLVER.hpp index 16518052c..1833afe67 100644 --- a/src/LinAlg/hiopLinSolverSparseCUSOLVER.hpp +++ b/src/LinAlg/hiopLinSolverSparseCUSOLVER.hpp @@ -88,12 +88,12 @@ class hiopLinSolverSymSparseCUSOLVER : public hiopLinSolverSymSparse /** Triggers a refactorization of the matrix, if necessary. * Overload from base class. * In this case, KLU (SuiteSparse) is used to refactor*/ - int matrixChanged(); + virtual int matrixChanged(); /** solves a linear system. * param 'x' is on entry the right hand side(s) of the system to be solved. * On exit is contains the solution(s). */ - bool solve(hiopVector& x_); + virtual bool solve(hiopVector& x_); /** Multiple rhs not supported yet */ virtual bool @@ -103,7 +103,7 @@ class hiopLinSolverSymSparseCUSOLVER : public hiopLinSolverSymSparse return false; } -private: +protected: // int m_; // number of rows of the whole matrix int n_; // number of cols of the whole matrix @@ -204,6 +204,27 @@ class hiopLinSolverSymSparseCUSOLVER : public hiopLinSolverSymSparse void IRsetup(); }; +class hiopLinSolverSymSparseCUSOLVERGPU : public hiopLinSolverSymSparseCUSOLVER +{ +public: + hiopLinSolverSymSparseCUSOLVERGPU(const int& n, const int& nnz, hiopNlpFormulation* nlp); + virtual ~hiopLinSolverSymSparseCUSOLVERGPU(); + + virtual int matrixChanged(); + virtual bool solve(hiopVector& x_); + + /** Multiple rhs not supported yet */ + virtual bool + solve(hiopMatrix& /* x */) + { + assert(false && "not yet supported"); + return false; + } + +private: + hiopVector* rhs_host_; + hiopMatrixSparse* M_host_; +}; // Forward declaration of LU class class hiopLinSolverSymSparseCUSOLVERLU; diff --git a/src/LinAlg/hiopMatrixRajaSparseTriplet.hpp b/src/LinAlg/hiopMatrixRajaSparseTriplet.hpp index 4769d47fa..00bc19a08 100644 --- a/src/LinAlg/hiopMatrixRajaSparseTriplet.hpp +++ b/src/LinAlg/hiopMatrixRajaSparseTriplet.hpp @@ -390,11 +390,13 @@ class hiopMatrixRajaSparseTriplet : public hiopMatrixSparse { index_type *idx_start_; //size num_rows+1 index_type *idx_start_host_; //size num_rows+1 + index_type register_row_st_; size_type num_rows_; std::string mem_space_; RowStartsInfo() : idx_start_(nullptr), - num_rows_(0) + num_rows_(0), + register_row_st_{0} {} RowStartsInfo(size_type n_rows, std::string memspace); virtual ~RowStartsInfo(); diff --git a/src/LinAlg/hiopMatrixRajaSparseTripletImpl.hpp b/src/LinAlg/hiopMatrixRajaSparseTripletImpl.hpp index 11919070f..fd538740f 100644 --- a/src/LinAlg/hiopMatrixRajaSparseTripletImpl.hpp +++ b/src/LinAlg/hiopMatrixRajaSparseTripletImpl.hpp @@ -1123,8 +1123,6 @@ copyRowsBlockFrom(const hiopMatrix& src_gen, assert(n_rows + rows_src_idx_st <= src.m()); assert(n_rows + rows_dst_idx_st <= this->m()); - index_type* src_row_st = src.row_starts_->idx_start_; - const index_type* iRow_src = src.i_row(); const index_type* jCol_src = src.j_col(); const double* values_src = src.M(); @@ -1143,6 +1141,8 @@ copyRowsBlockFrom(const hiopMatrix& src_gen, } 
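+  // the source row_starts_ must already be built; its device pointer is taken just below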
assert(src.row_starts_); + index_type* src_row_st = src.row_starts_->idx_start_; + // // The latest CPU code can be found in 342eb99ec16d45f57a492be1bf1e39cce73995a5 // It is replaced by RAJA::inclusive_scan after that commit @@ -1165,10 +1165,12 @@ copyRowsBlockFrom(const hiopMatrix& src_gen, umpire::Allocator hostalloc = rm.getAllocator("HOST"); int *next_row_nnz = static_cast(hostalloc.allocate(sizeof(size_type))); + index_type register_row_st = row_starts_->register_row_st_; rm.copy(next_row_nnz, dst_row_st_dev+1+rows_dst_idx_st, 1*sizeof(size_type)); if(next_row_nnz[0] == 0) { + assert(rows_dst_idx_st >= register_row_st); // comput nnz in each row from source RAJA::forall( RAJA::RangeSegment(0, n_rows), @@ -1179,7 +1181,8 @@ copyRowsBlockFrom(const hiopMatrix& src_gen, dst_row_st_dev[row_dst+1] = src_row_st[row_src+1] - src_row_st[row_src]; } ); - RAJA::inclusive_scan_inplace(RAJA::make_span(dst_row_st_dev, n_rows+1), RAJA::operators::plus()); + RAJA::inclusive_scan_inplace(RAJA::make_span(dst_row_st_dev+register_row_st, n_rows+1+rows_dst_idx_st-register_row_st), RAJA::operators::plus()); + row_starts_->register_row_st_ = n_rows + rows_dst_idx_st; } index_type* dst_row_st = row_starts_->idx_start_; @@ -1191,7 +1194,7 @@ copyRowsBlockFrom(const hiopMatrix& src_gen, const index_type row_src = rows_src_idx_st + row_add; const index_type row_dst = rows_dst_idx_st + row_add; index_type k_src = src_row_st[row_src]; - index_type k_dst = dst_row_st[row_dst]; + index_type k_dst = dst_row_st[row_dst] - dst_row_st[rows_dst_idx_st] + dest_nnz_st; // copy from src while(k_src < src_row_st[row_src+1]) { @@ -1283,7 +1286,8 @@ template hiopMatrixRajaSparseTriplet::RowStartsInfo:: RowStartsInfo(size_type n_rows, std::string memspace) : num_rows_(n_rows), - mem_space_(memspace) + mem_space_(memspace), + register_row_st_{0} { auto& resmgr = umpire::ResourceManager::getInstance(); umpire::Allocator alloc = resmgr.getAllocator(mem_space_); diff --git a/src/Optimization/hiopKKTLinSysSparse.cpp b/src/Optimization/hiopKKTLinSysSparse.cpp index 7a1360bf2..c3c43c832 100644 --- a/src/Optimization/hiopKKTLinSysSparse.cpp +++ b/src/Optimization/hiopKKTLinSysSparse.cpp @@ -63,6 +63,7 @@ #ifdef HIOP_USE_GINKGO #include "hiopLinSolverSparseGinkgo.hpp" #endif + #endif namespace hiop @@ -184,7 +185,12 @@ namespace hiop write_linsys_counter_++; } if(write_linsys_counter_>=0) { - csr_writer_.writeMatToFile(*Msys, write_linsys_counter_, nx, neq, nineq); +#ifndef HIOP_USE_GPU + auto* MsysSp = dynamic_cast(linSys->sys_matrix()); + csr_writer_.writeMatToFile(*MsysSp, write_linsys_counter_, nx, neq, nineq); +#else + //TODO csr_writer_.writeMatToFile(*Msys, write_linsys_counter_, nx, neq, nineq); +#endif } return true; @@ -329,7 +335,7 @@ namespace hiop if( (nullptr == linSys_ && linear_solver == "auto") || linear_solver == "cusolver-lu") { #if defined(HIOP_USE_CUSOLVER_LU) - linSys_ = new hiopLinSolverSymSparseCUSOLVER(n, nnz, nlp_); + linSys_ = new hiopLinSolverSymSparseCUSOLVERGPU(n, nnz, nlp_); linsol_actual = "CUSOLVER-LU"; auto* fact_acceptor_ic = dynamic_cast (fact_acceptor_); if(fact_acceptor_ic) { @@ -418,14 +424,13 @@ namespace hiop delta_wd_ = perturb_calc_->get_curr_delta_wd(); delta_cc_ = perturb_calc_->get_curr_delta_cc(); delta_cd_ = perturb_calc_->get_curr_delta_cd(); - - HessSp_ = dynamic_cast(Hess_); - if(!HessSp_) { assert(false); return false; } - Jac_cSp_ = dynamic_cast(Jac_c_); + HessSp_ = dynamic_cast(Hess_); + Jac_cSp_ = dynamic_cast(Jac_c_); + Jac_dSp_ = dynamic_cast(Jac_d_); + + if(!HessSp_) { 
assert(false); return false; } if(!Jac_cSp_) { assert(false); return false; } - - Jac_dSp_ = dynamic_cast(Jac_d_); if(!Jac_dSp_) { assert(false); return false; } size_type nx = HessSp_->n(), nd=Jac_dSp_->m(), neq=Jac_cSp_->m(), nineq=Jac_dSp_->m(); @@ -436,7 +441,7 @@ namespace hiop auto* linSys = dynamic_cast (linSys_); assert(linSys); - auto* Msys = dynamic_cast(linSys->sys_matrix()); + auto* Msys = dynamic_cast(linSys->sys_matrix()); assert(Msys); if(perf_report_) { nlp_->log->printf(hovSummary, @@ -510,7 +515,12 @@ namespace hiop write_linsys_counter_++; } if(write_linsys_counter_>=0) { - csr_writer_.writeMatToFile(*Msys, write_linsys_counter_, nx, neq, nineq); +#ifndef HIOP_USE_GPU + auto* MsysSp = dynamic_cast(linSys->sys_matrix()); + csr_writer_.writeMatToFile(*MsysSp, write_linsys_counter_, nx, neq, nineq); +#else + //TODO csr_writer_.writeMatToFile(*Msys, write_linsys_counter_, nx, neq, nineq); +#endif } return true; @@ -685,7 +695,7 @@ namespace hiop if(linear_solver == "cusolver-lu" || linear_solver == "auto") { #if defined(HIOP_USE_CUSOLVER_LU) actual_lin_solver = "CUSOLVER-LU"; - linSys_ = new hiopLinSolverSymSparseCUSOLVER(n, nnz, nlp_); + linSys_ = new hiopLinSolverSymSparseCUSOLVERGPU(n, nnz, nlp_); auto* fact_acceptor_ic = dynamic_cast (fact_acceptor_); if(fact_acceptor_ic) { nlp_->log->printf(hovError, @@ -750,13 +760,13 @@ namespace hiop // // We don't allow CPU linear solvers. ///////////////////////////////////////////////////////////////////////////////////////////// - assert(false && "KKT_SPARSE_XDYcYd linsys: GPU compute mode not yet supported."); - assert(false == safe_mode_); + // assert(false && "KKT_SPARSE_XDYcYd linsys: GPU compute mode not yet supported."); + // assert(false == safe_mode_); assert(nullptr == linSys_); if(linear_solver == "cusolver-lu" || linear_solver == "auto") { #if defined(HIOP_USE_CUSOLVER_LU) - linSys_ = new hiopLinSolverSymSparseCUSOLVER(n, nnz, nlp_); + linSys_ = new hiopLinSolverSymSparseCUSOLVERGPU(n, nnz, nlp_); nlp_->log->printf(hovScalars, "KKT_SPARSE_XDYcYd linsys: alloc CUSOLVER-LU size %d (%d cons) (gpu)\n", n, @@ -1046,7 +1056,12 @@ namespace hiop write_linsys_counter_++; } if(write_linsys_counter_>=0) { - csr_writer_.writeMatToFile(*Msys, write_linsys_counter_, nx, neq, nineq); +#ifndef HIOP_USE_GPU + auto* MsysSp = dynamic_cast(linSys->sys_matrix()); + csr_writer_.writeMatToFile(*MsysSp, write_linsys_counter_, nx, neq, nineq); +#else + //TODO csr_writer_.writeMatToFile(*Msys, write_linsys_counter_, nx, neq, nineq); +#endif } return true; diff --git a/src/Optimization/hiopKKTLinSysSparseCondensed.hpp b/src/Optimization/hiopKKTLinSysSparseCondensed.hpp index 1e0747c61..cb9c6fc8d 100644 --- a/src/Optimization/hiopKKTLinSysSparseCondensed.hpp +++ b/src/Optimization/hiopKKTLinSysSparseCondensed.hpp @@ -210,7 +210,6 @@ class hiopKKTLinSysCondensedSparse : public hiopKKTLinSysCompressedSparseXDYcYd } else { //(opt_compute_mode == "hybrid" || opt_compute_mode == "gpu") { #ifdef HIOP_USE_CUDA - assert(opt_compute_mode != "gpu" && "When code is GPU-ready, remove this method"); return "CUDA"; #else assert(false && "compute mode not supported without HIOP_USE_CUDA build");
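-- 
Note on the host-staging pattern introduced by hiopLinSolverSymSparseCUSOLVERGPU above
(illustrative sketch only; the wrapper name staged_solve is hypothetical and the body
paraphrases the member logic of the class): the solver keeps host mirrors M_host_ and
rhs_host_ of the device-resident KKT matrix and right-hand side, copies the device data
into the mirrors, swaps the matrix pointers so the host-only factorization of the base
class runs unchanged, then swaps back and returns the solution to device memory. The
new driver exercises this path through the NlpSparseRaja2_1 test, i.e.
NlpSparseRajaEx2.exe 500 -selfcheck.

    #include <utility> // std::swap

    // minimal sketch of the staged solve (CUDA build, mem_space == "device" assumed)
    bool staged_solve(hiopVector& x)
    {
      // stage the right-hand side: device -> host
      checkCudaErrors(cudaMemcpy(rhs_host_->local_data(), x.local_data(),
                                 sizeof(double) * n_, cudaMemcpyDeviceToHost));
      std::swap(M_, M_host_);  // base class now operates on the host matrix
      const bool ok = hiopLinSolverSymSparseCUSOLVER::solve(*rhs_host_);
      std::swap(M_, M_host_);  // restore the device matrix
      // return the solution: host -> device
      checkCudaErrors(cudaMemcpy(x.local_data(), rhs_host_->local_data(),
                                 sizeof(double) * n_, cudaMemcpyHostToDevice));
      return ok;
    }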