From d682ec81c065140de14feb05cfb41b28d626b32c Mon Sep 17 00:00:00 2001 From: Nai-Yuan Chiang Date: Mon, 13 Mar 2023 20:16:04 -0700 Subject: [PATCH] Sparse solver on GPU (#600) * add sparse raja example * update cmake file * add driver * clean the code --- src/Drivers/MDS/NlpMdsRajaEx1.hpp | 4 +- src/Drivers/Sparse/CMakeLists.txt | 19 + src/Drivers/Sparse/NlpSparseEx2.cpp | 3 +- src/Drivers/Sparse/NlpSparseRajaEx2.cpp | 484 ++++++++++++++++++ src/Drivers/Sparse/NlpSparseRajaEx2.hpp | 309 +++++++++++ src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp | 318 ++++++++++++ src/LinAlg/hiopLinSolver.cpp | 4 +- src/LinAlg/hiopLinSolverSparseCUSOLVER.cpp | 85 +++ src/LinAlg/hiopLinSolverSparseCUSOLVER.hpp | 27 +- src/LinAlg/hiopMatrixRajaSparseTriplet.hpp | 4 +- .../hiopMatrixRajaSparseTripletImpl.hpp | 14 +- src/Optimization/hiopKKTLinSysSparse.cpp | 45 +- .../hiopKKTLinSysSparseCondensed.hpp | 1 - 13 files changed, 1286 insertions(+), 31 deletions(-) create mode 100644 src/Drivers/Sparse/NlpSparseRajaEx2.cpp create mode 100644 src/Drivers/Sparse/NlpSparseRajaEx2.hpp create mode 100644 src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp diff --git a/src/Drivers/MDS/NlpMdsRajaEx1.hpp b/src/Drivers/MDS/NlpMdsRajaEx1.hpp index c43346ccd..fef3747d6 100644 --- a/src/Drivers/MDS/NlpMdsRajaEx1.hpp +++ b/src/Drivers/MDS/NlpMdsRajaEx1.hpp @@ -170,7 +170,7 @@ class MdsEx1 : public hiop::hiopInterfaceMDS * Returns the sizes and number of nonzeros of the sparse and dense blocks within MDS * * @param[out] nx_sparse number of sparse variables - * @param[out] nx_ense number of dense variables + * @param[out] nx_dense number of dense variables * @param[out] nnz_sparse_Jace number of nonzeros in the Jacobian of the equalities w.r.t. * sparse variables * @param[out] nnz_sparse_Jaci number of nonzeros in the Jacobian of the inequalities w.r.t. 
@@ -233,7 +233,7 @@ class MdsEx1 : public hiop::hiopInterfaceMDS
    * @param[in] x array with the optimization variables or point at which to evaluate
    * (managed by Umpire)
    * @param[in] new_x indicates whether any of the other eval functions have been evaluated
-   * previously (false) or not (true) at x 
+   * previously (false) or not (true) at x
    * @param[out] gradf array with the values of the gradient (managed by Umpire)
    */
   bool eval_grad_f(const size_type& n, const double* x, bool new_x, double* gradf);
diff --git a/src/Drivers/Sparse/CMakeLists.txt b/src/Drivers/Sparse/CMakeLists.txt
index 646c6e43b..7c96910e4 100644
--- a/src/Drivers/Sparse/CMakeLists.txt
+++ b/src/Drivers/Sparse/CMakeLists.txt
@@ -7,6 +7,20 @@ target_link_libraries(NlpSparseEx2.exe HiOp::HiOp)
 add_executable(NlpSparseEx3.exe NlpSparseEx3.cpp NlpSparseEx3Driver.cpp)
 target_link_libraries(NlpSparseEx3.exe HiOp::HiOp)
 
+if(HIOP_USE_RAJA)
+  if(HIOP_USE_GPU AND HIOP_USE_CUDA)
+    set_source_files_properties(
+      NlpSparseRajaEx2.cpp
+      NlpSparseRajaEx2Driver.cpp
+      PROPERTIES LANGUAGE CUDA
+    )
+
+    add_executable(NlpSparseRajaEx2.exe NlpSparseRajaEx2Driver.cpp NlpSparseRajaEx2.cpp)
+    target_link_libraries(NlpSparseRajaEx2.exe HiOp::HiOp)
+    install(TARGETS NlpSparseRajaEx2.exe DESTINATION bin)
+  endif()
+endif()
+
 if(HIOP_BUILD_SHARED)
   add_executable(NlpSparseCEx1.exe NlpSparseCEx1.c)
   target_link_libraries(NlpSparseCEx1.exe HiOp::HiOp)
@@ -52,6 +66,11 @@ if(HIOP_USE_GINKGO)
     add_test(NAME NlpSparse2_6 COMMAND ${RUNCMD} "$<TARGET_FILE:NlpSparseEx2.exe>" "500" "-ginkgo_hip" "-inertiafree" "-selfcheck")
   endif(HIOP_USE_HIP)
 endif(HIOP_USE_GINKGO)
+
+if(HIOP_USE_RAJA AND HIOP_USE_GPU AND HIOP_USE_CUDA)
+  add_test(NAME NlpSparseRaja2_1 COMMAND ${RUNCMD} bash -c "$<TARGET_FILE:NlpSparseRajaEx2.exe> 500 -selfcheck ")
+endif()
+
 add_test(NAME NlpSparse3_1 COMMAND ${RUNCMD} "$<TARGET_FILE:NlpSparseEx3.exe>" "500" "-selfcheck")
 if(HIOP_BUILD_SHARED AND NOT HIOP_USE_GPU)
   add_test(NAME NlpSparseCinterface COMMAND ${RUNCMD} "$<TARGET_FILE:NlpSparseCEx1.exe>")
diff --git a/src/Drivers/Sparse/NlpSparseEx2.cpp b/src/Drivers/Sparse/NlpSparseEx2.cpp
index 72073e914..ce4a0615e 100644
--- a/src/Drivers/Sparse/NlpSparseEx2.cpp
+++ b/src/Drivers/Sparse/NlpSparseEx2.cpp
@@ -153,7 +153,6 @@ bool SparseEx2::eval_cons(const size_type& n,
   return false;
 }
 
-/* Four constraints no matter how large n is */
 bool SparseEx2::eval_cons(const size_type& n, const size_type& m, const double* x, bool new_x, double* cons)
 {
   assert(n==n_vars); assert(m==n_cons);
@@ -170,7 +169,7 @@ bool SparseEx2::eval_cons(const size_type& n, const size_type& m, const double*
   // --- constraint 2 body ---> 2*x_1 + x_3
   cons[conidx++] += 2*x[0] + 1*x[2];
 
-  // --- constraint 3 body ---> 2*x_1 + 0.5*x_i, for i>=4 
+  // --- constraint 3 body ---> 2*x_1 + 0.5*x_i, for i>=4
   for(auto i=3; i<n; i++) {
     cons[conidx++] += 2*x[0] + 0.5*x[i];
   }
diff --git a/src/Drivers/Sparse/NlpSparseRajaEx2.cpp b/src/Drivers/Sparse/NlpSparseRajaEx2.cpp
new file mode 100644
--- /dev/null
+++ b/src/Drivers/Sparse/NlpSparseRajaEx2.cpp
@@ -0,0 +1,484 @@
+// Copyright (c) 2017, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory (LLNL).
+// LLNL-CODE-742473. All rights reserved.
+//
+// This file is part of HiOp. For details, see https://github.com/LLNL/hiop. HiOp
+// is released under the BSD 3-clause license (https://opensource.org/licenses/BSD-3-Clause).
+// Please also read "Additional BSD Notice" below.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+// i. Redistributions of source code must retain the above copyright notice, this list
+// of conditions and the disclaimer below.
+// ii. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the disclaimer (as noted below) in the documentation and/or
+// other materials provided with the distribution.
+// iii. Neither the name of the LLNS/LLNL nor the names of its contributors may be used to
+// endorse or promote products derived from this software without specific prior written
+// permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
+// SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, THE U.S. DEPARTMENT OF ENERGY OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Additional BSD Notice
+// 1. This notice is required to be provided under our contract with the U.S. Department
+// of Energy (DOE). This work was produced at Lawrence Livermore National Laboratory under
+// Contract No. DE-AC52-07NA27344 with the DOE.
+// 2. Neither the United States Government nor Lawrence Livermore National Security, LLC
+// nor any of their employees, makes any warranty, express or implied, or assumes any
+// liability or responsibility for the accuracy, completeness, or usefulness of any
+// information, apparatus, product, or process disclosed, or represents that its use would
+// not infringe privately-owned rights.
+// 3. Also, reference herein to any specific commercial products, process, or services by
+// trade name, trademark, manufacturer or otherwise does not necessarily constitute or
+// imply its endorsement, recommendation, or favoring by the United States Government or
+// Lawrence Livermore National Security, LLC. The views and opinions of authors expressed
+// herein do not necessarily state or reflect those of the United States Government or
+// Lawrence Livermore National Security, LLC, and shall not be used for advertising or
+// product endorsement purposes.
+
+/**
+ * @file NlpSparseRajaEx2.cpp
+ *
+ * @author Nai-Yuan Chiang, LLNL
+ *
+ */
+
+#include "NlpSparseRajaEx2.hpp"
+
+#include <umpire/Allocator.hpp>
+#include <umpire/ResourceManager.hpp>
+#include <RAJA/RAJA.hpp>
+
+//#include
+
+#include <cassert>
+#include <cstring> //for memcpy
+#include <cstdio>
+
+
+//TODO: A good idea to not use the internal HiOp RAJA policies here and, instead, give self-contained
+// definitions of the policies here so that the user gets a better grasp of the concept and does not
+// rely on the internals of HiOp.
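+// Only three pieces are needed: an execution policy, a matching reduction policy,
+// and a device lambda annotation for the chosen backend.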
For example: +// #define RAJA_LAMBDA [=] __device__ +// using ex2_raja_exec = RAJA::cuda_exec<128>; +// more defs here + + +#if defined(HIOP_USE_CUDA) +#include "ExecPoliciesRajaCudaImpl.hpp" +using ex2_raja_exec = hiop::ExecRajaPoliciesBackend::hiop_raja_exec; +using ex2_raja_reduce = hiop::ExecRajaPoliciesBackend::hiop_raja_reduce; +//using hiopMatrixRajaSparse = hiop::hiopMatrixRajaSparseTriplet; +#elif defined(HIOP_USE_HIP) +#include +using ex2_raja_exec = hiop::ExecRajaPoliciesBackend::hiop_raja_exec; +using ex2_raja_reduce = hiop::ExecRajaPoliciesBackend::hiop_raja_reduce; +//using hiopMatrixRajaSparse = hiop::hiopMatrixRajaSparseTriplet; +#else +//#if !defined(HIOP_USE_CUDA) && !defined(HIOP_USE_HIP) +#include +using ex2_raja_exec = hiop::ExecRajaPoliciesBackend::hiop_raja_exec; +using ex2_raja_reduce = hiop::ExecRajaPoliciesBackend::hiop_raja_reduce; +//using hiopMatrixRajaSparse = hiop::hiopMatrixRajaSparseTriplet; +#endif + + +/** Nonlinear *highly nonconvex* and *rank deficient* problem test for the Filter IPM + * Newton of HiOp. It uses a Sparse NLP formulation. The problem is based on SparseEx1. + * + * min (2*convex_obj-1)*scal*sum 1/4* { (x_{i}-1)^4 : i=1,...,n} + 0.5x^Tx + * s.t. + * 4*x_1 + 2*x_2 == 10 + * 5<= 2*x_1 + x_3 + * 1<= 2*x_1 + 0.5*x_i <= 2*n, for i=4,...,n + * x_1 free + * 0.0 <= x_2 + * 1.0 <= x_3 <= 10 + * x_i >=0.5, i=4,...,n + * + * Optionally, one can add the following constraints to obtain a rank-deficient Jacobian + * + * s.t. [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ] (rnkdef-con1) + * 4*x_1 + 2*x_2 == 10 (rnkdef-con2) + * + * other parameters are: + * convex_obj: set to 1 to have a convex problem, otherwise set it to 0. + * scale_quartic_obj_term: scaling factor for the quartic term in the objective (1.0 by default). + * + */ +SparseRajaEx2::SparseRajaEx2(std::string mem_space, + int n, + bool convex_obj, + bool rankdefic_Jac_eq, + bool rankdefic_Jac_ineq, + double scal_neg_obj) + : mem_space_{mem_space}, + convex_obj_{convex_obj}, + rankdefic_eq_{rankdefic_Jac_eq}, + rankdefic_ineq_{rankdefic_Jac_ineq}, + n_vars_{n}, + scal_neg_obj_{scal_neg_obj}, + n_cons_{2} +{ + // Make sure mem_space_ is uppercase + transform(mem_space_.begin(), mem_space_.end(), mem_space_.begin(), ::toupper); + + assert(n>=3 && "number of variables should be greater than 3 for this example"); + if(n>3) { + n_cons_ += n-3; + } + n_cons_ += rankdefic_eq_ + rankdefic_ineq_; +} + +SparseRajaEx2::~SparseRajaEx2() +{ +} + +bool SparseRajaEx2::get_prob_sizes(size_type& n, size_type& m) +{ + n = n_vars_; + m = n_cons_; + return true; +} + +bool SparseRajaEx2::get_vars_info(const size_type& n, double *xlow, double* xupp, NonlinearityType* type) +{ + assert(n==n_vars_); + + RAJA::forall(RAJA::RangeSegment(0, 1), + RAJA_LAMBDA(RAJA::Index_type i) + { + xlow[0] = -1e20; + xupp[0] = 1e20; +// type[0] = hiopNonlinear; + xlow[1] = 0.0; + xupp[1] = 1e20; +// type[1] = hiopNonlinear; + xlow[2] = 1.0; + xupp[2] = 10.0; +// type[2] = hiopNonlinear; + }); + + if(n>3) { + RAJA::forall(RAJA::RangeSegment(3, n), + RAJA_LAMBDA(RAJA::Index_type i) + { + xlow[i] = 0.5; + xupp[i] = 1e20; +// type[i] = hiopNonlinear; + }); + } + + // Use a sequential policy for host computations for now + RAJA::forall(RAJA::RangeSegment(0, n), + [=] (RAJA::Index_type i) + { + type[i] = hiopNonlinear; + }); + + return true; +} + +bool SparseRajaEx2::get_cons_info(const size_type& m, double* clow, double* cupp, NonlinearityType* type) +{ + assert(m==n_cons_); + size_type n = n_vars_; + assert(m-1 == n-1+rankdefic_ineq_); + + // RAJA 
doesn't like member objects + bool rankdefic_eq = rankdefic_eq_; + bool rankdefic_ineq = rankdefic_ineq_; + + // serial part + RAJA::forall(RAJA::RangeSegment(0, 1), + RAJA_LAMBDA(RAJA::Index_type i) + { + clow[0] = 10.0; + cupp[0] = 10.0; +// type[0] = hiopInterfaceBase::hiopNonlinear; + clow[1] = 5.0; + cupp[1] = 1e20; +// type[1] = hiopInterfaceBase::hiopNonlinear; + + if(rankdefic_ineq) { + // [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ] + clow[n-1] = -1e+20; + cupp[n-1] = 19.; +// type[n-1] = hiopInterfaceBase::hiopNonlinear; + } + + if(rankdefic_eq) { + // 4*x_1 + 2*x_2 == 10 + clow[m-1] = 10; + cupp[m-1] = 10; +// type[m-1] = hiopInterfaceBase::hiopNonlinear; + } + }); + + if(n>3) { + RAJA::forall(RAJA::RangeSegment(2, n-1), + RAJA_LAMBDA(RAJA::Index_type conidx) + { + clow[conidx] = 1.0; + cupp[conidx] = 2*n; +// type[conidx] = hiopInterfaceBase::hiopNonlinear; + }); + } + + // Must be a sequential host policy for now + RAJA::forall(RAJA::RangeSegment(0, m), + [=] (RAJA::Index_type i) + { + type[i] = hiopNonlinear; + }); + + return true; +} + +bool SparseRajaEx2::get_sparse_blocks_info(int& nx, + int& nnz_sparse_Jaceq, + int& nnz_sparse_Jacineq, + int& nnz_sparse_Hess_Lagr) +{ + nx = n_vars_;; + nnz_sparse_Jaceq = 2 + 2*rankdefic_eq_; + nnz_sparse_Jacineq = 2 + 2*(n_vars_-3) + 2*rankdefic_ineq_; + nnz_sparse_Hess_Lagr = n_vars_; + return true; +} + +bool SparseRajaEx2::eval_f(const size_type& n, const double* x, bool new_x, double& obj_value) +{ + assert(n==n_vars_); + obj_value=0.; + { + int convex_obj = (int) convex_obj_; + double scal_neg_obj = scal_neg_obj_; + + RAJA::ReduceSum aux(0); + RAJA::forall(RAJA::RangeSegment(0, n), + RAJA_LAMBDA(RAJA::Index_type i) + { + aux += (2*convex_obj-1) * scal_neg_obj * 0.25 * std::pow(x[i]-1., 4) + 0.5 * std::pow(x[i], 2); + }); + obj_value += aux.get(); + } + return true; +} + +bool SparseRajaEx2::eval_grad_f(const size_type& n, const double* x, bool new_x, double* gradf) +{ + assert(n==n_vars_); + { + int convex_obj = (int) convex_obj_; + double scal_neg_obj = scal_neg_obj_; + + RAJA::forall(RAJA::RangeSegment(0, n), + RAJA_LAMBDA(RAJA::Index_type i) + { + gradf[i] = (2*convex_obj-1) * scal_neg_obj * std::pow(x[i]-1.,3) + x[i]; + }); + } + return true; +} + +bool SparseRajaEx2::eval_cons(const size_type& n, const size_type& m, const double* x, bool new_x, double* cons) +{ + assert(n==n_vars_); + assert(m==n_cons_); + assert(n_cons_==2+n-3+rankdefic_eq_+rankdefic_ineq_); + + // RAJA doesn't like member objects + bool rankdefic_eq = rankdefic_eq_; + bool rankdefic_ineq = rankdefic_ineq_; + + // serial part + RAJA::forall(RAJA::RangeSegment(0, 1), + RAJA_LAMBDA(RAJA::Index_type i) + { + // --- constraint 1 body ---> 4*x_1 + 2*x_2 == 10 + cons[0] = 4*x[0] + 2*x[1]; + // --- constraint 2 body ---> 2*x_1 + x_3 + cons[1] = 2*x[0] + 1*x[2]; + + if(rankdefic_ineq) { + // [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ] + cons[n-1] = 4*x[0] + 2*x[2]; + } + + if(rankdefic_eq) { + // 4*x_1 + 2*x_2 == 10 + cons[m-1] = 4*x[0] + 2*x[1]; + } + }); + + RAJA::forall(RAJA::RangeSegment(2, n-1), + RAJA_LAMBDA(RAJA::Index_type i) + { + // --- constraint 3 body ---> 2*x_1 + 0.5*x_i, for i>=4 + cons[i] = 2*x[0] + 0.5*x[i+1]; + }); + + return true; +} + +bool SparseRajaEx2::eval_Jac_cons(const size_type& n, + const size_type& m, + const double* x, + bool new_x, + const int& nnzJacS, + index_type* iJacS, + index_type* jJacS, + double* MJacS) +{ + assert(n==n_vars_); assert(m==n_cons_); + assert(n>=3); + + assert(nnzJacS == 4 + 2*(n-3) + 2*rankdefic_eq_ + 2*rankdefic_ineq_); + + // RAJA 
doesn't like member objects + bool rankdefic_eq = rankdefic_eq_; + bool rankdefic_ineq = rankdefic_ineq_; + + if(iJacS !=nullptr && jJacS != nullptr) { + // serial part + RAJA::forall(RAJA::RangeSegment(0, 1), + RAJA_LAMBDA(RAJA::Index_type itrow) + { + // --- constraint 1 body ---> 4*x_1 + 2*x_2 == 10 + iJacS[0] = 0; + jJacS[0] = 0; + iJacS[1] = 0; + jJacS[1] = 1; + // --- constraint 2 body ---> 2*x_1 + x_3 + iJacS[2] = 1; + jJacS[2] = 0; + iJacS[3] = 1; + jJacS[3] = 2; + + if(rankdefic_ineq) { + // [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ] + iJacS[2*n-2] = n-1; + jJacS[2*n-2] = 0; + iJacS[2*n-1] = n-1; + jJacS[2*n-1] = 2; + } + + if(rankdefic_eq) { + // 4*x_1 + 2*x_2 == 10 + iJacS[2*m-2] = m-1; + jJacS[2*m-2] = 0; + iJacS[2*m-1] = m-1; + jJacS[2*m-1] = 1; + } + }); + + RAJA::forall(RAJA::RangeSegment(2, n-1), + RAJA_LAMBDA(RAJA::Index_type itrow) + { + // --- constraint 3 body ---> 2*x_1 + 0.5*x_i, for i>=4 + iJacS[2*itrow] = itrow; + jJacS[2*itrow] = 0; + iJacS[2*itrow+1] = itrow; + jJacS[2*itrow+1] = itrow+1; + }); + + } + + if(MJacS != nullptr) { + // serial part + RAJA::forall(RAJA::RangeSegment(0, 1), + RAJA_LAMBDA(RAJA::Index_type itrow) + { + // --- constraint 1 body ---> 4*x_1 + 2*x_2 == 10 + MJacS[0] = 4.0; + MJacS[1] = 2.0; + // --- constraint 2 body ---> 2*x_1 + x_3 + MJacS[2] = 2.0; + MJacS[3] = 1.0; + + if(rankdefic_ineq) { + // [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ] + MJacS[2*n-2] = 4.0; + MJacS[2*n-1] = 2.0; + } + + if(rankdefic_eq) { + // 4*x_1 + 2*x_2 == 10 + MJacS[2*m-2] = 4.0; + MJacS[2*m-1] = 2.0; + } + }); + + RAJA::forall(RAJA::RangeSegment(2, n-1), + RAJA_LAMBDA(RAJA::Index_type itrow) + { + // --- constraint 3 body ---> 2*x_1 + 0.5*x_i, for i>=4 + MJacS[2*itrow] = 2.0; + MJacS[2*itrow+1] = 0.5; + }); + + } + + return true; +} + +bool SparseRajaEx2::eval_Hess_Lagr(const size_type& n, + const size_type& m, + const double* x, + bool new_x, + const double& obj_factor, + const double* lambda, + bool new_lambda, + const size_type& nnzHSS, + index_type* iHSS, + index_type* jHSS, + double* MHSS) +{ + //Note: lambda is not used since all the constraints are linear and, therefore, do + //not contribute to the Hessian of the Lagrangian + assert(nnzHSS == n); + + if(iHSS!=nullptr && jHSS!=nullptr) { + RAJA::forall(RAJA::RangeSegment(0, n), + RAJA_LAMBDA(RAJA::Index_type i) + { + iHSS[i] = i; + jHSS[i] = i; + }); + } + + int convex_obj = (int) convex_obj_; + double scal_neg_obj = scal_neg_obj_; + if(MHSS!=nullptr) { + RAJA::forall(RAJA::RangeSegment(0, n), + RAJA_LAMBDA(RAJA::Index_type i) + { + MHSS[i] = obj_factor * ( (2*convex_obj-1) * scal_neg_obj * 3 * std::pow(x[i]-1., 2) + 1); + }); + } + return true; +} + + + + +bool SparseRajaEx2::get_starting_point(const size_type& n, double* x0) +{ + assert(n==n_vars_); + RAJA::forall(RAJA::RangeSegment(0, n), + RAJA_LAMBDA(RAJA::Index_type i) + { + x0[i] = 0.0; + }); + return true; +} diff --git a/src/Drivers/Sparse/NlpSparseRajaEx2.hpp b/src/Drivers/Sparse/NlpSparseRajaEx2.hpp new file mode 100644 index 000000000..acec2cb13 --- /dev/null +++ b/src/Drivers/Sparse/NlpSparseRajaEx2.hpp @@ -0,0 +1,309 @@ +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory (LLNL). +// LLNL-CODE-742473. All rights reserved. +// +// This file is part of HiOp. For details, see https://github.com/LLNL/hiop. HiOp +// is released under the BSD 3-clause license (https://opensource.org/licenses/BSD-3-Clause). +// Please also read "Additional BSD Notice" below. 
+// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// i. Redistributions of source code must retain the above copyright notice, this list +// of conditions and the disclaimer below. +// ii. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the disclaimer (as noted below) in the documentation and/or +// other materials provided with the distribution. +// iii. Neither the name of the LLNS/LLNL nor the names of its contributors may be used to +// endorse or promote products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +// SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, THE U.S. DEPARTMENT OF ENERGY OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED +// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Additional BSD Notice +// 1. This notice is required to be provided under our contract with the U.S. Department +// of Energy (DOE). This work was produced at Lawrence Livermore National Laboratory under +// Contract No. DE-AC52-07NA27344 with the DOE. +// 2. Neither the United States Government nor Lawrence Livermore National Security, LLC +// nor any of their employees, makes any warranty, express or implied, or assumes any +// liability or responsibility for the accuracy, completeness, or usefulness of any +// information, apparatus, product, or process disclosed, or represents that its use would +// not infringe privately-owned rights. +// 3. Also, reference herein to any specific commercial products, process, or services by +// trade name, trademark, manufacturer or otherwise does not necessarily constitute or +// imply its endorsement, recommendation, or favoring by the United States Government or +// Lawrence Livermore National Security, LLC. The views and opinions of authors expressed +// herein do not necessarily state or reflect those of the United States Government or +// Lawrence Livermore National Security, LLC, and shall not be used for advertising or +// product endorsement purposes. + +/** + * @file NlpSparseRajaEx2.hpp + * + * @author Nai-Yuan Chiang , LNNL + * + */ + +#ifndef HIOP_EXAMPLE_SPARSE_RAJA_EX2 +#define HIOP_EXAMPLE_SPARSE_RAJA_EX2 + +#include "hiopInterface.hpp" +#include "LinAlgFactory.hpp" + +#ifdef HIOP_USE_MPI +#include "mpi.h" +#else +#define MPI_COMM_WORLD 0 +#define MPI_Comm int +#endif + +#include +#include //for memcpy +#include +#include + +using size_type = hiop::size_type; +using index_type = hiop::index_type; + +/** Nonlinear *highly nonconvex* and *rank deficient* problem test for the Filter IPM + * Newton of HiOp. It uses a Sparse NLP formulation. The problem is based on SparseEx2. + * + * min (2*convex_obj-1)*scal*sum 1/4* { (x_{i}-1)^4 : i=1,...,n} + 0.5x^Tx + * s.t. 
+ * 4*x_1 + 2*x_2 == 10 + * 5<= 2*x_1 + x_3 + * 1<= 2*x_1 + 0.5*x_i <= 2*n, for i=4,...,n + * x_1 free + * 0.0 <= x_2 + * 1.0 <= x_3 <= 10 + * x_i >=0.5, i=4,...,n + * + * Optionally, one can add the following constraints to obtain a rank-deficient Jacobian + * + * s.t. [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ] (rnkdef-con1 --- ineq con) + * 4*x_1 + 2*x_2 == 10 (rnkdef-con2 --- eq con) + * + * other parameters are: + * convex_obj: set to 1 to have a convex problem, otherwise set it to 0 + * scale_quartic_obj_term: scaling factor for the quartic term in the objective (1.0 by default). + * + * @note All pointers marked as "managed by Umpire" are allocated by HiOp using the + * Umpire's API. They all are addresses in the same memory space; however, the memory + * space can be host (typically CPU), device (typically GPU), or unified memory (um) + * spaces as per Umpire specification. The selection of the memory space is done via + * the option "mem_space" of HiOp. It is the responsibility of the implementers of + * the HiOp's interfaces (such as the hiop::hiopInterfaceMDS used in this example) to + * work with the "managed by Umpire" pointers in the same memory space as the one + * specified by the "mem_space" option. + * + */ +class SparseRajaEx2 : public hiop::hiopInterfaceSparse +{ +public: + SparseRajaEx2(std::string mem_space, + int n, + bool convex_obj, + bool rankdefic_Jac_eq, + bool rankdefic_Jac_ineq, + double scal_neg_obj = 1.0); + virtual ~SparseRajaEx2(); + + /** + * @brief Number of variables and constraints. + */ + virtual bool get_prob_sizes(size_type& n, size_type& m); + + /** + * @brief Get types and bounds on the variables. + * + * @param[in] n number of variables + * @param[out] ixlow array with lower bounds (managed by Umpire) + * @param[out] ixupp array with upper bounds (managed by Umpire) + * @param[out] type array with the variable types (on host) + */ + virtual bool get_vars_info(const size_type& n, double *xlow, double* xupp, NonlinearityType* type); + + /** + * Get types and bounds corresponding to constraints. An equality constraint is specified + * by setting the lower and upper bounds equal. + * + * @param[in] m Number of constraints + * @param[out] iclow array with lower bounds (managed by Umpire) + * @param[out] icupp array with upper bounds (managed by Umpire) + * @param[out] type array with the variable types (on host) + */ + virtual bool get_cons_info(const size_type& m, double* clow, double* cupp, NonlinearityType* type); + + /** + * Returns the sizes and number of nonzeros of the sparse blocks + * + * @param[out] nx number of variables + * @param[out] nnz_sparse_Jace number of nonzeros in the Jacobian of the equalities w.r.t. + * sparse variables + * @param[out] nnz_sparse_Jaci number of nonzeros in the Jacobian of the inequalities w.r.t. + * sparse variables + * @param[out] nnz_sparse_Hess_Lagr number of nonzeros in the (sparse) Hessian + */ + virtual bool get_sparse_blocks_info(index_type& nx, + index_type& nnz_sparse_Jace, + index_type& nnz_sparse_Jaci, + index_type& nnz_sparse_Hess_Lagr); + + /** + * Evaluate objective. + * + * @param[in] n number of variables + * @param[in] x array with the optimization variables or point at which to evaluate + * (managed by Umpire) + * @param[in] new_x indicates whether any of the other eval functions have been + * evaluated previously (false) or not (true) at x + * @param[out] obj_value the objective function value. 
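+   *
+   * @note On device the objective is accumulated with a RAJA reduction over the
+   * Umpire-managed array x. A sketch of the quadratic term only, using the
+   * execution/reduce policies defined in the implementation file, is:
+   * @code
+   * RAJA::ReduceSum<ex2_raja_reduce, double> aux(0);
+   * RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, n),
+   *                             RAJA_LAMBDA(RAJA::Index_type i) { aux += 0.5*x[i]*x[i]; });
+   * obj_value = aux.get();
+   * @endcode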
+ */ + virtual bool eval_f(const size_type& n, const double* x, bool new_x, double& obj_value); + + /** + * Evaluate a subset of the constraints specified by idx_cons + * + * @param[in] num_cons number of constraints to evaluate (size of idx_cons array) + * @param[in] idx_cons indexes of the constraints to evaluate (managed by Umpire) + * @param[in] x the point at which to evaluate (managed by Umpire) + * @param[in] new_x indicates whether any of the other eval functions have been evaluated + * previously (false) or not (true) at x + * @param[out] cons array with values of the constraints (managed by Umpire, size num_cons) + */ + virtual bool eval_cons(const size_type& n, + const size_type& m, + const size_type& num_cons, + const index_type* idx_cons, + const double* x, + bool new_x, + double* cons) + { + //return false so that HiOp will rely on the constraint evaluator defined below + return false; + } + + virtual bool eval_cons(const size_type& n, + const size_type& m, + const double* x, + bool new_x, + double* cons); + + /** + * Evaluation of the gradient of the objective. + * + * @param[in] n number of variables + * @param[in] x array with the optimization variables or point at which to evaluate + * (managed by Umpire) + * @param[in] new_x indicates whether any of the other eval functions have been evaluated + * previously (false) or not (true) at x + * @param[out] gradf array with the values of the gradient (managed by Umpire) + */ + virtual bool eval_grad_f(const size_type& n, + const double* x, + bool new_x, + double* gradf); + + /** + * Evaluates the Jacobian of the constraints. Please check the user manual and the + * documentation of hiop::hiopInterfaceMDS for a detailed discussion of how the last + * four arguments are expected to behave. + * + * @param[in] n number of variables + * @param[in] m Number of constraints + * @param[in] num_cons number of constraints to evaluate (size of idx_cons array) + * @param[in] idx_cons indexes of the constraints to evaluate (managed by Umpire) + * @param[in] x the point at which to evaluate (managed by Umpire) + * @param[in] new_x indicates whether any of the other eval functions have been evaluated + * previously (false) or not (true) at x + * @param[in] nnzJacS number of nonzeros in the sparse Jacobian + * @param[out] iJacS array of row indexes in the sparse Jacobian (managed by Umpire) + * @param[out] jJacS array of column indexes in the sparse Jacobian (managed by Umpire) + * @param[out] MJacS array of nonzero values in the sparse Jacobian (managed by Umpire) + */ + virtual bool eval_Jac_cons(const size_type& n, + const size_type& m, + const size_type& num_cons, + const index_type* idx_cons, + const double* x, + bool new_x, + const size_type& nnzJacS, + index_type* iJacS, + index_type* jJacS, + double* MJacS) + { + //return false so that HiOp will rely on the Jacobian evaluator defined below + return false; + } + + /// Similar to the above, but not used in this example. + virtual bool eval_Jac_cons(const size_type& n, + const size_type& m, + const double* x, + bool new_x, + const size_type& nnzJacS, + index_type* iJacS, + index_type* jJacS, + double* MJacS); + + + /** + * Evaluate the Hessian of the Lagrangian function. Please consult the user manual for a + * detailed discussion of the form the Lagrangian function takes. 
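+   * For this example all constraints are linear, so lambda does not enter the
+   * Hessian; it reduces to the diagonal Hessian of the objective scaled by obj_factor.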
+ * + * @param[in] n number of variables + * @param[in] m Number of constraints + * @param[in] x the point at which to evaluate (managed by Umpire) + * @param[in] new_x indicates whether any of the other eval functions have been evaluated + * previously (false) or not (true) at x + * @param[in] obj_factor scalar that multiplies the objective term in the Lagrangian function + * @param[in] lambda array with values of the multipliers used by the Lagrangian function + * @param[in] new_lambda indicates whether lambda values changed since last call + * @param[in] nnzHSS number of nonzeros in the (sparse) Hessian w.r.t. sparse variables + * @param[out] iHSS array of row indexes in the Hessian w.r.t. sparse variables + * (managed by Umpire) + * @param[out] jHSS array of column indexes in the Hessian w.r.t. sparse variables + * (managed by Umpire) + * @param[out] MHSS array of nonzero values in the Hessian w.r.t. sparse variables + * (managed by Umpire) + */ + virtual bool eval_Hess_Lagr(const size_type& n, + const size_type& m, + const double* x, + bool new_x, + const double& obj_factor, + const double* lambda, + bool new_lambda, + const size_type& nnzHSS, + index_type* iHSS, + index_type* jHSS, + double* MHSS); + + /** + * Implementation of the primal starting point specification + * + * @param[in] n number of variables + * @param[in] x0 the primal starting point(managed by Umpire) + */ + virtual bool get_starting_point(const size_type&n, double* x0); + +private: + int n_vars_; + int n_cons_; + bool convex_obj_; + bool rankdefic_eq_; + bool rankdefic_ineq_; + double scal_neg_obj_; + + std::string mem_space_; +}; + +#endif diff --git a/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp b/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp new file mode 100644 index 000000000..4df767d99 --- /dev/null +++ b/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp @@ -0,0 +1,318 @@ +#include "NlpSparseRajaEx2.hpp" +#include "hiopNlpFormulation.hpp" +#include "hiopAlgFilterIPM.hpp" + +#include +#include + +#include +#include +#include + +using namespace hiop; + +static bool self_check(size_type n, double obj_value, const bool inertia_free); + +static bool parse_arguments(int argc, + char **argv, + size_type& n, + bool& self_check, + bool& inertia_free, + bool& use_cusolver, + bool& use_cusolver_lu, + bool& use_ginkgo, + bool& use_ginkgo_cuda, + bool& use_ginkgo_hip) +{ + self_check = false; + n = 3; + inertia_free = false; + use_cusolver = false; + use_cusolver_lu = false; + use_ginkgo = false; + use_ginkgo_cuda = false; + use_ginkgo_cuda = false; + switch(argc) { + case 1: + //no arguments + return true; + break; + case 5: //4 arguments + { + if(std::string(argv[4]) == "-selfcheck") { + self_check = true; + } else if(std::string(argv[4]) == "-inertiafree") { + inertia_free = true; + } else if(std::string(argv[4]) == "-cusolver") { + use_cusolver = true; + } else if(std::string(argv[4]) == "-ginkgo"){ + use_ginkgo = true; + } else if(std::string(argv[4]) == "-ginkgo_cuda"){ + use_ginkgo = true; + use_ginkgo_cuda = true; + } else if(std::string(argv[4]) == "-ginkgo_hip"){ + use_ginkgo = true; + use_ginkgo_hip = true; + } else { + n = std::atoi(argv[4]); + if(n<=0) { + return false; + } + } + } + case 4: //3 arguments + { + if(std::string(argv[3]) == "-selfcheck") { + self_check = true; + } else if(std::string(argv[3]) == "-inertiafree") { + inertia_free = true; + } else if(std::string(argv[3]) == "-cusolver") { + use_cusolver = true; + } else if(std::string(argv[3]) == "-ginkgo"){ + use_ginkgo = true; + } else 
if(std::string(argv[3]) == "-ginkgo_cuda"){ + use_ginkgo = true; + use_ginkgo_cuda = true; + } else if(std::string(argv[3]) == "-ginkgo_hip"){ + use_ginkgo = true; + use_ginkgo_hip = true; + } else { + n = std::atoi(argv[3]); + if(n<=0) { + return false; + } + } + } + case 3: //2 arguments + { + if(std::string(argv[2]) == "-selfcheck") { + self_check = true; + } else if(std::string(argv[2]) == "-inertiafree") { + inertia_free = true; + } else if(std::string(argv[2]) == "-cusolver") { + use_cusolver = true; + } else if(std::string(argv[2]) == "-ginkgo"){ + use_ginkgo = true; + } else if(std::string(argv[2]) == "-ginkgo_cuda"){ + use_ginkgo = true; + use_ginkgo_cuda = true; + } else if(std::string(argv[2]) == "-ginkgo_hip"){ + use_ginkgo = true; + use_ginkgo_hip = true; + } else { + n = std::atoi(argv[2]); + if(n<=0) { + return false; + } + } + } + case 2: //1 argument + { + if(std::string(argv[1]) == "-selfcheck") { + self_check = true; + } else if(std::string(argv[1]) == "-inertiafree") { + inertia_free = true; + } else if(std::string(argv[1]) == "-cusolver") { + use_cusolver = true; + } else if(std::string(argv[1]) == "-ginkgo"){ + use_ginkgo = true; + } else if(std::string(argv[1]) == "-ginkgo_cuda"){ + use_ginkgo = true; + use_ginkgo_cuda = true; + } else if(std::string(argv[1]) == "-ginkgo_hip"){ + use_ginkgo = true; + use_ginkgo_hip = true; + } else { + n = std::atoi(argv[1]); + if(n<=0) { + return false; + } + } + } + break; + default: + return false; // 4 or more arguments + } + +// If CUDA is not available, de-select cuSOLVER +#ifndef HIOP_USE_CUDA + if(use_cusolver) { + printf("HiOp built without CUDA support. "); + printf("Using default instead of cuSOLVER ...\n"); + use_cusolver = false; + } +#endif + +// Use cuSOLVER's LU factorization, if it was configured +#ifdef HIOP_USE_CUSOLVER_LU + if(use_cusolver) { + use_cusolver_lu = true; + } +#endif + + // If cuSOLVER was selected, but inertia free approach was not, add inertia-free + if(use_cusolver && !(inertia_free)) { + inertia_free = true; + printf("LU solver from cuSOLVER library requires inertia free approach. "); + printf("Enabling now ...\n"); + } + +// If Ginkgo is not available, de-select it. +#ifndef HIOP_USE_GINKGO + if(use_ginkgo) { + printf("HiOp not built with GINKGO support, using default linear solver ...\n"); + use_ginkgo = false; + } +#endif + + // If Ginkgo was selected, but inertia free approach was not, add inertia-free + if(use_ginkgo && !(inertia_free)) { + inertia_free = true; + printf("LU solver from GINKGO library requires inertia free approach. "); + printf("Enabling now ...\n"); + } + + return true; +}; + +static void usage(const char* exeName) +{ + printf("hiOp driver %s that solves a synthetic convex problem of variable size.\n", exeName); + printf("Usage: \n"); + printf(" '$ %s problem_size -inertiafree -selfcheck'\n", exeName); + printf("Arguments:\n"); + printf(" 'problem_size': number of decision variables [optional, default is 50]\n"); + printf(" '-inertiafree': indicate if inertia free approach should be used [optional]\n"); + printf(" '-selfcheck': compares the optimal objective with a previously saved value for the " + "problem specified by 'problem_size'. 
[optional]\n"); + printf(" '-cusolver': use cuSOLVER linear solver [optional]\n"); + printf(" '-ginkgo': use GINKGO linear solver [optional]\n"); +} + + +int main(int argc, char **argv) +{ + int rank=0; +#ifdef HIOP_USE_MPI + MPI_Init(&argc, &argv); + int comm_size; + int ierr = MPI_Comm_size(MPI_COMM_WORLD, &comm_size); assert(MPI_SUCCESS==ierr); + if(comm_size != 1) { + printf("[error] driver detected more than one rank but the driver should be run " + "in serial only; will exit\n"); + MPI_Finalize(); + return 1; + } +#endif + + // Set memory space where to create models and perform NLP solve +#ifdef HIOP_USE_GPU + std::string mem_space = "device"; +#else + std::string mem_space = "host"; +#endif + + bool selfCheck = false; + size_type n = 50; + bool inertia_free = false; + bool use_cusolver = false; + bool use_cusolver_lu = false; + bool use_ginkgo = false; + bool use_ginkgo_cuda = false; + bool use_ginkgo_hip = false; + if(!parse_arguments(argc, argv, n, selfCheck, inertia_free, use_cusolver, use_cusolver_lu, use_ginkgo, use_ginkgo_cuda, use_ginkgo_hip)) { + usage(argv[0]); +#ifdef HIOP_USE_MPI + MPI_Finalize(); +#endif + return 1; + } + + bool convex_obj = false; + bool rankdefic_Jac_eq = true; + bool rankdefic_Jac_ineq = true; + double scal_neg_obj = 0.1; + + //first test + { + SparseRajaEx2 nlp_interface(mem_space, n, convex_obj, rankdefic_Jac_eq, rankdefic_Jac_ineq, scal_neg_obj); + hiopNlpSparse nlp(nlp_interface); + nlp.options->SetStringValue("compute_mode", "gpu"); + nlp.options->SetStringValue("KKTLinsys", "xdycyd"); + + // only support cusolverLU right now, 2023.02.28 + //lsq initialization of the duals fails for this example since the Jacobian is rank deficient + //use zero initialization + nlp.options->SetStringValue("duals_init", "zero"); + nlp.options->SetStringValue("mem_space", "device"); + nlp.options->SetStringValue("fact_acceptor", "inertia_free"); + nlp.options->SetStringValue("linsol_mode", "speculative"); + + hiopAlgFilterIPMNewton solver(&nlp); + hiopSolveStatus status = solver.run(); + + double obj_value = solver.getObjective(); + + if(status<0) { + if(rank==0) { + printf("solver returned negative solve status: %d (with objective is %18.12e)\n", status, obj_value); + } +#ifdef HIOP_USE_MPI + MPI_Finalize(); +#endif + return -1; + } + + //this is used for "regression" testing when the driver is called with -selfcheck + if(selfCheck) { + if(!self_check(n, obj_value, inertia_free)) { +#ifdef HIOP_USE_MPI + MPI_Finalize(); +#endif + return -1; + } + } else { + if(rank==0) { + printf("Optimal objective: %22.14e. Solver status: %d\n", obj_value, status); + } + } + } + + +#ifdef HIOP_USE_MPI + MPI_Finalize(); +#endif + + return 0; +} + + +static bool self_check(size_type n, double objval, const bool inertia_free) +{ +#define num_n_saved 3 //keep this is sync with n_saved and objval_saved + const size_type n_saved[] = {50, 500, 10000}; + const double objval_saved[] = { 8.7754974e+00, 6.4322371e+01, 1.2369786e+03}; + +#define relerr 1e-6 + bool found=false; + for(int it=0; it relerr) { + printf("selfcheck failure. Objective (%18.12e) does not agree (%d digits) with the saved value (%18.12e) for n=%d.\n", + objval, -(int)log10(relerr), objval_saved[it], n); + return false; + } else { + printf("selfcheck success (%d digits)\n", -(int)log10(relerr)); + } + break; + } + } + + if(!found) { + printf("selfcheck: driver does not have the objective for n=%d saved. 
BTW, obj=%18.12e was obtained for this n.\n", n, objval); + return false; + } + + return true; +} diff --git a/src/LinAlg/hiopLinSolver.cpp b/src/LinAlg/hiopLinSolver.cpp index 0c3760dcc..868981175 100644 --- a/src/LinAlg/hiopLinSolver.cpp +++ b/src/LinAlg/hiopLinSolver.cpp @@ -99,7 +99,7 @@ namespace hiop { //we default to triplet matrix for now; derived classes using CSR matrices will not call //this constructor (will call the 1-parameter constructor below) so they avoid creating //the triplet matrix - M_ = new hiopMatrixSparseTriplet(n, n, nnz); + M_ = LinearAlgebraFactory::create_matrix_sparse(nlp->options->GetString("mem_space"), n, n, nnz); //this class will own `M_` sys_mat_owned_ = true; nlp_ = nlp; @@ -124,7 +124,7 @@ namespace hiop { hiopLinSolverNonSymSparse::hiopLinSolverNonSymSparse(int n, int nnz, hiopNlpFormulation* nlp) { - M_ = new hiopMatrixSparseTriplet(n, n, nnz); + M_ = LinearAlgebraFactory::create_matrix_sparse(nlp->options->GetString("mem_space"), n, n, nnz); sys_mat_owned_ = false; nlp_ = nlp; perf_report_ = "on"==hiop::tolower(nlp->options->GetString("time_kkt")); diff --git a/src/LinAlg/hiopLinSolverSparseCUSOLVER.cpp b/src/LinAlg/hiopLinSolverSparseCUSOLVER.cpp index 957b31f33..b50537324 100644 --- a/src/LinAlg/hiopLinSolverSparseCUSOLVER.cpp +++ b/src/LinAlg/hiopLinSolverSparseCUSOLVER.cpp @@ -1619,4 +1619,89 @@ namespace hiop break; } // switch } // GramSchmidt + + + + hiopLinSolverSymSparseCUSOLVERGPU::hiopLinSolverSymSparseCUSOLVERGPU(const int& n, + const int& nnz, + hiopNlpFormulation* nlp) + : hiopLinSolverSymSparseCUSOLVER(n, nnz, nlp), + rhs_host_{nullptr}, + M_host_{nullptr} + { + rhs_host_ = LinearAlgebraFactory::create_vector("default", n); + M_host_ = LinearAlgebraFactory::create_matrix_sparse("default", n, n, nnz); + } + + hiopLinSolverSymSparseCUSOLVERGPU::~hiopLinSolverSymSparseCUSOLVERGPU() + { + delete rhs_host_; + delete M_host_; + } + + int hiopLinSolverSymSparseCUSOLVERGPU::matrixChanged() + { + size_type nnz = M_->numberOfNonzeros(); + double* mval_dev = M_->M(); + double* mval_host= M_host_->M(); + + index_type* irow_dev = M_->i_row(); + index_type* irow_host= M_host_->i_row(); + + index_type* jcol_dev = M_->j_col(); + index_type* jcol_host= M_host_->j_col(); + + if("device" == nlp_->options->GetString("mem_space")) { + checkCudaErrors(cudaMemcpy(mval_host, mval_dev, sizeof(double) * nnz, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(irow_host, irow_dev, sizeof(index_type) * nnz, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(jcol_host, jcol_dev, sizeof(index_type) * nnz, cudaMemcpyDeviceToHost)); + } else { + checkCudaErrors(cudaMemcpy(mval_host, mval_dev, sizeof(double) * nnz, cudaMemcpyHostToHost)); + checkCudaErrors(cudaMemcpy(irow_host, irow_dev, sizeof(index_type) * nnz, cudaMemcpyHostToHost)); + checkCudaErrors(cudaMemcpy(jcol_host, jcol_dev, sizeof(index_type) * nnz, cudaMemcpyHostToHost)); + } + + hiopMatrixSparse* swap_ptr = M_; + M_ = M_host_; + M_host_ = swap_ptr; + + int vret = hiopLinSolverSymSparseCUSOLVER::matrixChanged(); + + swap_ptr = M_; + M_ = M_host_; + M_host_ = swap_ptr; + + return vret; + } + + bool hiopLinSolverSymSparseCUSOLVERGPU::solve(hiopVector& x) + { + double* mval_dev = x.local_data(); + double* mval_host= rhs_host_->local_data(); + + if("device" == nlp_->options->GetString("mem_space")) { + checkCudaErrors(cudaMemcpy(mval_host, mval_dev, sizeof(double) * n_, cudaMemcpyDeviceToHost)); + } else { + checkCudaErrors(cudaMemcpy(mval_host, mval_dev, sizeof(double) * n_, cudaMemcpyHostToHost)); + 
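+      // when mem_space is host-resident the same staging path is kept; the copy is host-to-host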
} + + hiopMatrixSparse* swap_ptr = M_; + M_ = M_host_; + M_host_ = swap_ptr; + + bool bret = hiopLinSolverSymSparseCUSOLVER::solve(*rhs_host_); + + swap_ptr = M_; + M_ = M_host_; + M_host_ = swap_ptr; + + if("device" == nlp_->options->GetString("mem_space")) { + checkCudaErrors(cudaMemcpy(mval_dev, mval_host, sizeof(double) * n_, cudaMemcpyHostToDevice)); + } else { + checkCudaErrors(cudaMemcpy(mval_dev, mval_host, sizeof(double) * n_, cudaMemcpyHostToHost)); + } + + return bret; + } + } // namespace hiop diff --git a/src/LinAlg/hiopLinSolverSparseCUSOLVER.hpp b/src/LinAlg/hiopLinSolverSparseCUSOLVER.hpp index 16518052c..1833afe67 100644 --- a/src/LinAlg/hiopLinSolverSparseCUSOLVER.hpp +++ b/src/LinAlg/hiopLinSolverSparseCUSOLVER.hpp @@ -88,12 +88,12 @@ class hiopLinSolverSymSparseCUSOLVER : public hiopLinSolverSymSparse /** Triggers a refactorization of the matrix, if necessary. * Overload from base class. * In this case, KLU (SuiteSparse) is used to refactor*/ - int matrixChanged(); + virtual int matrixChanged(); /** solves a linear system. * param 'x' is on entry the right hand side(s) of the system to be solved. * On exit is contains the solution(s). */ - bool solve(hiopVector& x_); + virtual bool solve(hiopVector& x_); /** Multiple rhs not supported yet */ virtual bool @@ -103,7 +103,7 @@ class hiopLinSolverSymSparseCUSOLVER : public hiopLinSolverSymSparse return false; } -private: +protected: // int m_; // number of rows of the whole matrix int n_; // number of cols of the whole matrix @@ -204,6 +204,27 @@ class hiopLinSolverSymSparseCUSOLVER : public hiopLinSolverSymSparse void IRsetup(); }; +class hiopLinSolverSymSparseCUSOLVERGPU : public hiopLinSolverSymSparseCUSOLVER +{ +public: + hiopLinSolverSymSparseCUSOLVERGPU(const int& n, const int& nnz, hiopNlpFormulation* nlp); + virtual ~hiopLinSolverSymSparseCUSOLVERGPU(); + + virtual int matrixChanged(); + virtual bool solve(hiopVector& x_); + + /** Multiple rhs not supported yet */ + virtual bool + solve(hiopMatrix& /* x */) + { + assert(false && "not yet supported"); + return false; + } + +private: + hiopVector* rhs_host_; + hiopMatrixSparse* M_host_; +}; // Forward declaration of LU class class hiopLinSolverSymSparseCUSOLVERLU; diff --git a/src/LinAlg/hiopMatrixRajaSparseTriplet.hpp b/src/LinAlg/hiopMatrixRajaSparseTriplet.hpp index 4769d47fa..00bc19a08 100644 --- a/src/LinAlg/hiopMatrixRajaSparseTriplet.hpp +++ b/src/LinAlg/hiopMatrixRajaSparseTriplet.hpp @@ -390,11 +390,13 @@ class hiopMatrixRajaSparseTriplet : public hiopMatrixSparse { index_type *idx_start_; //size num_rows+1 index_type *idx_start_host_; //size num_rows+1 + index_type register_row_st_; size_type num_rows_; std::string mem_space_; RowStartsInfo() : idx_start_(nullptr), - num_rows_(0) + num_rows_(0), + register_row_st_{0} {} RowStartsInfo(size_type n_rows, std::string memspace); virtual ~RowStartsInfo(); diff --git a/src/LinAlg/hiopMatrixRajaSparseTripletImpl.hpp b/src/LinAlg/hiopMatrixRajaSparseTripletImpl.hpp index 11919070f..fd538740f 100644 --- a/src/LinAlg/hiopMatrixRajaSparseTripletImpl.hpp +++ b/src/LinAlg/hiopMatrixRajaSparseTripletImpl.hpp @@ -1123,8 +1123,6 @@ copyRowsBlockFrom(const hiopMatrix& src_gen, assert(n_rows + rows_src_idx_st <= src.m()); assert(n_rows + rows_dst_idx_st <= this->m()); - index_type* src_row_st = src.row_starts_->idx_start_; - const index_type* iRow_src = src.i_row(); const index_type* jCol_src = src.j_col(); const double* values_src = src.M(); @@ -1143,6 +1141,8 @@ copyRowsBlockFrom(const hiopMatrix& src_gen, } 
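+  // the source row_starts_ must already be built; its device pointer is taken just below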
assert(src.row_starts_); + index_type* src_row_st = src.row_starts_->idx_start_; + // // The latest CPU code can be found in 342eb99ec16d45f57a492be1bf1e39cce73995a5 // It is replaced by RAJA::inclusive_scan after that commit @@ -1165,10 +1165,12 @@ copyRowsBlockFrom(const hiopMatrix& src_gen, umpire::Allocator hostalloc = rm.getAllocator("HOST"); int *next_row_nnz = static_cast(hostalloc.allocate(sizeof(size_type))); + index_type register_row_st = row_starts_->register_row_st_; rm.copy(next_row_nnz, dst_row_st_dev+1+rows_dst_idx_st, 1*sizeof(size_type)); if(next_row_nnz[0] == 0) { + assert(rows_dst_idx_st >= register_row_st); // comput nnz in each row from source RAJA::forall( RAJA::RangeSegment(0, n_rows), @@ -1179,7 +1181,8 @@ copyRowsBlockFrom(const hiopMatrix& src_gen, dst_row_st_dev[row_dst+1] = src_row_st[row_src+1] - src_row_st[row_src]; } ); - RAJA::inclusive_scan_inplace(RAJA::make_span(dst_row_st_dev, n_rows+1), RAJA::operators::plus()); + RAJA::inclusive_scan_inplace(RAJA::make_span(dst_row_st_dev+register_row_st, n_rows+1+rows_dst_idx_st-register_row_st), RAJA::operators::plus()); + row_starts_->register_row_st_ = n_rows + rows_dst_idx_st; } index_type* dst_row_st = row_starts_->idx_start_; @@ -1191,7 +1194,7 @@ copyRowsBlockFrom(const hiopMatrix& src_gen, const index_type row_src = rows_src_idx_st + row_add; const index_type row_dst = rows_dst_idx_st + row_add; index_type k_src = src_row_st[row_src]; - index_type k_dst = dst_row_st[row_dst]; + index_type k_dst = dst_row_st[row_dst] - dst_row_st[rows_dst_idx_st] + dest_nnz_st; // copy from src while(k_src < src_row_st[row_src+1]) { @@ -1283,7 +1286,8 @@ template hiopMatrixRajaSparseTriplet::RowStartsInfo:: RowStartsInfo(size_type n_rows, std::string memspace) : num_rows_(n_rows), - mem_space_(memspace) + mem_space_(memspace), + register_row_st_{0} { auto& resmgr = umpire::ResourceManager::getInstance(); umpire::Allocator alloc = resmgr.getAllocator(mem_space_); diff --git a/src/Optimization/hiopKKTLinSysSparse.cpp b/src/Optimization/hiopKKTLinSysSparse.cpp index 7a1360bf2..c3c43c832 100644 --- a/src/Optimization/hiopKKTLinSysSparse.cpp +++ b/src/Optimization/hiopKKTLinSysSparse.cpp @@ -63,6 +63,7 @@ #ifdef HIOP_USE_GINKGO #include "hiopLinSolverSparseGinkgo.hpp" #endif + #endif namespace hiop @@ -184,7 +185,12 @@ namespace hiop write_linsys_counter_++; } if(write_linsys_counter_>=0) { - csr_writer_.writeMatToFile(*Msys, write_linsys_counter_, nx, neq, nineq); +#ifndef HIOP_USE_GPU + auto* MsysSp = dynamic_cast(linSys->sys_matrix()); + csr_writer_.writeMatToFile(*MsysSp, write_linsys_counter_, nx, neq, nineq); +#else + //TODO csr_writer_.writeMatToFile(*Msys, write_linsys_counter_, nx, neq, nineq); +#endif } return true; @@ -329,7 +335,7 @@ namespace hiop if( (nullptr == linSys_ && linear_solver == "auto") || linear_solver == "cusolver-lu") { #if defined(HIOP_USE_CUSOLVER_LU) - linSys_ = new hiopLinSolverSymSparseCUSOLVER(n, nnz, nlp_); + linSys_ = new hiopLinSolverSymSparseCUSOLVERGPU(n, nnz, nlp_); linsol_actual = "CUSOLVER-LU"; auto* fact_acceptor_ic = dynamic_cast (fact_acceptor_); if(fact_acceptor_ic) { @@ -418,14 +424,13 @@ namespace hiop delta_wd_ = perturb_calc_->get_curr_delta_wd(); delta_cc_ = perturb_calc_->get_curr_delta_cc(); delta_cd_ = perturb_calc_->get_curr_delta_cd(); - - HessSp_ = dynamic_cast(Hess_); - if(!HessSp_) { assert(false); return false; } - Jac_cSp_ = dynamic_cast(Jac_c_); + HessSp_ = dynamic_cast(Hess_); + Jac_cSp_ = dynamic_cast(Jac_c_); + Jac_dSp_ = dynamic_cast(Jac_d_); + + if(!HessSp_) { 
assert(false); return false; } if(!Jac_cSp_) { assert(false); return false; } - - Jac_dSp_ = dynamic_cast(Jac_d_); if(!Jac_dSp_) { assert(false); return false; } size_type nx = HessSp_->n(), nd=Jac_dSp_->m(), neq=Jac_cSp_->m(), nineq=Jac_dSp_->m(); @@ -436,7 +441,7 @@ namespace hiop auto* linSys = dynamic_cast (linSys_); assert(linSys); - auto* Msys = dynamic_cast(linSys->sys_matrix()); + auto* Msys = dynamic_cast(linSys->sys_matrix()); assert(Msys); if(perf_report_) { nlp_->log->printf(hovSummary, @@ -510,7 +515,12 @@ namespace hiop write_linsys_counter_++; } if(write_linsys_counter_>=0) { - csr_writer_.writeMatToFile(*Msys, write_linsys_counter_, nx, neq, nineq); +#ifndef HIOP_USE_GPU + auto* MsysSp = dynamic_cast(linSys->sys_matrix()); + csr_writer_.writeMatToFile(*MsysSp, write_linsys_counter_, nx, neq, nineq); +#else + //TODO csr_writer_.writeMatToFile(*Msys, write_linsys_counter_, nx, neq, nineq); +#endif } return true; @@ -685,7 +695,7 @@ namespace hiop if(linear_solver == "cusolver-lu" || linear_solver == "auto") { #if defined(HIOP_USE_CUSOLVER_LU) actual_lin_solver = "CUSOLVER-LU"; - linSys_ = new hiopLinSolverSymSparseCUSOLVER(n, nnz, nlp_); + linSys_ = new hiopLinSolverSymSparseCUSOLVERGPU(n, nnz, nlp_); auto* fact_acceptor_ic = dynamic_cast (fact_acceptor_); if(fact_acceptor_ic) { nlp_->log->printf(hovError, @@ -750,13 +760,13 @@ namespace hiop // // We don't allow CPU linear solvers. ///////////////////////////////////////////////////////////////////////////////////////////// - assert(false && "KKT_SPARSE_XDYcYd linsys: GPU compute mode not yet supported."); - assert(false == safe_mode_); + // assert(false && "KKT_SPARSE_XDYcYd linsys: GPU compute mode not yet supported."); + // assert(false == safe_mode_); assert(nullptr == linSys_); if(linear_solver == "cusolver-lu" || linear_solver == "auto") { #if defined(HIOP_USE_CUSOLVER_LU) - linSys_ = new hiopLinSolverSymSparseCUSOLVER(n, nnz, nlp_); + linSys_ = new hiopLinSolverSymSparseCUSOLVERGPU(n, nnz, nlp_); nlp_->log->printf(hovScalars, "KKT_SPARSE_XDYcYd linsys: alloc CUSOLVER-LU size %d (%d cons) (gpu)\n", n, @@ -1046,7 +1056,12 @@ namespace hiop write_linsys_counter_++; } if(write_linsys_counter_>=0) { - csr_writer_.writeMatToFile(*Msys, write_linsys_counter_, nx, neq, nineq); +#ifndef HIOP_USE_GPU + auto* MsysSp = dynamic_cast(linSys->sys_matrix()); + csr_writer_.writeMatToFile(*MsysSp, write_linsys_counter_, nx, neq, nineq); +#else + //TODO csr_writer_.writeMatToFile(*Msys, write_linsys_counter_, nx, neq, nineq); +#endif } return true; diff --git a/src/Optimization/hiopKKTLinSysSparseCondensed.hpp b/src/Optimization/hiopKKTLinSysSparseCondensed.hpp index 1e0747c61..cb9c6fc8d 100644 --- a/src/Optimization/hiopKKTLinSysSparseCondensed.hpp +++ b/src/Optimization/hiopKKTLinSysSparseCondensed.hpp @@ -210,7 +210,6 @@ class hiopKKTLinSysCondensedSparse : public hiopKKTLinSysCompressedSparseXDYcYd } else { //(opt_compute_mode == "hybrid" || opt_compute_mode == "gpu") { #ifdef HIOP_USE_CUDA - assert(opt_compute_mode != "gpu" && "When code is GPU-ready, remove this method"); return "CUDA"; #else assert(false && "compute mode not supported without HIOP_USE_CUDA build");
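-- 
Note on the host-staging pattern introduced by hiopLinSolverSymSparseCUSOLVERGPU above
(illustrative sketch only; the wrapper name staged_solve is hypothetical and the body
paraphrases the member logic of the class): the solver keeps host mirrors M_host_ and
rhs_host_ of the device-resident KKT matrix and right-hand side, copies the device data
into the mirrors, swaps the matrix pointers so the host-only factorization of the base
class runs unchanged, then swaps back and returns the solution to device memory. The
new driver exercises this path through the NlpSparseRaja2_1 test, i.e.
NlpSparseRajaEx2.exe 500 -selfcheck.

    #include <utility> // std::swap

    // minimal sketch of the staged solve (CUDA build, mem_space == "device" assumed)
    bool staged_solve(hiopVector& x)
    {
      // stage the right-hand side: device -> host
      checkCudaErrors(cudaMemcpy(rhs_host_->local_data(), x.local_data(),
                                 sizeof(double) * n_, cudaMemcpyDeviceToHost));
      std::swap(M_, M_host_);  // base class now operates on the host matrix
      const bool ok = hiopLinSolverSymSparseCUSOLVER::solve(*rhs_host_);
      std::swap(M_, M_host_);  // restore the device matrix
      // return the solution: host -> device
      checkCudaErrors(cudaMemcpy(x.local_data(), rhs_host_->local_data(),
                                 sizeof(double) * n_, cudaMemcpyHostToDevice));
      return ok;
    }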