Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Checkpointing (quasi-Newton solver) #693

Merged
merged 27 commits into from
Sep 27, 2024
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
7a939b7
added notes and some drafty interface
cnpetra Aug 26, 2024
181f7f9
added draft of the api for checkpointing
cnpetra Aug 28, 2024
0e428f5
fixed compilation issues
cnpetra Aug 28, 2024
354b82b
integrated AXOM
cnpetra Aug 28, 2024
a14a445
added user options for checkpointing
cnpetra Aug 28, 2024
d497955
more work on load checkpoint EOD
cnpetra Sep 3, 2024
8b2342c
Merge branch 'develop' into chkpnt-dev
cnpetra Sep 4, 2024
d4900a9
semi-operation checkpointing
cnpetra Sep 4, 2024
3579828
removed save checkpoint callback from the interface
cnpetra Sep 4, 2024
d213774
fixed typos in comments
cnpetra Sep 4, 2024
4a9fbf1
moved sidre-related code from Algorithm class to a "utils" helper
cnpetra Sep 7, 2024
0498564
switched to refs; some testing of options-based checkpointing
cnpetra Sep 8, 2024
085eb88
added sidre copy to/from dense matrices
cnpetra Sep 11, 2024
b21c3c5
instrumentation for saving quasi-Newton internals to sidre
cnpetra Sep 11, 2024
bfe1b40
updated iteration counter to keep track of total number over restarts
cnpetra Sep 12, 2024
5bc2af6
updated doc; replace all #
cnpetra Sep 13, 2024
a664e35
added example on how to use checkpoint API
cnpetra Sep 13, 2024
261ccf9
clean up
cnpetra Sep 13, 2024
0506d38
added metadata
cnpetra Sep 14, 2024
b785475
testing and clean up
cnpetra Sep 14, 2024
adf30e6
Merge branch 'develop' into chkpnt-dev
cnpetra Sep 22, 2024
4e673d5
update user manual with checkpointing
cnpetra Sep 22, 2024
77c88a2
updated pdf user manual
cnpetra Sep 22, 2024
5631b6c
fix ci errors (compilation)
cnpetra Sep 23, 2024
ea5cafd
fix adtl compilation issues
cnpetra Sep 23, 2024
28c4567
fixed compil error
cnpetra Sep 23, 2024
27d15f2
addresed reviews
cnpetra Sep 25, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ option(HIOP_USE_EIGEN "Build with Eigen support" ON)
option(HIOP_USE_MPI "Build with MPI support" ON)
option(HIOP_USE_GPU "Build with support for GPUs - CUDA or HIP libraries" OFF)
option(HIOP_TEST_WITH_BSUB "Use `jsrun` instead of `mpirun` commands when running tests" OFF)
option(HIOP_USE_RAJA "Build with portability abstraction library RAJA" OFF)
option(HIOP_USE_RAJA "Build with portability abstraction library RAJA" OFF)
option(HIOP_USE_AXOM "Build with AXOM to use Sidre for scalable checkpointing" OFF)
option(HIOP_DEEPCHECKS "Extra checks and asserts in the code with a high penalty on performance" OFF)
option(HIOP_WITH_KRON_REDUCTION "Build Kron Reduction code (requires UMFPACK)" OFF)
option(HIOP_DEVELOPER_MODE "Build with extended warnings and options" OFF)
Expand Down Expand Up @@ -289,6 +290,15 @@ if(HIOP_USE_RAJA)
message(STATUS "Found umpire pkg-config: ${umpire_CONFIG}")
endif()

# Optional AXOM dependency (Sidre is used for scalable checkpointing, see the
# HIOP_USE_AXOM option description).
if(HIOP_USE_AXOM)
# Look for AXOM's CMake package configuration under AXOM_DIR; REQUIRED makes the
# configure step fail if the package is not found.
find_package(AXOM CONFIG
PATHS ${AXOM_DIR} ${AXOM_DIR}/lib/cmake/
REQUIRED)
# Attach axom to the hiop_tpl interface target so that targets linking hiop_tpl
# also link against it.
target_link_libraries(hiop_tpl INTERFACE axom)
message(STATUS "Found AXOM pkg-config: ${AXOM_CONFIG}")
endif()


cnpetra marked this conversation as resolved.
Show resolved Hide resolved
if(HIOP_WITH_KRON_REDUCTION)
set(HIOP_UMFPACK_DIR CACHE PATH "Path to UMFPACK directory")
include(FindUMFPACK)
Expand Down
6 changes: 3 additions & 3 deletions src/Interface/hiopInterface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -467,8 +467,8 @@ class hiopInterfaceBase
}

/**
* This method is used to provide an user all the hiop iterate
* procedure. @see solution_callback() for an explanation of the parameters.
* This method is used to provide the user with all the internal hiop iterates. @see solution_callback()
* for an explanation of the parameters.
*
* @param[in] x array of (local) entries of the primal variables (managed by Umpire, see note below)
* @param[in] z_L array of (local) entries of the dual variables for lower bounds (managed by Umpire, see note below)
Expand Down Expand Up @@ -496,7 +496,7 @@ class hiopInterfaceBase
{
return true;
}

/**
* A wildcard function used to change the primal variables.
*
Expand Down
1 change: 1 addition & 0 deletions src/Interface/hiop_defs.hpp.in
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#cmakedefine HIOP_USE_PARDISO
#cmakedefine HIOP_USE_RESOLVE
#cmakedefine HIOP_USE_GINKGO
#cmakedefine HIOP_USE_AXOM
#define HIOP_VERSION "@PROJECT_VERSION@"
#define HIOP_VERSION_MAJOR @PROJECT_VERSION_MAJOR@
#define HIOP_VERSION_MINOR @PROJECT_VERSION_MINOR@
Expand Down
189 changes: 186 additions & 3 deletions src/Optimization/hiopAlgFilterIPM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,11 @@

#include "hiopCppStdUtils.hpp"

#ifdef HIOP_USE_AXOM
#include "SidreHelper.hpp"
using namespace axom;
#endif

#include <cmath>
#include <cstring>
#include <cassert>
#include <stdexcept>
Expand Down Expand Up @@ -976,8 +981,38 @@ hiopSolveStatus hiopAlgFilterIPMQuasiNewton::run()

nlp->runStats.tmOptimizTotal.start();

startingProcedure(*it_curr, _f_nlp, *_c, *_d, *_grad_f, *_Jac_c, *_Jac_d); //this also evaluates the nlp
_mu=mu0;
//
// starting point:
// - user provided (with slack adjustments and lsq eq. duals initialization)
// or
// - loaded checkpoint
//
if(nlp->options->GetString("checkpoint_load_on_start") != "yes") {
//this also evaluates the nlp
startingProcedure(*it_curr, _f_nlp, *_c, *_d, *_grad_f, *_Jac_c, *_Jac_d);
_mu=mu0;
iter_num = 0;
} else {
//
//checkpoint load
//
//load from file: will populate it_curr, _Hess_lagr, and algorithmic parameters
auto chkpnt_ok = load_state_from_file(nlp->options->GetString("checkpoint_file"));
if(chkpnt_ok) {
//additionally: need to evaluate the nlp
if(!this->evalNlp_noHess(*it_curr, _f_nlp, *_c, *_d, *_grad_f, *_Jac_c, *_Jac_d)) {
nlp->log->printf(hovError, "Failure in evaluating user NLP functions at loaded checkpoint.");
return Error_In_User_Function;
}
} else {
nlp->log->printf(hovWarning, "Using default starting procedure (no checkpoint load!).\n");
//this also evaluates the nlp
startingProcedure(*it_curr, _f_nlp, *_c, *_d, *_grad_f, *_Jac_c, *_Jac_d);
_mu=mu0;
iter_num = 0;
}
solver_status_ = NlpSolve_SolveNotCalled;
}

//update log bar
logbar->updateWithNlpInfo(*it_curr, _mu, _f_nlp, *_c, *_d, *_grad_f, *_Jac_c, *_Jac_d);
Expand All @@ -987,7 +1022,7 @@ hiopSolveStatus hiopAlgFilterIPMQuasiNewton::run()

nlp->log->write("First residual-------------", *resid, hovIteration);

iter_num=0; nlp->runStats.nIter=iter_num;
nlp->runStats.nIter = iter_num;
bool disableLS = nlp->options->GetString("accept_every_trial_step")=="yes";

theta_max = theta_max_fact_*fmax(1.0,resid->get_theta());
Expand Down Expand Up @@ -1095,6 +1130,11 @@ hiopSolveStatus hiopAlgFilterIPMQuasiNewton::run()
solver_status_ = User_Stopped; break;
}

#ifdef HIOP_USE_AXOM
//checkpointing - based on options provided by the user
checkpointing_stuff();
#endif

/*************************************************
* Termination check
************************************************/
Expand Down Expand Up @@ -1485,6 +1525,149 @@ void hiopAlgFilterIPMQuasiNewton::outputIteration(int lsStatus, int lsNum, int u
}
}

#ifdef HIOP_USE_AXOM

/**
 * Saves the state of the algorithm to a checkpoint file.
 *
 * The state is first copied into a group of a temporary sidre::DataStore (via
 * save_state_to_sidre_group) and the data store is then written using sidre::IOManager.
 *
 * @param[in] path name of the checkpoint file
 * @return true if the checkpoint was written, false otherwise. The method never throws: any
 * exception raised while saving is caught and reported to the log at hovError verbosity.
 */
bool hiopAlgFilterIPMQuasiNewton::save_state_to_file(const ::std::string& path) noexcept
{
  try {
    sidre::DataStore ds;
    sidre::Group* group = ds.getRoot()->createGroup("HiOp quasi-Newton alg state");
    if(nullptr == group) {
      // createGroup signals failure through a null pointer; do not dereference it
      nlp->log->printf(hovError, "Error when saving checkpoint to file '%s'\n", path.c_str());
      nlp->log->printf(hovError, " Addtl info: %s\n", "could not create the sidre group for the algorithm state");
      return false;
    }
    this->save_state_to_sidre_group(*group);

    sidre::IOManager writer(this->get_nlp()->get_comm());
    // the number of files passed to the writer is the size of the communicator
    int n_files = 1;
#ifdef HIOP_USE_MPI
    MPI_Comm_size(this->get_nlp()->get_comm(), &n_files);
#endif
    writer.write(ds.getRoot(), n_files, path, sidre::Group::getDefaultIOProtocol());
    return true;
  } catch(const std::exception& exp) {
    nlp->log->printf(hovError, "Error when saving checkpoint to file '%s'\n", path.c_str());
    nlp->log->printf(hovError, " Addtl info: %s\n", exp.what());
    return false;
  } catch(...) {
    // the method is noexcept: an exception not derived from std::exception must not escape,
    // otherwise std::terminate would be called
    nlp->log->printf(hovError, "Error when saving checkpoint to file '%s'\n", path.c_str());
    nlp->log->printf(hovError, " Addtl info: %s\n", "unknown exception type");
    return false;
  }
}

/**
 * Loads the state of the algorithm from a checkpoint file.
 *
 * The file is read into a temporary sidre::DataStore using sidre::IOManager and the state is
 * then copied from the group "HiOp quasi-Newton alg state" (via load_state_from_sidre_group).
 *
 * @param[in] path name of the checkpoint file to load from
 * @return true if the state was loaded, false otherwise. The method never throws: any
 * exception raised while loading is caught and reported to the log at hovError verbosity.
 */
bool hiopAlgFilterIPMQuasiNewton::load_state_from_file(const ::std::string& path) noexcept
{
  try {
    sidre::DataStore ds;

    sidre::IOManager reader(this->get_nlp()->get_comm());
    // NOTE(review): check_path presumably validates/adjusts the user-provided path -- confirm
    // against SidreHelper
    auto path2 = SidreHelper::check_path(path);
    reader.read(ds.getRoot(), path2, false);
    nlp->log->printf(hovScalars, "Loaded checkpoint file [%s].\n", path2.c_str());

    const sidre::Group* group = ds.getRoot()->getGroup("HiOp quasi-Newton alg state");
    if(nullptr == group) {
      // the file was read but it does not hold a HiOp quasi-Newton state; getGroup returns a
      // null pointer in this case, which must not be dereferenced
      nlp->log->printf(hovError, "Error in loading checkpoint from file '%s'\n", path.c_str());
      nlp->log->printf(hovError, " Addtl info: %s\n", "group 'HiOp quasi-Newton alg state' not found in the file");
      return false;
    }
    this->load_state_from_sidre_group(*group);
    return true;
  } catch(const std::exception& exp) {
    nlp->log->printf(hovError, "Error in loading checkpoint from file '%s'\n", path.c_str());
    nlp->log->printf(hovError, " Addtl info: %s\n", exp.what());
    return false;
  } catch(...) {
    // the method is noexcept: an exception not derived from std::exception must not escape,
    // otherwise std::terminate would be called
    nlp->log->printf(hovError, "Error in loading checkpoint from file '%s'\n", path.c_str());
    nlp->log->printf(hovError, " Addtl info: %s\n", "unknown exception type");
    return false;
  }
}

/**
 * Copies the state of the algorithm into views of the sidre group passed as argument.
 *
 * The components of the current iterate are saved under the view names "x", "d", "sxl", "sxu",
 * "sdl", "sdu", "yc", "yd", "zl", "zu", "vl", and "vu". The algorithmic parameters are saved
 * in the view "alg_params" as an array of doubles holding, in this order, the barrier
 * parameter, the iteration number, and the number of MPI ranks.
 *
 * @param[in,out] group the sidre group the state is saved to
 */
void hiopAlgFilterIPMQuasiNewton::save_state_to_sidre_group(::axom::sidre::Group& group)
{
  //metadata (nothing is saved here in this revision)

  //iterate states: one named view for each component of the current iterate
  SidreHelper::copy_vec_to_view(group, "x", *it_curr->get_x());
  SidreHelper::copy_vec_to_view(group, "d", *it_curr->get_d());
  SidreHelper::copy_vec_to_view(group, "sxl", *it_curr->get_sxl());
  SidreHelper::copy_vec_to_view(group, "sxu", *it_curr->get_sxu());
  SidreHelper::copy_vec_to_view(group, "sdl", *it_curr->get_sdl());
  SidreHelper::copy_vec_to_view(group, "sdu", *it_curr->get_sdu());
  SidreHelper::copy_vec_to_view(group, "yc", *it_curr->get_yc());
  SidreHelper::copy_vec_to_view(group, "yd", *it_curr->get_yd());
  SidreHelper::copy_vec_to_view(group, "zl", *it_curr->get_zl());
  SidreHelper::copy_vec_to_view(group, "zu", *it_curr->get_zu());
  SidreHelper::copy_vec_to_view(group, "vl", *it_curr->get_vl());
  SidreHelper::copy_vec_to_view(group, "vu", *it_curr->get_vu());

  //state of quasi-Newton Hessian approximation (nothing is saved here in this revision)

  //algorithmic parameters for this state
  //mu, iteration number, num MPI ranks
  int nranks = 1;
#ifdef HIOP_USE_MPI
  MPI_Comm_size(get_nlp()->get_comm(), &nranks);
#endif

  //!!! dev note: the number and order of the entries need to match load_state_from_sidre_group
  const double alg_params[] = {_mu, static_cast<double>(iter_num), static_cast<double>(nranks)};
  const size_type nparams = sizeof(alg_params) / sizeof(alg_params[0]);

  SidreHelper::copy_array_to_view(group, "alg_params", alg_params, nparams);
}

/**
 * Copies the state of the algorithm from the views of the sidre group passed as argument.
 *
 * The view "alg_params" is read first and the number of MPI ranks stored in it is checked
 * against the number of ranks of the current run before any member of the algorithm is
 * modified. The barrier parameter, the iteration number, and the components of the current
 * iterate (views "x", "d", "sxl", "sxu", "sdl", "sdu", "yc", "yd", "zl", "zu", "vl", "vu")
 * are then overwritten.
 *
 * @param[in] group the sidre group holding a previously saved state
 *
 * @exception std::runtime_error if the checkpoint was saved with a number of MPI ranks
 * different from the one HiOp currently runs on.
 */
void hiopAlgFilterIPMQuasiNewton::load_state_from_sidre_group(const sidre::Group& group)
{
  //metadata (nothing is loaded here in this revision)

  //algorithmic parameters
  //!!! dev note: nparams needs to match the nparams from save_state_to_sidre_group
  const int nparams = 3;
  double alg_params[nparams];
  SidreHelper::copy_array_from_view(group, "alg_params", alg_params, nparams);

  //validate the number of ranks before touching any algorithm state
  int nranks = 1;
#ifdef HIOP_USE_MPI
  MPI_Comm_size(get_nlp()->get_comm(), &nranks);
#endif
  const int nranks_saved = static_cast<int>(alg_params[2]);
  if(nranks_saved != nranks) {
    ::std::stringstream ss;
    ss << "Mismatch in the number of MPI ranks used to checkpoint. Checkpointing was "
       << "done on " << nranks_saved << " ranks while HiOp currently runs on "
       << nranks << " ranks.";
    throw ::std::runtime_error(ss.str());
  }

  //!!! dev note: match order in save_state_to_sidre_group
  _mu = alg_params[0];
  //the iteration number is stored as a double; convert it back explicitly
  iter_num = static_cast<int>(alg_params[1]);

  //iterate states
  SidreHelper::copy_vec_from_view(group, "x", *it_curr->get_x());
  SidreHelper::copy_vec_from_view(group, "d", *it_curr->get_d());
  SidreHelper::copy_vec_from_view(group, "sxl", *it_curr->get_sxl());
  SidreHelper::copy_vec_from_view(group, "sxu", *it_curr->get_sxu());
  SidreHelper::copy_vec_from_view(group, "sdl", *it_curr->get_sdl());
  SidreHelper::copy_vec_from_view(group, "sdu", *it_curr->get_sdu());
  SidreHelper::copy_vec_from_view(group, "yc", *it_curr->get_yc());
  SidreHelper::copy_vec_from_view(group, "yd", *it_curr->get_yd());
  SidreHelper::copy_vec_from_view(group, "zl", *it_curr->get_zl());
  SidreHelper::copy_vec_from_view(group, "zu", *it_curr->get_zu());
  SidreHelper::copy_vec_from_view(group, "vl", *it_curr->get_vl());
  SidreHelper::copy_vec_from_view(group, "vu", *it_curr->get_vu());

  //state of quasi-Newton Hessian approximation (nothing is loaded here in this revision)
}

void hiopAlgFilterIPMQuasiNewton::checkpointing_stuff()
Copy link
Collaborator

@nychiang nychiang Sep 6, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Style Guidelines.
There should be spaces before and after each operator, e.g. line 1675, 1655,...

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Style Guidelines. There should be spaces before and after each operator, e.g. line 1675, 1680,...

can you be more specific about the guideline?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean space before and after operator "==".

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Our CMake build that uses BLT auto formats everything using clang-format (and a configuration file) with a make style. Also make check verifies that the code matches the clang-format style configuration, so a PR can't be merged without complying with the style.

{
if(nlp->options->GetString("checkpoint_save")=="no") {
return;
}
int chk_every_N = nlp->options->GetInteger("checkpoint_save_every_N_iter");
//check iteration
if(iter_num>0 && iter_num % chk_every_N==0) {
using ::std::string;
// replace "#" in checkpointing file with iteration number
cnpetra marked this conversation as resolved.
Show resolved Hide resolved
string path = nlp->options->GetString("checkpoint_file");
auto pos = path.find("#");
if(string::npos != pos) {
auto s_it_num = ::std::to_string(iter_num);
path.replace(pos, 1, s_it_num);
}

nlp->log->printf(hovSummary, "Saving checkpoint at iter %d in '%s'.\n", iter_num, path.c_str());
//actual checkpointing via axom::sidre
save_state_to_file(path);
}
}
#endif // HIOP_USE_AXOM

/******************************************************************************************************
* FULL NEWTON IPM
Expand Down
85 changes: 84 additions & 1 deletion src/Optimization/hiopAlgFilterIPM.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,19 @@
#include "hiopPDPerturbation.hpp"
#include "hiopFactAcceptor.hpp"

#ifdef HIOP_USE_AXOM
// Forward declaration of axom::sidre::Group. The checkpointing methods declared in this
// header only take the group by reference.
namespace axom {
namespace sidre {
class Group; // forward declaration
}
}
#endif

#include "hiopTimer.hpp"

namespace hiop
{

cnpetra marked this conversation as resolved.
Show resolved Hide resolved
class hiopAlgFilterIPMBase {
public:
hiopAlgFilterIPMBase(hiopNlpFormulation* nlp_, const bool within_FR = false);
Expand Down Expand Up @@ -117,6 +125,8 @@ class hiopAlgFilterIPMBase {
{
return filter.contains(theta, logbar_obj);
}

/// Setter for the primal steplength.
inline void set_alpha_primal(const double alpha_primal) { _alpha_primal = alpha_primal; }

protected:
Expand Down Expand Up @@ -339,8 +349,81 @@ class hiopAlgFilterIPMQuasiNewton : public hiopAlgFilterIPMBase
virtual ~hiopAlgFilterIPMQuasiNewton();

virtual hiopSolveStatus run();

// note that checkpointing is only available with an axom-enabled build
#ifdef HIOP_USE_AXOM
/**
* @brief Save state of HiOp algorithm to a sidre::Group as a checkpoint.
*
* @param group a reference to the group where state will be saved to
*
* @exception std::runtime_error indicates the group contains a view whose size does not match
* the size of the corresponding HiOp algorithm state variable or parameter.
*
* @details
* Each state variable or parameter of the HiOp algorithm will be saved in a named
* view within the group. A new view will be created within the group if it does not
* already exist. If it exists, the view must have the same number of elements as
* the size of the corresponding state variable. This means that this method will
* throw an exception if an existing group is reused to save a problem that changed
* sizes since the group was created.
*/
virtual void save_state_to_sidre_group(::axom::sidre::Group& group);

/**
* @brief Load state of HiOp algorithm from a sidre::Group checkpoint.
*
* @param group a reference to the group containing a previously saved HiOp algorithm state.
*
* @exception std::runtime_error indicates the group does not contain a view expected by this
* method or the view's number of elements mismatches the size of the corresponding HiOp
* state. The latter can occur if the file was saved with a different number of MPI ranks.
*
cnpetra marked this conversation as resolved.
Show resolved Hide resolved
* @details
* Copies views from the sidre::Group passed as argument to HiOp algorithm's state variables
* and parameters. The group should be created by first calling save_state_to_sidre_group
* for a problem/NLP of the same sizes as the problem for which this method is called.
* The method expects views within the group with certain names. If one such view is not
* found or has a number of elements different than the size of the corresponding HiOp state,
* then a std::runtime_error exception is thrown. The latter can occur when the loading
* occurs for an instance of HiOp that is not run on the same number of MPI ranks used to
* save the file.
*/
virtual void load_state_from_sidre_group(const ::axom::sidre::Group& group);

/**
* @brief Save the state of the algorithm to the file for checkpointing.
*
* @param path the name of the file
cnpetra marked this conversation as resolved.
Show resolved Hide resolved
* @return true if successful, false otherwise
*
* @details
* Internally, HiOp uses axom::sidre::DataStore and sidre's scalable IO. A detailed
* error description is sent to the log if an error or exception is caught.
*/
bool save_state_to_file(const ::std::string& path) noexcept;

/**
* @brief Load the state of the algorithm from checkpoint file.
*
* @param path the name of the file to load from
cnpetra marked this conversation as resolved.
Show resolved Hide resolved
* @return true if successful, false otherwise
*
* @details
* The file should contain an axom::sidre::DataStore that was previously saved using
* save_state_to_file(). A detailed error description is sent to the log if an error
* or exception is caught.
*/
bool load_state_from_file(const ::std::string& path) noexcept;
#endif // HIOP_USE_AXOM
private:
virtual void outputIteration(int lsStatus, int lsNum, int use_soc = 0, int use_fr = 0);

#ifdef HIOP_USE_AXOM
///@brief The options-based logic for saving a checkpoint and the call to save_state_to_file().
void checkpointing_stuff();
#endif // HIOP_USE_AXOM

private:
hiopNlpDenseConstraints* nlpdc;
private:
Expand Down
Loading
Loading