From ebd5271d5777ff3720d342f249ae5871fc042d00 Mon Sep 17 00:00:00 2001 From: gcuser Date: Sun, 24 Apr 2022 11:52:33 +0000 Subject: [PATCH 1/5] Add more debug information to check the coredump when destroyComputation --- ODLA/platforms/odla_popart/odla_popart.cc | 34 +++++++++++++++++++++-- ODLA/platforms/odla_popart/odla_popart.h | 31 +++++++++++++-------- 2 files changed, 52 insertions(+), 13 deletions(-) diff --git a/ODLA/platforms/odla_popart/odla_popart.cc b/ODLA/platforms/odla_popart/odla_popart.cc index 3c2ce3e74..0d35ce802 100644 --- a/ODLA/platforms/odla_popart/odla_popart.cc +++ b/ODLA/platforms/odla_popart/odla_popart.cc @@ -121,6 +121,30 @@ void compute_loop(odla_computation comp) { #undef RETURN_ERROR #define RETURN_ERROR(ERR_CODE) return ERR_CODE; +void _odla_computation::release_session() { + if (nullptr == session) + popart::logging::warn("session is nullptr when try to release it"); + // else if(session->getDevice() == nullptr) + // popart::logging::warn("session->getDevice() is nullptr when try to release + // it"); + else if (session->getDevice().getDeviceInfo() == nullptr) + popart::logging::warn( + "session->getDevice().getDeviceInfo() is nullptr when try to release " + "it"); + else { + popart::logging::warn( + "Calling session->getDevice().getDeviceInfo()->detach() to detach the " + "device when QManager Status is {}", + QManager::instance()->get_status()); + session->getDevice().getDeviceInfo()->detach(); + popart::logging::warn("The computation:{} session:{} detached from device", + this, session.get()); + session.reset(); + assert(session == nullptr); + popart::logging::warn("The computation:{} session has been reset", this); + } +} + odla_status _odla_computation::compile_and_export() { odla_status ret_value = ODLA_SUCCESS; POPLAR_TRY @@ -213,7 +237,7 @@ odla_status _odla_computation::init(bool is_compile) { popart::AnchorReturnType("All")); // Acquire IPU if (opts.use_ipu_model) { - popart::logging::info("Using IPU Model to run."); + popart::logging::warn("Using IPU Model to run."); std::map deviceOpts{ {"numIPUs", std::to_string(opts.ipu_num)}, {"tilesPerIPU", "1216"}}; device = @@ -230,6 +254,7 @@ odla_status _odla_computation::init(bool is_compile) { throw std::runtime_error( "Failed to get a device when initializing odla_computation"); } + popart::logging::warn("Device acquired to run model"); // Create and config SessionOptions set_session_opts(); @@ -255,6 +280,9 @@ odla_status _odla_computation::init(bool is_compile) { // Create InferenceSession new_session = std::move(popart::InferenceSession::createFromOnnxModel( proto, data_flow, device, popart::InputShapeInfo(), session_opts_)); + popart::logging::warn( + "New session: {} has been created for computation: {}", + new_session.get(), this); if (!is_compile) { if (PopartConfig::instance()->load_or_save_cache()) { @@ -297,7 +325,9 @@ odla_status _odla_computation::init(bool is_compile) { is_compile_only_ = true; } // set session after all initialization done. + popart::logging::warn("Moving new_session to session: {}", session.get()); session = std::move(new_session); + popart::logging::warn("Moved new_session to session: {}", session.get()); // Thread must be started after all initialization done if (!is_compile) { ExecutionMode mode = PopartConfig::instance()->execution_mode(); @@ -404,7 +434,7 @@ bool _odla_computation::hold() { } else { std::stringstream ss_holder; ss_holder << thread_id_of_holder; - popart::logging::warn( + popart::logging::info( "The odla_computation {} has been held by thread: {}" ", when thread {} try to hold it.", this, thread_id_of_holder, this_thread_id); diff --git a/ODLA/platforms/odla_popart/odla_popart.h b/ODLA/platforms/odla_popart/odla_popart.h index 331a82dfa..2a5fd1dbe 100644 --- a/ODLA/platforms/odla_popart/odla_popart.h +++ b/ODLA/platforms/odla_popart/odla_popart.h @@ -151,17 +151,26 @@ struct _odla_computation { inline Execution* executor() { return executor_; } inline bool is_done() { return thread_state_ != RUNNING; } inline bool is_compile_only() { return is_compile_only_; } - inline void release_session() { - if (session != nullptr) { - session->getDevice().getDeviceInfo()->detach(); - popart::logging::warn( - "The computation:{} session:{} detached from device", this, - session.get()); - session.reset(); - assert(session == nullptr); - popart::logging::warn("The computation:{} session has been reset", this); - } - } + inline void release_session(); + /* { + if (nullptr == session) + popart::logging::warn("session is nullptr when try to release it"); + //else if(session->getDevice() == nullptr) + // popart::logging::warn("session->getDevice() is nullptr when try to + release it"); else if(session->getDevice().getDeviceInfo() == nullptr) + popart::logging::warn("session->getDevice().getDeviceInfo() is + nullptr when try to release it"); else { popart::logging::warn("Calling + session->getDevice().getDeviceInfo()->detach() to detach the device when + QManager Status is {}", QManager::instance()->get_status()); + session->getDevice().getDeviceInfo()->detach(); + popart::logging::warn( + "The computation:{} session:{} detached from device", this, + session.get()); + session.reset(); + assert(session == nullptr); + popart::logging::warn("The computation:{} session has been reset", this); + } + }*/ inline void set_thread_run() { std::unique_lock lock(thread_done_mutex_); thread_state_ = RUNNING; From 953005d5dec5b1610b4cb9d677d0bc047196bfd9 Mon Sep 17 00:00:00 2001 From: gcuser Date: Sun, 24 Apr 2022 12:21:45 +0000 Subject: [PATCH 2/5] remove inline for the release_session --- ODLA/platforms/odla_popart/odla_popart.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ODLA/platforms/odla_popart/odla_popart.h b/ODLA/platforms/odla_popart/odla_popart.h index 2a5fd1dbe..4d8c3c6b2 100644 --- a/ODLA/platforms/odla_popart/odla_popart.h +++ b/ODLA/platforms/odla_popart/odla_popart.h @@ -151,7 +151,7 @@ struct _odla_computation { inline Execution* executor() { return executor_; } inline bool is_done() { return thread_state_ != RUNNING; } inline bool is_compile_only() { return is_compile_only_; } - inline void release_session(); + void release_session(); /* { if (nullptr == session) popart::logging::warn("session is nullptr when try to release it"); From 00f984013e884b220601c78f726f745b6c229de4 Mon Sep 17 00:00:00 2001 From: gcuser Date: Wed, 27 Apr 2022 03:03:46 +0000 Subject: [PATCH 3/5] Fix the problem of core dump when cache not used --- ODLA/platforms/odla_popart/odla_compute.cc | 1 + ODLA/platforms/odla_popart/odla_popart.h | 19 ------------------- ODLA/platforms/odla_popart/popart_config.h | 2 +- 3 files changed, 2 insertions(+), 20 deletions(-) diff --git a/ODLA/platforms/odla_popart/odla_compute.cc b/ODLA/platforms/odla_popart/odla_compute.cc index 7ed3d0e0e..24ef24494 100644 --- a/ODLA/platforms/odla_popart/odla_compute.cc +++ b/ODLA/platforms/odla_popart/odla_compute.cc @@ -240,6 +240,7 @@ odla_status odla_DestroyComputation(odla_computation comp) { } popart::logging::warn("reset config state, comp: {}", comp); PopartConfig::instance()->reset_init_state(); + popart::logging::warn("reset config state DONE, comp: {}", comp); return ODLA_SUCCESS; } diff --git a/ODLA/platforms/odla_popart/odla_popart.h b/ODLA/platforms/odla_popart/odla_popart.h index 4d8c3c6b2..37bcfbf9b 100644 --- a/ODLA/platforms/odla_popart/odla_popart.h +++ b/ODLA/platforms/odla_popart/odla_popart.h @@ -152,25 +152,6 @@ struct _odla_computation { inline bool is_done() { return thread_state_ != RUNNING; } inline bool is_compile_only() { return is_compile_only_; } void release_session(); - /* { - if (nullptr == session) - popart::logging::warn("session is nullptr when try to release it"); - //else if(session->getDevice() == nullptr) - // popart::logging::warn("session->getDevice() is nullptr when try to - release it"); else if(session->getDevice().getDeviceInfo() == nullptr) - popart::logging::warn("session->getDevice().getDeviceInfo() is - nullptr when try to release it"); else { popart::logging::warn("Calling - session->getDevice().getDeviceInfo()->detach() to detach the device when - QManager Status is {}", QManager::instance()->get_status()); - session->getDevice().getDeviceInfo()->detach(); - popart::logging::warn( - "The computation:{} session:{} detached from device", this, - session.get()); - session.reset(); - assert(session == nullptr); - popart::logging::warn("The computation:{} session has been reset", this); - } - }*/ inline void set_thread_run() { std::unique_lock lock(thread_done_mutex_); thread_state_ = RUNNING; diff --git a/ODLA/platforms/odla_popart/popart_config.h b/ODLA/platforms/odla_popart/popart_config.h index 1a676ccaf..512705f99 100644 --- a/ODLA/platforms/odla_popart/popart_config.h +++ b/ODLA/platforms/odla_popart/popart_config.h @@ -115,7 +115,7 @@ class PopartConfig { std::lock_guard guard(config_mutex_); if (inited_) { inited_ = false; - if (cache_fs->is_open()) { + if (cache_fs && cache_fs->is_open()) { cache_fs->close(); cache_fs->clear(); } From 7031efb0226e51fa3c0d04a1d457549b574c4d4c Mon Sep 17 00:00:00 2001 From: gcuser Date: Wed, 27 Apr 2022 03:05:16 +0000 Subject: [PATCH 4/5] reset session when it is not nullptr --- ODLA/platforms/odla_popart/odla_popart.cc | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/ODLA/platforms/odla_popart/odla_popart.cc b/ODLA/platforms/odla_popart/odla_popart.cc index 0d35ce802..1616de6d1 100644 --- a/ODLA/platforms/odla_popart/odla_popart.cc +++ b/ODLA/platforms/odla_popart/odla_popart.cc @@ -124,9 +124,6 @@ void compute_loop(odla_computation comp) { void _odla_computation::release_session() { if (nullptr == session) popart::logging::warn("session is nullptr when try to release it"); - // else if(session->getDevice() == nullptr) - // popart::logging::warn("session->getDevice() is nullptr when try to release - // it"); else if (session->getDevice().getDeviceInfo() == nullptr) popart::logging::warn( "session->getDevice().getDeviceInfo() is nullptr when try to release " @@ -139,10 +136,11 @@ void _odla_computation::release_session() { session->getDevice().getDeviceInfo()->detach(); popart::logging::warn("The computation:{} session:{} detached from device", this, session.get()); - session.reset(); - assert(session == nullptr); - popart::logging::warn("The computation:{} session has been reset", this); } + if (session != nullptr) + session.reset(); + assert(session == nullptr); + popart::logging::warn("The computation:{} session has been reset", this); } odla_status _odla_computation::compile_and_export() { From fbc8b2dc0a921ce919a8f516e4a34ee6ba845102 Mon Sep 17 00:00:00 2001 From: gcuser Date: Wed, 27 Apr 2022 03:14:42 +0000 Subject: [PATCH 5/5] lint format --- ODLA/platforms/odla_popart/odla_popart.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ODLA/platforms/odla_popart/odla_popart.cc b/ODLA/platforms/odla_popart/odla_popart.cc index 1616de6d1..73078a49e 100644 --- a/ODLA/platforms/odla_popart/odla_popart.cc +++ b/ODLA/platforms/odla_popart/odla_popart.cc @@ -137,8 +137,7 @@ void _odla_computation::release_session() { popart::logging::warn("The computation:{} session:{} detached from device", this, session.get()); } - if (session != nullptr) - session.reset(); + if (session != nullptr) session.reset(); assert(session == nullptr); popart::logging::warn("The computation:{} session has been reset", this); }