Skip to content
This repository has been archived by the owner on Aug 5, 2022. It is now read-only.

Commit

Permalink
Fix a hang when resuming training, and a compilation error when the macro is…
Browse files Browse the repository at this point in the history
… called from a function other than ForwardBackwardImpl
  • Loading branch information
fzou1 committed Jul 22, 2017
1 parent bc0877f commit 65b29ef
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 49 deletions.
2 changes: 1 addition & 1 deletion include/caffe/multinode/multi_solver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ class MultiSolver {
Net<Dtype>& net = *root_solver_->net();
const std::vector<shared_ptr<Layer<Dtype>>> & layers{ net.layers() };
layer_finished_flags_.resize(layers.size());
std::fill(layer_finished_flags_.begin(), layer_finished_flags_.end(), false);
std::fill(layer_finished_flags_.begin(), layer_finished_flags_.end(), true);
#endif
}

Expand Down
20 changes: 10 additions & 10 deletions include/caffe/multinode/multi_sync.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,15 +182,15 @@ namespace caffe {
mn::train::commit();

#ifdef PERFORMANCE_MONITORING
statsIterResult.resize(caffe::mn::train::get_session().get_operation_count());
caffe::mn::train::stats::start();
statsIterResult.resize(caffe::mn::train::get_session().get_operation_count());
caffe::mn::train::stats::start();
#endif

solver->add_callback(this);
solver->Solve();

#ifdef PERFORMANCE_MONITORING
dump_stats_to_file();
dump_stats_to_file();
#endif
}

Expand All @@ -206,6 +206,10 @@ namespace caffe {
}

void on_iter_finished(int layer_id) {
#ifdef FW_OVERLAP_OPT
solver->set_layer_finished_flag(layer_id, false);
#endif

boost::shared_ptr<Layer<Dtype>> &layer = layers[layer_id];
if (layer->layerOp == nullptr) {
return;
Expand Down Expand Up @@ -238,16 +242,11 @@ namespace caffe {
}

std::vector<int> &param_ids = layer_param_ids[layer_id];

#ifdef FW_OVERLAP_OPT
int finished_count = 0;
#endif

for (int i=0; i<param_ids.size(); i++) {
if (!layer->ParamNeedReduce(i)
#ifdef FW_OVERLAP_OPT
|| (param_ids_finished_flags[layer_id][i] == true)) {
finished_count++;
param_ids_finished_flags[layer_id][i] = true;
#else
) {
#endif
Expand All @@ -264,7 +263,6 @@ namespace caffe {
#ifdef FW_OVERLAP_OPT
assert(is_completed);
param_ids_finished_flags[layer_id][i] = true;
finished_count++;
#endif
if (CAN_USE_PRV(net_params[param_ids[i]])) {
if (delwt_buf != net_params[param_ids[i]]->prv_diff())
Expand All @@ -279,6 +277,8 @@ namespace caffe {
}

#ifdef FW_OVERLAP_OPT
int finished_count = std::count(param_ids_finished_flags[layer_id].begin(),
param_ids_finished_flags[layer_id].end(), true);
if (finished_count == param_ids.size()) {
solver->set_layer_finished_flag(layer_id, true);
}
Expand Down
50 changes: 12 additions & 38 deletions src/caffe/multinode/multi_solver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

namespace caffe {

#define START_ITER 1


#ifdef CAFFE_PER_LAYER_TIMINGS
#define LAYER_TIMING_START() do { \
timer.Start(); \
root_solver_->timer.Start(); \
}while(0)

#define LAYER_TIMING_STOP(name, index) do { \
name##_time_per_layer[index] += timer.MicroSeconds(); \
root_solver_->name##_time_per_layer[index] += root_solver_->timer.MicroSeconds(); \
}while(0)
#else
#define LAYER_TIMING_START()
Expand Down Expand Up @@ -101,50 +98,29 @@ inline void MultiSolver<Dtype>::WaitAndUpdateGradient(int layer_id) {

template <typename Dtype>
Dtype MultiSolver<Dtype>::ForwardBackwardImpl(bool first, bool last) {

Dtype loss = 0;
Net<Dtype>& net = *root_solver_->net();
const std::vector<shared_ptr<Layer<Dtype>>>& layers{ net.layers() };
const std::vector<bool>& layer_need_backward{ net.layer_need_backward() };
#ifdef FW_OVERLAP_OPT
int iter = root_solver_->iter();
#endif

#ifdef CAFFE_PER_LAYER_TIMINGS
Timer& timer = root_solver_->timer;
std::vector<double>& forward_time_per_layer = root_solver_->forward_time_per_layer;
std::vector<double>& backward_time_per_layer = root_solver_->backward_time_per_layer;
std::vector<double>& update_time_per_layer = root_solver_->update_time_per_layer;
std::vector<double>& startcomm_time_per_layer = root_solver_->startcomm_time_per_layer;
std::vector<double>& waitcomm_time_per_layer = root_solver_->waitcomm_time_per_layer;
#endif /* CAFFE_PER_LAYER_TIMINGS */


for (int i = 0; i < layers.size(); ++i) {
#ifdef FW_OVERLAP_OPT
if (first && iter >= START_ITER + 1) {
if (first && IsSkipWaitGradient(i) == false) {
while (layer_finished_flags_[i] == false) {
if (IsSkipWaitGradient(i)) {
break;
}

WaitAndUpdateGradient(i);
if (layer_finished_flags_[i]) {
if (layer_finished_flags_[i])
break;
}

for (int k=i+1; k<layers.size(); k++) {
if (layer_finished_flags_[k] || IsSkipWaitGradient(k)) {
layer_finished_flags_[k] = true;
continue;
}

WaitAndUpdateGradient(k);
if (layer_finished_flags_[k])
break;
}
}
layer_finished_flags_[i] = false;
}
#endif

Expand All @@ -159,12 +135,11 @@ Dtype MultiSolver<Dtype>::ForwardBackwardImpl(bool first, bool last) {
}

LAYER_TIMING_START();

net.BackwardFromTo(i, i);

LAYER_TIMING_STOP(backward, i);

if (last && (layers[i]->layerOp != nullptr) && layers[i]->layerOp->HasParameterSets()) {
if (last && (layers[i]->layerOp != nullptr)
&& layers[i]->layerOp->HasParameterSets()) {
LAYER_TIMING_START();
for (int j = 0; j < callbacks_.size(); ++j) {
callbacks_[j]->on_iter_finished(i);
Expand All @@ -174,6 +149,7 @@ Dtype MultiSolver<Dtype>::ForwardBackwardImpl(bool first, bool last) {
}

#ifdef FW_OVERLAP_OPT
int iter = root_solver_->iter();
int max_iter = root_solver_->param().max_iter();
bool test = (root_solver_->param().test_interval()
&& ((iter + 1) % root_solver_->param().test_interval() == 0));
Expand All @@ -183,22 +159,20 @@ Dtype MultiSolver<Dtype>::ForwardBackwardImpl(bool first, bool last) {
#else
if (last) {
#endif

for (int i = 0; i < layers.size(); ++i) {
#ifdef FW_OVERLAP_OPT
if (layer_finished_flags_[i])
continue;
#endif
for (int i = 0; i < layers.size(); ++i) {
if (IsSkipWaitGradient(i)) {
#ifdef FW_OVERLAP_OPT
finished_count++;
layer_finished_flags_[i] = true;
#endif
continue;
}
#ifdef FW_OVERLAP_OPT
if (layer_finished_flags_[i])
continue;
#endif

WaitAndUpdateGradient(i);

#ifdef FW_OVERLAP_OPT
if (layer_finished_flags_[i])
finished_count++;
Expand Down

0 comments on commit 65b29ef

Please sign in to comment.