From d626ff6ab396cf401eb2a75753bc12740b0967c6 Mon Sep 17 00:00:00 2001 From: fzou1 Date: Tue, 1 Aug 2017 15:32:50 +0800 Subject: [PATCH 01/38] Merge pull request #106 for fixing crash issue of classification/batch_classification --- examples/cpp_classification/batch_classification.cpp | 4 ++++ examples/cpp_classification/classification.cpp | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/examples/cpp_classification/batch_classification.cpp b/examples/cpp_classification/batch_classification.cpp index 374671baa..8295bf4e5 100644 --- a/examples/cpp_classification/batch_classification.cpp +++ b/examples/cpp_classification/batch_classification.cpp @@ -422,6 +422,10 @@ int main(int argc, char** argv) { cout<<"Use mean file: "< Date: Tue, 1 Aug 2017 16:31:12 +0800 Subject: [PATCH 02/38] Fix the python typo --- examples/pycaffe/tune_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pycaffe/tune_model.py b/examples/pycaffe/tune_model.py index 8305b081b..628adf9c0 100644 --- a/examples/pycaffe/tune_model.py +++ b/examples/pycaffe/tune_model.py @@ -23,7 +23,7 @@ def tuneModelDefinition(model_path, iteration): caffe_path = os.path.join(working_dir, "..", "..", "build", "tools", "caffe") if not os.path.exists(caffe_path): print "Caffe binary does not exist; please build Caffe binary first." - sys,exit(1) + sys.exit(1) base_model_name = os.path.basename(model_path) model_dir = os.path.dirname(model_path) From 96ee2c6990519107d5fc36a5e9d230d0e5fb488a Mon Sep 17 00:00:00 2001 From: "Haihao.Shen" Date: Tue, 1 Aug 2017 16:36:56 +0800 Subject: [PATCH 03/38] Upgrade MKLML version from 0425 to 0720 --- external/mkl/prepare_mkl.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/external/mkl/prepare_mkl.sh b/external/mkl/prepare_mkl.sh index b68bc7aec..09284dc41 100755 --- a/external/mkl/prepare_mkl.sh +++ b/external/mkl/prepare_mkl.sh @@ -74,10 +74,10 @@ echo $VERSION_LINE # Return Version Line # MKL DST=`dirname $0` OMP=0 -VERSION_MATCH=20170425 -ARCHIVE_BASENAME=mklml_lnx_2018.0.20170425.tgz +VERSION_MATCH=20170720 +ARCHIVE_BASENAME=mklml_lnx_2018.0.20170720.tgz MKL_CONTENT_DIR=`echo $ARCHIVE_BASENAME | rev | cut -d "." -f 2- | rev` -GITHUB_RELEASE_TAG=1.0.0 +GITHUB_RELEASE_TAG=1.0.2 MKLURL="https://github.com/intel/caffe/releases/download/$GITHUB_RELEASE_TAG/$ARCHIVE_BASENAME" # there are diffrent MKL lib to be used for GCC and for ICC From aaba8d24d851100db954ed12cb1655b8622fad56 Mon Sep 17 00:00:00 2001 From: Feng Tian Date: Tue, 1 Aug 2017 16:25:14 +0800 Subject: [PATCH 04/38] Fix the fusion bug for MKLDNN relu and conv layer. As MKLDNN currently doesn't support dilation convolution, we couldn't fuse them but use caffe engine to calculate. 
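In outline, the gate added to net.cpp looks like this (a sketch only; it mirrors the hunk below, with the parameter lookup abbreviated to conv_param):

    // Skip the conv+ReLU merge when any dilation exceeds 1: MKLDNN has no
    // dilated-convolution primitive yet, so such layers stay unfused and
    // fall back to the Caffe engine.
    bool dilation = false;
    for (int i = 0; i < conv_param.dilation_size(); ++i) {
      if (conv_param.dilation(i) > 1) {
        dilation = true;
        break;
      }
    }
    // ...the trailing ReLU is merged into this convolution only if !dilation.
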
Change-Id: Icfbac285ac98e4fcefd540791bcccf6692849904 --- src/caffe/layer_factory.cpp | 15 +++++++------ src/caffe/net.cpp | 43 ++++++++++++++++++++++++------------- 2 files changed, 36 insertions(+), 22 deletions(-) diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp index 2b52007cc..0a6f83a21 100644 --- a/src/caffe/layer_factory.cpp +++ b/src/caffe/layer_factory.cpp @@ -92,6 +92,7 @@ shared_ptr > GetConvolutionLayer( for (int i = 0; i < conv_param.dilation_size(); ++i) { if (conv_param.dilation(i) > 1) { use_dilation = true; + break; } } #endif @@ -589,10 +590,10 @@ shared_ptr > GetEltwiseLayer(const LayerParameter& param) { #if defined(MKL2017_SUPPORTED) else if (ep.isEngine("MKL2017")) engine = EltwiseParameter_Engine_MKL2017; -#endif -#if defined(MKLDNN_SUPPORTED) - else if (ep.isEngine("MKLDNN")) - engine = EltwiseParameter_Engine_MKLDNN; +#endif +#if defined(MKLDNN_SUPPORTED) + else if (ep.isEngine("MKLDNN")) + engine = EltwiseParameter_Engine_MKLDNN; #endif } @@ -605,9 +606,9 @@ shared_ptr > GetEltwiseLayer(const LayerParameter& param) { } else if (engine == EltwiseParameter_Engine_MKL2017) { return shared_ptr >(new MKLEltwiseLayer(param)); #endif -#ifdef MKLDNN_SUPPORTED - } else if (engine == EltwiseParameter_Engine_MKLDNN) { - return shared_ptr >(new MKLDNNEltwiseLayer(param)); +#ifdef MKLDNN_SUPPORTED + } else if (engine == EltwiseParameter_Engine_MKLDNN) { + return shared_ptr >(new MKLDNNEltwiseLayer(param)); #endif } else { LOG(FATAL) << "Layer " << param.name() << " has unknow engine."; diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 0a8aeb981..98395e82a 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -628,13 +628,24 @@ void Net::CompilationRuleTwo(const NetParameter& param, // Note: Currently merging of convolution and relu layers is feasible // If current layer is Convolution of MKLDNN engine.. if ((layer_param->type().compare("Convolution") == 0) && - ((layer_param->convolution_param().engine() == ConvolutionParameter_Engine_MKLDNN) - || (((layer_param->convolution_param().engine() == ConvolutionParameter_Engine_DEFAULT) && - (param.engine().compare(0, 6, "MKLDNN") == 0 - && param.engine().find(":DLA", 6) == string::npos)) || - (param.engine() == "" && - layer_param->engine().compare(0, 6, "MKLDNN") == 0 && - layer_param->engine().find(":DLA", 6) == string::npos)))) { + ((layer_param->convolution_param().engine() == ConvolutionParameter_Engine_MKLDNN) || + ((layer_param->convolution_param().engine() == ConvolutionParameter_Engine_DEFAULT) && + (layer_param->engine().compare(0, 6, "MKLDNN") == 0) && + (layer_param->engine().find(":DLA", 6) == string::npos)) || + ((layer_param->convolution_param().engine() == ConvolutionParameter_Engine_DEFAULT) && + (layer_param->engine() == "") && + (param.engine().compare(0, 6, "MKLDNN") == 0 && + param.engine().find(":DLA", 6) == string::npos)))) { + // check if Dialation is larger than 1. if yes, don't fuse the following Relu layer with this conv layer + // as MKLDNN doesn't support dilation convolution yet. + bool dilation = false; + for (int i = 0; i < layer_param->convolution_param().dilation_size(); ++i) { + if (layer_param->convolution_param().dilation(i) > 1) { + dilation = true; + break; + } + } + std::vector consumer_layer_params; GetBlobConsumers(consumer_layer_params, layer_param->top(0), param, i+1 < param.layer_size() ? 
i+1 : i); @@ -644,14 +655,16 @@ void Net::CompilationRuleTwo(const NetParameter& param, // Consumer layer of blob produced by Conv // has to be ReLU layer with one Input Blob - if ((consumer_layer_param.type().compare("ReLU") == 0) && - ((consumer_layer_param.relu_param().engine() == ReLUParameter_Engine_MKLDNN) - || (((consumer_layer_param.relu_param().engine() == ReLUParameter_Engine_DEFAULT) && - (param.engine().compare(0, 6, "MKLDNN") == 0 - && param.engine().find(":DLA", 6) == string::npos)) || - (param.engine() == "" && - layer_param->engine().compare(0, 6, "MKLDNN") == 0 && - layer_param->engine().find(":DLA", 6) == string::npos)))) { + if (!dilation && + (consumer_layer_param.type().compare("ReLU") == 0) && + ((consumer_layer_param.relu_param().engine() == ReLUParameter_Engine_MKLDNN) || + ((consumer_layer_param.relu_param().engine() == ReLUParameter_Engine_DEFAULT) && + (consumer_layer_param.engine().compare(0, 6, "MKLDNN") == 0 && + consumer_layer_param.engine().find(":DLA", 6) == string::npos)) || + ((consumer_layer_param.relu_param().engine() == ReLUParameter_Engine_DEFAULT) && + (consumer_layer_param.engine() == "") && + (param.engine().compare(0, 6, "MKLDNN") == 0 && + param.engine().find(":DLA", 6) == string::npos)))) { string& convolution_top_blob_name = const_cast(layer_param->top(0)); From 84fae0eb7b4052b2ab769d99cf46d43e661ae832 Mon Sep 17 00:00:00 2001 From: Feng Tian Date: Wed, 2 Aug 2017 09:22:44 +0800 Subject: [PATCH 05/38] fix crash due to null pointer dereference. This issue was found during enabling debug_info option in solver.prototxt with mkldnn engine. --- src/caffe/blob.cpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp index dd5546bde..dddb0f2db 100644 --- a/src/caffe/blob.cpp +++ b/src/caffe/blob.cpp @@ -369,7 +369,13 @@ Dtype Blob::asum_diff() const { switch (diff_->head()) { case SyncedMemory::SYNCED_PRV: case SyncedMemory::HEAD_AT_PRV: - return caffe_cpu_asum( prv_diff_count(), prv_diff()); + { + const Dtype* prv_ptr = prv_diff(); + if (prv_ptr == NULL) + return caffe_cpu_asum(count_, cpu_diff()); + else + return caffe_cpu_asum(prv_diff_count(), prv_diff()); + } case SyncedMemory::HEAD_AT_CPU: return caffe_cpu_asum(count_, cpu_diff()); case SyncedMemory::HEAD_AT_GPU: @@ -462,7 +468,11 @@ Dtype Blob::sumsq_diff() const { case SyncedMemory::SYNCED_PRV: case SyncedMemory::HEAD_AT_PRV: diff = prv_diff(); - sumsq = caffe_cpu_dot(prv_diff_count(), diff, diff); + if (diff == NULL) { + diff = cpu_diff(); + sumsq = caffe_cpu_dot(count_, diff, diff); + } else + sumsq = caffe_cpu_dot(prv_diff_count(), diff, diff); break; case SyncedMemory::HEAD_AT_CPU: diff = cpu_diff(); From 79e05ccd5c7225a892d1d30fea86caee38b97d7a Mon Sep 17 00:00:00 2001 From: "Zhang, Guoming" Date: Wed, 2 Aug 2017 20:11:55 +0800 Subject: [PATCH 06/38] Fix for the issue that Intel caffe couldn't converge on Resnet-50. --- src/caffe/net.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 98395e82a..10621c531 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -728,11 +728,12 @@ void Net::CompilationRuleThree(const NetParameter& param, // If current layer is BatchNorm of MKL2017 engine.. 
if (((layer_param->type().compare("BatchNorm") == 0) && ((layer_param->batch_norm_param().engine() == - BatchNormParameter_Engine_MKL2017) + BatchNormParameter_Engine_MKL2017 || layer_param->batch_norm_param().engine() == + BatchNormParameter_Engine_MKLDNN) || ((layer_param->batch_norm_param().engine() == BatchNormParameter_Engine_DEFAULT) && - param.engine().compare("MKL2017") == 0))) && - (layer_param->top(0) == layer_param->bottom(0) )) { + (param.engine().compare("MKL2017") == 0 || param.engine().compare("MKLDNN") == 0)))) && + (layer_param->top(0) == layer_param->bottom(0))) { std::string& batch_norm_top = const_cast(layer_param->top(0)); std::vector consumer_layer_params; GetBlobConsumers(consumer_layer_params, From f787234872adf1de7192663aa1ae0a872b5a5176 Mon Sep 17 00:00:00 2001 From: fzou1 Date: Fri, 4 Aug 2017 11:16:19 +0800 Subject: [PATCH 07/38] add random resizing param for test net Change-Id: I0f1e7e2b758e666f6eec8c8c71a1cd905de7b1e1 --- .../resnet_50_256_nodes_8k_batch/train_val.prototxt | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/models/intel_optimized_models/multinode/resnet_50_256_nodes_8k_batch/train_val.prototxt b/models/intel_optimized_models/multinode/resnet_50_256_nodes_8k_batch/train_val.prototxt index e5c7a9128..d98323ed6 100644 --- a/models/intel_optimized_models/multinode/resnet_50_256_nodes_8k_batch/train_val.prototxt +++ b/models/intel_optimized_models/multinode/resnet_50_256_nodes_8k_batch/train_val.prototxt @@ -14,8 +14,10 @@ layer { mean_value: 104 mean_value: 117 mean_value: 123 - random_resize_param { - min_size: 256 max_size: 480 + random_aspect_ratio_param { + min_area_ratio: 0.08 + max_area_ratio: 1 + aspect_ratio_change: 0.75 resize_param { interp_mode: CUBIC } @@ -44,6 +46,13 @@ layer { mean_value: 104 mean_value: 117 mean_value: 123 + random_resize_param { + min_size: 256 + max_size: 256 + resize_param { + interp_mode: CUBIC + } + } } data_param { source: "examples/imagenet/ilsvrc12_val_lmdb" From 70ef5231c261795faf8dc99abe60bc756068c89b Mon Sep 17 00:00:00 2001 From: fzou1 Date: Mon, 7 Aug 2017 14:11:14 +0800 Subject: [PATCH 08/38] Remove additional normalization overhead with iter_size 1 for multi-node and make it consistent with single node. 
loss is divided during setting up --- include/caffe/layer.hpp | 5 +++++ src/caffe/solvers/sgd_solver.cpp | 12 ------------ 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp index 45d65c799..9dc4d557b 100644 --- a/include/caffe/layer.hpp +++ b/include/caffe/layer.hpp @@ -521,7 +521,12 @@ class Layer { CHECK_EQ(top.size(), num_loss_weights) << "loss_weight must be " "unspecified or specified once per top blob."; for (int top_id = 0; top_id < top.size(); ++top_id) { +#ifdef USE_MLSL + const Dtype loss_weight = layer_param_.loss_weight(top_id) / + GetDistribution().get_data_parts(); +#else const Dtype loss_weight = layer_param_.loss_weight(top_id); +#endif if (loss_weight == Dtype(0)) { continue; } this->set_loss(top_id, loss_weight); const int count = top[top_id]->count(); diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index 264ac954f..df5fbcd26 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -226,23 +226,11 @@ void SGDSolver::ApplyUpdate(int param_id) { template void SGDSolver::Normalize(int param_id) { - -#ifdef USE_MLSL - if ((this->param_.iter_size() == 1) && !mn::is_multinode()) { - return; - } -#else /* !USE_MLSL */ if (this->param_.iter_size() == 1) { return; } -#endif /* USE_MLSL */ // Scale gradient to counterbalance accumulation. const vector*>& net_params = this->net_->learnable_params(); - -#ifdef USE_MLSL - const Dtype accum_normalization = Dtype(1.) / (this->param_.iter_size() * mn::get_nodes_count()); -#else /* !USE_MLSL */ const Dtype accum_normalization = Dtype(1.) / this->param_.iter_size(); -#endif /* USE_MLSL */ switch (Caffe::mode()) { case Caffe::CPU: { From cd36cd9fe2aa2c7983185af87a5dc6b7a5c45457 Mon Sep 17 00:00:00 2001 From: "Yu, Chong" Date: Mon, 7 Aug 2017 18:48:24 +0800 Subject: [PATCH 09/38] SGD update optimization by fusion. Need ICC build. --- Makefile | 6 + Makefile.config.example | 3 + include/caffe/sgd_solvers.hpp | 5 + src/caffe/solvers/sgd_solver.cpp | 386 ++++++++++++++++++++++++++++++- 4 files changed, 397 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index fd6e78bc8..46d259a37 100644 --- a/Makefile +++ b/Makefile @@ -547,6 +547,12 @@ LIBRARY_DIRS += $(LIB_BUILD_DIR) # Automatic dependency generation (nvcc is handled separately) CXXFLAGS += -MMD -MP +##########SGD FUSION####################### +ifeq ($(ENABLE_SGD_FUSION), 1) + COMMON_FLAGS += -DENABLE_SGD_FUSION +endif +########################################### +# # Complete build flags. COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) CXXFLAGS += -std=c++11 -pthread -fPIC $(COMMON_FLAGS) $(WARNINGS) diff --git a/Makefile.config.example b/Makefile.config.example index 8bfcc57a3..539a00a67 100644 --- a/Makefile.config.example +++ b/Makefile.config.example @@ -170,5 +170,8 @@ DISTRIBUTE_DIR := distribute # The ID of the GPU that 'make runtest' will use to run unit tests. 
TEST_GPUID := 0 +# Uncomment for enabling SGD fusion +# ENABLE_SGD_FUSION := 1 + # enable pretty build (comment to see full commands) Q ?= @ diff --git a/include/caffe/sgd_solvers.hpp b/include/caffe/sgd_solvers.hpp index a11da89de..9741ef212 100644 --- a/include/caffe/sgd_solvers.hpp +++ b/include/caffe/sgd_solvers.hpp @@ -81,6 +81,11 @@ class SGDSolver : public Solver { // of gradients/updates and is not needed in snapshots vector > > history_, update_, temp_; +#ifdef ENABLE_SGD_FUSION + //Fuse the Normalize, Regularize and ComputeUpdateValue process together + void Normalize_Regularize_ComputeUpdateValue_Fusion(int param_id, Dtype rate); +#endif /* ENABLE_SGD_FUSION */ + // loss history for 'plateau' LR policy (should be stored in snapshots) Dtype minimum_loss_; int iter_last_event_; diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index df5fbcd26..1480d3005 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -42,6 +42,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "caffe/util/hdf5.hpp" #include "caffe/util/io.hpp" #include "caffe/util/upgrade_proto.hpp" +#include + namespace caffe { template @@ -208,13 +210,38 @@ void SGDSolver::ApplyUpdate(int param_id) { return; } +#ifdef ENABLE_SGD_FUSION + switch (Caffe::mode()) { + case Caffe::CPU: { + //VLOG(1) << "Use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD"; + //LOG(INFO) << "Use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD"; + Normalize_Regularize_ComputeUpdateValue_Fusion(param_id, rate); + break; + } + case Caffe::GPU: { +#ifndef CPU_ONLY + //VLOG(1) << "Currently we do not support use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD in GPU mode."; + //LOG(INFO) << "Currently we do not support use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD in GPU mode."; +#else + NO_GPU; +#endif + break; + } + default: + LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); + } +#else /* !ENABLE_SGD_FUSION */ + //LOG(INFO) << "No Fusion: Param_id: " << param_id; Normalize(param_id); + LOG_PARAM_BLOB(this->net_->learnable_params()[param_id], diff, param_id, "ApplyUpdate: delwt after Normalize:"); Regularize(param_id); LOG_PARAM_BLOB(this->net_->learnable_params()[param_id], diff, param_id, "ApplyUpdate: delwt after Regularize:"); ComputeUpdateValue(param_id, rate); +#endif /* ENABLE_SGD_FUSION */ + LOG_PARAM_BLOB(this->net_->learnable_params()[param_id], diff, param_id, "ApplyUpdate: wtinc:"); LOG_PARAM_BLOB(this->net_->learnable_params()[param_id], data, param_id, "ApplyUpdate: weight before update:"); @@ -224,12 +251,359 @@ void SGDSolver::ApplyUpdate(int param_id) { LOG_PARAM_BLOB(this->net_->learnable_params()[param_id], data, param_id, "ApplyUpdate: weight after update:"); } +#ifdef ENABLE_SGD_FUSION +//Math function for fusion +template +void axpy_axpby_copy(size_t count, const Dtype decay, const Dtype* net_params_data, Dtype *net_params_diff, + const Dtype rate, const Dtype momentum, Dtype* history_data); + +template <> +void axpy_axpby_copy(size_t count, const float decay, const float* net_params_data, float *net_params_diff, + const float rate, const float momentum, float* history_data) +{ + float temp_result = 0.; +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (size_t i = 0; i < count; ++i) { + temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i]; + history_data[i] = temp_result; + net_params_diff[i] = temp_result; + } +} + +template <> +void 
axpy_axpby_copy(size_t count, const double decay, const double* net_params_data, double *net_params_diff, + const double rate, const double momentum, double* history_data) +{ + double temp_result = 0.; +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (size_t i = 0; i < count; ++i) { + temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i]; + history_data[i] = temp_result; + net_params_diff[i] = temp_result; + } +} + +template +void avx512_axpy_axpby_copy(size_t count, const Dtype decay, const Dtype* net_params_data, Dtype *net_params_diff, + const Dtype rate, const Dtype momentum, Dtype* history_data); + +template <> +void avx512_axpy_axpby_copy(size_t count, const float decay, const float* net_params_data, float *net_params_diff, + const float rate, const float momentum, float* history_data) +{ + // If count is smaller than 16 we use non-avx512 implementation + // 16 is the element number which one avx512 register can hold + if (count < 16) { + return axpy_axpby_copy(count, decay, net_params_data, net_params_diff, + rate, momentum, history_data); + } + + // If count can't be divided by 16, we handle tailing remainder + // with non-avx512 imeplementation + if (count % 16 != 0) { + size_t remainder = count % 16; + count -= remainder; + axpy_axpby_copy(remainder, decay, net_params_data+count, net_params_diff+count, + rate, momentum, history_data+count); + } + + size_t group_size = 16; +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (size_t idx = 0; idx < count; idx += group_size) { + const float *fnet_params_data = net_params_data + idx; + float *fnet_params_diff = net_params_diff + idx; + float *fhistory_data = history_data + idx; + __m512 operand1_v = _mm512_loadu_ps(fnet_params_data); + __m512 operand2_v = _mm512_loadu_ps(fnet_params_diff); + __m512 operand3_v = _mm512_loadu_ps(fhistory_data); + __m512 decay_operand_v = _mm512_set1_ps(decay); + __m512 rate_operand_v = _mm512_set1_ps(rate); + __m512 momentum_operand_v = _mm512_set1_ps(momentum); + __m512 decay_result = _mm512_mul_ps(decay_operand_v, operand1_v); + __m512 axpy_result = _mm512_add_ps(decay_result, operand2_v); + __m512 rate_result = _mm512_mul_ps(rate_operand_v, axpy_result); + __m512 momentum_result = _mm512_mul_ps(momentum_operand_v, operand3_v); + __m512 axpby_result = _mm512_add_ps(rate_result, momentum_result); + _mm512_storeu_ps(fhistory_data, axpby_result); + _mm512_storeu_ps(fnet_params_diff, axpby_result); + } +} + +template <> +void avx512_axpy_axpby_copy(size_t count, const double decay, const double* net_params_data, double* net_params_diff, + const double rate, const double momentum, double* history_data) +{ + // If count is smaller than 8 we use non-avx512 implementation + // 8 is the element number which one avx512 register can hold + if (count < 8) { + return axpy_axpby_copy(count, decay, net_params_data, net_params_diff, + rate, momentum, history_data); + } + + // If count can't be divided by 8, we handle tailing remainder + // with non-avx512 imeplementation + if (count % 8 != 0) { + size_t remainder = count % 8; + count -= remainder; + axpy_axpby_copy(remainder, decay, net_params_data+count, net_params_diff+count, + rate, momentum, history_data+count); + } + + size_t group_size = 8; +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (size_t idx = 0; idx < count; idx += group_size) { + const double *fnet_params_data = net_params_data + idx; + double *fnet_params_diff = net_params_diff + idx; + double *fhistory_data = history_data + idx; + 
__m512 operand1_v = _mm512_loadu_pd(fnet_params_data); + __m512 operand2_v = _mm512_loadu_pd(fnet_params_diff); + __m512 operand3_v = _mm512_loadu_pd(fhistory_data); + __m512 decay_operand_v = _mm512_set1_pd(decay); + __m512 rate_operand_v = _mm512_set1_pd(rate); + __m512 momentum_operand_v = _mm512_set1_pd(momentum); + __m512 decay_result = _mm512_mul_pd(decay_operand_v, operand1_v); + __m512 axpy_result = _mm512_add_pd(decay_result, operand2_v); + __m512 rate_result = _mm512_mul_pd(rate_operand_v, axpy_result); + __m512 momentum_result = _mm512_mul_pd(momentum_operand_v, operand3_v); + __m512 axpby_result = _mm512_add_pd(rate_result, momentum_result); + _mm512_storeu_pd(fhistory_data, axpby_result); + _mm512_storeu_pd(fnet_params_diff, axpby_result); + } +} + + +template +void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_id, Dtype rate) { +//LOG(INFO) << "Fusion: Param_id: " << param_id; + +//#pragma region 1. Common initialization + //Normalize initialization + bool skip_Normalize_stage_flag = false; + if (this->param_.iter_size() == 1) { skip_Normalize_stage_flag = true; } + + // Scale gradient to counterbalance accumulation. + const vector*>& net_params = this->net_->learnable_params(); + + //Regularize initialization + const vector& net_params_weight_decay = + this->net_->params_weight_decay(); + Dtype weight_decay = this->param_.weight_decay(); + string regularization_type = this->param_.regularization_type(); + Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; + + //ComputeUpdateValue initialization + const vector& net_params_lr = this->net_->params_lr(); + Dtype momentum = this->param_.momentum(); + Dtype local_rate = rate * net_params_lr[param_id]; +//#pragma endregion + +//#pragma region 2. Common condition judgement + bool prv_diff_condition_flag = false; + if (net_params[param_id]->prv_diff() + && (net_params[param_id]->prv_diff_count() + == net_params[param_id]->count())) { + prv_diff_condition_flag = true; + //LOG(INFO) << "Common condition judgement: prv_diff_condition_flag = true."; + } + else + { + //LOG(INFO) << "Common condition judgement: prv_diff_condition_flag = false."; + } +//#pragma endregion + +//#pragma region 3. Normalize stage + if (skip_Normalize_stage_flag == false) + { + //LOG(INFO) << "Normalize stage: Normalize stage is not skipped."; + + const Dtype accum_normalization = Dtype(1.) 
/ this->param_.iter_size(); + + if (prv_diff_condition_flag) { + //LOG(INFO) << "Normalize stage: prv_diff_condition_flag = true."; + caffe_scal(net_params[param_id]->count(), accum_normalization, + net_params[param_id]->mutable_prv_diff()); + } + else { + //LOG(INFO) << "Normalize stage: prv_diff_condition_flag = false."; + caffe_scal(net_params[param_id]->count(), accum_normalization, + net_params[param_id]->mutable_cpu_diff()); + } + } + else + { + //LOG(INFO) << "Normalize stage: Normalize stage is skipped."; + } +//#pragma endregion + +//For POR topologies from BVLC, all skipped the Normalize stage, and use L2 regularization +//If prv_diff_condition_flag == true, then prv_data_condition_flag == true (1) +//If prv_diff_condition_flag == false, then prv_data_condition_flag == false (2) +//Another case is local_decay == 0, prv_diff_condition_flag == false (3) +//So only need to consider the fusion in situations (1) and (2), set execute_separate_ComputeUpdateValue_stage_flag to false value + bool execute_separate_ComputeUpdateValue_stage_flag = true; + //Regularize stage (Fused ComputeUpdateValue_stage in some situations) + if (local_decay) { + if (regularization_type == "L2") { + //LOG(INFO) << "Regularize stage: regularization_type == L2."; + // add weight decay + if (net_params[param_id]->prv_data() + && (net_params[param_id]->prv_data_count() + == net_params[param_id]->count())) { + //LOG(INFO) << "Regularize stage: prv_data_condition_flag = true."; + CHECK_EQ(true, + net_params[param_id]->get_prv_data_descriptor()->layout_compare( + net_params[param_id]->get_prv_diff_descriptor())); + /* + caffe_axpy(net_params[param_id]->count(), + local_decay, + net_params[param_id]->prv_data(), + net_params[param_id]->mutable_prv_diff()); + */ + if (prv_diff_condition_flag) { + //situation (1) + //LOG(INFO) << "Fused ComputeUpdateValue stage: prv_diff_condition_flag = true."; + /* + caffe_cpu_axpby(net_params[param_id]->count(), local_rate, + net_params[param_id]->prv_diff(), momentum, + history_[param_id]->mutable_cpu_data()); + + caffe_copy(net_params[param_id]->count(), + history_[param_id]->cpu_data(), + net_params[param_id]->mutable_prv_diff()); + */ + + avx512_axpy_axpby_copy(net_params[param_id]->count(), local_decay, + net_params[param_id]->prv_data(), net_params[param_id]->mutable_prv_diff(), + local_rate, momentum, history_[param_id]->mutable_cpu_data()); + + execute_separate_ComputeUpdateValue_stage_flag = false; + } + else + { + //Will not happen! 
+ //LOG(INFO) << "Cannot Fused ComputeUpdateValue stage: prv_diff_condition_flag = false."; + caffe_cpu_axpby(net_params[param_id]->count(), local_rate, + net_params[param_id]->cpu_diff(), momentum, + history_[param_id]->mutable_cpu_data()); + + caffe_copy(net_params[param_id]->count(), + history_[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); + + execute_separate_ComputeUpdateValue_stage_flag = false; + //You can set the flag to true value, and not execute caffe_cpu_axpby and caffe_copy + //But set to false value and execute caffe_cpu_axpby and caffe_copy inside will save one condition judgement time + } + } else { + //LOG(INFO) << "Regularize stage: prv_data_condition_flag = false."; + /* + caffe_axpy(net_params[param_id]->count(), + local_decay, + net_params[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); + */ + if (!prv_diff_condition_flag) + { + //situation (2) + //LOG(INFO) << "Fused ComputeUpdateValue stage: prv_diff_condition_flag = false."; + /* + caffe_cpu_axpby(net_params[param_id]->count(), local_rate, + net_params[param_id]->cpu_diff(), momentum, + history_[param_id]->mutable_cpu_data()); + + caffe_copy(net_params[param_id]->count(), + history_[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); + */ + + avx512_axpy_axpby_copy(net_params[param_id]->count(), local_decay, + net_params[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff(), + local_rate, momentum, history_[param_id]->mutable_cpu_data()); + + execute_separate_ComputeUpdateValue_stage_flag = false; + } + else + { + //Will not happen! + //LOG(INFO) << "Cannot Fused ComputeUpdateValue stage: prv_diff_condition_flag = true."; + caffe_cpu_axpby(net_params[param_id]->count(), local_rate, + net_params[param_id]->prv_diff(), momentum, + history_[param_id]->mutable_cpu_data()); + + caffe_copy(net_params[param_id]->count(), + history_[param_id]->cpu_data(), + net_params[param_id]->mutable_prv_diff()); + + execute_separate_ComputeUpdateValue_stage_flag = false; + //You can set the flag to true value, and not execute caffe_cpu_axpby and caffe_copy + //But set to false value and execute caffe_cpu_axpby and caffe_copy inside will save one condition judgement time + } + } + } else if (regularization_type == "L1") { + //LOG(INFO) << "Regularize stage: regularization_type == L1."; + caffe_cpu_sign(net_params[param_id]->count(), + net_params[param_id]->cpu_data(), + temp_[param_id]->mutable_cpu_data()); + caffe_axpy(net_params[param_id]->count(), + local_decay, + temp_[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); + } else { + LOG(FATAL) << "Unknown regularization type: " << regularization_type; + } + } + + //ComputeUpdateValue stage (separate) + if (execute_separate_ComputeUpdateValue_stage_flag == true) + { + //Include the situation: regularization_type == "L1"/"Unknown" + //Include situations (3): local_decay == 0 + //No Regularize stage, only ComputeUpdateValue stage + //ComputeUpdateValue stage + if (prv_diff_condition_flag) { + //LOG(INFO) << "ComputeUpdateValue stage: prv_diff_condition_flag = true."; + caffe_cpu_axpby(net_params[param_id]->count(), local_rate, + net_params[param_id]->prv_diff(), momentum, + history_[param_id]->mutable_cpu_data()); + + caffe_copy(net_params[param_id]->count(), + history_[param_id]->cpu_data(), + net_params[param_id]->mutable_prv_diff()); + } else { + //LOG(INFO) << "ComputeUpdateValue stage: prv_diff_condition_flag = false."; + caffe_cpu_axpby(net_params[param_id]->count(), local_rate, + 
net_params[param_id]->cpu_diff(), momentum, + history_[param_id]->mutable_cpu_data()); + + caffe_copy(net_params[param_id]->count(), + history_[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); + } + } + +} +#endif /* ENABLE_SGD_FUSION */ + template void SGDSolver::Normalize(int param_id) { - if (this->param_.iter_size() == 1) { return; } + if (this->param_.iter_size() == 1) { + //LOG(INFO) << "Normalize stage: Normalize stage is skipped."; + return; + } + + //LOG(INFO) << "Normalize stage: Normalize stage is not skipped."; // Scale gradient to counterbalance accumulation. const vector*>& net_params = this->net_->learnable_params(); + const Dtype accum_normalization = Dtype(1.) / this->param_.iter_size(); switch (Caffe::mode()) { @@ -238,11 +612,12 @@ void SGDSolver::Normalize(int param_id) { if (net_params[param_id]->prv_diff() && (net_params[param_id]->prv_diff_count() == net_params[param_id]->count())) { - + //LOG(INFO) << "Normalize stage: prv_diff_condition_flag = true."; caffe_scal(net_params[param_id]->count(), accum_normalization, net_params[param_id]->mutable_prv_diff()); } else { + //LOG(INFO) << "Normalize stage: prv_diff_condition_flag = false."; caffe_scal(net_params[param_id]->count(), accum_normalization, net_params[param_id]->mutable_cpu_diff()); } @@ -275,10 +650,12 @@ void SGDSolver::Regularize(int param_id) { case Caffe::CPU: { if (local_decay) { if (regularization_type == "L2") { + //LOG(INFO) << "Regularize stage: regularization_type == L2."; // add weight decay if (net_params[param_id]->prv_data() && (net_params[param_id]->prv_data_count() == net_params[param_id]->count())) { + //LOG(INFO) << "Regularize stage: prv_data_condition_flag = true."; CHECK_EQ(true, net_params[param_id]->get_prv_data_descriptor()->layout_compare( net_params[param_id]->get_prv_diff_descriptor())); @@ -288,12 +665,14 @@ void SGDSolver::Regularize(int param_id) { net_params[param_id]->prv_data(), net_params[param_id]->mutable_prv_diff()); } else { + //LOG(INFO) << "Regularize stage: prv_data_condition_flag = false."; caffe_axpy(net_params[param_id]->count(), local_decay, net_params[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff()); } } else if (regularization_type == "L1") { + //LOG(INFO) << "Regularize stage: regularization_type == L1."; caffe_cpu_sign(net_params[param_id]->count(), net_params[param_id]->cpu_data(), temp_[param_id]->mutable_cpu_data()); @@ -364,7 +743,7 @@ void SGDSolver::ComputeUpdateValue(int param_id, Dtype rate) { if (net_params[param_id]->prv_diff() && (net_params[param_id]->prv_diff_count() == net_params[param_id]->count())) { - + //LOG(INFO) << "ComputeUpdateValue stage: prv_diff_condition_flag = true."; caffe_cpu_axpby(net_params[param_id]->count(), local_rate, net_params[param_id]->prv_diff(), momentum, history_[param_id]->mutable_cpu_data()); @@ -373,6 +752,7 @@ void SGDSolver::ComputeUpdateValue(int param_id, Dtype rate) { history_[param_id]->cpu_data(), net_params[param_id]->mutable_prv_diff()); } else { + //LOG(INFO) << "ComputeUpdateValue stage: prv_diff_condition_flag = false."; caffe_cpu_axpby(net_params[param_id]->count(), local_rate, net_params[param_id]->cpu_diff(), momentum, history_[param_id]->mutable_cpu_data()); From a1d000d85378d8afee4204098c05f06b409e9c42 Mon Sep 17 00:00:00 2001 From: "Yu, Chong" Date: Tue, 8 Aug 2017 23:54:34 +0800 Subject: [PATCH 10/38] 1. Use Macro for header file include. 2. Support L1 Regulization fusion optimization. 
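In outline, the L1 path now reuses the fused helper introduced in PATCH 09 (a sketch with abbreviated names: w, dw and h stand for the parameter data, diff and history blobs):

    // sign(w) is materialized into temp_, then a single fused pass computes
    //   h = momentum * h + lr * (decay * sign(w) + dw);  dw = h
    caffe_cpu_sign(count, w, temp);
    avx512_axpy_axpby_copy(count, local_decay, temp, dw, local_rate,
                           momentum, h);

so the separate ComputeUpdateValue pass is skipped for L1 as well.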
--- src/caffe/solvers/sgd_solver.cpp | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index 1480d3005..f8709412b 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -42,8 +42,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "caffe/util/hdf5.hpp" #include "caffe/util/io.hpp" #include "caffe/util/upgrade_proto.hpp" -#include +#ifdef ENABLE_SGD_FUSION +#include +#endif /* ENABLE_SGD_FUSION */ namespace caffe { template @@ -443,11 +445,12 @@ void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_ } //#pragma endregion -//For POR topologies from BVLC, all skipped the Normalize stage, and use L2 regularization +//For most common topologies from BVLC, all skipped the Normalize stage, and use L2 regularization //If prv_diff_condition_flag == true, then prv_data_condition_flag == true (1) //If prv_diff_condition_flag == false, then prv_data_condition_flag == false (2) //Another case is local_decay == 0, prv_diff_condition_flag == false (3) //So only need to consider the fusion in situations (1) and (2), set execute_separate_ComputeUpdateValue_stage_flag to false value +//We can extend the fusion in L1 regularization bool execute_separate_ComputeUpdateValue_stage_flag = true; //Regularize stage (Fused ComputeUpdateValue_stage in some situations) if (local_decay) { @@ -552,10 +555,19 @@ void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_ caffe_cpu_sign(net_params[param_id]->count(), net_params[param_id]->cpu_data(), temp_[param_id]->mutable_cpu_data()); + + /* caffe_axpy(net_params[param_id]->count(), local_decay, temp_[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff()); + */ + + avx512_axpy_axpby_copy(net_params[param_id]->count(), local_decay, + temp_[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff(), + local_rate, momentum, history_[param_id]->mutable_cpu_data()); + + execute_separate_ComputeUpdateValue_stage_flag = false; } else { LOG(FATAL) << "Unknown regularization type: " << regularization_type; } @@ -564,7 +576,7 @@ void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_ //ComputeUpdateValue stage (separate) if (execute_separate_ComputeUpdateValue_stage_flag == true) { - //Include the situation: regularization_type == "L1"/"Unknown" + //Include the situation: regularization_type == "Unknown" //Include situations (3): local_decay == 0 //No Regularize stage, only ComputeUpdateValue stage //ComputeUpdateValue stage From 29c8bd382822a68588935c5ffe00bd023c792f35 Mon Sep 17 00:00:00 2001 From: "Yu, Chong" Date: Wed, 9 Aug 2017 09:36:36 +0800 Subject: [PATCH 11/38] Check whether the machine support the avx512 command before using the SGD fusion. 
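In outline, ApplyUpdate now gates the fused path as follows (a sketch of the hunk below; _may_i_use_cpu_feature and the _FEATURE_* masks are ICC intrinsics, which is why this check requires an ICC build):

    const unsigned long avx512_features = (_FEATURE_AVX512F | _FEATURE_AVX512CD);
    if (_may_i_use_cpu_feature(avx512_features)) {
      // AVX512 is available: run the fused update and return early.
      Normalize_Regularize_ComputeUpdateValue_Fusion(param_id, rate);
      this->net_->learnable_params()[param_id]->Update();
      return;
    }
    // Otherwise fall through to the separate Normalize / Regularize /
    // ComputeUpdateValue steps.
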
Change-Id: I768bd16c5aadd5a17a78e7d4b72fbd0e05685994 --- src/caffe/solvers/sgd_solver.cpp | 40 ++++++++++++++++---------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index f8709412b..378fecf9b 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -213,36 +213,36 @@ void SGDSolver::ApplyUpdate(int param_id) { } #ifdef ENABLE_SGD_FUSION - switch (Caffe::mode()) { - case Caffe::CPU: { - //VLOG(1) << "Use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD"; - //LOG(INFO) << "Use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD"; - Normalize_Regularize_ComputeUpdateValue_Fusion(param_id, rate); - break; - } - case Caffe::GPU: { -#ifndef CPU_ONLY - //VLOG(1) << "Currently we do not support use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD in GPU mode."; - //LOG(INFO) << "Currently we do not support use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD in GPU mode."; -#else - NO_GPU; -#endif - break; - } - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); + if (Caffe::mode() == Caffe::CPU) + { + const unsigned long avx512_features = (_FEATURE_AVX512F | _FEATURE_AVX512CD); + bool avx512_enabled_ = _may_i_use_cpu_feature(avx512_features); + if (avx512_enabled_) + { + //LOG(INFO) << "Avx512 command is supported!"; + //VLOG(1) << "Use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD"; + //LOG(INFO) << "Use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD"; + Normalize_Regularize_ComputeUpdateValue_Fusion(param_id, rate); + this->net_->learnable_params()[param_id]->Update(); + return; + } + else + { + //LOG(INFO) << "Avx512 command is not supported, so cannot use the SGD fusion!"; + } } -#else /* !ENABLE_SGD_FUSION */ +#endif /* ENABLE_SGD_FUSION */ + //LOG(INFO) << "No Fusion: Param_id: " << param_id; Normalize(param_id); LOG_PARAM_BLOB(this->net_->learnable_params()[param_id], diff, param_id, "ApplyUpdate: delwt after Normalize:"); Regularize(param_id); + LOG_PARAM_BLOB(this->net_->learnable_params()[param_id], diff, param_id, "ApplyUpdate: delwt after Regularize:"); ComputeUpdateValue(param_id, rate); -#endif /* ENABLE_SGD_FUSION */ LOG_PARAM_BLOB(this->net_->learnable_params()[param_id], diff, param_id, "ApplyUpdate: wtinc:"); From 98dc3fd24e4acdeb4ef9a9fdb2911c7bd9bacb45 Mon Sep 17 00:00:00 2001 From: xiaolil1 Date: Wed, 9 Aug 2017 11:47:09 +0800 Subject: [PATCH 12/38] Fix mkldnn split layer for accuracy issue --- src/caffe/layers/mkldnn_split_layer.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/caffe/layers/mkldnn_split_layer.cpp b/src/caffe/layers/mkldnn_split_layer.cpp index ab2c5156a..12359c141 100644 --- a/src/caffe/layers/mkldnn_split_layer.cpp +++ b/src/caffe/layers/mkldnn_split_layer.cpp @@ -94,10 +94,15 @@ void MKLDNNSplitLayer::InitSplitBwd(const vector*>& bottom, // Dimensions of bottom and top blobs. 
There is a number of
// top blobs each of the same size as the bottom one
- memory::dims bottom_tz = {static_cast<int>(this->sizes_src_[0]),
- static_cast<int>(this->sizes_src_[1]),
- static_cast<int>(this->sizes_src_[2]),
- static_cast<int>(this->sizes_src_[3])};
+ memory::dims bottom_tz;
+ bottom_tz.resize(4);
+ for(int i=0; i<4; i++) {
+ if(i < this->sizes_src_.size()) {
+ bottom_tz[i] = static_cast<int>(this->sizes_src_[i]);
+ } else {
+ bottom_tz[i] = 1;
+ }
+ }
shared_ptr<memory::primitive_desc> prv_diff_dst_mpd;
shared_ptr<memory::primitive_desc> usr_diff_dst_mpd(
From 872ac8a5c9252701435b2c396ac6e48acd5747bd Mon Sep 17 00:00:00 2001
From: Feng Tian
Date: Wed, 9 Aug 2017 14:04:44 +0800
Subject: [PATCH 13/38] add 64-bit blob size support
---
include/caffe/blob.hpp | 6 +++---
include/caffe/layer.hpp | 8 ++++----
python/caffe/_caffe.cpp | 2 +-
src/caffe/blob.cpp | 2 +-
4 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp
index 3295f7ab1..47d0d751c 100644
--- a/include/caffe/blob.hpp
+++ b/include/caffe/blob.hpp
@@ -109,7 +109,7 @@ class Blob {
return shape_[CanonicalAxisIndex(index)];
}
inline int num_axes() const { return shape_.size(); }
- inline int count() const { return count_; }
+ inline long count() const { return count_; }
/**
* @brief Compute the volume of a slice; i.e., the product of dimensions
@@ -332,8 +332,8 @@ class Blob {
shared_ptr<SyncedMemory> shape_data_;
#endif
vector<int> shape_;
- int count_;
- int capacity_;
+ long count_;
+ long capacity_;
DISABLE_COPY_AND_ASSIGN(Blob);
}; // class Blob
diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp
index 9dc4d557b..5a95a7730 100644
--- a/include/caffe/layer.hpp
+++ b/include/caffe/layer.hpp
@@ -55,8 +55,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define LOG_BLOB(layer, blob, part, blob_id, description) \
do \
{ \
- int elems_to_log = std::min(MAX_ELEMS_TO_LOG, blob->count()); \
- for (int idx = 0; idx < elems_to_log; idx++) \
+ long elems_to_log = std::min(static_cast<long>(MAX_ELEMS_TO_LOG), blob->count()); \
+ for (long idx = 0; idx < elems_to_log; idx++) \
{ \
LOG_LAYER(layer) << description \
<< ", blob_id " << blob_id \
#define LOG_PARAM_BLOB(blob, part, blob_id, description) \
do \
{ \
- int elems_to_log = std::min(MAX_ELEMS_TO_LOG, blob->count()); \
- for (int idx = 0; idx < elems_to_log; idx++) \
+ long elems_to_log = std::min(static_cast<long>(MAX_ELEMS_TO_LOG), blob->count()); \
+ for (long idx = 0; idx < elems_to_log; idx++) \
{ \
DLOG(INFO) << description \
<< ", blob_id " << blob_id \
diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp
index b9dc23e24..3b02f509b 100644
--- a/python/caffe/_caffe.cpp
+++ b/python/caffe/_caffe.cpp
@@ -411,7 +411,7 @@ BOOST_PYTHON_MODULE(_caffe) {
.add_property("channels", &Blob<Dtype>::channels)
.add_property("height", &Blob<Dtype>::height)
.add_property("width", &Blob<Dtype>::width)
- .add_property("count", static_cast<int (Blob<Dtype>::*)() const>(
+ .add_property("count", static_cast<long (Blob<Dtype>::*)() const>(
&Blob<Dtype>::count))
.def("reshape", bp::raw_function(&Blob_Reshape))
.add_property("data", bp::make_function(&Blob<Dtype>::mutable_cpu_data,
diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp
index dddb0f2db..48ae68dc7 100644
--- a/src/caffe/blob.cpp
+++ b/src/caffe/blob.cpp
@@ -72,7 +72,7 @@ void Blob<Dtype>::Reshape(const vector<int>& shape) {
for (int i = 0; i < shape.size(); ++i) {
CHECK_GE(shape[i], 0);
if (count_ != 0) {
- CHECK_LE(shape[i], INT_MAX / count_) << "blob size exceeds INT_MAX";
+ CHECK_LE(shape[i], LONG_MAX / count_) << "blob size exceeds LONG_MAX";
}
count_ *= shape[i];
if (shape_[i] != shape[i]) {
From 6200f6ba848254f661f4eea41e46c8faccb1b9e0 Mon Sep 17 00:00:00 2001
From: fzou1
Date: Wed, 9 Aug 2017 23:43:19 +0800
Subject: [PATCH 14/38] fix convergence issue of forward overlapping optimization by moving ClearParamDiffs after WaitGradientComm; and enable it by default
Change-Id: I4dac71a49720cd72b6df2eb14047ad5ad1fd1098
---
Makefile | 2 +-
cmake/Dependencies.cmake | 2 +-
include/caffe/multinode/multi_sync.hpp | 4 ----
src/caffe/multinode/multi_solver.cpp | 18 +++++++++++++-----
4 files changed, 15 insertions(+), 11 deletions(-)
diff --git a/Makefile b/Makefile
index 46d259a37..f7144b7db 100644
--- a/Makefile
+++ b/Makefile
@@ -80,7 +80,7 @@ ifeq ($(CAFFE_MLSL_SHUFFLE), 1)
COMMON_FLAGS += -DCAFFE_MLSL_SHUFFLE
endif
-ifeq ($(FW_OVERLAP_OPT), 1)
+ifneq ($(FW_OVERLAP_OPT), 0)
COMMON_FLAGS += -DFW_OVERLAP_OPT
endif
endif
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 67adf4ba7..b8c5577c6 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -122,7 +122,7 @@ if(USE_MLSL)
if(CAFFE_MLSL_SHUFFLE)
add_definitions("-DCAFFE_MLSL_SHUFFLE")
endif()
- if(FW_OVERLAP_OPT)
+ if(FW_OVERLAP_OPT OR NOT DEFINED FW_OVERLAP_OPT)
message(STATUS "Forward overlapping optimization is enabled!")
add_definitions("-DFW_OVERLAP_OPT")
endif()
diff --git a/include/caffe/multinode/multi_sync.hpp b/include/caffe/multinode/multi_sync.hpp
index 6300c4876..905d9fce7 100644
--- a/include/caffe/multinode/multi_sync.hpp
+++ b/include/caffe/multinode/multi_sync.hpp
@@ -215,10 +215,6 @@ namespace caffe {
}
void on_iter_finished(int layer_id) {
-#ifdef FW_OVERLAP_OPT
- solver->set_layer_finished_flag(layer_id, false);
-#endif
-
boost::shared_ptr<Layer<Dtype>> &layer = layers[layer_id];
if (layer->layerOp == nullptr) {
return;
diff --git a/src/caffe/multinode/multi_solver.cpp b/src/caffe/multinode/multi_solver.cpp
index 13ad8da2b..54e256631 100644
--- a/src/caffe/multinode/multi_solver.cpp
+++ b/src/caffe/multinode/multi_solver.cpp
@@ -105,12 +105,13 @@ Dtype MultiSolver<Dtype>::ForwardBackwardImpl(bool first, bool last) {
for (int i = 0; i < layers.size(); ++i) {
#ifdef FW_OVERLAP_OPT
- if (first && IsSkipWaitGradient(i) ==
false) { + if (first && (IsSkipWaitGradient(i) == false)) { while (layer_finished_flags_[i] == false) { WaitAndUpdateGradient(i); if (layer_finished_flags_[i]) break; + // wait and update gradient for next layers for (int k=i+1; k::ForwardBackwardImpl(bool first, bool last) { break; } } + layer_finished_flags_[i] = false; } #endif @@ -129,6 +131,11 @@ Dtype MultiSolver::ForwardBackwardImpl(bool first, bool last) { LAYER_TIMING_STOP(forward, i); } + // Clear parameter diffs after communication is finished (that is, after + // calling WaitGradientComm) + if (first) + root_solver_->net()->ClearParamDiffs(); + for (int i = layers.size() - 1; i >= 0; --i) { if (!layer_need_backward[i]) { continue; @@ -160,6 +167,11 @@ Dtype MultiSolver::ForwardBackwardImpl(bool first, bool last) { if (last) { #endif for (int i = 0; i < layers.size(); ++i) { +#ifdef FW_OVERLAP_OPT + if (layer_finished_flags_[i]) + continue; +#endif + if (IsSkipWaitGradient(i)) { #ifdef FW_OVERLAP_OPT finished_count++; @@ -167,10 +179,6 @@ Dtype MultiSolver::ForwardBackwardImpl(bool first, bool last) { #endif continue; } -#ifdef FW_OVERLAP_OPT - if (layer_finished_flags_[i]) - continue; -#endif WaitAndUpdateGradient(i); #ifdef FW_OVERLAP_OPT From 5522134cca276eba46ddee6cf194d7182e39d679 Mon Sep 17 00:00:00 2001 From: "Yu, Chong" Date: Thu, 10 Aug 2017 14:17:35 +0800 Subject: [PATCH 15/38] Take out the dependency of ICC. --- src/caffe/solvers/sgd_solver.cpp | 107 ++----------------------------- 1 file changed, 7 insertions(+), 100 deletions(-) diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index 378fecf9b..e32307dda 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -43,9 +43,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "caffe/util/io.hpp" #include "caffe/util/upgrade_proto.hpp" -#ifdef ENABLE_SGD_FUSION -#include -#endif /* ENABLE_SGD_FUSION */ namespace caffe { template @@ -265,7 +262,8 @@ void axpy_axpby_copy(size_t count, const float decay, const float* net_pa { float temp_result = 0.; #ifdef _OPENMP -#pragma omp parallel for +//#pragma omp parallel for +#pragma omp parallel for simd schedule(static) #endif for (size_t i = 0; i < count; ++i) { temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i]; @@ -280,7 +278,8 @@ void axpy_axpby_copy(size_t count, const double decay, const double* net { double temp_result = 0.; #ifdef _OPENMP -#pragma omp parallel for +//#pragma omp parallel for +#pragma omp parallel for simd schedule(static) #endif for (size_t i = 0; i < count; ++i) { temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i]; @@ -289,98 +288,6 @@ void axpy_axpby_copy(size_t count, const double decay, const double* net } } -template -void avx512_axpy_axpby_copy(size_t count, const Dtype decay, const Dtype* net_params_data, Dtype *net_params_diff, - const Dtype rate, const Dtype momentum, Dtype* history_data); - -template <> -void avx512_axpy_axpby_copy(size_t count, const float decay, const float* net_params_data, float *net_params_diff, - const float rate, const float momentum, float* history_data) -{ - // If count is smaller than 16 we use non-avx512 implementation - // 16 is the element number which one avx512 register can hold - if (count < 16) { - return axpy_axpby_copy(count, decay, net_params_data, net_params_diff, - rate, momentum, history_data); - } - - // If count can't be divided by 16, we handle tailing remainder - // with non-avx512 imeplementation - if (count % 16 != 0) { - size_t remainder = count % 16; - count -= remainder; - axpy_axpby_copy(remainder, decay, net_params_data+count, net_params_diff+count, - rate, momentum, history_data+count); - } - - size_t group_size = 16; -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (size_t idx = 0; idx < count; idx += group_size) { - const float *fnet_params_data = net_params_data + idx; - float *fnet_params_diff = net_params_diff + idx; - float *fhistory_data = history_data + idx; - __m512 operand1_v = _mm512_loadu_ps(fnet_params_data); - __m512 operand2_v = _mm512_loadu_ps(fnet_params_diff); - __m512 operand3_v = _mm512_loadu_ps(fhistory_data); - __m512 decay_operand_v = _mm512_set1_ps(decay); - __m512 rate_operand_v = _mm512_set1_ps(rate); - __m512 momentum_operand_v = _mm512_set1_ps(momentum); - __m512 decay_result = _mm512_mul_ps(decay_operand_v, operand1_v); - __m512 axpy_result = _mm512_add_ps(decay_result, operand2_v); - __m512 rate_result = _mm512_mul_ps(rate_operand_v, axpy_result); - __m512 momentum_result = _mm512_mul_ps(momentum_operand_v, operand3_v); - __m512 axpby_result = _mm512_add_ps(rate_result, momentum_result); - _mm512_storeu_ps(fhistory_data, axpby_result); - _mm512_storeu_ps(fnet_params_diff, axpby_result); - } -} - -template <> -void avx512_axpy_axpby_copy(size_t count, const double decay, const double* net_params_data, double* net_params_diff, - const double rate, const double momentum, double* history_data) -{ - // If count is smaller than 8 we use non-avx512 implementation - // 8 is the element number which one avx512 register can hold - if (count < 8) { - return axpy_axpby_copy(count, decay, net_params_data, net_params_diff, - rate, momentum, history_data); - } - - // If count can't be divided by 8, we handle 
tailing remainder - // with non-avx512 imeplementation - if (count % 8 != 0) { - size_t remainder = count % 8; - count -= remainder; - axpy_axpby_copy(remainder, decay, net_params_data+count, net_params_diff+count, - rate, momentum, history_data+count); - } - - size_t group_size = 8; -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (size_t idx = 0; idx < count; idx += group_size) { - const double *fnet_params_data = net_params_data + idx; - double *fnet_params_diff = net_params_diff + idx; - double *fhistory_data = history_data + idx; - __m512 operand1_v = _mm512_loadu_pd(fnet_params_data); - __m512 operand2_v = _mm512_loadu_pd(fnet_params_diff); - __m512 operand3_v = _mm512_loadu_pd(fhistory_data); - __m512 decay_operand_v = _mm512_set1_pd(decay); - __m512 rate_operand_v = _mm512_set1_pd(rate); - __m512 momentum_operand_v = _mm512_set1_pd(momentum); - __m512 decay_result = _mm512_mul_pd(decay_operand_v, operand1_v); - __m512 axpy_result = _mm512_add_pd(decay_result, operand2_v); - __m512 rate_result = _mm512_mul_pd(rate_operand_v, axpy_result); - __m512 momentum_result = _mm512_mul_pd(momentum_operand_v, operand3_v); - __m512 axpby_result = _mm512_add_pd(rate_result, momentum_result); - _mm512_storeu_pd(fhistory_data, axpby_result); - _mm512_storeu_pd(fnet_params_diff, axpby_result); - } -} - template void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_id, Dtype rate) { @@ -483,7 +390,7 @@ void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_ net_params[param_id]->mutable_prv_diff()); */ - avx512_axpy_axpby_copy(net_params[param_id]->count(), local_decay, + axpy_axpby_copy(net_params[param_id]->count(), local_decay, net_params[param_id]->prv_data(), net_params[param_id]->mutable_prv_diff(), local_rate, momentum, history_[param_id]->mutable_cpu_data()); @@ -527,7 +434,7 @@ void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_ net_params[param_id]->mutable_cpu_diff()); */ - avx512_axpy_axpby_copy(net_params[param_id]->count(), local_decay, + axpy_axpby_copy(net_params[param_id]->count(), local_decay, net_params[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff(), local_rate, momentum, history_[param_id]->mutable_cpu_data()); @@ -563,7 +470,7 @@ void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_ net_params[param_id]->mutable_cpu_diff()); */ - avx512_axpy_axpby_copy(net_params[param_id]->count(), local_decay, + axpy_axpby_copy(net_params[param_id]->count(), local_decay, temp_[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff(), local_rate, momentum, history_[param_id]->mutable_cpu_data()); From 1fb45f844c229655e8ed2554c54467661a4a24fd Mon Sep 17 00:00:00 2001 From: fzou1 Date: Thu, 10 Aug 2017 15:42:23 +0800 Subject: [PATCH 16/38] fix regression by removing duplicated ClearParamDiffs call; and correct loss for displaying by multiplying NO of nodes for data parallelism --- src/caffe/multinode/multi_solver.cpp | 1 - src/caffe/solver.cpp | 11 ++++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/caffe/multinode/multi_solver.cpp b/src/caffe/multinode/multi_solver.cpp index 54e256631..d21fb5580 100644 --- a/src/caffe/multinode/multi_solver.cpp +++ b/src/caffe/multinode/multi_solver.cpp @@ -198,7 +198,6 @@ Dtype MultiSolver::ForwardBackwardImpl(bool first, bool last) { template Dtype MultiSolver::ForwardBackward() { Dtype loss = 0; - root_solver_->net()->ClearParamDiffs(); for (int i = 0; i < iter_size; ++i) { loss += ForwardBackwardImpl( (i == 0), (i + 1 == 
iter_size)); diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 3c8d1e66b..f7e7ac1cd 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -321,7 +321,12 @@ void Solver::Step(int iters) { const string& output_name = net_->blob_names()[net_->output_blob_indices()[j]]; const Dtype loss_weight = - net_->blob_loss_weights()[net_->output_blob_indices()[j]]; + net_->blob_loss_weights()[net_->output_blob_indices()[j]] +#ifdef USE_MLSL + * mn::get_distrib()->get_data_parts() +#endif + ; + for (int k = 0; k < result[j]->count(); ++k) { ostringstream loss_msg_stream; if (loss_weight) { @@ -928,6 +933,10 @@ void Solver::Restore(const char* state_file) { template void Solver::UpdateSmoothedLoss(Dtype loss, int start_iter, int average_loss) { +#ifdef USE_MLSL + loss *= mn::get_distrib()->get_data_parts(); +#endif + if (losses_.size() < average_loss) { losses_.push_back(loss); int size = losses_.size(); From 63c3bb5c84c812230ab869e08f4efa337a67a3b7 Mon Sep 17 00:00:00 2001 From: "Yu, Chong" Date: Thu, 10 Aug 2017 15:47:59 +0800 Subject: [PATCH 17/38] Fix the error to build with gcc 4.8. --- src/caffe/solvers/sgd_solver.cpp | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index e32307dda..8b1a70e87 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -212,21 +212,11 @@ void SGDSolver::ApplyUpdate(int param_id) { #ifdef ENABLE_SGD_FUSION if (Caffe::mode() == Caffe::CPU) { - const unsigned long avx512_features = (_FEATURE_AVX512F | _FEATURE_AVX512CD); - bool avx512_enabled_ = _may_i_use_cpu_feature(avx512_features); - if (avx512_enabled_) - { - //LOG(INFO) << "Avx512 command is supported!"; - //VLOG(1) << "Use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD"; - //LOG(INFO) << "Use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD"; - Normalize_Regularize_ComputeUpdateValue_Fusion(param_id, rate); - this->net_->learnable_params()[param_id]->Update(); - return; - } - else - { - //LOG(INFO) << "Avx512 command is not supported, so cannot use the SGD fusion!"; - } + //VLOG(1) << "Use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD"; + //LOG(INFO) << "Use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD"; + Normalize_Regularize_ComputeUpdateValue_Fusion(param_id, rate); + this->net_->learnable_params()[param_id]->Update(); + return; } #endif /* ENABLE_SGD_FUSION */ @@ -262,8 +252,9 @@ void axpy_axpby_copy(size_t count, const float decay, const float* net_pa { float temp_result = 0.; #ifdef _OPENMP -//#pragma omp parallel for -#pragma omp parallel for simd schedule(static) +//#pragma omp parallel for simd schedule(static) //Not work for GCC 4.8 +#pragma omp parallel for schedule(static) +#pragma simd #endif for (size_t i = 0; i < count; ++i) { temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i]; @@ -278,8 +269,9 @@ void axpy_axpby_copy(size_t count, const double decay, const double* net { double temp_result = 0.; #ifdef _OPENMP -//#pragma omp parallel for -#pragma omp parallel for simd schedule(static) +//#pragma omp parallel for simd schedule(static) //Not work for GCC 4.8 +#pragma omp parallel for schedule(static) +#pragma simd #endif for (size_t i = 0; i < count; ++i) { temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i]; From 02e0829418e23863dd9b06ce694859dbaf43290e Mon Sep 17 00:00:00 2001 From: fzou1 Date: Fri, 11 
Aug 2017 10:28:43 +0800
Subject: [PATCH 18/38] fix hang issue during testing by moving the
 layer-finished flag check after IsSkipWaitGradient

---
 src/caffe/multinode/multi_solver.cpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/caffe/multinode/multi_solver.cpp b/src/caffe/multinode/multi_solver.cpp
index d21fb5580..ad5a7066b 100644
--- a/src/caffe/multinode/multi_solver.cpp
+++ b/src/caffe/multinode/multi_solver.cpp
@@ -167,11 +167,6 @@ Dtype MultiSolver<Dtype>::ForwardBackwardImpl(bool first, bool last) {
   if (last) {
 #endif
   for (int i = 0; i < layers.size(); ++i) {
-#ifdef FW_OVERLAP_OPT
-    if (layer_finished_flags_[i])
-      continue;
-#endif
-
     if (IsSkipWaitGradient(i)) {
 #ifdef FW_OVERLAP_OPT
       finished_count++;
@@ -179,6 +174,10 @@
 #endif
       continue;
     }
+#ifdef FW_OVERLAP_OPT
+    if (layer_finished_flags_[i])
+      continue;
+#endif
 
     WaitAndUpdateGradient(i);
 #ifdef FW_OVERLAP_OPT

From 668599e62a877f5382bcb6c068e0fc9943891c5c Mon Sep 17 00:00:00 2001
From: linxinan
Date: Mon, 14 Aug 2017 12:33:10 +0800
Subject: [PATCH 19/38] Add 5 warm-up iterations in caffe time, because the
 first several iteration times have huge variance on some machines.

---
 tools/caffe.cpp | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tools/caffe.cpp b/tools/caffe.cpp
index 231209127..5d0ea7f49 100644
--- a/tools/caffe.cpp
+++ b/tools/caffe.cpp
@@ -547,6 +547,22 @@ int time() {
   const vector<vector<Blob<float>*> >& top_vecs = caffe_net.top_vecs();
   const vector<vector<bool> >& bottom_need_backward =
       caffe_net.bottom_need_backward();
+
+  // Warm up 5 iterations here, because the first several iteration times
+  // have huge variance in some machines.
+  int warmup_iterations = 5;
+  for (int j = 0; j < warmup_iterations; ++j) {
+    for (int i = 0; i < layers.size(); ++i) {
+      layers[i]->Forward(bottom_vecs[i], top_vecs[i]);
+    }
+    if (!FLAGS_forward_only) {
+      for (int i = layers.size() - 1; i >= 0; --i) {
+        layers[i]->Backward(top_vecs[i], bottom_need_backward[i],
+                            bottom_vecs[i]);
+      }
+    }
+  }
+
   LOG(INFO) << "*** Benchmark begins ***";
   LOG(INFO) << "Testing for " << FLAGS_iterations << " iterations.";
   Timer total_timer;

From 5fb759e3e7134bcb2d9efcb2b01d9bfa59f64cb1 Mon Sep 17 00:00:00 2001
From: fzou1
Date: Mon, 14 Aug 2017 16:38:11 +0800
Subject: [PATCH 20/38] fix accuracy issue

---
 src/caffe/multinode/multi_solver.cpp | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/caffe/multinode/multi_solver.cpp b/src/caffe/multinode/multi_solver.cpp
index ad5a7066b..59eec8c7c 100644
--- a/src/caffe/multinode/multi_solver.cpp
+++ b/src/caffe/multinode/multi_solver.cpp
@@ -105,8 +105,10 @@ Dtype MultiSolver<Dtype>::ForwardBackwardImpl(bool first, bool last) {
   for (int i = 0; i < layers.size(); ++i) {
 #ifdef FW_OVERLAP_OPT
-    if (first && (IsSkipWaitGradient(i) == false)) {
+    if (first) {
       while (layer_finished_flags_[i] == false) {
+        if (IsSkipWaitGradient(i))
+          break;
         WaitAndUpdateGradient(i);
         if (layer_finished_flags_[i])
           break;
@@ -167,6 +169,10 @@
   if (last) {
 #endif
   for (int i = 0; i < layers.size(); ++i) {
+#ifdef FW_OVERLAP_OPT
+    if (layer_finished_flags_[i])
+      continue;
+#endif
     if (IsSkipWaitGradient(i)) {
 #ifdef FW_OVERLAP_OPT
       finished_count++;
@@ -174,10 +180,6 @@
 #endif
       continue;
     }
-#ifdef FW_OVERLAP_OPT
-    if (layer_finished_flags_[i])
-      continue;
-#endif
 
     WaitAndUpdateGradient(i);
 #ifdef FW_OVERLAP_OPT

From
1f9468a0b0f1c72ab45c8d02ce51194a74c98871 Mon Sep 17 00:00:00 2001 From: "Yu, Chong" Date: Mon, 14 Aug 2017 23:47:30 +0800 Subject: [PATCH 21/38] Fuse the Update stage together in SGD update process. --- include/caffe/sgd_solvers.hpp | 4 +- src/caffe/solvers/sgd_solver.cpp | 135 ++++++++++++++++++++----------- 2 files changed, 88 insertions(+), 51 deletions(-) diff --git a/include/caffe/sgd_solvers.hpp b/include/caffe/sgd_solvers.hpp index 9741ef212..09f6ff26e 100644 --- a/include/caffe/sgd_solvers.hpp +++ b/include/caffe/sgd_solvers.hpp @@ -82,8 +82,8 @@ class SGDSolver : public Solver { vector > > history_, update_, temp_; #ifdef ENABLE_SGD_FUSION - //Fuse the Normalize, Regularize and ComputeUpdateValue process together - void Normalize_Regularize_ComputeUpdateValue_Fusion(int param_id, Dtype rate); + //Fuse the Normalize, Regularize, ComputeUpdateValue and Update process together + void SGDFusion(int param_id, Dtype rate); #endif /* ENABLE_SGD_FUSION */ // loss history for 'plateau' LR policy (should be stored in snapshots) diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index 8b1a70e87..fafe8a418 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -212,10 +212,9 @@ void SGDSolver::ApplyUpdate(int param_id) { #ifdef ENABLE_SGD_FUSION if (Caffe::mode() == Caffe::CPU) { - //VLOG(1) << "Use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD"; - //LOG(INFO) << "Use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD"; - Normalize_Regularize_ComputeUpdateValue_Fusion(param_id, rate); - this->net_->learnable_params()[param_id]->Update(); + //VLOG(1) << "Use Normalize_Regularize_ComputeUpdateValue_Update_Fusion for SGD"; + //LOG(INFO) << "Use Normalize_Regularize_ComputeUpdateValue_Update_Fusion for SGD"; + SGDFusion(param_id, rate); return; } #endif /* ENABLE_SGD_FUSION */ @@ -242,6 +241,8 @@ void SGDSolver::ApplyUpdate(int param_id) { #ifdef ENABLE_SGD_FUSION //Math function for fusion +//Function 1: axpy_axpby_copy +//Start: For L1 Regularize_ComputeUpdateValue_Fusion template void axpy_axpby_copy(size_t count, const Dtype decay, const Dtype* net_params_data, Dtype *net_params_diff, const Dtype rate, const Dtype momentum, Dtype* history_data); @@ -253,13 +254,13 @@ void axpy_axpby_copy(size_t count, const float decay, const float* net_pa float temp_result = 0.; #ifdef _OPENMP //#pragma omp parallel for simd schedule(static) //Not work for GCC 4.8 -#pragma omp parallel for schedule(static) +#pragma omp parallel for schedule(static) #pragma simd #endif for (size_t i = 0; i < count; ++i) { temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i]; - history_data[i] = temp_result; - net_params_diff[i] = temp_result; + history_data[i] = temp_result; + net_params_diff[i] = temp_result; } } @@ -270,19 +271,62 @@ void axpy_axpby_copy(size_t count, const double decay, const double* net double temp_result = 0.; #ifdef _OPENMP //#pragma omp parallel for simd schedule(static) //Not work for GCC 4.8 -#pragma omp parallel for schedule(static) +#pragma omp parallel for schedule(static) +#pragma simd +#endif + for (size_t i = 0; i < count; ++i) { + temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i]; + history_data[i] = temp_result; + net_params_diff[i] = temp_result; + } +} +//End: For L1 Regularize_ComputeUpdateValue_Fusion + +//Function 2: axpy_axpby_copy_axpy +//Start: For L2 Regularize_ComputeUpdateValue_Update_Fusion +template +void 
axpy_axpby_copy_axpy(size_t count, const Dtype decay, Dtype* net_params_data, Dtype *net_params_diff, + const Dtype rate, const Dtype momentum, Dtype* history_data, const Dtype update_param); + +template <> +void axpy_axpby_copy_axpy(size_t count, const float decay, float* net_params_data, float *net_params_diff, + const float rate, const float momentum, float* history_data, const float update_param) +{ + float temp_result = 0.; +#ifdef _OPENMP +//#pragma omp parallel for simd schedule(static) //Not work for GCC 4.8 +#pragma omp parallel for schedule(static) #pragma simd #endif for (size_t i = 0; i < count; ++i) { temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i]; history_data[i] = temp_result; - net_params_diff[i] = temp_result; + net_params_diff[i] = temp_result; + net_params_data[i] = update_param * temp_result + net_params_data[i]; } } +template <> +void axpy_axpby_copy_axpy(size_t count, const double decay, double* net_params_data, double *net_params_diff, + const double rate, const double momentum, double* history_data, const double update_param) +{ + double temp_result = 0.; +#ifdef _OPENMP +//#pragma omp parallel for simd schedule(static) //Not work for GCC 4.8 +#pragma omp parallel for schedule(static) +#pragma simd +#endif + for (size_t i = 0; i < count; ++i) { + temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i]; + net_params_diff[i] = temp_result; + net_params_data[i] = update_param * temp_result + net_params_data[i]; + } +} +//End: For L2 Regularize_ComputeUpdateValue_Update_Fusion + template -void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_id, Dtype rate) { +void SGDSolver::SGDFusion(int param_id, Dtype rate) { //LOG(INFO) << "Fusion: Param_id: " << param_id; //#pragma region 1. 
Common initialization @@ -310,7 +354,7 @@ void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_ bool prv_diff_condition_flag = false; if (net_params[param_id]->prv_diff() && (net_params[param_id]->prv_diff_count() - == net_params[param_id]->count())) { + == net_params[param_id]->prv_data_count())) { prv_diff_condition_flag = true; //LOG(INFO) << "Common condition judgement: prv_diff_condition_flag = true."; } @@ -329,7 +373,7 @@ void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_ if (prv_diff_condition_flag) { //LOG(INFO) << "Normalize stage: prv_diff_condition_flag = true."; - caffe_scal(net_params[param_id]->count(), accum_normalization, + caffe_scal(net_params[param_id]->prv_data_count(), accum_normalization, net_params[param_id]->mutable_prv_diff()); } else { @@ -349,8 +393,10 @@ void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_ //If prv_diff_condition_flag == false, then prv_data_condition_flag == false (2) //Another case is local_decay == 0, prv_diff_condition_flag == false (3) //So only need to consider the fusion in situations (1) and (2), set execute_separate_ComputeUpdateValue_stage_flag to false value -//We can extend the fusion in L1 regularization - bool execute_separate_ComputeUpdateValue_stage_flag = true; +//We can extend the fusion in L1 regularization by axpy_axpby_copy +//We extend the fusion of Update stage in L2 regularization by axpy_axpby_copy_axpy, +//then need to change execute_separate_ComputeUpdateValue_stage_flag to execute_separate_ComputeUpdateValue_Update_stage_flag + bool execute_separate_ComputeUpdateValue_Update_stage_flag = true; //Regularize stage (Fused ComputeUpdateValue_stage in some situations) if (local_decay) { if (regularization_type == "L2") { @@ -381,28 +427,20 @@ void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_ history_[param_id]->cpu_data(), net_params[param_id]->mutable_prv_diff()); */ + + if(net_params[param_id]->prv_data_count() != history_[param_id]->count()) + history_[param_id]->Reshape(net_params[param_id]->shape()); - axpy_axpby_copy(net_params[param_id]->count(), local_decay, - net_params[param_id]->prv_data(), net_params[param_id]->mutable_prv_diff(), - local_rate, momentum, history_[param_id]->mutable_cpu_data()); + axpy_axpby_copy_axpy(net_params[param_id]->prv_data_count(), local_decay, + net_params[param_id]->mutable_prv_data(), net_params[param_id]->mutable_prv_diff(), + local_rate, momentum, history_[param_id]->mutable_cpu_data(), Dtype(-1)); - execute_separate_ComputeUpdateValue_stage_flag = false; + execute_separate_ComputeUpdateValue_Update_stage_flag = false; } else { //Will not happen! 
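
// A minimal scalar sketch of what the fused prv branch above computes per
// element: Regularize, ComputeUpdateValue and Blob::Update() collapse into a
// single sweep over memory, and passing update_param = Dtype(-1) turns the
// trailing axpy into the weight update itself. The helper name is
// hypothetical, not part of this patch series:
static void sgd_fused_step_sketch(size_t n, float decay, float rate,
                                  float momentum, float* w, float* dw,
                                  float* hist) {
  for (size_t i = 0; i < n; ++i) {
    float t = rate * (decay * w[i] + dw[i]) + momentum * hist[i];
    hist[i] = t;   // history keeps the momentum state for the next iteration
    dw[i] = t;     // the diff now holds the final update value
    w[i] -= t;     // update_param == -1 folds in the Update stage
  }
}
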
//LOG(INFO) << "Cannot Fused ComputeUpdateValue stage: prv_diff_condition_flag = false."; - caffe_cpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->cpu_diff(), momentum, - history_[param_id]->mutable_cpu_data()); - - caffe_copy(net_params[param_id]->count(), - history_[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); - - execute_separate_ComputeUpdateValue_stage_flag = false; - //You can set the flag to true value, and not execute caffe_cpu_axpby and caffe_copy - //But set to false value and execute caffe_cpu_axpby and caffe_copy inside will save one condition judgement time } } else { //LOG(INFO) << "Regularize stage: prv_data_condition_flag = false."; @@ -426,27 +464,18 @@ void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_ net_params[param_id]->mutable_cpu_diff()); */ - axpy_axpby_copy(net_params[param_id]->count(), local_decay, - net_params[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff(), - local_rate, momentum, history_[param_id]->mutable_cpu_data()); + axpy_axpby_copy_axpy(net_params[param_id]->count(), local_decay, + net_params[param_id]->mutable_cpu_data(), net_params[param_id]->mutable_cpu_diff(), + local_rate, momentum, history_[param_id]->mutable_cpu_data(), Dtype(-1)); - execute_separate_ComputeUpdateValue_stage_flag = false; + execute_separate_ComputeUpdateValue_Update_stage_flag = false; } else { //Will not happen! //LOG(INFO) << "Cannot Fused ComputeUpdateValue stage: prv_diff_condition_flag = true."; - caffe_cpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->prv_diff(), momentum, - history_[param_id]->mutable_cpu_data()); - - caffe_copy(net_params[param_id]->count(), - history_[param_id]->cpu_data(), - net_params[param_id]->mutable_prv_diff()); - - execute_separate_ComputeUpdateValue_stage_flag = false; - //You can set the flag to true value, and not execute caffe_cpu_axpby and caffe_copy - //But set to false value and execute caffe_cpu_axpby and caffe_copy inside will save one condition judgement time + if(net_params[param_id]->prv_data_count() != history_[param_id]->count()) + history_[param_id]->Reshape(net_params[param_id]->shape()); } } } else if (regularization_type == "L1") { @@ -466,14 +495,17 @@ void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_ temp_[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff(), local_rate, momentum, history_[param_id]->mutable_cpu_data()); - execute_separate_ComputeUpdateValue_stage_flag = false; + execute_separate_ComputeUpdateValue_Update_stage_flag = false; + + //Update stage (separate) + net_params[param_id]->Update(); } else { LOG(FATAL) << "Unknown regularization type: " << regularization_type; } } - //ComputeUpdateValue stage (separate) - if (execute_separate_ComputeUpdateValue_stage_flag == true) + //ComputeUpdateValue_Update stage (separate) + if (execute_separate_ComputeUpdateValue_Update_stage_flag == true) { //Include the situation: regularization_type == "Unknown" //Include situations (3): local_decay == 0 @@ -481,11 +513,13 @@ void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_ //ComputeUpdateValue stage if (prv_diff_condition_flag) { //LOG(INFO) << "ComputeUpdateValue stage: prv_diff_condition_flag = true."; - caffe_cpu_axpby(net_params[param_id]->count(), local_rate, + if(net_params[param_id]->prv_data_count() != history_[param_id]->count()) + history_[param_id]->Reshape(net_params[param_id]->shape()); + caffe_cpu_axpby(net_params[param_id]->prv_data_count(), 
local_rate, net_params[param_id]->prv_diff(), momentum, history_[param_id]->mutable_cpu_data()); - caffe_copy(net_params[param_id]->count(), + caffe_copy(net_params[param_id]->prv_data_count(), history_[param_id]->cpu_data(), net_params[param_id]->mutable_prv_diff()); } else { @@ -498,6 +532,9 @@ void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_ history_[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff()); } + + //Update stage (separate) + net_params[param_id]->Update(); } } From ad50db99b6c654db39b84b62121f3136beb87a3d Mon Sep 17 00:00:00 2001 From: "Yu, Chong" Date: Tue, 15 Aug 2017 10:29:27 +0800 Subject: [PATCH 22/38] Simplify the flag name. --- src/caffe/solvers/sgd_solver.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index fafe8a418..929ff050f 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -396,7 +396,8 @@ void SGDSolver::SGDFusion(int param_id, Dtype rate) { //We can extend the fusion in L1 regularization by axpy_axpby_copy //We extend the fusion of Update stage in L2 regularization by axpy_axpby_copy_axpy, //then need to change execute_separate_ComputeUpdateValue_stage_flag to execute_separate_ComputeUpdateValue_Update_stage_flag - bool execute_separate_ComputeUpdateValue_Update_stage_flag = true; +//Simplify the execute_separate_ComputeUpdateValue_Update_stage_flag to is_separate_ComputeUpdateValue_Update + bool is_separate_ComputeUpdateValue_Update = true; //Regularize stage (Fused ComputeUpdateValue_stage in some situations) if (local_decay) { if (regularization_type == "L2") { @@ -435,7 +436,7 @@ void SGDSolver::SGDFusion(int param_id, Dtype rate) { net_params[param_id]->mutable_prv_data(), net_params[param_id]->mutable_prv_diff(), local_rate, momentum, history_[param_id]->mutable_cpu_data(), Dtype(-1)); - execute_separate_ComputeUpdateValue_Update_stage_flag = false; + is_separate_ComputeUpdateValue_Update = false; } else { @@ -468,7 +469,7 @@ void SGDSolver::SGDFusion(int param_id, Dtype rate) { net_params[param_id]->mutable_cpu_data(), net_params[param_id]->mutable_cpu_diff(), local_rate, momentum, history_[param_id]->mutable_cpu_data(), Dtype(-1)); - execute_separate_ComputeUpdateValue_Update_stage_flag = false; + is_separate_ComputeUpdateValue_Update = false; } else { @@ -495,7 +496,7 @@ void SGDSolver::SGDFusion(int param_id, Dtype rate) { temp_[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff(), local_rate, momentum, history_[param_id]->mutable_cpu_data()); - execute_separate_ComputeUpdateValue_Update_stage_flag = false; + is_separate_ComputeUpdateValue_Update = false; //Update stage (separate) net_params[param_id]->Update(); @@ -505,7 +506,7 @@ void SGDSolver::SGDFusion(int param_id, Dtype rate) { } //ComputeUpdateValue_Update stage (separate) - if (execute_separate_ComputeUpdateValue_Update_stage_flag == true) + if (is_separate_ComputeUpdateValue_Update == true) { //Include the situation: regularization_type == "Unknown" //Include situations (3): local_decay == 0 From 149b4a9be9f2305cae080a03443a9bba067b11fc Mon Sep 17 00:00:00 2001 From: "Gong, Jiong" Date: Fri, 18 Aug 2017 00:57:46 +0800 Subject: [PATCH 23/38] Enable bn stats batch size in caffe engine --- include/caffe/layers/batch_norm_layer.hpp | 8 ++ .../caffe/util/apply_bn_stats_batch_size.hpp | 45 +++++++++ src/caffe/layers/batch_norm_layer.cpp | 93 ++++++++++++------- src/caffe/net.cpp | 7 ++ 
src/caffe/proto/caffe.proto | 5 + src/caffe/util/apply_bn_stats_batch_size.cpp | 57 ++++++++++++ 6 files changed, 184 insertions(+), 31 deletions(-) create mode 100644 include/caffe/util/apply_bn_stats_batch_size.hpp create mode 100644 src/caffe/util/apply_bn_stats_batch_size.cpp diff --git a/include/caffe/layers/batch_norm_layer.hpp b/include/caffe/layers/batch_norm_layer.hpp index e83bab953..c777de30c 100644 --- a/include/caffe/layers/batch_norm_layer.hpp +++ b/include/caffe/layers/batch_norm_layer.hpp @@ -117,11 +117,19 @@ class BatchNormLayer : public Layer { const Dtype* data_to_be_replicated, FuncTy op_func); + void ForwardStatsBatch_cpu(const vector*>& bottom, + const vector*>& top, int stats_batch_idx); + void BackwardStatsBatch_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom, + int stats_batch_idx); + Blob mean_, variance_, temp_, x_norm_; bool use_global_stats_; Dtype moving_average_fraction_; int channels_; Dtype eps_; + int num_stats_batches_; + int stats_batch_size_; // extra temporarary variables is used to carry out sums/broadcasting // using BLAS diff --git a/include/caffe/util/apply_bn_stats_batch_size.hpp b/include/caffe/util/apply_bn_stats_batch_size.hpp new file mode 100644 index 000000000..872b2c5bf --- /dev/null +++ b/include/caffe/util/apply_bn_stats_batch_size.hpp @@ -0,0 +1,45 @@ +/* +All modification made by Intel Corporation: © 2017 Intel Corporation + +All contributions by the University of California: +Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +All rights reserved. + +All other contributions: +Copyright (c) 2014, 2015, the respective contributors +All rights reserved. +For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ +#ifndef APPLY_BN_STATS_BATCH_SIZE_HPP_ +#define APPLY_BN_STATS_BATCH_SIZE_HPP_ +#include "caffe/proto/caffe.pb.h" + +namespace caffe { +void ApplyBnStatsBatchSize(const NetParameter& param, + NetParameter* param_with_stats_batch_size); +} +#endif // APPLY_BN_STATS_BATCH_SIZE_HPP_ diff --git a/src/caffe/layers/batch_norm_layer.cpp b/src/caffe/layers/batch_norm_layer.cpp index b7746d988..dada5873d 100644 --- a/src/caffe/layers/batch_norm_layer.cpp +++ b/src/caffe/layers/batch_norm_layer.cpp @@ -81,13 +81,22 @@ void BatchNormLayer::Reshape(const vector*>& bottom, CHECK_EQ(bottom[0]->shape(1), channels_); top[0]->ReshapeLike(*bottom[0]); + num_stats_batches_ = 1; + stats_batch_size_ = bottom[0]->shape(0); + BatchNormParameter param = this->layer_param_.batch_norm_param(); + if (!use_global_stats_ && param.stats_batch_size() > 0) { + CHECK_EQ(bottom[0]->shape(0) % param.stats_batch_size(), 0); + num_stats_batches_ = bottom[0]->shape(0) / param.stats_batch_size(); + stats_batch_size_ = param.stats_batch_size(); + } + vector sz; sz.push_back(channels_); mean_.Reshape(sz); variance_.Reshape(sz); temp_.ReshapeLike(*bottom[0]); x_norm_.ReshapeLike(*bottom[0]); - sz[0]=bottom[0]->shape(0); + sz[0]=stats_batch_size_; batch_sum_multiplier_.Reshape(sz); int spatial_dim = bottom[0]->count(2); @@ -99,7 +108,7 @@ void BatchNormLayer::Reshape(const vector*>& bottom, caffe_set(spatial_sum_multiplier_.count(), Dtype(1), multiplier_data); } - int numbychans = channels_*bottom[0]->shape(0); + int numbychans = channels_*stats_batch_size_; if (num_by_chans_.num_axes() == 0 || num_by_chans_.shape(0) != numbychans) { sz[0] = numbychans; @@ -149,18 +158,20 @@ void BatchNormLayer::replicate_to_op(Dtype* buffer_to_write, } } - - template -void BatchNormLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - int num = bottom[0]->shape(0); +void BatchNormLayer::ForwardStatsBatch_cpu(const vector*>& bottom, + const vector*>& top, int stats_batch_idx) { + long data_stats_count = stats_batch_size_ * bottom[0]->count(1); + long data_offset = stats_batch_idx * data_stats_count; + const Dtype* bottom_data = bottom[0]->cpu_data() + data_offset; + Dtype* top_data = top[0]->mutable_cpu_data() + data_offset; + Dtype* temp_data = temp_.mutable_cpu_data() + data_offset; + Dtype* x_norm_data = x_norm_.mutable_cpu_data() + data_offset; + int num = stats_batch_size_; int spatial_dim = bottom[0]->count()/(bottom[0]->shape(0)*channels_); if (bottom[0] != top[0]) { - caffe_copy(bottom[0]->count(), bottom_data, top_data); + caffe_copy(data_stats_count, bottom_data, top_data); } if (use_global_stats_) { @@ -192,10 +203,10 @@ void BatchNormLayer::Forward_cpu(const vector*>& bottom, if (!use_global_stats_) { // compute variance using var(X) = E((X-EX)^2) - caffe_powx(top[0]->count(), top_data, Dtype(2), - temp_.mutable_cpu_data()); // (X-EX)^2 + caffe_powx(data_stats_count, top_data, Dtype(2), + temp_data); // (X-EX)^2 caffe_cpu_gemv(CblasNoTrans, channels_ * num, spatial_dim, - 1. / (num * spatial_dim), temp_.cpu_data(), + 1. 
/ (num * spatial_dim), temp_data, spatial_sum_multiplier_.cpu_data(), 0., num_by_chans_.mutable_cpu_data()); caffe_cpu_gemv(CblasTrans, num, channels_, 1., @@ -220,37 +231,40 @@ void BatchNormLayer::Forward_cpu(const vector*>& bottom, variance_.mutable_cpu_data()); // replicate variance to input size - this->replicate(temp_.mutable_cpu_data(), + this->replicate(temp_data, num, spatial_dim*channels_, spatial_dim, variance_.cpu_data()); - caffe_div(temp_.count(), top_data, temp_.cpu_data(), top_data); + caffe_div(data_stats_count, top_data, temp_data, top_data); // TODO(cdoersch): The caching is only needed because later in-place layers // might clobber the data. Can we skip this if they won't? - caffe_copy(x_norm_.count(), top_data, - x_norm_.mutable_cpu_data()); + caffe_copy(data_stats_count, top_data, + x_norm_data); } template -void BatchNormLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { +void BatchNormLayer::BackwardStatsBatch_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom, + int stats_batch_idx) { + long data_stats_count = stats_batch_size_ * bottom[0]->count(1); + long data_offset = stats_batch_idx * data_stats_count; const Dtype* top_diff; if (bottom[0] != top[0]) { - top_diff = top[0]->cpu_diff(); + top_diff = top[0]->cpu_diff() + data_offset; } else { - caffe_copy(x_norm_.count(), top[0]->cpu_diff(), x_norm_.mutable_cpu_diff()); - top_diff = x_norm_.cpu_diff(); + caffe_copy(data_stats_count, top[0]->cpu_diff() + data_offset, + x_norm_.mutable_cpu_diff() + data_offset); + top_diff = x_norm_.cpu_diff() + data_offset; } - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff() + data_offset; if (use_global_stats_) { - caffe_div(temp_.count(), top_diff, temp_.cpu_data(), bottom_diff); + caffe_div(data_stats_count, top_diff, temp_.cpu_data() + data_offset, bottom_diff); return; } - const Dtype* top_data = x_norm_.cpu_data(); - int num = bottom[0]->shape()[0]; + const Dtype* top_data = x_norm_.cpu_data() + data_offset; + int num = stats_batch_size_; int spatial_dim = bottom[0]->count()/(bottom[0]->shape(0)*channels_); // if Y = (X-mean(X))/(sqrt(var(X)+eps)), then // @@ -265,7 +279,7 @@ void BatchNormLayer::Backward_cpu(const vector*>& top, // dimensions except the channels dimension where required. // sum(dE/dY \cdot Y) - caffe_mul(temp_.count(), top_data, top_diff, bottom_diff); + caffe_mul(data_stats_count, top_data, top_diff, bottom_diff); caffe_cpu_gemv(CblasNoTrans, channels_ * num, spatial_dim, 1., bottom_diff, spatial_sum_multiplier_.cpu_data(), 0., num_by_chans_.mutable_cpu_data()); @@ -280,7 +294,7 @@ void BatchNormLayer::Backward_cpu(const vector*>& top, mean_.cpu_data()); // sum(dE/dY \cdot Y) \cdot Y - caffe_mul(temp_.count(), top_data, bottom_diff, bottom_diff); + caffe_mul(data_stats_count, top_data, bottom_diff, bottom_diff); // sum(dE/dY)-sum(dE/dY \cdot Y) \cdot Y caffe_cpu_gemv(CblasNoTrans, channels_ * num, spatial_dim, 1., @@ -300,12 +314,29 @@ void BatchNormLayer::Backward_cpu(const vector*>& top, std::plus()); // dE/dY - mean(dE/dY)-mean(dE/dY \cdot Y) \cdot Y - caffe_cpu_axpby(temp_.count(), Dtype(1), top_diff, + caffe_cpu_axpby(data_stats_count, Dtype(1), top_diff, Dtype(-1. / (num * spatial_dim)), bottom_diff); // note: temp_ still contains sqrt(var(X)+eps), computed during the forward // pass. 
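
// Each *StatsBatch_cpu helper above addresses its slice of the NCHW blob
// purely by pointer offset; nothing is copied. A minimal sketch of that
// arithmetic, assuming a float blob whose per-image size is
// count(1) == C*H*W (the helper name is illustrative only):
static const float* stats_batch_slice_sketch(const float* blob_data,
                                             int stats_batch_idx,
                                             int stats_batch_size, long chw) {
  long data_stats_count = (long)stats_batch_size * chw;  // elements per slice
  return blob_data + stats_batch_idx * data_stats_count; // == data_offset
}
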
- caffe_div(temp_.count(), bottom_diff, temp_.cpu_data(), bottom_diff); + caffe_div(data_stats_count, bottom_diff, temp_.cpu_data() + data_offset, bottom_diff); +} + +template +void BatchNormLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { + for (int i = 0; i < num_stats_batches_; i++) { + ForwardStatsBatch_cpu(bottom, top, i); + } +} + +template +void BatchNormLayer::Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + for (int i = 0; i < num_stats_batches_; i++) { + BackwardStatsBatch_cpu(top, propagate_down, bottom, i); + } } diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 10621c531..9fda127c6 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -62,6 +62,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "caffe/multinode/mlsl.hpp" #include "caffe/multinode/apply_mn_param.hpp" #include "caffe/util/remove_batch_norm.hpp" +#include "caffe/util/apply_bn_stats_batch_size.hpp" PERFORMANCE_CREATE_MONITOR(); @@ -147,6 +148,12 @@ void Net::Init(const NetParameter& in_param) { this->kept_bn_layers_.push_back(param.compile_net_state().kept_bn_layers(idx)); } + NetParameter param_with_stats_batch_size; + if (param.has_bn_stats_batch_size()) { + ApplyBnStatsBatchSize(param, ¶m_with_stats_batch_size); + param = param_with_stats_batch_size; + } + #ifdef USE_MLSL NetParameter param_with_mn; if (mn::is_multinode()) { diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index eaf9b6e6b..cd6cb761f 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -208,6 +208,9 @@ message NetParameter { optional string engine = 9 [default = ""]; + // Batch size used for BatchNorm statistics, 0 would use the batch size of bottom blob + optional uint32 bn_stats_batch_size = 11 [default = 0]; + // The layers that make up the net. Each of their configurations, including // connectivity and behavior, is specified as a LayerParameter. repeated LayerParameter layer = 100; // ID 100 so layers are printed last. @@ -900,6 +903,8 @@ message BatchNormParameter { optional bool bias_term = 6 [default = true]; // whether to have bias terms optional FillerParameter filler = 7; // The filler for the weight optional FillerParameter bias_filler = 8; // The filler for the bias + // Batch size used for statistics, 0 would use the batch size of bottom blob + optional uint32 stats_batch_size = 9 [default = 0]; } message SplitParameter { diff --git a/src/caffe/util/apply_bn_stats_batch_size.cpp b/src/caffe/util/apply_bn_stats_batch_size.cpp new file mode 100644 index 000000000..078cf6bc5 --- /dev/null +++ b/src/caffe/util/apply_bn_stats_batch_size.cpp @@ -0,0 +1,57 @@ +/* +All modification made by Intel Corporation: © 2017 Intel Corporation + +All contributions by the University of California: +Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +All rights reserved. + +All other contributions: +Copyright (c) 2014, 2015, the respective contributors +All rights reserved. +For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include
+#include "caffe/common.hpp"
+#include "caffe/util/apply_bn_stats_batch_size.hpp"
+
+namespace caffe {
+void ApplyBnStatsBatchSize(const NetParameter& param,
+    NetParameter* param_with_stats_batch_size) {
+  CHECK(param.has_bn_stats_batch_size());
+  param_with_stats_batch_size->CopyFrom(param);
+  param_with_stats_batch_size->clear_layer();
+  int bn_stats_batch_size = param.bn_stats_batch_size();
+  for (int i = 0; i < param.layer_size(); i++) {
+    LayerParameter *layer_param = param_with_stats_batch_size->add_layer();
+    layer_param->CopyFrom(param.layer(i));
+    if (layer_param->type() == "BatchNorm") {
+      layer_param->mutable_batch_norm_param()->set_stats_batch_size(bn_stats_batch_size);
+    }
+  }
+}
+}

From 22cba68429089db02e0efc1a6167f7b452b76659 Mon Sep 17 00:00:00 2001
From: xinanlin
Date: Fri, 18 Aug 2017 13:14:32 +0800
Subject: [PATCH 24/38] change MKLDNN LD path from relative path to absolute
 path when using raw Makefile

---
 Makefile.mkldnn | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Makefile.mkldnn b/Makefile.mkldnn
index ec1a70bc5..d113a8923 100644
--- a/Makefile.mkldnn
+++ b/Makefile.mkldnn
@@ -1,5 +1,5 @@
 CAFFE_ROOTDIR := $(shell pwd)
-MKLDNN_ROOTDIR := external/mkldnn
+MKLDNN_ROOTDIR := $(CAFFE_ROOTDIR)/external/mkldnn
 MKLDNN_TMPDIR := $(MKLDNN_ROOTDIR)/tmp
 MKLDNN_SRCDIR := $(MKLDNN_ROOTDIR)/src
 MKLDNN_BUILDDIR := $(MKLDNN_ROOTDIR)/build
@@ -22,7 +22,7 @@ ifneq (,$(findstring ccache,$(CC)))
 endif
 
 MKLDNN_GITHUB := https://github.com/01org/mkl-dnn.git
-MKLDNN_CMAKE_FLAGS += $(MKLDNN_SRCDIR) -DCMAKE_INSTALL_PREFIX=$(CAFFE_ROOTDIR)/$(MKLDNN_INSTALLDIR) -DMKLROOT=${MKL_ROOTDIR} -B$(CAFFE_ROOTDIR)/$(MKLDNN_BUILDDIR) -DCMAKE_CXX_COMPILER="$(MKLDNN_CXX)" -DCMAKE_C_COMPILER="$(MKLDNN_CC)"
+MKLDNN_CMAKE_FLAGS += $(MKLDNN_SRCDIR) -DCMAKE_INSTALL_PREFIX=$(MKLDNN_INSTALLDIR) -DMKLROOT=${MKL_ROOTDIR} -B$(MKLDNN_BUILDDIR) -DCMAKE_CXX_COMPILER="$(MKLDNN_CXX)" -DCMAKE_C_COMPILER="$(MKLDNN_CC)"
 
 ifeq ("$(wildcard $(MKLDNN_INSTALLDIR)/include/mkldnn.hpp)", "")
 mkldnn_download:
@@ -32,8 +32,8 @@ mkldnn_download:
 mkldnn_build: mkldnn_download
 	cmake $(MKLDNN_CMAKE_FLAGS)
-	make -C $(CAFFE_ROOTDIR)/$(MKLDNN_BUILDDIR) -j$(shell cat /proc/cpuinfo |grep 'processor'|wc -l)
-	make -C $(CAFFE_ROOTDIR)/$(MKLDNN_BUILDDIR) install
+	make -C $(MKLDNN_BUILDDIR) -j$(shell cat /proc/cpuinfo |grep 'processor'|wc -l)
+	make -C
$(MKLDNN_BUILDDIR) install else mkldnn_download: mkldnn_build: From 1e238296b85dbf7a7b8f57ef6e331245b78597fd Mon Sep 17 00:00:00 2001 From: "Gong, Jiong" Date: Fri, 18 Aug 2017 20:55:33 +0800 Subject: [PATCH 25/38] support bn stats batch size in mkl bn --- include/caffe/layers/mkl_layers.hpp | 18 ++- src/caffe/layers/batch_norm_layer.cpp | 2 +- src/caffe/layers/mkl_batch_norm_layer.cpp | 143 ++++++++++++++-------- 3 files changed, 109 insertions(+), 54 deletions(-) diff --git a/include/caffe/layers/mkl_layers.hpp b/include/caffe/layers/mkl_layers.hpp index 0d5d66416..c9806daee 100644 --- a/include/caffe/layers/mkl_layers.hpp +++ b/include/caffe/layers/mkl_layers.hpp @@ -481,12 +481,12 @@ class MKLBatchNormLayer : public Layer { batchNormFwd(static_cast(NULL)), batchNormFwdInference(static_cast(NULL)), batchNormBwd(static_cast(NULL)), - mean_buffer_(static_cast(NULL)), - variance_buffer_(static_cast(NULL)), scaleShift_buffer_(static_cast(NULL)), diffScaleShift_buffer_(static_cast(NULL)), layout_usr_(static_cast(NULL)), - use_global_stats_(false) + use_global_stats_(false), + num_stats_batches_(1), + stats_batch_size_(0) { PERFORMANCE_EVENT_ID_RESET(perf_id_fw_); PERFORMANCE_EVENT_ID_RESET(perf_id_bw_); @@ -515,6 +515,12 @@ class MKLBatchNormLayer : public Layer { virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); + void ForwardStatsBatch_cpu(const vector*>& bottom, + const vector*>& top, int stats_batch_idx); + void BackwardStatsBatch_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom, + int stats_batch_idx); + void Init(const vector*>& bottom, const vector*>& top); @@ -534,12 +540,14 @@ class MKLBatchNormLayer : public Layer { shared_ptr > bwd_bottom_diff; Blob temp_; dnnPrimitive_t batchNormFwd, batchNormFwdInference, batchNormBwd; - Dtype *mean_buffer_; - Dtype *variance_buffer_; + vector mean_buffers_; + vector variance_buffers_; Dtype *scaleShift_buffer_; Dtype *diffScaleShift_buffer_; dnnLayout_t layout_usr_; bool use_global_stats_; + int num_stats_batches_; + int stats_batch_size_; PERFORMANCE_EVENT_ID_DECL(perf_id_fw_); PERFORMANCE_EVENT_ID_DECL(perf_id_bw_); diff --git a/src/caffe/layers/batch_norm_layer.cpp b/src/caffe/layers/batch_norm_layer.cpp index dada5873d..8331dd7d7 100644 --- a/src/caffe/layers/batch_norm_layer.cpp +++ b/src/caffe/layers/batch_norm_layer.cpp @@ -218,7 +218,7 @@ void BatchNormLayer::ForwardStatsBatch_cpu(const vector*>& bo this->blobs_[2]->mutable_cpu_data()[0] += 1; caffe_cpu_axpby(mean_.count(), Dtype(1), mean_.cpu_data(), moving_average_fraction_, this->blobs_[0]->mutable_cpu_data()); - int m = bottom[0]->count()/channels_; + int m = bottom[0]->count()/num_stats_batches_/channels_; Dtype bias_correction_factor = m > 1 ? 
Dtype(m)/(m-1) : 1; caffe_cpu_axpby(variance_.count(), bias_correction_factor, variance_.cpu_data(), moving_average_fraction_, diff --git a/src/caffe/layers/mkl_batch_norm_layer.cpp b/src/caffe/layers/mkl_batch_norm_layer.cpp index 6dce50243..a24500c69 100755 --- a/src/caffe/layers/mkl_batch_norm_layer.cpp +++ b/src/caffe/layers/mkl_batch_norm_layer.cpp @@ -52,8 +52,12 @@ MKLBatchNormLayer::~MKLBatchNormLayer() { dnnDelete(batchNormFwdInference); dnnDelete(batchNormBwd); dnnLayoutDelete(layout_usr_); - dnnReleaseBuffer(mean_buffer_); - dnnReleaseBuffer(variance_buffer_); + for (int i = 0; i < mean_buffers_.size(); i++) { + dnnReleaseBuffer(mean_buffers_[i]); + } + for (int i = 0; i < variance_buffers_.size(); i++) { + dnnReleaseBuffer(variance_buffers_[i]); + } dnnReleaseBuffer(scaleShift_buffer_); dnnReleaseBuffer(diffScaleShift_buffer_); } @@ -71,6 +75,15 @@ void MKLBatchNormLayer::Init(const vector*>& bottom, if (this->layer_param_.batch_norm_param().has_use_global_stats()) use_global_stats_ = this->layer_param_.batch_norm_param().use_global_stats(); + num_stats_batches_ = 1; + stats_batch_size_ = bottom[0]->shape(0); + BatchNormParameter param = this->layer_param_.batch_norm_param(); + if (!use_global_stats_ && param.stats_batch_size() > 0) { + CHECK_EQ(bottom[0]->shape(0) % param.stats_batch_size(), 0); + num_stats_batches_ = bottom[0]->shape(0) / param.stats_batch_size(); + stats_batch_size_ = param.stats_batch_size(); + } + CHECK(use_weight_bias_) << "BatchNorm without scaling have not supported yet"; size_t dim = 4, sizes[4], strides[4]; @@ -99,18 +112,25 @@ void MKLBatchNormLayer::Init(const vector*>& bottom, // TODO: Make a cleanup routine to avoid // copy of following code in the Destructor - dnnError_t e; - dnnLayoutDelete(layout_usr_); - e = dnnLayoutCreate(&layout_usr_, dim, sizes, strides); - CHECK_EQ(e, E_SUCCESS); - fwd_bottom_data->create_user_layout(dim, sizes, strides, false); fwd_top_data ->create_user_layout(dim, sizes, strides, false); bwd_bottom_diff->create_user_layout(dim, sizes, strides, false); bwd_top_diff ->create_user_layout(dim, sizes, strides, false); - dnnReleaseBuffer(mean_buffer_); - dnnReleaseBuffer(variance_buffer_); + sizes[3] /= num_stats_batches_; + dnnError_t e; + dnnLayoutDelete(layout_usr_); + e = dnnLayoutCreate(&layout_usr_, dim, sizes, strides); + CHECK_EQ(e, E_SUCCESS); + + for (int i = 0; i < mean_buffers_.size(); i++) { + dnnReleaseBuffer(mean_buffers_[i]); + } + for (int i = 0; i < variance_buffers_.size(); i++) { + dnnReleaseBuffer(variance_buffers_[i]); + } + mean_buffers_.resize(num_stats_batches_, NULL); + variance_buffers_.resize(num_stats_batches_, NULL); dnnReleaseBuffer(scaleShift_buffer_); dnnReleaseBuffer(diffScaleShift_buffer_); @@ -223,26 +243,30 @@ void MKLBatchNormLayer::Reshape(const vector*>& bottom, strides[2] = sizes[0]*sizes[1]; strides[3] = sizes[0]*sizes[1]*sizes[2]; - dnnError_t e; - dnnLayoutDelete(layout_usr_); - e = dnnLayoutCreate(&layout_usr_, dim, sizes, strides); - CHECK_EQ(e, E_SUCCESS); fwd_bottom_data->create_user_layout(dim, sizes, strides, false); fwd_top_data ->create_user_layout(dim, sizes, strides, false); bwd_bottom_diff->create_user_layout(dim, sizes, strides, false); bwd_top_diff ->create_user_layout(dim, sizes, strides, false); + + sizes[3] /= num_stats_batches_; + dnnError_t e; + dnnLayoutDelete(layout_usr_); + e = dnnLayoutCreate(&layout_usr_, dim, sizes, strides); + CHECK_EQ(e, E_SUCCESS); } } template -void MKLBatchNormLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { +void 
MKLBatchNormLayer::ForwardStatsBatch_cpu(const vector*>& bottom, + const vector*>& top, int stats_batch_idx) { + long data_offset = stats_batch_idx * stats_batch_size_ * bottom[0]->count(1); void* bottom_data = reinterpret_cast(const_cast(bottom[0]->prv_data())); int is_first_pass = 0; - unsigned int amount_to_copy =0; + long amount_to_copy =0; - if (NULL != bottom_data) { + // TODO: support private memory with num_stats_batches_ > 1 + if (NULL != bottom_data && num_stats_batches_ == 1) { amount_to_copy = bottom[0]->prv_data_count(); // Is it the first pass? Create a primitive. if (batchNormFwd == NULL) { @@ -311,7 +335,7 @@ void MKLBatchNormLayer::Forward_cpu( } bottom_data = reinterpret_cast(const_cast(bottom[0]->cpu_data())); - amount_to_copy = bottom[0]->count(); + amount_to_copy = bottom[0]->count() / num_stats_batches_; } if (is_first_pass == 1) { dnnError_t e; @@ -319,18 +343,22 @@ void MKLBatchNormLayer::Forward_cpu( e = dnnLayoutCreateFromPrimitive( &mean_buffer_l, batchNormFwd, dnnResourceMean); CHECK_EQ(e, E_SUCCESS); - e = dnnAllocateBuffer( - reinterpret_cast(&mean_buffer_), mean_buffer_l); - CHECK_EQ(e, E_SUCCESS); + for (int i = 0; i < num_stats_batches_; i++) { + e = dnnAllocateBuffer( + reinterpret_cast(&mean_buffers_[i]), mean_buffer_l); + CHECK_EQ(e, E_SUCCESS); + } dnnLayoutDelete(mean_buffer_l); dnnLayout_t variance_buffer_l = NULL; e = dnnLayoutCreateFromPrimitive( &variance_buffer_l, batchNormFwd, dnnResourceVariance); CHECK_EQ(e, E_SUCCESS); - e = dnnAllocateBuffer( - reinterpret_cast(&variance_buffer_), variance_buffer_l); - CHECK_EQ(e, E_SUCCESS); + for (int i = 0; i < num_stats_batches_; i++) { + e = dnnAllocateBuffer( + reinterpret_cast(&variance_buffers_[i]), variance_buffer_l); + CHECK_EQ(e, E_SUCCESS); + } dnnLayoutDelete(variance_buffer_l); dnnLayout_t diffScaleShift_buffer_l = NULL; @@ -374,8 +402,8 @@ void MKLBatchNormLayer::Forward_cpu( // Note that this is only necessary for Backward; we skip this if not // doing Backward // TODO: make a caffe_coppy working on blobs - caffe_copy(amount_to_copy, static_cast(bottom_data), - temp_.mutable_cpu_data()); + caffe_copy(amount_to_copy, static_cast(bottom_data) + data_offset, + temp_.mutable_cpu_data() + data_offset); } if (use_global_stats_) { @@ -383,24 +411,25 @@ void MKLBatchNormLayer::Forward_cpu( const Dtype scale_factor = this->blobs_[2]->cpu_data()[0] == 0 ? 
0 : 1 / this->blobs_[2]->cpu_data()[0]; caffe_cpu_scale(this->blobs_[0]->count(), scale_factor, - this->blobs_[0]->cpu_data(), mean_buffer_); + this->blobs_[0]->cpu_data(), mean_buffers_[stats_batch_idx]); caffe_cpu_scale(this->blobs_[1]->count(), scale_factor, - this->blobs_[1]->cpu_data(), variance_buffer_); + this->blobs_[1]->cpu_data(), variance_buffers_[stats_batch_idx]); } dnnError_t e; void* BatchNorm_res[dnnResourceNumber]; - BatchNorm_res[dnnResourceMean] = mean_buffer_; - BatchNorm_res[dnnResourceVariance] = variance_buffer_; - BatchNorm_res[dnnResourceSrc] = bottom_data; + BatchNorm_res[dnnResourceMean] = mean_buffers_[stats_batch_idx]; + BatchNorm_res[dnnResourceVariance] = variance_buffers_[stats_batch_idx]; + BatchNorm_res[dnnResourceSrc] = (Dtype*)bottom_data + data_offset; BatchNorm_res[dnnResourceScaleShift] = scaleShift_buffer_; if (fwd_top_data->conversion_needed()) { top[0]->set_prv_data_descriptor(fwd_top_data); + data_offset = stats_batch_idx * (top[0]->prv_data_count() / num_stats_batches_); BatchNorm_res[dnnResourceDst] = - reinterpret_cast(top[0]->mutable_prv_data()); + reinterpret_cast(top[0]->mutable_prv_data() + data_offset); } else { BatchNorm_res[dnnResourceDst] = - reinterpret_cast(top[0]->mutable_cpu_data()); + reinterpret_cast(top[0]->mutable_cpu_data() + data_offset); DLOG(INFO) << "Using cpu_data for top in DnnBatchNorm."; } @@ -415,20 +444,21 @@ void MKLBatchNormLayer::Forward_cpu( // compute and save moving average this->blobs_[2]->mutable_cpu_data()[0] *= moving_average_fraction_; this->blobs_[2]->mutable_cpu_data()[0] += 1; - caffe_cpu_axpby(this->blobs_[0]->count(), Dtype(1), mean_buffer_, + caffe_cpu_axpby(this->blobs_[0]->count(), Dtype(1), mean_buffers_[stats_batch_idx], moving_average_fraction_, this->blobs_[0]->mutable_cpu_data()); - int m = bottom[0]->count()/channels_; + int m = bottom[0]->count()/num_stats_batches_/channels_; Dtype bias_correction_factor = m > 1 ? 
Dtype(m)/(m-1) : 1; caffe_cpu_axpby(this->blobs_[1]->count(), bias_correction_factor, - variance_buffer_, moving_average_fraction_, + variance_buffers_[stats_batch_idx], moving_average_fraction_, this->blobs_[1]->mutable_cpu_data()); } } template -void MKLBatchNormLayer::Backward_cpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { +void MKLBatchNormLayer::BackwardStatsBatch_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom, + int stats_batch_idx) { + long data_offset = stats_batch_idx * stats_batch_size_ * bottom[0]->count(1); void *bottom_data = NULL; if (bottom[0] == top[0]) { bottom_data = reinterpret_cast( @@ -437,7 +467,7 @@ void MKLBatchNormLayer::Backward_cpu( bottom_data = reinterpret_cast( const_cast(bottom[0]->prv_data())); - if (NULL == bottom_data) + if (NULL == bottom_data || num_stats_batches_ > 1) bottom_data = reinterpret_cast( const_cast(bottom[0]->cpu_data())); @@ -445,19 +475,19 @@ void MKLBatchNormLayer::Backward_cpu( dnnError_t e; void* BatchNorm_res[dnnResourceNumber]; - BatchNorm_res[dnnResourceMean] = mean_buffer_; - BatchNorm_res[dnnResourceVariance] = variance_buffer_; - BatchNorm_res[dnnResourceSrc] = bottom_data; + BatchNorm_res[dnnResourceMean] = mean_buffers_[stats_batch_idx]; + BatchNorm_res[dnnResourceVariance] = variance_buffers_[stats_batch_idx]; + BatchNorm_res[dnnResourceSrc] = (Dtype*)bottom_data + data_offset; BatchNorm_res[dnnResourceScaleShift] = scaleShift_buffer_; BatchNorm_res[dnnResourceDiffScaleShift] = diffScaleShift_buffer_; - - BatchNorm_res[dnnResourceDiffDst] = bwd_top_diff->get_converted_prv(top[0], - true); + BatchNorm_res[dnnResourceDiffDst] = + bwd_top_diff->get_converted_prv(top[0], true) + data_offset; if (bwd_bottom_diff->conversion_needed()) { bottom[0]->set_prv_diff_descriptor(bwd_bottom_diff); - BatchNorm_res[dnnResourceDiffSrc] = bottom[0]->mutable_prv_diff(); + data_offset = stats_batch_idx * (bottom[0]->prv_diff_count() / num_stats_batches_); + BatchNorm_res[dnnResourceDiffSrc] = bottom[0]->mutable_prv_diff() + data_offset; } else { - BatchNorm_res[dnnResourceDiffSrc] = bottom[0]->mutable_cpu_diff(); + BatchNorm_res[dnnResourceDiffSrc] = bottom[0]->mutable_cpu_diff() + data_offset; } PERFORMANCE_EVENT_ID_INIT(perf_id_bw_, PERFORMANCE_MKL_NAME("BW")); @@ -479,6 +509,23 @@ void MKLBatchNormLayer::Backward_cpu( } } +template +void MKLBatchNormLayer::Forward_cpu( + const vector*>& bottom, const vector*>& top) { + for (int i = 0; i < num_stats_batches_; i++) { + ForwardStatsBatch_cpu(bottom, top, i); + } +} + +template +void MKLBatchNormLayer::Backward_cpu( + const vector*>& top, const vector& propagate_down, + const vector*>& bottom) { + for (int i = 0; i < num_stats_batches_; i++) { + BackwardStatsBatch_cpu(top, propagate_down, bottom, i); + } +} + #ifdef CPU_ONLY STUB_GPU(MKLBatchNormLayer); From f4c0f7758f81c613d7be388aa88000bb10654ef1 Mon Sep 17 00:00:00 2001 From: "Gong, Jiong" Date: Sat, 19 Aug 2017 19:15:36 +0800 Subject: [PATCH 26/38] enable bn stats batch size in mkldnn --- include/caffe/layers/mkldnn_layers.hpp | 12 +- include/caffe/mkldnn_memory.hpp | 3 + src/caffe/layers/mkldnn_batch_norm_layer.cpp | 266 ++++++++++++------- src/caffe/mkldnn_memory.cpp | 26 ++ 4 files changed, 209 insertions(+), 98 deletions(-) diff --git a/include/caffe/layers/mkldnn_layers.hpp b/include/caffe/layers/mkldnn_layers.hpp index f63301e2a..bf23438bd 100644 --- a/include/caffe/layers/mkldnn_layers.hpp +++ b/include/caffe/layers/mkldnn_layers.hpp @@ -68,7 +68,6 @@ class 
MKLDNNBatchNormLayer : public MKLDNNLayer, public Layer { , fwd_top_data(), fwd_bottom_data() , bwd_top_diff(), bwd_bottom_diff() , BatchNormFwd_pd(), BatchNormBwd_pd() - , mean_memory(), variance_memory() , scaleshift_memory(), bwd_scaleshift_diff_memory() , output_memory(), bwd_bottom_diff_memory() , input_primitive(), bwd_top_diff_primitive() @@ -96,22 +95,29 @@ class MKLDNNBatchNormLayer : public MKLDNNLayer, public Layer { void InitBatchNormBwd(const vector*>& top, const vector& propagate_down, const vector*>& bottom); + void InitBatchNormFwdPrimitive(int stats_batch_idx); + void InitBatchNormBwdPrimitive(int stats_batch_idx); + template shared_ptr GetStatsBatchMemory( + shared_ptr > mkldnn_data, int idx); shared_ptr > fwd_top_data, fwd_bottom_data; shared_ptr > bwd_top_diff, bwd_bottom_diff; shared_ptr BatchNormFwd_pd; shared_ptr BatchNormBwd_pd; - MKLDNNPrimitive BatchNormFwd, BatchNormBwd; - shared_ptr mean_memory, variance_memory; + vector > BatchNormFwd, BatchNormBwd; + vector > mean_memory, variance_memory; shared_ptr scaleshift_memory, bwd_scaleshift_diff_memory; shared_ptr output_memory, bwd_bottom_diff_memory; + vector > input_stats, output_stats, top_diff_stats, bottom_diff_stats; shared_ptr input_primitive, bwd_top_diff_primitive; int32_t num_, width_, height_, channels_; Dtype eps_, moving_average_fraction_; bool use_weight_bias_, bias_term_, use_global_stats_; + int num_stats_batches_; + int stats_batch_size_; PERFORMANCE_EVENT_ID_DECL(perf_id_fw_); PERFORMANCE_EVENT_ID_DECL(perf_id_bw_); diff --git a/include/caffe/mkldnn_memory.hpp b/include/caffe/mkldnn_memory.hpp index a59ce6e12..3b1a1c6ad 100644 --- a/include/caffe/mkldnn_memory.hpp +++ b/include/caffe/mkldnn_memory.hpp @@ -94,6 +94,7 @@ class MKLDNNMemoryDescriptorBase : public PrvMemDescr if (_prv_memory == NULL) allocate(); return _internal_ptr; } + shared_ptr reorder_usr2prv() { return _reorder_usr2prv.aprimitive; } shared_ptr reorder_prv2usr() { return _reorder_prv2usr.aprimitive; } shared_ptr reorder_extprv2prv() { return _reorder_extprv2prv.aprimitive; } @@ -201,6 +202,8 @@ class MKLDNNMemoryDescriptor : public MKLDNNMemoryDescriptorBase { shared_ptr create_output_memory(Blob * blob, bool inplace = false); shared_ptr create_input(bool set_prv_ptr); shared_ptr create_output_memory(bool inplace = false); + Dtype* get_memory_ptr(long offset = 0); + shared_ptr get_memory_desc(); void set_mkldnn_primitive(MKLDNNPrimitive& mprimitive) { CHECK(mprimitive.aprimitive); _mkldnn_primitive = mprimitive; } MKLDNNPrimitive& mkldnn_primitive() { return _mkldnn_primitive; } diff --git a/src/caffe/layers/mkldnn_batch_norm_layer.cpp b/src/caffe/layers/mkldnn_batch_norm_layer.cpp index 4db92b943..6688f8584 100644 --- a/src/caffe/layers/mkldnn_batch_norm_layer.cpp +++ b/src/caffe/layers/mkldnn_batch_norm_layer.cpp @@ -130,6 +130,15 @@ void MKLDNNBatchNormLayer::Reshape(const vector*>& bottom this->num_ = bottom[0]->num(); this->channels_ = bottom[0]->channels(); + num_stats_batches_ = 1; + stats_batch_size_ = bottom[0]->shape(0); + BatchNormParameter param = this->layer_param_.batch_norm_param(); + if (!use_global_stats_ && param.stats_batch_size() > 0) { + CHECK_EQ(bottom[0]->shape(0) % param.stats_batch_size(), 0); + num_stats_batches_ = bottom[0]->shape(0) / param.stats_batch_size(); + stats_batch_size_ = param.stats_batch_size(); + } + //Fix: should reshape the top blob with the real size of bottom blob //top[0]->Reshape(this->num_, this->channels_, this->height_, this->width_); #ifdef DEBUG @@ -159,8 +168,9 @@ void 
MKLDNNBatchNormLayer::InitBatchNorm(const vector*>& bott memory::data_type mpcsn = memory::data_type::f32; // ---- Initialize memory descriptors ------------- - shared_ptr input_md, output_md, scaleshift_md; - shared_ptr usr_mpd, prv_mpd, scaleshift_mpd; + shared_ptr input_md, input_stats_md, output_md, scaleshift_md; + shared_ptr usr_mpd, prv_mpd; + shared_ptr scaleshift_mpd; if (bottom_data_is_prv) { shared_ptr > mem_descr = get_mkldnn_prv_descriptor(bottom[0]); @@ -172,9 +182,13 @@ void MKLDNNBatchNormLayer::InitBatchNorm(const vector*>& bott usr_mpd.reset(new memory::primitive_desc(*input_md, cpu_engine)); } output_md = input_md; + input_stats_md.reset(new memory::desc(*input_md)); + CHECK(input_stats_md->data.ndims > 0 && + input_stats_md->data.dims[0] == this->num_); + input_stats_md->data.dims[0] = stats_batch_size_; // ---- Initialize BatchNorm primitive descriptor ------------- - batch_normalization_forward::desc BatchNormFwd_desc(propagation, *input_md, eps_, flags); + batch_normalization_forward::desc BatchNormFwd_desc(propagation, *input_stats_md, eps_, flags); // ---- Determining engine to use ----------------------- std::string subengines = this->layer_param_.engine(); if (subengines == "" || subengines == "MKLDNN") @@ -206,44 +220,13 @@ void MKLDNNBatchNormLayer::InitBatchNorm(const vector*>& bott fwd_top_data.reset(new MKLDNNData(usr_mpd, prv_mpd, top[0], this)); output_memory = fwd_top_data->create_output_memory(); - // ---- Create BatchNorm -------------------- - if (this->phase_ == TEST && !use_global_stats_) { - if (use_weight_bias_) { - BatchNormFwd.reset(new batch_normalization_forward(*BatchNormFwd_pd, - *input_primitive, *scaleshift_memory, *output_memory)); - } else { - BatchNormFwd.reset(new batch_normalization_forward(*BatchNormFwd_pd, - *input_primitive, *output_memory)); - } - } else { - mean_memory.reset(new memory(BatchNormFwd_pd->mean_primitive_desc())); - variance_memory.reset(new memory(BatchNormFwd_pd->variance_primitive_desc())); - - if (use_global_stats_) { - caffe_copy(this->channels_, this->blobs_[0]->cpu_data(), - static_cast(mean_memory->get_data_handle())); - caffe_copy(this->channels_, this->blobs_[1]->cpu_data(), - static_cast(variance_memory->get_data_handle())); - if (use_weight_bias_) { - BatchNormFwd.reset(new batch_normalization_forward(*BatchNormFwd_pd, - *input_primitive, (const primitive::at)*mean_memory, - (const primitive::at)*variance_memory, *scaleshift_memory, - *output_memory)); - } else { - BatchNormFwd.reset(new batch_normalization_forward(*BatchNormFwd_pd, - *input_primitive, (const primitive::at)*mean_memory, - (const primitive::at)*variance_memory, *output_memory)); - } - } else { - if (use_weight_bias_) { - BatchNormFwd.reset(new batch_normalization_forward(*BatchNormFwd_pd, - *input_primitive, *scaleshift_memory, *output_memory, - *mean_memory, *variance_memory)); - } else { - BatchNormFwd.reset(new batch_normalization_forward(*BatchNormFwd_pd, - *input_primitive, *output_memory, *mean_memory, *variance_memory)); - } - } + mean_memory.resize(num_stats_batches_); + variance_memory.resize(num_stats_batches_); + input_stats.resize(num_stats_batches_); + output_stats.resize(num_stats_batches_); + BatchNormFwd.resize(num_stats_batches_); + for (int i = 0; i < num_stats_batches_; i++) { + InitBatchNormFwdPrimitive(i); } //fwd_bottom_data->set_mkldnn_primitive(BatchNormFwd); //Wrong passed primitive! (TODO: Checking!) 
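
// The stats descriptors built above are ordinary MKL-DNN descriptors whose
// batch dimension is overwritten before the primitive is created. A
// self-contained sketch with made-up sizes (N = 32 split into stats batches
// of 8), using only constructors that already appear in this patch; the
// helper name and shapes are illustrative, not part of the patch:
static mkldnn::memory stats_batch_view_sketch(float* base_ptr, int idx) {
  using namespace mkldnn;
  engine cpu_engine(engine::cpu, 0);
  memory::desc md({32, 64, 28, 28}, memory::data_type::f32,
                  memory::format::nchw);  // full-minibatch layout, N = 32
  md.data.dims[0] = 8;                    // view one stats batch instead
  memory::primitive_desc mpd(md, cpu_engine);
  // Alias slice idx of the user buffer, as GetStatsBatchMemory below does.
  return memory(mpd, base_ptr + (long)idx * 8 * 64 * 28 * 28);
}
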
@@ -272,6 +255,70 @@ void MKLDNNBatchNormLayer::InitBatchNorm(const vector*>& bott } } +template +template +shared_ptr MKLDNNBatchNormLayer::GetStatsBatchMemory( + shared_ptr > mkldnn_mem, int idx) { + long data_offset = + idx * stats_batch_size_ * this->channels_ * this->width_ * this->height_; + engine cpu_engine = CpuEngine::Instance().get_engine(); + shared_ptr stats_md = mkldnn_mem->get_memory_desc(); + CHECK(stats_md->data.ndims > 0 && + stats_md->data.dims[0] == this->num_); + stats_md->data.dims[0] = stats_batch_size_; + shared_ptr stats_mpd( + new memory::primitive_desc(*stats_md, cpu_engine)); + shared_ptr stats( + new memory(*stats_mpd, mkldnn_mem->get_memory_ptr(data_offset))); + return stats; +} + +template +void MKLDNNBatchNormLayer::InitBatchNormFwdPrimitive(int idx) { + input_stats[idx] = GetStatsBatchMemory(fwd_bottom_data, idx); + output_stats[idx] = GetStatsBatchMemory(fwd_top_data, idx); + + // ---- Create BatchNorm -------------------- + if (this->phase_ == TEST && !use_global_stats_) { + if (use_weight_bias_) { + BatchNormFwd[idx].reset(new batch_normalization_forward(*BatchNormFwd_pd, + *input_stats[idx], *scaleshift_memory, + *output_stats[idx])); + } else { + BatchNormFwd[idx].reset(new batch_normalization_forward(*BatchNormFwd_pd, + *input_stats[idx], *output_stats[idx])); + } + } else { + mean_memory[idx].reset(new memory(BatchNormFwd_pd->mean_primitive_desc())); + variance_memory[idx].reset(new memory(BatchNormFwd_pd->variance_primitive_desc())); + + if (use_global_stats_) { + caffe_copy(this->channels_, this->blobs_[0]->cpu_data(), + static_cast(mean_memory[idx]->get_data_handle())); + caffe_copy(this->channels_, this->blobs_[1]->cpu_data(), + static_cast(variance_memory[idx]->get_data_handle())); + if (use_weight_bias_) { + BatchNormFwd[idx].reset(new batch_normalization_forward(*BatchNormFwd_pd, + *input_stats[idx], *mean_memory[idx], + *variance_memory[idx], *scaleshift_memory, + *output_stats[idx])); + } else { + BatchNormFwd[idx].reset(new batch_normalization_forward(*BatchNormFwd_pd, + *input_stats[idx], *mean_memory[idx], + *variance_memory[idx], *output_stats[idx])); + } + } else { + if (use_weight_bias_) { + BatchNormFwd[idx].reset(new batch_normalization_forward(*BatchNormFwd_pd, + *input_stats[idx], *scaleshift_memory, *output_stats[idx], + *mean_memory[idx], *variance_memory[idx])); + } else { + BatchNormFwd[idx].reset(new batch_normalization_forward(*BatchNormFwd_pd, + *input_stats[idx], *output_stats[idx], *mean_memory[idx], *variance_memory[idx])); + } + } + } +} template void MKLDNNBatchNormLayer::Forward_cpu(const vector*>& bottom @@ -289,20 +336,21 @@ void MKLDNNBatchNormLayer::Forward_cpu(const vector*>& bottom // update top that head at prv fwd_top_data->sync_before_write(); - if (use_global_stats_) { + for (int stats_batch_idx = 0; stats_batch_idx < num_stats_batches_; stats_batch_idx++) { + if (use_global_stats_) { // use the stored mean/variance estimates. const Dtype scale_factor = this->blobs_[2]->cpu_data()[0] == 0 ? 
0 : 1 / this->blobs_[2]->cpu_data()[0]; - Dtype *mean_buffer_ = (Dtype *)(mean_memory->get_data_handle()); - Dtype *variance_buffer_ = (Dtype *)(variance_memory->get_data_handle()); + Dtype *mean_buffer_ = (Dtype *)(mean_memory[stats_batch_idx]->get_data_handle()); + Dtype *variance_buffer_ = (Dtype *)(variance_memory[stats_batch_idx]->get_data_handle()); //TODO: optimize, do this operation in the InitBatchNorm, so no need to calculate each time caffe_cpu_scale(this->blobs_[0]->count(), scale_factor, this->blobs_[0]->cpu_data(), mean_buffer_); caffe_cpu_scale(this->blobs_[1]->count(), scale_factor, this->blobs_[1]->cpu_data(), variance_buffer_); - } - if (use_weight_bias_) { + } + if (use_weight_bias_) { Dtype* scaleShift_buffer_ = (Dtype *)(scaleshift_memory->get_data_handle()); // Fill ScaleShift buffer for (int i = 0; i < this->channels_; i++) { @@ -312,26 +360,27 @@ void MKLDNNBatchNormLayer::Forward_cpu(const vector*>& bottom scaleShift_buffer_[channels_ + i] = this->blobs_[4]->cpu_data()[i]; } } - } + } - PERFORMANCE_EVENT_ID_INIT(perf_id_fw_, PERFORMANCE_MKLDNN_NAME("FW")); - PERFORMANCE_MEASUREMENT_BEGIN(); - BatchNormFwd.submit(); - PERFORMANCE_MEASUREMENT_END_ID(perf_id_fw_); + PERFORMANCE_EVENT_ID_INIT(perf_id_fw_, PERFORMANCE_MKLDNN_NAME("FW")); + PERFORMANCE_MEASUREMENT_BEGIN(); + BatchNormFwd[stats_batch_idx].submit(); + PERFORMANCE_MEASUREMENT_END_ID(perf_id_fw_); - if (this->phase_ == TRAIN && !use_global_stats_) { + if (this->phase_ == TRAIN && !use_global_stats_) { // compute and save moving average - Dtype *mean_buffer_ = (Dtype *)(mean_memory->get_data_handle()); - Dtype *variance_buffer_ = (Dtype *)(variance_memory->get_data_handle()); + Dtype *mean_buffer_ = (Dtype *)(mean_memory[stats_batch_idx]->get_data_handle()); + Dtype *variance_buffer_ = (Dtype *)(variance_memory[stats_batch_idx]->get_data_handle()); this->blobs_[2]->mutable_cpu_data()[0] *= moving_average_fraction_; this->blobs_[2]->mutable_cpu_data()[0] += 1; caffe_cpu_axpby(this->channels_, Dtype(1), mean_buffer_, moving_average_fraction_, this->blobs_[0]->mutable_cpu_data()); - int m = bottom[0]->count()/channels_; + int m = bottom[0]->count()/num_stats_batches_/channels_; Dtype bias_correction_factor = m > 1 ? Dtype(m)/(m-1) : 1; caffe_cpu_axpby(this->channels_, bias_correction_factor, variance_buffer_, moving_average_fraction_, this->blobs_[1]->mutable_cpu_data()); + } } } @@ -359,7 +408,7 @@ void MKLDNNBatchNormLayer::InitBatchNormBwd( memory::data_type mpcsn = memory::data_type::f32; // ---- Initialize memory descriptors ------------- - shared_ptr top_diff_md, top_data_md; + shared_ptr top_diff_md, top_diff_stats_md, top_data_md, output_stats_md; shared_ptr usr_diff_mpd(NULL), prv_diff_mpd(NULL); if (top_diff_is_prv) { shared_ptr > mem_descr @@ -371,10 +420,18 @@ void MKLDNNBatchNormLayer::InitBatchNormBwd( top_diff_md.reset(new memory::desc({{n, c, h, w}}, mpcsn, memory::format::nchw)); //MKLDNN batch norm only support 4D memory descriptor! 
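With the stats-batch loop in Forward_cpu, the running mean and variance are updated once per group, and m is now the per-group sample count (bottom[0]->count() / num_stats_batches_ / channels_). A sketch of what one iteration of that update computes, equivalent to the caffe_cpu_axpby calls above (plain C++, illustrative names):

// One running-statistics update; caffe_cpu_axpby(n, a, x, b, y) computes
// y = a*x + b*y, which is exactly the per-channel loop below.
void update_running_stats(float* running_mean, float* running_var,
                          const float* batch_mean, const float* batch_var,
                          int channels, int m, float moving_average_fraction) {
  // m samples per channel in one stats batch; m/(m-1) converts the biased
  // batch variance into an unbiased estimate before it enters the average.
  const float bias_correction = (m > 1) ? static_cast<float>(m) / (m - 1) : 1.0f;
  for (int c = 0; c < channels; ++c) {
    running_mean[c] = batch_mean[c] + moving_average_fraction * running_mean[c];
    running_var[c] = bias_correction * batch_var[c]
                     + moving_average_fraction * running_var[c];
  }
}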
usr_diff_mpd.reset(new memory::primitive_desc(*top_diff_md, cpu_engine)); } + top_diff_stats_md.reset(new memory::desc(*top_diff_md)); + CHECK(top_diff_stats_md->data.ndims > 0 && + top_diff_stats_md->data.dims[0] == this->num_); + top_diff_stats_md->data.dims[0] = stats_batch_size_; + output_stats_md.reset(new memory::desc(output_memory->get_primitive_desc().desc())); + CHECK(output_stats_md->data.ndims > 0 && + output_stats_md->data.dims[0] == this->num_); + output_stats_md->data.dims[0] = stats_batch_size_; // ---- Initialize bnrm primitive descriptor ------------- batch_normalization_backward::desc BatchNormBwd_desc(prop_kind::backward, - *top_diff_md, output_memory->get_primitive_desc().desc(), eps_, + *top_diff_stats_md, *output_stats_md, eps_, flags); // ---- Determining engine to use ----------------------- std::string subengines = this->layer_param_.engine(); @@ -396,6 +453,11 @@ void MKLDNNBatchNormLayer::InitBatchNormBwd( CHECK(BatchNormBwd_pd); + if (use_weight_bias_) { + bwd_scaleshift_diff_memory.reset(new memory( + BatchNormFwd_pd->weights_primitive_desc())); + } + // --- init primitive and prv_memory descriptors ---------------------- bwd_top_diff.reset(new MKLDNNDiff(usr_diff_mpd, prv_diff_mpd, top[0], this)); bwd_top_diff->name = "bwd_top_diff_data @ " + this->layer_param_.name(); @@ -405,17 +467,11 @@ void MKLDNNBatchNormLayer::InitBatchNormBwd( bwd_bottom_diff->name = "bwd_bottom_diff_data @ " + this->layer_param_.name(); bwd_bottom_diff_memory = bwd_bottom_diff->create_output_memory(inplace); - if (use_weight_bias_) { - bwd_scaleshift_diff_memory.reset(new memory( - BatchNormFwd_pd->weights_primitive_desc())); - BatchNormBwd.reset(new batch_normalization_backward(*BatchNormBwd_pd, - *input_primitive, *mean_memory, *variance_memory, - *bwd_top_diff_primitive, *scaleshift_memory, - *bwd_bottom_diff_memory, *bwd_scaleshift_diff_memory)); - } else { - BatchNormBwd.reset(new batch_normalization_backward(*BatchNormBwd_pd, - *input_primitive, *mean_memory, *variance_memory, - *bwd_top_diff_primitive, *bwd_bottom_diff_memory)); + top_diff_stats.resize(num_stats_batches_); + bottom_diff_stats.resize(num_stats_batches_); + BatchNormBwd.resize(num_stats_batches_); + for (int i = 0; i < num_stats_batches_; i++) { + InitBatchNormBwdPrimitive(i); } //bwd_top_diff->set_mkldnn_primitive(BatchNormBwd); //Wrong passed primitive! (TODO: Checking!) 
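Note that bwd_scaleshift_diff_memory is now allocated once, before the per-group backward primitives are built, so every BatchNormBwd[i] writes its scale/shift gradients into the same 2*channels buffer; the Backward_cpu hunk further on accordingly switches the weight diffs from assignment to accumulation. A sketch of that pattern, where submit_backward is a hypothetical stand-in for submitting BatchNormBwd[i]:

#include <vector>

// Illustrative accumulation across stats batches: each submit overwrites
// the shared scale/shift gradient buffer, so its contents are folded into
// the persistent weight diffs before the next group runs.
void accumulate_scaleshift_diffs(int num_stats_batches, int channels,
                                 float* scale_diff, float* shift_diff,
                                 void (*submit_backward)(int, float*)) {
  std::vector<float> scaleshift_diff(2 * channels);
  for (int i = 0; i < num_stats_batches; ++i) {
    submit_backward(i, scaleshift_diff.data());  // runs one backward group
    for (int c = 0; c < channels; ++c) {
      scale_diff[c] += scaleshift_diff[c];             // blobs_[3] diff
      shift_diff[c] += scaleshift_diff[channels + c];  // blobs_[4] diff
    }
  }
}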
@@ -427,6 +483,23 @@ void MKLDNNBatchNormLayer::InitBatchNormBwd( bwd_bottom_diff->set_mkldnn_primitive(bwd_bottom_diff_memory_transfer); } +template +void MKLDNNBatchNormLayer::InitBatchNormBwdPrimitive(int idx) { + top_diff_stats[idx] = GetStatsBatchMemory(bwd_top_diff, idx); + bottom_diff_stats[idx] = GetStatsBatchMemory(bwd_bottom_diff, idx); + + if (use_weight_bias_) { + BatchNormBwd[idx].reset(new batch_normalization_backward(*BatchNormBwd_pd, + *input_stats[idx], *mean_memory[idx], *variance_memory[idx], + *top_diff_stats[idx], *scaleshift_memory, + *bottom_diff_stats[idx], *bwd_scaleshift_diff_memory)); + } else { + BatchNormBwd[idx].reset(new batch_normalization_backward(*BatchNormBwd_pd, + *input_stats[idx], *mean_memory[idx], *variance_memory[idx], + *top_diff_stats[idx], *bottom_diff_stats[idx])); + } +} + template void MKLDNNBatchNormLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) @@ -443,53 +516,56 @@ void MKLDNNBatchNormLayer::Backward_cpu(const vector*>& top, // update bottom that head at prv bwd_bottom_diff->sync_before_write(); - PERFORMANCE_EVENT_ID_INIT(perf_id_bw_, PERFORMANCE_MKLDNN_NAME("BW")); - PERFORMANCE_MEASUREMENT_BEGIN(); + for (int stats_batch_idx = 0; stats_batch_idx < num_stats_batches_; stats_batch_idx++) { + + PERFORMANCE_EVENT_ID_INIT(perf_id_bw_, PERFORMANCE_MKLDNN_NAME("BW")); + PERFORMANCE_MEASUREMENT_BEGIN(); #ifdef DEBUG - if (bottom[0]->prv_data() != NULL) - { + if (bottom[0]->prv_data() != NULL) + { LOG(INFO) << "Debug: Bottom prv data: " << *bottom[0]->prv_data(); - } - else - { + } + else + { LOG(INFO) << "Debug: Bottom prv data is NULL!"; - } - - if (top[0]->prv_diff() != NULL) - { + } + + if (top[0]->prv_diff() != NULL) + { LOG(INFO) << "Debug: Top prv diff: " << *top[0]->prv_diff(); - } - else - { + } + else + { LOG(INFO) << "Debug: Top prv diff is NULL!"; LOG(INFO) << "Debug: Top cpu diff: " << *top[0]->cpu_diff(); - } + } #endif - BatchNormBwd.submit(); + BatchNormBwd[stats_batch_idx].submit(); #ifdef DEBUG - if (bottom[0]->prv_diff() != NULL) - { + if (bottom[0]->prv_diff() != NULL) + { LOG(INFO) << "Debug: Bottom prv diff: " << *bottom[0]->prv_diff(); - } - else - { + } + else + { LOG(INFO) << "Debug: Bottom prv diff is NULL!"; LOG(INFO) << "Debug: Bottom cpu diff: " << *bottom[0]->cpu_diff(); - } + } #endif - PERFORMANCE_MEASUREMENT_END_ID(perf_id_bw_); + PERFORMANCE_MEASUREMENT_END_ID(perf_id_bw_); - /* FIXME: this wouldn't work with lazy stream */ - if (use_weight_bias_) { + /* FIXME: this wouldn't work with lazy stream */ + if (use_weight_bias_) { Dtype* dw = (Dtype *)(bwd_scaleshift_diff_memory->get_data_handle()); for (int i = 0; i < this->channels_; i++) - this->blobs_[3]->mutable_cpu_diff()[i] = dw[i]; + this->blobs_[3]->mutable_cpu_diff()[i] += dw[i]; if (bias_term_) { dw += channels_; for (int i = 0; i < this->channels_; i++) - this->blobs_[4]->mutable_cpu_diff()[i] = dw[i]; + this->blobs_[4]->mutable_cpu_diff()[i] += dw[i]; } + } } } diff --git a/src/caffe/mkldnn_memory.cpp b/src/caffe/mkldnn_memory.cpp index bacb6ae61..6e42e691d 100644 --- a/src/caffe/mkldnn_memory.cpp +++ b/src/caffe/mkldnn_memory.cpp @@ -453,6 +453,32 @@ shared_ptr MKLDNNMemoryDescriptor::create_output_memory( return omem; } +template +Dtype* MKLDNNMemoryDescriptor::get_memory_ptr(long offset) { + if (this->conversion_needed()) { + // TODO: support DFP16 offset + if (this->prv_ptr() != NULL) return (Dtype*)this->prv_ptr() + offset; + // when _internal_ptr is null, having same private layout as _blob + else 
return is_diff ?
+          (Dtype*)this->_blob->prv_diff() + offset :
+          (Dtype*)this->_blob->prv_data() + offset;
+    } else {
+        return const_cast<Dtype*>(
+          is_diff ? this->_blob->cpu_diff() + offset : this->_blob->cpu_data() + offset);
+    }
+}
+
+template <typename Dtype, bool is_diff>
+shared_ptr<memory::desc> MKLDNNMemoryDescriptor<Dtype, is_diff>::get_memory_desc() {
+    shared_ptr<memory::desc> desc;
+    if (this->conversion_needed()) {
+        desc.reset(new memory::desc(this->prv_memory_pd()->desc()));
+    } else {
+        desc.reset(new memory::desc(this->usr_memory_pd()->desc()));
+    }
+    return desc;
+}
+
 template <typename Dtype, bool is_diff>
 shared_ptr<MKLDNNMemoryDescriptor<Dtype, is_diff> > get_mkldnn_prv_descriptor(Blob<Dtype>* blob)
 {

From 4e4aecb2d9391056a60449271a3f3845eaf4cb6d Mon Sep 17 00:00:00 2001
From: "Yu, Chong"
Date: Mon, 21 Aug 2017 11:10:23 +0800
Subject: [PATCH 27/38] Update MKLDNN version to 27420a241b2efd8d88f1e003635434194fdfb1b8

---
 mkldnn.commit | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mkldnn.commit b/mkldnn.commit
index 7eb0167ed..4c279b216 100644
--- a/mkldnn.commit
+++ b/mkldnn.commit
@@ -1 +1 @@
-171572a205c71f5bbb08657de5660c9d06cf2d8f
+27420a241b2efd8d88f1e003635434194fdfb1b8

From 445a396817ccea903ad538e066a84f5252a82cd0 Mon Sep 17 00:00:00 2001
From: linxinan
Date: Tue, 22 Aug 2017 20:27:03 +0800
Subject: [PATCH 28/38] add vgg_16_8nodes solver prototxt

---
 .../multinode/vgg_16_8nodes/solver.prototxt   |  14 +
 .../vgg_16_8nodes/train_val.prototxt          | 612 ++++++++++++++++++
 2 files changed, 626 insertions(+)
 create mode 100644 models/intel_optimized_models/multinode/vgg_16_8nodes/solver.prototxt
 create mode 100644 models/intel_optimized_models/multinode/vgg_16_8nodes/train_val.prototxt

diff --git a/models/intel_optimized_models/multinode/vgg_16_8nodes/solver.prototxt b/models/intel_optimized_models/multinode/vgg_16_8nodes/solver.prototxt
new file mode 100644
index 000000000..1b55e4c7d
--- /dev/null
+++ b/models/intel_optimized_models/multinode/vgg_16_8nodes/solver.prototxt
@@ -0,0 +1,14 @@
+net: "models/intel_optimized_models/multinode/vgg_16_8nodes/train_val.prototxt"
+test_iter: 1563
+test_interval: 10000
+momentum: 0.9
+weight_decay: 0.0005
+base_lr: 0.01
+lr_policy: "poly"
+power: 2
+max_iter: 300000
+display: 40
+snapshot: 100000
+solver_mode: CPU
+snapshot_prefix: "models/intel_optimized_models/multinode/vgg_16_8nodes"
+
diff --git a/models/intel_optimized_models/multinode/vgg_16_8nodes/train_val.prototxt b/models/intel_optimized_models/multinode/vgg_16_8nodes/train_val.prototxt
new file mode 100644
index 000000000..5571737db
--- /dev/null
+++ b/models/intel_optimized_models/multinode/vgg_16_8nodes/train_val.prototxt
@@ -0,0 +1,612 @@
+name: "VGG_ILSVRC_16_layer"
+layer {
+  name: "data"
+  type: "Data"
+  include {
+    phase: TRAIN
+  }
+  transform_param {
+    crop_size: 224
+    mean_value: 104
+    mean_value: 117
+    mean_value: 124
+    mirror: true
+  }
+  data_param {
+    source: "examples/imagenet/ilsvrc12_train_lmdb"
+    batch_size: 32
+    backend: LMDB
+  }
+  top: "data"
+  top: "label"
+}
+layer {
+  name: "data"
+  type: "Data"
+  include {
+    phase: TEST
+  }
+  transform_param {
+    crop_size: 224
+    mean_value: 104
+    mean_value: 117
+    mean_value: 124
+    mirror: false
+  }
+  data_param {
+    source: "examples/imagenet/ilsvrc12_val_lmdb"
+    batch_size: 50
+    backend: LMDB
+  }
+  top: "data"
+  top: "label"
+}
+layer {
+  name: "conv1_1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1_1"
+  convolution_param {
+
+    num_output: 64
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+
+
+}
+layer {
+  bottom: "conv1_1"
+  top: "conv1_1"
+  name: "relu1_1"
+  type: "ReLU"
+
relu_param { + + } +} +layer { + bottom: "conv1_1" + top: "conv1_2" + name: "conv1_2" + type: "Convolution" + convolution_param { + + num_output: 64 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } + + +} +layer { + bottom: "conv1_2" + top: "conv1_2" + name: "relu1_2" + type: "ReLU" + relu_param { + + } +} +layer { + bottom: "conv1_2" + top: "pool1" + name: "pool1" + type: "Pooling" + pooling_param { + + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + bottom: "pool1" + top: "conv2_1" + name: "conv2_1" + type: "Convolution" + convolution_param { + + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } + + +} +layer { + bottom: "conv2_1" + top: "conv2_1" + name: "relu2_1" + type: "ReLU" + relu_param { + + } +} +layer { + bottom: "conv2_1" + top: "conv2_2" + name: "conv2_2" + type: "Convolution" + convolution_param { + + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } + + +} +layer { + bottom: "conv2_2" + top: "conv2_2" + name: "relu2_2" + type: "ReLU" + relu_param { + + } +} +layer { + bottom: "conv2_2" + top: "pool2" + name: "pool2" + type: "Pooling" + pooling_param { + + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + bottom: "pool2" + top: "conv3_1" + name: "conv3_1" + type: "Convolution" + convolution_param { + + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } + + +} +layer { + bottom: "conv3_1" + top: "conv3_1" + name: "relu3_1" + type: "ReLU" + relu_param { + + } +} +layer { + bottom: "conv3_1" + top: "conv3_2" + name: "conv3_2" + type: "Convolution" + convolution_param { + + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } + + +} +layer { + bottom: "conv3_2" + top: "conv3_2" + name: "relu3_2" + type: "ReLU" + relu_param { + + } +} +layer { + bottom: "conv3_2" + top: "conv3_3" + name: "conv3_3" + type: "Convolution" + convolution_param { + + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } + + +} +layer { + bottom: "conv3_3" + top: "conv3_3" + name: "relu3_3" + type: "ReLU" + relu_param { + + } +} +layer { + bottom: "conv3_3" + top: "pool3" + name: "pool3" + type: "Pooling" + pooling_param { + + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + bottom: "pool3" + top: "conv4_1" + name: "conv4_1" + type: "Convolution" + convolution_param { + + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } + + +} +layer { + bottom: "conv4_1" + top: "conv4_1" + name: "relu4_1" + type: "ReLU" + relu_param { + + } +} +layer { + bottom: "conv4_1" + top: "conv4_2" + name: "conv4_2" + type: "Convolution" + convolution_param { + + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } + + +} +layer { + bottom: "conv4_2" + top: "conv4_2" + name: "relu4_2" + type: "ReLU" + relu_param { + + } +} +layer { + bottom: "conv4_2" + top: "conv4_3" + name: "conv4_3" + type: "Convolution" + convolution_param { + + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + 
} + + +} +layer { + bottom: "conv4_3" + top: "conv4_3" + name: "relu4_3" + type: "ReLU" + relu_param { + + } +} +layer { + bottom: "conv4_3" + top: "pool4" + name: "pool4" + type: "Pooling" + pooling_param { + + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + bottom: "pool4" + top: "conv5_1" + name: "conv5_1" + type: "Convolution" + convolution_param { + + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } + + +} +layer { + bottom: "conv5_1" + top: "conv5_1" + name: "relu5_1" + type: "ReLU" + relu_param { + + } +} +layer { + bottom: "conv5_1" + top: "conv5_2" + name: "conv5_2" + type: "Convolution" + convolution_param { + + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } + + +} +layer { + bottom: "conv5_2" + top: "conv5_2" + name: "relu5_2" + type: "ReLU" + relu_param { + + } +} +layer { + bottom: "conv5_2" + top: "conv5_3" + name: "conv5_3" + type: "Convolution" + convolution_param { + + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } + + +} +layer { + bottom: "conv5_3" + top: "conv5_3" + name: "relu5_3" + type: "ReLU" + relu_param { + + } +} +layer { + bottom: "conv5_3" + top: "pool5" + name: "pool5" + type: "Pooling" + pooling_param { + + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + bottom: "pool5" + top: "fc6" + name: "fc6" + type: "InnerProduct" + inner_product_param { + num_output: 4096 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.1 + } + } + + +} +layer { + bottom: "fc6" + top: "fc6" + name: "relu6" + type: "ReLU" + relu_param { + + } +} +layer { + bottom: "fc6" + top: "fc6" + name: "drop6" + type: "Dropout" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + bottom: "fc6" + top: "fc7" + name: "fc7" + type: "InnerProduct" + inner_product_param { + num_output: 4096 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.1 + } + } + + +} +layer { + bottom: "fc7" + top: "fc7" + name: "relu7" + type: "ReLU" + relu_param { + + } +} +layer { + bottom: "fc7" + top: "fc7" + name: "drop7" + type: "Dropout" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + name: "fc8" + bottom: "fc7" + top: "fc8" + type: "InnerProduct" + inner_product_param { + num_output: 1000 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.1 + } + } + + +} +layer { + name: "loss" + type: "SoftmaxWithLoss" + bottom: "fc8" + bottom: "label" + top: "loss/loss" +} +layer { + name: "accuracy/top1" + type: "Accuracy" + bottom: "fc8" + bottom: "label" + top: "accuracy@1" + include: { phase: TEST } + accuracy_param { + top_k: 1 + } +} +layer { + name: "accuracy/top5" + type: "Accuracy" + bottom: "fc8" + bottom: "label" + top: "accuracy@5" + include: { phase: TEST } + accuracy_param { + top_k: 5 + } +} From d3ec15c34388d4f40659f243bf5ae482e3a73252 Mon Sep 17 00:00:00 2001 From: "Gong, Jiong" Date: Tue, 22 Aug 2017 22:37:09 +0800 Subject: [PATCH 29/38] fix icl197 and a regression in bn --- src/caffe/layers/mkldnn_batch_norm_layer.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/caffe/layers/mkldnn_batch_norm_layer.cpp b/src/caffe/layers/mkldnn_batch_norm_layer.cpp index 6688f8584..91c753bba 100644 --- a/src/caffe/layers/mkldnn_batch_norm_layer.cpp +++ b/src/caffe/layers/mkldnn_batch_norm_layer.cpp @@ -62,6 
+62,8 @@ void MKLDNNBatchNormLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom
   bias_term_ = this->layer_param_.batch_norm_param().bias_term();
   moving_average_fraction_ = this->layer_param_.batch_norm_param().moving_average_fraction();
   use_global_stats_ = this->phase_ == TEST;
+  if (this->layer_param_.batch_norm_param().has_use_global_stats())
+    use_global_stats_ = this->layer_param_.batch_norm_param().use_global_stats();

   this->blobs_.resize(3 + (use_weight_bias_ ? 1:0) + (use_weight_bias_ && bias_term_ ? 1:0));
@@ -299,13 +301,13 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNormFwdPrimitive(int idx) {
                  static_cast<Dtype*>(variance_memory[idx]->get_data_handle()));
       if (use_weight_bias_) {
         BatchNormFwd[idx].reset(new batch_normalization_forward(*BatchNormFwd_pd,
-                *input_stats[idx], *mean_memory[idx],
-                *variance_memory[idx], *scaleshift_memory,
+                *input_stats[idx], (const primitive::at)*mean_memory[idx],
+                (const primitive::at)*variance_memory[idx], *scaleshift_memory,
                 *output_stats[idx]));
       } else {
         BatchNormFwd[idx].reset(new batch_normalization_forward(*BatchNormFwd_pd,
-                *input_stats[idx], *mean_memory[idx],
-                *variance_memory[idx], *output_stats[idx]));
+                *input_stats[idx], (const primitive::at)*mean_memory[idx],
+                (const primitive::at)*variance_memory[idx], *output_stats[idx]));
       }
     } else {
       if (use_weight_bias_) {

From 3fb9b9acb6bbf776119eb37b7f0455ad22501d71 Mon Sep 17 00:00:00 2001
From: "Yu, Chong"
Date: Wed, 23 Aug 2017 22:24:30 +0800
Subject: [PATCH 30/38] Avoid unnecessary scale and shift copying back and forth in BatchNorm.

---
 include/caffe/layers/mkldnn_layers.hpp       |  2 +
 src/caffe/layers/mkldnn_batch_norm_layer.cpp | 49 +++++++++-----------
 2 files changed, 23 insertions(+), 28 deletions(-)

diff --git a/include/caffe/layers/mkldnn_layers.hpp b/include/caffe/layers/mkldnn_layers.hpp
index bf23438bd..7d5e0dbed 100644
--- a/include/caffe/layers/mkldnn_layers.hpp
+++ b/include/caffe/layers/mkldnn_layers.hpp
@@ -71,6 +71,7 @@ class MKLDNNBatchNormLayer : public MKLDNNLayer<Dtype>, public Layer<Dtype> {
         , scaleshift_memory(), bwd_scaleshift_diff_memory()
         , output_memory(), bwd_bottom_diff_memory()
         , input_primitive(), bwd_top_diff_primitive()
+        , scaleshift_combination()
     {
       PERFORMANCE_EVENT_ID_RESET(perf_id_fw_);
       PERFORMANCE_EVENT_ID_RESET(perf_id_bw_);
@@ -118,6 +119,7 @@ class MKLDNNBatchNormLayer : public MKLDNNLayer<Dtype>, public Layer<Dtype> {
     bool use_weight_bias_, bias_term_, use_global_stats_;
     int num_stats_batches_;
     int stats_batch_size_;
+    shared_ptr<Blob<Dtype>> scaleshift_combination;

     PERFORMANCE_EVENT_ID_DECL(perf_id_fw_);
     PERFORMANCE_EVENT_ID_DECL(perf_id_bw_);

diff --git a/src/caffe/layers/mkldnn_batch_norm_layer.cpp b/src/caffe/layers/mkldnn_batch_norm_layer.cpp
index 91c753bba..0b9bb2343 100644
--- a/src/caffe/layers/mkldnn_batch_norm_layer.cpp
+++ b/src/caffe/layers/mkldnn_batch_norm_layer.cpp
@@ -78,6 +78,19 @@ void MKLDNNBatchNormLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom
                this->blobs_[i]->mutable_cpu_data());
   }

+  //IntelCaffe treat scale and shift as different blobs, so current MKL-DNN integration has additional copies from Caffe to MKL-DNN buffer on fwd pass and from MKL-DNN to Caffe buffer on bwd pass.
+  //Optimization: use the temp blob to combine the scale and shift together. Avoid the additional copies.
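Concretely, the optimization backs blobs_[3] (scale) and blobs_[4] (shift) with one contiguous buffer of 2*channels values that the MKL-DNN weights memory wraps directly. A schematic sketch of the layout, assuming plain std::vector storage rather than the Blob API:

#include <vector>

// Schematic of the combined scale/shift layout: scale occupies elements
// [0, C) and shift occupies [C, 2C), so a single pointer can back the
// MKL-DNN weights memory with no per-iteration copying.
struct ScaleShift {
  int channels;
  std::vector<float> buf;  // handed to the MKL-DNN weights memory
  explicit ScaleShift(int c) : channels(c), buf(2 * c, 0.0f) {}
  float* scale() { return buf.data(); }             // aliased as blobs_[3]
  float* shift() { return buf.data() + channels; }  // aliased as blobs_[4]
};

Because the buffer is zero-initialized, the shift half already holds the zero bias that MKL-DNN expects when bias_term_ is false.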
+ // Initialize scale and shift combination blob + vector scaleshift_combination_shape(1); + scaleshift_combination_shape[0] = 2*channels_; + this->scaleshift_combination.reset(new Blob(scaleshift_combination_shape)); + //Should initialize the scaleshift_combine buffer to 0, because when bias_term_ == false, need to pass zero bias to MKLDNN + caffe_set(scaleshift_combination_shape[0], static_cast(0), + scaleshift_combination->mutable_cpu_data()); + //Not so necessary, because the diff will initialize to 0 automatically + caffe_set(scaleshift_combination_shape[0], static_cast(0), + scaleshift_combination->mutable_cpu_diff()); + if (use_weight_bias_) { // Initialize scale and shift vector scaleshift_shape(1); @@ -85,6 +98,8 @@ void MKLDNNBatchNormLayer::LayerSetUp(const vector*>& bottom VLOG(1) << "MKLDNNBatchNormLayer::LayerSetUp: channels_ = " << channels_; this->blobs_[3].reset(new Blob(scaleshift_shape)); + this->blobs_[3]->set_cpu_data(scaleshift_combination->mutable_cpu_data()); + this->blobs_[3]->set_cpu_diff(scaleshift_combination->mutable_cpu_diff()); FillerParameter filler_param(this->layer_param_.batch_norm_param().filler()); if (!this->layer_param_.batch_norm_param().has_filler()) { filler_param.set_type("constant"); @@ -94,8 +109,10 @@ void MKLDNNBatchNormLayer::LayerSetUp(const vector*>& bottom VLOG(1) << "MKLDNNBatchNormLayer::LayerSetUp: scaleshift " << __LINE__ << ":" << this->layer_param_.name(); filler->Fill(this->blobs_[3].get()); - if ( bias_term_ ) { + if (bias_term_) { this->blobs_[4].reset(new Blob(scaleshift_shape)); + this->blobs_[4]->set_cpu_data(scaleshift_combination->mutable_cpu_data() + scaleshift_combination->offset(channels_)); + this->blobs_[4]->set_cpu_diff(scaleshift_combination->mutable_cpu_diff() + scaleshift_combination->offset(channels_)); FillerParameter bias_filler_param(this->layer_param_.batch_norm_param().bias_filler()); if (!this->layer_param_.batch_norm_param().has_bias_filler()) { bias_filler_param.set_type("constant"); @@ -212,7 +229,7 @@ void MKLDNNBatchNormLayer::InitBatchNorm(const vector*>& bott // ---- Create memory --------------------- if (use_weight_bias_) { - scaleshift_memory.reset(new memory(BatchNormFwd_pd->weights_primitive_desc())); + scaleshift_memory.reset(new memory(BatchNormFwd_pd->weights_primitive_desc(), this->scaleshift_combination->mutable_cpu_data())); } // --- init primitive and prv_memory descriptors ---------------------- @@ -352,18 +369,7 @@ void MKLDNNBatchNormLayer::Forward_cpu(const vector*>& bottom caffe_cpu_scale(this->blobs_[1]->count(), scale_factor, this->blobs_[1]->cpu_data(), variance_buffer_); } - if (use_weight_bias_) { - Dtype* scaleShift_buffer_ = (Dtype *)(scaleshift_memory->get_data_handle()); - // Fill ScaleShift buffer - for (int i = 0; i < this->channels_; i++) { - scaleShift_buffer_[i] = this->blobs_[3]->cpu_data()[i]; - scaleShift_buffer_[channels_ + i] = 0; - if (bias_term_) { - scaleShift_buffer_[channels_ + i] = this->blobs_[4]->cpu_data()[i]; - } - } - } - + PERFORMANCE_EVENT_ID_INIT(perf_id_fw_, PERFORMANCE_MKLDNN_NAME("FW")); PERFORMANCE_MEASUREMENT_BEGIN(); BatchNormFwd[stats_batch_idx].submit(); @@ -457,7 +463,7 @@ void MKLDNNBatchNormLayer::InitBatchNormBwd( if (use_weight_bias_) { bwd_scaleshift_diff_memory.reset(new memory( - BatchNormFwd_pd->weights_primitive_desc())); + BatchNormFwd_pd->weights_primitive_desc(), this->scaleshift_combination->mutable_cpu_diff())); } // --- init primitive and prv_memory descriptors ---------------------- @@ -555,19 +561,6 @@ void 
MKLDNNBatchNormLayer::Backward_cpu(const vector*>& top, } #endif PERFORMANCE_MEASUREMENT_END_ID(perf_id_bw_); - - /* FIXME: this wouldn't work with lazy stream */ - if (use_weight_bias_) { - Dtype* dw = (Dtype *)(bwd_scaleshift_diff_memory->get_data_handle()); - for (int i = 0; i < this->channels_; i++) - this->blobs_[3]->mutable_cpu_diff()[i] += dw[i]; - - if (bias_term_) { - dw += channels_; - for (int i = 0; i < this->channels_; i++) - this->blobs_[4]->mutable_cpu_diff()[i] += dw[i]; - } - } } } From bf824c47654bc22b12f5f5f757b049bda7611ada Mon Sep 17 00:00:00 2001 From: "Gong, Jiong" Date: Fri, 25 Aug 2017 22:21:41 +0800 Subject: [PATCH 31/38] support scaleshift accum with stats batch size>1 Change-Id: I3b1a16dae1a6a2965b43ce61109d0a58b70e9093 --- include/caffe/layers/mkldnn_layers.hpp | 5 +- src/caffe/layers/mkldnn_batch_norm_layer.cpp | 63 +++++++++++++------- 2 files changed, 43 insertions(+), 25 deletions(-) diff --git a/include/caffe/layers/mkldnn_layers.hpp b/include/caffe/layers/mkldnn_layers.hpp index 7d5e0dbed..f7ce1062e 100644 --- a/include/caffe/layers/mkldnn_layers.hpp +++ b/include/caffe/layers/mkldnn_layers.hpp @@ -71,7 +71,6 @@ class MKLDNNBatchNormLayer : public MKLDNNLayer, public Layer { , scaleshift_memory(), bwd_scaleshift_diff_memory() , output_memory(), bwd_bottom_diff_memory() , input_primitive(), bwd_top_diff_primitive() - , scaleshift_combination() { PERFORMANCE_EVENT_ID_RESET(perf_id_fw_); PERFORMANCE_EVENT_ID_RESET(perf_id_bw_); @@ -100,6 +99,7 @@ class MKLDNNBatchNormLayer : public MKLDNNLayer, public Layer { void InitBatchNormBwdPrimitive(int stats_batch_idx); template shared_ptr GetStatsBatchMemory( shared_ptr > mkldnn_data, int idx); + void InitStatsBatchVars(int batch_size); shared_ptr > fwd_top_data, fwd_bottom_data; shared_ptr > bwd_top_diff, bwd_bottom_diff; shared_ptr BatchNormFwd_pd; @@ -119,7 +119,8 @@ class MKLDNNBatchNormLayer : public MKLDNNLayer, public Layer { bool use_weight_bias_, bias_term_, use_global_stats_; int num_stats_batches_; int stats_batch_size_; - shared_ptr> scaleshift_combination; + shared_ptr > scaleshift_blob_; + shared_ptr > scaleshift_acc_; PERFORMANCE_EVENT_ID_DECL(perf_id_fw_); PERFORMANCE_EVENT_ID_DECL(perf_id_bw_); diff --git a/src/caffe/layers/mkldnn_batch_norm_layer.cpp b/src/caffe/layers/mkldnn_batch_norm_layer.cpp index 0b9bb2343..f1edfebd4 100644 --- a/src/caffe/layers/mkldnn_batch_norm_layer.cpp +++ b/src/caffe/layers/mkldnn_batch_norm_layer.cpp @@ -44,6 +44,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace caffe { +template +void MKLDNNBatchNormLayer::InitStatsBatchVars(int batch_size) { + num_stats_batches_ = 1; + stats_batch_size_ = batch_size; + BatchNormParameter param = this->layer_param_.batch_norm_param(); + if (!use_global_stats_ && param.stats_batch_size() > 0) { + CHECK_EQ(batch_size % param.stats_batch_size(), 0); + num_stats_batches_ = batch_size / param.stats_batch_size(); + stats_batch_size_ = param.stats_batch_size(); + } +} + template void MKLDNNBatchNormLayer::LayerSetUp(const vector*>& bottom ,const vector*>& top) @@ -65,6 +77,8 @@ void MKLDNNBatchNormLayer::LayerSetUp(const vector*>& bottom if (this->layer_param_.batch_norm_param().has_use_global_stats()) use_global_stats_ = this->layer_param_.batch_norm_param().use_global_stats(); + InitStatsBatchVars(num_); + this->blobs_.resize(3 + (use_weight_bias_ ? 1:0) + (use_weight_bias_ && bias_term_ ? 
1:0)); vector sz; @@ -81,15 +95,18 @@ void MKLDNNBatchNormLayer::LayerSetUp(const vector*>& bottom //IntelCaffe treat scale and shift as different blobs, so current MKL-DNN integration has additional copies from Caffe to MKL-DNN buffer on fwd pass and from MKL-DNN to Caffe buffer on bwd pass. //Optimization: use the temp blob to combine the scale and shift together. Avoid the additional copies. // Initialize scale and shift combination blob - vector scaleshift_combination_shape(1); - scaleshift_combination_shape[0] = 2*channels_; - this->scaleshift_combination.reset(new Blob(scaleshift_combination_shape)); - //Should initialize the scaleshift_combine buffer to 0, because when bias_term_ == false, need to pass zero bias to MKLDNN - caffe_set(scaleshift_combination_shape[0], static_cast(0), - scaleshift_combination->mutable_cpu_data()); - //Not so necessary, because the diff will initialize to 0 automatically - caffe_set(scaleshift_combination_shape[0], static_cast(0), - scaleshift_combination->mutable_cpu_diff()); + vector scaleshift_blob_shape(1); + scaleshift_blob_shape[0] = 2*channels_; + scaleshift_blob_.reset(new Blob(scaleshift_blob_shape)); + //Should initialize the scaleshift_blob_ buffer to 0, because when bias_term_ == false, need to pass zero bias to MKLDNN + caffe_set(scaleshift_blob_shape[0], static_cast(0), + scaleshift_blob_->mutable_cpu_data()); + shared_ptr > scaleshift_diff_blob = scaleshift_blob_; + scaleshift_acc_ = scaleshift_blob_; + if (num_stats_batches_ > 1) { + this->scaleshift_acc_.reset(new Blob(scaleshift_blob_shape)); + scaleshift_diff_blob = scaleshift_acc_; + } if (use_weight_bias_) { // Initialize scale and shift @@ -98,8 +115,8 @@ void MKLDNNBatchNormLayer::LayerSetUp(const vector*>& bottom VLOG(1) << "MKLDNNBatchNormLayer::LayerSetUp: channels_ = " << channels_; this->blobs_[3].reset(new Blob(scaleshift_shape)); - this->blobs_[3]->set_cpu_data(scaleshift_combination->mutable_cpu_data()); - this->blobs_[3]->set_cpu_diff(scaleshift_combination->mutable_cpu_diff()); + this->blobs_[3]->set_cpu_data(scaleshift_blob_->mutable_cpu_data()); + this->blobs_[3]->set_cpu_diff(scaleshift_diff_blob->mutable_cpu_diff()); FillerParameter filler_param(this->layer_param_.batch_norm_param().filler()); if (!this->layer_param_.batch_norm_param().has_filler()) { filler_param.set_type("constant"); @@ -111,8 +128,8 @@ void MKLDNNBatchNormLayer::LayerSetUp(const vector*>& bottom if (bias_term_) { this->blobs_[4].reset(new Blob(scaleshift_shape)); - this->blobs_[4]->set_cpu_data(scaleshift_combination->mutable_cpu_data() + scaleshift_combination->offset(channels_)); - this->blobs_[4]->set_cpu_diff(scaleshift_combination->mutable_cpu_diff() + scaleshift_combination->offset(channels_)); + this->blobs_[4]->set_cpu_data(scaleshift_blob_->mutable_cpu_data() + scaleshift_blob_->offset(channels_)); + this->blobs_[4]->set_cpu_diff(scaleshift_diff_blob->mutable_cpu_diff() + scaleshift_blob_->offset(channels_)); FillerParameter bias_filler_param(this->layer_param_.batch_norm_param().bias_filler()); if (!this->layer_param_.batch_norm_param().has_bias_filler()) { bias_filler_param.set_type("constant"); @@ -149,14 +166,7 @@ void MKLDNNBatchNormLayer::Reshape(const vector*>& bottom this->num_ = bottom[0]->num(); this->channels_ = bottom[0]->channels(); - num_stats_batches_ = 1; - stats_batch_size_ = bottom[0]->shape(0); - BatchNormParameter param = this->layer_param_.batch_norm_param(); - if (!use_global_stats_ && param.stats_batch_size() > 0) { - CHECK_EQ(bottom[0]->shape(0) % 
param.stats_batch_size(), 0); - num_stats_batches_ = bottom[0]->shape(0) / param.stats_batch_size(); - stats_batch_size_ = param.stats_batch_size(); - } + InitStatsBatchVars(this->num_); //Fix: should reshape the top blob with the real size of bottom blob //top[0]->Reshape(this->num_, this->channels_, this->height_, this->width_); @@ -229,7 +239,7 @@ void MKLDNNBatchNormLayer::InitBatchNorm(const vector*>& bott // ---- Create memory --------------------- if (use_weight_bias_) { - scaleshift_memory.reset(new memory(BatchNormFwd_pd->weights_primitive_desc(), this->scaleshift_combination->mutable_cpu_data())); + scaleshift_memory.reset(new memory(BatchNormFwd_pd->weights_primitive_desc(), this->scaleshift_blob_->mutable_cpu_data())); } // --- init primitive and prv_memory descriptors ---------------------- @@ -463,7 +473,7 @@ void MKLDNNBatchNormLayer::InitBatchNormBwd( if (use_weight_bias_) { bwd_scaleshift_diff_memory.reset(new memory( - BatchNormFwd_pd->weights_primitive_desc(), this->scaleshift_combination->mutable_cpu_diff())); + BatchNormFwd_pd->weights_primitive_desc(), this->scaleshift_blob_->mutable_cpu_diff())); } // --- init primitive and prv_memory descriptors ---------------------- @@ -561,6 +571,13 @@ void MKLDNNBatchNormLayer::Backward_cpu(const vector*>& top, } #endif PERFORMANCE_MEASUREMENT_END_ID(perf_id_bw_); + if (num_stats_batches_ > 1) { + CHECK(scaleshift_blob_ != scaleshift_acc_); + CHECK(scaleshift_blob_->count() == scaleshift_acc_->count()); + caffe_cpu_axpby(scaleshift_acc_->count(), Dtype(1), + scaleshift_blob_->mutable_cpu_diff(), + Dtype(1), scaleshift_acc_->mutable_cpu_diff()); + } } } From 28c46874c16e7c7e10cc8325331ce77b47c83edb Mon Sep 17 00:00:00 2001 From: "Gong, Jiong" Date: Sat, 26 Aug 2017 02:30:56 +0800 Subject: [PATCH 32/38] add resnet 64 node prototxt Change-Id: I57b497a6a2f028d998301f23b64965b3ab24edff --- .../solver.prototxt | 19 + .../train_val.prototxt | 3322 +++++++++++++++++ 2 files changed, 3341 insertions(+) create mode 100644 models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/solver.prototxt create mode 100644 models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/train_val.prototxt diff --git a/models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/solver.prototxt b/models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/solver.prototxt new file mode 100644 index 000000000..4f4f21a93 --- /dev/null +++ b/models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/solver.prototxt @@ -0,0 +1,19 @@ +net: "models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/train_val.prototxt" +test_iter: 1000 +test_interval: 156 +test_initialization: false +display: 40 +base_lr: 3.2 +lr_policy: "multistep" +stepvalue:4680 +stepvalue:9360 +stepvalue:12480 +gamma: 0.1 +max_iter: 14075 +warmup_iter: 780 # 1281167 / 8192 * 5 epochs +warmup_start_lr: 0.1 +momentum: 0.9 +weight_decay: 0.0001 +snapshot: 156 +snapshot_prefix: "models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/resnet_50_64_nodes_8k" +solver_mode: CPU diff --git a/models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/train_val.prototxt b/models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/train_val.prototxt new file mode 100644 index 000000000..3dd57aaac --- /dev/null +++ b/models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/train_val.prototxt @@ -0,0 +1,3322 @@ +name: "ResNet-50" +bn_stats_batch_size: 32 +layer { + name: "data" + type: "Data" + top: "data" + top: 
"label" + include { + phase: TRAIN + } + transform_param { + mirror: true + crop_size: 224 + scale: 0.0078125 + mean_value: 104 + mean_value: 117 + mean_value: 123 + random_aspect_ratio_param { + min_area_ratio: 0.08 + max_area_ratio: 1 + aspect_ratio_change: 0.75 + resize_param { + interp_mode: CUBIC + } + } + } + data_param { + source: "examples/imagenet/ilsvrc12_train_lmdb" + batch_size: 128 + backend: LMDB + prefetch: 2 + shuffle: true + } +} +layer { + name: "data" + type: "Data" + top: "data" + top: "label" + include { + phase: TEST + } + transform_param { + mirror: false + crop_size: 224 + scale: 0.0078125 + mean_value: 104 + mean_value: 117 + mean_value: 123 + random_resize_param { + min_size: 256 + max_size: 256 + resize_param { + interp_mode: CUBIC + } + } + } + data_param { + source: "examples/imagenet/ilsvrc12_val_lmdb" + batch_size: 50 + backend: LMDB + } +} + +layer { + bottom: "data" + top: "conv1" + name: "conv1" + type: "Convolution" + convolution_param { + num_output: 64 + kernel_size: 7 + pad: 3 + stride: 2 + weight_filler { + type: "msra" + variance_norm: FAN_OUT + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "conv1" + top: "conv1" + name: "bn_conv1" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "conv1" + top: "conv1" + name: "scale_conv1" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "conv1" + top: "conv1" + name: "conv1_relu" + type: "ReLU" + relu_param { + } +} + +layer { + bottom: "conv1" + top: "pool1" + name: "pool1" + type: "Pooling" + pooling_param { + kernel_size: 3 + stride: 2 + pool: MAX + } +} + +layer { + bottom: "pool1" + top: "res2a_branch1" + name: "res2a_branch1" + type: "Convolution" + convolution_param { + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res2a_branch1" + top: "res2a_branch1" + name: "bn2a_branch1" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res2a_branch1" + top: "res2a_branch1" + name: "scale2a_branch1" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "pool1" + top: "res2a_branch2a" + name: "res2a_branch2a" + type: "Convolution" + convolution_param { + + num_output: 64 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res2a_branch2a" + top: "res2a_branch2a" + name: "bn2a_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res2a_branch2a" + top: "res2a_branch2a" + name: "scale2a_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2a_branch2a" + top: "res2a_branch2a" + name: "res2a_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res2a_branch2a" + top: "res2a_branch2b" + name: "res2a_branch2b" + type: "Convolution" + 
convolution_param { + num_output: 64 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res2a_branch2b" + top: "res2a_branch2b" + name: "bn2a_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res2a_branch2b" + top: "res2a_branch2b" + name: "scale2a_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2a_branch2b" + top: "res2a_branch2b" + name: "res2a_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res2a_branch2b" + top: "res2a_branch2c" + name: "res2a_branch2c" + type: "Convolution" + convolution_param { + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res2a_branch2c" + top: "res2a_branch2c" + name: "bn2a_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res2a_branch2c" + top: "res2a_branch2c" + name: "scale2a_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2a_branch1" + bottom: "res2a_branch2c" + top: "res2a" + name: "res2a" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res2a" + top: "res2a" + name: "res2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res2a" + top: "res2b_branch2a" + name: "res2b_branch2a" + type: "Convolution" + convolution_param { + num_output: 64 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res2b_branch2a" + top: "res2b_branch2a" + name: "bn2b_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res2b_branch2a" + top: "res2b_branch2a" + name: "scale2b_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2b_branch2a" + top: "res2b_branch2a" + name: "res2b_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res2b_branch2a" + top: "res2b_branch2b" + name: "res2b_branch2b" + type: "Convolution" + convolution_param { + num_output: 64 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res2b_branch2b" + top: "res2b_branch2b" + name: "bn2b_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res2b_branch2b" + top: "res2b_branch2b" + name: "scale2b_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2b_branch2b" + top: "res2b_branch2b" + name: "res2b_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: 
"res2b_branch2b" + top: "res2b_branch2c" + name: "res2b_branch2c" + type: "Convolution" + convolution_param { + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res2b_branch2c" + top: "res2b_branch2c" + name: "bn2b_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res2b_branch2c" + top: "res2b_branch2c" + name: "scale2b_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2a" + bottom: "res2b_branch2c" + top: "res2b" + name: "res2b" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res2b" + top: "res2b" + name: "res2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res2b" + top: "res2c_branch2a" + name: "res2c_branch2a" + type: "Convolution" + convolution_param { + + num_output: 64 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res2c_branch2a" + top: "res2c_branch2a" + name: "bn2c_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res2c_branch2a" + top: "res2c_branch2a" + name: "scale2c_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2c_branch2a" + top: "res2c_branch2a" + name: "res2c_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res2c_branch2a" + top: "res2c_branch2b" + name: "res2c_branch2b" + type: "Convolution" + convolution_param { + num_output: 64 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res2c_branch2b" + top: "res2c_branch2b" + name: "bn2c_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res2c_branch2b" + top: "res2c_branch2b" + name: "scale2c_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2c_branch2b" + top: "res2c_branch2b" + name: "res2c_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res2c_branch2b" + top: "res2c_branch2c" + name: "res2c_branch2c" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res2c_branch2c" + top: "res2c_branch2c" + name: "bn2c_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 0 } + } +} + +layer { + bottom: "res2c_branch2c" + top: "res2c_branch2c" + name: "scale2c_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2b" + bottom: "res2c_branch2c" + top: 
"res2c" + name: "res2c" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res2c" + top: "res2c" + name: "res2c_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res2c" + top: "res3a_branch1" + name: "res3a_branch1" + type: "Convolution" + convolution_param { + num_output: 512 + kernel_size: 1 + pad: 0 + stride: 2 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3a_branch1" + top: "res3a_branch1" + name: "bn3a_branch1" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3a_branch1" + top: "res3a_branch1" + name: "scale3a_branch1" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2c" + top: "res3a_branch2a" + name: "res3a_branch2a" + type: "Convolution" + convolution_param { + + num_output: 128 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3a_branch2a" + top: "res3a_branch2a" + name: "bn3a_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3a_branch2a" + top: "res3a_branch2a" + name: "scale3a_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3a_branch2a" + top: "res3a_branch2a" + name: "res3a_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3a_branch2a" + top: "res3a_branch2b" + name: "res3a_branch2b" + type: "Convolution" + convolution_param { + + num_output: 128 + kernel_size: 3 + pad: 1 + stride: 2 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3a_branch2b" + top: "res3a_branch2b" + name: "bn3a_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3a_branch2b" + top: "res3a_branch2b" + name: "scale3a_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3a_branch2b" + top: "res3a_branch2b" + name: "res3a_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3a_branch2b" + top: "res3a_branch2c" + name: "res3a_branch2c" + type: "Convolution" + convolution_param { + + num_output: 512 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3a_branch2c" + top: "res3a_branch2c" + name: "bn3a_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3a_branch2c" + top: "res3a_branch2c" + name: "scale3a_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3a_branch1" + bottom: "res3a_branch2c" + top: "res3a" + name: "res3a" + type: "Eltwise" + 
eltwise_param { + + } +} + +layer { + bottom: "res3a" + top: "res3a" + name: "res3a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3a" + top: "res3b_branch2a" + name: "res3b_branch2a" + type: "Convolution" + convolution_param { + + num_output: 128 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3b_branch2a" + top: "res3b_branch2a" + name: "bn3b_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3b_branch2a" + top: "res3b_branch2a" + name: "scale3b_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3b_branch2a" + top: "res3b_branch2a" + name: "res3b_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3b_branch2a" + top: "res3b_branch2b" + name: "res3b_branch2b" + type: "Convolution" + convolution_param { + + num_output: 128 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3b_branch2b" + top: "res3b_branch2b" + name: "bn3b_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3b_branch2b" + top: "res3b_branch2b" + name: "scale3b_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3b_branch2b" + top: "res3b_branch2b" + name: "res3b_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3b_branch2b" + top: "res3b_branch2c" + name: "res3b_branch2c" + type: "Convolution" + convolution_param { + + num_output: 512 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3b_branch2c" + top: "res3b_branch2c" + name: "bn3b_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3b_branch2c" + top: "res3b_branch2c" + name: "scale3b_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3a" + bottom: "res3b_branch2c" + top: "res3b" + name: "res3b" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res3b" + top: "res3b" + name: "res3b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3b" + top: "res3c_branch2a" + name: "res3c_branch2a" + type: "Convolution" + convolution_param { + + num_output: 128 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3c_branch2a" + top: "res3c_branch2a" + name: "bn3c_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3c_branch2a" + top: "res3c_branch2a" + name: "scale3c_branch2a" + type: "Scale" + param { 
decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3c_branch2a" + top: "res3c_branch2a" + name: "res3c_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3c_branch2a" + top: "res3c_branch2b" + name: "res3c_branch2b" + type: "Convolution" + convolution_param { + + num_output: 128 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3c_branch2b" + top: "res3c_branch2b" + name: "bn3c_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3c_branch2b" + top: "res3c_branch2b" + name: "scale3c_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3c_branch2b" + top: "res3c_branch2b" + name: "res3c_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3c_branch2b" + top: "res3c_branch2c" + name: "res3c_branch2c" + type: "Convolution" + convolution_param { + + num_output: 512 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3c_branch2c" + top: "res3c_branch2c" + name: "bn3c_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3c_branch2c" + top: "res3c_branch2c" + name: "scale3c_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3b" + bottom: "res3c_branch2c" + top: "res3c" + name: "res3c" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res3c" + top: "res3c" + name: "res3c_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3c" + top: "res3d_branch2a" + name: "res3d_branch2a" + type: "Convolution" + convolution_param { + num_output: 128 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3d_branch2a" + top: "res3d_branch2a" + name: "bn3d_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3d_branch2a" + top: "res3d_branch2a" + name: "scale3d_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3d_branch2a" + top: "res3d_branch2a" + name: "res3d_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3d_branch2a" + top: "res3d_branch2b" + name: "res3d_branch2b" + type: "Convolution" + convolution_param { + num_output: 128 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3d_branch2b" + top: "res3d_branch2b" + name: "bn3d_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: 
"res3d_branch2b" + top: "res3d_branch2b" + name: "scale3d_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3d_branch2b" + top: "res3d_branch2b" + name: "res3d_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3d_branch2b" + top: "res3d_branch2c" + name: "res3d_branch2c" + type: "Convolution" + convolution_param { + + num_output: 512 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3d_branch2c" + top: "res3d_branch2c" + name: "bn3d_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 0 } + } +} + +layer { + bottom: "res3d_branch2c" + top: "res3d_branch2c" + name: "scale3d_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3c" + bottom: "res3d_branch2c" + top: "res3d" + name: "res3d" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res3d" + top: "res3d" + name: "res3d_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3d" + top: "res4a_branch1" + name: "res4a_branch1" + type: "Convolution" + convolution_param { + + num_output: 1024 + kernel_size: 1 + pad: 0 + stride: 2 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4a_branch1" + top: "res4a_branch1" + name: "bn4a_branch1" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4a_branch1" + top: "res4a_branch1" + name: "scale4a_branch1" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3d" + top: "res4a_branch2a" + name: "res4a_branch2a" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4a_branch2a" + top: "res4a_branch2a" + name: "bn4a_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4a_branch2a" + top: "res4a_branch2a" + name: "scale4a_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4a_branch2a" + top: "res4a_branch2a" + name: "res4a_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4a_branch2a" + top: "res4a_branch2b" + name: "res4a_branch2b" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 3 + pad: 1 + stride: 2 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4a_branch2b" + top: "res4a_branch2b" + name: "bn4a_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4a_branch2b" + top: "res4a_branch2b" + name: 
"scale4a_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4a_branch2b" + top: "res4a_branch2b" + name: "res4a_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4a_branch2b" + top: "res4a_branch2c" + name: "res4a_branch2c" + type: "Convolution" + convolution_param { + + num_output: 1024 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4a_branch2c" + top: "res4a_branch2c" + name: "bn4a_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4a_branch2c" + top: "res4a_branch2c" + name: "scale4a_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4a_branch1" + bottom: "res4a_branch2c" + top: "res4a" + name: "res4a" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res4a" + top: "res4a" + name: "res4a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4a" + top: "res4b_branch2a" + name: "res4b_branch2a" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4b_branch2a" + top: "res4b_branch2a" + name: "bn4b_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4b_branch2a" + top: "res4b_branch2a" + name: "scale4b_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4b_branch2a" + top: "res4b_branch2a" + name: "res4b_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4b_branch2a" + top: "res4b_branch2b" + name: "res4b_branch2b" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4b_branch2b" + top: "res4b_branch2b" + name: "bn4b_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4b_branch2b" + top: "res4b_branch2b" + name: "scale4b_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4b_branch2b" + top: "res4b_branch2b" + name: "res4b_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4b_branch2b" + top: "res4b_branch2c" + name: "res4b_branch2c" + type: "Convolution" + convolution_param { + + num_output: 1024 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4b_branch2c" + top: "res4b_branch2c" + name: "bn4b_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 
+ filler { value: 1 } + } +} + +layer { + bottom: "res4b_branch2c" + top: "res4b_branch2c" + name: "scale4b_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4a" + bottom: "res4b_branch2c" + top: "res4b" + name: "res4b" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res4b" + top: "res4b" + name: "res4b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4b" + top: "res4c_branch2a" + name: "res4c_branch2a" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4c_branch2a" + top: "res4c_branch2a" + name: "bn4c_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4c_branch2a" + top: "res4c_branch2a" + name: "scale4c_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4c_branch2a" + top: "res4c_branch2a" + name: "res4c_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4c_branch2a" + top: "res4c_branch2b" + name: "res4c_branch2b" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4c_branch2b" + top: "res4c_branch2b" + name: "bn4c_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4c_branch2b" + top: "res4c_branch2b" + name: "scale4c_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4c_branch2b" + top: "res4c_branch2b" + name: "res4c_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4c_branch2b" + top: "res4c_branch2c" + name: "res4c_branch2c" + type: "Convolution" + convolution_param { + + num_output: 1024 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4c_branch2c" + top: "res4c_branch2c" + name: "bn4c_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4c_branch2c" + top: "res4c_branch2c" + name: "scale4c_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4b" + bottom: "res4c_branch2c" + top: "res4c" + name: "res4c" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res4c" + top: "res4c" + name: "res4c_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4c" + top: "res4d_branch2a" + name: "res4d_branch2a" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4d_branch2a" + 
top: "res4d_branch2a" + name: "bn4d_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4d_branch2a" + top: "res4d_branch2a" + name: "scale4d_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4d_branch2a" + top: "res4d_branch2a" + name: "res4d_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4d_branch2a" + top: "res4d_branch2b" + name: "res4d_branch2b" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4d_branch2b" + top: "res4d_branch2b" + name: "bn4d_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4d_branch2b" + top: "res4d_branch2b" + name: "scale4d_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4d_branch2b" + top: "res4d_branch2b" + name: "res4d_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4d_branch2b" + top: "res4d_branch2c" + name: "res4d_branch2c" + type: "Convolution" + convolution_param { + + num_output: 1024 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4d_branch2c" + top: "res4d_branch2c" + name: "bn4d_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4d_branch2c" + top: "res4d_branch2c" + name: "scale4d_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4c" + bottom: "res4d_branch2c" + top: "res4d" + name: "res4d" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res4d" + top: "res4d" + name: "res4d_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4d" + top: "res4e_branch2a" + name: "res4e_branch2a" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4e_branch2a" + top: "res4e_branch2a" + name: "bn4e_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4e_branch2a" + top: "res4e_branch2a" + name: "scale4e_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4e_branch2a" + top: "res4e_branch2a" + name: "res4e_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4e_branch2a" + top: "res4e_branch2b" + name: "res4e_branch2b" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + 
bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4e_branch2b" + top: "res4e_branch2b" + name: "bn4e_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4e_branch2b" + top: "res4e_branch2b" + name: "scale4e_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4e_branch2b" + top: "res4e_branch2b" + name: "res4e_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4e_branch2b" + top: "res4e_branch2c" + name: "res4e_branch2c" + type: "Convolution" + convolution_param { + + num_output: 1024 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4e_branch2c" + top: "res4e_branch2c" + name: "bn4e_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4e_branch2c" + top: "res4e_branch2c" + name: "scale4e_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4d" + bottom: "res4e_branch2c" + top: "res4e" + name: "res4e" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res4e" + top: "res4e" + name: "res4e_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4e" + top: "res4f_branch2a" + name: "res4f_branch2a" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4f_branch2a" + top: "res4f_branch2a" + name: "bn4f_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4f_branch2a" + top: "res4f_branch2a" + name: "scale4f_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4f_branch2a" + top: "res4f_branch2a" + name: "res4f_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4f_branch2a" + top: "res4f_branch2b" + name: "res4f_branch2b" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4f_branch2b" + top: "res4f_branch2b" + name: "bn4f_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4f_branch2b" + top: "res4f_branch2b" + name: "scale4f_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4f_branch2b" + top: "res4f_branch2b" + name: "res4f_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4f_branch2b" + top: "res4f_branch2c" + name: "res4f_branch2c" + type: "Convolution" + convolution_param { + + num_output: 1024 + 
kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4f_branch2c" + top: "res4f_branch2c" + name: "bn4f_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 0 } + } +} + +layer { + bottom: "res4f_branch2c" + top: "res4f_branch2c" + name: "scale4f_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4e" + bottom: "res4f_branch2c" + top: "res4f" + name: "res4f" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res4f" + top: "res4f" + name: "res4f_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4f" + top: "res5a_branch1" + name: "res5a_branch1" + type: "Convolution" + convolution_param { + + num_output: 2048 + kernel_size: 1 + pad: 0 + stride: 2 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5a_branch1" + top: "res5a_branch1" + name: "bn5a_branch1" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res5a_branch1" + top: "res5a_branch1" + name: "scale5a_branch1" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4f" + top: "res5a_branch2a" + name: "res5a_branch2a" + type: "Convolution" + convolution_param { + + num_output: 512 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5a_branch2a" + top: "res5a_branch2a" + name: "bn5a_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res5a_branch2a" + top: "res5a_branch2a" + name: "scale5a_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res5a_branch2a" + top: "res5a_branch2a" + name: "res5a_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res5a_branch2a" + top: "res5a_branch2b" + name: "res5a_branch2b" + type: "Convolution" + convolution_param { + + num_output: 512 + kernel_size: 3 + pad: 1 + stride: 2 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5a_branch2b" + top: "res5a_branch2b" + name: "bn5a_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res5a_branch2b" + top: "res5a_branch2b" + name: "scale5a_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res5a_branch2b" + top: "res5a_branch2b" + name: "res5a_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res5a_branch2b" + top: "res5a_branch2c" + name: "res5a_branch2c" + type: "Convolution" + convolution_param { + + num_output: 2048 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + 
weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5a_branch2c" + top: "res5a_branch2c" + name: "bn5a_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res5a_branch2c" + top: "res5a_branch2c" + name: "scale5a_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res5a_branch1" + bottom: "res5a_branch2c" + top: "res5a" + name: "res5a" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res5a" + top: "res5a" + name: "res5a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res5a" + top: "res5b_branch2a" + name: "res5b_branch2a" + type: "Convolution" + convolution_param { + + num_output: 512 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5b_branch2a" + top: "res5b_branch2a" + name: "bn5b_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res5b_branch2a" + top: "res5b_branch2a" + name: "scale5b_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res5b_branch2a" + top: "res5b_branch2a" + name: "res5b_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res5b_branch2a" + top: "res5b_branch2b" + name: "res5b_branch2b" + type: "Convolution" + convolution_param { + + num_output: 512 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5b_branch2b" + top: "res5b_branch2b" + name: "bn5b_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res5b_branch2b" + top: "res5b_branch2b" + name: "scale5b_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res5b_branch2b" + top: "res5b_branch2b" + name: "res5b_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res5b_branch2b" + top: "res5b_branch2c" + name: "res5b_branch2c" + type: "Convolution" + convolution_param { + + num_output: 2048 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5b_branch2c" + top: "res5b_branch2c" + name: "bn5b_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res5b_branch2c" + top: "res5b_branch2c" + name: "scale5b_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res5a" + bottom: "res5b_branch2c" + top: "res5b" + name: "res5b" + type: "Eltwise" + eltwise_param { + } +} + +layer { + bottom: "res5b" + top: "res5b" + name: "res5b_relu" + type: "ReLU" + relu_param { + + } +} + +layer 
{ + bottom: "res5b" + top: "res5c_branch2a" + name: "res5c_branch2a" + type: "Convolution" + convolution_param { + num_output: 512 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5c_branch2a" + top: "res5c_branch2a" + name: "bn5c_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res5c_branch2a" + top: "res5c_branch2a" + name: "scale5c_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res5c_branch2a" + top: "res5c_branch2a" + name: "res5c_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res5c_branch2a" + top: "res5c_branch2b" + name: "res5c_branch2b" + type: "Convolution" + convolution_param { + num_output: 512 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5c_branch2b" + top: "res5c_branch2b" + name: "bn5c_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res5c_branch2b" + top: "res5c_branch2b" + name: "scale5c_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res5c_branch2b" + top: "res5c_branch2b" + name: "res5c_branch2b_relu" + type: "ReLU" + relu_param { + } +} + +layer { + bottom: "res5c_branch2b" + top: "res5c_branch2c" + name: "res5c_branch2c" + type: "Convolution" + convolution_param { + num_output: 2048 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5c_branch2c" + top: "res5c_branch2c" + name: "bn5c_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 0 } + } +} + +layer { + bottom: "res5c_branch2c" + top: "res5c_branch2c" + name: "scale5c_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res5b" + bottom: "res5c_branch2c" + top: "res5c" + name: "res5c" + type: "Eltwise" + eltwise_param { + } +} + +layer { + bottom: "res5c" + top: "res5c" + name: "res5c_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res5c" + top: "pool5" + name: "pool5" + type: "Pooling" + pooling_param { + kernel_size: 7 + stride: 1 + pool: AVE + } +} + +layer { + bottom: "pool5" + top: "fc1000" + name: "fc1000" + type: "InnerProduct" + inner_product_param { + num_output: 1000 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "fc1000" + bottom: "label" + top: "loss" + name: "prob" + type: "SoftmaxWithLoss" +} +layer { + name: "loss3/top-1" + type: "Accuracy" + bottom: "fc1000" + bottom: "label" + top: "loss3/top-1" +} +layer { + name: "loss3/top-5" + type: "Accuracy" + bottom: "fc1000" + bottom: "label" + top: "loss3/top-5" + accuracy_param { + top_k: 5 + } +} From c55cac361e62ef1d99b9974d1d4d51665ad26e66 Mon Sep 17 00:00:00 2001 
From: Haihao Shen Date: Sat, 26 Aug 2017 06:42:13 +0800 Subject: [PATCH 33/38] Support padded layout --- src/caffe/mkldnn_memory.cpp | 3 +- src/caffe/solvers/sgd_solver.cpp | 107 ++++--------------------------- 2 files changed, 13 insertions(+), 97 deletions(-) diff --git a/src/caffe/mkldnn_memory.cpp b/src/caffe/mkldnn_memory.cpp index 6e42e691d..c53cff7ff 100644 --- a/src/caffe/mkldnn_memory.cpp +++ b/src/caffe/mkldnn_memory.cpp @@ -212,8 +212,7 @@ void MKLDNNMemoryDescriptor::convert_from_extprv(shared_ptr_reorder_extprv2prv_pd == NULL) return; - if (this->_extprv_memory_pd->desc().data.format == this->_prv_memory_pd->desc().data.format && - this->_extprv_memory_pd->desc().data.data_type == this->_prv_memory_pd->desc().data.data_type) + if (*this->_extprv_memory_pd == *this->_prv_memory_pd) { #ifdef DEBUG LOG(INFO) << "The format and data_type of _extprv_memory_pd and _prv_memory_pd is same, no need do conversion."; diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index 929ff050f..6a7e2ca43 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -354,38 +354,25 @@ void SGDSolver::SGDFusion(int param_id, Dtype rate) { bool prv_diff_condition_flag = false; if (net_params[param_id]->prv_diff() && (net_params[param_id]->prv_diff_count() - == net_params[param_id]->prv_data_count())) { + == net_params[param_id]->count())) { prv_diff_condition_flag = true; - //LOG(INFO) << "Common condition judgement: prv_diff_condition_flag = true."; - } - else - { - //LOG(INFO) << "Common condition judgement: prv_diff_condition_flag = false."; } //#pragma endregion //#pragma region 3. Normalize stage if (skip_Normalize_stage_flag == false) { - //LOG(INFO) << "Normalize stage: Normalize stage is not skipped."; - const Dtype accum_normalization = Dtype(1.) 
/ this->param_.iter_size(); if (prv_diff_condition_flag) { - //LOG(INFO) << "Normalize stage: prv_diff_condition_flag = true."; - caffe_scal(net_params[param_id]->prv_data_count(), accum_normalization, + caffe_scal(net_params[param_id]->prv_diff_count(), accum_normalization, net_params[param_id]->mutable_prv_diff()); } else { - //LOG(INFO) << "Normalize stage: prv_diff_condition_flag = false."; caffe_scal(net_params[param_id]->count(), accum_normalization, net_params[param_id]->mutable_cpu_diff()); } } - else - { - //LOG(INFO) << "Normalize stage: Normalize stage is skipped."; - } //#pragma endregion //For most common topologies from BVLC, all skipped the Normalize stage, and use L2 regularization @@ -401,97 +388,35 @@ void SGDSolver::SGDFusion(int param_id, Dtype rate) { //Regularize stage (Fused ComputeUpdateValue_stage in some situations) if (local_decay) { if (regularization_type == "L2") { - //LOG(INFO) << "Regularize stage: regularization_type == L2."; // add weight decay if (net_params[param_id]->prv_data() && (net_params[param_id]->prv_data_count() == net_params[param_id]->count())) { - //LOG(INFO) << "Regularize stage: prv_data_condition_flag = true."; CHECK_EQ(true, net_params[param_id]->get_prv_data_descriptor()->layout_compare( net_params[param_id]->get_prv_diff_descriptor())); - /* - caffe_axpy(net_params[param_id]->count(), - local_decay, - net_params[param_id]->prv_data(), - net_params[param_id]->mutable_prv_diff()); - */ if (prv_diff_condition_flag) { - //situation (1) - //LOG(INFO) << "Fused ComputeUpdateValue stage: prv_diff_condition_flag = true."; - /* - caffe_cpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->prv_diff(), momentum, - history_[param_id]->mutable_cpu_data()); - - caffe_copy(net_params[param_id]->count(), - history_[param_id]->cpu_data(), - net_params[param_id]->mutable_prv_diff()); - */ - - if(net_params[param_id]->prv_data_count() != history_[param_id]->count()) - history_[param_id]->Reshape(net_params[param_id]->shape()); - axpy_axpby_copy_axpy(net_params[param_id]->prv_data_count(), local_decay, net_params[param_id]->mutable_prv_data(), net_params[param_id]->mutable_prv_diff(), local_rate, momentum, history_[param_id]->mutable_cpu_data(), Dtype(-1)); is_separate_ComputeUpdateValue_Update = false; } - else - { - //Will not happen! - //LOG(INFO) << "Cannot Fused ComputeUpdateValue stage: prv_diff_condition_flag = false."; - } } else { - //LOG(INFO) << "Regularize stage: prv_data_condition_flag = false."; - /* - caffe_axpy(net_params[param_id]->count(), - local_decay, - net_params[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); - */ if (!prv_diff_condition_flag) { - //situation (2) - //LOG(INFO) << "Fused ComputeUpdateValue stage: prv_diff_condition_flag = false."; - /* - caffe_cpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->cpu_diff(), momentum, - history_[param_id]->mutable_cpu_data()); - - caffe_copy(net_params[param_id]->count(), - history_[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); - */ - axpy_axpby_copy_axpy(net_params[param_id]->count(), local_decay, net_params[param_id]->mutable_cpu_data(), net_params[param_id]->mutable_cpu_diff(), local_rate, momentum, history_[param_id]->mutable_cpu_data(), Dtype(-1)); is_separate_ComputeUpdateValue_Update = false; } - else - { - //Will not happen! 
- //LOG(INFO) << "Cannot Fused ComputeUpdateValue stage: prv_diff_condition_flag = true."; - if(net_params[param_id]->prv_data_count() != history_[param_id]->count()) - history_[param_id]->Reshape(net_params[param_id]->shape()); - } } } else if (regularization_type == "L1") { - //LOG(INFO) << "Regularize stage: regularization_type == L1."; caffe_cpu_sign(net_params[param_id]->count(), net_params[param_id]->cpu_data(), temp_[param_id]->mutable_cpu_data()); - /* - caffe_axpy(net_params[param_id]->count(), - local_decay, - temp_[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); - */ - axpy_axpby_copy(net_params[param_id]->count(), local_decay, temp_[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff(), local_rate, momentum, history_[param_id]->mutable_cpu_data()); @@ -513,18 +438,14 @@ void SGDSolver::SGDFusion(int param_id, Dtype rate) { //No Regularize stage, only ComputeUpdateValue stage //ComputeUpdateValue stage if (prv_diff_condition_flag) { - //LOG(INFO) << "ComputeUpdateValue stage: prv_diff_condition_flag = true."; - if(net_params[param_id]->prv_data_count() != history_[param_id]->count()) - history_[param_id]->Reshape(net_params[param_id]->shape()); - caffe_cpu_axpby(net_params[param_id]->prv_data_count(), local_rate, + caffe_cpu_axpby(net_params[param_id]->prv_diff_count(), local_rate, net_params[param_id]->prv_diff(), momentum, history_[param_id]->mutable_cpu_data()); - caffe_copy(net_params[param_id]->prv_data_count(), + caffe_copy(net_params[param_id]->count(), history_[param_id]->cpu_data(), net_params[param_id]->mutable_prv_diff()); } else { - //LOG(INFO) << "ComputeUpdateValue stage: prv_diff_condition_flag = false."; caffe_cpu_axpby(net_params[param_id]->count(), local_rate, net_params[param_id]->cpu_diff(), momentum, history_[param_id]->mutable_cpu_data()); @@ -537,7 +458,6 @@ void SGDSolver::SGDFusion(int param_id, Dtype rate) { //Update stage (separate) net_params[param_id]->Update(); } - } #endif /* ENABLE_SGD_FUSION */ @@ -561,12 +481,10 @@ void SGDSolver::Normalize(int param_id) { if (net_params[param_id]->prv_diff() && (net_params[param_id]->prv_diff_count() == net_params[param_id]->count())) { - //LOG(INFO) << "Normalize stage: prv_diff_condition_flag = true."; - caffe_scal(net_params[param_id]->count(), accum_normalization, + caffe_scal(net_params[param_id]->prv_diff_count(), accum_normalization, net_params[param_id]->mutable_prv_diff()); } else { - //LOG(INFO) << "Normalize stage: prv_diff_condition_flag = false."; caffe_scal(net_params[param_id]->count(), accum_normalization, net_params[param_id]->mutable_cpu_diff()); } @@ -599,29 +517,25 @@ void SGDSolver::Regularize(int param_id) { case Caffe::CPU: { if (local_decay) { if (regularization_type == "L2") { - //LOG(INFO) << "Regularize stage: regularization_type == L2."; // add weight decay if (net_params[param_id]->prv_data() && (net_params[param_id]->prv_data_count() == net_params[param_id]->count())) { - //LOG(INFO) << "Regularize stage: prv_data_condition_flag = true."; CHECK_EQ(true, net_params[param_id]->get_prv_data_descriptor()->layout_compare( net_params[param_id]->get_prv_diff_descriptor())); - caffe_axpy(net_params[param_id]->count(), + caffe_axpy(net_params[param_id]->prv_data_count(), local_decay, net_params[param_id]->prv_data(), net_params[param_id]->mutable_prv_diff()); } else { - //LOG(INFO) << "Regularize stage: prv_data_condition_flag = false."; caffe_axpy(net_params[param_id]->count(), local_decay, net_params[param_id]->cpu_data(), 
net_params[param_id]->mutable_cpu_diff()); } } else if (regularization_type == "L1") { - //LOG(INFO) << "Regularize stage: regularization_type == L1."; caffe_cpu_sign(net_params[param_id]->count(), net_params[param_id]->cpu_data(), temp_[param_id]->mutable_cpu_data()); @@ -692,8 +606,7 @@ void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) { if (net_params[param_id]->prv_diff() && (net_params[param_id]->prv_diff_count() == net_params[param_id]->count())) { - //LOG(INFO) << "ComputeUpdateValue stage: prv_diff_condition_flag = true."; - caffe_cpu_axpby(net_params[param_id]->count(), local_rate, + caffe_cpu_axpby(net_params[param_id]->prv_diff_count(), local_rate, net_params[param_id]->prv_diff(), momentum, history_[param_id]->mutable_cpu_data()); @@ -701,7 +614,6 @@ void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) { history_[param_id]->cpu_data(), net_params[param_id]->mutable_prv_diff()); } else { - //LOG(INFO) << "ComputeUpdateValue stage: prv_diff_condition_flag = false."; caffe_cpu_axpby(net_params[param_id]->count(), local_rate, net_params[param_id]->cpu_diff(), momentum, history_[param_id]->mutable_cpu_data()); @@ -709,6 +621,11 @@ void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) { caffe_copy(net_params[param_id]->count(), history_[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff()); + + if (net_params[param_id]->prv_diff_count() + != net_params[param_id]->count()) { + net_params[param_id]->mutable_prv_diff(); + } } break; } From d5789e76dc7c413dfc22dc6d71d05d81335c93d2 Mon Sep 17 00:00:00 2001 From: Haihao Shen Date: Tue, 29 Aug 2017 07:50:39 +0800 Subject: [PATCH 34/38] Fix the issue of padded layout under MKL2017 --- src/caffe/solvers/sgd_solver.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index 6a7e2ca43..5347dcdf7 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -622,8 +622,9 @@ void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) { history_[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff()); - if (net_params[param_id]->prv_diff_count() - != net_params[param_id]->count()) { + if (net_params[param_id]->prv_diff() + && (net_params[param_id]->prv_diff_count() + != net_params[param_id]->count())) { net_params[param_id]->mutable_prv_diff(); } } From 5440cd4f693e81c11eb74841f2c0bd903e268a75 Mon Sep 17 00:00:00 2001 From: fzou1 Date: Thu, 31 Aug 2017 12:42:37 +0800 Subject: [PATCH 35/38] add script for running caffe on single node and multiple nodes with Intel CPUs Change-Id: I0299102309bd6e18794f6e454002faa5db63613e --- scripts/run_intelcaffe.sh | 604 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 604 insertions(+) create mode 100755 scripts/run_intelcaffe.sh diff --git a/scripts/run_intelcaffe.sh b/scripts/run_intelcaffe.sh new file mode 100755 index 000000000..29a5309ab --- /dev/null +++ b/scripts/run_intelcaffe.sh @@ -0,0 +1,604 @@ +#!/bin/bash +set -x + +benchmark_mode="all" + +# time/train/resume_train +mode="train" + +# it's assigned by detect_cpu +cpu_model=skx + +# a list of nodes +host_file="" + +# network parameters +network="opa" +tcp_netmask="" + +# specify number of MLSL ep servers via command line +num_mlsl_servers=-1 + +# parameters for caffe time +iteration=0 +model_file="" + +# parameters for resuming training +snapshot="" + +# parameters for training +solver_file="" + +# specify engine for running caffe +engine="MKL2017" + +result_dir="" +debug="off" + +function usage +{
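+ # Example invocation (the host file name and solver path below are illustrative; adjust to your cluster): + # ./scripts/run_intelcaffe.sh --host hosts.txt --network opa \ + # --solver models/intel_optimized_models/multinode/googlenet_4nodes/solver.prototxt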
+ script_name=$0 + echo "Usage:" + echo " $script_name --host host_file [--solver solver_file]" + echo " [--network opa/tcp] [--netmask tcp_netmask] [--debug on/off]" + echo " [--mode train/resume_train/time/none] [--benchmark all/qperf/mpi/none]" + echo " [--iteration iter] [--model_file deploy.prototxt]" + echo " [--snapshot snapshot.caffemodel]" + echo " [--num_mlsl_servers num_mlsl_servers]" + echo " [--output output_folder]" + echo "" + echo " Parameters:" + echo " host: host file that includes the list of nodes." + echo "" + echo " Optional parameters:" + echo " solver: specify solver file if mode is train/resume_train" + echo " network: opa(default), tcp" + echo " netmask: only used if network is tcp" + echo " debug: off(default). MLSL debug information is output if it's on" + echo " mode: train(default), resume_train, time, none(do not run caffe)" + echo " benchmark: all(default). Includes qperf, all-reduce performance" + echo " Dependency: user needs to install qperf, IMB-MPI1," + echo " and add them to the system path." + echo " iteration and model_file: only used if mode is time (caffe time)" + echo " snapshot: only used if mode is resume_train" + echo " num_mlsl_servers: number of MLSL ep servers" + echo " output_folder: output folder for storing results" +} + +declare -a cpu_list=("Intel Xeon E5-26xx (Broadwell)" "Intel Xeon Phi 72xx (Knight Landing)" + "Intel Xeon Platinum 8180 (Skylake)" "Intel Xeon 6148 (Skylake)") + +function detect_cpu +{ + # detect cpu model + model_string=`lscpu | grep "Model name" | awk -F ':' '{print $2}'` + if [[ $model_string == *"72"* ]]; then + cpu_model=knl + elif [[ $model_string == *"8180"* ]]; then + cpu_model=skx + elif [[ $model_string == *"6148"* ]]; then + cpu_model=skx + elif [[ $model_string == *"E5-26"* ]]; then + cpu_model=bdw + else + echo "CPU model: $model_string" + echo " Use default settings, which may not be optimal." + fi +} + +function set_numa_node +{ + # detect numa mode: cache and flat mode for KNL + numa_node=($(numactl -H | grep "available" | awk -F ' ' '{print $2}')) + if [ $numa_node -eq 1 ]; then + echo "Cache mode." + # cache mode, use numa node 0 + numanode=0 + else + echo "Flat mode." + numanode=1 + fi +} + + +function check_dependency +{ + dep=$1 + which $dep >/dev/null 2>&1 + if [ $? -ne 0 ]; then + echo "Warning: cannot find $dep" + return 1 + fi + return 0 +} + + +function init_mpi_envs +{ + # IMPI configuration + if [ "$network" == "opa" ]; then + export I_MPI_FABRICS=tmi + export I_MPI_TMI_PROVIDER=psm2 + if [ "$cpu_model" == "knl" ]; then + # PSM2 configuration + export PSM2_MQ_RNDV_HFI_WINDOW=4194304 #2097152 # to work around PSM2 bug in IFS 10.2 and 10.3 + export PSM2_MQ_EAGER_SDMA_SZ=65536 + export PSM2_MQ_RNDV_HFI_THRESH=200000 + fi + + export PSM2_IDENTIFY=1 # for debug + elif [ "$network" == "tcp" ]; then + export I_MPI_FABRICS=tcp + export I_MPI_TCP_NETMASK=$tcp_netmask + else + echo "Invalid network: $network" + exit 1 + fi + + export I_MPI_FALLBACK=0 + export I_MPI_DEBUG=6 +} + + +function clear_shm +{ + clear_command="rm -rf /dev/shm/*" + check_shm_command="df -h | grep shm" + + # TODO: check if 50G is the minimum shm size?
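+ # Clear /dev/shm on every node and require a minimum amount of free space there; the multi-node run keeps shared-memory files in /dev/shm, so too little space can make it fail.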
+ min_shm_size=50 + shm_unit="G" + + for node in "${nodenames[@]}" + do + ssh ${node} "$clear_command" + shm_line=`ssh ${node} "$check_shm_command"` + shm_string=`echo $shm_line | awk -F ' ' '{print $(NF-2)}'` + unit="${shm_string:(-1)}" + shm_size=${shm_string::-1} + if [ "$unit" == "$shm_unit" ] && [ $shm_size -ge ${min_shm_size} ]; then + continue + else + echo "Error: /dev/shm size = ${shm_size}${unit}, on node: ${node}." + echo " It's less than minimum size: ${min_shm_size}${shm_unit}." + echo " Please clean or enlarge it." + exit 1 + fi + done +} + +function kill_zombie_processes +{ + kill_command="for process in ep_server caffe mpiexec.hydra; do for i in \$(ps -e | grep -w \$process | awk -F ' ' '{print \$1}'); do kill -9 \$i; echo \"\$process \$i killed.\"; done; done" + for node in "${nodenames[@]}" + do + ssh ${node} "$kill_command" + done +} + +function clear_envs +{ + clear_shm + kill_zombie_processes +} + +function set_mlsl_vars +{ + if [ "${num_mlsl_servers}" -eq -1 ]; then + if [ ${numnodes} -eq 1 ]; then + numservers=0 + else + if [ ${cpu_model} == knl ]; then + numservers=4 + else + numservers=2 + fi + fi + else + numservers=$((num_mlsl_servers)) + fi + + echo "MLSL_NUM_SERVERS: $numservers" + export MLSL_NUM_SERVERS=${numservers} + + if [ ${numservers} -gt 0 ]; then + if [ ${cpu_model} == knl ]; then + listep=6,7,8,9,10,11,12,13 + else + listep=6,7,8,9 + fi + export MLSL_SERVER_AFFINITY="${listep}" + echo "MLSL_SERVER_AFFINITY: ${listep}" + fi + + # MLSL configuration + if [ "$debug" == "on" ]; then + export MLSL_LOG_LEVEL=3 + else + export MLSL_LOG_LEVEL=0 + fi +} + +function set_env_vars +{ + set_mlsl_vars + + ppncpu=1 + threadspercore=1 + + cores=`lscpu | grep "Core(s) per socket:" | awk '{print $4}'` + sockets=`lscpu | grep "Socket(s)" | awk '{print $2}'` + maxcores=$((cores*sockets)) + + numthreads=$(((maxcores-numservers)*threadspercore)) + numthreads_per_proc=$((numthreads/ppncpu)) + + export OMP_NUM_THREADS=${numthreads_per_proc} + + # OMP configuration + # threadspercore=1 + affinitystr="proclist=[0-5,$((5+numservers+1))-$((maxcores-1))],granularity=thread,explicit" + export KMP_HW_SUBSET=1t + export KMP_AFFINITY=$affinitystr +} + +function execute_command +{ + local xeonbin_=$1 + local result_dir_=$2 + + if [ ${cpu_model} == knl ]; then + exec_command="numactl --preferred=$numanode $xeonbin_" + else + exec_command="$xeonbin_" + fi + + if [ ${numnodes} -gt 1 ]; then + # Produce the configuration file for mpiexec. + # Each line of the config file contains a host, environment, and binary name. + cfile_=nodeconfig-${cpu_model}-${numnodes}.txt + rm -f $cfile_ + + for node in "${nodenames[@]}" + do + echo "-host ${node} -n $ppncpu $exec_command" >> $cfile_ + done + fi + + clear_envs + log_file=outputCluster-${cpu_model}-${numnodes}.txt + + sensors_bin="sensors" + check_dependency $sensors_bin + has_sensors=$?
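+ # When lm-sensors is available, record sensor readings before and after the run, e.g. to help spot thermal throttling when comparing results.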
+ if [ $has_sensors -eq 0 ]; then + sensor_log_file=sensors-${cpu_model}-${numnodes}-start.log + $sensors_bin >$sensor_log_file + mv $sensor_log_file $result_dir_/ + fi + + if [ ${numnodes} -eq 1 ]; then + time GLOG_minloglevel=0 $exec_command >${log_file} 2>&1 + else + init_mpi_envs + exec_command="-l -configfile $cfile_" + time GLOG_minloglevel=0 mpiexec.hydra $exec_command >${log_file} 2>&1 + fi + + if [ $has_sensors -eq 0 ]; then + sensor_log_file=sensors-${cpu_model}-${numnodes}-end.log + $sensors_bin >$sensor_log_file + mv $sensor_log_file $result_dir_/ + fi + mv $log_file $cfile_ $result_dir_/ +} + +function run_qperf_bench +{ + qperf_bin="qperf" + check_dependency $qperf_bin + if [ $? -ne 0 ]; then + echo "Skip qperf benchmark." + return + fi + + # measure bandwidth and latency + qperf_result_log="qperf_bench_result.log" + rm -f $qperf_result_log + + server_node="" + port=1234567 + qperf_param="-lp $port -oo msg_size:1024:512M:*2 -vu tcp_bw tcp_lat" + + for ((i=0; i<numnodes-1; i++)) + do + server_node=${nodenames[$i]} + ssh -f ${server_node} "$qperf_bin -lp $port" + echo "qperf server: ${server_node}" >> $qperf_result_log + echo >>$qperf_result_log + + for ((j=i+1; j<numnodes; j++)) + do + client_node=${nodenames[$j]} + ssh ${client_node} "$qperf_bin ${server_node} $qperf_param" >>$qperf_result_log + done + done + + mv $qperf_result_log $result_dir/ +} + +function run_mpi_bench +{ + # MPI benchmark + mpibench_bin="IMB-MPI1" + check_dependency $mpibench_bin + if [ $? -ne 0 ]; then + echo "Skip MPI benchmark..." + return + fi + + xeonbin="$mpibench_bin allreduce" + + declare -a adjust_values=(1 2 3 5 7 8 9 0) + declare -a collective_values=('tmi' 'none') + + echo "Start mpi bench..." + for ((i=0; i<${#adjust_values[@]}; i++)) + do + for ((j=0; j<${#collective_values[@]}; j++)) + do + if [ ${adjust_values[$i]} -eq 0 ]; then + unset I_MPI_ADJUST_ALLREDUCE + else + export I_MPI_ADJUST_ALLREDUCE=${adjust_values[$i]} + fi + + if [ "${collective_values[$j]}" == "none" ]; then + unset I_MPI_COLLECTIVE_DEFAULTS + else + export I_MPI_COLLECTIVE_DEFAULTS=${collective_values[$j]} + fi + echo "iteration $i, ${j}..." + echo "I_MPI_ADJUST_ALLREDUCE=$I_MPI_ADJUST_ALLREDUCE" + echo "I_MPI_COLLECTIVE_DEFAULTS=$I_MPI_COLLECTIVE_DEFAULTS" + + test_result_dir=$result_dir/mpibench-${adjust_values[$i]}-${collective_values[$j]} + mkdir -p $test_result_dir + execute_command "$xeonbin" $test_result_dir + done + done + + # TODO: analyze the report and select the best algorithm and setting + unset I_MPI_COLLECTIVE_DEFAULTS + unset I_MPI_ADJUST_ALLREDUCE + + echo "Finished." +} + +function run_benchmark +{ + echo "Run benchmark with ${numnodes} nodes..." + if [ $numnodes -gt 1 ]; then + if [ "$benchmark_mode" == "all" ] || [ "$benchmark_mode" == "qperf" ]; then + run_qperf_bench + fi + + if [ "$benchmark_mode" == "all" ] || [ "$benchmark_mode" == "mpi" ]; then + set_env_vars + run_mpi_bench + fi + fi +} + +function run_caffe +{ + echo "Run caffe with ${numnodes} nodes..."
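+ # Build the caffe command line: "caffe time" benchmarks a model for a fixed number of iterations; otherwise "caffe train" is used, optionally resuming from a snapshot.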
+ + if [ ${mode} == "time" ]; then + xeonbin="$caffe_bin time --iterations $iteration --model $model_file -engine=$engine" + else + xeonbin="$caffe_bin train --solver $solver_file -engine=$engine" + if [ ${mode} == "resume_train" ]; then + xeonbin+=" --snapshot=${snapshot}" + fi + fi + + set_env_vars + execute_command "$xeonbin" $result_dir +} + + +if [ $# -le 1 ]; then + usage + exit 0 +fi + +root_dir=$(cd $(dirname $(dirname $0)); pwd) +result_dir=${root_dir}/"result-`date +%Y%m%d%H%M%S`" + +while [[ $# -gt 1 ]] +do + key="$1" + case $key in + --solver) + solver_file="$2" + shift + ;; + --host) + host_file="$2" + shift + ;; + --network) + network="$2" + shift + ;; + --netmask) + tcp_netmask="$2" + shift + ;; + --debug) + debug="$2" + shift + ;; + --num_mlsl_servers) + num_mlsl_servers=$2 + shift + ;; + --mode) + mode=$2 + shift + ;; + --iteration) + iteration=$2 + shift + ;; + --model_file) + model_file=$2 + shift + ;; + --snapshot) + snapshot=$2 + shift + ;; + --engine) + engine=$2 + shift + ;; + --benchmark) + benchmark_mode=$2 + shift + ;; + --output) + result_dir=$2 + shift + ;; + *) + echo "Unknown option: $key" + usage + exit 1 + ;; + esac + shift +done + +# check parameters +if [ "$host_file" == "" ]; then + echo "Error: host file is NOT specified." + exit 1 +fi +if [ ! -f $host_file ]; then + echo "Error: host file does NOT exist." + exit 1 +fi + +echo "" +echo "CPUs with optimal settings:" +for ((i=0; i<${#cpu_list[@]}; i++)) +do + echo " ${cpu_list[$i]}" +done +echo "" +echo "Settings:" +echo " Host file: $host_file" +echo " Running mode: $mode" +echo " Benchmark: $benchmark_mode" +echo " Debug option: $debug" +echo " Engine: $engine" +echo " Number of MLSL servers: $num_mlsl_servers" +echo " -1: selected automatically according to CPU model." +echo " BDW/SKX: 2, KNL: 4" + + +if [ "$mode" == "train" ] || [ "$mode" == "resume_train" ]; then + if [ "$solver_file" == "" ]; then + echo "Error: solver file is NOT specified." + exit 1 + fi + if [ ! -f $solver_file ]; then + echo "Error: solver file does NOT exist." + exit 1 + fi + + echo " Solver file: $solver_file" + + if [ "$mode" == "resume_train" ]; then + if [ "$snapshot" == "" ]; then + echo "Error: snapshot is NOT specified." + exit 1 + fi + if [ ! -f $snapshot ]; then + echo "Error: snapshot file does NOT exist." + exit 1 + fi + echo " Snapshot for resuming train: $snapshot" + fi +fi + +if [ "$mode" == "time" ]; then + if [ "$model_file" == "" ]; then + echo "Error: model file is NOT specified." + exit 1 + fi + if [ ! -f $model_file ]; then + echo "Error: model file does NOT exist." + exit 1 + fi + + if [ $iteration -le 0 ]; then + echo "Error: iteration ($iteration) <= 0." + exit 1 + fi + echo " Iteration for running caffe time: $iteration" + echo " Model file for running caffe time: $model_file" +fi + +echo " Network: $network" +if [ "$network" == "tcp" ]; then + if [ "$tcp_netmask" == "" ]; then + echo "Error: TCP netmask is NOT specified." + exit 1 + fi + echo " Netmask for TCP network: $tcp_netmask" +fi + +# Read the node list from the host file +nodenames=( `cat $host_file | sort | uniq ` ) +if [ ${#nodenames[@]} -eq 0 ]; then + echo "Error: empty host file! Exit." + exit 1 +fi +numnodes=${#nodenames[@]} +echo "Number of nodes: $numnodes" + +detect_cpu + +if [ $cpu_model == knl ]; then + set_numa_node +fi + +if [ !
-d $result_dir ]; then + echo "Create result directory: $result_dir" + mkdir -p $result_dir +fi + +if [ "${benchmark_mode}" != "none" ]; then + run_benchmark +fi + +if [ "${mode}" != "none" ]; then + caffe_bin="./build/tools/caffe" + check_dependency $caffe_bin + if [ $? -ne 0 ]; then + echo "Exit." + exit 0 + fi + + run_caffe +fi + +echo "Result folder: $result_dir" From 29faeaee6dbb3d67242a7d2efbf0f52016568ad4 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Tue, 5 Sep 2017 00:14:42 +0800 Subject: [PATCH 36/38] update mkldnn version to b01e3a55a07be62172e713bcd2644c5176360212 --- mkldnn.commit | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkldnn.commit b/mkldnn.commit index 4c279b216..9abcb727c 100644 --- a/mkldnn.commit +++ b/mkldnn.commit @@ -1 +1 @@ -27420a241b2efd8d88f1e003635434194fdfb1b8 +b01e3a55a07be62172e713bcd2644c5176360212 From 7605ad37f69d9a4dcf35f3efe714b59f4d87c0c3 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Tue, 5 Sep 2017 00:30:18 +0800 Subject: [PATCH 37/38] change googlenet_4node max_iteration to 450000 --- .../multinode/googlenet_4nodes/solver.prototxt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/intel_optimized_models/multinode/googlenet_4nodes/solver.prototxt b/models/intel_optimized_models/multinode/googlenet_4nodes/solver.prototxt index 589971c10..773a61852 100644 --- a/models/intel_optimized_models/multinode/googlenet_4nodes/solver.prototxt +++ b/models/intel_optimized_models/multinode/googlenet_4nodes/solver.prototxt @@ -19,7 +19,7 @@ average_loss: 40 base_lr: 0.04 lr_policy: "poly" power: 0.5 -max_iter: 350000 +max_iter: 450000 momentum: 0.9 weight_decay: 0.0002 snapshot: 50000 From d9d52b7e2c972310a09c8e84766067576fc4bd75 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Tue, 5 Sep 2017 00:35:35 +0800 Subject: [PATCH 38/38] add googlenet v2 8 nodes solver --- .../googlenet_16nodes/solver.prototxt | 27 - .../googlenet_16nodes/train_val.prototxt | 2434 ----------------- .../googlenet_v2_4nodes/solver.prototxt | 24 - .../googlenet_v2_8nodes/solver.prototxt | 15 + .../train_val.prototxt | 0 .../resnet_50_16_nodes/solver.prototxt | 15 - .../resnet_50_16_nodes/train_val.prototxt | 2306 ---------------- 7 files changed, 15 insertions(+), 4806 deletions(-) delete mode 100644 models/intel_optimized_models/multinode/googlenet_16nodes/solver.prototxt delete mode 100644 models/intel_optimized_models/multinode/googlenet_16nodes/train_val.prototxt delete mode 100644 models/intel_optimized_models/multinode/googlenet_v2_4nodes/solver.prototxt create mode 100644 models/intel_optimized_models/multinode/googlenet_v2_8nodes/solver.prototxt rename models/intel_optimized_models/multinode/{googlenet_v2_4nodes => googlenet_v2_8nodes}/train_val.prototxt (100%) delete mode 100644 models/intel_optimized_models/multinode/resnet_50_16_nodes/solver.prototxt delete mode 100644 models/intel_optimized_models/multinode/resnet_50_16_nodes/train_val.prototxt diff --git a/models/intel_optimized_models/multinode/googlenet_16nodes/solver.prototxt b/models/intel_optimized_models/multinode/googlenet_16nodes/solver.prototxt deleted file mode 100644 index 4c9b59fc4..000000000 --- a/models/intel_optimized_models/multinode/googlenet_16nodes/solver.prototxt +++ /dev/null @@ -1,27 +0,0 @@ -#This is Intel(R) optimized (in terms of time to train) version of solver for model described in the [GoogLeNet](http://arxiv.org/abs/1409.4842) publication. -#Original solver.prototxt can be found in /models/bvlc_googlenet/ directory of this repository. 
-#Differences: -#- base_lr is set to 0.065 -#- max_iter is set to 100000 -# -#- bias_filler value changed to 0.1 -# -#Top-5 and Top-1 results achieved with this version of solver: -#Top-5: 88.74% -#Top-1: 68.35% -#Training was performed using server equipped with Intel(R) Xeon Phi(TM) CPU 7250 processor. -net: "models/intel_optimized_models/multinode/googlenet_16nodes/train_val.prototxt" -#test_iter: 1000 -#test_interval: 10000 -#test_initialization: false -display: 40 -average_loss: 40 -base_lr: 0.065 -lr_policy: "poly" -power: 0.5 -max_iter: 100000 -momentum: 0.9 -weight_decay: 0.0002 -snapshot: 50000 -snapshot_prefix: "models/intel_optimized_models/multinode/googlenet_16nodes/googlenet" -solver_mode: CPU diff --git a/models/intel_optimized_models/multinode/googlenet_16nodes/train_val.prototxt b/models/intel_optimized_models/multinode/googlenet_16nodes/train_val.prototxt deleted file mode 100644 index f5276ab97..000000000 --- a/models/intel_optimized_models/multinode/googlenet_16nodes/train_val.prototxt +++ /dev/null @@ -1,2434 +0,0 @@ -name: "GoogleNet" -layer { - name: "data" - type: "Data" - top: "data" - top: "label" - include { - phase: TRAIN - } - transform_param { - mirror: true - crop_size: 224 - mean_value: 104 - mean_value: 117 - mean_value: 123 - } - data_param { - source: "examples/imagenet/ilsvrc12_train_lmdb" - batch_size: 64 - backend: LMDB - shuffle: true - } -} -layer { - name: "data" - type: "Data" - top: "data" - top: "label" - include { - phase: TEST - } - transform_param { - mirror: false - crop_size: 224 - mean_value: 104 - mean_value: 117 - mean_value: 123 - } - data_param { - source: "examples/imagenet/ilsvrc12_val_lmdb" - batch_size: 50 - backend: LMDB - } -} -layer { - name: "conv1/7x7_s2" - type: "Convolution" - bottom: "data" - top: "conv1/7x7_s2" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - pad: 3 - kernel_size: 7 - stride: 2 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "conv1/relu_7x7" - type: "ReLU" - bottom: "conv1/7x7_s2" - top: "conv1/7x7_s2" -} -layer { - name: "pool1/3x3_s2" - type: "Pooling" - bottom: "conv1/7x7_s2" - top: "pool1/3x3_s2" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "pool1/norm1" - type: "LRN" - bottom: "pool1/3x3_s2" - top: "pool1/norm1" - lrn_param { - local_size: 5 - alpha: 0.0001 - beta: 0.75 - } -} -layer { - name: "conv2/3x3_reduce" - type: "Convolution" - bottom: "pool1/norm1" - top: "conv2/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "conv2/relu_3x3_reduce" - type: "ReLU" - bottom: "conv2/3x3_reduce" - top: "conv2/3x3_reduce" -} -layer { - name: "conv2/3x3" - type: "Convolution" - bottom: "conv2/3x3_reduce" - top: "conv2/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 192 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "conv2/relu_3x3" - type: "ReLU" - bottom: "conv2/3x3" - top: "conv2/3x3" -} -layer { - name: "conv2/norm2" - type: "LRN" - bottom: "conv2/3x3" - top: "conv2/norm2" - lrn_param { - local_size: 5 - alpha: 0.0001 - beta: 0.75 - } -} -layer { 
- name: "pool2/3x3_s2" - type: "Pooling" - bottom: "conv2/norm2" - top: "pool2/3x3_s2" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "inception_3a/1x1" - type: "Convolution" - bottom: "pool2/3x3_s2" - top: "inception_3a/1x1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_3a/relu_1x1" - type: "ReLU" - bottom: "inception_3a/1x1" - top: "inception_3a/1x1" -} -layer { - name: "inception_3a/3x3_reduce" - type: "Convolution" - bottom: "pool2/3x3_s2" - top: "inception_3a/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 96 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_3a/relu_3x3_reduce" - type: "ReLU" - bottom: "inception_3a/3x3_reduce" - top: "inception_3a/3x3_reduce" -} -layer { - name: "inception_3a/3x3" - type: "Convolution" - bottom: "inception_3a/3x3_reduce" - top: "inception_3a/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_3a/relu_3x3" - type: "ReLU" - bottom: "inception_3a/3x3" - top: "inception_3a/3x3" -} -layer { - name: "inception_3a/5x5_reduce" - type: "Convolution" - bottom: "pool2/3x3_s2" - top: "inception_3a/5x5_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 16 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_3a/relu_5x5_reduce" - type: "ReLU" - bottom: "inception_3a/5x5_reduce" - top: "inception_3a/5x5_reduce" -} -layer { - name: "inception_3a/5x5" - type: "Convolution" - bottom: "inception_3a/5x5_reduce" - top: "inception_3a/5x5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 32 - pad: 2 - kernel_size: 5 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_3a/relu_5x5" - type: "ReLU" - bottom: "inception_3a/5x5" - top: "inception_3a/5x5" -} -layer { - name: "inception_3a/pool" - type: "Pooling" - bottom: "pool2/3x3_s2" - top: "inception_3a/pool" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 1 - pad: 1 - } -} -layer { - name: "inception_3a/pool_proj" - type: "Convolution" - bottom: "inception_3a/pool" - top: "inception_3a/pool_proj" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 32 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_3a/relu_pool_proj" - type: "ReLU" - bottom: "inception_3a/pool_proj" - top: "inception_3a/pool_proj" -} -layer { - name: "inception_3a/output" - type: "Concat" - bottom: "inception_3a/1x1" - bottom: "inception_3a/3x3" - bottom: "inception_3a/5x5" - bottom: "inception_3a/pool_proj" - top: "inception_3a/output" -} -layer { - name: "inception_3b/1x1" - type: "Convolution" - bottom: 
"inception_3a/output" - top: "inception_3b/1x1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_3b/relu_1x1" - type: "ReLU" - bottom: "inception_3b/1x1" - top: "inception_3b/1x1" -} -layer { - name: "inception_3b/3x3_reduce" - type: "Convolution" - bottom: "inception_3a/output" - top: "inception_3b/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_3b/relu_3x3_reduce" - type: "ReLU" - bottom: "inception_3b/3x3_reduce" - top: "inception_3b/3x3_reduce" -} -layer { - name: "inception_3b/3x3" - type: "Convolution" - bottom: "inception_3b/3x3_reduce" - top: "inception_3b/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 192 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_3b/relu_3x3" - type: "ReLU" - bottom: "inception_3b/3x3" - top: "inception_3b/3x3" -} -layer { - name: "inception_3b/5x5_reduce" - type: "Convolution" - bottom: "inception_3a/output" - top: "inception_3b/5x5_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 32 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_3b/relu_5x5_reduce" - type: "ReLU" - bottom: "inception_3b/5x5_reduce" - top: "inception_3b/5x5_reduce" -} -layer { - name: "inception_3b/5x5" - type: "Convolution" - bottom: "inception_3b/5x5_reduce" - top: "inception_3b/5x5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 96 - pad: 2 - kernel_size: 5 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_3b/relu_5x5" - type: "ReLU" - bottom: "inception_3b/5x5" - top: "inception_3b/5x5" -} -layer { - name: "inception_3b/pool" - type: "Pooling" - bottom: "inception_3a/output" - top: "inception_3b/pool" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 1 - pad: 1 - } -} -layer { - name: "inception_3b/pool_proj" - type: "Convolution" - bottom: "inception_3b/pool" - top: "inception_3b/pool_proj" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_3b/relu_pool_proj" - type: "ReLU" - bottom: "inception_3b/pool_proj" - top: "inception_3b/pool_proj" -} -layer { - name: "inception_3b/output" - type: "Concat" - bottom: "inception_3b/1x1" - bottom: "inception_3b/3x3" - bottom: "inception_3b/5x5" - bottom: "inception_3b/pool_proj" - top: "inception_3b/output" -} -layer { - name: "pool3/3x3_s2" - type: "Pooling" - bottom: "inception_3b/output" - top: "pool3/3x3_s2" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "inception_4a/1x1" - type: "Convolution" - bottom: "pool3/3x3_s2" - top: 
"inception_4a/1x1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 192 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4a/relu_1x1" - type: "ReLU" - bottom: "inception_4a/1x1" - top: "inception_4a/1x1" -} -layer { - name: "inception_4a/3x3_reduce" - type: "Convolution" - bottom: "pool3/3x3_s2" - top: "inception_4a/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 96 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4a/relu_3x3_reduce" - type: "ReLU" - bottom: "inception_4a/3x3_reduce" - top: "inception_4a/3x3_reduce" -} -layer { - name: "inception_4a/3x3" - type: "Convolution" - bottom: "inception_4a/3x3_reduce" - top: "inception_4a/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 208 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4a/relu_3x3" - type: "ReLU" - bottom: "inception_4a/3x3" - top: "inception_4a/3x3" -} -layer { - name: "inception_4a/5x5_reduce" - type: "Convolution" - bottom: "pool3/3x3_s2" - top: "inception_4a/5x5_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 16 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4a/relu_5x5_reduce" - type: "ReLU" - bottom: "inception_4a/5x5_reduce" - top: "inception_4a/5x5_reduce" -} -layer { - name: "inception_4a/5x5" - type: "Convolution" - bottom: "inception_4a/5x5_reduce" - top: "inception_4a/5x5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 48 - pad: 2 - kernel_size: 5 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4a/relu_5x5" - type: "ReLU" - bottom: "inception_4a/5x5" - top: "inception_4a/5x5" -} -layer { - name: "inception_4a/pool" - type: "Pooling" - bottom: "pool3/3x3_s2" - top: "inception_4a/pool" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 1 - pad: 1 - } -} -layer { - name: "inception_4a/pool_proj" - type: "Convolution" - bottom: "inception_4a/pool" - top: "inception_4a/pool_proj" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4a/relu_pool_proj" - type: "ReLU" - bottom: "inception_4a/pool_proj" - top: "inception_4a/pool_proj" -} -layer { - name: "inception_4a/output" - type: "Concat" - bottom: "inception_4a/1x1" - bottom: "inception_4a/3x3" - bottom: "inception_4a/5x5" - bottom: "inception_4a/pool_proj" - top: "inception_4a/output" -} -layer { - name: "loss1/ave_pool" - type: "Pooling" - bottom: "inception_4a/output" - top: "loss1/ave_pool" - pooling_param { - pool: AVE - kernel_size: 5 - stride: 3 - } -} -layer { - name: "loss1/conv" - type: "Convolution" - bottom: "loss1/ave_pool" - top: "loss1/conv" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - 
lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "loss1/relu_conv" - type: "ReLU" - bottom: "loss1/conv" - top: "loss1/conv" -} -layer { - name: "loss1/fc" - type: "InnerProduct" - bottom: "loss1/conv" - top: "loss1/fc" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 1024 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "loss1/relu_fc" - type: "ReLU" - bottom: "loss1/fc" - top: "loss1/fc" -} -layer { - name: "loss1/drop_fc" - type: "Dropout" - bottom: "loss1/fc" - top: "loss1/fc" - dropout_param { - dropout_ratio: 0.7 - } -} -layer { - name: "loss1/classifier" - type: "InnerProduct" - bottom: "loss1/fc" - top: "loss1/classifier" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 1000 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "loss1/loss" - type: "SoftmaxWithLoss" - bottom: "loss1/classifier" - bottom: "label" - top: "loss1/loss1" - loss_weight: 0.3 -} -layer { - name: "loss1/top-1" - type: "Accuracy" - bottom: "loss1/classifier" - bottom: "label" - top: "loss1/top-1" - include { - phase: TEST - } -} -layer { - name: "loss1/top-5" - type: "Accuracy" - bottom: "loss1/classifier" - bottom: "label" - top: "loss1/top-5" - include { - phase: TEST - } - accuracy_param { - top_k: 5 - } -} -layer { - name: "inception_4b/1x1" - type: "Convolution" - bottom: "inception_4a/output" - top: "inception_4b/1x1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 160 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4b/relu_1x1" - type: "ReLU" - bottom: "inception_4b/1x1" - top: "inception_4b/1x1" -} -layer { - name: "inception_4b/3x3_reduce" - type: "Convolution" - bottom: "inception_4a/output" - top: "inception_4b/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 112 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4b/relu_3x3_reduce" - type: "ReLU" - bottom: "inception_4b/3x3_reduce" - top: "inception_4b/3x3_reduce" -} -layer { - name: "inception_4b/3x3" - type: "Convolution" - bottom: "inception_4b/3x3_reduce" - top: "inception_4b/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 224 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4b/relu_3x3" - type: "ReLU" - bottom: "inception_4b/3x3" - top: "inception_4b/3x3" -} -layer { - name: "inception_4b/5x5_reduce" - type: "Convolution" - bottom: "inception_4a/output" - top: "inception_4b/5x5_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 24 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4b/relu_5x5_reduce" - type: "ReLU" - bottom: 
"inception_4b/5x5_reduce" - top: "inception_4b/5x5_reduce" -} -layer { - name: "inception_4b/5x5" - type: "Convolution" - bottom: "inception_4b/5x5_reduce" - top: "inception_4b/5x5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - pad: 2 - kernel_size: 5 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4b/relu_5x5" - type: "ReLU" - bottom: "inception_4b/5x5" - top: "inception_4b/5x5" -} -layer { - name: "inception_4b/pool" - type: "Pooling" - bottom: "inception_4a/output" - top: "inception_4b/pool" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 1 - pad: 1 - } -} -layer { - name: "inception_4b/pool_proj" - type: "Convolution" - bottom: "inception_4b/pool" - top: "inception_4b/pool_proj" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4b/relu_pool_proj" - type: "ReLU" - bottom: "inception_4b/pool_proj" - top: "inception_4b/pool_proj" -} -layer { - name: "inception_4b/output" - type: "Concat" - bottom: "inception_4b/1x1" - bottom: "inception_4b/3x3" - bottom: "inception_4b/5x5" - bottom: "inception_4b/pool_proj" - top: "inception_4b/output" -} -layer { - name: "inception_4c/1x1" - type: "Convolution" - bottom: "inception_4b/output" - top: "inception_4c/1x1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4c/relu_1x1" - type: "ReLU" - bottom: "inception_4c/1x1" - top: "inception_4c/1x1" -} -layer { - name: "inception_4c/3x3_reduce" - type: "Convolution" - bottom: "inception_4b/output" - top: "inception_4c/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4c/relu_3x3_reduce" - type: "ReLU" - bottom: "inception_4c/3x3_reduce" - top: "inception_4c/3x3_reduce" -} -layer { - name: "inception_4c/3x3" - type: "Convolution" - bottom: "inception_4c/3x3_reduce" - top: "inception_4c/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4c/relu_3x3" - type: "ReLU" - bottom: "inception_4c/3x3" - top: "inception_4c/3x3" -} -layer { - name: "inception_4c/5x5_reduce" - type: "Convolution" - bottom: "inception_4b/output" - top: "inception_4c/5x5_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 24 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4c/relu_5x5_reduce" - type: "ReLU" - bottom: "inception_4c/5x5_reduce" - top: "inception_4c/5x5_reduce" -} -layer { - name: "inception_4c/5x5" - type: "Convolution" - bottom: "inception_4c/5x5_reduce" - top: "inception_4c/5x5" - param { - lr_mult: 1 - 
decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - pad: 2 - kernel_size: 5 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4c/relu_5x5" - type: "ReLU" - bottom: "inception_4c/5x5" - top: "inception_4c/5x5" -} -layer { - name: "inception_4c/pool" - type: "Pooling" - bottom: "inception_4b/output" - top: "inception_4c/pool" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 1 - pad: 1 - } -} -layer { - name: "inception_4c/pool_proj" - type: "Convolution" - bottom: "inception_4c/pool" - top: "inception_4c/pool_proj" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4c/relu_pool_proj" - type: "ReLU" - bottom: "inception_4c/pool_proj" - top: "inception_4c/pool_proj" -} -layer { - name: "inception_4c/output" - type: "Concat" - bottom: "inception_4c/1x1" - bottom: "inception_4c/3x3" - bottom: "inception_4c/5x5" - bottom: "inception_4c/pool_proj" - top: "inception_4c/output" -} -layer { - name: "inception_4d/1x1" - type: "Convolution" - bottom: "inception_4c/output" - top: "inception_4d/1x1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 112 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4d/relu_1x1" - type: "ReLU" - bottom: "inception_4d/1x1" - top: "inception_4d/1x1" -} -layer { - name: "inception_4d/3x3_reduce" - type: "Convolution" - bottom: "inception_4c/output" - top: "inception_4d/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 144 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4d/relu_3x3_reduce" - type: "ReLU" - bottom: "inception_4d/3x3_reduce" - top: "inception_4d/3x3_reduce" -} -layer { - name: "inception_4d/3x3" - type: "Convolution" - bottom: "inception_4d/3x3_reduce" - top: "inception_4d/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 288 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4d/relu_3x3" - type: "ReLU" - bottom: "inception_4d/3x3" - top: "inception_4d/3x3" -} -layer { - name: "inception_4d/5x5_reduce" - type: "Convolution" - bottom: "inception_4c/output" - top: "inception_4d/5x5_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 32 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4d/relu_5x5_reduce" - type: "ReLU" - bottom: "inception_4d/5x5_reduce" - top: "inception_4d/5x5_reduce" -} -layer { - name: "inception_4d/5x5" - type: "Convolution" - bottom: "inception_4d/5x5_reduce" - top: "inception_4d/5x5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - pad: 2 - kernel_size: 5 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 
0.1 - } - } -} -layer { - name: "inception_4d/relu_5x5" - type: "ReLU" - bottom: "inception_4d/5x5" - top: "inception_4d/5x5" -} -layer { - name: "inception_4d/pool" - type: "Pooling" - bottom: "inception_4c/output" - top: "inception_4d/pool" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 1 - pad: 1 - } -} -layer { - name: "inception_4d/pool_proj" - type: "Convolution" - bottom: "inception_4d/pool" - top: "inception_4d/pool_proj" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4d/relu_pool_proj" - type: "ReLU" - bottom: "inception_4d/pool_proj" - top: "inception_4d/pool_proj" -} -layer { - name: "inception_4d/output" - type: "Concat" - bottom: "inception_4d/1x1" - bottom: "inception_4d/3x3" - bottom: "inception_4d/5x5" - bottom: "inception_4d/pool_proj" - top: "inception_4d/output" -} -layer { - name: "loss2/ave_pool" - type: "Pooling" - bottom: "inception_4d/output" - top: "loss2/ave_pool" - pooling_param { - pool: AVE - kernel_size: 5 - stride: 3 - } -} -layer { - name: "loss2/conv" - type: "Convolution" - bottom: "loss2/ave_pool" - top: "loss2/conv" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "loss2/relu_conv" - type: "ReLU" - bottom: "loss2/conv" - top: "loss2/conv" -} -layer { - name: "loss2/fc" - type: "InnerProduct" - bottom: "loss2/conv" - top: "loss2/fc" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 1024 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "loss2/relu_fc" - type: "ReLU" - bottom: "loss2/fc" - top: "loss2/fc" -} -layer { - name: "loss2/drop_fc" - type: "Dropout" - bottom: "loss2/fc" - top: "loss2/fc" - dropout_param { - dropout_ratio: 0.7 - } -} -layer { - name: "loss2/classifier" - type: "InnerProduct" - bottom: "loss2/fc" - top: "loss2/classifier" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 1000 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "loss2/loss" - type: "SoftmaxWithLoss" - bottom: "loss2/classifier" - bottom: "label" - top: "loss2/loss1" - loss_weight: 0.3 -} -layer { - name: "loss2/top-1" - type: "Accuracy" - bottom: "loss2/classifier" - bottom: "label" - top: "loss2/top-1" - include { - phase: TEST - } -} -layer { - name: "loss2/top-5" - type: "Accuracy" - bottom: "loss2/classifier" - bottom: "label" - top: "loss2/top-5" - include { - phase: TEST - } - accuracy_param { - top_k: 5 - } -} -layer { - name: "inception_4e/1x1" - type: "Convolution" - bottom: "inception_4d/output" - top: "inception_4e/1x1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4e/relu_1x1" - type: "ReLU" - bottom: "inception_4e/1x1" - top: "inception_4e/1x1" -} -layer { - name: "inception_4e/3x3_reduce" - type: "Convolution" - bottom: 
"inception_4d/output" - top: "inception_4e/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 160 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4e/relu_3x3_reduce" - type: "ReLU" - bottom: "inception_4e/3x3_reduce" - top: "inception_4e/3x3_reduce" -} -layer { - name: "inception_4e/3x3" - type: "Convolution" - bottom: "inception_4e/3x3_reduce" - top: "inception_4e/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 320 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4e/relu_3x3" - type: "ReLU" - bottom: "inception_4e/3x3" - top: "inception_4e/3x3" -} -layer { - name: "inception_4e/5x5_reduce" - type: "Convolution" - bottom: "inception_4d/output" - top: "inception_4e/5x5_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 32 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4e/relu_5x5_reduce" - type: "ReLU" - bottom: "inception_4e/5x5_reduce" - top: "inception_4e/5x5_reduce" -} -layer { - name: "inception_4e/5x5" - type: "Convolution" - bottom: "inception_4e/5x5_reduce" - top: "inception_4e/5x5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - pad: 2 - kernel_size: 5 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4e/relu_5x5" - type: "ReLU" - bottom: "inception_4e/5x5" - top: "inception_4e/5x5" -} -layer { - name: "inception_4e/pool" - type: "Pooling" - bottom: "inception_4d/output" - top: "inception_4e/pool" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 1 - pad: 1 - } -} -layer { - name: "inception_4e/pool_proj" - type: "Convolution" - bottom: "inception_4e/pool" - top: "inception_4e/pool_proj" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4e/relu_pool_proj" - type: "ReLU" - bottom: "inception_4e/pool_proj" - top: "inception_4e/pool_proj" -} -layer { - name: "inception_4e/output" - type: "Concat" - bottom: "inception_4e/1x1" - bottom: "inception_4e/3x3" - bottom: "inception_4e/5x5" - bottom: "inception_4e/pool_proj" - top: "inception_4e/output" -} -layer { - name: "pool4/3x3_s2" - type: "Pooling" - bottom: "inception_4e/output" - top: "pool4/3x3_s2" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "inception_5a/1x1" - type: "Convolution" - bottom: "pool4/3x3_s2" - top: "inception_5a/1x1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_5a/relu_1x1" - type: "ReLU" - bottom: "inception_5a/1x1" - top: "inception_5a/1x1" -} -layer { - name: "inception_5a/3x3_reduce" - type: "Convolution" - bottom: "pool4/3x3_s2" - top: 
"inception_5a/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 160 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_5a/relu_3x3_reduce" - type: "ReLU" - bottom: "inception_5a/3x3_reduce" - top: "inception_5a/3x3_reduce" -} -layer { - name: "inception_5a/3x3" - type: "Convolution" - bottom: "inception_5a/3x3_reduce" - top: "inception_5a/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 320 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_5a/relu_3x3" - type: "ReLU" - bottom: "inception_5a/3x3" - top: "inception_5a/3x3" -} -layer { - name: "inception_5a/5x5_reduce" - type: "Convolution" - bottom: "pool4/3x3_s2" - top: "inception_5a/5x5_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 32 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_5a/relu_5x5_reduce" - type: "ReLU" - bottom: "inception_5a/5x5_reduce" - top: "inception_5a/5x5_reduce" -} -layer { - name: "inception_5a/5x5" - type: "Convolution" - bottom: "inception_5a/5x5_reduce" - top: "inception_5a/5x5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - pad: 2 - kernel_size: 5 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_5a/relu_5x5" - type: "ReLU" - bottom: "inception_5a/5x5" - top: "inception_5a/5x5" -} -layer { - name: "inception_5a/pool" - type: "Pooling" - bottom: "pool4/3x3_s2" - top: "inception_5a/pool" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 1 - pad: 1 - } -} -layer { - name: "inception_5a/pool_proj" - type: "Convolution" - bottom: "inception_5a/pool" - top: "inception_5a/pool_proj" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_5a/relu_pool_proj" - type: "ReLU" - bottom: "inception_5a/pool_proj" - top: "inception_5a/pool_proj" -} -layer { - name: "inception_5a/output" - type: "Concat" - bottom: "inception_5a/1x1" - bottom: "inception_5a/3x3" - bottom: "inception_5a/5x5" - bottom: "inception_5a/pool_proj" - top: "inception_5a/output" -} -layer { - name: "inception_5b/1x1" - type: "Convolution" - bottom: "inception_5a/output" - top: "inception_5b/1x1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 384 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_5b/relu_1x1" - type: "ReLU" - bottom: "inception_5b/1x1" - top: "inception_5b/1x1" -} -layer { - name: "inception_5b/3x3_reduce" - type: "Convolution" - bottom: "inception_5a/output" - top: "inception_5b/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 192 - kernel_size: 1 - weight_filler { - type: "xavier" - } - 
bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_5b/relu_3x3_reduce" - type: "ReLU" - bottom: "inception_5b/3x3_reduce" - top: "inception_5b/3x3_reduce" -} -layer { - name: "inception_5b/3x3" - type: "Convolution" - bottom: "inception_5b/3x3_reduce" - top: "inception_5b/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 384 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_5b/relu_3x3" - type: "ReLU" - bottom: "inception_5b/3x3" - top: "inception_5b/3x3" -} -layer { - name: "inception_5b/5x5_reduce" - type: "Convolution" - bottom: "inception_5a/output" - top: "inception_5b/5x5_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 48 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_5b/relu_5x5_reduce" - type: "ReLU" - bottom: "inception_5b/5x5_reduce" - top: "inception_5b/5x5_reduce" -} -layer { - name: "inception_5b/5x5" - type: "Convolution" - bottom: "inception_5b/5x5_reduce" - top: "inception_5b/5x5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - pad: 2 - kernel_size: 5 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_5b/relu_5x5" - type: "ReLU" - bottom: "inception_5b/5x5" - top: "inception_5b/5x5" -} -layer { - name: "inception_5b/pool" - type: "Pooling" - bottom: "inception_5a/output" - top: "inception_5b/pool" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 1 - pad: 1 - } -} -layer { - name: "inception_5b/pool_proj" - type: "Convolution" - bottom: "inception_5b/pool" - top: "inception_5b/pool_proj" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_5b/relu_pool_proj" - type: "ReLU" - bottom: "inception_5b/pool_proj" - top: "inception_5b/pool_proj" -} -layer { - name: "inception_5b/output" - type: "Concat" - bottom: "inception_5b/1x1" - bottom: "inception_5b/3x3" - bottom: "inception_5b/5x5" - bottom: "inception_5b/pool_proj" - top: "inception_5b/output" -} -layer { - name: "pool5/7x7_s1" - type: "Pooling" - bottom: "inception_5b/output" - top: "pool5/7x7_s1" - pooling_param { - pool: AVE - kernel_size: 7 - stride: 1 - } -} -layer { - name: "pool5/drop_7x7_s1" - type: "Dropout" - bottom: "pool5/7x7_s1" - top: "pool5/7x7_s1" - dropout_param { - dropout_ratio: 0.4 - } -} -layer { - name: "loss3/classifier" - type: "InnerProduct" - bottom: "pool5/7x7_s1" - top: "loss3/classifier" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 1000 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "loss3/loss3" - type: "SoftmaxWithLoss" - bottom: "loss3/classifier" - bottom: "label" - top: "loss3/loss3" - loss_weight: 1 -} -layer { - name: "loss3/top-1" - type: "Accuracy" - bottom: "loss3/classifier" - bottom: "label" - top: "loss3/top-1" - include { - phase: TEST - } -} -layer { - name: "loss3/top-5" - type: 
"Accuracy" - bottom: "loss3/classifier" - bottom: "label" - top: "loss3/top-5" - include { - phase: TEST - } - accuracy_param { - top_k: 5 - } -} diff --git a/models/intel_optimized_models/multinode/googlenet_v2_4nodes/solver.prototxt b/models/intel_optimized_models/multinode/googlenet_v2_4nodes/solver.prototxt deleted file mode 100644 index dda5240f3..000000000 --- a/models/intel_optimized_models/multinode/googlenet_v2_4nodes/solver.prototxt +++ /dev/null @@ -1,24 +0,0 @@ -#This is Intel(R) optimized (in terms of time to train) version of solver for model GoogLeNet v2. -#Original solver.prototxt can be found in /models/default_resnet_50/ directory of this repository. -#Differences: -#- lr_policy is set to poly instead of step -#- base_lr is set to 0.05 -#- max_iter is decreased to 100000 -# -#Top-5 and Top-1 results achieved with this version of solver: -#Top-5: 89.40% -#Top-1: 69.02% -#Training was performed using server equipped with Intel(R) Xeon Phi(TM) CPU 7250 processor. - -net: "models/intel_optimized_models/multinode/googlenet_v2_4nodes/train_val.prototxt" -base_lr: 0.05 -display: 40 -max_iter: 100000 -lr_policy: "poly" -power: 0.5 -momentum: 0.9 -weight_decay: 0.0002 -snapshot: 10000 -snapshot_prefix: "models/intel_optimized_models/multinode/googlenet_v2_4nodes/default_googlenet_v2" -solver_mode: CPU -average_loss: 40 \ No newline at end of file diff --git a/models/intel_optimized_models/multinode/googlenet_v2_8nodes/solver.prototxt b/models/intel_optimized_models/multinode/googlenet_v2_8nodes/solver.prototxt new file mode 100644 index 000000000..a39aedfe5 --- /dev/null +++ b/models/intel_optimized_models/multinode/googlenet_v2_8nodes/solver.prototxt @@ -0,0 +1,15 @@ +net: "models/intel_optimized_models/multinode/googlenet_v2_8nodes/train_val.prototxt" +test_iter: 1000 +test_interval: 10000 +test_initialization: false +base_lr: 0.06 +display: 40 +max_iter: 182000 +lr_policy: "poly" +power: 0.5 +momentum: 0.9 +weight_decay: 0.0002 +snapshot: 10000 +snapshot_prefix: "models/intel_optimized_models/multinode/googlenet_v2_8nodes/default_googlenet_v2" +solver_mode: CPU +average_loss: 40 diff --git a/models/intel_optimized_models/multinode/googlenet_v2_4nodes/train_val.prototxt b/models/intel_optimized_models/multinode/googlenet_v2_8nodes/train_val.prototxt similarity index 100% rename from models/intel_optimized_models/multinode/googlenet_v2_4nodes/train_val.prototxt rename to models/intel_optimized_models/multinode/googlenet_v2_8nodes/train_val.prototxt diff --git a/models/intel_optimized_models/multinode/resnet_50_16_nodes/solver.prototxt b/models/intel_optimized_models/multinode/resnet_50_16_nodes/solver.prototxt deleted file mode 100644 index a66f60dfa..000000000 --- a/models/intel_optimized_models/multinode/resnet_50_16_nodes/solver.prototxt +++ /dev/null @@ -1,15 +0,0 @@ -#This solver is described by Computer Vision Group Jena (CVGJ) in [ImageNet pre-trained models with batch normalization] (https://arxiv.org/pdf/1612.01452.pdf) -net: "models/intel_optimized_models/multinode/resnet_50_16_nodes/train_val.prototxt" -#test_iter: 5000 -#test_interval: 15000 -#test_initialization: false -base_lr: 0.1 -display: 20 -max_iter: 320000 -lr_policy: "poly" -power: 1 -momentum: 0.9 -weight_decay: 0.0001 -snapshot: 30000 -snapshot_prefix: "caffe-resnet50" -solver_mode: CPU diff --git a/models/intel_optimized_models/multinode/resnet_50_16_nodes/train_val.prototxt b/models/intel_optimized_models/multinode/resnet_50_16_nodes/train_val.prototxt deleted file mode 100644 index 71b07d00a..000000000 
--- a/models/intel_optimized_models/multinode/resnet_50_16_nodes/train_val.prototxt
+++ /dev/null
@@ -1,2306 +0,0 @@
-#This is Intel(R) optimized (in terms of time to train) version of topology described in the [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) publication.
-#
-#Top-5 and Top-1 results achieved with this topology:
-#Top-5: 92%
-#Top-1: 73.9%
-#Training was performed using server equipped with Intel(R) Xeon Phi(TM) CPU 7250 processor.
-
-layer {
-name: "data"
-type: "Data"
-top: "data"
-top: "label"
-include {
- phase: TRAIN
-}
-transform_param {
- scale: 0.0078125
- mirror: true
- crop_size: 224
- mean_value: 104
- mean_value: 117
- mean_value: 123
-}
- data_param {
- source: "examples/imagenet/ilsvrc12_train_lmdb"
- batch_size: 16
- backend: LMDB
- shuffle: true
- }
-
-}
-layer {
-name: "data"
-type: "Data"
-top: "data"
-top: "label"
-include {
- phase: TEST
-}
-transform_param {
- scale: 0.0078125
- mirror: false
- crop_size: 224
- mean_value: 104
- mean_value: 117
- mean_value: 123
-}
- data_param {
- source: "examples/imagenet/ilsvrc12_val_lmdb"
- batch_size: 10
- backend: LMDB
- }
-
-}
-
-layer {
-name: "conv1"
-type: "Convolution"
-bottom: "data"
-top: "conv1"
-param {
- lr_mult: 1.0
- decay_mult: 1.0
-}
-param {
- lr_mult: 2.0
- decay_mult: 1.0
-}
-convolution_param {
- num_output: 64
- pad: 3
- kernel_size: 7
- stride: 2
- weight_filler {
- type: "msra"
- variance_norm: FAN_OUT
- }
- bias_filler {
- type: "constant"
- value: 0.0
- }
-}
-
-}
-layer {
-name: "conv1_bn"
-type: "BatchNorm"
-bottom: "conv1"
-top: "conv1_pcs_arm_sim"
- batch_norm_param {
- }
-}
-layer {
-name: "conv1_relu"
-type: "ReLU"
-bottom: "conv1_pcs_arm_sim"
-top: "conv1_pcs_arm_sim"
-
-}
-layer {
-name: "conv1_pool"
-type: "Pooling"
-bottom: "conv1_pcs_arm_sim"
-top: "conv1_pool"
-pooling_param {
- kernel_size: 3
- stride: 2
-}
-
-}
-layer {
-name: "layer_64_1_conv1"
-type: "Convolution"
-bottom: "conv1_pool"
-top: "layer_64_1_conv1"
-param {
- lr_mult: 1.0
- decay_mult: 1.0
-}
-convolution_param {
- num_output: 64
- bias_term: false
- pad: 0
- kernel_size: 1
- stride: 1
- weight_filler {
- type: "msra"
- }
- bias_filler {
- type: "constant"
- value: 0.0
- }
-}
-
-}
-layer {
-name: "layer_64_1_bn2"
-type: "BatchNorm"
-bottom: "layer_64_1_conv1"
-top: "layer_64_1_conv1_pcs_arm_sim"
- batch_norm_param {
- }
-}
-layer {
-name: "layer_64_1_relu2"
-type: "ReLU"
-bottom: "layer_64_1_conv1_pcs_arm_sim"
-top: "layer_64_1_conv1_pcs_arm_sim"
-
-}
-layer {
-name: "layer_64_1_conv2"
-type: "Convolution"
-bottom: "layer_64_1_conv1_pcs_arm_sim"
-top: "layer_64_1_conv2"
-param {
- lr_mult: 1.0
- decay_mult: 1.0
-}
-convolution_param {
- num_output: 64
- bias_term: false
- pad: 1
- kernel_size: 3
- stride: 1
- weight_filler {
- type: "msra"
- }
- bias_filler {
- type: "constant"
- value: 0.0
- }
-}
-
-}
-layer {
-name: "layer_64_1_bn3"
-type: "BatchNorm"
-bottom: "layer_64_1_conv2"
-top: "layer_64_1_conv2_pcs_arm_sim"
- batch_norm_param {
- }
-}
-layer {
-name: "layer_64_1_relu3"
-type: "ReLU"
-bottom: "layer_64_1_conv2_pcs_arm_sim"
-top: "layer_64_1_conv2_pcs_arm_sim"
-
-}
-layer {
-name: "layer_64_1_conv3"
-type: "Convolution"
-bottom: "layer_64_1_conv2_pcs_arm_sim"
-top: "layer_64_1_conv3"
-param {
- lr_mult: 1.0
- decay_mult: 1.0
-}
-convolution_param {
- num_output: 256
- bias_term: false
- pad: 0
- kernel_size: 1
- stride: 1
- weight_filler {
- type: "msra"
- }
- bias_filler {
- type: "constant"
- value: 0.0
- }
-}
-
-}
-layer {
-name: "layer_64_1_conv_expand"
-type:
"Convolution" -bottom: "layer_64_1_conv1_pcs_arm_sim" -top: "layer_64_1_conv_expand" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_64_1_sum" -type: "Eltwise" -bottom: "layer_64_1_conv3" -bottom: "layer_64_1_conv_expand" -top: "layer_64_1_sum" - -} -layer { -name: "layer_64_2_bn1" -type: "BatchNorm" -bottom: "layer_64_1_sum" -top: "layer_64_2_bn1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_64_2_relu1" -type: "ReLU" -bottom: "layer_64_2_bn1_pcs_arm_sim" -top: "layer_64_2_bn1_pcs_arm_sim" - -} -layer { -name: "layer_64_2_conv1" -type: "Convolution" -bottom: "layer_64_2_bn1_pcs_arm_sim" -top: "layer_64_2_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 64 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_64_2_bn2" -type: "BatchNorm" -bottom: "layer_64_2_conv1" -top: "layer_64_2_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_64_2_relu2" -type: "ReLU" -bottom: "layer_64_2_conv1_pcs_arm_sim" -top: "layer_64_2_conv1_pcs_arm_sim" - -} -layer { -name: "layer_64_2_conv2" -type: "Convolution" -bottom: "layer_64_2_conv1_pcs_arm_sim" -top: "layer_64_2_conv2" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 64 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_64_2_bn3" -type: "BatchNorm" -bottom: "layer_64_2_conv2" -top: "layer_64_2_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_64_2_relu3" -type: "ReLU" -bottom: "layer_64_2_conv2_pcs_arm_sim" -top: "layer_64_2_conv2_pcs_arm_sim" - -} -layer { -name: "layer_64_2_conv3" -type: "Convolution" -bottom: "layer_64_2_conv2_pcs_arm_sim" -top: "layer_64_2_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_64_2_sum" -type: "Eltwise" -bottom: "layer_64_2_conv3" -bottom: "layer_64_1_sum" -top: "layer_64_2_sum" - -} -layer { -name: "layer_64_3_bn1" -type: "BatchNorm" -bottom: "layer_64_2_sum" -top: "layer_64_3_bn1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_64_3_relu1" -type: "ReLU" -bottom: "layer_64_3_bn1_pcs_arm_sim" -top: "layer_64_3_bn1_pcs_arm_sim" - -} -layer { -name: "layer_64_3_conv1" -type: "Convolution" -bottom: "layer_64_3_bn1_pcs_arm_sim" -top: "layer_64_3_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 64 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_64_3_bn2" -type: "BatchNorm" -bottom: "layer_64_3_conv1" -top: "layer_64_3_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_64_3_relu2" -type: "ReLU" -bottom: "layer_64_3_conv1_pcs_arm_sim" -top: "layer_64_3_conv1_pcs_arm_sim" - -} -layer { -name: "layer_64_3_conv2" -type: "Convolution" -bottom: "layer_64_3_conv1_pcs_arm_sim" -top: "layer_64_3_conv2" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} 
-convolution_param { - num_output: 64 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_64_3_bn3" -type: "BatchNorm" -bottom: "layer_64_3_conv2" -top: "layer_64_3_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_64_3_relu3" -type: "ReLU" -bottom: "layer_64_3_conv2_pcs_arm_sim" -top: "layer_64_3_conv2_pcs_arm_sim" - -} -layer { -name: "layer_64_3_conv3" -type: "Convolution" -bottom: "layer_64_3_conv2_pcs_arm_sim" -top: "layer_64_3_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_64_3_sum" -type: "Eltwise" -bottom: "layer_64_3_conv3" -bottom: "layer_64_2_sum" -top: "layer_64_3_sum" - -} -layer { -name: "layer_128_1_bn1" -type: "BatchNorm" -bottom: "layer_64_3_sum" -top: "layer_128_1_bn1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_128_1_relu1" -type: "ReLU" -bottom: "layer_128_1_bn1_pcs_arm_sim" -top: "layer_128_1_bn1_pcs_arm_sim" - -} -layer { -name: "layer_128_1_conv1" -type: "Convolution" -bottom: "layer_128_1_bn1_pcs_arm_sim" -top: "layer_128_1_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 128 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_128_1_bn2" -type: "BatchNorm" -bottom: "layer_128_1_conv1" -top: "layer_128_1_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_128_1_relu2" -type: "ReLU" -bottom: "layer_128_1_conv1_pcs_arm_sim" -top: "layer_128_1_conv1_pcs_arm_sim" - -} -layer { -name: "layer_128_1_conv2" -type: "Convolution" -bottom: "layer_128_1_conv1_pcs_arm_sim" -top: "layer_128_1_conv2" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 128 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 2 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_128_1_bn3" -type: "BatchNorm" -bottom: "layer_128_1_conv2" -top: "layer_128_1_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_128_1_relu3" -type: "ReLU" -bottom: "layer_128_1_conv2_pcs_arm_sim" -top: "layer_128_1_conv2_pcs_arm_sim" - -} -layer { -name: "layer_128_1_conv3" -type: "Convolution" -bottom: "layer_128_1_conv2_pcs_arm_sim" -top: "layer_128_1_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 512 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_128_1_conv_expand" -type: "Convolution" -bottom: "layer_128_1_bn1_pcs_arm_sim" -top: "layer_128_1_conv_expand" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 512 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 2 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_128_1_sum" -type: "Eltwise" -bottom: "layer_128_1_conv3" -bottom: "layer_128_1_conv_expand" -top: "layer_128_1_sum" - -} -layer { -name: "layer_128_2_bn1" -type: "BatchNorm" -bottom: "layer_128_1_sum" -top: "layer_128_2_bn1_pcs_arm_sim" - batch_norm_param { - } -} -layer { 
-name: "layer_128_2_relu1" -type: "ReLU" -bottom: "layer_128_2_bn1_pcs_arm_sim" -top: "layer_128_2_bn1_pcs_arm_sim" - -} -layer { -name: "layer_128_2_conv1" -type: "Convolution" -bottom: "layer_128_2_bn1_pcs_arm_sim" -top: "layer_128_2_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 128 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_128_2_bn2" -type: "BatchNorm" -bottom: "layer_128_2_conv1" -top: "layer_128_2_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_128_2_relu2" -type: "ReLU" -bottom: "layer_128_2_conv1_pcs_arm_sim" -top: "layer_128_2_conv1_pcs_arm_sim" - -} -layer { -name: "layer_128_2_conv2" -type: "Convolution" -bottom: "layer_128_2_conv1_pcs_arm_sim" -top: "layer_128_2_conv2" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 128 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_128_2_bn3" -type: "BatchNorm" -bottom: "layer_128_2_conv2" -top: "layer_128_2_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_128_2_relu3" -type: "ReLU" -bottom: "layer_128_2_conv2_pcs_arm_sim" -top: "layer_128_2_conv2_pcs_arm_sim" - -} -layer { -name: "layer_128_2_conv3" -type: "Convolution" -bottom: "layer_128_2_conv2_pcs_arm_sim" -top: "layer_128_2_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 512 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_128_2_sum" -type: "Eltwise" -bottom: "layer_128_2_conv3" -bottom: "layer_128_1_sum" -top: "layer_128_2_sum" - -} -layer { -name: "layer_128_3_bn1" -type: "BatchNorm" -bottom: "layer_128_2_sum" -top: "layer_128_3_bn1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_128_3_relu1" -type: "ReLU" -bottom: "layer_128_3_bn1_pcs_arm_sim" -top: "layer_128_3_bn1_pcs_arm_sim" - -} -layer { -name: "layer_128_3_conv1" -type: "Convolution" -bottom: "layer_128_3_bn1_pcs_arm_sim" -top: "layer_128_3_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 128 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_128_3_bn2" -type: "BatchNorm" -bottom: "layer_128_3_conv1" -top: "layer_128_3_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_128_3_relu2" -type: "ReLU" -bottom: "layer_128_3_conv1_pcs_arm_sim" -top: "layer_128_3_conv1_pcs_arm_sim" - -} -layer { -name: "layer_128_3_conv2" -type: "Convolution" -bottom: "layer_128_3_conv1_pcs_arm_sim" -top: "layer_128_3_conv2" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 128 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_128_3_bn3" -type: "BatchNorm" -bottom: "layer_128_3_conv2" -top: "layer_128_3_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_128_3_relu3" -type: "ReLU" -bottom: "layer_128_3_conv2_pcs_arm_sim" -top: "layer_128_3_conv2_pcs_arm_sim" - -} -layer { -name: "layer_128_3_conv3" -type: "Convolution" -bottom: 
"layer_128_3_conv2_pcs_arm_sim" -top: "layer_128_3_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 512 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_128_3_sum" -type: "Eltwise" -bottom: "layer_128_3_conv3" -bottom: "layer_128_2_sum" -top: "layer_128_3_sum" - -} -layer { -name: "layer_128_4_bn1" -type: "BatchNorm" -bottom: "layer_128_3_sum" -top: "layer_128_4_bn1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_128_4_relu1" -type: "ReLU" -bottom: "layer_128_4_bn1_pcs_arm_sim" -top: "layer_128_4_bn1_pcs_arm_sim" - -} -layer { -name: "layer_128_4_conv1" -type: "Convolution" -bottom: "layer_128_4_bn1_pcs_arm_sim" -top: "layer_128_4_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 128 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_128_4_bn2" -type: "BatchNorm" -bottom: "layer_128_4_conv1" -top: "layer_128_4_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_128_4_relu2" -type: "ReLU" -bottom: "layer_128_4_conv1_pcs_arm_sim" -top: "layer_128_4_conv1_pcs_arm_sim" - -} -layer { -name: "layer_128_4_conv2" -type: "Convolution" -bottom: "layer_128_4_conv1_pcs_arm_sim" -top: "layer_128_4_conv2" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 128 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_128_4_bn3" -type: "BatchNorm" -bottom: "layer_128_4_conv2" -top: "layer_128_4_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_128_4_relu3" -type: "ReLU" -bottom: "layer_128_4_conv2_pcs_arm_sim" -top: "layer_128_4_conv2_pcs_arm_sim" - -} -layer { -name: "layer_128_4_conv3" -type: "Convolution" -bottom: "layer_128_4_conv2_pcs_arm_sim" -top: "layer_128_4_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 512 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_128_4_sum" -type: "Eltwise" -bottom: "layer_128_4_conv3" -bottom: "layer_128_3_sum" -top: "layer_128_4_sum" - -} -layer { -name: "layer_256_1_bn1" -type: "BatchNorm" -bottom: "layer_128_4_sum" -top: "layer_256_1_bn1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_1_relu1" -type: "ReLU" -bottom: "layer_256_1_bn1_pcs_arm_sim" -top: "layer_256_1_bn1_pcs_arm_sim" - -} -layer { -name: "layer_256_1_conv1" -type: "Convolution" -bottom: "layer_256_1_bn1_pcs_arm_sim" -top: "layer_256_1_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_1_bn2" -type: "BatchNorm" -bottom: "layer_256_1_conv1" -top: "layer_256_1_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_1_relu2" -type: "ReLU" -bottom: "layer_256_1_conv1_pcs_arm_sim" -top: "layer_256_1_conv1_pcs_arm_sim" - -} -layer { -name: "layer_256_1_conv2" -type: "Convolution" -bottom: "layer_256_1_conv1_pcs_arm_sim" -top: "layer_256_1_conv2" -param { - lr_mult: 1.0 
- decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 2 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_1_bn3" -type: "BatchNorm" -bottom: "layer_256_1_conv2" -top: "layer_256_1_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_1_relu3" -type: "ReLU" -bottom: "layer_256_1_conv2_pcs_arm_sim" -top: "layer_256_1_conv2_pcs_arm_sim" - -} -layer { -name: "layer_256_1_conv3" -type: "Convolution" -bottom: "layer_256_1_conv2_pcs_arm_sim" -top: "layer_256_1_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 1024 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_1_conv_expand" -type: "Convolution" -bottom: "layer_256_1_bn1_pcs_arm_sim" -top: "layer_256_1_conv_expand" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 1024 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 2 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_1_sum" -type: "Eltwise" -bottom: "layer_256_1_conv3" -bottom: "layer_256_1_conv_expand" -top: "layer_256_1_sum" - -} -layer { -name: "layer_256_2_bn1" -type: "BatchNorm" -bottom: "layer_256_1_sum" -top: "layer_256_2_bn1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_2_relu1" -type: "ReLU" -bottom: "layer_256_2_bn1_pcs_arm_sim" -top: "layer_256_2_bn1_pcs_arm_sim" - -} -layer { -name: "layer_256_2_conv1" -type: "Convolution" -bottom: "layer_256_2_bn1_pcs_arm_sim" -top: "layer_256_2_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_2_bn2" -type: "BatchNorm" -bottom: "layer_256_2_conv1" -top: "layer_256_2_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_2_relu2" -type: "ReLU" -bottom: "layer_256_2_conv1_pcs_arm_sim" -top: "layer_256_2_conv1_pcs_arm_sim" - -} -layer { -name: "layer_256_2_conv2" -type: "Convolution" -bottom: "layer_256_2_conv1_pcs_arm_sim" -top: "layer_256_2_conv2" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_2_bn3" -type: "BatchNorm" -bottom: "layer_256_2_conv2" -top: "layer_256_2_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_2_relu3" -type: "ReLU" -bottom: "layer_256_2_conv2_pcs_arm_sim" -top: "layer_256_2_conv2_pcs_arm_sim" - -} -layer { -name: "layer_256_2_conv3" -type: "Convolution" -bottom: "layer_256_2_conv2_pcs_arm_sim" -top: "layer_256_2_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 1024 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_2_sum" -type: "Eltwise" -bottom: "layer_256_2_conv3" -bottom: "layer_256_1_sum" -top: "layer_256_2_sum" - -} -layer { -name: "layer_256_3_bn1" -type: "BatchNorm" -bottom: "layer_256_2_sum" -top: "layer_256_3_bn1_pcs_arm_sim" - 
batch_norm_param { - } -} -layer { -name: "layer_256_3_relu1" -type: "ReLU" -bottom: "layer_256_3_bn1_pcs_arm_sim" -top: "layer_256_3_bn1_pcs_arm_sim" - -} -layer { -name: "layer_256_3_conv1" -type: "Convolution" -bottom: "layer_256_3_bn1_pcs_arm_sim" -top: "layer_256_3_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_3_bn2" -type: "BatchNorm" -bottom: "layer_256_3_conv1" -top: "layer_256_3_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_3_relu2" -type: "ReLU" -bottom: "layer_256_3_conv1_pcs_arm_sim" -top: "layer_256_3_conv1_pcs_arm_sim" - -} -layer { -name: "layer_256_3_conv2" -type: "Convolution" -bottom: "layer_256_3_conv1_pcs_arm_sim" -top: "layer_256_3_conv2" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_3_bn3" -type: "BatchNorm" -bottom: "layer_256_3_conv2" -top: "layer_256_3_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_3_relu3" -type: "ReLU" -bottom: "layer_256_3_conv2_pcs_arm_sim" -top: "layer_256_3_conv2_pcs_arm_sim" - -} -layer { -name: "layer_256_3_conv3" -type: "Convolution" -bottom: "layer_256_3_conv2_pcs_arm_sim" -top: "layer_256_3_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 1024 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_3_sum" -type: "Eltwise" -bottom: "layer_256_3_conv3" -bottom: "layer_256_2_sum" -top: "layer_256_3_sum" - -} -layer { -name: "layer_256_4_bn1" -type: "BatchNorm" -bottom: "layer_256_3_sum" -top: "layer_256_4_bn1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_4_relu1" -type: "ReLU" -bottom: "layer_256_4_bn1_pcs_arm_sim" -top: "layer_256_4_bn1_pcs_arm_sim" - -} -layer { -name: "layer_256_4_conv1" -type: "Convolution" -bottom: "layer_256_4_bn1_pcs_arm_sim" -top: "layer_256_4_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_4_bn2" -type: "BatchNorm" -bottom: "layer_256_4_conv1" -top: "layer_256_4_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_4_relu2" -type: "ReLU" -bottom: "layer_256_4_conv1_pcs_arm_sim" -top: "layer_256_4_conv1_pcs_arm_sim" - -} -layer { -name: "layer_256_4_conv2" -type: "Convolution" -bottom: "layer_256_4_conv1_pcs_arm_sim" -top: "layer_256_4_conv2" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_4_bn3" -type: "BatchNorm" -bottom: "layer_256_4_conv2" -top: "layer_256_4_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_4_relu3" -type: "ReLU" -bottom: "layer_256_4_conv2_pcs_arm_sim" -top: "layer_256_4_conv2_pcs_arm_sim" - -} -layer { -name: "layer_256_4_conv3" -type: 
"Convolution" -bottom: "layer_256_4_conv2_pcs_arm_sim" -top: "layer_256_4_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 1024 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_4_sum" -type: "Eltwise" -bottom: "layer_256_4_conv3" -bottom: "layer_256_3_sum" -top: "layer_256_4_sum" - -} -layer { -name: "layer_256_5_bn1" -type: "BatchNorm" -bottom: "layer_256_4_sum" -top: "layer_256_5_bn1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_5_relu1" -type: "ReLU" -bottom: "layer_256_5_bn1_pcs_arm_sim" -top: "layer_256_5_bn1_pcs_arm_sim" - -} -layer { -name: "layer_256_5_conv1" -type: "Convolution" -bottom: "layer_256_5_bn1_pcs_arm_sim" -top: "layer_256_5_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_5_bn2" -type: "BatchNorm" -bottom: "layer_256_5_conv1" -top: "layer_256_5_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_5_relu2" -type: "ReLU" -bottom: "layer_256_5_conv1_pcs_arm_sim" -top: "layer_256_5_conv1_pcs_arm_sim" - -} -layer { -name: "layer_256_5_conv2" -type: "Convolution" -bottom: "layer_256_5_conv1_pcs_arm_sim" -top: "layer_256_5_conv2" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_5_bn3" -type: "BatchNorm" -bottom: "layer_256_5_conv2" -top: "layer_256_5_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_5_relu3" -type: "ReLU" -bottom: "layer_256_5_conv2_pcs_arm_sim" -top: "layer_256_5_conv2_pcs_arm_sim" - -} -layer { -name: "layer_256_5_conv3" -type: "Convolution" -bottom: "layer_256_5_conv2_pcs_arm_sim" -top: "layer_256_5_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 1024 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_5_sum" -type: "Eltwise" -bottom: "layer_256_5_conv3" -bottom: "layer_256_4_sum" -top: "layer_256_5_sum" - -} -layer { -name: "layer_256_6_bn1" -type: "BatchNorm" -bottom: "layer_256_5_sum" -top: "layer_256_6_bn1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_6_relu1" -type: "ReLU" -bottom: "layer_256_6_bn1_pcs_arm_sim" -top: "layer_256_6_bn1_pcs_arm_sim" - -} -layer { -name: "layer_256_6_conv1" -type: "Convolution" -bottom: "layer_256_6_bn1_pcs_arm_sim" -top: "layer_256_6_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_6_bn2" -type: "BatchNorm" -bottom: "layer_256_6_conv1" -top: "layer_256_6_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_6_relu2" -type: "ReLU" -bottom: "layer_256_6_conv1_pcs_arm_sim" -top: "layer_256_6_conv1_pcs_arm_sim" - -} -layer { -name: "layer_256_6_conv2" -type: "Convolution" -bottom: "layer_256_6_conv1_pcs_arm_sim" -top: "layer_256_6_conv2" 
-param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_6_bn3" -type: "BatchNorm" -bottom: "layer_256_6_conv2" -top: "layer_256_6_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_6_relu3" -type: "ReLU" -bottom: "layer_256_6_conv2_pcs_arm_sim" -top: "layer_256_6_conv2_pcs_arm_sim" - -} -layer { -name: "layer_256_6_conv3" -type: "Convolution" -bottom: "layer_256_6_conv2_pcs_arm_sim" -top: "layer_256_6_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 1024 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_6_sum" -type: "Eltwise" -bottom: "layer_256_6_conv3" -bottom: "layer_256_5_sum" -top: "layer_256_6_sum" - -} -layer { -name: "layer_512_1_bn1" -type: "BatchNorm" -bottom: "layer_256_6_sum" -top: "layer_512_1_bn1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_512_1_relu1" -type: "ReLU" -bottom: "layer_512_1_bn1_pcs_arm_sim" -top: "layer_512_1_bn1_pcs_arm_sim" - -} -layer { -name: "layer_512_1_conv1" -type: "Convolution" -bottom: "layer_512_1_bn1_pcs_arm_sim" -top: "layer_512_1_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 512 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_512_1_bn2" -type: "BatchNorm" -bottom: "layer_512_1_conv1" -top: "layer_512_1_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_512_1_relu2" -type: "ReLU" -bottom: "layer_512_1_conv1_pcs_arm_sim" -top: "layer_512_1_conv1_pcs_arm_sim" - -} -layer { -name: "layer_512_1_conv2" -type: "Convolution" -bottom: "layer_512_1_conv1_pcs_arm_sim" -top: "layer_512_1_conv2" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 512 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 2 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_512_1_bn3" -type: "BatchNorm" -bottom: "layer_512_1_conv2" -top: "layer_512_1_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_512_1_relu3" -type: "ReLU" -bottom: "layer_512_1_conv2_pcs_arm_sim" -top: "layer_512_1_conv2_pcs_arm_sim" - -} -layer { -name: "layer_512_1_conv3" -type: "Convolution" -bottom: "layer_512_1_conv2_pcs_arm_sim" -top: "layer_512_1_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 2048 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_512_1_conv_expand" -type: "Convolution" -bottom: "layer_512_1_bn1_pcs_arm_sim" -top: "layer_512_1_conv_expand" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 2048 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 2 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_512_1_sum" -type: "Eltwise" -bottom: "layer_512_1_conv3" -bottom: "layer_512_1_conv_expand" -top: "layer_512_1_sum" - -} -layer { -name: "layer_512_2_bn1" -type: "BatchNorm" -bottom: "layer_512_1_sum" -top: 
"layer_512_2_bn1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_512_2_relu1" -type: "ReLU" -bottom: "layer_512_2_bn1_pcs_arm_sim" -top: "layer_512_2_bn1_pcs_arm_sim" - -} -layer { -name: "layer_512_2_conv1" -type: "Convolution" -bottom: "layer_512_2_bn1_pcs_arm_sim" -top: "layer_512_2_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 512 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_512_2_bn2" -type: "BatchNorm" -bottom: "layer_512_2_conv1" -top: "layer_512_2_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_512_2_relu2" -type: "ReLU" -bottom: "layer_512_2_conv1_pcs_arm_sim" -top: "layer_512_2_conv1_pcs_arm_sim" - -} -layer { -name: "layer_512_2_conv2" -type: "Convolution" -bottom: "layer_512_2_conv1_pcs_arm_sim" -top: "layer_512_2_conv2" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 512 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_512_2_bn3" -type: "BatchNorm" -bottom: "layer_512_2_conv2" -top: "layer_512_2_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_512_2_relu3" -type: "ReLU" -bottom: "layer_512_2_conv2_pcs_arm_sim" -top: "layer_512_2_conv2_pcs_arm_sim" - -} -layer { -name: "layer_512_2_conv3" -type: "Convolution" -bottom: "layer_512_2_conv2_pcs_arm_sim" -top: "layer_512_2_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 2048 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_512_2_sum" -type: "Eltwise" -bottom: "layer_512_2_conv3" -bottom: "layer_512_1_sum" -top: "layer_512_2_sum" - -} -layer { -name: "layer_512_3_bn1" -type: "BatchNorm" -bottom: "layer_512_2_sum" -top: "layer_512_3_bn1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_512_3_relu1" -type: "ReLU" -bottom: "layer_512_3_bn1_pcs_arm_sim" -top: "layer_512_3_bn1_pcs_arm_sim" - -} -layer { -name: "layer_512_3_conv1" -type: "Convolution" -bottom: "layer_512_3_bn1_pcs_arm_sim" -top: "layer_512_3_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 512 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_512_3_bn2" -type: "BatchNorm" -bottom: "layer_512_3_conv1" -top: "layer_512_3_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_512_3_relu2" -type: "ReLU" -bottom: "layer_512_3_conv1_pcs_arm_sim" -top: "layer_512_3_conv1_pcs_arm_sim" - -} -layer { -name: "layer_512_3_conv2" -type: "Convolution" -bottom: "layer_512_3_conv1_pcs_arm_sim" -top: "layer_512_3_conv2" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 512 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_512_3_bn3" -type: "BatchNorm" -bottom: "layer_512_3_conv2" -top: "layer_512_3_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_512_3_relu3" -type: "ReLU" -bottom: "layer_512_3_conv2_pcs_arm_sim" -top: "layer_512_3_conv2_pcs_arm_sim" - -} -layer { -name: 
"layer_512_3_conv3" -type: "Convolution" -bottom: "layer_512_3_conv2_pcs_arm_sim" -top: "layer_512_3_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 2048 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_512_3_sum" -type: "Eltwise" -bottom: "layer_512_3_conv3" -bottom: "layer_512_2_sum" -top: "layer_512_3_sum" - -} -layer { -name: "last_bn" -type: "BatchNorm" -bottom: "layer_512_3_sum" -top: "layer_512_3_sum_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "last_relu" -type: "ReLU" -bottom: "layer_512_3_sum_pcs_arm_sim" -top: "layer_512_3_sum_pcs_arm_sim" - -} -layer { -name: "global_pool" -type: "Pooling" -bottom: "layer_512_3_sum_pcs_arm_sim" -top: "global_pool" -pooling_param { - pool: AVE - global_pooling: true -} - -} -layer { -name: "score" -type: "InnerProduct" -bottom: "global_pool" -top: "score" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -param { - lr_mult: 2.0 - decay_mult: 1.0 -} -inner_product_param { - num_output: 1000 -} - -} -layer { -name: "loss" -type: "SoftmaxWithLoss" -bottom: "score" -bottom: "label" -top: "loss" - -} -layer { -name: "accuracy" -type: "Accuracy" -bottom: "score" -bottom: "label" -top: "accuracy" -include { - phase: TEST -} - -}