diff --git a/Makefile b/Makefile
index 4d9eada0f..fd6e78bc8 100644
--- a/Makefile
+++ b/Makefile
@@ -77,11 +77,13 @@ ifeq ($(CAFFE_PER_LAYER_TIMINGS), 1)
 endif
 
 ifeq ($(CAFFE_MLSL_SHUFFLE), 1)
-    COMMON_FLAGS += -DCAFFE_MLSL_SHUFFLE
+	COMMON_FLAGS += -DCAFFE_MLSL_SHUFFLE
 endif
+ifeq ($(FW_OVERLAP_OPT), 1)
+	COMMON_FLAGS += -DFW_OVERLAP_OPT
+endif
 endif
-
 #################### MLSL ####################
diff --git a/Makefile.mkldnn b/Makefile.mkldnn
index 51f7fcab6..ec1a70bc5 100644
--- a/Makefile.mkldnn
+++ b/Makefile.mkldnn
@@ -32,7 +32,7 @@ mkldnn_download:
 
 mkldnn_build: mkldnn_download
 	cmake $(MKLDNN_CMAKE_FLAGS)
-	make -C $(CAFFE_ROOTDIR)/$(MKLDNN_BUILDDIR)
+	make -C $(CAFFE_ROOTDIR)/$(MKLDNN_BUILDDIR) -j$(shell cat /proc/cpuinfo |grep 'processor'|wc -l)
 	make -C $(CAFFE_ROOTDIR)/$(MKLDNN_BUILDDIR) install
 else
 mkldnn_download:
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 0d27a46f4..67adf4ba7 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -115,6 +115,17 @@ if(USE_MLSL)
   include_directories(SYSTEM "${MLSL_ROOT}/intel64/include")
   link_directories(SYSTEM "${MLSL_ROOT}/intel64/lib")
   list(APPEND Caffe_LINKER_LIBS mlsl)
+
+  if(CAFFE_PER_LAYER_TIMINGS)
+    add_definitions("-DCAFFE_PER_LAYER_TIMINGS")
+  endif()
+  if(CAFFE_MLSL_SHUFFLE)
+    add_definitions("-DCAFFE_MLSL_SHUFFLE")
+  endif()
+  if(FW_OVERLAP_OPT)
+    message(STATUS "Forward overlapping optimization is enabled!")
+    add_definitions("-DFW_OVERLAP_OPT")
+  endif()
 endif()
 
 # ---[ BLAS
diff --git a/cmake/MKLDNN.cmake b/cmake/MKLDNN.cmake
index 97000b7a5..43c51f7ee 100644
--- a/cmake/MKLDNN.cmake
+++ b/cmake/MKLDNN.cmake
@@ -8,7 +8,14 @@ function(Download_MKLDNN)
   execute_process(COMMAND cat mkldnn.commit
                   WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
                   OUTPUT_VARIABLE MKLDNN_COMMIT)
-
+
+  include(ProcessorCount)
+  ProcessorCount(NCORE)
+  if(NOT NCORE EQUAL 0)
+    set(CTEST_BUILD_FLAGS -j${NCORE})
+    set(ctest_test_args ${ctest_test_args} PARALLEL_LEVEL ${NCORE})
+  endif()
+
   ExternalProject_add(MKLDNN_Build
                       SOURCE_DIR ${MKLDNN_SOURCE_DIR}
                       CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
                                  -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} -DMKLROOT=${MKL_ROOT_DIR}
@@ -20,7 +27,7 @@ function(Download_MKLDNN)
                       BUILD_COMMAND cmake ${MKLDNN_SOURCE_DIR}
                       #--Install step
                       INSTALL_DIR ${MKLDNN_INSTALL_DIR}
-                      INSTALL_COMMAND make install
+                      INSTALL_COMMAND make install -j${NCORE}
                       LOG_CONFIGURE 1
                       LOG_BUILD 1
                       LOG_INSTALL 1
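Both build files above parallelize the MKL-DNN build over all logical processors, once by grepping /proc/cpuinfo in the Makefile and once via CMake's ProcessorCount module. A minimal Python sketch of the same core-count logic (illustrative only, not part of the patch):

    import multiprocessing

    # Same result on Linux as `cat /proc/cpuinfo | grep 'processor' | wc -l`
    # or CMake's ProcessorCount(NCORE): the number of logical CPUs.
    ncore = multiprocessing.cpu_count()
    print("make -j%d" % ncore)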
diff --git a/examples/LRCN_activity_recognition/train_test_singleFrame_RGB.prototxt b/examples/LRCN_activity_recognition/train_test_singleFrame_RGB.prototxt
index 8663afe45..26e4ddc55 100644
--- a/examples/LRCN_activity_recognition/train_test_singleFrame_RGB.prototxt
+++ b/examples/LRCN_activity_recognition/train_test_singleFrame_RGB.prototxt
@@ -13,7 +13,6 @@ layer {
     mean_value: 103.939
     mean_value: 116.779
     mean_value: 123.68
-    flow: false
   }
   image_data_param {
     source: "ucf101_singleFrame_RGB_train_split1.txt"
@@ -38,7 +37,6 @@ layer {
     mean_value: 103.939
     mean_value: 116.779
     mean_value: 123.68
-    flow: false
   }
   image_data_param {
     source: "ucf101_singleFrame_RGB_test_split1.txt"
diff --git a/examples/pycaffe/tune_engine.py b/examples/pycaffe/tune_engine.py
new file mode 100755
index 000000000..850b94929
--- /dev/null
+++ b/examples/pycaffe/tune_engine.py
@@ -0,0 +1,190 @@
+import os
+import sys
+import copy
+import argparse
+
+from caffe.proto import caffe_pb2
+import google.protobuf.text_format as txtf
+
+def readFile(filePath):
+    lines = []
+    file = open(filePath, 'r')
+    for line in file.readlines():
+        lines.append(line)
+    file.close()
+
+    return lines
+
+def writeFile(filePath, lines):
+    file = open(filePath, 'w+')
+    file.write(lines)
+    file.close()
+
+def parseLog(log):
+    lines = readFile(log)
+    model_start = False
+    time_start = False
+    model_lines = []
+    time_lines = []
+    for line in lines:
+        trim_line = line.strip()
+        if trim_line.endswith("Initializing net from parameters:"):
+            model_start = True
+            continue
+        if model_start:
+            if trim_line.find("Creating layer") <> -1:
+                model_start = False
+                continue
+            model_lines.append(line)
+
+        if trim_line.endswith("Average time per layer:"):
+            time_start = True
+            continue
+        if time_start:
+            if trim_line.find("Average Forward pass") <> -1:
+                time_start = False
+                break
+            time_lines.append(line)
+
+    model_lines = model_lines[1:]
+    model_str = ""
+    for line in model_lines:
+        model_str = model_str + line
+
+    return (model_str, time_lines)
+
+def parseTimeLines(timeLines):
+    layer_map = {}
+    for line in timeLines:
+        trim_line = line.strip()
+        items = trim_line.split("\t")
+        layer_items = items[0].split(" ")
+        layer_name = layer_items[-1]
+        time_items = items[1].split(" ")
+        if layer_name not in layer_map.keys():
+            layer_map[layer_name] = (float)(time_items[1])
+        else:
+            layer_map[layer_name] = layer_map[layer_name] + (float)(time_items[1])
+
+    return layer_map
+
+def parseModelStr(modelStr):
+    net = caffe_pb2.NetParameter()
+    txtf.Merge(modelStr, net)
+    layer_model_map = {}
+    global_engine = "CAFFE"
+    if net.engine != "":
+        global_engine = net.engine
+    for index in range(0, len(net.layer)):
+        engine = global_engine
+        l = net.layer[index]
+        if l.engine != "":
+            engine = l.engine
+        param_engine = -1
+        if l.type == "Convolution" or l.type == "Deconvolution":
+            if l.convolution_param.engine != "":
+                param_engine = l.convolution_param.engine
+        elif l.type == "BatchNorm":
+            if l.batch_norm_param.engine != "":
+                param_engine = l.batch_norm_param.engine
+        elif l.type == "Concat":
+            if l.concat_param.engine != "":
+                param_engine = l.concat_param.engine
+        elif l.type == "Eltwise":
+            if l.eltwise_param.engine != "":
+                param_engine = l.eltwise_param.engine
+        elif l.type == "InnerProduct":
+            if l.inner_product_param.engine != "":
+                param_engine = l.inner_product_param.engine
+        elif l.type == "LRN":
+            if l.lrn_param.engine != "":
+                param_engine = l.lrn_param.engine
+        elif l.type == "Pooling":
+            if l.pooling_param.engine != "":
+                param_engine = l.pooling_param.engine
+        elif l.type == "ReLU":
+            if l.relu_param.engine != "":
+                param_engine = l.relu_param.engine
+
+        if param_engine == 0 or param_engine == 1:
+            engine = "CAFFE"
+        elif param_engine == 3:
+            engine = "MKL2017"
+        elif param_engine == 4:
+            engine = "MKLDNN"
+        layer_model_map[l.name] = (index, engine, l)
+
+    return (net, layer_model_map)
+
+def selectOptimalEngine(layers):
+    optimal_layer = None
+    min_time = sys.float_info.max
+    for layer in layers:
+        if layer[2] < min_time:
+            min_time = layer[2]
+            optimal_layer = layer
+
+    return optimal_layer
+
+def tuneEngine(logs, model):
+    if len(logs) <= 1:
+        print "[ERROR] Please specify two or more log files"
+        exit(1)
+
+    for log in logs:
+        if not os.path.exists(log):
+            print "[ERROR] Please specify valid log file:", log
+            exit(1)
+
+    layer_map = {}
+    net = None
+    for log in logs:
+        log_name = os.path.basename(log)
+        (model_str, time_lines) = parseLog(log)
+        (net, layer_model_map) = parseModelStr(model_str)
+        layer_time_map = parseTimeLines(time_lines)
+        for k, v in layer_model_map.items():
+            if k not in layer_map.keys():
+                layer_map[k] = [(v[0], v[1], layer_time_map[k], v[2])]
+            else:
+                layer_map_v = layer_map[k]
+                layer_map_v.append((v[0], v[1], layer_time_map[k], v[2]))
+                layer_map[k] = layer_map_v
+
+    optimal_layer_map = {}
+    for k, v in layer_map.items():
+        optimal_layer = selectOptimalEngine(v)
+        assert(optimal_layer != None)
+        optimal_layer_map[optimal_layer[0]] = optimal_layer[3]
+
+    genModel(net, model, optimal_layer_map)
+
+def genModel(net, model, optimal_layer_map):
+    net_str = ""
+    net_str += "name: \"" + net.name + "\"\n"
+    for index in range(0, len(net.layer)):
+        net_str += "layer {\n"
+        l = net.layer[index]
+        if l.type.endswith("Data"):
+            net_str += str(l) + "\n}\n"
+            continue
+        l = optimal_layer_map[index]
+        net_str += str(l) + "\n}\n"
+    with open(model, 'w') as f:
+        net = caffe_pb2.NetParameter()
+        txtf.Merge(net_str, net)
+        f.write(str(net))
+    print "[INFO] Complete model engine tuning:", model
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('-l', '--logs', nargs='+', help='require the caffe time logs', required=True)
+
+    parser.add_argument('-o', '--output', action='store', dest='output', default="",
+                        help='require the model output')
+
+    parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.0')
+
+    params = parser.parse_args()
+    tuneEngine(params.logs, params.output)
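tune_engine.py picks, per layer, the fastest engine observed across several `caffe time` runs of the same model (one log per candidate engine) and writes a mixed-engine prototxt. A minimal driver sketch, assuming pycaffe is importable and tune_engine.py is on the path; the log names are hypothetical:

    # Run `caffe time` once per engine first, capturing its output to a log.
    from tune_engine import tuneEngine

    tuneEngine(["ssd_mkl2017.log", "ssd_mkldnn.log"], "ssd_tuned.prototxt")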
diff --git a/examples/pycaffe/tune_model.py b/examples/pycaffe/tune_model.py
new file mode 100644
index 000000000..bb9e4bfdd
--- /dev/null
+++ b/examples/pycaffe/tune_model.py
@@ -0,0 +1,99 @@
+import os
+import datetime
+import copy
+import argparse
+
+from caffe.proto import caffe_pb2
+import google.protobuf.text_format as txtf
+import caffe
+
+def isWinogradApplicable(ic, oc, stride, kernel_size):
+    if ic % 16 != 0:
+        return False
+    if oc % 16 != 0:
+        return False
+    if stride != 1:
+        return False
+    if kernel_size != 3:
+        return False
+
+    return True
+
+def genHybridModel(net, winogradLayers, modelName):
+    newNet = copy.deepcopy(net)
+    newNetName = modelName.split(".")[0] + "_hybrid.prototxt"
+    for layer in winogradLayers:
+        newNet.layer[layer].convolution_param.conv_algorithm = "winograd"
+    with open(newNetName, 'w') as f:
+        f.write(str(newNet))
+    print "[INFO] Complete model tuning with Winograd:", newNetName
+
+def tuneModelDefinition(model):
+    net = caffe_pb2.NetParameter()
+    with open(model) as f:
+        s = f.read()
+        txtf.Merge(s, net)
+
+    net.name = 'Tuned model of ' + net.name
+    output_layer_map = {}
+    for index in range(0, len(net.layer)):
+        l = net.layer[index]
+        if l.type == ("Convolution"):
+            stride = 0
+            kernel_size = 0
+            if len(l.convolution_param.stride) == 0:
+                stride = 1
+            else:
+                stride = l.convolution_param.stride[0]
+            kernel_size = l.convolution_param.kernel_size[0]
+            ic = 0
+            if l.bottom[0] in output_layer_map.keys():
+                ic = output_layer_map[l.bottom[0]][4]
+            oc = l.convolution_param.num_output
+            output_layer_map[l.name] = (index, stride, kernel_size, ic, oc, True)
+        elif l.type == ("InnerProduct"):
+            oc = l.inner_product_param.num_output
+            ic = 0
+            if l.bottom[0] in output_layer_map.keys():
+                ic = output_layer_map[l.bottom[0]][4]
+            output_layer_map[l.name] = (index, 0, 0, ic, oc, False)
+        elif l.type.endswith("Data") or l.type.endswith("Input"):
+            # TODO: correct the output
+            # dynamic_net = caffe.Net(model, caffe.TEST)
+            # for k, v in dynamic_net.blobs.items():
+            #     dynamic_net_map[k] = v.data.shape
+            ic = oc = 3
+            output_layer_map[l.name] = (index, 0, 0, ic, oc, False)
+        else:
+            ic = 0
+            if l.bottom[0] in output_layer_map.keys():
+                ic = output_layer_map[l.bottom[0]][4]
+            oc = ic
+            output_layer_map[l.name] = (index, 0, 0, ic, oc, False)
+
+    winograd_convolutions = []
+    for k,v in output_layer_map.items():
+        if v[5] and isWinogradApplicable(v[3], v[4], v[1], v[2]):
+            winograd_convolutions.append(v[0])
+
+    if len(winograd_convolutions) > 0:
+        genHybridModel(net, winograd_convolutions, model)
+    else:
+        print "[INFO] No need to tune model with Winograd:", model
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('-m', '--model', action='store', dest='model', default="",
+                        help='require the model definition (prototxt)')
+
+    parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.0')
+
+    params = parser.parse_args()
+
+    model = params.model
+    if not os.path.exists(params.model):
+        print "[ERROR] Please specify the model definition file with -m"
+        exit(1)
+
+    tuneModelDefinition(model)
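tune_model.py flips a convolution to conv_algorithm "winograd" only when isWinogradApplicable() holds: both channel counts divisible by 16, stride 1, and a 3x3 kernel. A quick check with made-up layer shapes, assuming tune_model.py and pycaffe are importable:

    from tune_model import isWinogradApplicable

    # 3x3 kernel, stride 1, channels divisible by 16: eligible.
    print isWinogradApplicable(64, 128, 1, 3)    # True
    # Stride 2 disqualifies the layer.
    print isWinogradApplicable(64, 128, 2, 3)    # False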
diff --git a/include/caffe/data_reader.hpp b/include/caffe/data_reader.hpp
index c700586be..dff77199f 100644
--- a/include/caffe/data_reader.hpp
+++ b/include/caffe/data_reader.hpp
@@ -129,6 +129,7 @@ class DataReader {
 
   const LayerParameter param_;
   BlockingQueue<shared_ptr<QueuePair> > new_queue_pairs_;
+  bool first_read_;
 
   friend class DataReader;
diff --git a/include/caffe/data_transformer.hpp b/include/caffe/data_transformer.hpp
index d95df5a0c..f709a281b 100644
--- a/include/caffe/data_transformer.hpp
+++ b/include/caffe/data_transformer.hpp
@@ -396,6 +396,11 @@ class DataTransformer {
            bool has_mean_values>
   void Transform(const Datum& datum, Dtype* transformed_data,
                  NormalizedBBox* crop_bbox, RandNumbers& rand_num);
+
+#ifdef USE_OPENCV
+  void RandomResizeImage(const Datum& datum, Datum *resized_datum);
+  void RandomResizeImage(const cv::Mat& img, cv::Mat *resized_img);
+#endif
 };
 
 }  // namespace caffe
diff --git a/include/caffe/mkldnn_memory.hpp b/include/caffe/mkldnn_memory.hpp
index 706dd3973..a59ce6e12 100644
--- a/include/caffe/mkldnn_memory.hpp
+++ b/include/caffe/mkldnn_memory.hpp
@@ -112,9 +112,21 @@ class MKLDNNMemoryDescriptorBase : public PrvMemDescr
     void allocate() {
         if (_prv_memory == NULL) {
+#ifdef USE_MLSL
+            if (mn::is_multinode()) {
+                auto mlsl_free = [](char* p) { mn::free((void*)p); };
+                _mlsl_memory.reset(
+                    (char*)mn::alloc(_prv_memory_pd->get_size(), 64), mlsl_free);
+                _prv_memory = shared_ptr<memory>(
+                    new memory(*_prv_memory_pd, (void*)_mlsl_memory.get()));
+            } else {
+#endif
             _prv_memory = shared_ptr<memory>(new memory(*_prv_memory_pd));
-            _internal_ptr = (Dtype *)(_prv_memory->get_data_handle());
-            // TODO: may need initialize memory by 0
+#ifdef USE_MLSL
+            }
+#endif
+            _internal_ptr = (Dtype *)(_prv_memory->get_data_handle());
+            // TODO: may need initialize memory by 0
         }
     }
     void set_prv_memory_pd(shared_ptr<memory::primitive_desc> memory_pd) {
@@ -156,6 +168,9 @@ class MKLDNNMemoryDescriptorBase : public PrvMemDescr
 
     MKLDNNLayer<Dtype>* _mkldnn_layer;
     Blob<Dtype>* _blob;
+#ifdef USE_MLSL
+    shared_ptr<char> _mlsl_memory;
+#endif
 };
 
 template <typename Dtype, typename T>
diff --git a/include/caffe/multinode/mlsl.hpp b/include/caffe/multinode/mlsl.hpp
index b135e4673..b0d3d13d6 100644
--- a/include/caffe/multinode/mlsl.hpp
+++ b/include/caffe/multinode/mlsl.hpp
@@ -48,6 +48,8 @@ namespace caffe {
 
 #define MLSL_DEFAULT_COLOR -1
 
+    void init(int* argc, char** argv[]);
+
     inline void free(void *addr) {
       return MLSL::Environment::GetEnv().Free(addr);
     }
diff --git a/include/caffe/multinode/multi_solver.hpp b/include/caffe/multinode/multi_solver.hpp
index 41b92665a..5d2082821 100644
--- a/include/caffe/multinode/multi_solver.hpp
+++ b/include/caffe/multinode/multi_solver.hpp
@@ -60,6 +60,12 @@ class MultiSolver {
       iter_size(root_solver_->param().iter_size()) {
     root_solver_->set_forward_backward(
         boost::bind(&MultiSolver::ForwardBackward, this));
+#ifdef FW_OVERLAP_OPT
+    Net<Dtype>& net = *root_solver_->net();
+    const std::vector<boost::shared_ptr<Layer<Dtype>>>& layers{ net.layers() };
+    layer_finished_flags_.resize(layers.size());
+    std::fill(layer_finished_flags_.begin(), layer_finished_flags_.end(), true);
+#endif
   }
 
@@ -99,14 +105,23 @@ class MultiSolver {
   boost::shared_ptr<Solver<Dtype>> root_solver() {
     return root_solver_;
   }
-
+#ifdef FW_OVERLAP_OPT
+  void set_layer_finished_flag(int layer_id, bool flag) {
+    layer_finished_flags_[layer_id] = flag;
+  }
+#endif
 
  private:
   virtual Dtype ForwardBackwardImpl(bool first, bool last);
+  bool IsSkipWaitGradient(int layer_id);
+  void WaitAndUpdateGradient(int layer_id);
 
  protected:
   boost::shared_ptr<Solver<Dtype>> root_solver_;
   int iter_size;
   vector<Callback*> callbacks_;
+#ifdef FW_OVERLAP_OPT
+  vector<bool> layer_finished_flags_;
+#endif
 };
 
 }  // namespace caffe
diff --git a/include/caffe/multinode/multi_sync.hpp b/include/caffe/multinode/multi_sync.hpp
index 2d4c566ae..b979e89fe 100644
--- a/include/caffe/multinode/multi_sync.hpp
+++ b/include/caffe/multinode/multi_sync.hpp
@@ -63,7 +63,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 namespace caffe {
 
-#define CAN_USE_PRV(param) false //(param->prv_diff() && (param->prv_diff_count() == param->count()))
+#define CAN_USE_PRV(param) (param->prv_diff() && (param->prv_diff_count() == param->count()))
 
 template <typename Dtype>
 class MultiSync : public MultiSolver<Dtype>::Callback {
@@ -74,6 +74,10 @@ namespace caffe {
   shared_ptr<Net<Dtype>> net;
   const vector<Blob<Dtype> *> &net_params;
   vector<vector<int>> layer_param_ids;
+#ifdef FW_OVERLAP_OPT
+  vector<vector<bool>> param_ids_finished_flags;
+#endif
+
   // layer_id -> blob_id -> cached blob to restore
   // statistics
   vector<vector<shared_ptr<Blob<Dtype>>>> cached_stats;
@@ -160,6 +164,12 @@ namespace caffe {
             << " ENABLED"
 #else
             << " DISABLED"
+#endif
+            << ", FORWARD OVERLAP OPTIMIZATION IS"
+#ifdef FW_OVERLAP_OPT
+            << " ENABLED"
+#else
+            << " DISABLED"
 #endif
             << ", SINGLE DB SPLITTING IS"
 #ifdef CAFFE_MLSL_SHUFFLE
@@ -172,15 +182,15 @@ namespace caffe {
       mn::train::commit();
 
 #ifdef PERFORMANCE_MONITORING
-    statsIterResult.resize(caffe::mn::train::get_session().get_operation_count());
-    caffe::mn::train::stats::start();
+      statsIterResult.resize(caffe::mn::train::get_session().get_operation_count());
+      caffe::mn::train::stats::start();
 #endif
 
       solver->add_callback(this);
       solver->Solve();
 
 #ifdef PERFORMANCE_MONITORING
-    dump_stats_to_file();
+      dump_stats_to_file();
 #endif
     }
 
@@ -196,14 +206,24 @@ namespace caffe {
     }
 
     void on_iter_finished(int layer_id) {
+#ifdef FW_OVERLAP_OPT
+      solver->set_layer_finished_flag(layer_id, false);
+#endif
+
       boost::shared_ptr<Layer<Dtype>> &layer = layers[layer_id];
       if (layer->layerOp == nullptr) {
        return;
       }
 
+#ifdef FW_OVERLAP_OPT
+      std::fill(param_ids_finished_flags[layer_id].begin(),
+                param_ids_finished_flags[layer_id].end(),
+                false);
+#endif
+
       std::vector<int> &param_ids = layer_param_ids[layer_id];
       for (int i = 0; i < param_ids.size(); ++i) {
-        if (!layer->ParamNeedReduce(param_ids[i])) continue;
+        if (!layer->ParamNeedReduce(i)) continue;
         if (CAN_USE_PRV(net_params[param_ids[i]])) {
           layer->layerOp->GetParameterSet(i)->StartGradientComm((void *) net_params[param_ids[i]]->mutable_prv_diff());
         } else {
@@ -215,15 +235,35 @@ namespace caffe {
     void on_delwt_wait(int layer_id) {
       boost::shared_ptr<Layer<Dtype>> &layer = layers[layer_id];
       if (layer->layerOp == nullptr) {
+#ifdef FW_OVERLAP_OPT
+        solver->set_layer_finished_flag(layer_id, true);
+#endif
         return;
       }
 
       std::vector<int> &param_ids = layer_param_ids[layer_id];
-      for (int i=0; i<param_ids.size(); ++i) {
-        if (!layer->ParamNeedReduce(param_ids[i])) continue;
+      for (int i = 0; i < param_ids.size(); ++i) {
+        if (!layer->ParamNeedReduce(i)
+#ifdef FW_OVERLAP_OPT
+            || (param_ids_finished_flags[layer_id][i] == true)) {
+          param_ids_finished_flags[layer_id][i] = true;
+#else
+        ) {
+#endif
+          continue;
+        }
+
+#ifdef FW_OVERLAP_OPT
+        bool is_completed = false;
+        Dtype *delwt_buf{(Dtype *) layer->layerOp->GetParameterSet(i)->TestGradientComm(&is_completed)};
+#else
         Dtype *delwt_buf{(Dtype *) layer->layerOp->GetParameterSet(i)->WaitGradientComm()};
+#endif
        if (delwt_buf) {
+#ifdef FW_OVERLAP_OPT
+          assert(is_completed);
+          param_ids_finished_flags[layer_id][i] = true;
+#endif
          if (CAN_USE_PRV(net_params[param_ids[i]])) {
            if (delwt_buf != net_params[param_ids[i]]->prv_diff())
              caffe_copy(net_params[param_ids[i]]->count(),
@@ -235,6 +275,14 @@ namespace caffe {
                         net_params[param_ids[i]]->mutable_cpu_diff());
          }
        }
+
+#ifdef FW_OVERLAP_OPT
+        int finished_count = std::count(param_ids_finished_flags[layer_id].begin(),
+                                        param_ids_finished_flags[layer_id].end(), true);
+        if (finished_count == param_ids.size()) {
+          solver->set_layer_finished_flag(layer_id, true);
+        }
+#endif
      }
 
      void on_gradients_ready() {
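With FW_OVERLAP_OPT, on_delwt_wait() polls TestGradientComm() instead of blocking in WaitGradientComm(), records each finished parameter in param_ids_finished_flags, and flips the solver's per-layer flag once all of a layer's gradients have arrived, so the next forward pass of finished layers can overlap the communication still pending for the others. A rough Python sketch of that polling pattern (the names comm_test and start_forward are illustrative stand-ins, not the C++ API):

    # comm_test(layer_id, i) stands in for the non-blocking TestGradientComm;
    # start_forward(layer_id) stands in for kicking off that layer's next forward.
    def poll_gradients(layer_param_ids, comm_test, start_forward):
        finished = {lid: [False] * len(p) for lid, p in layer_param_ids.items()}
        done_layers = set()
        while len(done_layers) < len(layer_param_ids):
            for lid, param_ids in layer_param_ids.items():
                if lid in done_layers:
                    continue
                for i in range(len(param_ids)):
                    if not finished[lid][i] and comm_test(lid, i):
                        finished[lid][i] = True
                if all(finished[lid]):       # analogue of set_layer_finished_flag
                    done_layers.add(lid)
                    start_forward(lid)       # overlap forward with pending comms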
diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp
index 0dc63436c..ba47be986 100644
--- a/include/caffe/net.hpp
+++ b/include/caffe/net.hpp
@@ -304,7 +304,6 @@ class Net {
    * @brief If find "Conv--BN--Scale" in current network, merge BN and Scale layer into Convolution
    * layers, this optimization only works in caffe TEST phase now.
    */
-  static void RemoveBNScale(const NetParameter& param, NetParameter* param_compiled);
   static void GetBlobConsumers(std::vector<const LayerParameter*> &cnsmer_blobs,
                                const string& blob_name_to_find,
diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp
index 05413a6c9..9b97c3c0b 100644
--- a/include/caffe/solver.hpp
+++ b/include/caffe/solver.hpp
@@ -165,10 +165,18 @@ class Solver {
   std::vector<double> forward_time_per_layer;
   std::vector<double> backward_time_per_layer;
   std::vector<double> update_time_per_layer;
+#ifdef USE_MLSL
+  std::vector<double> startcomm_time_per_layer;
+  std::vector<double> waitcomm_time_per_layer;
+#endif
 
   std::vector<double> forward_time_per_layer_total;
   std::vector<double> backward_time_per_layer_total;
   std::vector<double> update_time_per_layer_total;
+#ifdef USE_MLSL
+  std::vector<double> startcomm_time_per_layer_total;
+  std::vector<double> waitcomm_time_per_layer_total;
+#endif
 
   void InitTimers();
   void ResetTimers();
diff --git a/include/caffe/util/remove_batch_norm.hpp b/include/caffe/util/remove_batch_norm.hpp
index c2e92f40f..316a4c022 100644
--- a/include/caffe/util/remove_batch_norm.hpp
+++ b/include/caffe/util/remove_batch_norm.hpp
@@ -69,5 +69,7 @@ void AdjustConvLayer(LayerParameter &conv_layer,
 template <typename Dtype>
 void RecoverBNScaleMergedNet(NetParameter * net_param, NetParameter* recovered_net_param);
 
+template <typename Dtype>
+void RemoveBNScale(const NetParameter& param, NetParameter* param_compiled);
 }
 #endif
diff --git a/mkldnn.commit b/mkldnn.commit
index 4e6af52a3..7eb0167ed 100644
--- a/mkldnn.commit
+++ b/mkldnn.commit
@@ -1 +1 @@
-22bf25f29369d247098968837b21f3d1bdb2336e
+171572a205c71f5bbb08657de5660c9d06cf2d8f
diff --git a/models/intel_optimized_models/ssd/VGGNet/VOC0712/SSD_300x300/deploy_mkl2017.prototxt b/models/intel_optimized_models/ssd/VGGNet/VOC0712/SSD_300x300/deploy_mkl2017.prototxt
new file mode 100644
index 000000000..7e2ddbbbb
--- /dev/null
+++ b/models/intel_optimized_models/ssd/VGGNet/VOC0712/SSD_300x300/deploy_mkl2017.prototxt
@@ -0,0 +1,1626 @@
+name: "VGG_VOC0712_SSD_300x300_deploy"
+input: "data"
+input_shape {
+  dim: 1
+  dim: 3
+  dim: 300
+  dim: 300
+}
+layer {
+  engine: "MKL2017"
+ 
name: "conv1_1" + type: "Convolution" + bottom: "data" + top: "conv1_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKL2017" + name: "relu1_1" + type: "ReLU" + bottom: "conv1_1" + top: "conv1_1" +} +layer { + engine: "MKL2017" + name: "conv1_2" + type: "Convolution" + bottom: "conv1_1" + top: "conv1_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKL2017" + name: "relu1_2" + type: "ReLU" + bottom: "conv1_2" + top: "conv1_2" +} +layer { + engine: "MKL2017" + name: "pool1" + type: "Pooling" + bottom: "conv1_2" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + engine: "MKL2017" + name: "conv2_1" + type: "Convolution" + bottom: "pool1" + top: "conv2_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKL2017" + name: "relu2_1" + type: "ReLU" + bottom: "conv2_1" + top: "conv2_1" +} +layer { + engine: "MKL2017" + name: "conv2_2" + type: "Convolution" + bottom: "conv2_1" + top: "conv2_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKL2017" + name: "relu2_2" + type: "ReLU" + bottom: "conv2_2" + top: "conv2_2" +} +layer { + engine: "MKL2017" + name: "pool2" + type: "Pooling" + bottom: "conv2_2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + engine: "MKL2017" + name: "conv3_1" + type: "Convolution" + bottom: "pool2" + top: "conv3_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKL2017" + name: "relu3_1" + type: "ReLU" + bottom: "conv3_1" + top: "conv3_1" +} +layer { + engine: "MKL2017" + name: "conv3_2" + type: "Convolution" + bottom: "conv3_1" + top: "conv3_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKL2017" + name: "relu3_2" + type: "ReLU" + bottom: "conv3_2" + top: "conv3_2" +} +layer { + engine: "MKL2017" + name: "conv3_3" + type: "Convolution" + bottom: "conv3_2" + top: "conv3_3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKL2017" + name: "relu3_3" + type: "ReLU" + bottom: "conv3_3" + top: "conv3_3" +} +layer { + engine: "MKL2017" + name: "pool3" + type: "Pooling" + bottom: 
"conv3_3" + top: "pool3" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + engine: "MKL2017" + name: "conv4_1" + type: "Convolution" + bottom: "pool3" + top: "conv4_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKL2017" + name: "relu4_1" + type: "ReLU" + bottom: "conv4_1" + top: "conv4_1" +} +layer { + engine: "MKL2017" + name: "conv4_2" + type: "Convolution" + bottom: "conv4_1" + top: "conv4_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKL2017" + name: "relu4_2" + type: "ReLU" + bottom: "conv4_2" + top: "conv4_2" +} +layer { + engine: "MKL2017" + name: "conv4_3" + type: "Convolution" + bottom: "conv4_2" + top: "conv4_3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKL2017" + name: "relu4_3" + type: "ReLU" + bottom: "conv4_3" + top: "conv4_3" +} +layer { + engine: "MKL2017" + name: "pool4" + type: "Pooling" + bottom: "conv4_3" + top: "pool4" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + engine: "MKL2017" + name: "conv5_1" + type: "Convolution" + bottom: "pool4" + top: "conv5_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + dilation: 1 + } +} +layer { + engine: "MKL2017" + name: "relu5_1" + type: "ReLU" + bottom: "conv5_1" + top: "conv5_1" +} +layer { + engine: "MKL2017" + name: "conv5_2" + type: "Convolution" + bottom: "conv5_1" + top: "conv5_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + dilation: 1 + } +} +layer { + engine: "MKL2017" + name: "relu5_2" + type: "ReLU" + bottom: "conv5_2" + top: "conv5_2" +} +layer { + engine: "MKL2017" + name: "conv5_3" + type: "Convolution" + bottom: "conv5_2" + top: "conv5_3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + dilation: 1 + } +} +layer { + engine: "MKL2017" + name: "relu5_3" + type: "ReLU" + bottom: "conv5_3" + top: "conv5_3" +} +layer { + engine: "MKL2017" + name: "pool5" + type: "Pooling" + bottom: "conv5_3" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + engine: "MKL2017" + name: "fc6" + type: "Convolution" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 1024 + pad: 6 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + dilation: 6 
+ } +} +layer { + engine: "MKL2017" + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + engine: "MKL2017" + name: "fc7" + type: "Convolution" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 1024 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKL2017" + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + engine: "MKL2017" + name: "conv6_1" + type: "Convolution" + bottom: "fc7" + top: "conv6_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKL2017" + name: "conv6_1_relu" + type: "ReLU" + bottom: "conv6_1" + top: "conv6_1" +} +layer { + engine: "MKL2017" + name: "conv6_2" + type: "Convolution" + bottom: "conv6_1" + top: "conv6_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKL2017" + name: "conv6_2_relu" + type: "ReLU" + bottom: "conv6_2" + top: "conv6_2" +} +layer { + engine: "MKL2017" + name: "conv7_1" + type: "Convolution" + bottom: "conv6_2" + top: "conv7_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKL2017" + name: "conv7_1_relu" + type: "ReLU" + bottom: "conv7_1" + top: "conv7_1" +} +layer { + engine: "MKL2017" + name: "conv7_2" + type: "Convolution" + bottom: "conv7_1" + top: "conv7_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKL2017" + name: "conv7_2_relu" + type: "ReLU" + bottom: "conv7_2" + top: "conv7_2" +} +layer { + engine: "MKL2017" + name: "conv8_1" + type: "Convolution" + bottom: "conv7_2" + top: "conv8_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKL2017" + name: "conv8_1_relu" + type: "ReLU" + bottom: "conv8_1" + top: "conv8_1" +} +layer { + engine: "MKL2017" + name: "conv8_2" + type: "Convolution" + bottom: "conv8_1" + top: "conv8_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 0 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKL2017" + name: "conv8_2_relu" + type: "ReLU" + bottom: "conv8_2" + top: "conv8_2" +} +layer { + engine: "MKL2017" + name: "conv9_1" + type: "Convolution" + bottom: "conv8_2" + top: "conv9_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + 
convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKL2017" + name: "conv9_1_relu" + type: "ReLU" + bottom: "conv9_1" + top: "conv9_1" +} +layer { + engine: "MKL2017" + name: "conv9_2" + type: "Convolution" + bottom: "conv9_1" + top: "conv9_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 0 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKL2017" + name: "conv9_2_relu" + type: "ReLU" + bottom: "conv9_2" + top: "conv9_2" +} +layer { + name: "conv4_3_norm" + type: "Normalize" + bottom: "conv4_3" + top: "conv4_3_norm" + norm_param { + across_spatial: false + scale_filler { + type: "constant" + value: 20 + } + channel_shared: false + } +} +layer { + name: "conv4_3_norm_mbox_loc" + type: "Convolution" + bottom: "conv4_3_norm" + top: "conv4_3_norm_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 16 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv4_3_norm_mbox_loc_perm" + type: "Permute" + bottom: "conv4_3_norm_mbox_loc" + top: "conv4_3_norm_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv4_3_norm_mbox_loc_flat" + type: "Flatten" + bottom: "conv4_3_norm_mbox_loc_perm" + top: "conv4_3_norm_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv4_3_norm_mbox_conf" + type: "Convolution" + bottom: "conv4_3_norm" + top: "conv4_3_norm_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 84 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv4_3_norm_mbox_conf_perm" + type: "Permute" + bottom: "conv4_3_norm_mbox_conf" + top: "conv4_3_norm_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv4_3_norm_mbox_conf_flat" + type: "Flatten" + bottom: "conv4_3_norm_mbox_conf_perm" + top: "conv4_3_norm_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv4_3_norm_mbox_priorbox" + type: "PriorBox" + bottom: "conv4_3_norm" + bottom: "data" + top: "conv4_3_norm_mbox_priorbox" + prior_box_param { + min_size: 30.0 + max_size: 60.0 + aspect_ratio: 2 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 8 + offset: 0.5 + } +} +layer { + engine: "MKL2017" + name: "fc7_mbox_loc" + type: "Convolution" + bottom: "fc7" + top: "fc7_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 24 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "fc7_mbox_loc_perm" + type: "Permute" + bottom: "fc7_mbox_loc" + top: "fc7_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "fc7_mbox_loc_flat" + type: "Flatten" + bottom: "fc7_mbox_loc_perm" + top: "fc7_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "fc7_mbox_conf" + 
type: "Convolution" + bottom: "fc7" + top: "fc7_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 126 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "fc7_mbox_conf_perm" + type: "Permute" + bottom: "fc7_mbox_conf" + top: "fc7_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "fc7_mbox_conf_flat" + type: "Flatten" + bottom: "fc7_mbox_conf_perm" + top: "fc7_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "fc7_mbox_priorbox" + type: "PriorBox" + bottom: "fc7" + bottom: "data" + top: "fc7_mbox_priorbox" + prior_box_param { + min_size: 60.0 + max_size: 111.0 + aspect_ratio: 2 + aspect_ratio: 3 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 16 + offset: 0.5 + } +} +layer { + engine: "MKL2017" + name: "conv6_2_mbox_loc" + type: "Convolution" + bottom: "conv6_2" + top: "conv6_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 24 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6_2_mbox_loc_perm" + type: "Permute" + bottom: "conv6_2_mbox_loc" + top: "conv6_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv6_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv6_2_mbox_loc_perm" + top: "conv6_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv6_2_mbox_conf" + type: "Convolution" + bottom: "conv6_2" + top: "conv6_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 126 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6_2_mbox_conf_perm" + type: "Permute" + bottom: "conv6_2_mbox_conf" + top: "conv6_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv6_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv6_2_mbox_conf_perm" + top: "conv6_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv6_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv6_2" + bottom: "data" + top: "conv6_2_mbox_priorbox" + prior_box_param { + min_size: 111.0 + max_size: 162.0 + aspect_ratio: 2 + aspect_ratio: 3 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 32 + offset: 0.5 + } +} +layer { + name: "conv7_2_mbox_loc" + type: "Convolution" + bottom: "conv7_2" + top: "conv7_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 24 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv7_2_mbox_loc_perm" + type: "Permute" + bottom: "conv7_2_mbox_loc" + top: "conv7_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv7_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv7_2_mbox_loc_perm" + top: "conv7_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + engine: "MKL2017" + name: "conv7_2_mbox_conf" + type: 
"Convolution" + bottom: "conv7_2" + top: "conv7_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 126 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv7_2_mbox_conf_perm" + type: "Permute" + bottom: "conv7_2_mbox_conf" + top: "conv7_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv7_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv7_2_mbox_conf_perm" + top: "conv7_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv7_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv7_2" + bottom: "data" + top: "conv7_2_mbox_priorbox" + prior_box_param { + min_size: 162.0 + max_size: 213.0 + aspect_ratio: 2 + aspect_ratio: 3 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 64 + offset: 0.5 + } +} +layer { + engine: "MKL2017" + name: "conv8_2_mbox_loc" + type: "Convolution" + bottom: "conv8_2" + top: "conv8_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 16 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv8_2_mbox_loc_perm" + type: "Permute" + bottom: "conv8_2_mbox_loc" + top: "conv8_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv8_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv8_2_mbox_loc_perm" + top: "conv8_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + engine: "MKL2017" + name: "conv8_2_mbox_conf" + type: "Convolution" + bottom: "conv8_2" + top: "conv8_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 84 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv8_2_mbox_conf_perm" + type: "Permute" + bottom: "conv8_2_mbox_conf" + top: "conv8_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv8_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv8_2_mbox_conf_perm" + top: "conv8_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv8_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv8_2" + bottom: "data" + top: "conv8_2_mbox_priorbox" + prior_box_param { + min_size: 213.0 + max_size: 264.0 + aspect_ratio: 2 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 100 + offset: 0.5 + } +} +layer { + name: "conv9_2_mbox_loc" + type: "Convolution" + bottom: "conv9_2" + top: "conv9_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 16 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv9_2_mbox_loc_perm" + type: "Permute" + bottom: "conv9_2_mbox_loc" + top: "conv9_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv9_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv9_2_mbox_loc_perm" + top: "conv9_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + engine: "MKL2017" + 
name: "conv9_2_mbox_conf" + type: "Convolution" + bottom: "conv9_2" + top: "conv9_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 84 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv9_2_mbox_conf_perm" + type: "Permute" + bottom: "conv9_2_mbox_conf" + top: "conv9_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv9_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv9_2_mbox_conf_perm" + top: "conv9_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv9_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv9_2" + bottom: "data" + top: "conv9_2_mbox_priorbox" + prior_box_param { + min_size: 264.0 + max_size: 315.0 + aspect_ratio: 2 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 300 + offset: 0.5 + } +} +layer { + name: "mbox_loc" + type: "Concat" + bottom: "conv4_3_norm_mbox_loc_flat" + bottom: "fc7_mbox_loc_flat" + bottom: "conv6_2_mbox_loc_flat" + bottom: "conv7_2_mbox_loc_flat" + bottom: "conv8_2_mbox_loc_flat" + bottom: "conv9_2_mbox_loc_flat" + top: "mbox_loc" + concat_param { + axis: 1 + } + engine: "CAFFE" +} +layer { + name: "mbox_conf" + type: "Concat" + bottom: "conv4_3_norm_mbox_conf_flat" + bottom: "fc7_mbox_conf_flat" + bottom: "conv6_2_mbox_conf_flat" + bottom: "conv7_2_mbox_conf_flat" + bottom: "conv8_2_mbox_conf_flat" + bottom: "conv9_2_mbox_conf_flat" + top: "mbox_conf" + concat_param { + axis: 1 + } + engine: "CAFFE" +} +layer { + name: "mbox_priorbox" + type: "Concat" + bottom: "conv4_3_norm_mbox_priorbox" + bottom: "fc7_mbox_priorbox" + bottom: "conv6_2_mbox_priorbox" + bottom: "conv7_2_mbox_priorbox" + bottom: "conv8_2_mbox_priorbox" + bottom: "conv9_2_mbox_priorbox" + top: "mbox_priorbox" + concat_param { + axis: 2 + } + engine: "CAFFE" +} diff --git a/models/intel_optimized_models/ssd/VGGNet/VOC0712/SSD_300x300/deploy_mkldnn.prototxt b/models/intel_optimized_models/ssd/VGGNet/VOC0712/SSD_300x300/deploy_mkldnn.prototxt new file mode 100644 index 000000000..754549d27 --- /dev/null +++ b/models/intel_optimized_models/ssd/VGGNet/VOC0712/SSD_300x300/deploy_mkldnn.prototxt @@ -0,0 +1,1626 @@ +name: "VGG_VOC0712_SSD_300x300_deploy" +input: "data" +input_shape { + dim: 1 + dim: 3 + dim: 300 + dim: 300 +} +layer { + engine: "MKLDNN" + name: "conv1_1" + type: "Convolution" + bottom: "data" + top: "conv1_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKLDNN" + name: "relu1_1" + type: "ReLU" + bottom: "conv1_1" + top: "conv1_1" +} +layer { + engine: "MKLDNN" + name: "conv1_2" + type: "Convolution" + bottom: "conv1_1" + top: "conv1_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKLDNN" + name: "relu1_2" + type: "ReLU" + bottom: "conv1_2" + top: "conv1_2" +} +layer { + engine: "MKLDNN" + name: "pool1" + type: "Pooling" + bottom: "conv1_2" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + 
engine: "MKLDNN" + name: "conv2_1" + type: "Convolution" + bottom: "pool1" + top: "conv2_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKLDNN" + name: "relu2_1" + type: "ReLU" + bottom: "conv2_1" + top: "conv2_1" +} +layer { + engine: "MKLDNN" + name: "conv2_2" + type: "Convolution" + bottom: "conv2_1" + top: "conv2_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKLDNN" + name: "relu2_2" + type: "ReLU" + bottom: "conv2_2" + top: "conv2_2" +} +layer { + engine: "MKLDNN" + name: "pool2" + type: "Pooling" + bottom: "conv2_2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + engine: "MKLDNN" + name: "conv3_1" + type: "Convolution" + bottom: "pool2" + top: "conv3_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKLDNN" + name: "relu3_1" + type: "ReLU" + bottom: "conv3_1" + top: "conv3_1" +} +layer { + engine: "MKLDNN" + name: "conv3_2" + type: "Convolution" + bottom: "conv3_1" + top: "conv3_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKLDNN" + name: "relu3_2" + type: "ReLU" + bottom: "conv3_2" + top: "conv3_2" +} +layer { + engine: "MKLDNN" + name: "conv3_3" + type: "Convolution" + bottom: "conv3_2" + top: "conv3_3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKLDNN" + name: "relu3_3" + type: "ReLU" + bottom: "conv3_3" + top: "conv3_3" +} +layer { + engine: "MKLDNN" + name: "pool3" + type: "Pooling" + bottom: "conv3_3" + top: "pool3" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + engine: "MKLDNN" + name: "conv4_1" + type: "Convolution" + bottom: "pool3" + top: "conv4_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKLDNN" + name: "relu4_1" + type: "ReLU" + bottom: "conv4_1" + top: "conv4_1" +} +layer { + engine: "MKLDNN" + name: "conv4_2" + type: "Convolution" + bottom: "conv4_1" + top: "conv4_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKLDNN" + name: "relu4_2" + type: "ReLU" + bottom: "conv4_2" + top: "conv4_2" +} +layer { + engine: "MKLDNN" + name: "conv4_3" + type: "Convolution" 
+ bottom: "conv4_2" + top: "conv4_3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKLDNN" + name: "relu4_3" + type: "ReLU" + bottom: "conv4_3" + top: "conv4_3" +} +layer { + engine: "MKLDNN" + name: "pool4" + type: "Pooling" + bottom: "conv4_3" + top: "pool4" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + engine: "MKLDNN" + name: "conv5_1" + type: "Convolution" + bottom: "pool4" + top: "conv5_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + dilation: 1 + } +} +layer { + engine: "MKLDNN" + name: "relu5_1" + type: "ReLU" + bottom: "conv5_1" + top: "conv5_1" +} +layer { + engine: "MKLDNN" + name: "conv5_2" + type: "Convolution" + bottom: "conv5_1" + top: "conv5_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + dilation: 1 + } +} +layer { + engine: "MKLDNN" + name: "relu5_2" + type: "ReLU" + bottom: "conv5_2" + top: "conv5_2" +} +layer { + engine: "MKLDNN" + name: "conv5_3" + type: "Convolution" + bottom: "conv5_2" + top: "conv5_3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + dilation: 1 + } +} +layer { + engine: "MKLDNN" + name: "relu5_3" + type: "ReLU" + bottom: "conv5_3" + top: "conv5_3" +} +layer { + engine: "MKLDNN" + name: "pool5" + type: "Pooling" + bottom: "conv5_3" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + engine: "MKLDNN" + name: "fc6" + type: "Convolution" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 1024 + pad: 6 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + dilation: 6 + } +} +layer { + engine: "MKLDNN" + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + engine: "MKLDNN" + name: "fc7" + type: "Convolution" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 1024 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKLDNN" + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + engine: "MKLDNN" + name: "conv6_1" + type: "Convolution" + bottom: "fc7" + top: "conv6_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKLDNN" + name: "conv6_1_relu" + type: "ReLU" + bottom: "conv6_1" + top: "conv6_1" +} +layer { + engine: "MKLDNN" + name: "conv6_2" + type: "Convolution" + bottom: "conv6_1" + 
top: "conv6_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKLDNN" + name: "conv6_2_relu" + type: "ReLU" + bottom: "conv6_2" + top: "conv6_2" +} +layer { + engine: "MKLDNN" + name: "conv7_1" + type: "Convolution" + bottom: "conv6_2" + top: "conv7_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKLDNN" + name: "conv7_1_relu" + type: "ReLU" + bottom: "conv7_1" + top: "conv7_1" +} +layer { + engine: "MKLDNN" + name: "conv7_2" + type: "Convolution" + bottom: "conv7_1" + top: "conv7_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKLDNN" + name: "conv7_2_relu" + type: "ReLU" + bottom: "conv7_2" + top: "conv7_2" +} +layer { + engine: "MKLDNN" + name: "conv8_1" + type: "Convolution" + bottom: "conv7_2" + top: "conv8_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKLDNN" + name: "conv8_1_relu" + type: "ReLU" + bottom: "conv8_1" + top: "conv8_1" +} +layer { + engine: "MKLDNN" + name: "conv8_2" + type: "Convolution" + bottom: "conv8_1" + top: "conv8_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 0 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKLDNN" + name: "conv8_2_relu" + type: "ReLU" + bottom: "conv8_2" + top: "conv8_2" +} +layer { + engine: "MKLDNN" + name: "conv9_1" + type: "Convolution" + bottom: "conv8_2" + top: "conv9_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKLDNN" + name: "conv9_1_relu" + type: "ReLU" + bottom: "conv9_1" + top: "conv9_1" +} +layer { + engine: "MKLDNN" + name: "conv9_2" + type: "Convolution" + bottom: "conv9_1" + top: "conv9_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 0 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + engine: "MKLDNN" + name: "conv9_2_relu" + type: "ReLU" + bottom: "conv9_2" + top: "conv9_2" +} +layer { + name: "conv4_3_norm" + type: "Normalize" + bottom: "conv4_3" + top: "conv4_3_norm" + norm_param { + across_spatial: false + scale_filler { + type: "constant" + value: 20 + } + channel_shared: false + } +} +layer { + name: "conv4_3_norm_mbox_loc" + type: "Convolution" + bottom: "conv4_3_norm" + top: "conv4_3_norm_mbox_loc" + param { + 
lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 16 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv4_3_norm_mbox_loc_perm" + type: "Permute" + bottom: "conv4_3_norm_mbox_loc" + top: "conv4_3_norm_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv4_3_norm_mbox_loc_flat" + type: "Flatten" + bottom: "conv4_3_norm_mbox_loc_perm" + top: "conv4_3_norm_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv4_3_norm_mbox_conf" + type: "Convolution" + bottom: "conv4_3_norm" + top: "conv4_3_norm_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 84 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv4_3_norm_mbox_conf_perm" + type: "Permute" + bottom: "conv4_3_norm_mbox_conf" + top: "conv4_3_norm_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv4_3_norm_mbox_conf_flat" + type: "Flatten" + bottom: "conv4_3_norm_mbox_conf_perm" + top: "conv4_3_norm_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv4_3_norm_mbox_priorbox" + type: "PriorBox" + bottom: "conv4_3_norm" + bottom: "data" + top: "conv4_3_norm_mbox_priorbox" + prior_box_param { + min_size: 30.0 + max_size: 60.0 + aspect_ratio: 2 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 8 + offset: 0.5 + } +} +layer { + engine: "MKLDNN" + name: "fc7_mbox_loc" + type: "Convolution" + bottom: "fc7" + top: "fc7_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 24 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "fc7_mbox_loc_perm" + type: "Permute" + bottom: "fc7_mbox_loc" + top: "fc7_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "fc7_mbox_loc_flat" + type: "Flatten" + bottom: "fc7_mbox_loc_perm" + top: "fc7_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "fc7_mbox_conf" + type: "Convolution" + bottom: "fc7" + top: "fc7_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 126 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "fc7_mbox_conf_perm" + type: "Permute" + bottom: "fc7_mbox_conf" + top: "fc7_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "fc7_mbox_conf_flat" + type: "Flatten" + bottom: "fc7_mbox_conf_perm" + top: "fc7_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "fc7_mbox_priorbox" + type: "PriorBox" + bottom: "fc7" + bottom: "data" + top: "fc7_mbox_priorbox" + prior_box_param { + min_size: 60.0 + max_size: 111.0 + aspect_ratio: 2 + aspect_ratio: 3 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 16 + offset: 0.5 + } +} +layer { + engine: "MKLDNN" + name: "conv6_2_mbox_loc" + type: "Convolution" + bottom: "conv6_2" + top: "conv6_2_mbox_loc" + 
param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 24 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6_2_mbox_loc_perm" + type: "Permute" + bottom: "conv6_2_mbox_loc" + top: "conv6_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv6_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv6_2_mbox_loc_perm" + top: "conv6_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv6_2_mbox_conf" + type: "Convolution" + bottom: "conv6_2" + top: "conv6_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 126 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6_2_mbox_conf_perm" + type: "Permute" + bottom: "conv6_2_mbox_conf" + top: "conv6_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv6_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv6_2_mbox_conf_perm" + top: "conv6_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv6_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv6_2" + bottom: "data" + top: "conv6_2_mbox_priorbox" + prior_box_param { + min_size: 111.0 + max_size: 162.0 + aspect_ratio: 2 + aspect_ratio: 3 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 32 + offset: 0.5 + } +} +layer { + name: "conv7_2_mbox_loc" + type: "Convolution" + bottom: "conv7_2" + top: "conv7_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 24 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv7_2_mbox_loc_perm" + type: "Permute" + bottom: "conv7_2_mbox_loc" + top: "conv7_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv7_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv7_2_mbox_loc_perm" + top: "conv7_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + engine: "MKLDNN" + name: "conv7_2_mbox_conf" + type: "Convolution" + bottom: "conv7_2" + top: "conv7_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 126 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv7_2_mbox_conf_perm" + type: "Permute" + bottom: "conv7_2_mbox_conf" + top: "conv7_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv7_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv7_2_mbox_conf_perm" + top: "conv7_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv7_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv7_2" + bottom: "data" + top: "conv7_2_mbox_priorbox" + prior_box_param { + min_size: 162.0 + max_size: 213.0 + aspect_ratio: 2 + aspect_ratio: 3 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 64 + offset: 0.5 + } +} +layer { + engine: "MKLDNN" + name: "conv8_2_mbox_loc" + type: "Convolution" + bottom: "conv8_2" 
+ top: "conv8_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 16 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv8_2_mbox_loc_perm" + type: "Permute" + bottom: "conv8_2_mbox_loc" + top: "conv8_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv8_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv8_2_mbox_loc_perm" + top: "conv8_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + engine: "MKLDNN" + name: "conv8_2_mbox_conf" + type: "Convolution" + bottom: "conv8_2" + top: "conv8_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 84 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv8_2_mbox_conf_perm" + type: "Permute" + bottom: "conv8_2_mbox_conf" + top: "conv8_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv8_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv8_2_mbox_conf_perm" + top: "conv8_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv8_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv8_2" + bottom: "data" + top: "conv8_2_mbox_priorbox" + prior_box_param { + min_size: 213.0 + max_size: 264.0 + aspect_ratio: 2 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 100 + offset: 0.5 + } +} +layer { + name: "conv9_2_mbox_loc" + type: "Convolution" + bottom: "conv9_2" + top: "conv9_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 16 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv9_2_mbox_loc_perm" + type: "Permute" + bottom: "conv9_2_mbox_loc" + top: "conv9_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv9_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv9_2_mbox_loc_perm" + top: "conv9_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + engine: "MKLDNN" + name: "conv9_2_mbox_conf" + type: "Convolution" + bottom: "conv9_2" + top: "conv9_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 84 + pad: 1 + kernel_size: 3 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv9_2_mbox_conf_perm" + type: "Permute" + bottom: "conv9_2_mbox_conf" + top: "conv9_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv9_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv9_2_mbox_conf_perm" + top: "conv9_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv9_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv9_2" + bottom: "data" + top: "conv9_2_mbox_priorbox" + prior_box_param { + min_size: 264.0 + max_size: 315.0 + aspect_ratio: 2 + flip: true + clip: false + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + step: 300 + offset: 0.5 + } +} +layer { + name: "mbox_loc" + type: "Concat" + bottom: "conv4_3_norm_mbox_loc_flat" + 
bottom: "fc7_mbox_loc_flat" + bottom: "conv6_2_mbox_loc_flat" + bottom: "conv7_2_mbox_loc_flat" + bottom: "conv8_2_mbox_loc_flat" + bottom: "conv9_2_mbox_loc_flat" + top: "mbox_loc" + concat_param { + axis: 1 + } + engine: "CAFFE" +} +layer { + name: "mbox_conf" + type: "Concat" + bottom: "conv4_3_norm_mbox_conf_flat" + bottom: "fc7_mbox_conf_flat" + bottom: "conv6_2_mbox_conf_flat" + bottom: "conv7_2_mbox_conf_flat" + bottom: "conv8_2_mbox_conf_flat" + bottom: "conv9_2_mbox_conf_flat" + top: "mbox_conf" + concat_param { + axis: 1 + } + engine: "CAFFE" +} +layer { + name: "mbox_priorbox" + type: "Concat" + bottom: "conv4_3_norm_mbox_priorbox" + bottom: "fc7_mbox_priorbox" + bottom: "conv6_2_mbox_priorbox" + bottom: "conv7_2_mbox_priorbox" + bottom: "conv8_2_mbox_priorbox" + bottom: "conv9_2_mbox_priorbox" + top: "mbox_priorbox" + concat_param { + axis: 2 + } + engine: "CAFFE" +} diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index bf492a24b..c53299d26 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -3,13 +3,13 @@ if(NOT HAVE_PYTHON) return() endif() -include_directories(${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR} ${Boost_INCLUDE_DIRS}) file(GLOB_RECURSE python_srcs ${PROJECT_SOURCE_DIR}/python/*.cpp) add_library(pycaffe SHARED ${python_srcs}) -target_link_libraries(pycaffe ${Caffe_LINK} ${PYTHON_LIBRARIES} ${Boost_LIBRARIES}) -set_target_properties(pycaffe PROPERTIES PREFIX "" OUTPUT_NAME "_caffe") caffe_default_properties(pycaffe) +set_target_properties(pycaffe PROPERTIES PREFIX "" OUTPUT_NAME "_caffe") +target_include_directories(pycaffe PUBLIC ${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR}) +target_link_libraries(pycaffe PUBLIC ${Caffe_LINK} ${PYTHON_LIBRARIES}) if(UNIX OR APPLE) set(__linkname "${PROJECT_SOURCE_DIR}/python/caffe/_caffe.so") diff --git a/python/caffe/__init__.py b/python/caffe/__init__.py index a823b52e8..34c939a5b 100755 --- a/python/caffe/__init__.py +++ b/python/caffe/__init__.py @@ -35,7 +35,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# from .pycaffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, RMSPropSolver, AdaDeltaSolver, AdamSolver -from ._caffe import set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list, set_random_seed +from ._caffe import init_log, log, set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list, set_random_seed from ._caffe import __version__ from .proto.caffe_pb2 import TRAIN, TEST from .classifier import Classifier diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp index b7d509ee5..b9dc23e24 100644 --- a/python/caffe/_caffe.cpp +++ b/python/caffe/_caffe.cpp @@ -88,6 +88,23 @@ const int NPY_DTYPE = NPY_FLOAT32; void set_mode_cpu() { Caffe::set_mode(Caffe::CPU); } void set_mode_gpu() { Caffe::set_mode(Caffe::GPU); } +void InitLog() { + ::google::InitGoogleLogging(""); + ::google::InstallFailureSignalHandler(); +} +void InitLogLevel(int level) { + FLAGS_minloglevel = level; + InitLog(); +} +void InitLogLevelPipe(int level, bool stderr) { + FLAGS_minloglevel = level; + FLAGS_logtostderr = stderr; + InitLog(); +} +void Log(const string& s) { + LOG(INFO) << s; +} + void set_random_seed(unsigned int seed) { Caffe::set_random_seed(seed); } // For convenience, check that input files can be opened, and raise an @@ -327,6 +344,10 @@ BOOST_PYTHON_MODULE(_caffe) { bp::scope().attr("__version__") = AS_STRING(CAFFE_VERSION); // Caffe utility functions + bp::def("init_log", &InitLog); + bp::def("init_log", &InitLogLevel); + bp::def("init_log", &InitLogLevelPipe); + bp::def("log", &Log); bp::def("set_mode_cpu", &set_mode_cpu); bp::def("set_mode_gpu", &set_mode_gpu); bp::def("set_random_seed", &set_random_seed); diff --git a/python/caffe/draw.py b/python/caffe/draw.py index 46ef510a0..1f3ab6f7d 100755 --- a/python/caffe/draw.py +++ b/python/caffe/draw.py @@ -127,11 +127,11 @@ def get_layer_label(layer, rankdir): separator, layer.type, separator, - layer.convolution_param.kernel_size[0] if len(layer.convolution_param.kernel_size._values) else 1, + layer.convolution_param.kernel_size[0] if len(layer.convolution_param.kernel_size) else 1, separator, - layer.convolution_param.stride[0] if len(layer.convolution_param.stride._values) else 1, + layer.convolution_param.stride[0] if len(layer.convolution_param.stride) else 1, separator, - layer.convolution_param.pad[0] if len(layer.convolution_param.pad._values) else 0) + layer.convolution_param.pad[0] if len(layer.convolution_param.pad) else 0) elif layer.type == 'Pooling': pooling_types_dict = get_pooling_types_dict() node_label = '"%s%s(%s %s)%skernel size: %d%sstride: %d%spad: %d"' %\ diff --git a/python/caffe/io.py b/python/caffe/io.py index 72a2fc682..0df78e7f3 100755 --- a/python/caffe/io.py +++ b/python/caffe/io.py @@ -117,7 +117,7 @@ def array_to_datum(arr, label=None): if arr.dtype == np.uint8: datum.data = arr.tostring() else: - datum.float_data.extend(arr.flat) + datum.float_data.extend(arr.astype(float).flat) if label is not None: datum.label = label return datum @@ -303,7 +303,7 @@ def set_mean(self, in_, mean): m_min, m_max = mean.min(), mean.max() normal_mean = (mean - m_min) / (m_max - m_min) mean = resize_image(normal_mean.transpose((1,2,0)),in_shape[1:]).transpose((2,0,1)) * (m_max - m_min) + m_min - #aise ValueError('Mean shape incompatible with input shape.') + #raise ValueError('Mean shape incompatible with input shape.') self.mean[in_] = mean def set_input_scale(self, in_, scale): diff --git a/python/caffe/net_spec.py b/python/caffe/net_spec.py index b8d568dcb..10ee4d4f1 100755 --- 
a/python/caffe/net_spec.py +++ b/python/caffe/net_spec.py @@ -142,6 +142,10 @@ class Function(object): def __init__(self, type_name, inputs, params): self.type_name = type_name + for index, input in enumerate(inputs): + if not isinstance(input, Top): + raise TypeError('%s input %d is not a Top (type is %s)' % + (type_name, index, type(input))) self.inputs = inputs self.params = params self.ntop = self.params.get('ntop', 1) diff --git a/python/caffe/pycaffe.py b/python/caffe/pycaffe.py index d105c3f27..bc606148d 100755 --- a/python/caffe/pycaffe.py +++ b/python/caffe/pycaffe.py @@ -79,6 +79,16 @@ def _Net_blob_loss_weights(self): self._blob_loss_weights)) return self._blob_loss_weights_dict +@property +def _Net_layer_dict(self): + """ + An OrderedDict (bottom to top, i.e., input to output) of network + layers indexed by name + """ + if not hasattr(self, '_layer_dict'): + self._layer_dict = OrderedDict(zip(self._layer_names, self.layers)) + return self._layer_dict + @property def _Net_params(self): @@ -139,7 +149,7 @@ def _Net_forward(self, blobs=None, start=None, end=None, **kwargs): if end is not None: end_ind = list(self._layer_names).index(end) - outputs = set([end] + blobs) + outputs = set(self.top_names[end] + blobs) else: end_ind = len(self.layers) - 1 outputs = set(self.outputs + blobs) @@ -187,7 +197,7 @@ def _Net_backward(self, diffs=None, start=None, end=None, **kwargs): if end is not None: end_ind = list(self._layer_names).index(end) - outputs = set([end] + diffs) + outputs = set(self.bottom_names[end] + diffs) else: end_ind = 0 outputs = set(self.inputs + diffs) @@ -357,6 +367,7 @@ def get_id_name(self): # Attach methods to Net. Net.blobs = _Net_blobs Net.blob_loss_weights = _Net_blob_loss_weights +Net.layer_dict = _Net_layer_dict Net.params = _Net_params Net.forward = _Net_forward Net.backward = _Net_backward diff --git a/python/caffe/test/test_draw.py b/python/caffe/test/test_draw.py new file mode 100644 index 000000000..835bb5df0 --- /dev/null +++ b/python/caffe/test/test_draw.py @@ -0,0 +1,37 @@ +import os +import unittest + +from google.protobuf import text_format + +import caffe.draw +from caffe.proto import caffe_pb2 + +def getFilenames(): + """Yields files in the source tree which are Net prototxts.""" + result = [] + + root_dir = os.path.abspath(os.path.join( + os.path.dirname(__file__), '..', '..', '..')) + assert os.path.exists(root_dir) + + for dirname in ('models', 'examples'): + dirname = os.path.join(root_dir, dirname) + assert os.path.exists(dirname) + for cwd, _, filenames in os.walk(dirname): + for filename in filenames: + filename = os.path.join(cwd, filename) + if filename.endswith('.prototxt') and 'solver' not in filename: + yield os.path.join(dirname, filename) + + +class TestDraw(unittest.TestCase): + def test_draw_net(self): + for filename in getFilenames(): + net = caffe_pb2.NetParameter() + with open(filename) as infile: + text_format.Merge(infile.read(), net) + caffe.draw.draw_net(net, 'LR') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/caffe/test/test_net.py b/python/caffe/test/test_net.py index 85845e6c6..04198f06d 100755 --- a/python/caffe/test/test_net.py +++ b/python/caffe/test/test_net.py @@ -61,11 +61,11 @@ def simple_net_file(num_output): bias_filler { type: 'constant' value: 2 } } param { decay_mult: 1 } param { decay_mult: 0 } } - layer { type: 'InnerProduct' name: 'ip' bottom: 'conv' top: 'ip' + layer { type: 'InnerProduct' name: 'ip' bottom: 'conv' top: 'ip_blob' inner_product_param { num_output: """ + 
str(num_output) + """ weight_filler { type: 'gaussian' std: 2.5 } bias_filler { type: 'constant' value: -3 } } } - layer { type: 'SoftmaxWithLoss' name: 'loss' bottom: 'ip' bottom: 'label' + layer { type: 'SoftmaxWithLoss' name: 'loss' bottom: 'ip_blob' bottom: 'label' top: 'loss' }""") f.close() return f.name @@ -111,10 +111,35 @@ def test_memory(self): for bl in blobs: total += bl.data.sum() + bl.diff.sum() + def test_layer_dict(self): + layer_dict = self.net.layer_dict + self.assertEqual(list(layer_dict.keys()), list(self.net._layer_names)) + for i, name in enumerate(self.net._layer_names): + self.assertEqual(layer_dict[name].type, + self.net.layers[i].type) + def test_forward_backward(self): self.net.forward() self.net.backward() + def test_forward_start_end(self): + conv_blob=self.net.blobs['conv']; + ip_blob=self.net.blobs['ip_blob']; + sample_data=np.random.uniform(size=conv_blob.data.shape); + sample_data=sample_data.astype(np.float32); + conv_blob.data[:]=sample_data; + forward_blob=self.net.forward(start='ip',end='ip'); + self.assertIn('ip_blob',forward_blob); + + manual_forward=[]; + for i in range(0,conv_blob.data.shape[0]): + dot=np.dot(self.net.params['ip'][0].data, + conv_blob.data[i].reshape(-1)); + manual_forward.append(dot+self.net.params['ip'][1].data); + manual_forward=np.array(manual_forward); + + np.testing.assert_allclose(ip_blob.data,manual_forward,rtol=1e-3); + def test_clear_param_diffs(self): # Run a forward/backward step to have non-zero diffs self.net.forward() @@ -134,13 +159,13 @@ def test_top_bottom_names(self): self.assertEqual(self.net.top_names, OrderedDict([('data', ['data', 'label']), ('conv', ['conv']), - ('ip', ['ip']), + ('ip', ['ip_blob']), ('loss', ['loss'])])) self.assertEqual(self.net.bottom_names, OrderedDict([('data', []), ('conv', ['data']), ('ip', ['conv']), - ('loss', ['ip', 'label'])])) + ('loss', ['ip_blob', 'label'])])) def test_save_and_read(self): f = tempfile.NamedTemporaryFile(mode='w+', delete=False) @@ -224,12 +249,12 @@ class TestLevels(unittest.TestCase): """ def setUp(self): - self.f = tempfile.NamedTemporaryFile(mode='w+') + self.f = tempfile.NamedTemporaryFile(mode='w+', delete=False) self.f.write(self.TEST_NET) - self.f.flush() + self.f.close() def tearDown(self): - self.f.close() + os.remove(self.f.name) def check_net(self, net, blobs): net_blobs = [b for b in net.blobs.keys() if 'data' not in b] @@ -289,12 +314,12 @@ class TestStages(unittest.TestCase): """ def setUp(self): - self.f = tempfile.NamedTemporaryFile(mode='w+') + self.f = tempfile.NamedTemporaryFile(mode='w+', delete=False) self.f.write(self.TEST_NET) - self.f.flush() + self.f.close() def tearDown(self): - self.f.close() + os.remove(self.f.name) def check_net(self, net, blobs): net_blobs = [b for b in net.blobs.keys() if 'data' not in b] @@ -371,12 +396,12 @@ class TestAllInOne(unittest.TestCase): """ def setUp(self): - self.f = tempfile.NamedTemporaryFile(mode='w+') + self.f = tempfile.NamedTemporaryFile(mode='w+', delete=False) self.f.write(self.TEST_NET) - self.f.flush() + self.f.close() def tearDown(self): - self.f.close() + os.remove(self.f.name) def check_net(self, net, outputs): self.assertEqual(list(net.blobs['data'].shape), [1,1,10,10]) diff --git a/python/caffe/test/test_net_spec.py b/python/caffe/test/test_net_spec.py index d1b1f0af9..36520c2a5 100755 --- a/python/caffe/test/test_net_spec.py +++ b/python/caffe/test/test_net_spec.py @@ -115,3 +115,11 @@ def test_zero_tops(self): net_proto = silent_net() net = self.load_net(net_proto) 
self.assertEqual(len(net.forward()), 0) + + def test_type_error(self): + """Test that a TypeError is raised when a Function input isn't a Top.""" + data = L.DummyData(ntop=2) # data is a 2-tuple of Tops + r = r"^Silence input 0 is not a Top \(type is <(type|class) 'tuple'>\)$" + with self.assertRaisesRegexp(TypeError, r): + L.Silence(data, ntop=0) # should raise: data is a tuple, not a Top + L.Silence(*data, ntop=0) # shouldn't raise: each elt of data is a Top diff --git a/src/caffe/data_reader.cpp b/src/caffe/data_reader.cpp index 01ebe50c8..69e8f20f5 100644 --- a/src/caffe/data_reader.cpp +++ b/src/caffe/data_reader.cpp @@ -44,7 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "caffe/data_reader.hpp" #include "caffe/layers/data_layer.hpp" #include "caffe/proto/caffe.pb.h" - +#include "caffe/multinode/mlsl.hpp" namespace caffe { using boost::weak_ptr; @@ -99,7 +99,7 @@ DataReader::QueuePair::~QueuePair() { DataReader::Body::Body(const LayerParameter& param) : param_(param), - new_queue_pairs_() { + new_queue_pairs_(), first_read_(true) { StartInternalThread(); } @@ -147,18 +147,17 @@ void DataReader::Body::read_one(DBWrapper* dbw, QueuePair* qp) { CHECK(dbw); CHECK(qp); -#ifdef CAFFE_MLSL_SHUFFLE +#ifdef USE_MLSL string* data = qp->free_.pop(); - static int mb=0; - if(!mb) { /* move each node’s file position to its node ID – this part can be move to the initialization */ - for(int i=0;i<mn::get_node_id();i++) { dbw->Next(); } - mb = 1; + if (first_read_) { + for (int i = 0; i < mn::get_node_id(); i++) { dbw->Next(); } + first_read_ = false; } *data = dbw->value(); qp->full_.push(data); for(int i=0;i<mn::get_nodes_count();i++) { dbw->Next(); } #else @@ -191,8 +190,17 @@ DataReader::DBShuffle::DBShuffle(const LayerParameter& param):DBWrapper(param) { // randomly shuffle data LOG(INFO) << "Shuffling data"; +#ifdef USE_MLSL + mn::Distribution * distrib = mn::get_distrib(); + float fetch_seed; + fetch_seed = static_cast<float>(caffe_rng_rand() % 15); + distrib->bcast(&fetch_seed, 1); + LOG(INFO) << "Random seed for shuffling: " << fetch_seed; + prefetch_rng_.reset(new Caffe::RNG(static_cast<unsigned int>(fetch_seed))); +#else const unsigned int prefetch_rng_seed = caffe_rng_rand(); prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed)); +#endif ShuffleImages(); } diff --git a/src/caffe/data_transformer.cpp b/src/caffe/data_transformer.cpp index 7f4fbc830..5185308e8 100644 --- a/src/caffe/data_transformer.cpp +++ b/src/caffe/data_transformer.cpp @@ -172,14 +172,24 @@ void DataTransformer<Dtype>::Transform(const Datum& datum, template <typename Dtype> template <typename RandNumbers> -void DataTransformer<Dtype>::Transform(const Datum& datum, +void DataTransformer<Dtype>::Transform(const Datum& datum_in, Dtype* transformed_data, NormalizedBBox* crop_bbox, RandNumbers& rand_num) { - const string& data = datum.data(); - const int datum_channels = datum.channels(); - const int datum_height = datum.height(); - const int datum_width = datum.width(); + const Datum *datum = &datum_in; + Datum resized_datum; + if (param_.has_random_resize_param()) { +#ifdef USE_OPENCV + RandomResizeImage(datum_in, &resized_datum); + datum = &resized_datum; +#else + LOG(FATAL) << "Random image resizing requires OpenCV; compile with USE_OPENCV."; +#endif + } + const string& data = datum->data(); + const int datum_channels = datum->channels(); + const int datum_height = datum->height(); + const int datum_width = datum->width(); const int crop_size = param_.crop_size(); const Dtype scale = param_.scale(); @@ -245,7 +255,7 @@ void DataTransformer<Dtype>::Transform(const Datum& datum, datum_element = static_cast<Dtype>(static_cast<uint8_t>(data[data_index])); } else { - datum_element = datum.float_data(data_index); + datum_element = datum->float_data(data_index); } if (has_mean_file) { transformed_data[top_index] = @@ -756,10 +766,20 @@ void DataTransformer<Dtype>::Transform(const cv::Mat& cv_img, template <typename Dtype> template <typename RandNumbers> -void DataTransformer<Dtype>::Transform(const cv::Mat& cv_img, +void DataTransformer<Dtype>::Transform(const cv::Mat& cv_img_in, Blob<Dtype>* transformed_blob, NormalizedBBox* crop_bbox, RandNumbers& rand_num) { + const cv::Mat *cv_img = &cv_img_in; + cv::Mat resized_img; + if (param_.has_random_resize_param()) { +#ifdef USE_OPENCV + RandomResizeImage(cv_img_in, &resized_img); + cv_img = &resized_img; +#else + LOG(FATAL) << "Random image resizing requires OpenCV; compile with USE_OPENCV."; +#endif + } const int crop_size = param_.crop_size(); - const int img_channels = cv_img.channels(); + const int img_channels = cv_img->channels(); // Check dimensions. const int channels = transformed_blob->channels(); @@ -770,7 +790,7 @@ void DataTransformer<Dtype>::Transform(const cv::Mat& cv_img, CHECK_EQ(channels, img_channels); CHECK_GE(num, 1); - CHECK(cv_img.depth() == CV_8U) << "Image data type must be unsigned byte"; + CHECK(cv_img->depth() == CV_8U) << "Image data type must be unsigned byte"; const Dtype scale = param_.scale(); @@ -793,9 +813,9 @@ void DataTransformer<Dtype>::Transform(const cv::Mat& cv_img, } cv::Mat cv_resized_img, cv_noised_img; if (param_.has_resize_param()) { - cv_resized_img = ApplyResize(cv_img, param_.resize_param()); + cv_resized_img = ApplyResize(*cv_img, param_.resize_param()); } else { - cv_resized_img = cv_img; + cv_resized_img = *cv_img; } if (param_.has_noise_param()) { cv_noised_img = ApplyNoise(cv_resized_img, param_.noise_param()); @@ -809,7 +829,7 @@ void DataTransformer<Dtype>::Transform(const cv::Mat& cv_img, int h_off = 0; int w_off = 0; - cv::Mat cv_cropped_img = cv_img; + cv::Mat cv_cropped_img = *cv_img; if (crop_size) { CHECK_EQ(crop_size, height); CHECK_EQ(crop_size, width); @@ -822,7 +842,7 @@ void DataTransformer<Dtype>::Transform(const cv::Mat& cv_img, w_off = (img_width - crop_size) / 2; } cv::Rect roi(w_off, h_off, crop_size, crop_size); - cv_cropped_img = cv_img(roi); + cv_cropped_img = (*cv_img)(roi); } else { cv_cropped_img = cv_noised_img; } @@ -1035,6 +1055,42 @@ void DataTransformer<Dtype>::ExpandImage(const cv::Mat& img, img.copyTo((*expand_img)(bbox_roi)); } +template <typename Dtype> +void DataTransformer<Dtype>::RandomResizeImage(const Datum& datum, Datum *resized_datum) { + shared_ptr<cv::Mat> img; + if (datum.encoded()) { + img = shared_ptr<cv::Mat>(new cv::Mat(DecodeDatumToCVMatNative(datum))); + } else { + img = shared_ptr<cv::Mat>(new cv::Mat( + cv::Size(datum.width(), datum.height()), + CV_8UC(datum.channels()), + (void*)datum.data().data())); + } + cv::Mat resized_img; + RandomResizeImage(*img, &resized_img); + CVMatToDatum(resized_img, resized_datum); +} + +template <typename Dtype> +void DataTransformer<Dtype>::RandomResizeImage(const cv::Mat& img, cv::Mat *resized_img) { + int h = img.size().height; + int w = img.size().width; + int min_size = param_.random_resize_param().min_size(); + int max_size = param_.random_resize_param().max_size(); + ResizeParameter resize_param = param_.random_resize_param().resize_param(); + if (min_size == 0) min_size = std::min(h,w); + if (max_size == 0) max_size = std::max(h,w); + int shorter_size = rand_num_(max_size - min_size + 1) + min_size; + resize_param.set_height(shorter_size); + resize_param.set_width(shorter_size); + if (h < w) { + resize_param.set_width(int(float(w) / h * shorter_size)); + } else { + resize_param.set_height(int(float(h) / w * shorter_size)); + } + *resized_img = ApplyResize(img, resize_param); +} + #endif // USE_OPENCV
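[Editor's note] RandomResizeImage above draws the target shorter-side length uniformly from [min_size, max_size] and rescales the longer side to preserve the aspect ratio. A minimal standalone sketch of that arithmetic, assuming a hypothetical rand_uniform(n) that returns a uniform int in [0, n-1] (neither function is part of the patch):

    // Sketch only: mirrors the shorter-side logic of RandomResizeImage.
    void random_resize_dims(int h, int w, int min_size, int max_size,
                            int* out_h, int* out_w) {
      int shorter = rand_uniform(max_size - min_size + 1) + min_size;
      if (h < w) {  // height is the shorter side
        *out_h = shorter;
        *out_w = int(float(w) / h * shorter);
      } else {      // width is the shorter side (or the image is square)
        *out_w = shorter;
        *out_h = int(float(h) / w * shorter);
      }
    }

For example, a 480x640 input with min_size=256 and max_size=480 that draws shorter=300 is resized to 300x400.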
template <typename Dtype> diff --git a/src/caffe/layers/mkl_batch_norm_layer.cpp b/src/caffe/layers/mkl_batch_norm_layer.cpp index 8a1e44ab8..6dce50243 100755 --- a/src/caffe/layers/mkl_batch_norm_layer.cpp +++ b/src/caffe/layers/mkl_batch_norm_layer.cpp @@ -467,11 +467,12 @@ void MKLBatchNormLayer<Dtype>::Backward_cpu( CHECK_EQ(e, E_SUCCESS); if (use_weight_bias_) { - caffe_cpu_copy(this->blobs_[3]->count(), - diffScaleShift_buffer_, this->blobs_[3]->mutable_cpu_diff()); + caffe_cpu_axpby(this->blobs_[3]->count(), (Dtype)1., + diffScaleShift_buffer_, (Dtype)1., this->blobs_[3]->mutable_cpu_diff()); if (bias_term_) - caffe_cpu_copy(this->blobs_[4]->count(), - diffScaleShift_buffer_ + channels_, this->blobs_[4]->mutable_cpu_diff()); + caffe_cpu_axpby(this->blobs_[4]->count(), (Dtype)1., + diffScaleShift_buffer_ + channels_, + (Dtype)1., this->blobs_[4]->mutable_cpu_diff()); else caffe_set(this->blobs_[4]->count(), static_cast<Dtype>(0), this->blobs_[4]->mutable_cpu_diff()); diff --git a/src/caffe/layers/mkldnn_batch_norm_layer.cpp b/src/caffe/layers/mkldnn_batch_norm_layer.cpp index dd1b7f7b6..4db92b943 100644 --- a/src/caffe/layers/mkldnn_batch_norm_layer.cpp +++ b/src/caffe/layers/mkldnn_batch_norm_layer.cpp @@ -246,8 +246,13 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNorm(const vector<Blob<Dtype>*>& bott } } - fwd_bottom_data->set_mkldnn_primitive(BatchNormFwd); - fwd_top_data->set_mkldnn_primitive(BatchNormFwd); + //fwd_bottom_data->set_mkldnn_primitive(BatchNormFwd); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> fwd_bottom_data_primitive_transfer(input_primitive); + fwd_bottom_data->set_mkldnn_primitive(fwd_bottom_data_primitive_transfer); + + //fwd_top_data->set_mkldnn_primitive(BatchNormFwd); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> fwd_top_data_memory_transfer(output_memory); + fwd_top_data->set_mkldnn_primitive(fwd_top_data_memory_transfer); //Fix: MKLDNN batch norm only support 4D memory descriptor! Use 4D for calculation and reshape to 2D for output! bool has_spatial = (bottom[0]->shape().size() != 2); @@ -259,8 +264,8 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNorm(const vector<Blob<Dtype>*>& bott #ifdef DEBUG LOG(INFO) << "size of bottom blob: " << bottom[0]->shape().size(); LOG(INFO) << "MKLDNN batch norm only support 4D memory descriptor! Use 4D for calculation and reshape to 2D for output!"; -#endif - vector<int> top_shape; +#endif + vector<int> top_shape; top_shape.push_back(bottom[0]->num()); top_shape.push_back(bottom[0]->channels()); top[0]->Reshape(top_shape); @@ -413,8 +418,13 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNormBwd( *bwd_top_diff_primitive, *bwd_bottom_diff_memory)); } - bwd_top_diff->set_mkldnn_primitive(BatchNormBwd); - bwd_bottom_diff->set_mkldnn_primitive(BatchNormBwd); + //bwd_top_diff->set_mkldnn_primitive(BatchNormBwd); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> bwd_top_diff_primitive_transfer(bwd_top_diff_primitive); + bwd_top_diff->set_mkldnn_primitive(bwd_top_diff_primitive_transfer); + + //bwd_bottom_diff->set_mkldnn_primitive(BatchNormBwd); //Wrong passed primitive! (TODO: Checking!)
+ MKLDNNPrimitive<Dtype> bwd_bottom_diff_memory_transfer(bwd_bottom_diff_memory); + bwd_bottom_diff->set_mkldnn_primitive(bwd_bottom_diff_memory_transfer); } template <typename Dtype> diff --git a/src/caffe/layers/mkldnn_concat_layer.cpp b/src/caffe/layers/mkldnn_concat_layer.cpp index ee2cc5026..a0a1cd487 100644 --- a/src/caffe/layers/mkldnn_concat_layer.cpp +++ b/src/caffe/layers/mkldnn_concat_layer.cpp @@ -101,7 +101,7 @@ void MKLDNNConcatLayer<Dtype>::InitConcatFwd(const vector<Blob<Dtype>*>& bottom, LOG(INFO) << "size of bottom blob: " << bottom[0]->shape().size(); LOG(INFO) << "size of top blob: " << top[0]->shape().size(); LOG(INFO) << "MKLDNN concat layer only support 4D blob as input! Reshape the 2D input blob into 4D for calculation!"; -#endif +#endif vector<int> bottom_4D_shape; int bottom_4D_height = 1; int bottom_4D_width = 1; @@ -168,9 +168,13 @@ void MKLDNNConcatLayer<Dtype>::InitConcatFwd(const vector<Blob<Dtype>*>& bottom, concatFwd.reset(new concat(*concatFwd_pd, fwd_input_primitives_at_, *fwd_output_memory)); for (auto i = 0; i < num_concats_; i++) { - fwd_bottom_data[i]->set_mkldnn_primitive(concatFwd); + //fwd_bottom_data[i]->set_mkldnn_primitive(concatFwd); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> fwd_bottom_data_primitive_transfer(fwd_input_primitives_[i]); + fwd_bottom_data[i]->set_mkldnn_primitive(fwd_bottom_data_primitive_transfer); } - fwd_top_data->set_mkldnn_primitive(concatFwd); + //fwd_top_data->set_mkldnn_primitive(concatFwd); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> fwd_top_data_memory_transfer(fwd_output_memory); + fwd_top_data->set_mkldnn_primitive(fwd_top_data_memory_transfer); } template <typename Dtype> @@ -237,11 +241,14 @@ void MKLDNNConcatLayer<Dtype>::InitConcatBwd(const vector<Blob<Dtype>*>& top, offsets[concat_dimension] += dims[concat_dimension]; - bwd_bottom_diff[i]->set_mkldnn_primitive(reorders[i]); + //bwd_bottom_diff[i]->set_mkldnn_primitive(reorders[i]); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> bwd_bottom_diff_memory_transfer(bwd_reorder_output_memory[i]); + bwd_bottom_diff[i]->set_mkldnn_primitive(bwd_bottom_diff_memory_transfer); } - bwd_top_diff->set_mkldnn_primitive(reorders[0]); - + //bwd_top_diff->set_mkldnn_primitive(reorders[0]); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> bwd_top_diff_memory_transfer(bwd_reorder_input_memory); + bwd_top_diff->set_mkldnn_primitive(bwd_top_diff_memory_transfer); } template <typename Dtype> diff --git a/src/caffe/layers/mkldnn_convolution_layer.cpp b/src/caffe/layers/mkldnn_convolution_layer.cpp index f6b79532b..d65dbf3bf 100644 --- a/src/caffe/layers/mkldnn_convolution_layer.cpp +++ b/src/caffe/layers/mkldnn_convolution_layer.cpp @@ -261,7 +261,9 @@ void MKLDNNConvolutionLayer<Dtype>::InitConvolutionFwd(const vector<Blob<Dtype>* , *fwd_bottom_data_primitive, *fwd_weights_data_primitive , *fwd_bias_data_primitive, *fwd_top_data_memory)); } - fwd_bias_data->set_mkldnn_primitive(convFwd); + //fwd_bias_data->set_mkldnn_primitive(convFwd); //Wrong passed primitive! (For sure!) + MKLDNNPrimitive<Dtype> fwd_bias_data_primitive_transfer(fwd_bias_data_primitive); + fwd_bias_data->set_mkldnn_primitive(fwd_bias_data_primitive_transfer); } else { if(relu) { convFwd.reset(new convolution_relu_forward(*convReluFwd_pd @@ -273,11 +275,13 @@ void MKLDNNConvolutionLayer<Dtype>::InitConvolutionFwd(const vector<Blob<Dtype>* , *fwd_top_data_memory)); } } - fwd_bottom_data->set_mkldnn_primitive(convFwd); //Wrong passed primitive! (TODO: Checking!)
- //MKLDNNPrimitive<Dtype> fwd_bottom_data_primitive_transfer(fwd_bottom_data_primitive); - //fwd_bottom_data->set_mkldnn_primitive(fwd_bottom_data_primitive_transfer); + //fwd_bottom_data->set_mkldnn_primitive(convFwd); //Wrong passed primitive! (For sure!) + MKLDNNPrimitive<Dtype> fwd_bottom_data_primitive_transfer(fwd_bottom_data_primitive); + fwd_bottom_data->set_mkldnn_primitive(fwd_bottom_data_primitive_transfer); - fwd_top_data->set_mkldnn_primitive(convFwd); + //fwd_top_data->set_mkldnn_primitive(convFwd); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> fwd_top_data_memory_transfer(fwd_top_data_memory); + fwd_top_data->set_mkldnn_primitive(fwd_top_data_memory_transfer); //fwd_weights_data->set_mkldnn_primitive(convFwd); //Wrong passed primitive! (For sure!) MKLDNNPrimitive<Dtype> fwd_weights_data_primitive_transfer(fwd_weights_data_primitive); fwd_weights_data->set_mkldnn_primitive(fwd_weights_data_primitive_transfer); @@ -447,7 +451,9 @@ void MKLDNNConvolutionLayer<Dtype>::InitConvolutionBwd(const vector<Blob<Dtype>* , *bwdw_bottom_data_primitive, *bwdw_top_diff_primitive , *bwdw_weights_diff_memory, *bwdw_bias_diff_memory)); - bwdw_bias_diff->set_mkldnn_primitive(convBwdWeights); + //bwdw_bias_diff->set_mkldnn_primitive(convBwdWeights); //Wrong passed primitive! (For sure!) + MKLDNNPrimitive<Dtype> bwdw_bias_diff_memory_transfer(bwdw_bias_diff_memory); + bwdw_bias_diff->set_mkldnn_primitive(bwdw_bias_diff_memory_transfer); } else { convBwdWeights.reset(new convolution_backward_weights(*convBwdWeights_pd , *bwdw_bottom_data_primitive, *bwdw_top_diff_primitive , *bwdw_weights_diff_memory)); } @@ -458,26 +464,30 @@ void MKLDNNConvolutionLayer<Dtype>::InitConvolutionBwd(const vector<Blob<Dtype>* , *bwdd_top_diff_primitive, *bwdd_weights_data_primitive , *bwdd_bottom_diff_memory)); - bwdd_bottom_diff->set_mkldnn_primitive(convBwdData); + //bwdd_bottom_diff->set_mkldnn_primitive(convBwdData); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> bwdd_bottom_diff_memory_transfer(bwdd_bottom_diff_memory); + bwdd_bottom_diff->set_mkldnn_primitive(bwdd_bottom_diff_memory_transfer); - bwdd_top_diff->set_mkldnn_primitive(convBwdData); //Wrong passed primitive! (TODO: Checking!) - //MKLDNNPrimitive<Dtype> bwdd_top_diff_primitive_transfer(bwdd_top_diff_primitive); - //bwdd_top_diff->set_mkldnn_primitive(bwdd_top_diff_primitive_transfer); + //bwdd_top_diff->set_mkldnn_primitive(convBwdData); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> bwdd_top_diff_primitive_transfer(bwdd_top_diff_primitive); + bwdd_top_diff->set_mkldnn_primitive(bwdd_top_diff_primitive_transfer); //bwdd_weights_data->set_mkldnn_primitive(convBwdData); //Wrong passed primitive! (For sure!) MKLDNNPrimitive<Dtype> bwdd_weights_data_primitive_transfer(bwdd_weights_data_primitive); bwdd_weights_data->set_mkldnn_primitive(bwdd_weights_data_primitive_transfer); - bwdw_bottom_data->set_mkldnn_primitive(convBwdWeights); //Wrong passed primitive! (TODO: Checking!) - //MKLDNNPrimitive<Dtype> bwdw_bottom_data_primitive_transfer(bwdw_bottom_data_primitive); - //bwdw_bottom_data->set_mkldnn_primitive(bwdw_bottom_data_primitive_transfer); + //bwdw_bottom_data->set_mkldnn_primitive(convBwdWeights); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> bwdw_bottom_data_primitive_transfer(bwdw_bottom_data_primitive); + bwdw_bottom_data->set_mkldnn_primitive(bwdw_bottom_data_primitive_transfer); - bwdw_top_diff->set_mkldnn_primitive(convBwdWeights); //Wrong passed primitive! (TODO: Checking!)
- //MKLDNNPrimitive<Dtype> bwdw_top_diff_primitive_transfer(bwdw_top_diff_primitive); - //bwdw_top_diff->set_mkldnn_primitive(bwdw_top_diff_primitive_transfer); + //bwdw_top_diff->set_mkldnn_primitive(convBwdWeights); //Wrong passed primitive! (For sure!) + MKLDNNPrimitive<Dtype> bwdw_top_diff_primitive_transfer(bwdw_top_diff_primitive); + bwdw_top_diff->set_mkldnn_primitive(bwdw_top_diff_primitive_transfer); - bwdw_weights_diff->set_mkldnn_primitive(convBwdWeights); + //bwdw_weights_diff->set_mkldnn_primitive(convBwdWeights); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> bwdw_weights_diff_memory_transfer(bwdw_weights_diff_memory); + bwdw_weights_diff->set_mkldnn_primitive(bwdw_weights_diff_memory_transfer); // Names are for debugging purposes only. } diff --git a/src/caffe/layers/mkldnn_eltwise_layer.cpp b/src/caffe/layers/mkldnn_eltwise_layer.cpp index 2a4a87c79..060467e82 100644 --- a/src/caffe/layers/mkldnn_eltwise_layer.cpp +++ b/src/caffe/layers/mkldnn_eltwise_layer.cpp @@ -201,9 +201,13 @@ void MKLDNNEltwiseLayer<Dtype>::InitEltwiseFwd(const vector<Blob<Dtype>*>& botto for (auto i = 0; i < num_bottoms_; i++) { - fwd_bottom_data[i]->set_mkldnn_primitive(eltwiseFwd); + //fwd_bottom_data[i]->set_mkldnn_primitive(eltwiseFwd); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> fwd_bottom_data_primitive_transfer(fwd_bottom_data_primitives_[i]); + fwd_bottom_data[i]->set_mkldnn_primitive(fwd_bottom_data_primitive_transfer); } - fwd_top_data->set_mkldnn_primitive(eltwiseFwd); + //fwd_top_data->set_mkldnn_primitive(eltwiseFwd); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> fwd_top_data_memory_transfer(fwd_top_data_memory); + fwd_top_data->set_mkldnn_primitive(fwd_top_data_memory_transfer); } @@ -214,9 +218,9 @@ void MKLDNNEltwiseLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, if(eltwiseFwd_pd == NULL) InitEltwiseFwd(bottom, top); - for (auto i = 0; i < num_bottoms_; i++) - { - // making reorders if needed. + for (auto i = 0; i < num_bottoms_; i++) + { + // making reorders if needed. fwd_bottom_data[i]->sync_before_read(); } // update top that head at prv @@ -233,13 +237,13 @@ void MKLDNNEltwiseLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top , const vector<bool>& propagate_down , const vector<Blob<Dtype>*>& bottom) { - VLOG(1) << "MKLDNNEltwiseLayer<Dtype>::Backward_cpu: " << this->layer_param_.name(); - - for (int i = 0; i < num_bottoms_; ++i) - { - //Eltwise layer is not supporting multiplication coefficient in Backward due to lack of supporting scale and copy primitives in MKL-DNN - CHECK_EQ(coeffs_[i], Dtype(1)) << "Not supported yet"; - + VLOG(1) << "MKLDNNEltwiseLayer<Dtype>::Backward_cpu: " << this->layer_param_.name(); + + for (int i = 0; i < num_bottoms_; ++i) + { + //Eltwise layer is not supporting multiplication coefficient in Backward due to lack of supporting scale and copy primitives in MKL-DNN + CHECK_EQ(coeffs_[i], Dtype(1)) << "Not supported yet"; + bottom[i]->ShareDiff(*top[0]); } } diff --git a/src/caffe/layers/mkldnn_inner_product_layer.cpp b/src/caffe/layers/mkldnn_inner_product_layer.cpp index d2fe6cfaa..1c92669c1 100644 --- a/src/caffe/layers/mkldnn_inner_product_layer.cpp +++ b/src/caffe/layers/mkldnn_inner_product_layer.cpp @@ -235,18 +235,24 @@ void MKLDNNInnerProductLayer<Dtype>::InitInnerProductFwd(const vector<Blob<Dtype - fwd_bottom_data->set_mkldnn_primitive(ipFwd); //Wrong passed primitive! (TODO: Checking!)
- //MKLDNNPrimitive<Dtype> fwd_bottom_data_primitive_transfer(fwd_bottom_data_primitive); - //fwd_bottom_data->set_mkldnn_primitive(fwd_bottom_data_primitive_transfer); + //fwd_bottom_data->set_mkldnn_primitive(ipFwd); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> fwd_bottom_data_primitive_transfer(fwd_bottom_data_primitive); + fwd_bottom_data->set_mkldnn_primitive(fwd_bottom_data_primitive_transfer); - fwd_top_data->set_mkldnn_primitive(ipFwd); - - fwd_weights_data->set_mkldnn_primitive(ipFwd); //Wrong passed primitive! (TODO: Checking!) - //MKLDNNPrimitive<Dtype> fwd_weights_data_primitive_transfer(fwd_weights_data_primitive); - //fwd_weights_data->set_mkldnn_primitive(fwd_weights_data_primitive_transfer); + //fwd_top_data->set_mkldnn_primitive(ipFwd); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> fwd_top_data_memory_transfer(fwd_top_data_memory); + fwd_top_data->set_mkldnn_primitive(fwd_top_data_memory_transfer); - if (this->bias_term_) - fwd_bias_data->set_mkldnn_primitive(ipFwd); + //fwd_weights_data->set_mkldnn_primitive(ipFwd); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> fwd_weights_data_primitive_transfer(fwd_weights_data_primitive); + fwd_weights_data->set_mkldnn_primitive(fwd_weights_data_primitive_transfer); + + if (this->bias_term_) + { + //fwd_bias_data->set_mkldnn_primitive(ipFwd); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> fwd_bias_data_primitive_transfer(fwd_bias_data_primitive); + fwd_bias_data->set_mkldnn_primitive(fwd_bias_data_primitive_transfer); + } } template <typename Dtype> @@ -416,29 +422,37 @@ void MKLDNNInnerProductLayer<Dtype>::InitInnerProductBwd(const vector<Blob<Dtype - bwdd_bottom_diff->set_mkldnn_primitive(ipBwdData); + //bwdd_bottom_diff->set_mkldnn_primitive(ipBwdData); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> bwdd_bottom_diff_memory_transfer(bwdd_bottom_diff_memory); + bwdd_bottom_diff->set_mkldnn_primitive(bwdd_bottom_diff_memory_transfer); - bwdd_top_diff->set_mkldnn_primitive(ipBwdData); //Wrong passed primitive! (TODO: Checking!) - //MKLDNNPrimitive<Dtype> bwdd_top_diff_primitive_transfer(bwdd_top_diff_primitive); - //bwdd_top_diff->set_mkldnn_primitive(bwdd_top_diff_primitive_transfer); + //bwdd_top_diff->set_mkldnn_primitive(ipBwdData); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> bwdd_top_diff_primitive_transfer(bwdd_top_diff_primitive); + bwdd_top_diff->set_mkldnn_primitive(bwdd_top_diff_primitive_transfer); - bwdd_weights_data->set_mkldnn_primitive(ipBwdData); //Wrong passed primitive! (TODO: Checking!) - //MKLDNNPrimitive<Dtype> bwdd_weights_data_primitive_transfer(bwdd_weights_data_primitive); - //bwdd_weights_data->set_mkldnn_primitive(bwdd_weights_data_primitive_transfer); + //bwdd_weights_data->set_mkldnn_primitive(ipBwdData); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> bwdd_weights_data_primitive_transfer(bwdd_weights_data_primitive); + bwdd_weights_data->set_mkldnn_primitive(bwdd_weights_data_primitive_transfer); - bwdw_bottom_data->set_mkldnn_primitive(ipBwdWeights); //Wrong passed primitive! (TODO: Checking!) - //MKLDNNPrimitive<Dtype> bwdw_bottom_data_primitive_transfer(bwdw_bottom_data_primitive); - //bwdw_bottom_data->set_mkldnn_primitive(bwdw_bottom_data_primitive_transfer); + //bwdw_bottom_data->set_mkldnn_primitive(ipBwdWeights); //Wrong passed primitive! (TODO: Checking!)
+ MKLDNNPrimitive<Dtype> bwdw_bottom_data_primitive_transfer(bwdw_bottom_data_primitive); + bwdw_bottom_data->set_mkldnn_primitive(bwdw_bottom_data_primitive_transfer); - bwdw_top_diff->set_mkldnn_primitive(ipBwdWeights); //Wrong passed primitive! (TODO: Checking!) - //MKLDNNPrimitive<Dtype> bwdw_top_diff_primitive_transfer(bwdw_top_diff_primitive); - //bwdw_top_diff->set_mkldnn_primitive(bwdw_top_diff_primitive_transfer); + //bwdw_top_diff->set_mkldnn_primitive(ipBwdWeights); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> bwdw_top_diff_primitive_transfer(bwdw_top_diff_primitive); + bwdw_top_diff->set_mkldnn_primitive(bwdw_top_diff_primitive_transfer); - bwdw_weights_diff->set_mkldnn_primitive(ipBwdWeights); + //bwdw_weights_diff->set_mkldnn_primitive(ipBwdWeights); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> bwdw_weights_diff_memory_transfer(bwdw_weights_diff_memory); + bwdw_weights_diff->set_mkldnn_primitive(bwdw_weights_diff_memory_transfer); if (this->bias_term_) - bwdw_bias_diff->set_mkldnn_primitive(ipBwdWeights); + { + //bwdw_bias_diff->set_mkldnn_primitive(ipBwdWeights); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> bwdw_bias_diff_memory_transfer(bwdw_bias_diff_memory); + bwdw_bias_diff->set_mkldnn_primitive(bwdw_bias_diff_memory_transfer); + } } @@ -482,9 +496,9 @@ void MKLDNNInnerProductLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& to else { LOG(INFO) << "Debug: Top prv diff is NULL!"; - LOG(INFO) << "Debug: Top cpu diff: " << *top[0]->cpu_diff(); - } - + LOG(INFO) << "Debug: Top cpu diff: " << *top[0]->cpu_diff(); + } + if (this->blobs_[0]->prv_data() != NULL) { LOG(INFO) << "Debug: Weights prv data from blobs_[0]: " << *this->blobs_[0]->prv_data(); @@ -492,9 +506,9 @@ void MKLDNNInnerProductLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& to else { LOG(INFO) << "Debug: Weights prv data is NULL!"; - LOG(INFO) << "Debug: Weights cpu data: " << *this->blobs_[0]->cpu_data(); - } - //Before submit, so get_prv_ptr() always has the value + LOG(INFO) << "Debug: Weights cpu data: " << *this->blobs_[0]->cpu_data(); + } + //Before submit, so get_prv_ptr() always has the value LOG(INFO) << "Debug: Weights prv data from get_prv_ptr: " << *bwdd_weights_data->get_prv_ptr(); #endif ipBwdData.submit(); @@ -505,8 +519,8 @@ void MKLDNNInnerProductLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& to } else { - LOG(INFO) << "Debug: Bottom prv diff is NULL!"; - LOG(INFO) << "Debug: Bottom cpu diff: " << *bottom[0]->cpu_diff(); + LOG(INFO) << "Debug: Bottom prv diff is NULL!"; + LOG(INFO) << "Debug: Bottom cpu diff: " << *bottom[0]->cpu_diff(); } #endif PERFORMANCE_MEASUREMENT_END_ID(perf_id_bw_); diff --git a/src/caffe/layers/mkldnn_lrn_layer.cpp b/src/caffe/layers/mkldnn_lrn_layer.cpp index c5eb48d1c..6c589c73e 100644 --- a/src/caffe/layers/mkldnn_lrn_layer.cpp +++ b/src/caffe/layers/mkldnn_lrn_layer.cpp @@ -198,8 +198,13 @@ void MKLDNNLRNLayer<Dtype>::InitLRNFwd(const vector<Blob<Dtype>*>& bottom, const } else { lrnFwd.reset(new lrn_forward(*lrnFwd_pd, *fwd_bottom_data_primitive, *fwd_top_data_memory)); } - fwd_bottom_data->set_mkldnn_primitive(lrnFwd); - fwd_top_data->set_mkldnn_primitive(lrnFwd); + //fwd_bottom_data->set_mkldnn_primitive(lrnFwd); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> fwd_bottom_data_primitive_transfer(fwd_bottom_data_primitive); + fwd_bottom_data->set_mkldnn_primitive(fwd_bottom_data_primitive_transfer); + + //fwd_top_data->set_mkldnn_primitive(lrnFwd); //Wrong passed primitive! (TODO: Checking!)
+ MKLDNNPrimitive<Dtype> fwd_top_data_memory_transfer(fwd_top_data_memory); + fwd_top_data->set_mkldnn_primitive(fwd_top_data_memory_transfer); } @@ -340,8 +345,13 @@ void MKLDNNLRNLayer<Dtype>::InitLRNBwd(const vector<Blob<Dtype>*>& top bwd_top_diff_primitive = bwd_top_diff->create_input(false); lrnBwd.reset(new lrn_backward(*lrnBwd_pd, *fwd_bottom_data_primitive, *bwd_top_diff_primitive, *scratch_memory, *bwd_bottom_diff_memory)); - bwd_bottom_diff->set_mkldnn_primitive(lrnBwd); - bwd_top_diff->set_mkldnn_primitive(lrnBwd); + //bwd_bottom_diff->set_mkldnn_primitive(lrnBwd); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> bwd_bottom_diff_memory_transfer(bwd_bottom_diff_memory); + bwd_bottom_diff->set_mkldnn_primitive(bwd_bottom_diff_memory_transfer); + + //bwd_top_diff->set_mkldnn_primitive(lrnBwd); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> bwd_top_diff_primitive_transfer(bwd_top_diff_primitive); + bwd_top_diff->set_mkldnn_primitive(bwd_top_diff_primitive_transfer); } diff --git a/src/caffe/layers/mkldnn_pooling_layer.cpp b/src/caffe/layers/mkldnn_pooling_layer.cpp index 849abd0f3..4a54a2efc 100644 --- a/src/caffe/layers/mkldnn_pooling_layer.cpp +++ b/src/caffe/layers/mkldnn_pooling_layer.cpp @@ -216,18 +216,20 @@ void MKLDNNPoolingLayer<Dtype>::InitPoolingFwd(const vector<Blob<Dtype>*>& botto // ---- Initialize memory descriptors ------------- typedef typename memory::primitive_desc MemPD; // short name for memory::primitive_desc - memory::format cmfmt = mfmt_nchw; + + shared_ptr<MemPD> usr_bottom_data_mpd(new MemPD({{bottom_tz}, mpcsn, mfmt_nchw}, cpu_engine)); + shared_ptr<MemPD> usr_top_data_mpd(new MemPD({{top_tz}, mpcsn, mfmt_nchw}, cpu_engine)); + if (bottom_data_is_prv) { shared_ptr<MKLDNNMemoryDescriptor<Dtype, false> > mem_descr = get_mkldnn_prv_descriptor<Dtype, false>(bottom[0]); cmfmt = static_cast<memory::format>(mem_descr->prv_memory_pd()->desc().data.format); + mpcsn = static_cast<memory::data_type>(mem_descr->prv_memory_pd()->desc().data.data_type); } shared_ptr<memory::desc> init_fwd_bottom_md(new memory::desc({bottom_tz}, mpcsn, cmfmt)); shared_ptr<memory::desc> init_fwd_top_md(new memory::desc({top_tz}, mpcsn, cmfmt)); - shared_ptr<MemPD> usr_bottom_data_mpd(new MemPD({{bottom_tz}, mpcsn, mfmt_nchw}, cpu_engine)); - shared_ptr<MemPD> usr_top_data_mpd(new MemPD({{top_tz}, mpcsn, mfmt_nchw}, cpu_engine)); // ---- Initialize pooling primitive descriptor ------------- pooling_forward::desc poolingFwd_desc(propagation, pooling_algorithm, *init_fwd_bottom_md,*init_fwd_top_md , {sh, sw}, {kh, kw}, {pt, pl}, {pb, pr}, padding_kind::zero); @@ -275,7 +277,7 @@ void MKLDNNPoolingLayer<Dtype>::InitPoolingFwd(const vector<Blob<Dtype>*>& botto fwd_top_data.reset(new MKLDNNData<Dtype>(usr_top_data_mpd, prv_fwd_top_data_mpd, top[0], this)); fwd_top_data_memory = fwd_top_data->create_output_memory(); - if ( propagation == prop_kind::forward_training && + if (propagation == prop_kind::forward_training && pooling_algorithm != algorithm::pooling_avg_exclude_padding && pooling_algorithm != algorithm::pooling_avg_include_padding) { indices_pd.reset(new MemPD(poolingFwd_pd->workspace_primitive_desc())); @@ -284,8 +286,13 @@ void MKLDNNPoolingLayer<Dtype>::InitPoolingFwd(const vector<Blob<Dtype>*>& botto } else { poolingFwd.reset(new pooling_forward(*poolingFwd_pd, *fwd_bottom_data_primitive, *fwd_top_data_memory)); } - fwd_bottom_data->set_mkldnn_primitive(poolingFwd); - fwd_top_data->set_mkldnn_primitive(poolingFwd); + //fwd_bottom_data->set_mkldnn_primitive(poolingFwd); //Wrong passed primitive! (TODO: Checking!)
+ MKLDNNPrimitive<Dtype> fwd_bottom_data_primitive_transfer(fwd_bottom_data_primitive); + fwd_bottom_data->set_mkldnn_primitive(fwd_bottom_data_primitive_transfer); + + //fwd_top_data->set_mkldnn_primitive(poolingFwd); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> fwd_top_data_memory_transfer(fwd_top_data_memory); + fwd_top_data->set_mkldnn_primitive(fwd_top_data_memory_transfer); } // TODO(Yangqing): Is there a faster way to do pooling in the channel-first @@ -418,7 +425,7 @@ void MKLDNNPoolingLayer<Dtype>::InitPoolingBwd(const vector<Blob<Dtype>*>& top // ---- Initialize remaining memory descriptors ------------- shared_ptr<MemPD> prv_bwd_bottom_diff_mpd, prv_bwd_top_diff_mpd; - if (top_diff_is_prv) { + if (top_diff_is_prv || bottom_data_is_prv) { prv_bwd_bottom_diff_mpd.reset(new MemPD(*init_bwd_bottom_md, engine)); prv_bwd_top_diff_mpd.reset(new MemPD(*init_bwd_top_md, engine)); } @@ -440,8 +447,13 @@ void MKLDNNPoolingLayer<Dtype>::InitPoolingBwd(const vector<Blob<Dtype>*>& top else poolingBwd.reset(new pooling_backward(*poolingBwd_pd, *bwd_top_diff_primitive, *bwd_bottom_diff_memory)); - bwd_bottom_diff->set_mkldnn_primitive(poolingBwd); - bwd_top_diff->set_mkldnn_primitive(poolingBwd); + //bwd_bottom_diff->set_mkldnn_primitive(poolingBwd); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> bwd_bottom_diff_memory_transfer(bwd_bottom_diff_memory); + bwd_bottom_diff->set_mkldnn_primitive(bwd_bottom_diff_memory_transfer); + + //bwd_top_diff->set_mkldnn_primitive(poolingBwd); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> bwd_top_diff_primitive_transfer(bwd_top_diff_primitive); + bwd_top_diff->set_mkldnn_primitive(bwd_top_diff_primitive_transfer); } template <typename Dtype> diff --git a/src/caffe/layers/mkldnn_relu_layer.cpp b/src/caffe/layers/mkldnn_relu_layer.cpp index 273e834d8..6e0f93b67 100644 --- a/src/caffe/layers/mkldnn_relu_layer.cpp +++ b/src/caffe/layers/mkldnn_relu_layer.cpp @@ -99,7 +99,10 @@ void MKLDNNReLULayer<Dtype>::InitReLUFwd(const vector<Blob<Dtype>*>& bottom, con top_data_md = bottom_data_md; // ---- Initialize relu primitive descriptor ------------- - relu_forward::desc reluFwd_desc(propagation, *bottom_data_md, negative_slope); + //relu_forward::desc reluFwd_desc(propagation, *bottom_data_md, negative_slope); + // MKLDNN is deprecating standalone relu primitive in MKL-DNN. + // Now MKLDNN has eltwise primitive with eltwise_relu algorithm inside. + eltwise_forward::desc eltwise_reluFwd_desc(propagation, eltwise_relu, *bottom_data_md, negative_slope); // ---- Determining engine to use ----------------------- std::string subengines = this->layer_param_.engine(); @@ -109,7 +112,7 @@ void MKLDNNReLULayer<Dtype>::InitReLUFwd(const vector<Blob<Dtype>*>& bottom, con unsigned subEngineIndex = 0; for(; subEngineIndex < ep.getNumberOfSubEngines(); subEngineIndex++) { try { - reluFwd_pd.reset(new relu_forward::primitive_desc(reluFwd_desc, + reluFwd_pd.reset(new relu_forward::primitive_desc(eltwise_reluFwd_desc, ep.getMKLDNNSubEngine(subEngineIndex))); } catch(...) { @@ -129,9 +132,13 @@ void MKLDNNReLULayer<Dtype>::InitReLUFwd(const vector<Blob<Dtype>*>& bottom, con fwd_top_data_memory = fwd_top_data->create_output_memory(inplace); reluFwd.reset(new relu_forward(*reluFwd_pd, *fwd_bottom_data_primitive, *fwd_top_data_memory)); - fwd_bottom_data->set_mkldnn_primitive(reluFwd); - fwd_top_data->set_mkldnn_primitive(reluFwd); + //fwd_bottom_data->set_mkldnn_primitive(reluFwd); //Wrong passed primitive! (TODO: Checking!)
+ MKLDNNPrimitive<Dtype> fwd_bottom_data_primitive_transfer(fwd_bottom_data_primitive); + fwd_bottom_data->set_mkldnn_primitive(fwd_bottom_data_primitive_transfer); + //fwd_top_data->set_mkldnn_primitive(reluFwd); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> fwd_top_data_memory_transfer(fwd_top_data_memory); + fwd_top_data->set_mkldnn_primitive(fwd_top_data_memory_transfer); } @@ -147,6 +154,11 @@ void MKLDNNReLULayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom bool inplace = (bottom[0] == top[0]); if( reluFwd_pd == NULL) InitReLUFwd(bottom, top); + + if(this->layer_param_.relu_param().fuse()) { + top[0]->ShareData(*bottom[0]); + return; + } // making reorders if needed. fwd_bottom_data->sync_before_read(); // update top that head at prv @@ -239,7 +251,10 @@ void MKLDNNReLULayer<Dtype>::InitReLUBwd(const vector<Blob<Dtype>*>& top bottom_diff_md = top_diff_md; // ---- Initialize relu primitive descriptor ------------- - relu_backward::desc reluBwd_desc(*top_diff_md, *top_data_md, negative_slope); + //relu_backward::desc reluBwd_desc(*top_diff_md, *top_data_md, negative_slope); + // MKLDNN is deprecating standalone relu primitive in MKL-DNN. + // Now MKLDNN has eltwise primitive with eltwise_relu algorithm inside. + eltwise_backward::desc eltwise_reluBwd_desc(eltwise_relu, *top_diff_md, *top_data_md, negative_slope); // ---- Determining engine to use ----------------------- std::string subengines = this->layer_param_.engine(); @@ -249,7 +264,7 @@ void MKLDNNReLULayer<Dtype>::InitReLUBwd(const vector<Blob<Dtype>*>& top unsigned subEngineIndex = 0; for(; subEngineIndex < ep.getNumberOfSubEngines(); subEngineIndex++) { try { - reluBwd_pd.reset(new relu_backward::primitive_desc(reluBwd_desc, + reluBwd_pd.reset(new relu_backward::primitive_desc(eltwise_reluBwd_desc, ep.getMKLDNNSubEngine(subEngineIndex), *reluFwd_pd)); } catch(...) { @@ -269,8 +284,13 @@ void MKLDNNReLULayer<Dtype>::InitReLUBwd(const vector<Blob<Dtype>*>& top bwd_bottom_diff_memory = bwd_bottom_diff->create_output_memory(inplace); reluBwd.reset(new relu_backward(*reluBwd_pd, *fwd_bottom_data_primitive, *bwd_top_diff_primitive, *bwd_bottom_diff_memory)); - bwd_top_diff->set_mkldnn_primitive(reluBwd); - bwd_bottom_diff->set_mkldnn_primitive(reluBwd); + //bwd_top_diff->set_mkldnn_primitive(reluBwd); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> bwd_top_diff_primitive_transfer(bwd_top_diff_primitive); + bwd_top_diff->set_mkldnn_primitive(bwd_top_diff_primitive_transfer); + + //bwd_bottom_diff->set_mkldnn_primitive(reluBwd); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive<Dtype> bwd_bottom_diff_memory_transfer(bwd_bottom_diff_memory); + bwd_bottom_diff->set_mkldnn_primitive(bwd_bottom_diff_memory_transfer); } template <typename Dtype> diff --git a/src/caffe/layers/mkldnn_split_layer.cpp b/src/caffe/layers/mkldnn_split_layer.cpp index 5e6cf9bab..ab2c5156a 100644 --- a/src/caffe/layers/mkldnn_split_layer.cpp +++ b/src/caffe/layers/mkldnn_split_layer.cpp @@ -163,10 +163,14 @@ void MKLDNNSplitLayer<Dtype>::InitSplitBwd(const vector<Blob<Dtype>*>& bottom, // there may be reorders to be done for inputs(tops' diffs) // so it match SplitBwd primitive inputs format expectations for(int i = 0; i < top.size(); ++i) { - bwd_top_diffs_[i]->set_mkldnn_primitive(splitBwd_); + //bwd_top_diffs_[i]->set_mkldnn_primitive(splitBwd_); //Wrong passed primitive! (TODO: Checking!)
+ MKLDNNPrimitive bwd_top_diff_primitive_transfer(bwd_top_diff_primitives_[i]); + bwd_top_diffs_[i]->set_mkldnn_primitive(bwd_top_diff_primitive_transfer); } - bwd_bottom_diff_->set_mkldnn_primitive(splitBwd_); + //bwd_bottom_diff_->set_mkldnn_primitive(splitBwd_); //Wrong passed primitive! (TODO: Checking!) + MKLDNNPrimitive bwd_bottom_diff_memory_transfer(bwd_bottom_diff_memory_); + bwd_bottom_diff_->set_mkldnn_primitive(bwd_bottom_diff_memory_transfer); } diff --git a/src/caffe/mkldnn_memory.cpp b/src/caffe/mkldnn_memory.cpp index ddad67f57..bacb6ae61 100644 --- a/src/caffe/mkldnn_memory.cpp +++ b/src/caffe/mkldnn_memory.cpp @@ -212,10 +212,11 @@ void MKLDNNMemoryDescriptor::convert_from_extprv(shared_ptr_reorder_extprv2prv_pd == NULL) return; - if (this->_extprv_memory_pd->desc().data.format == this->_prv_memory_pd->desc().data.format) + if (this->_extprv_memory_pd->desc().data.format == this->_prv_memory_pd->desc().data.format && + this->_extprv_memory_pd->desc().data.data_type == this->_prv_memory_pd->desc().data.data_type) { #ifdef DEBUG - LOG(INFO) << "The format of _extprv_memory_pd and _prv_memory_pd is same, no need do conversion."; + LOG(INFO) << "The format and data_type of _extprv_memory_pd and _prv_memory_pd is same, no need do conversion."; #endif return; } diff --git a/src/caffe/multinode/mlsl.cpp b/src/caffe/multinode/mlsl.cpp index 1653c5692..8a0f772af 100644 --- a/src/caffe/multinode/mlsl.cpp +++ b/src/caffe/multinode/mlsl.cpp @@ -41,26 +41,25 @@ #include "boost/thread/mutex.hpp" #include "caffe/multinode/mlsl.hpp" -namespace { - - __attribute__((constructor)) - void init(int argc, char **argv) { - static class initialize { - public: - initialize(int* argc, char** argv[]) { - MLSL::Environment::GetEnv().Init(argc, argv); - } - ~initialize() { - MLSL::Environment::GetEnv().Finalize(); - } - } __init{ &argc, &argv }; - } -} - namespace caffe { namespace mn { boost::mutex distrib_lock; - std::map, boost::shared_ptr> distrib_map; + std::map, boost::shared_ptr> *distrib_map; + + void init(int* argc, char **argv[]) { + static class initialize { + public: + initialize(int* argc, char** argv[]) { + MLSL::Environment::GetEnv().Init(argc, argv); + distrib_map = + new std::map, boost::shared_ptr>(); + } + ~initialize() { + delete distrib_map; + MLSL::Environment::GetEnv().Finalize(); + } + } __init{ argc, argv }; + } shared_ptr create_distrib( int dataParts, int modelParts, int dataColor, int modelColor, @@ -73,15 +72,15 @@ namespace caffe { Distribution * get_distrib(int dataParts, int modelParts) { boost::mutex::scoped_lock l(distrib_lock); std::pair key = std::make_pair(dataParts, modelParts); - if (distrib_map.find(key) == distrib_map.end()) { + if (distrib_map->find(key) == distrib_map->end()) { int node_id = get_node_id(); int num_nodes = get_nodes_count(); int modelColor = node_id / modelParts; int dataColor = node_id % (num_nodes / dataParts); - distrib_map[key] = boost::shared_ptr( + (*distrib_map)[key] = boost::shared_ptr( new Distribution(dataParts, modelParts, dataColor, modelColor)); } - return distrib_map[key].get(); + return (*distrib_map)[key].get(); } Distribution * get_distrib() { diff --git a/src/caffe/multinode/multi_solver.cpp b/src/caffe/multinode/multi_solver.cpp index 86e9b37ef..13ad8da2b 100644 --- a/src/caffe/multinode/multi_solver.cpp +++ b/src/caffe/multinode/multi_solver.cpp @@ -46,79 +46,141 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
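// A minimal standalone sketch of the initialization pattern introduced above
// (EnvInit/EnvFinalize are hypothetical stand-ins for the MLSL environment
// calls): instead of a GNU __attribute__((constructor)) function, which runs
// at load time and cannot portably see argc/argv, init() is called
// explicitly from main (see tools/caffe.cpp further down in this patch). The
// function-local static guard runs Init exactly once on the first call, and
// its destructor tears down the heap-owned map before Finalize at normal
// program exit, giving a well-defined construction/destruction order.
#include <cstdio>
#include <map>

static void EnvInit(int*, char***) { std::puts("Init"); }
static void EnvFinalize() { std::puts("Finalize"); }

std::map<int, int>* distrib_map = nullptr;  // heap-owned, as in the patch

void init(int* argc, char** argv[]) {
  static struct Guard {
    Guard(int* argc, char** argv[]) {
      EnvInit(argc, argv);
      distrib_map = new std::map<int, int>();
    }
    ~Guard() {
      delete distrib_map;
      EnvFinalize();
    }
  } guard{argc, argv};
}

int main(int argc, char** argv) {
  init(&argc, &argv);  // explicit call site, ordered by the program itself
  return 0;
}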
namespace caffe { -template -Dtype MultiSolver::ForwardBackwardImpl(bool first, bool last) { +#ifdef CAFFE_PER_LAYER_TIMINGS +#define LAYER_TIMING_START() do { \ + root_solver_->timer.Start(); \ +}while(0) - Dtype loss = 0; +#define LAYER_TIMING_STOP(name, index) do { \ + root_solver_->name##_time_per_layer[index] += root_solver_->timer.MicroSeconds(); \ +}while(0) +#else +#define LAYER_TIMING_START() + +#define LAYER_TIMING_STOP(name,index) +#endif + +template +inline bool MultiSolver::IsSkipWaitGradient(int layer_id) { Net& net = *root_solver_->net(); const std::vector>>& layers{ net.layers() }; const std::vector& layer_need_backward{ net.layer_need_backward() }; -#ifdef CAFFE_PER_LAYER_TIMINGS - Timer& timer = root_solver_->timer; - std::vector& forward_time_per_layer = root_solver_->forward_time_per_layer; - std::vector& backward_time_per_layer = root_solver_->backward_time_per_layer; - std::vector& update_time_per_layer = root_solver_->update_time_per_layer; -#endif /* CAFFE_PER_LAYER_TIMINGS */ + if (!layer_need_backward[layer_id] || ((layers[layer_id]->layerOp != nullptr) + && !layers[layer_id]->layerOp->HasParameterSets())) { + DLOG(INFO) << "ForwardBackwardImpl: no need for apply_updates for layer # " + << layer_id << ", skip on_delwt_wait, apply_updates, on_wtinc_ready"; + return true; + } + return false; +} - net.ClearParamDiffs(); +template +inline void MultiSolver::WaitAndUpdateGradient(int layer_id) { + LAYER_TIMING_START(); + for (int j = 0; j < callbacks_.size(); ++j) { + callbacks_[j]->on_delwt_wait(layer_id); + } + LAYER_TIMING_STOP(waitcomm, layer_id); - for (int i = 0; i < layers.size(); ++i) { -#ifdef CAFFE_PER_LAYER_TIMINGS - timer.Start(); +#ifdef FW_OVERLAP_OPT + if (layer_finished_flags_[layer_id]) { #endif - loss += net.ForwardFromTo(i, i); + LAYER_TIMING_START(); + for (int j = 0; j < callbacks_.size(); ++j) { + callbacks_[j]->apply_updates(layer_id); + } + LAYER_TIMING_STOP(update, layer_id); +#ifdef FW_OVERLAP_OPT + } +#endif +} -#ifdef CAFFE_PER_LAYER_TIMINGS - forward_time_per_layer[i] += timer.MicroSeconds(); +template +Dtype MultiSolver::ForwardBackwardImpl(bool first, bool last) { + Dtype loss = 0; + Net& net = *root_solver_->net(); + const std::vector>>& layers{ net.layers() }; + const std::vector& layer_need_backward{ net.layer_need_backward() }; + + for (int i = 0; i < layers.size(); ++i) { +#ifdef FW_OVERLAP_OPT + if (first && IsSkipWaitGradient(i) == false) { + while (layer_finished_flags_[i] == false) { + WaitAndUpdateGradient(i); + if (layer_finished_flags_[i]) + break; + + for (int k=i+1; k= 0; --i) { -#ifdef CAFFE_PER_LAYER_TIMINGS - timer.Start(); -#endif - if (!layer_need_backward[i]) { continue; } - + + LAYER_TIMING_START(); net.BackwardFromTo(i, i); + LAYER_TIMING_STOP(backward, i); - if (last && (layers[i]->layerOp != nullptr) && layers[i]->layerOp->HasParameterSets()) { + if (last && (layers[i]->layerOp != nullptr) + && layers[i]->layerOp->HasParameterSets()) { + LAYER_TIMING_START(); for (int j = 0; j < callbacks_.size(); ++j) { - callbacks_[j]->on_iter_finished(i); + callbacks_[j]->on_iter_finished(i); } + LAYER_TIMING_STOP(startcomm, i); } - -#ifdef CAFFE_PER_LAYER_TIMINGS - backward_time_per_layer[i] += timer.MicroSeconds(); -#endif } +#ifdef FW_OVERLAP_OPT + int iter = root_solver_->iter(); + int max_iter = root_solver_->param().max_iter(); + bool test = (root_solver_->param().test_interval() + && ((iter + 1) % root_solver_->param().test_interval() == 0)); + if (last && (test || (iter == max_iter - 1))) { + int finished_count = 0; + 
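// A minimal standalone sketch (Stats/ElapsedMicros are illustrative
// stand-ins for the solver's timer fields) of the two idioms the
// LAYER_TIMING_* macros above rely on: do { ... } while (0) makes a
// multi-statement macro expand to exactly one statement, so it is safe
// inside an un-braced if/else, and the ## token paste selects the per-phase
// accumulator vector from the name argument (forward, backward, update,
// startcomm, waitcomm).
#include <cstdio>
#include <vector>

struct Stats {
  std::vector<double> forward_time_per_layer = std::vector<double>(4, 0.0);
  std::vector<double> backward_time_per_layer = std::vector<double>(4, 0.0);
} stats;

static double ElapsedMicros() { return 42.0; }  // stand-in for Timer

#define TIMING_ADD(name, index) do { \
  stats.name##_time_per_layer[index] += ElapsedMicros(); \
} while (0)

int main() {
  if (stats.forward_time_per_layer.size() > 2)
    TIMING_ADD(forward, 2);  // expands to a single statement; no dangling else
  std::printf("%.1f\n", stats.forward_time_per_layer[2]);
  return 0;
}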
while (finished_count < layers.size()) { +#else if (last) { - - for (int i = 0; i < layers.size(); ++i) { -#ifdef CAFFE_PER_LAYER_TIMINGS - timer.Start(); #endif - if (!layer_need_backward[i] || ((layers[i]->layerOp != nullptr) && !layers[i]->layerOp->HasParameterSets())) { - DLOG(INFO) << "ForwardBackwardImpl: no need for apply_updates for layer # " << i - << ", skip on_delwt_wait, apply_updates, on_wtinc_ready"; - continue; - } - - for (int j = 0; j < callbacks_.size(); ++j) { - callbacks_[j]->on_delwt_wait(i); - } + for (int i = 0; i < layers.size(); ++i) { + if (IsSkipWaitGradient(i)) { +#ifdef FW_OVERLAP_OPT + finished_count++; + layer_finished_flags_[i] = true; +#endif + continue; + } +#ifdef FW_OVERLAP_OPT + if (layer_finished_flags_[i]) + continue; +#endif - for (int j = 0; j < callbacks_.size(); ++j) { - callbacks_[j]->apply_updates(i); - } -#ifdef CAFFE_PER_LAYER_TIMINGS - update_time_per_layer[i] += timer.MicroSeconds(); + WaitAndUpdateGradient(i); +#ifdef FW_OVERLAP_OPT + if (layer_finished_flags_[i]) + finished_count++; #endif + } +#ifdef FW_OVERLAP_OPT } +#endif } DLOG(WARNING) << "iter " << root_solver_->iter() << ", loss " << loss; @@ -128,6 +190,7 @@ Dtype MultiSolver::ForwardBackwardImpl(bool first, bool last) { template Dtype MultiSolver::ForwardBackward() { Dtype loss = 0; + root_solver_->net()->ClearParamDiffs(); for (int i = 0; i < iter_size; ++i) { loss += ForwardBackwardImpl( (i == 0), (i + 1 == iter_size)); diff --git a/src/caffe/multinode/multi_sync.cpp b/src/caffe/multinode/multi_sync.cpp index eb6229ed4..448172c7b 100644 --- a/src/caffe/multinode/multi_sync.cpp +++ b/src/caffe/multinode/multi_sync.cpp @@ -53,12 +53,19 @@ MultiSync::MultiSync(shared_ptr > root_solver) root_solver->set_iter(1); layer_param_ids.resize(layers.size()); +#ifdef FW_OVERLAP_OPT + param_ids_finished_flags.resize(layers.size()); +#endif for (int layer_id = 0; layer_id < layers.size(); layer_id++) { shared_ptr > layer = layers[layer_id]; /* cache param ids */ layer_param_ids[layer_id] = net->get_layer_learnable_param_ids(layer_id); +#ifdef FW_OVERLAP_OPT + param_ids_finished_flags[layer_id].resize(layer_param_ids[layer_id].size()); + std::fill(param_ids_finished_flags[layer_id].begin(), param_ids_finished_flags[layer_id].end(), false); +#endif } } diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 19e3dd7e1..a4224f9ba 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -267,6 +267,8 @@ void Net::Init(const NetParameter& in_param) { batch_size = layer_param.memory_data_param().batch_size(); else if (!layer_param.type().compare("WindowData")) batch_size = layer_param.window_data_param().batch_size(); + else if (!layer_param.type().compare("Input")) + batch_size = layer_param.input_param().shape(0).dim(0); if (caffe::TRAIN == param.state().phase()) { LOG(WARNING) << "SetMinibatchSize " << batch_size; @@ -493,7 +495,7 @@ void Net::CompileNet(const NetParameter& param, NetParameter param_temp0; param_temp0.CopyFrom(param); param_temp0.clear_layer(); - RemoveBNScale(param, ¶m_temp0); + RemoveBNScale(param, ¶m_temp0); NetParameter param_temp; // temporary compiled param param_temp.CopyFrom(param_temp0); @@ -616,26 +618,8 @@ void Net::CompilationRuleTwo(const NetParameter& param, // then we can remove ReLU layer // and rename Convolution top blob after deleted ReLU's top // Note: Currently merging of convolution and relu layers is feasible - // only for caffe::TEST phase, as there is no Backward primitive of conv Relu - // If current layer is Convolution of MKLDNN engine.. 
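// A minimal standalone sketch (illustrative names, not the real solver API)
// of why ClearParamDiffs() moved from ForwardBackwardImpl up into
// ForwardBackward above: with iter_size > 1 the solver accumulates gradients
// across several forward/backward passes and applies them once, so the diffs
// must be zeroed once per effective batch rather than once per
// sub-iteration.
#include <cstdio>
#include <vector>

std::vector<double> diffs(3, 0.0);

void ClearParamDiffs() { diffs.assign(diffs.size(), 0.0); }

double ForwardBackwardImpl(bool /*first*/, bool /*last*/) {
  for (double& d : diffs) d += 1.0;  // pretend each pass adds a gradient
  return 0.5;                        // pretend loss
}

double ForwardBackward(int iter_size) {
  double loss = 0;
  ClearParamDiffs();  // once per effective batch -- the patched placement
  for (int i = 0; i < iter_size; ++i)
    loss += ForwardBackwardImpl(i == 0, i + 1 == iter_size);
  return loss / iter_size;
}

int main() {
  double loss = ForwardBackward(4);
  std::printf("loss=%.3f accumulated diff=%.1f\n", loss, diffs[0]);  // 4.0
  return 0;
}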
- /* - //Old Structure: if ((A == TEST) && (B == 0) && ((C == ConvolutionParameter_Engine_MKLDNN) || ((D == ConvolutionParameter_Engine_DEFAULT) && ((E == 0 && F == string::npos)) || ((G == "" && H == 0 && I == string::npos))))) - //New tmp Structure: if ((A == TEST) && (B == 0) && ((C == ConvolutionParameter_Engine_MKLDNN) || (((D == ConvolutionParameter_Engine_DEFAULT) && ((E == 0 && F == string::npos))) || ((G == "" && H == 0 && I == string::npos))))) - //New Structure: if ((A == TEST) && (B == 0) && ((C == ConvolutionParameter_Engine_MKLDNN) || (((D == ConvolutionParameter_Engine_DEFAULT) && (E == 0 && F == string::npos)) || (G == "" && H == 0 && I == string::npos)))) - //Old Structure: - //if ((A == TEST) && - // (B == 0) && - // ((C == ConvolutionParameter_Engine_MKLDNN) - // || ((D == ConvolutionParameter_Engine_DEFAULT) && - // ((E == 0 - // && F == string::npos)) || - // ((G == "" && - // H == 0 && - // I == string::npos))))) - */ - if ((param.state().phase() == TEST) && - (layer_param->type().compare("Convolution") == 0) && + if ((layer_param->type().compare("Convolution") == 0) && ((layer_param->convolution_param().engine() == ConvolutionParameter_Engine_MKLDNN) || (((layer_param->convolution_param().engine() == ConvolutionParameter_Engine_DEFAULT) && (param.engine().compare(0, 6, "MKLDNN") == 0 @@ -652,20 +636,6 @@ void Net::CompilationRuleTwo(const NetParameter& param, // Consumer layer of blob produced by Conv // has to be ReLU layer with one Input Blob - /* - //Old Structure: if ((A == 0) && ((B == ReLUParameter_Engine_MKLDNN) || ((C == ReLUParameter_Engine_DEFAULT) && ((D == 0 && E == string::npos)) || ((F == "" && G == 0 && H == string::npos))))) - //New tmp Structure: if ((A == 0) && ((B == ReLUParameter_Engine_MKLDNN) || (((C == ReLUParameter_Engine_DEFAULT) && ((D == 0 && E == string::npos))) || ((F == "" && G == 0 && H == string::npos))))) - //New Structure: if ((A == 0) && ((B == ReLUParameter_Engine_MKLDNN) || (((C == ReLUParameter_Engine_DEFAULT) && (D == 0 && E == string::npos)) || (F == "" && G == 0 && H == string::npos)))) - //Old Structure: - //if ((A == 0) && - // ((B == ReLUParameter_Engine_MKLDNN) - // || ((C == ReLUParameter_Engine_DEFAULT) && - // ((D == 0 - // && E == string::npos)) || - // ((F == "" && - // G == 0 && - // H == string::npos))))) - */ if ((consumer_layer_param.type().compare("ReLU") == 0) && ((consumer_layer_param.relu_param().engine() == ReLUParameter_Engine_MKLDNN) || (((consumer_layer_param.relu_param().engine() == ReLUParameter_Engine_DEFAULT) && @@ -676,30 +646,43 @@ void Net::CompilationRuleTwo(const NetParameter& param, layer_param->engine().find(":DLA", 6) == string::npos)))) { string& convolution_top_blob_name = const_cast(layer_param->top(0)); - const string& scale_top_blob_name = consumer_layer_param.top(0); - // Mark Consumer layer (its name) as the one marked for dropping - layers_to_drop.insert(consumer_layer_param.name()); - // Replace Convolution top name with ReLU top name - convolution_top_blob_name.resize(scale_top_blob_name.size()); - convolution_top_blob_name.replace(0, - scale_top_blob_name.size(), - scale_top_blob_name); + if(param.state().phase() == TEST) { + const string& scale_top_blob_name = consumer_layer_param.top(0); + // Mark Consumer layer (its name) as the one marked for dropping + layers_to_drop.insert(consumer_layer_param.name()); + + // Replace Convolution top name with ReLU top name + convolution_top_blob_name.resize(scale_top_blob_name.size()); + convolution_top_blob_name.replace(0, + 
scale_top_blob_name.size(), + scale_top_blob_name); + } // set relu flag in convolution layer_param->mutable_convolution_param()->set_relu(true); float negative_slope1 = consumer_layer_param.relu_param().negative_slope(); layer_param->mutable_convolution_param()-> set_negative_slope(negative_slope1); + + if(param.state().phase() == TRAIN) { + if(i+1 < param.layer_size()) { + LayerParameter* relu_layer_param = + (const_cast(param)).mutable_layer(i+1); + relu_layer_param->mutable_relu_param()->set_fuse(true); + } + } } } - if (layers_to_drop.find(layer_param->name()) != layers_to_drop.end()) { - LOG_IF(INFO, Caffe::root_solver()) << "Dropped layer: " - << layer_param->name() << std::endl; - layer_included = false; - // Remove dropped layer from the list of layers to be dropped - layers_to_drop.erase(layers_to_drop.find(layer_param->name())); + if(param.state().phase() == TEST) { + if (layers_to_drop.find(layer_param->name()) != layers_to_drop.end()) { + LOG_IF(INFO, Caffe::root_solver()) << "Dropped layer: " + << layer_param->name() << std::endl; + layer_included = false; + // Remove dropped layer from the list of layers to be dropped + layers_to_drop.erase(layers_to_drop.find(layer_param->name())); + } } if (layer_included) { @@ -763,107 +746,6 @@ void Net::CompilationRuleThree(const NetParameter& param, return; } - -template -void Net::RemoveBNScale(const NetParameter& param, - NetParameter* param_compiled) { - // - In TEST Phase, if we detect sequential layers conv->batch norm ->scale, - // We will merge batch norm and scale layer into conv layer. - if(param.state().phase() != TEST) { - param_compiled->CopyFrom(param); - param_compiled->mutable_compile_net_state()->set_bn_scale_remove(false); - return ; - } - - bool bn_scale_remove = false; - bool is_net_init = param.compile_net_state().is_init(); - std::set layers_to_drop; - for (int i = 0; i < param.layer_size(); ++i) { - LayerParameter *layer_param = (const_cast(param)).mutable_layer(i); - bool layer_included = true; - bool bn_use_global_stats_set = true; - if (layer_param->type().compare("Convolution") == 0) { - std::vector child_layers_params; - GetBlobConsumers(child_layers_params, layer_param->top(0), param, i + 1 < param.layer_size() ? i + 1 : i); - const LayerParameter &child_layer_param = child_layers_params.size() > 0 ? *(child_layers_params[0]) : *layer_param; - // check whether child layer is BatchNorm - if (child_layer_param.type().compare("BatchNorm") == 0) { - BatchNormParameter bn_param = child_layer_param.batch_norm_param(); - if (is_net_init) { - //Testing Network init process - bool bn_use_global_stats = true; - if (bn_param.has_use_global_stats()) { - bn_use_global_stats = bn_param.use_global_stats(); - } - if (!bn_use_global_stats) { - //This bn layer's use_global_stats is set manually! Don't remove it. 
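// A minimal standalone sketch (FuseConvRelu and the parameter structs are
// hypothetical stand-ins for the NetParameter editing done above) of the two
// branches of the Conv+ReLU rule: in TEST phase the ReLU layer can be
// dropped and the Convolution top renamed, because only the fused forward is
// needed; in TRAIN phase ReLU must survive for its Backward, so the
// convolution gets relu=true while the following ReLU gets fuse=true, which
// turns its Forward_cpu into the ShareData pass-through added earlier in
// this patch.
enum Phase { TRAIN, TEST };

struct ConvParams { bool relu = false; float negative_slope = 0.f; };
struct ReluParams { bool fuse = false; float negative_slope = 0.f; };

void FuseConvRelu(Phase phase, ConvParams& conv, ReluParams& relu,
                  bool& drop_relu_layer) {
  conv.relu = true;                          // conv now applies the activation
  conv.negative_slope = relu.negative_slope;
  if (phase == TEST) {
    drop_relu_layer = true;                  // remove ReLU from the compiled net
  } else {
    relu.fuse = true;                        // keep ReLU; Forward becomes a no-op
    drop_relu_layer = false;
  }
}

int main() {
  ConvParams conv; ReluParams relu; bool drop = false;
  FuseConvRelu(TEST, conv, relu, drop);   // inference: drop == true
  FuseConvRelu(TRAIN, conv, relu, drop);  // training:  relu.fuse == true
  return 0;
}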
- //remained_bn_layer_names.push_back(child_layer_param.name()); - param_compiled->mutable_compile_net_state()->add_kept_bn_layers(child_layer_param.name()); - bn_use_global_stats_set = false; - } - } else { - int kept_bn_layers_num = param.compile_net_state().kept_bn_layers_size(); - bool in_kept_list = false; - for (int idx = 0; idx < kept_bn_layers_num; ++idx) { - if (child_layer_param.name().compare(param.compile_net_state().kept_bn_layers(idx)) == 0) { - in_kept_list = true; - break; - } - } - if (in_kept_list) { - bn_use_global_stats_set = false; - } - } - - if (!bn_use_global_stats_set) { - //Even in caffe TEST phase, current batch norm layer has set use_global_stats = false in protxt file, so we won't - //merge this layer into convolution layer. - param_compiled->add_layer()->CopyFrom(*layer_param); - continue; - } - std::vector grandchild_layers_params; - GetBlobConsumers(grandchild_layers_params, child_layer_param.top(0), param, i + 2 < param.layer_size() ? i + 2 : i); - const LayerParameter &grandchild_layer_param = (grandchild_layers_params.size() > 0) ? *(grandchild_layers_params[0]) : child_layer_param; - if (grandchild_layer_param.type().compare("Scale") == 0) { - MergeLayer(*layer_param, grandchild_layer_param); - AdjustConvLayer(*layer_param, child_layer_param, grandchild_layer_param, is_net_init); - if (bn_scale_remove == false) bn_scale_remove = true; - layers_to_drop.insert(child_layer_param.name()); - layers_to_drop.insert(grandchild_layer_param.name()); - } else if (&child_layer_param != &grandchild_layer_param) { - //In fact, conv-->batchnorm can also be optimized. In such case, we check the blob size of batch norm layer - //if is 3, it means current net hasn't used scale layer, this is equivalent to scale layer with all 1 weights and 0 bias - //if is 4 or 5, it means intel caffe compilation rule 1 works here, we can recover the scale layer from batch norm layer - MergeLayer(*layer_param, child_layer_param); - if (!is_net_init) { - shared_ptr scale_layer_param(new LayerParameter()); - RecoverScaleFromBN(child_layer_param, *scale_layer_param, (Dtype)1, (Dtype)0); - AdjustConvLayer(*layer_param, child_layer_param, *scale_layer_param, is_net_init); - } else { - AdjustConvLayer(*layer_param, child_layer_param, grandchild_layer_param, true); - } - if (bn_scale_remove == false) bn_scale_remove = true; - layers_to_drop.insert(child_layer_param.name()); - } - } - } - if (layers_to_drop.find(layer_param->name()) != layers_to_drop.end()) { - LOG_IF(INFO, Caffe::root_solver()) << "Dropped Layer: "<< layer_param->name() << std::endl; - layer_included = false; - // Remove dropped layer from the list of layers to be dropped - layers_to_drop.erase(layers_to_drop.find(layer_param->name())); - } - if (layer_included) { - if (layer_param->type().compare("BatchNorm") == 0) { - param_compiled->mutable_compile_net_state()->add_kept_bn_layers(layer_param->name()); - } - param_compiled->add_layer()->CopyFrom(*layer_param); - } - } - - param_compiled->mutable_compile_net_state()->set_bn_scale_remove(bn_scale_remove); - } - template void Net::GetBlobConsumers( std::vector& consumer_blobs, diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index c4c5228e5..3bf537607 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -645,6 +645,14 @@ message TransformationParameter { optional ExpansionParameter expand_param = 14; // Constraint for emitting the annotation after transformation. 
optional EmitConstraint emit_constraint = 10; + // Resize the input randomly + optional RandomResizeParameter random_resize_param = 15; +} + +message RandomResizeParameter { + optional uint32 min_size = 1 [default = 0]; + optional uint32 max_size = 2 [default = 0]; + optional ResizeParameter resize_param = 3; } // Message that stores parameters used by data transformer for resize policy @@ -1626,6 +1634,7 @@ message ReLUParameter { MKLDNN = 4; } optional Engine engine = 2 [default = DEFAULT]; + optional bool fuse = 3 [default = false]; } message ReshapeParameter { diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index cf8c31b47..3f17c5c58 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -394,10 +394,17 @@ void Solver::InitTimers() { this->forward_time_per_layer.resize(layer_count, 0.0); this->backward_time_per_layer.resize(layer_count, 0.0); this->update_time_per_layer.resize(layer_count, 0.0); - +#ifdef USE_MLSL + this->startcomm_time_per_layer.resize(layer_count, 0.0); + this->waitcomm_time_per_layer.resize(layer_count, 0.0); +#endif this->forward_time_per_layer_total.resize(layer_count, 0.0); this->backward_time_per_layer_total.resize(layer_count, 0.0); this->update_time_per_layer_total.resize(layer_count, 0.0); +#ifdef USE_MLSL + this->startcomm_time_per_layer_total.resize(layer_count, 0.0); + this->waitcomm_time_per_layer_total.resize(layer_count, 0.0); +#endif } template @@ -419,6 +426,19 @@ void Solver::ResetTimers() { this->update_time_per_layer.begin(), this->update_time_per_layer_total.begin(), std::plus()); +#ifdef USE_MLSL + std::transform(this->startcomm_time_per_layer_total.begin(), + this->startcomm_time_per_layer_total.end(), + this->startcomm_time_per_layer.begin(), + this->startcomm_time_per_layer_total.begin(), + std::plus()); + + std::transform(this->waitcomm_time_per_layer_total.begin(), + this->waitcomm_time_per_layer_total.end(), + this->waitcomm_time_per_layer.begin(), + this->waitcomm_time_per_layer_total.begin(), + std::plus()); +#endif std::fill(this->forward_time_per_layer.begin(), this->forward_time_per_layer.end(), 0); @@ -426,6 +446,12 @@ void Solver::ResetTimers() { this->backward_time_per_layer.end(), 0); std::fill(this->update_time_per_layer.begin(), this->update_time_per_layer.end(), 0); +#ifdef USE_MLSL + std::fill(this->startcomm_time_per_layer.begin(), + this->startcomm_time_per_layer.end(), 0); + std::fill(this->waitcomm_time_per_layer.begin(), + this->waitcomm_time_per_layer.end(), 0); +#endif } template @@ -444,7 +470,13 @@ void Solver::PrintTimers(bool printTotal) { backward_time_per_layer_total : backward_time_per_layer; std::vector& update_timers = printTotal ? update_time_per_layer_total : update_time_per_layer; +#ifdef USE_MLSL + std::vector& startcomm_timers = printTotal ? + startcomm_time_per_layer_total : startcomm_time_per_layer; + std::vector& waitcomm_timers = printTotal ? + waitcomm_time_per_layer_total : waitcomm_time_per_layer; std::string prefix = printTotal ? 
"TOTAL " : "DELTA "; +#endif double forward_time = std::accumulate(forward_timers.begin(), forward_timers.end(), 0) / 1000; @@ -479,8 +511,37 @@ void Solver::PrintTimers(bool printTotal) { } LOG(WARNING) << std::endl; - LOG(WARNING) << prefix << "TIME (F+B+U): " << (forward_time + - backward_time + update_time) / 1000 << " sec"; +#ifdef USE_MLSL + double startcomm_time = std::accumulate(startcomm_timers.begin(), + startcomm_timers.end(), 0) / 1000; + LOG(WARNING) << prefix << "START COMMUNICATION TIME: " << startcomm_time << " ms"; + for (int layer_idx = 0; layer_idx < net_->layers().size(); layer_idx++) { + LOG(WARNING) << "LAYER-" << layer_idx << " " + << net_->layers()[layer_idx]->type() + << ": startcomm_time: " << startcomm_timers[layer_idx] / 1000 + << " ms"; + } + LOG(WARNING) << std::endl; + + double waitcomm_time = std::accumulate(waitcomm_timers.begin(), + waitcomm_timers.end(), 0) / 1000; + LOG(WARNING) << prefix << "WAIT COMMUNICATION TIME: " << waitcomm_time << " ms"; + for (int layer_idx = 0; layer_idx < net_->layers().size(); layer_idx++) { + LOG(WARNING) << "LAYER-" << layer_idx << " " + << net_->layers()[layer_idx]->type() + << ": waitcomm_time: " << waitcomm_timers[layer_idx] / 1000 + << " ms"; + } + LOG(WARNING) << std::endl; + + LOG(WARNING) << prefix << "TIME (Computation + Communication): " << (forward_time + + backward_time + update_time + startcomm_time + waitcomm_time) / 1000 + << " sec"; +#else + LOG(WARNING) << prefix << "TIME (Computation): " << (forward_time + + backward_time + update_time) / 1000 << " sec"; +#endif + LOG(WARNING) << "####################################################"; LOG(WARNING) << std::endl; } diff --git a/src/caffe/test/test_net.cpp b/src/caffe/test/test_net.cpp index ae4d3f03f..5b97a8bfb 100644 --- a/src/caffe/test/test_net.cpp +++ b/src/caffe/test/test_net.cpp @@ -2987,6 +2987,230 @@ class CompileNetTest : public ::testing::Test { } }; +TEST_F(CompileNetTest, TestRemoveBatchNorm1) { + const string& input_proto = + "name: 'TestNetwork' " + "layer { " + " name: 'data' " + " type: 'Data' " + " top: 'data' " + " top: 'label' " + "} " + "layer { " + " bottom: 'data' " + " name: 'conv' " + " top: 'conv' " + " type: 'Convolution' " + "} " + "layer { " + " bottom: 'conv' " + " name: 'bn' " + " top: 'conv' " + " type: 'BatchNorm' " + "} " + "layer { " + " name: 'loss' " + " type: 'SoftmaxWithLoss' " + " bottom: 'conv' " + " bottom: 'label' " + "} "; + + const string& output_proto = + "name: 'TestNetwork' " + "layer { " + " name: 'data' " + " type: 'Data' " + " top: 'data' " + " top: 'label' " + "} " + "layer { " + " bottom: 'data' " + " name: 'conv' " + " top: 'conv' " + " type: 'Convolution' " + "} " + "layer { " + " name: 'loss' " + " type: 'SoftmaxWithLoss' " + " bottom: 'conv' " + " bottom: 'label' " + "} "; + this->RunCompilerNetTest(input_proto, output_proto); +} + +TEST_F(CompileNetTest, TestRemoveBatchNorm2) { + const string& input_proto = + "name: 'TestNetwork' " + "layer { " + " name: 'data' " + " type: 'Data' " + " top: 'data' " + " top: 'label' " + "} " + "layer { " + " bottom: 'data' " + " name: 'fc1' " + " top: 'fc1' " + " type: 'InnerProduct' " + "} " + "layer { " + " bottom: 'fc1' " + " name: 'bn' " + " top: 'bn' " + " type: 'BatchNorm' " + "} " + "layer { " + " name: 'loss' " + " type: 'SoftmaxWithLoss' " + " bottom: 'bn' " + " bottom: 'label' " + "} "; + + const string& output_proto = + "name: 'TestNetwork' " + "layer { " + " name: 'data' " + " type: 'Data' " + " top: 'data' " + " top: 'label' " + "} " + "layer { " + " 
bottom: 'data' " + " name: 'fc1' " + " top: 'fc1' " + " type: 'InnerProduct' " + "} " + "layer { " + " bottom: 'fc1' " + " name: 'bn' " + " top: 'bn' " + " type: 'BatchNorm' " + "} " + "layer { " + " name: 'loss' " + " type: 'SoftmaxWithLoss' " + " bottom: 'bn' " + " bottom: 'label' " + "} "; + this->RunCompilerNetTest(input_proto, output_proto); +} + +TEST_F(CompileNetTest, TestRemoveBatchNorm3) { + const string& input_proto = + "name: 'TestNetwork' " + "layer { " + " name: 'data' " + " type: 'Data' " + " top: 'data' " + " top: 'label' " + "} " + "layer { " + " bottom: 'data' " + " name: 'conv' " + " top: 'conv' " + " type: 'Convolution' " + "} " + "layer { " + " bottom: 'conv' " + " name: 'bn' " + " top: 'conv' " + " type: 'BatchNorm' " + " batch_norm_param { " + " use_global_stats: false" + " }" + "} " + "layer { " + " name: 'loss' " + " type: 'SoftmaxWithLoss' " + " bottom: 'conv' " + " bottom: 'label' " + "} "; + + const string& output_proto = + "name: 'TestNetwork' " + "layer { " + " name: 'data' " + " type: 'Data' " + " top: 'data' " + " top: 'label' " + "} " + "layer { " + " bottom: 'data' " + " name: 'conv' " + " top: 'conv' " + " type: 'Convolution' " + "} " + "layer { " + " bottom: 'conv' " + " name: 'bn' " + " top: 'conv' " + " type: 'BatchNorm' " + " batch_norm_param { " + " use_global_stats: false" + " }" + "} " + "layer { " + " name: 'loss' " + " type: 'SoftmaxWithLoss' " + " bottom: 'conv' " + " bottom: 'label' " + "} "; + this->RunCompilerNetTest(input_proto, output_proto); +} + +TEST_F(CompileNetTest, TestRemoveBatchNorm4) { + const string& input_proto = + "name: 'TestNetwork' " + "layer { " + " name: 'data' " + " type: 'Data' " + " top: 'data' " + " top: 'label' " + "} " + "layer { " + " bottom: 'data' " + " name: 'conv' " + " top: 'conv' " + " type: 'Convolution' " + "} " + "layer { " + " bottom: 'conv' " + " name: 'bn' " + " top: 'conv' " + " type: 'BatchNorm' " + " batch_norm_param { " + " use_global_stats: true" + " }" + "} " + "layer { " + " name: 'loss' " + " type: 'SoftmaxWithLoss' " + " bottom: 'conv' " + " bottom: 'label' " + "} "; + + const string& output_proto = + "name: 'TestNetwork' " + "layer { " + " name: 'data' " + " type: 'Data' " + " top: 'data' " + " top: 'label' " + "} " + "layer { " + " bottom: 'data' " + " name: 'conv' " + " top: 'conv' " + " type: 'Convolution' " + "} " + "layer { " + " name: 'loss' " + " type: 'SoftmaxWithLoss' " + " bottom: 'conv' " + " bottom: 'label' " + "} "; + this->RunCompilerNetTest(input_proto, output_proto); +} #ifdef MKL2017_SUPPORTED // If BatchNorm of engine MKL2017 // produce blob consumed by diff --git a/src/caffe/util/remove_batch_norm.cpp b/src/caffe/util/remove_batch_norm.cpp index 63c9b3f81..8c56639fc 100644 --- a/src/caffe/util/remove_batch_norm.cpp +++ b/src/caffe/util/remove_batch_norm.cpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "caffe/blob.hpp" #include "caffe/util/remove_batch_norm.hpp" #include "caffe/util/math_functions.hpp" +#include "caffe/net.hpp" namespace caffe { template @@ -188,6 +189,106 @@ void RecoverBNScaleMergedNet(NetParameter * net_param, NetParameter* recovered_n } } +template +void RemoveBNScale(const NetParameter& param, NetParameter* param_compiled) { + + // - In TEST Phase, if we detect sequential layers conv->batch norm ->scale, + // We will merge batch norm and scale layer into conv layer. 
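// A minimal numeric sketch of the algebra behind the merge described just
// above and implemented in the code that follows, for one output channel
// (scalar weights here; the real AdjustConvLayer applies this per channel).
// At inference BatchNorm computes (x - mean) / sqrt(var + eps) and Scale
// computes gamma * x + beta, so conv -> bn -> scale collapses into a single
// convolution with
//   W' = W * gamma / sqrt(var + eps)
//   b' = (b - mean) * gamma / sqrt(var + eps) + beta
// and a bare conv -> bn is the special case gamma = 1, beta = 0, matching
// the RecoverScaleFromBN(..., (Dtype)1, (Dtype)0) call further down.
#include <cmath>
#include <cstdio>

int main() {
  double W = 0.8, b = 0.1;                   // conv weight and bias
  double mean = 0.4, var = 2.25, eps = 1e-5; // BN statistics
  double gamma = 1.5, beta = -0.2;           // Scale parameters
  double a = gamma / std::sqrt(var + eps);   // common multiplier
  double Wf = W * a, bf = (b - mean) * a + beta;

  double x = 3.0;                            // any input activation
  double ref = gamma * ((W * x + b) - mean) / std::sqrt(var + eps) + beta;
  double fused = Wf * x + bf;
  std::printf("ref=%f fused=%f\n", ref, fused);  // agree up to rounding
  return 0;
}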
+ if(param.state().phase() != TEST) { + param_compiled->CopyFrom(param); + param_compiled->mutable_compile_net_state()->set_bn_scale_remove(false); + return ; + } + + bool bn_scale_remove = false; + bool is_net_init = param.compile_net_state().is_init(); + std::set layers_to_drop; + for (int i = 0; i < param.layer_size(); ++i) { + LayerParameter *layer_param = (const_cast(param)).mutable_layer(i); + bool layer_included = true; + bool bn_use_global_stats_set = true; + if (layer_param->type().compare("Convolution") == 0) { + std::vector child_layers_params; + Net::GetBlobConsumers(child_layers_params, layer_param->top(0), param, i + 1 < param.layer_size() ? i + 1 : i); + const LayerParameter &child_layer_param = child_layers_params.size() > 0 ? *(child_layers_params[0]) : *layer_param; + // check whether child layer is BatchNorm + if (child_layer_param.type().compare("BatchNorm") == 0) { + BatchNormParameter bn_param = child_layer_param.batch_norm_param(); + if (is_net_init) { + //Testing Network init process + bool bn_use_global_stats = true; + if (bn_param.has_use_global_stats()) { + bn_use_global_stats = bn_param.use_global_stats(); + } + if (!bn_use_global_stats) { + //This bn layer's use_global_stats is set manually! Don't remove it. + //remained_bn_layer_names.push_back(child_layer_param.name()); + param_compiled->mutable_compile_net_state()->add_kept_bn_layers(child_layer_param.name()); + bn_use_global_stats_set = false; + } + } else { + int kept_bn_layers_num = param.compile_net_state().kept_bn_layers_size(); + bool in_kept_list = false; + for (int idx = 0; idx < kept_bn_layers_num; ++idx) { + if (child_layer_param.name().compare(param.compile_net_state().kept_bn_layers(idx)) == 0) { + in_kept_list = true; + break; + } + } + if (in_kept_list) { + bn_use_global_stats_set = false; + } + } + + if (!bn_use_global_stats_set) { + //Even in caffe TEST phase, current batch norm layer has set use_global_stats = false in protxt file, so we won't + //merge this layer into convolution layer. + param_compiled->add_layer()->CopyFrom(*layer_param); + continue; + } + std::vector grandchild_layers_params; + Net::GetBlobConsumers(grandchild_layers_params, child_layer_param.top(0), param, i + 2 < param.layer_size() ? i + 2 : i); + const LayerParameter &grandchild_layer_param = (grandchild_layers_params.size() > 0) ? *(grandchild_layers_params[0]) : child_layer_param; + if (grandchild_layer_param.type().compare("Scale") == 0) { + MergeLayer(*layer_param, grandchild_layer_param); + AdjustConvLayer(*layer_param, child_layer_param, grandchild_layer_param, is_net_init); + if (bn_scale_remove == false) bn_scale_remove = true; + layers_to_drop.insert(child_layer_param.name()); + layers_to_drop.insert(grandchild_layer_param.name()); + } else if (&child_layer_param != &grandchild_layer_param) { + //In fact, conv-->batchnorm can also be optimized. 
In such case, we check the blob size of batch norm layer + //if is 3, it means current net hasn't used scale layer, this is equivalent to scale layer with all 1 weights and 0 bias + //if is 4 or 5, it means intel caffe compilation rule 1 works here, we can recover the scale layer from batch norm layer + MergeLayer(*layer_param, child_layer_param); + if (!is_net_init) { + shared_ptr scale_layer_param(new LayerParameter()); + RecoverScaleFromBN(child_layer_param, *scale_layer_param, (Dtype)1, (Dtype)0); + AdjustConvLayer(*layer_param, child_layer_param, *scale_layer_param, is_net_init); + } else { + AdjustConvLayer(*layer_param, child_layer_param, grandchild_layer_param, true); + } + if (bn_scale_remove == false) bn_scale_remove = true; + layers_to_drop.insert(child_layer_param.name()); + } + } + } + if (layers_to_drop.find(layer_param->name()) != layers_to_drop.end()) { + LOG_IF(INFO, Caffe::root_solver()) << "Dropped Layer: "<< layer_param->name() << std::endl; + layer_included = false; + // Remove dropped layer from the list of layers to be dropped + layers_to_drop.erase(layers_to_drop.find(layer_param->name())); + } + if (layer_included) { + if (layer_param->type().compare("BatchNorm") == 0) { + param_compiled->mutable_compile_net_state()->add_kept_bn_layers(layer_param->name()); + } + param_compiled->add_layer()->CopyFrom(*layer_param); + } + } + + param_compiled->mutable_compile_net_state()->set_bn_scale_remove(bn_scale_remove); +} + template void RecoverScaleFromBN(const LayerParameter& bn_layer_param, LayerParameter& scale_layer_param, float default_scale_weights, float default_scale_bias); template void RecoverScaleFromBN(const LayerParameter& bn_layer_param, LayerParameter& scale_layer_param, double default_scale_weights, double default_scale_bias); template void AdjustConvLayer(LayerParameter &conv_layer, @@ -200,4 +301,6 @@ template void AdjustConvLayer(LayerParameter &conv_layer, template void RecoverBNScaleMergedNet(NetParameter * net_param, NetParameter* recovered_net_param); template void RecoverBNScaleMergedNet(NetParameter * net_param, NetParameter* recovered_net_param); +template void RemoveBNScale(const NetParameter& param, NetParameter* param_compiled); +template void RemoveBNScale(const NetParameter& param, NetParameter* param_compiled); } diff --git a/tools/caffe.cpp b/tools/caffe.cpp index 3cd2234f6..231209127 100644 --- a/tools/caffe.cpp +++ b/tools/caffe.cpp @@ -692,6 +692,9 @@ int main(int argc, char** argv) { " compare collects layer data using inputs from other device"); // Run tool or show usage. caffe::GlobalInit(&argc, &argv); +#ifdef USE_MLSL + caffe::mn::init(&argc, &argv); +#endif if (argc == 2) { #ifdef WITH_PYTHON_LAYER try { diff --git a/xbyak/COPYRIGHT b/xbyak/COPYRIGHT new file mode 100644 index 000000000..78d3140b8 --- /dev/null +++ b/xbyak/COPYRIGHT @@ -0,0 +1,47 @@ + +Copyright (c) 2007 MITSUNARI Shigeo +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. +Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. 
+Neither the name of the copyright owner nor the names of its contributors may +be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +THE POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +ソースコード形式かバイナリ形式か、変更するかしないかを問わず、以下の条件を満た +す場合に限り、再頒布および使用が許可されます。 + +ソースコードを再頒布する場合、上記の著作権表示、本条件一覧、および下記免責条項 +を含めること。 +バイナリ形式で再頒布する場合、頒布物に付属のドキュメント等の資料に、上記の著作 +権表示、本条件一覧、および下記免責条項を含めること。 +書面による特別の許可なしに、本ソフトウェアから派生した製品の宣伝または販売促進 +に、著作権者の名前またはコントリビューターの名前を使用してはならない。 +本ソフトウェアは、著作権者およびコントリビューターによって「現状のまま」提供さ +れており、明示黙示を問わず、商業的な使用可能性、および特定の目的に対する適合性 +に関する暗黙の保証も含め、またそれに限定されない、いかなる保証もありません。 +著作権者もコントリビューターも、事由のいかんを問わず、 損害発生の原因いかんを +問わず、かつ責任の根拠が契約であるか厳格責任であるか(過失その他の)不法行為で +あるかを問わず、仮にそのような損害が発生する可能性を知らされていたとしても、 +本ソフトウェアの使用によって発生した(代替品または代用サービスの調達、使用の +喪失、データの喪失、利益の喪失、業務の中断も含め、またそれに限定されない)直接 +損害、間接損害、偶発的な損害、特別損害、懲罰的損害、または結果損害について、 +一切責任を負わないものとします。 diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index d0cf6f9c3..31aa0a056 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -1,3 +1,48 @@ +/******************************************************************************* +* Copyright 2016-2017 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/******************************************************************************* +* Copyright (c) 2007 MITSUNARI Shigeo +* All rights reserved. +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions are met: +* +* Redistributions of source code must retain the above copyright notice, this +* list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, +* this list of conditions and the following disclaimer in the documentation +* and/or other materials provided with the distribution. +* Neither the name of the copyright owner nor the names of its contributors may +* be used to endorse or promote products derived from this software without +* specific prior written permission. 
+* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +* THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + #pragma once #ifndef XBYAK_XBYAK_H_ #define XBYAK_XBYAK_H_ diff --git a/xbyak/xbyak_bin2hex.h b/xbyak/xbyak_bin2hex.h index 69ecdbfed..54e0d8ff1 100644 --- a/xbyak/xbyak_bin2hex.h +++ b/xbyak/xbyak_bin2hex.h @@ -1,3 +1,48 @@ +/******************************************************************************* +* Copyright 2016-2017 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/******************************************************************************* +* Copyright (c) 2007 MITSUNARI Shigeo +* All rights reserved. +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions are met: +* +* Redistributions of source code must retain the above copyright notice, this +* list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, +* this list of conditions and the following disclaimer in the documentation +* and/or other materials provided with the distribution. +* Neither the name of the copyright owner nor the names of its contributors may +* be used to endorse or promote products derived from this software without +* specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +* THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + enum { B00000000= 0, B00000001= 1, diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index ac5be9600..a781f0c30 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -1,3 +1,48 @@ +/******************************************************************************* +* Copyright 2016-2017 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/******************************************************************************* +* Copyright (c) 2007 MITSUNARI Shigeo +* All rights reserved. +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions are met: +* +* Redistributions of source code must retain the above copyright notice, this +* list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, +* this list of conditions and the following disclaimer in the documentation +* and/or other materials provided with the distribution. +* Neither the name of the copyright owner nor the names of its contributors may +* be used to endorse or promote products derived from this software without +* specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +* THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + const char *getVersionString() const { return "4.87"; } void packssdw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x6B); } void packsswb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x63); } diff --git a/xbyak/xbyak_util.h b/xbyak/xbyak_util.h index 3a7c2c218..5854a1723 100644 --- a/xbyak/xbyak_util.h +++ b/xbyak/xbyak_util.h @@ -1,3 +1,48 @@ +/******************************************************************************* +* Copyright 2016-2017 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/******************************************************************************* +* Copyright (c) 2007 MITSUNARI Shigeo +* All rights reserved. +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions are met: +* +* Redistributions of source code must retain the above copyright notice, this +* list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, +* this list of conditions and the following disclaimer in the documentation +* and/or other materials provided with the distribution. +* Neither the name of the copyright owner nor the names of its contributors may +* be used to endorse or promote products derived from this software without +* specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +* THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + #ifndef XBYAK_XBYAK_UTIL_H_ #define XBYAK_XBYAK_UTIL_H_