diff --git a/LICENSE b/LICENSE
index 7f8d0ca57..3316218f5 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,3 +1,4 @@
+
 COPYRIGHT
 
 All modification made by Intel Corporation: © 2017 Intel Corporation.
diff --git a/Makefile.mkldnn b/Makefile.mkldnn
index 797990d77..51f7fcab6 100644
--- a/Makefile.mkldnn
+++ b/Makefile.mkldnn
@@ -8,6 +8,10 @@ MKLDNN_COMMIT := `cat ${CAFFE_ROOTDIR}/mkldnn.commit`
 MKLDNN_CXX := $(CXX)
 MKLDNN_CC := $(CC)
 
+RETURN_STRING=$(shell ./external/mkl/prepare_mkl.sh)
+MKLROOT=$(firstword $(RETURN_STRING))
+MKL_ROOTDIR := $(MKLROOT)
+
 # We do this because earlier versions of CMake have problems with ccache
 ifneq (,$(findstring ccache,$(CXX)))
 	MKLDNN_CXX := $(lastword $(CXX))
@@ -18,7 +22,7 @@ ifneq (,$(findstring ccache,$(CC)))
 endif
 
 MKLDNN_GITHUB := https://github.com/01org/mkl-dnn.git
-MKLDNN_CMAKE_FLAGS += $(MKLDNN_SRCDIR) -DCMAKE_INSTALL_PREFIX=$(CAFFE_ROOTDIR)/$(MKLDNN_INSTALLDIR) -B$(CAFFE_ROOTDIR)/$(MKLDNN_BUILDDIR) -DCMAKE_CXX_COMPILER="$(MKLDNN_CXX)" -DCMAKE_C_COMPILER="$(MKLDNN_CC)"
+MKLDNN_CMAKE_FLAGS += $(MKLDNN_SRCDIR) -DCMAKE_INSTALL_PREFIX=$(CAFFE_ROOTDIR)/$(MKLDNN_INSTALLDIR) -DMKLROOT=${MKL_ROOTDIR} -B$(CAFFE_ROOTDIR)/$(MKLDNN_BUILDDIR) -DCMAKE_CXX_COMPILER="$(MKLDNN_CXX)" -DCMAKE_C_COMPILER="$(MKLDNN_CC)"
 
 ifeq ("$(wildcard $(MKLDNN_INSTALLDIR)/include/mkldnn.hpp)", "")
 mkldnn_download:
diff --git a/README.md b/README.md
index dab45a99c..dbc117df6 100644
--- a/README.md
+++ b/README.md
@@ -71,3 +71,6 @@ Please cite Caffe in your publications if it helps your research:
 
 ***
  *Other names and brands may be claimed as the property of others
+
+
+
diff --git a/external/mkl/prepare_mkl.sh b/external/mkl/prepare_mkl.sh
index dc0eb2ecc..b68bc7aec 100755
--- a/external/mkl/prepare_mkl.sh
+++ b/external/mkl/prepare_mkl.sh
@@ -74,10 +74,11 @@ echo $VERSION_LINE  # Return Version Line
 # MKL
 DST=`dirname $0`
 OMP=0 
-VERSION_MATCH=20170101
-ARCHIVE_BASENAME=mklml_lnx_2017.0.2.20170110.tgz
+VERSION_MATCH=20170425
+ARCHIVE_BASENAME=mklml_lnx_2018.0.20170425.tgz
 MKL_CONTENT_DIR=`echo $ARCHIVE_BASENAME | rev | cut -d "." -f 2- | rev`
-GITHUB_RELEASE_TAG=self_containted_MKLGOLD_u2
+GITHUB_RELEASE_TAG=1.0.0
+
 MKLURL="https://github.com/intel/caffe/releases/download/$GITHUB_RELEASE_TAG/$ARCHIVE_BASENAME"
 # there are diffrent MKL lib to be used for GCC and for ICC
 reg='^[0-9]+$'
diff --git a/include/caffe/layers/accuracy_layer.hpp b/include/caffe/layers/accuracy_layer.hpp
index d8b4d34e5..c61255bd3 100644
--- a/include/caffe/layers/accuracy_layer.hpp
+++ b/include/caffe/layers/accuracy_layer.hpp
@@ -76,7 +76,7 @@ class AccuracyLayer : public Layer<Dtype> {
   // If there are two top blobs, then the second blob will contain
   // accuracies per class.
   virtual inline int MinTopBlobs() const { return 1; }
-  virtual inline int MaxTopBlos() const { return 2; }
+  virtual inline int MaxTopBlobs() const { return 2; }
 
  protected:
   /**
diff --git a/include/caffe/layers/base_conv_layer.hpp b/include/caffe/layers/base_conv_layer.hpp
old mode 100644
new mode 100755
index 8b3bc99ba..11236681f
--- a/include/caffe/layers/base_conv_layer.hpp
+++ b/include/caffe/layers/base_conv_layer.hpp
@@ -153,6 +153,13 @@ class BaseConvolutionLayer : public Layer<Dtype> {
           pad_.cpu_data()[0], pad_.cpu_data()[1],
           stride_.cpu_data()[0], stride_.cpu_data()[1],
           dilation_.cpu_data()[0], dilation_.cpu_data()[1], col_buff);
+    } else if (!force_nd_im2col_ && num_spatial_axes_ == 3) {
+      im3d2col_cpu(data, conv_in_channels_,
+          conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2], conv_input_shape_.cpu_data()[3],
+          kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1], kernel_shape_.cpu_data()[2],
+          pad_.cpu_data()[0], pad_.cpu_data()[1], pad_.cpu_data()[2],
+          stride_.cpu_data()[0], stride_.cpu_data()[1], stride_.cpu_data()[2],
+          dilation_.cpu_data()[0], dilation_.cpu_data()[1], dilation_.cpu_data()[2], col_buff);
     } else {
       im2col_nd_cpu(data, num_spatial_axes_, conv_input_shape_.cpu_data(),
           col_buffer_shape_.data(), kernel_shape_.cpu_data(),
@@ -167,6 +174,13 @@ class BaseConvolutionLayer : public Layer<Dtype> {
           pad_.cpu_data()[0], pad_.cpu_data()[1],
           stride_.cpu_data()[0], stride_.cpu_data()[1],
           dilation_.cpu_data()[0], dilation_.cpu_data()[1], data);
+    } else if (!force_nd_im2col_ && num_spatial_axes_ == 3) {
+      col2im3d_cpu(col_buff, conv_in_channels_,
+          conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2], conv_input_shape_.cpu_data()[3],
+          kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1], kernel_shape_.cpu_data()[2],
+          pad_.cpu_data()[0], pad_.cpu_data()[1], pad_.cpu_data()[2],
+          stride_.cpu_data()[0], stride_.cpu_data()[1], stride_.cpu_data()[2],
+          dilation_.cpu_data()[0], dilation_.cpu_data()[1], dilation_.cpu_data()[2], data);
     } else {
       col2im_nd_cpu(col_buff, num_spatial_axes_, conv_input_shape_.cpu_data(),
           col_buffer_shape_.data(), kernel_shape_.cpu_data(),
diff --git a/include/caffe/layers/mkldnn_layers.hpp b/include/caffe/layers/mkldnn_layers.hpp
index 5ef3e77dd..391235d4d 100644
--- a/include/caffe/layers/mkldnn_layers.hpp
+++ b/include/caffe/layers/mkldnn_layers.hpp
@@ -1,510 +1,519 @@
-/*
-All modification made by Intel Corporation: © 2016 Intel Corporation
-
-All contributions by the University of California:
-Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
-All rights reserved.
-
-All other contributions:
-Copyright (c) 2014, 2015, the respective contributors
-All rights reserved.
-For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md
-
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-      this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    * Neither the name of Intel Corporation nor the names of its contributors
-      may be used to endorse or promote products derived from this software
-      without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#ifndef CAFFE_MKLDNN_LAYERS_HPP_
-#define CAFFE_MKLDNN_LAYERS_HPP_
-
-#include <string>
-#include <vector>
-
-#include "boost/enable_shared_from_this.hpp"
-#include "caffe/blob.hpp"
-#include "caffe/common.hpp"
-#include "caffe/engine_parser.hpp"
-#include "caffe/layers/base_conv_layer.hpp"
-#include "caffe/layers/conv_layer.hpp"
-#include "caffe/layers/inner_product_layer.hpp"
-#include "caffe/layers/neuron_layer.hpp"
-#include "caffe/proto/caffe.pb.h"
-#include "caffe/mkldnn_memory.hpp"
-#include "mkldnn.hpp"
-
-#include "caffe/util/performance.hpp"
-
-using namespace mkldnn;
-
-namespace caffe {
-
-// =====  MKLDNNBatchNormLayer =======================================
-template <typename Dtype>
-class MKLDNNBatchNormLayer : public MKLDNNLayer<Dtype>, public Layer<Dtype> {
-public:
-    explicit MKLDNNBatchNormLayer(const LayerParameter& param)
-        : Layer<Dtype>(param)
-        , fwd_top_data(), fwd_bottom_data()
-        , bwd_top_diff(), bwd_bottom_diff()
-        , BatchNormFwd_pd(), BatchNormBwd_pd()
-        , mean_memory(), variance_memory()
-        , scaleshift_memory(), bwd_scaleshift_diff_memory()
-        , output_memory(), bwd_bottom_diff_memory()
-        , input_primitive(), bwd_top_diff_primitive()
-        {
-          PERFORMANCE_EVENT_ID_RESET(perf_id_fw_);
-          PERFORMANCE_EVENT_ID_RESET(perf_id_bw_);
-    }
-    ~MKLDNNBatchNormLayer() {}
-
-protected:
-    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual void Reshape(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual inline const char* type() const { return "BatchNorm"; }
-    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
-                                , const vector<Blob<Dtype>*>& bottom);
-    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
-                                , const vector<Blob<Dtype>*>& bottom);
-private:
-    void InitBatchNorm(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    void InitBatchNormBwd(const vector<Blob<Dtype>*>& top,
-            const vector<bool>& propagate_down,
-            const vector<Blob<Dtype>*>& bottom);
-    shared_ptr<MKLDNNData<Dtype> > fwd_top_data, fwd_bottom_data;
-    shared_ptr<MKLDNNDiff<Dtype> > bwd_top_diff, bwd_bottom_diff;
-    shared_ptr<batch_normalization_forward::primitive_desc> BatchNormFwd_pd;
-    shared_ptr<batch_normalization_backward::primitive_desc> BatchNormBwd_pd;
-
-    MKLDNNPrimitive<Dtype> BatchNormFwd, BatchNormBwd;
-    shared_ptr<memory> mean_memory, variance_memory;
-
-    shared_ptr<memory> scaleshift_memory, bwd_scaleshift_diff_memory;
-    shared_ptr<memory> output_memory, bwd_bottom_diff_memory;
-
-    shared_ptr<primitive> input_primitive, bwd_top_diff_primitive;
-
-    int32_t num_, width_, height_, channels_;
-    Dtype eps_, moving_average_fraction_;
-    bool use_weight_bias_, bias_term_, use_global_stats_;
-
-    PERFORMANCE_EVENT_ID_DECL(perf_id_fw_);
-    PERFORMANCE_EVENT_ID_DECL(perf_id_bw_);
-};
-
-// =====  MKLDNNConvolutionLayer =======================================
-template <typename Dtype>
-class MKLDNNConvolutionLayer : public MKLDNNLayer<Dtype> , public ConvolutionLayer<Dtype> {
-public:
-    explicit MKLDNNConvolutionLayer(const LayerParameter& param);
-    virtual ~MKLDNNConvolutionLayer() {}
-protected:
-    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
-                                , const vector<Blob<Dtype>*>& bottom);
-    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
-                                , const vector<Blob<Dtype>*>& bottom);
-    // Customized methods
-    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    void Reshape(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-private:
-    virtual void compute_output_shape();
-    virtual void init_properties(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    void InitConvolutionFwd(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    void InitConvolutionBwd(const vector<Blob<Dtype>*>& top
-                        , const vector<bool>& propagate_down
-                        , const vector<Blob<Dtype>*>& bottom);
-
-    shared_ptr<MKLDNNData<Dtype> > fwd_bottom_data, fwd_top_data, fwd_weights_data, fwd_bias_data
-                    , bwdd_weights_data, bwdw_bottom_data;
-    shared_ptr<MKLDNNDiff<Dtype> > bwdd_bottom_diff, bwdd_top_diff
-                    , bwdw_top_diff, bwdw_weights_diff, bwdw_bias_diff;
-    shared_ptr<convolution_forward::primitive_desc> convFwd_pd;
-    shared_ptr<convolution_backward_data::primitive_desc> convBwdData_pd;
-    shared_ptr<convolution_backward_weights::primitive_desc> convBwdWeights_pd;
-    MKLDNNPrimitive<Dtype> convFwd, convBwdData, convBwdWeights;
-    shared_ptr<memory> fwd_top_data_memory, bwdd_bottom_diff_memory
-                    , bwdw_weights_diff_memory,  bwdw_bias_diff_memory;
-    shared_ptr<primitive> fwd_bottom_data_primitive, fwd_weights_data_primitive, fwd_bias_data_primitive
-                    , bwdd_top_diff_primitive, bwdd_weights_data_primitive
-                    , bwdw_top_diff_primitive, bwdw_bottom_data_primitive;
-    int32_t width_, height_, width_out_, height_out_, kernel_w_, kernel_h_, stride_w_, stride_h_;
-    int  pad_w_, pad_h_;
-
-    PERFORMANCE_EVENT_ID_DECL(perf_id_fw_);
-    PERFORMANCE_EVENT_ID_DECL(perf_id_bw_);
-    PERFORMANCE_EVENT_ID_DECL(perf_id_bw_weights_);
-};
-
-// =====  MKLDNNInnerProductLayer =======================================
-template <typename Dtype>
-class MKLDNNInnerProductLayer : public MKLDNNLayer<Dtype> , public InnerProductLayer<Dtype>  {
-public:
-    explicit MKLDNNInnerProductLayer(const LayerParameter& param);
-    virtual ~MKLDNNInnerProductLayer();
-protected:
-    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
-                                , const vector<Blob<Dtype>*>& bottom);
-    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
-                                , const vector<Blob<Dtype>*>& bottom);
-    // Customized methods
-    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    void Reshape(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-private:
-    void InitInnerProductFwd(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    void InitInnerProductBwd(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
-                                , const vector<Blob<Dtype>*>& bottom);
-
-    shared_ptr<MKLDNNData<Dtype> > fwd_bottom_data, fwd_top_data, fwd_weights_data, fwd_bias_data
-                    , bwdd_weights_data, bwdw_bottom_data;
-    shared_ptr<MKLDNNDiff<Dtype> > bwdd_bottom_diff, bwdd_top_diff
-                    , bwdw_top_diff, bwdw_weights_diff, bwdw_bias_diff;
-    shared_ptr<inner_product_forward::primitive_desc> ipFwd_pd;
-    shared_ptr<inner_product_backward_data::primitive_desc> ipBwdData_pd;
-    shared_ptr<inner_product_backward_weights::primitive_desc> ipBwdWeights_pd;
-
-    MKLDNNPrimitive<Dtype> ipFwd, ipBwdData, ipBwdWeights;
-    shared_ptr<memory> fwd_top_data_memory, bwdd_bottom_diff_memory
-                    , bwdw_weights_diff_memory,  bwdw_bias_diff_memory;
-    shared_ptr<primitive> fwd_bottom_data_primitive, fwd_weights_data_primitive, fwd_bias_data_primitive
-                    , bwdd_top_diff_primitive, bwdd_weights_data_primitive
-                    , bwdw_top_diff_primitive, bwdw_bottom_data_primitive;
-    int32_t w_, h_;
-
-    PERFORMANCE_EVENT_ID_DECL(perf_id_fw_);
-    PERFORMANCE_EVENT_ID_DECL(perf_id_bw_);
-    PERFORMANCE_EVENT_ID_DECL(perf_id_bw_weights_);
-};
-
-
-/**
- * @brief Normalize the input in a local region across feature maps.
- */
-
-// =====  MKLDNNLRNLayer =======================================
-template <typename Dtype>
-class MKLDNNLRNLayer : public MKLDNNLayer<Dtype> , public Layer<Dtype>  {
-public:
-    explicit MKLDNNLRNLayer(const LayerParameter& param);
-    virtual ~MKLDNNLRNLayer() {}
-protected:
-    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual void Reshape(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
-                                , const vector<Blob<Dtype>*>& bottom);
-    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
-                                , const vector<Blob<Dtype>*>& bottom);
-
-    virtual inline const char* type() const { return "LRN"; }
-    virtual inline int ExactNumBottomBlobs() const { return 1; }
-    virtual inline int ExactNumTopBlobs() const { return 1; }
-private:
-    void InitLRNFwd(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    void InitLRNBwd(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
-                                , const vector<Blob<Dtype>*>& bottom);
-
-    shared_ptr<MKLDNNData<Dtype> > fwd_top_data, fwd_bottom_data;
-    shared_ptr<MKLDNNDiff<Dtype> > bwd_top_diff, bwd_bottom_diff;
-    shared_ptr<lrn_forward::primitive_desc> lrnFwd_pd;
-    shared_ptr<lrn_backward::primitive_desc> lrnBwd_pd;
-    MKLDNNPrimitive<Dtype> lrnFwd;
-    MKLDNNPrimitive<Dtype> lrnBwd;
-    shared_ptr<memory> fwd_top_data_memory, bwd_bottom_diff_memory, scratch_memory;
-    shared_ptr<primitive> fwd_bottom_data_primitive, bwd_top_diff_primitive;
-    Dtype alpha_, beta_, k_;
-    int size_, num_, width_, height_, channels_;
-
-    PERFORMANCE_EVENT_ID_DECL(perf_id_fw_);
-    PERFORMANCE_EVENT_ID_DECL(perf_id_bw_);
-};
-
-// ===== MKLDNNPoolingLayer =======================================
-template <typename Dtype>
-class MKLDNNPoolingLayer : public MKLDNNLayer<Dtype>, public Layer<Dtype>  {
-public:
-    explicit MKLDNNPoolingLayer(const LayerParameter& param)
-            : MKLDNNLayer<Dtype>(), Layer<Dtype>(param)
-            , fwd_bottom_data(), fwd_top_data()
-            , bwd_top_diff(), bwd_bottom_diff()
-            , poolingFwd_pd()
-            , poolingBwd_pd()
-            , indices_pd()
-            , indices_memory(), fwd_top_data_memory(), bwd_bottom_diff_memory()
-            , fwd_bottom_data_primitive(), bwd_top_diff_primitive()
-            , num_(0), channels_(0), width_(0), height_(0), width_out_(0), height_out_(0)
-            , kernel_w_(0), kernel_h_(0), stride_w_(0), stride_h_(0)
-            , pad_t_(0),pad_b_(0), pad_l_(0), pad_r_(0)
-            , global_pooling_(false)
-            {
-              PERFORMANCE_EVENT_ID_RESET(perf_id_fw_);
-              PERFORMANCE_EVENT_ID_RESET(perf_id_bw_);
-            }
-    ~MKLDNNPoolingLayer() {}
-protected:
-    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual void Reshape(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-
-    virtual inline const char* type() const { return "Pooling"; }
-    virtual inline int ExactNumBottomBlobs() const { return 1; }
-    virtual inline int MinTopBlobs() const { return 1; }
-    // MAX POOL layers can output an extra top blob for the mask;
-    // others can only output the pooled inputs.
-    virtual inline int MaxTopBlobs() const {
-        return (this->layer_param_.pooling_param().pool() == PoolingParameter_PoolMethod_MAX) ? 2 : 1;
-    }
-protected:
-    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,const vector<bool>& propagate_down
-                                ,const vector<Blob<Dtype>*>& bottom);
-    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
-                                ,const vector<Blob<Dtype>*>& bottom);
-
-private:
-    void InitPoolingFwd(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    void InitPoolingBwd(const vector<Blob<Dtype>*>& bottom
-                        , const vector<bool>& propagate_down
-                        , const vector<Blob<Dtype>*>& top);
-  
-    shared_ptr<MKLDNNData<Dtype>> fwd_bottom_data, fwd_top_data;
-    shared_ptr<MKLDNNDiff<Dtype>> bwd_top_diff, bwd_bottom_diff;
-    shared_ptr<pooling_forward::primitive_desc> poolingFwd_pd;
-    shared_ptr<pooling_backward::primitive_desc> poolingBwd_pd;
-    MKLDNNPrimitive<Dtype> poolingFwd, poolingBwd;
-    shared_ptr<memory::primitive_desc> indices_pd;
-    shared_ptr<memory> indices_memory, fwd_top_data_memory, bwd_bottom_diff_memory;
-    shared_ptr<primitive> fwd_bottom_data_primitive, bwd_top_diff_primitive;
-    int32_t num_, channels_, width_, height_, width_out_, height_out_;
-    int32_t kernel_w_, kernel_h_, stride_w_, stride_h_;
-    int32_t  pad_t_, pad_b_, pad_l_, pad_r_;
-    Blob<uint32_t> max_idx_;
-    bool global_pooling_;
-
-    PERFORMANCE_EVENT_ID_DECL(perf_id_fw_);
-    PERFORMANCE_EVENT_ID_DECL(perf_id_bw_);
-};
-
-// =====  MKLDNNReLULayer =======================================
-template <typename Dtype>
-class MKLDNNReLULayer : public MKLDNNLayer<Dtype> , public NeuronLayer<Dtype>  {
-public:
-    /**
-    * @param param provides ReLUParameter relu_param,
-    *     with ReLULayer options:
-    *   - negative_slope (\b optional, default 0).
-    *     the value @f$ \nu @f$ by which negative values are multiplied.
-    */
-  explicit MKLDNNReLULayer(const LayerParameter& param)
-    : MKLDNNLayer<Dtype>(), NeuronLayer<Dtype>(param)
-    , fwd_top_data(), fwd_bottom_data()
-    , bwd_top_diff(), bwd_bottom_diff()
-    , reluFwd_pd(), reluBwd_pd()
-    , fwd_top_data_memory(), bwd_bottom_diff_memory()
-    , fwd_bottom_data_primitive(), bwd_top_diff_primitive()
-    , num_(0), width_(0), height_(0), channels_(0)
-  {
-    PERFORMANCE_EVENT_ID_RESET(perf_id_fw_);
-    PERFORMANCE_EVENT_ID_RESET(perf_id_bw_);
-  }
-  ~MKLDNNReLULayer() {}
-
-protected:
-    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual void Reshape(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual inline const char* type() const { return "ReLU"; }
-    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
-                                , const vector<Blob<Dtype>*>& bottom);
-    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
-                                , const vector<Blob<Dtype>*>& bottom);
-private:
-    void InitReLUFwd(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    void InitReLUBwd(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
-                                , const vector<Blob<Dtype>*>& bottom);
-
-    shared_ptr<MKLDNNData<Dtype> > fwd_top_data, fwd_bottom_data;
-    shared_ptr<MKLDNNDiff<Dtype> > bwd_top_diff, bwd_bottom_diff;
-    shared_ptr<relu_forward::primitive_desc> reluFwd_pd;
-    shared_ptr<relu_backward::primitive_desc> reluBwd_pd;
-    MKLDNNPrimitive<Dtype> reluFwd, reluBwd;
-    shared_ptr<memory> fwd_top_data_memory, bwd_bottom_diff_memory;
-    shared_ptr<primitive> fwd_bottom_data_primitive, bwd_top_diff_primitive;
-    int32_t num_, width_, height_, channels_;
-
-    PERFORMANCE_EVENT_ID_DECL(perf_id_fw_);
-    PERFORMANCE_EVENT_ID_DECL(perf_id_bw_);
-};
-
-// ===== MKLDNNConcatLayer ======================================
-template <typename Dtype>
-class MKLDNNConcatLayer : public MKLDNNLayer<Dtype> , public Layer<Dtype> {
-public:
-    explicit MKLDNNConcatLayer(const LayerParameter& param)
-            : MKLDNNLayer<Dtype>(), Layer<Dtype>(param),
-            concatFwd_pd(), fwd_output_memory(),
-            bwd_reorder_input_memory(), bwd_reorder_output_memory(),
-            fwd_top_data(), fwd_bottom_data(), split_channels() {
-              PERFORMANCE_EVENT_ID_RESET(perf_id_fw_);
-              PERFORMANCE_EVENT_ID_RESET(perf_id_bw_);
-    }
-protected:
-    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual void Reshape(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual inline const char* type() const { return "Concat"; }
-    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
-                                , const vector<Blob<Dtype>*>& bottom);
-    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
-                                , const vector<Blob<Dtype>*>& bottom);
-private:
-    void InitConcatFwd(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    void InitConcatBwd(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
-                                , const vector<Blob<Dtype>*>& bottom);
-
-    shared_ptr<concat::primitive_desc> concatFwd_pd;
-    shared_ptr<memory> fwd_output_memory;
-    shared_ptr<primitive> bwd_reorder_input_memory;
-    vector<shared_ptr<memory>> bwd_reorder_output_memory;
-    vector<shared_ptr<memory>> bwd_bottom_memory_;
-    vector<shared_ptr<primitive>> fwd_input_primitives_;
-    vector<primitive::at> fwd_input_primitives_at_;
-    MKLDNNPrimitive<Dtype> concatFwd;
-    shared_ptr<MKLDNNData<Dtype> > fwd_top_data;
-    vector<shared_ptr<MKLDNNData<Dtype> > > fwd_bottom_data;
-    shared_ptr<MKLDNNDiff<Dtype> > bwd_top_diff;
-    vector<shared_ptr<MKLDNNDiff<Dtype> > > bwd_bottom_diff;
-    vector<MKLDNNPrimitive<Dtype> > reorders;
-    vector<int> split_channels;
-
-    int32_t num_, width_, height_, channels_, num_concats_;
-    int concat_dimension;
-
-    PERFORMANCE_EVENT_ID_DECL(perf_id_fw_);
-    PERFORMANCE_EVENT_ID_DECL(perf_id_bw_);
-};
-
-// ===== MKLDNNSplitLayer ======================================
-template <typename Dtype>
-class MKLDNNSplitLayer : public MKLDNNLayer<Dtype> , public Layer<Dtype> {
-public:
-    explicit MKLDNNSplitLayer(const LayerParameter& param)
-            : MKLDNNLayer<Dtype>(), Layer<Dtype>(param),
-              splitBwd_pd_(), bwd_bottom_diff_memory_()
-            {
-              PERFORMANCE_EVENT_ID_RESET(perf_id_bw_);
-    }
-    ~MKLDNNSplitLayer();
-
-protected:
-    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual void Reshape(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual inline const char* type() const { return "Split"; }
-    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
-                                , const vector<Blob<Dtype>*>& bottom);
-    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
-                                , const vector<Blob<Dtype>*>& bottom);
-private:
-    void InitSplitFwd(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    void InitSplitBwd(const vector<Blob<Dtype>*>& top, const vector<Blob<Dtype>*>& bottom);
-
-  private:
-    vector<size_t> sizes_src_;
-    vector<size_t> strides_src_;
-    MKLDNNPrimitive<Dtype> splitBwd_;
-    shared_ptr<sum::primitive_desc> splitBwd_pd_;
-    shared_ptr<memory> bwd_bottom_diff_memory_;
-    shared_ptr<MKLDNNDiff<Dtype> > bwd_bottom_diff_;
-    vector<shared_ptr<primitive>> bwd_top_diff_primitives_;
-    vector<primitive::at> bwd_top_diffs_primitives_at_;
-    vector<shared_ptr<MKLDNNDiff<Dtype> > > bwd_top_diffs_;
-
-    PERFORMANCE_EVENT_ID_DECL(perf_id_bw_);
-};
-
-// =====  MKLDNNEltwiseLayer =======================================
-template <typename Dtype>
-class MKLDNNEltwiseLayer : public MKLDNNLayer<Dtype> , public Layer<Dtype>  {
-public:
-  explicit MKLDNNEltwiseLayer(const LayerParameter& param)
-    : MKLDNNLayer<Dtype>(), Layer<Dtype>(param)
-    , fwd_top_data(), fwd_bottom_data()
-    , eltwiseFwd_pd()
-    , fwd_top_data_memory()
-    , fwd_bottom_data_primitives_()
-    , num_(0), width_(0), height_(0), channels_(0)
-    , num_bottoms_(0)
-  {
-    PERFORMANCE_EVENT_ID_RESET(perf_id_fw_);
-  }
-  ~MKLDNNEltwiseLayer() {}
-
-protected:
-    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual void Reshape(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual inline const char* type() const { return "Eltwise"; }
-    virtual inline int MinBottomBlobs() const { return 2; }
-    virtual inline int ExactNumTopBlobs() const { return 1; }
-    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
-                                , const vector<Blob<Dtype>*>& bottom);
-    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
-                                , const vector<Blob<Dtype>*>& bottom);
-private:
-    void InitEltwiseFwd(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
-    void InitEltwiseBwd(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
-                                , const vector<Blob<Dtype>*>& bottom);
-   
-    shared_ptr<MKLDNNData<Dtype> > fwd_top_data;
-    vector<shared_ptr<MKLDNNData<Dtype> > > fwd_bottom_data;
-    shared_ptr<sum::primitive_desc> eltwiseFwd_pd;
-    MKLDNNPrimitive<Dtype> eltwiseFwd;
-
-    shared_ptr<memory> fwd_top_data_memory;
-    vector<shared_ptr<primitive>> fwd_bottom_data_primitives_;
-    vector<primitive::at> fwd_bottom_data_primitives_at_;
-
-    EltwiseParameter_EltwiseOp op_;
-    vector<Dtype> coeffs_;
-    Blob<int> max_idx_;
-    int32_t num_, width_, height_, channels_;
-    int32_t num_bottoms_;
-    bool stable_prod_grad_;
-
-    PERFORMANCE_EVENT_ID_DECL(perf_id_fw_);
-};
-
-
-}  // namespace caffe
-#endif  // #ifndef CAFFE_MKLDNN_LAYERS_HPP_
+/*
+All modification made by Intel Corporation: © 2016 Intel Corporation
+
+All contributions by the University of California:
+Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
+All rights reserved.
+
+All other contributions:
+Copyright (c) 2014, 2015, the respective contributors
+All rights reserved.
+For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md
+
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef CAFFE_MKLDNN_LAYERS_HPP_
+#define CAFFE_MKLDNN_LAYERS_HPP_
+
+#include <string>
+#include <vector>
+
+#include "boost/enable_shared_from_this.hpp"
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/engine_parser.hpp"
+#include "caffe/layers/base_conv_layer.hpp"
+#include "caffe/layers/conv_layer.hpp"
+#include "caffe/layers/inner_product_layer.hpp"
+#include "caffe/layers/neuron_layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+#include "caffe/mkldnn_memory.hpp"
+#include "mkldnn.hpp"
+
+#include "caffe/util/performance.hpp"
+
+using namespace mkldnn;
+
+namespace caffe {
+
+// =====  MKLDNNBatchNormLayer =======================================
+template <typename Dtype>
+class MKLDNNBatchNormLayer : public MKLDNNLayer<Dtype>, public Layer<Dtype> {
+public:
+    explicit MKLDNNBatchNormLayer(const LayerParameter& param)
+        : Layer<Dtype>(param)
+        , fwd_top_data(), fwd_bottom_data()
+        , bwd_top_diff(), bwd_bottom_diff()
+        , BatchNormFwd_pd(), BatchNormBwd_pd()
+        , mean_memory(), variance_memory()
+        , scaleshift_memory(), bwd_scaleshift_diff_memory()
+        , output_memory(), bwd_bottom_diff_memory()
+        , input_primitive(), bwd_top_diff_primitive()
+        {
+          PERFORMANCE_EVENT_ID_RESET(perf_id_fw_);
+          PERFORMANCE_EVENT_ID_RESET(perf_id_bw_);
+    }
+    ~MKLDNNBatchNormLayer() {}
+
+protected:
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual inline const char* type() const { return "BatchNorm"; }
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
+                                , const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
+                                , const vector<Blob<Dtype>*>& bottom);
+private:
+    void InitBatchNorm(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    void InitBatchNormBwd(const vector<Blob<Dtype>*>& top,
+            const vector<bool>& propagate_down,
+            const vector<Blob<Dtype>*>& bottom);
+    shared_ptr<MKLDNNData<Dtype> > fwd_top_data, fwd_bottom_data;
+    shared_ptr<MKLDNNDiff<Dtype> > bwd_top_diff, bwd_bottom_diff;
+    shared_ptr<batch_normalization_forward::primitive_desc> BatchNormFwd_pd;
+    shared_ptr<batch_normalization_backward::primitive_desc> BatchNormBwd_pd;
+
+    MKLDNNPrimitive<Dtype> BatchNormFwd, BatchNormBwd;
+    shared_ptr<memory> mean_memory, variance_memory;
+
+    shared_ptr<memory> scaleshift_memory, bwd_scaleshift_diff_memory;
+    shared_ptr<memory> output_memory, bwd_bottom_diff_memory;
+
+    shared_ptr<primitive> input_primitive, bwd_top_diff_primitive;
+
+    int32_t num_, width_, height_, channels_;
+    Dtype eps_, moving_average_fraction_;
+    bool use_weight_bias_, bias_term_, use_global_stats_;
+
+    PERFORMANCE_EVENT_ID_DECL(perf_id_fw_);
+    PERFORMANCE_EVENT_ID_DECL(perf_id_bw_);
+};
+
+// =====  MKLDNNConvolutionLayer =======================================
+template <typename Dtype>
+class MKLDNNConvolutionLayer : public MKLDNNLayer<Dtype> , public ConvolutionLayer<Dtype> {
+public:
+    explicit MKLDNNConvolutionLayer(const LayerParameter& param);
+    virtual ~MKLDNNConvolutionLayer() {}
+
+    //For test the parameters of kernel/stride/pad
+    int GetKernelWidth()  { return kernel_w_; }
+    int GetKernelHeight() { return kernel_h_; }
+    int GetStrideWidth()  { return stride_w_; }
+    int GetStrideHeight() { return stride_h_; }
+    int GetPadWidth()     { return pad_w_; }
+    int GetPadHeight()    { return pad_h_; }
+protected:
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
+                                , const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
+                                , const vector<Blob<Dtype>*>& bottom);
+    // Customized methods
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    void Reshape(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+private:
+    virtual void compute_output_shape();
+    virtual void init_properties(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    void InitConvolutionFwd(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    void InitConvolutionBwd(const vector<Blob<Dtype>*>& top
+                        , const vector<bool>& propagate_down
+                        , const vector<Blob<Dtype>*>& bottom);
+
+    shared_ptr<MKLDNNData<Dtype> > fwd_bottom_data, fwd_top_data, fwd_weights_data, fwd_bias_data
+                    , bwdd_weights_data, bwdw_bottom_data;
+    shared_ptr<MKLDNNDiff<Dtype> > bwdd_bottom_diff, bwdd_top_diff
+                    , bwdw_top_diff, bwdw_weights_diff, bwdw_bias_diff;
+    shared_ptr<convolution_forward::primitive_desc> convFwd_pd;
+    shared_ptr<convolution_backward_data::primitive_desc> convBwdData_pd;
+    shared_ptr<convolution_backward_weights::primitive_desc> convBwdWeights_pd;
+    MKLDNNPrimitive<Dtype> convFwd, convBwdData, convBwdWeights;
+    shared_ptr<memory> fwd_top_data_memory, bwdd_bottom_diff_memory
+                    , bwdw_weights_diff_memory,  bwdw_bias_diff_memory;
+    shared_ptr<primitive> fwd_bottom_data_primitive, fwd_weights_data_primitive, fwd_bias_data_primitive
+                    , bwdd_top_diff_primitive, bwdd_weights_data_primitive
+                    , bwdw_top_diff_primitive, bwdw_bottom_data_primitive;
+    int32_t width_, height_, width_out_, height_out_, kernel_w_, kernel_h_, stride_w_, stride_h_;
+    int  pad_w_, pad_h_;
+
+    PERFORMANCE_EVENT_ID_DECL(perf_id_fw_);
+    PERFORMANCE_EVENT_ID_DECL(perf_id_bw_);
+    PERFORMANCE_EVENT_ID_DECL(perf_id_bw_weights_);
+};
+
+// =====  MKLDNNInnerProductLayer =======================================
+template <typename Dtype>
+class MKLDNNInnerProductLayer : public MKLDNNLayer<Dtype> , public InnerProductLayer<Dtype>  {
+public:
+    explicit MKLDNNInnerProductLayer(const LayerParameter& param);
+    virtual ~MKLDNNInnerProductLayer();
+protected:
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
+                                , const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
+                                , const vector<Blob<Dtype>*>& bottom);
+    // Customized methods
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    void Reshape(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+private:
+    void InitInnerProductFwd(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    void InitInnerProductBwd(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
+                                , const vector<Blob<Dtype>*>& bottom);
+
+    shared_ptr<MKLDNNData<Dtype> > fwd_bottom_data, fwd_top_data, fwd_weights_data, fwd_bias_data
+                    , bwdd_weights_data, bwdw_bottom_data;
+    shared_ptr<MKLDNNDiff<Dtype> > bwdd_bottom_diff, bwdd_top_diff
+                    , bwdw_top_diff, bwdw_weights_diff, bwdw_bias_diff;
+    shared_ptr<inner_product_forward::primitive_desc> ipFwd_pd;
+    shared_ptr<inner_product_backward_data::primitive_desc> ipBwdData_pd;
+    shared_ptr<inner_product_backward_weights::primitive_desc> ipBwdWeights_pd;
+
+    MKLDNNPrimitive<Dtype> ipFwd, ipBwdData, ipBwdWeights;
+    shared_ptr<memory> fwd_top_data_memory, bwdd_bottom_diff_memory
+                    , bwdw_weights_diff_memory,  bwdw_bias_diff_memory;
+    shared_ptr<primitive> fwd_bottom_data_primitive, fwd_weights_data_primitive, fwd_bias_data_primitive
+                    , bwdd_top_diff_primitive, bwdd_weights_data_primitive
+                    , bwdw_top_diff_primitive, bwdw_bottom_data_primitive;
+    int32_t w_, h_;
+
+    PERFORMANCE_EVENT_ID_DECL(perf_id_fw_);
+    PERFORMANCE_EVENT_ID_DECL(perf_id_bw_);
+    PERFORMANCE_EVENT_ID_DECL(perf_id_bw_weights_);
+};
+
+
+/**
+ * @brief Normalize the input in a local region across feature maps.
+ */
+
+// =====  MKLDNNLRNLayer =======================================
+template <typename Dtype>
+class MKLDNNLRNLayer : public MKLDNNLayer<Dtype> , public Layer<Dtype>  {
+public:
+    explicit MKLDNNLRNLayer(const LayerParameter& param);
+    virtual ~MKLDNNLRNLayer() {}
+protected:
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
+                                , const vector<Blob<Dtype>*>& bottom);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
+                                , const vector<Blob<Dtype>*>& bottom);
+
+    virtual inline const char* type() const { return "LRN"; }
+    virtual inline int ExactNumBottomBlobs() const { return 1; }
+    virtual inline int ExactNumTopBlobs() const { return 1; }
+private:
+    void InitLRNFwd(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    void InitLRNBwd(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
+                                , const vector<Blob<Dtype>*>& bottom);
+
+    shared_ptr<MKLDNNData<Dtype> > fwd_top_data, fwd_bottom_data;
+    shared_ptr<MKLDNNDiff<Dtype> > bwd_top_diff, bwd_bottom_diff;
+    shared_ptr<lrn_forward::primitive_desc> lrnFwd_pd;
+    shared_ptr<lrn_backward::primitive_desc> lrnBwd_pd;
+    MKLDNNPrimitive<Dtype> lrnFwd;
+    MKLDNNPrimitive<Dtype> lrnBwd;
+    shared_ptr<memory::desc> bottom_md;
+    shared_ptr<memory> fwd_top_data_memory, bwd_bottom_diff_memory, scratch_memory;
+    shared_ptr<primitive> fwd_bottom_data_primitive, bwd_top_diff_primitive;
+    Dtype alpha_, beta_, k_;
+    int size_, num_, width_, height_, channels_;
+
+    PERFORMANCE_EVENT_ID_DECL(perf_id_fw_);
+    PERFORMANCE_EVENT_ID_DECL(perf_id_bw_);
+};
+
+// ===== MKLDNNPoolingLayer =======================================
+template <typename Dtype>
+class MKLDNNPoolingLayer : public MKLDNNLayer<Dtype>, public Layer<Dtype>  {
+public:
+    explicit MKLDNNPoolingLayer(const LayerParameter& param)
+            : MKLDNNLayer<Dtype>(), Layer<Dtype>(param)
+            , fwd_bottom_data(), fwd_top_data()
+            , bwd_top_diff(), bwd_bottom_diff()
+            , poolingFwd_pd()
+            , poolingBwd_pd()
+            , indices_pd()
+            , indices_memory(), fwd_top_data_memory(), bwd_bottom_diff_memory()
+            , fwd_bottom_data_primitive(), bwd_top_diff_primitive()
+            , num_(0), channels_(0), width_(0), height_(0), width_out_(0), height_out_(0)
+            , kernel_w_(0), kernel_h_(0), stride_w_(0), stride_h_(0)
+            , pad_t_(0),pad_b_(0), pad_l_(0), pad_r_(0)
+            , global_pooling_(false)
+            {
+              PERFORMANCE_EVENT_ID_RESET(perf_id_fw_);
+              PERFORMANCE_EVENT_ID_RESET(perf_id_bw_);
+            }
+    ~MKLDNNPoolingLayer() {}
+protected:
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+
+    virtual inline const char* type() const { return "Pooling"; }
+    virtual inline int ExactNumBottomBlobs() const { return 1; }
+    virtual inline int MinTopBlobs() const { return 1; }
+    // MAX POOL layers can output an extra top blob for the mask;
+    // others can only output the pooled inputs.
+    virtual inline int MaxTopBlobs() const {
+        return (this->layer_param_.pooling_param().pool() == PoolingParameter_PoolMethod_MAX) ? 2 : 1;
+    }
+protected:
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,const vector<bool>& propagate_down
+                                ,const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
+                                ,const vector<Blob<Dtype>*>& bottom);
+
+private:
+    void InitPoolingFwd(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    void InitPoolingBwd(const vector<Blob<Dtype>*>& bottom
+                        , const vector<bool>& propagate_down
+                        , const vector<Blob<Dtype>*>& top);
+  
+    shared_ptr<MKLDNNData<Dtype>> fwd_bottom_data, fwd_top_data;
+    shared_ptr<MKLDNNDiff<Dtype>> bwd_top_diff, bwd_bottom_diff;
+    shared_ptr<pooling_forward::primitive_desc> poolingFwd_pd;
+    shared_ptr<pooling_backward::primitive_desc> poolingBwd_pd;
+    MKLDNNPrimitive<Dtype> poolingFwd, poolingBwd;
+    shared_ptr<memory::primitive_desc> indices_pd;
+    shared_ptr<memory> indices_memory, fwd_top_data_memory, bwd_bottom_diff_memory;
+    shared_ptr<primitive> fwd_bottom_data_primitive, bwd_top_diff_primitive;
+    int32_t num_, channels_, width_, height_, width_out_, height_out_;
+    int32_t kernel_w_, kernel_h_, stride_w_, stride_h_;
+    int32_t  pad_t_, pad_b_, pad_l_, pad_r_;
+    Blob<uint32_t> max_idx_;
+    bool global_pooling_;
+
+    PERFORMANCE_EVENT_ID_DECL(perf_id_fw_);
+    PERFORMANCE_EVENT_ID_DECL(perf_id_bw_);
+};
+
+// =====  MKLDNNReLULayer =======================================
+template <typename Dtype>
+class MKLDNNReLULayer : public MKLDNNLayer<Dtype> , public NeuronLayer<Dtype>  {
+public:
+    /**
+    * @param param provides ReLUParameter relu_param,
+    *     with ReLULayer options:
+    *   - negative_slope (\b optional, default 0).
+    *     the value @f$ \nu @f$ by which negative values are multiplied.
+    */
+  explicit MKLDNNReLULayer(const LayerParameter& param)
+    : MKLDNNLayer<Dtype>(), NeuronLayer<Dtype>(param)
+    , fwd_top_data(), fwd_bottom_data()
+    , bwd_top_diff(), bwd_bottom_diff()
+    , reluFwd_pd(), reluBwd_pd()
+    , fwd_top_data_memory(), bwd_bottom_diff_memory()
+    , fwd_bottom_data_primitive(), bwd_top_diff_primitive()
+    , num_(0), width_(0), height_(0), channels_(0)
+  {
+    PERFORMANCE_EVENT_ID_RESET(perf_id_fw_);
+    PERFORMANCE_EVENT_ID_RESET(perf_id_bw_);
+  }
+  ~MKLDNNReLULayer() {}
+
+protected:
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual inline const char* type() const { return "ReLU"; }
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
+                                , const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
+                                , const vector<Blob<Dtype>*>& bottom);
+private:
+    void InitReLUFwd(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    void InitReLUBwd(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
+                                , const vector<Blob<Dtype>*>& bottom);
+
+    shared_ptr<MKLDNNData<Dtype> > fwd_top_data, fwd_bottom_data;
+    shared_ptr<MKLDNNDiff<Dtype> > bwd_top_diff, bwd_bottom_diff;
+    shared_ptr<relu_forward::primitive_desc> reluFwd_pd;
+    shared_ptr<relu_backward::primitive_desc> reluBwd_pd;
+    MKLDNNPrimitive<Dtype> reluFwd, reluBwd;
+    shared_ptr<memory> fwd_top_data_memory, bwd_bottom_diff_memory;
+    shared_ptr<primitive> fwd_bottom_data_primitive, bwd_top_diff_primitive;
+    int32_t num_, width_, height_, channels_;
+
+    PERFORMANCE_EVENT_ID_DECL(perf_id_fw_);
+    PERFORMANCE_EVENT_ID_DECL(perf_id_bw_);
+};
+
+// ===== MKLDNNConcatLayer ======================================
+template <typename Dtype>
+class MKLDNNConcatLayer : public MKLDNNLayer<Dtype> , public Layer<Dtype> {
+public:
+    explicit MKLDNNConcatLayer(const LayerParameter& param)
+            : MKLDNNLayer<Dtype>(), Layer<Dtype>(param),
+            concatFwd_pd(), fwd_output_memory(),
+            bwd_reorder_input_memory(), bwd_reorder_output_memory(),
+            fwd_top_data(), fwd_bottom_data(), split_channels() {
+              PERFORMANCE_EVENT_ID_RESET(perf_id_fw_);
+              PERFORMANCE_EVENT_ID_RESET(perf_id_bw_);
+    }
+protected:
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual inline const char* type() const { return "Concat"; }
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
+                                , const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
+                                , const vector<Blob<Dtype>*>& bottom);
+private:
+    void InitConcatFwd(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    void InitConcatBwd(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
+                                , const vector<Blob<Dtype>*>& bottom);
+
+    shared_ptr<concat::primitive_desc> concatFwd_pd;
+    shared_ptr<memory> fwd_output_memory;
+    shared_ptr<primitive> bwd_reorder_input_memory;
+    vector<shared_ptr<memory>> bwd_reorder_output_memory;
+    vector<shared_ptr<memory>> bwd_bottom_memory_;
+    vector<shared_ptr<primitive>> fwd_input_primitives_;
+    vector<primitive::at> fwd_input_primitives_at_;
+    MKLDNNPrimitive<Dtype> concatFwd;
+    shared_ptr<MKLDNNData<Dtype> > fwd_top_data;
+    vector<shared_ptr<MKLDNNData<Dtype> > > fwd_bottom_data;
+    shared_ptr<MKLDNNDiff<Dtype> > bwd_top_diff;
+    vector<shared_ptr<MKLDNNDiff<Dtype> > > bwd_bottom_diff;
+    vector<MKLDNNPrimitive<Dtype> > reorders;
+    vector<int> split_channels;
+
+    int32_t num_, width_, height_, channels_, num_concats_;
+    int concat_dimension;
+
+    PERFORMANCE_EVENT_ID_DECL(perf_id_fw_);
+    PERFORMANCE_EVENT_ID_DECL(perf_id_bw_);
+};
+
+// ===== MKLDNNSplitLayer ======================================
+template <typename Dtype>
+class MKLDNNSplitLayer : public MKLDNNLayer<Dtype> , public Layer<Dtype> {
+public:
+    explicit MKLDNNSplitLayer(const LayerParameter& param)
+            : MKLDNNLayer<Dtype>(), Layer<Dtype>(param),
+              splitBwd_pd_(), bwd_bottom_diff_memory_()
+            {
+              PERFORMANCE_EVENT_ID_RESET(perf_id_bw_);
+    }
+    ~MKLDNNSplitLayer();
+
+protected:
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual inline const char* type() const { return "Split"; }
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
+                                , const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
+                                , const vector<Blob<Dtype>*>& bottom);
+private:
+    void InitSplitFwd(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    void InitSplitBwd(const vector<Blob<Dtype>*>& top, const vector<Blob<Dtype>*>& bottom);
+
+  private:
+    vector<size_t> sizes_src_;
+    vector<size_t> strides_src_;
+    MKLDNNPrimitive<Dtype> splitBwd_;
+    shared_ptr<sum::primitive_desc> splitBwd_pd_;
+    shared_ptr<memory> bwd_bottom_diff_memory_;
+    shared_ptr<MKLDNNDiff<Dtype> > bwd_bottom_diff_;
+    vector<shared_ptr<primitive>> bwd_top_diff_primitives_;
+    vector<primitive::at> bwd_top_diffs_primitives_at_;
+    vector<shared_ptr<MKLDNNDiff<Dtype> > > bwd_top_diffs_;
+
+    PERFORMANCE_EVENT_ID_DECL(perf_id_bw_);
+};
+
+// =====  MKLDNNEltwiseLayer =======================================
+template <typename Dtype>
+class MKLDNNEltwiseLayer : public MKLDNNLayer<Dtype> , public Layer<Dtype>  {
+public:
+  explicit MKLDNNEltwiseLayer(const LayerParameter& param)
+    : MKLDNNLayer<Dtype>(), Layer<Dtype>(param)
+    , fwd_top_data(), fwd_bottom_data()
+    , eltwiseFwd_pd()
+    , fwd_top_data_memory()
+    , fwd_bottom_data_primitives_()
+    , num_(0), width_(0), height_(0), channels_(0)
+    , num_bottoms_(0)
+  {
+    PERFORMANCE_EVENT_ID_RESET(perf_id_fw_);
+  }
+  ~MKLDNNEltwiseLayer() {}
+
+protected:
+    virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual void Reshape(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual inline const char* type() const { return "Eltwise"; }
+    virtual inline int MinBottomBlobs() const { return 2; }
+    virtual inline int ExactNumTopBlobs() const { return 1; }
+    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
+                                , const vector<Blob<Dtype>*>& bottom);
+    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
+                                , const vector<Blob<Dtype>*>& bottom);
+private:
+    void InitEltwiseFwd(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+    void InitEltwiseBwd(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down
+                                , const vector<Blob<Dtype>*>& bottom);
+   
+    shared_ptr<MKLDNNData<Dtype> > fwd_top_data;
+    vector<shared_ptr<MKLDNNData<Dtype> > > fwd_bottom_data;
+    shared_ptr<sum::primitive_desc> eltwiseFwd_pd;
+    MKLDNNPrimitive<Dtype> eltwiseFwd;
+
+    shared_ptr<memory> fwd_top_data_memory;
+    vector<shared_ptr<primitive>> fwd_bottom_data_primitives_;
+    vector<primitive::at> fwd_bottom_data_primitives_at_;
+
+    EltwiseParameter_EltwiseOp op_;
+    vector<Dtype> coeffs_;
+    Blob<int> max_idx_;
+    int32_t num_, width_, height_, channels_;
+    int32_t num_bottoms_;
+    bool stable_prod_grad_;
+
+    PERFORMANCE_EVENT_ID_DECL(perf_id_fw_);
+};
+
+
+}  // namespace caffe
+#endif  // #ifndef CAFFE_MKLDNN_LAYERS_HPP_
diff --git a/include/caffe/layers/softmax_loss_layer.hpp b/include/caffe/layers/softmax_loss_layer.hpp
index de5fc3dc7..741d31f1d 100644
--- a/include/caffe/layers/softmax_loss_layer.hpp
+++ b/include/caffe/layers/softmax_loss_layer.hpp
@@ -100,6 +100,9 @@ class SoftmaxWithLossLayer : public LossLayer<Dtype> {
   virtual inline int MinTopBlobs() const { return 1; }
   virtual inline int MaxTopBlobs() const { return 2; }
 
+  virtual inline int ExactNumBottomBlobs() const { return -1; }
+  virtual inline int MinBottomBlobs() const { return 2; }
+  virtual inline int MaxBottomBlobs() const { return 3; }
  protected:
   virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
diff --git a/include/caffe/mkldnn_base.hpp b/include/caffe/mkldnn_base.hpp
index 1cc8923ac..f68d590d4 100644
--- a/include/caffe/mkldnn_base.hpp
+++ b/include/caffe/mkldnn_base.hpp
@@ -203,6 +203,10 @@ template <typename Dtype>
 class MKLDNNPrimitive {
 public:
     explicit MKLDNNPrimitive():aprimitive(), mkldnn_stream() {}
+
+    //API for initializing with shared_ptr<primitive>
+    MKLDNNPrimitive(shared_ptr<primitive> aprimitive_input) {this->aprimitive = aprimitive_input;}
+
     virtual ~MKLDNNPrimitive() {}
     void reset(primitive* pprimitive) { this->aprimitive.reset(pprimitive);}
     shared_ptr<primitive> aprimitive;
diff --git a/include/caffe/util/compareToolUtilities.h b/include/caffe/util/compareToolUtilities.h
index ab1ee877d..754890b0d 100644
--- a/include/caffe/util/compareToolUtilities.h
+++ b/include/caffe/util/compareToolUtilities.h
@@ -372,13 +372,16 @@ int collectAndCheckLayerData(bool collect_step,
         }
 
         if (bottom_need_backward[i].size() > 0 && bottom_need_backward[i][0]) {
-            getFileName(file_name, false, "FwrdBtmDat", i);
-            checkData(file_name, bottom_vecs[i][0]->cpu_data(),
-                layers[i]->type(), output_dir,
-                &erronous_layers);
-            checkAllNans(bottom_vecs[i][0]->cpu_diff(),
-                bottom_vecs[i][0]->count(), "bottom.diff",
-                layers[i]->type(), &erronous_layers);
+          // We check data only for out-of-place computations
+          if (bottom_vecs[i][0] != top_vecs[i][0]) {
+              getFileName(file_name, false, "FwrdBtmDat", i);
+              checkData(file_name, bottom_vecs[i][0]->cpu_data(),
+                  layers[i]->type(), output_dir,
+                  &erronous_layers);
+          }
+          checkAllNans(bottom_vecs[i][0]->cpu_diff(),
+              bottom_vecs[i][0]->count(), "bottom.diff",
+              layers[i]->type(), &erronous_layers);
         }
 
         checkAllNans(top_vecs[i][0]->cpu_diff(),
diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp
old mode 100644
new mode 100755
index 8c3a2566c..1f2b567e3
--- a/include/caffe/util/im2col.hpp
+++ b/include/caffe/util/im2col.hpp
@@ -53,6 +53,13 @@ void im2col_cpu(const Dtype* data_im, const int channels,
     const int stride_w, const int dilation_h, const int dilation_w,
     Dtype* data_col);
 
+template <typename Dtype>
+void im3d2col_cpu(const Dtype* data_im, const int channels,
+    const int depth, const int height, const int width, const int kernel_d, const int kernel_h, const int kernel_w,
+    const int pad_d, const int pad_h, const int pad_w, const int stride_d, const int stride_h,
+    const int stride_w, const int dilation_d, const int dilation_h, const int dilation_w,
+    Dtype* data_col);
+
 template <typename Dtype>
 void col2im_nd_cpu(const Dtype* data_col, const int num_spatial_axes,
     const int* im_shape, const int* col_shape,
@@ -66,6 +73,13 @@ void col2im_cpu(const Dtype* data_col, const int channels,
     const int stride_w, const int dilation_h, const int dilation_w,
     Dtype* data_im);
 
+template <typename Dtype>
+void col2im3d_cpu(const Dtype* data_col, const int channels,
+    const int depth, const int height, const int width, const int kernel_d, const int kernel_h, const int kernel_w,
+    const int pad_d, const int pad_h, const int pad_w, const int stride_d, const int stride_h,
+    const int stride_w, const int dilation_d, const int dilation_h, const int dilation_w,
+    Dtype* data_im);
+
 template <typename Dtype>
 void im2col_nd_gpu(const Dtype* data_im, const int num_spatial_axes,
     const int col_size, const int* im_shape, const int* col_shape,
diff --git a/models/intel_optimized_models/resnet_50/solver.prototxt b/models/intel_optimized_models/resnet_50/solver.prototxt
index a18d6e572..4574a306f 100644
--- a/models/intel_optimized_models/resnet_50/solver.prototxt
+++ b/models/intel_optimized_models/resnet_50/solver.prototxt
@@ -1,5 +1,5 @@
 #This solver is described by Computer Vision Group Jena (CVGJ) in [ImageNet pre-trained models with batch normalization] (https://arxiv.org/pdf/1612.01452.pdf)
-net: "train_val.prototxt"
+net: "models/intel_optimized_models/resnet_50/train_val.prototxt"
 test_iter: 5000
 test_interval: 15000
 base_lr: 0.1
@@ -11,6 +11,7 @@ power: 1
 momentum: 0.9
 weight_decay: 0.0001
 snapshot: 30000
-snapshot_prefix: "caffe-resnet50"
+snapshot_prefix: "models/intel_optimized_models/resnet_50/caffe-resnet50"
 test_initialization: false
 solver_mode: CPU
+
diff --git a/models/intel_optimized_models/resnet_50/train_val.prototxt b/models/intel_optimized_models/resnet_50/train_val.prototxt
index 09cd2b99e..6aadf5ca5 100644
--- a/models/intel_optimized_models/resnet_50/train_val.prototxt
+++ b/models/intel_optimized_models/resnet_50/train_val.prototxt
@@ -22,7 +22,7 @@ transform_param {
    mean_value: 123
 }
   data_param {
-    source: "/data/compressed_lmdb/ilsvrc12_train_lmdb"
+    source: "examples/imagenet/ilsvrc12_train_lmdb"
     batch_size: 128
     backend: LMDB
     shuffle: true
@@ -46,7 +46,7 @@ transform_param {
    mean_value: 123
 }
   data_param {
-    source: "/data/compressed_lmdb/ilsvrc12_val_lmdb/"
+    source: "examples/imagenet/ilsvrc12_val_lmdb/"
     batch_size: 10
     backend: LMDB
   }
diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp
index e84fe9553..8a169864b 100644
--- a/src/caffe/layers/concat_layer.cpp
+++ b/src/caffe/layers/concat_layer.cpp
@@ -132,7 +132,7 @@ void ConcatLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
     if (propagate_down[i]) {
       Dtype* bottom_diff = bottom[i]->mutable_cpu_diff();
 #ifdef _OPENMP
-  #pragma omp parallel for
+  #pragma omp parallel for if(num_concats_ > 1)
 #endif
       for (int n = 0; n < num_concats_; ++n) {
         caffe_copy(bottom_concat_axis * concat_input_size_, top_diff +
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 36bba5e90..82efc2410 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -73,7 +73,9 @@ void ConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
     const Dtype* bottom_data = bottom[i]->cpu_data();
     Dtype* top_data = top[i]->mutable_cpu_data();
 #ifdef _OPENMP
-    #pragma omp parallel for num_threads(this->num_of_threads_)
+    #pragma omp parallel if(this->num_of_threads_ > 1) num_threads(this->num_of_threads_)
+    {
+      #pragma omp for
 #endif
       for (int n = 0; n < this->num_; ++n) {
         this->forward_cpu_gemm(bottom_data + n*this->bottom_dim_,
@@ -84,6 +86,9 @@ void ConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
           this->forward_cpu_bias(top_data + n * this->top_dim_, bias);
         }
       }
+#ifdef _OPENMP
+    }
+#endif
   }
 }
 
@@ -111,8 +116,10 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 
     if (this->param_propagate_down_[0]) {
 #ifdef _OPENMP
-      this->clear_weight_mt();
-      #pragma omp parallel num_threads(this->num_of_threads_)
+      if (this->num_of_threads_ > 1) {
+        this->clear_weight_mt();
+      }
+      #pragma omp parallel if(this->num_of_threads_ > 1) num_threads(this->num_of_threads_)
 #endif
       {
 #ifdef _OPENMP
@@ -125,20 +132,27 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
         }
 
 #ifdef _OPENMP
-        this->sum_weight_mt(weight_diff);
+        if (this->num_of_threads_ > 1) {
+          this->sum_weight_mt(weight_diff);
+        }
 #endif
       }
     }
 
     if (propagate_down[i]) {
 #ifdef _OPENMP
-      #pragma omp parallel for num_threads(this->num_of_threads_)
+      #pragma omp parallel if(this->num_of_threads_ > 1) num_threads(this->num_of_threads_)
+      {
+        #pragma omp for
 #endif
         for (int n = 0; n < this->num_; ++n) {
           // gradient w.r.t. bottom data, if necessary.
           this->backward_cpu_gemm(top_diff + n * this->top_dim_, weight,
               bottom_diff + n * this->bottom_dim_);
         }
+#ifdef _OPENMP
+      }
+#endif
     }
   }
 }
diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp
index c23c583de..a2bf24333 100644
--- a/src/caffe/layers/dropout_layer.cpp
+++ b/src/caffe/layers/dropout_layer.cpp
@@ -69,6 +69,15 @@ void DropoutLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
   const Dtype* bottom_data = bottom[0]->cpu_data();
   Dtype* top_data = top[0]->mutable_cpu_data();
+  // below line designated to set correspondent SyncedMemory->_head to HEAD_AT_CPU
+  // Fix the issue of "Check failed: this->_cpu_ptr == cpu_ptr (0 vs. 0x5587dfc87ec0)" (GoogleNet V1)
+  // The reason is after pooling layer: MKLDNNPoolingLayer<Dtype>::Forward_cpu: pool5/7x7_s1, the top[0]->prv_data() has value
+  // It will convert to cpu data in the dropout layer, and set the _head to HEAD_AT_CPU after executing top[0]->mutable_cpu_data()
+  // Howerver, I found top[0]->cpu_data() and top[0]->prv_data() both has value
+  // So in the inner product layer: loss3/classifier, the data will convert from bottom prv data
+  // and the reorder will change from this->_reorder_usr2prv to this->_reorder_extprv2prv_pd
+  // So eventually trigger the assertion.
+  top[0]->set_prv_data_descriptor(NULL);
   unsigned int* mask = rand_vec_.mutable_cpu_data();
   const int count = bottom[0]->count();
   if (this->phase_ == TRAIN) {
diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp
index 725dfcc0f..6124bb494 100644
--- a/src/caffe/layers/hdf5_data_layer.cpp
+++ b/src/caffe/layers/hdf5_data_layer.cpp
@@ -98,10 +98,10 @@ void HDF5DataLayer<Dtype>::LoadHDF5FileData(const char* filename) {
   // Shuffle if needed.
   if (this->layer_param_.hdf5_data_param().shuffle()) {
     std::random_shuffle(data_permutation_.begin(), data_permutation_.end());
-    DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0)
+    DLOG(INFO) << "Successfully loaded " << hdf_blobs_[0]->shape(0)
                << " rows (shuffled)";
   } else {
-    DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0) << " rows";
+    DLOG(INFO) << "Successfully loaded " << hdf_blobs_[0]->shape(0) << " rows";
   }
 }
 
diff --git a/src/caffe/layers/mkl_batch_norm_layer.cpp b/src/caffe/layers/mkl_batch_norm_layer.cpp
old mode 100644
new mode 100755
index 79a3ceb3d..b2e86830f
--- a/src/caffe/layers/mkl_batch_norm_layer.cpp
+++ b/src/caffe/layers/mkl_batch_norm_layer.cpp
@@ -66,6 +66,7 @@ void MKLBatchNormLayer<Dtype>::Init(const vector<Blob<Dtype>*>& bottom,
   eps_ = this->layer_param_.batch_norm_param().eps();
   use_weight_bias_ = this->layer_param_.batch_norm_param().use_weight_bias();
   bias_term_ = this->layer_param_.batch_norm_param().bias_term();
+  use_global_stats_ = this->layer_param_.batch_norm_param().use_global_stats();
 
   CHECK(use_weight_bias_) << "BatchNorm without scaling have not supported yet";
 
@@ -111,6 +112,7 @@ void MKLBatchNormLayer<Dtype>::Init(const vector<Blob<Dtype>*>& bottom,
   dnnReleaseBuffer<Dtype>(variance_buffer_);
   dnnReleaseBuffer<Dtype>(scaleShift_buffer_);
   dnnReleaseBuffer<Dtype>(diffScaleShift_buffer_);
+
   // "Lazy" allocation because here we don't know
   // what layout is used by neighbours.
 
@@ -271,9 +273,15 @@ void MKLBatchNormLayer<Dtype>::Forward_cpu(
       bwd_top_diff   ->create_internal_layout(batchNormFwd, dnnResourceDst);
       bwd_bottom_diff->create_internal_layout(batchNormFwd, dnnResourceSrc);
 
-       e = dnnBatchNormalizationCreateBackward<Dtype>(
-        &batchNormBwd, NULL, mem_descr->layout_int, eps_, dnnUseScaleShift);
-      CHECK_EQ(e, E_SUCCESS);
+       if (!use_global_stats_) {
+         e = dnnBatchNormalizationCreateBackward<Dtype>(
+            &batchNormBwd, NULL, mem_descr->layout_int, eps_, dnnUseScaleShift);
+         CHECK_EQ(e, E_SUCCESS);
+       } else {
+         e = dnnBatchNormalizationCreateBackward<Dtype>(
+            &batchNormBwd, NULL, mem_descr->layout_int, eps_, dnnUseScaleShift | dnnUseInputMeanVariance);
+         CHECK_EQ(e, E_SUCCESS);
+       }
     }
   } else {
     DLOG(INFO) << "Using cpu_data in MKLBatchNormLayer.";
@@ -290,9 +298,15 @@ void MKLBatchNormLayer<Dtype>::Forward_cpu(
                                     dnnUseScaleShift | dnnUseInputMeanVariance);
       CHECK_EQ(e, E_SUCCESS);
 
-      e = dnnBatchNormalizationCreateBackward<Dtype>(
-        &batchNormBwd, NULL, layout_usr_, eps_, dnnUseScaleShift);
-      CHECK_EQ(e, E_SUCCESS);
+      if (!use_global_stats_) {
+        e = dnnBatchNormalizationCreateBackward<Dtype>(
+          &batchNormBwd, NULL, layout_usr_, eps_, dnnUseScaleShift);
+        CHECK_EQ(e, E_SUCCESS);
+      } else {
+        e = dnnBatchNormalizationCreateBackward<Dtype>(
+          &batchNormBwd, NULL, layout_usr_, eps_, dnnUseScaleShift | dnnUseInputMeanVariance);
+        CHECK_EQ(e, E_SUCCESS);
+      }
     }
     bottom_data =
       reinterpret_cast<void *>(const_cast<Dtype*>(bottom[0]->cpu_data()));
@@ -360,13 +374,13 @@ void MKLBatchNormLayer<Dtype>::Forward_cpu(
     // doing Backward
     // TODO: make a caffe_coppy working on blobs
     caffe_copy(amount_to_copy, static_cast<Dtype*>(bottom_data),
-                                                      temp_.mutable_cpu_data());
+               temp_.mutable_cpu_data());
   }
 
   if (use_global_stats_) {
     // use the stored mean/variance estimates.
     const Dtype scale_factor = this->blobs_[2]->cpu_data()[0] == 0 ?
-        0 : 1 / this->blobs_[2]->cpu_data()[0];
+                               0 : 1 / this->blobs_[2]->cpu_data()[0];
     caffe_cpu_scale(this->blobs_[0]->count(), scale_factor,
                     this->blobs_[0]->cpu_data(), mean_buffer_);
     caffe_cpu_scale(this->blobs_[1]->count(), scale_factor,
diff --git a/src/caffe/layers/mkldnn_batch_norm_layer.cpp b/src/caffe/layers/mkldnn_batch_norm_layer.cpp
index 90939c34b..b479f8828 100644
--- a/src/caffe/layers/mkldnn_batch_norm_layer.cpp
+++ b/src/caffe/layers/mkldnn_batch_norm_layer.cpp
@@ -130,7 +130,12 @@ void MKLDNNBatchNormLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom
     this->num_ = bottom[0]->num();
     this->channels_ = bottom[0]->channels();
 
-    top[0]->Reshape(this->num_, this->channels_, this->height_, this->width_);
+    //Fix: should reshape the top blob with the real size of bottom blob
+    //top[0]->Reshape(this->num_, this->channels_, this->height_, this->width_);
+#ifdef DEBUG
+    LOG(INFO) << "size of bottom blob: " << bottom[0]->shape().size();
+#endif
+    top[0]->ReshapeLike(*bottom[0]);
 }
 
 template <typename Dtype>
@@ -146,12 +151,13 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNorm(const vector<Blob<Dtype>*>& bott
     int32_t n  = this->num_;
     int32_t iw = this->width_;
     int32_t ih = this->height_;
-    int32_t ic = this->channels_;
+    int32_t ic = this->channels_;    
 
     bool bottom_data_is_prv = (const_cast<Dtype*>(bottom[0]->prv_data()) != NULL);
 
     engine cpu_engine = CpuEngine::Instance().get_engine();
     memory::data_type mpcsn = memory::data_type::f32;
+    
     // ---- Initialize memory descriptors -------------
     shared_ptr<memory::desc> input_md, output_md, scaleshift_md;
     shared_ptr<memory::primitive_desc> usr_mpd, prv_mpd, scaleshift_mpd;
@@ -162,7 +168,7 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNorm(const vector<Blob<Dtype>*>& bott
         usr_mpd = mem_descr->usr_memory_pd();
         prv_mpd = mem_descr->prv_memory_pd();
     } else {
-        input_md.reset(new memory::desc({{n, ic, ih, iw}}, mpcsn, memory::format::nchw));
+        input_md.reset(new memory::desc({{n, ic, ih, iw}}, mpcsn, memory::format::nchw));   //MKLDNN batch norm only support 4D memory descriptor!
         usr_mpd.reset(new memory::primitive_desc(*input_md, cpu_engine));
     }
     output_md = input_md;
@@ -242,6 +248,23 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNorm(const vector<Blob<Dtype>*>& bott
 
     fwd_bottom_data->set_mkldnn_primitive(BatchNormFwd);
     fwd_top_data->set_mkldnn_primitive(BatchNormFwd);
+
+    //Fix: MKLDNN batch norm only support 4D memory descriptor! Use 4D for calculation and reshape to 2D for output!
+    bool has_spatial = (bottom[0]->shape().size() != 2);
+#ifdef DEBUG
+    LOG(INFO) << "has_spatial flag value: " << has_spatial;
+#endif
+    if (has_spatial == false)
+    {
+#ifdef DEBUG
+        LOG(INFO) << "size of bottom blob: " << bottom[0]->shape().size();
+        LOG(INFO) << "MKLDNN batch norm only support 4D memory descriptor! Use 4D for calculation and reshape to 2D for output!";
+#endif
+        vector<int> top_shape;
+        top_shape.push_back(bottom[0]->num());
+        top_shape.push_back(bottom[0]->channels());
+        top[0]->Reshape(top_shape);
+    }
 }
 
 
@@ -250,8 +273,11 @@ void MKLDNNBatchNormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom
                                         ,const vector<Blob<Dtype>*>& top)
 {
     VLOG(1) << "MKLDNNBatchNormLayer<Dtype>::Forward_cpu: " << this->layer_param_.name();
+#ifdef DEBUG
+    LOG(INFO) << "MKLDNNBatchNormLayer<Dtype>::Forward_cpu: " << this->layer_param_.name();
+#endif
 
-    if( BatchNormFwd_pd == NULL)
+    if(BatchNormFwd_pd == NULL)
         InitBatchNorm(bottom, top);
     // making reorders if needed.
     fwd_bottom_data->sync_before_read();
@@ -323,8 +349,8 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNormBwd(
 
     engine cpu_engine = CpuEngine::Instance().get_engine();
     memory::data_type mpcsn = memory::data_type::f32;
-    // ---- Initialize memory descriptors -------------
 
+    // ---- Initialize memory descriptors -------------
     shared_ptr<memory::desc> top_diff_md, top_data_md;
     shared_ptr<memory::primitive_desc> usr_diff_mpd(NULL), prv_diff_mpd(NULL);
     if (top_diff_is_prv) {
@@ -334,7 +360,7 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNormBwd(
         usr_diff_mpd = mem_descr->usr_memory_pd();
         prv_diff_mpd = mem_descr->prv_memory_pd();
     } else {
-        top_diff_md.reset(new memory::desc({{n, c, h, w}}, mpcsn, memory::format::nchw));
+        top_diff_md.reset(new memory::desc({{n, c, h, w}}, mpcsn, memory::format::nchw));   //MKLDNN batch norm only support 4D memory descriptor!
         usr_diff_mpd.reset(new memory::primitive_desc(*top_diff_md, cpu_engine));
     }
 
@@ -392,10 +418,13 @@ template <typename Dtype>
 void MKLDNNBatchNormLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
         const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom)
 {
-    VLOG(1) << "MKLDNNBatchNormLayer<Dtype>::Backward_cpu: "
-        << this->layer_param_.name();
+    VLOG(1) << "MKLDNNBatchNormLayer<Dtype>::Backward_cpu: " << this->layer_param_.name();
+#ifdef DEBUG
+    LOG(INFO) << "MKLDNNBatchNormLayer<Dtype>::Backward_cpu: " << this->layer_param_.name();
+#endif
 
-    if (BatchNormBwd_pd == NULL) InitBatchNormBwd(top, propagate_down, bottom);
+    if (BatchNormBwd_pd == NULL)
+        InitBatchNormBwd(top, propagate_down, bottom);
     // making reorders if needed.
     bwd_top_diff->sync_before_read();
     // update bottom that head at prv
@@ -403,7 +432,38 @@ void MKLDNNBatchNormLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 
     PERFORMANCE_EVENT_ID_INIT(perf_id_bw_, PERFORMANCE_MKLDNN_NAME("BW"));
     PERFORMANCE_MEASUREMENT_BEGIN();
+#ifdef DEBUG
+    if (bottom[0]->prv_data() != NULL)
+    {
+        LOG(INFO) << "Debug: Bottom prv data: " << *bottom[0]->prv_data();
+    }
+    else
+    {
+        LOG(INFO) << "Debug: Bottom prv data is NULL!";
+    }
+
+    if (top[0]->prv_diff() != NULL)
+    {
+        LOG(INFO) << "Debug: Top prv diff: " << *top[0]->prv_diff();
+    }
+    else
+    {
+        LOG(INFO) << "Debug: Top prv diff is NULL!";
+        LOG(INFO) << "Debug: Top cpu diff: " << *top[0]->cpu_diff();
+    }
+#endif
     BatchNormBwd.submit();
+#ifdef DEBUG
+    if (bottom[0]->prv_diff() != NULL)
+    {
+        LOG(INFO) << "Debug: Bottom prv diff: " << *bottom[0]->prv_diff();
+    }
+    else
+    {
+        LOG(INFO) << "Debug: Bottom prv diff is NULL!";
+        LOG(INFO) << "Debug: Bottom cpu diff: " << *bottom[0]->cpu_diff();
+    }
+#endif
     PERFORMANCE_MEASUREMENT_END_ID(perf_id_bw_);
 
     /* FIXME: this wouldn't work with lazy stream */
diff --git a/src/caffe/layers/mkldnn_concat_layer.cpp b/src/caffe/layers/mkldnn_concat_layer.cpp
index ffdcf8edd..ee2cc5026 100644
--- a/src/caffe/layers/mkldnn_concat_layer.cpp
+++ b/src/caffe/layers/mkldnn_concat_layer.cpp
@@ -90,6 +90,31 @@ void MKLDNNConcatLayer<Dtype>::InitConcatFwd(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
   if (std::is_same<Dtype, double>::value)  NOT_IMPLEMENTED;
 
+  //Fix: MKLDNN concat layer should use 4D blob as input! Reshape the 2D input blob into 4D for calculation!
+  bool has_spatial = (bottom[0]->shape().size() != 2);
+#ifdef DEBUG
+  LOG(INFO) << "has_spatial flag value: " << has_spatial;
+#endif
+  if (has_spatial == false)
+  {
+#ifdef DEBUG
+      LOG(INFO) << "size of bottom blob: " << bottom[0]->shape().size();
+      LOG(INFO) << "size of top blob: " << top[0]->shape().size();
+      LOG(INFO) << "MKLDNN concat layer only support 4D blob as input! Reshape the 2D input blob into 4D for calculation!";
+#endif
+      vector<int> bottom_4D_shape;
+      int bottom_4D_height = 1;
+      int bottom_4D_width = 1;
+      bottom_4D_shape.push_back(bottom[0]->num());
+      bottom_4D_shape.push_back(bottom[0]->channels());
+      bottom_4D_shape.push_back(bottom_4D_height);
+      bottom_4D_shape.push_back(bottom_4D_width);
+      for (auto i = 0; i < num_concats_; i++)
+      {
+          bottom[i]->Reshape(bottom_4D_shape);
+      }      
+  }
+
   engine cpu_engine = CpuEngine::Instance().get_engine();
   memory::data_type data_type = memory::data_type::f32;
   // memory::format mfmt_any = memory::format::any;
@@ -222,7 +247,10 @@ void MKLDNNConcatLayer<Dtype>::InitConcatBwd(const vector<Blob<Dtype>*>& top,
 template <typename Dtype>
 void MKLDNNConcatLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
-  //VLOG(1) << "MKLDNNConcatLayer<Dtype>::Forward_cpu: " << this->layer_param_.name();
+  VLOG(1) << "MKLDNNConcatLayer<Dtype>::Forward_cpu: " << this->layer_param_.name();
+#ifdef DEBUG
+  LOG(INFO) << "MKLDNNConcatLayer<Dtype>::Forward_cpu: " << this->layer_param_.name();
+#endif
 
   if (NULL == concatFwd_pd)
     InitConcatFwd(bottom, top);
@@ -244,7 +272,11 @@ void MKLDNNConcatLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top
                                            ,const vector<bool>& propagate_down
                                            ,const vector<Blob<Dtype>*>& bottom)
 {
-  //VLOG(1) << "MKLDNNConcatLayer<Dtype>::Backward_cpu: " << this->layer_param_.name();
+  VLOG(1) << "MKLDNNConcatLayer<Dtype>::Backward_cpu: " << this->layer_param_.name();
+#ifdef DEBUG
+  LOG(INFO) << "MKLDNNConcatLayer<Dtype>::Backward_cpu: " << this->layer_param_.name();
+#endif
+
   if (reorders.size() == 0)
     InitConcatBwd(top, propagate_down, bottom);
   bwd_top_diff->sync_before_read();
@@ -255,7 +287,6 @@ void MKLDNNConcatLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top
     reorders[i].submit();
     PERFORMANCE_MEASUREMENT_END_ID(perf_id_bw_);
   }
-
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/mkldnn_convolution_layer.cpp b/src/caffe/layers/mkldnn_convolution_layer.cpp
index 02dbd36c8..fa8e7fc15 100644
--- a/src/caffe/layers/mkldnn_convolution_layer.cpp
+++ b/src/caffe/layers/mkldnn_convolution_layer.cpp
@@ -1,511 +1,574 @@
-/*
-All modification made by Intel Corporation: © 2016 Intel Corporation
-
-All contributions by the University of California:
-Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
-All rights reserved.
-
-All other contributions:
-Copyright (c) 2014, 2015, the respective contributors
-All rights reserved.
-For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md
-
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-      this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    * Neither the name of Intel Corporation nor the names of its contributors
-      may be used to endorse or promote products derived from this software
-      without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#ifdef MKLDNN_SUPPORTED
-#include <algorithm>
-#include <cstdlib>
-#include <vector>
-
-#include "caffe/filler.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/layers/mkldnn_layers.hpp"
-//#include "mkl_service.h"
-
-// TODO: Correct process case if there are no bias
-// TODO: Exception handling - mkl-dnn produces exceptions on errors
-
-namespace caffe {
-
-template <typename Dtype>
-MKLDNNConvolutionLayer<Dtype>::MKLDNNConvolutionLayer(const LayerParameter& param)
-            : MKLDNNLayer<Dtype>(), ConvolutionLayer<Dtype>(param)
-            , fwd_bottom_data(NULL), fwd_top_data(NULL), fwd_weights_data(NULL), fwd_bias_data(NULL)
-            , bwdd_weights_data(NULL), bwdw_bottom_data(NULL)
-            , bwdd_bottom_diff(NULL), bwdd_top_diff(NULL)
-            , bwdw_top_diff(NULL), bwdw_weights_diff(NULL), bwdw_bias_diff(NULL)
-            , convFwd_pd(NULL), convBwdData_pd(NULL), convBwdWeights_pd(NULL)
-            , fwd_top_data_memory(NULL), bwdd_bottom_diff_memory(NULL)
-            , bwdw_weights_diff_memory(NULL), bwdw_bias_diff_memory(NULL)
-            , fwd_bottom_data_primitive(NULL), fwd_weights_data_primitive(NULL), fwd_bias_data_primitive(NULL)
-            , bwdd_top_diff_primitive(NULL), bwdd_weights_data_primitive(NULL)
-            , bwdw_top_diff_primitive(NULL), bwdw_bottom_data_primitive(NULL)
-            , width_(0), height_(0), width_out_(0), height_out_(0), kernel_w_(0), kernel_h_(0)
-            , stride_w_(0), stride_h_(0), pad_w_(0), pad_h_(0)
-{
-  PERFORMANCE_EVENT_ID_RESET(perf_id_fw_);
-  PERFORMANCE_EVENT_ID_RESET(perf_id_bw_);
-  PERFORMANCE_EVENT_ID_RESET(perf_id_bw_weights_);
-}
-
-template <typename Dtype>
-void MKLDNNConvolutionLayer<Dtype>::compute_output_shape()
-{
-    ConvolutionLayer<Dtype>::compute_output_shape();
-    this->height_out_ = (this->height_ + 2 * this->pad_h_ - this->kernel_h_)
-        / this->stride_h_ + 1;
-    this->width_out_ = (this->width_ + 2 * this->pad_w_ - this->kernel_w_)
-        / this->stride_w_ + 1;
-}
-
-template <typename Dtype>
-void MKLDNNConvolutionLayer<Dtype>::init_properties(const vector<Blob<Dtype>*>& bottom
-                                                , const vector<Blob<Dtype>*>& top)
-{
-    this->stride_w_ = this->stride_.cpu_data()[0];
-    this->stride_h_ = this->stride_.cpu_data()[1];
-    this->width_ = bottom[0]->width();
-    this->height_ = bottom[0]->height();
-    this->pad_w_ = this->pad_.cpu_data()[0];
-    this->pad_h_ = this->pad_.cpu_data()[1];
-    this->kernel_w_ = this->kernel_shape_.cpu_data()[0];
-    this->kernel_h_  = this->kernel_shape_.cpu_data()[1];
-
-#ifdef USE_MLSL
-    if ((this->layerOp == nullptr) && (this->phase_ == TRAIN)) {
-      mn::OpRegInfo reg_info{ mn::train::get_session(), MLSL::OT_CC };
-      reg_info.set_name(this->layer_param_.name());
-      reg_info.add_parameter_set<Dtype>(this->channels_ * this->num_output_ / std::max(this->group_, 1), this->kernel_w_ * this->kernel_h_);
-      if (this->bias_term_) {
-        reg_info.add_parameter_set<Dtype>(this->num_output_, 1);
-      }
-      this->layerOp = mn::train::add_operation(reg_info);
-    }
-#endif /* USE_MLSL */
-}
-
-template <typename Dtype>
-void MKLDNNConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom
-                                            , const vector<Blob<Dtype>*>& top)
-{
-    VLOG(1) << "<< MKLDNNConvolutionLayer<Dtype>::LayerSetUp: " << this->layer_param_.name();
-    ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);
-    init_properties(bottom, top);
-    this->bottom_shape_ = &bottom[0]->shape();
-}
-
-template <typename Dtype>
-void MKLDNNConvolutionLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom
-                                            , const vector<Blob<Dtype>*>& top)
-{
-    VLOG(1) << " MKLDNNConvolutionLayer<Dtype>::Reshape: " << this->layer_param_.name();
-    BaseConvolutionLayer<Dtype>::ReshapeForMKL(bottom, top);
-    init_properties(bottom, top);
-}
-
-template <typename Dtype>
-void MKLDNNConvolutionLayer<Dtype>::InitConvolutionFwd(const vector<Blob<Dtype>*>& bottom
-                                                , const vector<Blob<Dtype>*>& top)
-{
-    if (std::is_same<Dtype, double>::value)   NOT_IMPLEMENTED;
-    auto propagation = this->phase_ == TEST ? prop_kind::forward_scoring : prop_kind::forward_training;
-    bool relu = this->layer_param_.convolution_param().relu();
-    Dtype negative_slope;
-    if(relu)
-    {
-        propagation = prop_kind::forward_inference;
-        negative_slope = this->layer_param_.relu_param().negative_slope();
-    }
-
-    int32_t g  = std::max(this->group_, 1);
-    int32_t n  = this->num_;
-    int32_t iw = this->width_;
-    int32_t ih = this->height_;
-    int32_t ic = this->channels_;
-
-    int32_t ow = this->width_out_;
-    int32_t oh = this->height_out_;
-    int32_t oc = this->num_output_;
-
-    int32_t kw = this->kernel_w_;
-    int32_t kh = this->kernel_h_;
-
-    memory::dims convolutionStrides {this->stride_h_, this->stride_w_};
-    memory::dims padding {this->pad_h_, this->pad_w_};
-
-    // ---- Initialize memory descriptors (fromat = any) to create convolution descriptor -------------
-    memory::data_type mpcsn = memory::data_type::f32;
-    memory::format mfmt_any = memory::format::any;
-
-    memory::dims bottom_tz = {n, ic, ih, iw};
-    memory::dims bias_tz = {oc};
-    memory::dims top_tz = {n, oc, oh, ow};
-    memory::dims weights_tz = ( g!= 1) ? memory::dims{g, oc/g, ic/g, kh, kw} : memory::dims{oc, ic, kh, kw};
-
-    // ---- Memory descriptors for initializing of convolution primitive descriptor -------------
-    memory::desc init_bottom_md({bottom_tz}, mpcsn, mfmt_any);
-    memory::desc init_bias_md({bias_tz}, mpcsn, mfmt_any);
-    memory::desc init_top_md({top_tz}, mpcsn, mfmt_any);
-    memory::desc init_weights_md({weights_tz}, mpcsn, mfmt_any);
-
-    // ---- Initialize convolution primitive descriptor -------------
-    shared_ptr<convolution_forward::desc> convFwd_desc;
-    if (this->bias_term_) {
-        convFwd_desc.reset(new convolution_forward::desc(propagation, algorithm::convolution_direct
-                                    , init_bottom_md, init_weights_md, init_bias_md, init_top_md
-                                    , convolutionStrides, padding, padding, padding_kind::zero));
-    } else {
-        convFwd_desc.reset(new convolution_forward::desc(propagation, algorithm::convolution_direct
-                                    , init_bottom_md, init_weights_md, init_top_md
-                                    , convolutionStrides, padding, padding, padding_kind::zero));
-    }
-    shared_ptr<convolution_relu_forward::desc> convReluFwd_desc;
-    if(relu) convReluFwd_desc.reset(new convolution_relu_forward::desc(*convFwd_desc, negative_slope));
-    // ---- Determining engine to use -----------------------
-    std::string subengines = this->layer_param_.engine();
-    if (subengines == "" || subengines == "MKLDNN")
-      subengines = "MKLDNN:CPU";
-    EngineParser ep(subengines);
-    unsigned subEngineIndex = 0;
-    shared_ptr<convolution_relu_forward::primitive_desc> convReluFwd_pd;
-    for(; subEngineIndex < ep.getNumberOfSubEngines(); subEngineIndex++) {
-      try {
-        convFwd_pd.reset(new convolution_forward::primitive_desc(*convFwd_desc,
-                ep.getMKLDNNSubEngine(subEngineIndex)));
-        if(relu) convReluFwd_pd.reset(new convolution_relu_forward::primitive_desc(*convReluFwd_desc,
-                ep.getMKLDNNSubEngine(subEngineIndex)));
-      }
-      catch(...) {
-        continue;
-      }
-      break;
-    }
-
-    CHECK(convFwd_pd);
-    engine cpu_engine = CpuEngine::Instance().get_engine();
-
-    // ---- Create priv memory primitive descriptors stored as class members -------------
-    typedef typename memory::primitive_desc MemPD; // short name for memory::primitive_desc
-
-    shared_ptr<MemPD> prv_fwd_bottom_data_memory_pd(new MemPD(convFwd_pd->src_primitive_desc()));
-    shared_ptr<MemPD> prv_fwd_top_data_memory_pd(new MemPD(convFwd_pd->dst_primitive_desc()));
-    shared_ptr<MemPD> prv_fwd_weights_data_memory_pd(new MemPD(convFwd_pd->weights_primitive_desc()));
-
-    // ---- Create usr memory primitive descriptors -------------
-    memory::format mfmt_nchw = memory::format::nchw;
-    memory::format weights_mfmt = ( g!= 1) ? memory::format::goihw : memory::format::oihw;
-
-    // TODO: There should not be a problem to use this for Backward as well
-    shared_ptr<MemPD> usr_bottom_data_memory_pd(new MemPD({{bottom_tz}, mpcsn, mfmt_nchw}, cpu_engine));
-    shared_ptr<MemPD> usr_bias_data_memory_pd(new MemPD({{bias_tz}, mpcsn, memory::format::x}, cpu_engine));
-    shared_ptr<MemPD> usr_top_data_memory_pd(new MemPD({{top_tz}, mpcsn, mfmt_nchw}, cpu_engine));
-    shared_ptr<MemPD> usr_weights_data_memory_pd(new MemPD({{weights_tz}, mpcsn, weights_mfmt}, cpu_engine));
-
-
-    // ---  init primitive and prv_memory descriptors ----------------------
-    fwd_bottom_data.reset(new MKLDNNData<Dtype>(usr_bottom_data_memory_pd, prv_fwd_bottom_data_memory_pd, bottom[0], this));
-    fwd_bottom_data ->name = "fwd_bottom_data   @ " + this->layer_param_.name();
-    fwd_bottom_data_primitive = fwd_bottom_data->create_input(false);
-
-    fwd_top_data.reset(new MKLDNNData<Dtype>(usr_top_data_memory_pd, prv_fwd_top_data_memory_pd, top[0], this));
-    fwd_top_data    ->name = "fwd_top_data      @ " + this->layer_param_.name();
-    fwd_top_data_memory = fwd_top_data->create_output_memory();
-
-    fwd_weights_data.reset(new MKLDNNData<Dtype>(usr_weights_data_memory_pd, prv_fwd_weights_data_memory_pd, this->blobs_[0].get(), this));
-    fwd_weights_data->name = "fwd_weights_data  @ " + this->layer_param_.name();
-    fwd_weights_data_primitive = fwd_weights_data->create_input(true);
-
-    if (this->bias_term_) {
-        shared_ptr<MemPD> prv_fwd_bias_data_memory_pd(new MemPD(convFwd_pd->bias_primitive_desc()));
-        fwd_bias_data.reset(new MKLDNNData<Dtype>(usr_bias_data_memory_pd, prv_fwd_bias_data_memory_pd, this->blobs_[1].get(), this));
-        fwd_bias_data->name = "fwd_bias_data     @ " + this->layer_param_.name();
-        fwd_bias_data_primitive = fwd_bias_data->create_input(true);
-        if(relu) {
-          convFwd.reset(new convolution_relu_forward(*convReluFwd_pd
-                          , *fwd_bottom_data_primitive, *fwd_weights_data_primitive
-                          , *fwd_bias_data_primitive, *fwd_top_data_memory));
-        } else {
-          convFwd.reset(new convolution_forward(*convFwd_pd
-                          , *fwd_bottom_data_primitive, *fwd_weights_data_primitive
-                          , *fwd_bias_data_primitive, *fwd_top_data_memory));
+/*
+All modification made by Intel Corporation: © 2016 Intel Corporation
+
+All contributions by the University of California:
+Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
+All rights reserved.
+
+All other contributions:
+Copyright (c) 2014, 2015, the respective contributors
+All rights reserved.
+For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md
+
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef MKLDNN_SUPPORTED
+#include <algorithm>
+#include <cstdlib>
+#include <vector>
+
+#include "caffe/filler.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/layers/mkldnn_layers.hpp"
+//#include "mkl_service.h"
+
+// TODO: Correct process case if there are no bias
+// TODO: Exception handling - mkl-dnn produces exceptions on errors
+
+namespace caffe {
+
+template <typename Dtype>
+MKLDNNConvolutionLayer<Dtype>::MKLDNNConvolutionLayer(const LayerParameter& param)
+            : MKLDNNLayer<Dtype>(), ConvolutionLayer<Dtype>(param)
+            , fwd_bottom_data(NULL), fwd_top_data(NULL), fwd_weights_data(NULL), fwd_bias_data(NULL)
+            , bwdd_weights_data(NULL), bwdw_bottom_data(NULL)
+            , bwdd_bottom_diff(NULL), bwdd_top_diff(NULL)
+            , bwdw_top_diff(NULL), bwdw_weights_diff(NULL), bwdw_bias_diff(NULL)
+            , convFwd_pd(NULL), convBwdData_pd(NULL), convBwdWeights_pd(NULL)
+            , fwd_top_data_memory(NULL), bwdd_bottom_diff_memory(NULL)
+            , bwdw_weights_diff_memory(NULL), bwdw_bias_diff_memory(NULL)
+            , fwd_bottom_data_primitive(NULL), fwd_weights_data_primitive(NULL), fwd_bias_data_primitive(NULL)
+            , bwdd_top_diff_primitive(NULL), bwdd_weights_data_primitive(NULL)
+            , bwdw_top_diff_primitive(NULL), bwdw_bottom_data_primitive(NULL)
+            , width_(0), height_(0), width_out_(0), height_out_(0), kernel_w_(0), kernel_h_(0)
+            , stride_w_(0), stride_h_(0), pad_w_(0), pad_h_(0)
+{
+  PERFORMANCE_EVENT_ID_RESET(perf_id_fw_);
+  PERFORMANCE_EVENT_ID_RESET(perf_id_bw_);
+  PERFORMANCE_EVENT_ID_RESET(perf_id_bw_weights_);
+}
+
+template <typename Dtype>
+void MKLDNNConvolutionLayer<Dtype>::compute_output_shape()
+{
+    ConvolutionLayer<Dtype>::compute_output_shape();
+    this->height_out_ = (this->height_ + 2 * this->pad_h_ - this->kernel_h_)
+        / this->stride_h_ + 1;
+    this->width_out_ = (this->width_ + 2 * this->pad_w_ - this->kernel_w_)
+        / this->stride_w_ + 1;
+}
+
+template <typename Dtype>
+void MKLDNNConvolutionLayer<Dtype>::init_properties(const vector<Blob<Dtype>*>& bottom
+                                                , const vector<Blob<Dtype>*>& top)
+{
+    this->stride_w_ = this->stride_.cpu_data()[1];
+    this->stride_h_ = this->stride_.cpu_data()[0];
+    this->width_ = bottom[0]->width();
+    this->height_ = bottom[0]->height();
+    this->pad_w_ = this->pad_.cpu_data()[1];
+    this->pad_h_ = this->pad_.cpu_data()[0];
+    this->kernel_w_ = this->kernel_shape_.cpu_data()[1];
+    this->kernel_h_  = this->kernel_shape_.cpu_data()[0];
+
+#ifdef USE_MLSL
+    if ((this->layerOp == nullptr) && (this->phase_ == TRAIN)) {
+      mn::OpRegInfo reg_info{ mn::train::get_session(), MLSL::OT_CC };
+      reg_info.set_name(this->layer_param_.name());
+      reg_info.add_parameter_set<Dtype>(this->channels_ * this->num_output_ / std::max(this->group_, 1), this->kernel_w_ * this->kernel_h_);
+      if (this->bias_term_) {
+        reg_info.add_parameter_set<Dtype>(this->num_output_, 1);
+      }
+      this->layerOp = mn::train::add_operation(reg_info);
+    }
+#endif /* USE_MLSL */
+}
+
+template <typename Dtype>
+void MKLDNNConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom
+                                            , const vector<Blob<Dtype>*>& top)
+{
+    VLOG(1) << "<< MKLDNNConvolutionLayer<Dtype>::LayerSetUp: " << this->layer_param_.name();
+    ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);
+    init_properties(bottom, top);
+    this->bottom_shape_ = &bottom[0]->shape();
+}
+
+template <typename Dtype>
+void MKLDNNConvolutionLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom
+                                            , const vector<Blob<Dtype>*>& top)
+{
+    VLOG(1) << " MKLDNNConvolutionLayer<Dtype>::Reshape: " << this->layer_param_.name();
+    BaseConvolutionLayer<Dtype>::ReshapeForMKL(bottom, top);
+    init_properties(bottom, top);
+}
+
+template <typename Dtype>
+void MKLDNNConvolutionLayer<Dtype>::InitConvolutionFwd(const vector<Blob<Dtype>*>& bottom
+                                                , const vector<Blob<Dtype>*>& top)
+{
+    if (std::is_same<Dtype, double>::value)   NOT_IMPLEMENTED;
+    auto propagation = this->phase_ == TEST ? prop_kind::forward_scoring : prop_kind::forward_training;
+    bool relu = this->layer_param_.convolution_param().relu();
+    Dtype negative_slope = 0;
+    if(relu)
+    {
+        propagation = prop_kind::forward_inference;
+        negative_slope = this->layer_param_.relu_param().negative_slope();
+    }
+
+    int32_t g  = std::max(this->group_, 1);
+    int32_t n  = this->num_;
+    int32_t iw = this->width_;
+    int32_t ih = this->height_;
+    int32_t ic = this->channels_;
+
+    int32_t ow = this->width_out_;
+    int32_t oh = this->height_out_;
+    int32_t oc = this->num_output_;
+
+    int32_t kw = this->kernel_w_;
+    int32_t kh = this->kernel_h_;
+
+    memory::dims convolutionStrides {this->stride_h_, this->stride_w_};
+    memory::dims padding {this->pad_h_, this->pad_w_};
+
+    // ---- Initialize memory descriptors (fromat = any) to create convolution descriptor -------------
+    memory::data_type mpcsn = memory::data_type::f32;
+    memory::format mfmt_any = memory::format::any;
+
+    memory::dims bottom_tz = {n, ic, ih, iw};
+    memory::dims bias_tz = {oc};
+    memory::dims top_tz = {n, oc, oh, ow};
+    memory::dims weights_tz = (g!= 1) ? memory::dims{g, oc/g, ic/g, kh, kw} : memory::dims{oc, ic, kh, kw};
+
+    // ---- Memory descriptors for initializing of convolution primitive descriptor -------------
+    memory::desc init_bottom_md({bottom_tz}, mpcsn, mfmt_any);
+    memory::desc init_bias_md({bias_tz}, mpcsn, mfmt_any);
+    memory::desc init_top_md({top_tz}, mpcsn, mfmt_any);
+    memory::desc init_weights_md({weights_tz}, mpcsn, mfmt_any);
+
+    // ---- Initialize convolution primitive descriptor -------------
+    shared_ptr<convolution_forward::desc> convFwd_desc;
+    if (this->bias_term_) {
+        convFwd_desc.reset(new convolution_forward::desc(propagation, algorithm::convolution_direct
+                                    , init_bottom_md, init_weights_md, init_bias_md, init_top_md
+                                    , convolutionStrides, padding, padding, padding_kind::zero));
+    } else {
+        convFwd_desc.reset(new convolution_forward::desc(propagation, algorithm::convolution_direct
+                                    , init_bottom_md, init_weights_md, init_top_md
+                                    , convolutionStrides, padding, padding, padding_kind::zero));
+    }
+    shared_ptr<convolution_relu_forward::desc> convReluFwd_desc;
+    if(relu) convReluFwd_desc.reset(new convolution_relu_forward::desc(*convFwd_desc, negative_slope));
+    // ---- Determining engine to use -----------------------
+    std::string subengines = this->layer_param_.engine();
+    if (subengines == "" || subengines == "MKLDNN")
+      subengines = "MKLDNN:CPU";
+    EngineParser ep(subengines);
+    unsigned subEngineIndex = 0;
+    shared_ptr<convolution_relu_forward::primitive_desc> convReluFwd_pd;
+    for(; subEngineIndex < ep.getNumberOfSubEngines(); subEngineIndex++) {
+      try {
+        convFwd_pd.reset(new convolution_forward::primitive_desc(*convFwd_desc,
+                ep.getMKLDNNSubEngine(subEngineIndex)));
+        if(relu) convReluFwd_pd.reset(new convolution_relu_forward::primitive_desc(*convReluFwd_desc,
+                ep.getMKLDNNSubEngine(subEngineIndex)));
+      }
+      catch(...) {
+        continue;
+      }
+      break;
+    }
+
+    CHECK(convFwd_pd);
+    engine cpu_engine = CpuEngine::Instance().get_engine();
+
+    // ---- Create priv memory primitive descriptors stored as class members -------------
+    typedef typename memory::primitive_desc MemPD; // short name for memory::primitive_desc
+
+    shared_ptr<MemPD> prv_fwd_bottom_data_memory_pd(new MemPD(convFwd_pd->src_primitive_desc()));
+    shared_ptr<MemPD> prv_fwd_top_data_memory_pd(new MemPD(convFwd_pd->dst_primitive_desc()));
+    shared_ptr<MemPD> prv_fwd_weights_data_memory_pd(new MemPD(convFwd_pd->weights_primitive_desc()));
+
+    // ---- Create usr memory primitive descriptors -------------
+    memory::format mfmt_nchw = memory::format::nchw;
+    memory::format weights_mfmt = (g!= 1) ? memory::format::goihw : memory::format::oihw;
+
+    // TODO: There should not be a problem to use this for Backward as well
+    shared_ptr<MemPD> usr_bottom_data_memory_pd(new MemPD({{bottom_tz}, mpcsn, mfmt_nchw}, cpu_engine));
+    shared_ptr<MemPD> usr_bias_data_memory_pd(new MemPD({{bias_tz}, mpcsn, memory::format::x}, cpu_engine));
+    shared_ptr<MemPD> usr_top_data_memory_pd(new MemPD({{top_tz}, mpcsn, mfmt_nchw}, cpu_engine));
+    shared_ptr<MemPD> usr_weights_data_memory_pd(new MemPD({{weights_tz}, mpcsn, weights_mfmt}, cpu_engine));
+
+
+    // ---  init primitive and prv_memory descriptors ----------------------
+    fwd_bottom_data.reset(new MKLDNNData<Dtype>(usr_bottom_data_memory_pd, prv_fwd_bottom_data_memory_pd, bottom[0], this));
+    fwd_bottom_data ->name = "fwd_bottom_data   @ " + this->layer_param_.name();
+    fwd_bottom_data_primitive = fwd_bottom_data->create_input(false);
+
+    fwd_top_data.reset(new MKLDNNData<Dtype>(usr_top_data_memory_pd, prv_fwd_top_data_memory_pd, top[0], this));
+    fwd_top_data    ->name = "fwd_top_data      @ " + this->layer_param_.name();
+    fwd_top_data_memory = fwd_top_data->create_output_memory();
+
+    fwd_weights_data.reset(new MKLDNNData<Dtype>(usr_weights_data_memory_pd, prv_fwd_weights_data_memory_pd, this->blobs_[0].get(), this));
+    fwd_weights_data->name = "fwd_weights_data  @ " + this->layer_param_.name();
+    fwd_weights_data_primitive = fwd_weights_data->create_input(true);
+
+    if (this->bias_term_) {
+        shared_ptr<MemPD> prv_fwd_bias_data_memory_pd(new MemPD(convFwd_pd->bias_primitive_desc()));
+        fwd_bias_data.reset(new MKLDNNData<Dtype>(usr_bias_data_memory_pd, prv_fwd_bias_data_memory_pd, this->blobs_[1].get(), this));
+        fwd_bias_data->name = "fwd_bias_data     @ " + this->layer_param_.name();
+        fwd_bias_data_primitive = fwd_bias_data->create_input(true);
+        if(relu) {
+          convFwd.reset(new convolution_relu_forward(*convReluFwd_pd
+                          , *fwd_bottom_data_primitive, *fwd_weights_data_primitive
+                          , *fwd_bias_data_primitive, *fwd_top_data_memory));
+        } else {
+          convFwd.reset(new convolution_forward(*convFwd_pd
+                          , *fwd_bottom_data_primitive, *fwd_weights_data_primitive
+                          , *fwd_bias_data_primitive, *fwd_top_data_memory));
+        }
+        fwd_bias_data->set_mkldnn_primitive(convFwd);
+    } else {
+        if(relu) {
+          convFwd.reset(new convolution_relu_forward(*convReluFwd_pd
+                          , *fwd_bottom_data_primitive, *fwd_weights_data_primitive
+                          , *fwd_top_data_memory));
+        } else {
+          convFwd.reset(new convolution_forward(*convFwd_pd
+                          , *fwd_bottom_data_primitive, *fwd_weights_data_primitive
+                          , *fwd_top_data_memory));
+        }
+    }
+    fwd_bottom_data->set_mkldnn_primitive(convFwd);   //Wrong passed primitive! (TODO: Checking!)
+    //MKLDNNPrimitive<Dtype> fwd_bottom_data_primitive_transfer(fwd_bottom_data_primitive);
+    //fwd_bottom_data->set_mkldnn_primitive(fwd_bottom_data_primitive_transfer);
+
+    fwd_top_data->set_mkldnn_primitive(convFwd);
+
+    //fwd_weights_data->set_mkldnn_primitive(convFwd);  //Wrong passed primitive! (For sure!)
+    MKLDNNPrimitive<Dtype> fwd_weights_data_primitive_transfer(fwd_weights_data_primitive);
+    fwd_weights_data->set_mkldnn_primitive(fwd_weights_data_primitive_transfer);
+
+    // Names are for debugging purposes only.
+}
+
+template <typename Dtype>
+void MKLDNNConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom
+                                                , const vector<Blob<Dtype>*>& top)
+{
+    VLOG(1) << "MKLDNNConvolutionLayer<Dtype>::Forward_cpu: " << this->layer_param_.name();
+    if( convFwd_pd == NULL)
+        InitConvolutionFwd(bottom, top);
+    // making reorders if needed.
+    fwd_bottom_data->sync_before_read();
+    fwd_weights_data->sync_before_read();
+    if (this->bias_term_)
+        fwd_bias_data->sync_before_read();
+    // update top that head at prv
+    fwd_top_data->sync_before_write();
+
+    PERFORMANCE_EVENT_ID_INIT(perf_id_fw_, PERFORMANCE_MKLDNN_NAME("FW"));
+    PERFORMANCE_MEASUREMENT_BEGIN();
+    convFwd.submit();
+    PERFORMANCE_MEASUREMENT_END_ID(perf_id_fw_);
+}
+
+
+template <typename Dtype>
+void MKLDNNConvolutionLayer<Dtype>::InitConvolutionBwd(const vector<Blob<Dtype>*>& top
+                                                    , const vector<bool>& propagate_down
+                                                    , const vector<Blob<Dtype>*>& bottom)
+{
+    if (std::is_same<Dtype, double>::value)   NOT_IMPLEMENTED;
+
+    int32_t g  = std::max(this->group_, 1);
+    int32_t n  = this->num_;
+    int32_t iw = this->width_;
+    int32_t ih = this->height_;
+    int32_t ic = this->channels_;
+
+    int32_t ow = this->width_out_;
+    int32_t oh = this->height_out_;
+    int32_t oc = this->num_output_;
+
+    int32_t kw = this->kernel_w_;
+    int32_t kh = this->kernel_h_;
+
+    memory::dims convolutionStrides {this->stride_h_, this->stride_w_};
+    memory::dims padding {this->pad_h_, this->pad_w_};
+
+    // ---- Initialize memory descriptors (fromat = any) to create convolution descriptor -------------
+    memory::data_type mpcsn = memory::data_type::f32;
+    memory::format mfmt_any = memory::format::any;
+
+    memory::dims bottom_tz = {n, ic, ih, iw};
+    memory::dims bias_tz = {oc};
+    memory::dims top_tz = {n, oc, oh, ow};
+    memory::dims weights_tz = ( g!= 1) ? memory::dims{g, oc/g, ic/g, kh, kw} : memory::dims{oc, ic, kh, kw};
+
+    // ---- Memory descriptors for initializing of convolution primitive descriptor -------------
+    memory::desc init_bottom_md({bottom_tz}, mpcsn, mfmt_any);
+    memory::desc init_bias_md({bias_tz}, mpcsn, mfmt_any);
+    memory::desc init_top_md({top_tz}, mpcsn, mfmt_any);
+    memory::desc init_weights_md({weights_tz}, mpcsn, mfmt_any);
+
+    // ---- Initialize convolution primitive descriptor -------------
+    shared_ptr<convolution_backward_data::desc> convBwdData_desc;
+    shared_ptr<convolution_backward_weights::desc> convBwdWeights_desc;
+    if (this->bias_term_) {
+        convBwdWeights_desc.reset(new convolution_backward_weights::desc(algorithm::convolution_direct
+                            , init_bottom_md, init_weights_md, init_bias_md, init_top_md
+                            , convolutionStrides, padding, padding, padding_kind::zero));
+    } else {
+        convBwdWeights_desc.reset(new convolution_backward_weights::desc(algorithm::convolution_direct
+                            , init_bottom_md, init_weights_md, init_top_md
+                            , convolutionStrides, padding, padding, padding_kind::zero));
+    }
+
+    convBwdData_desc.reset(new convolution_backward_data::desc(algorithm::convolution_direct
+                            , init_bottom_md, init_weights_md, init_top_md
+                            , convolutionStrides, padding, padding, padding_kind::zero));
+
+    // ---- Determining engine to use -----------------------
+    std::string subengines = this->layer_param_.engine();
+    if (subengines == "" || subengines == "MKLDNN")
+      subengines = "MKLDNN:CPU";
+    EngineParser ep(subengines);
+    unsigned subEngineIndex = 0;
+    for(; subEngineIndex < ep.getNumberOfSubEngines(); subEngineIndex++) {
+      try {
+        convBwdData_pd.reset(new convolution_backward_data::primitive_desc(*convBwdData_desc,
+                ep.getMKLDNNSubEngine(subEngineIndex), *convFwd_pd));
+
+        convBwdWeights_pd.reset(new convolution_backward_weights::primitive_desc(*convBwdWeights_desc,
+                ep.getMKLDNNSubEngine(subEngineIndex), *convFwd_pd));
+      }
+      catch(...) {
+        continue;
+      }
+      break;
+    }
+    CHECK(convBwdData_pd);
+    CHECK(convBwdWeights_pd);
+    engine cpu_engine = CpuEngine::Instance().get_engine();
+
+    // ---- Create priv memory primitive descriptors stored as class members -------------
+    typedef typename memory::primitive_desc MemPD; // short name for memory::primitive_desc
+
+    shared_ptr<MemPD> prv_bwdd_bottom_diff_memory_pd(new MemPD(convBwdData_pd->diff_src_primitive_desc()));
+    shared_ptr<MemPD> prv_bwdd_top_diff_memory_pd(new MemPD(convBwdData_pd->diff_dst_primitive_desc()));
+    shared_ptr<MemPD> prv_bwdd_weights_data_memory_pd(new MemPD(convBwdData_pd->weights_primitive_desc()));
+
+    shared_ptr<MemPD> prv_bwdw_bottom_data_memory_pd(new MemPD(convBwdWeights_pd->src_primitive_desc()));
+    shared_ptr<MemPD> prv_bwdw_top_diff_memory_pd(new MemPD(convBwdWeights_pd->diff_dst_primitive_desc()));
+    shared_ptr<MemPD> prv_bwdw_weights_diff_memory_pd(new MemPD(convBwdWeights_pd->diff_weights_primitive_desc()));
+
+    // ---- Create usr memory primitive descriptors -------------
+    memory::format mfmt_nchw = memory::format::nchw;
+    memory::format weights_mfmt = ( g!= 1) ? memory::format::goihw : memory::format::oihw;
+
+    // ???!!! can we use usr memory primitive descrittors for backward??
+    shared_ptr<MemPD> usr_bottom_data_memory_pd(new MemPD({{bottom_tz}, mpcsn, mfmt_nchw}, cpu_engine));
+    shared_ptr<MemPD> usr_bias_data_memory_pd(new MemPD({{bias_tz}, mpcsn, memory::format::x}, cpu_engine));
+    shared_ptr<MemPD> usr_top_data_memory_pd(new MemPD({{top_tz}, mpcsn, mfmt_nchw}, cpu_engine));
+    shared_ptr<MemPD> usr_weights_data_memory_pd(new MemPD({{weights_tz}, mpcsn, weights_mfmt}, cpu_engine));
+
+
+    // ---  init primitive and prv_memory descriptors ----------------------
+    bwdd_bottom_diff.reset(new MKLDNNDiff<Dtype>(usr_bottom_data_memory_pd, prv_bwdd_bottom_diff_memory_pd, bottom[0], this));
+    bwdd_bottom_diff ->name = "bwdd_bottom_diff   @ " + this->layer_param_.name();
+    bwdd_bottom_diff_memory = bwdd_bottom_diff->create_output_memory();
+    bwdw_bottom_data.reset(new MKLDNNData<Dtype>(usr_bottom_data_memory_pd, prv_bwdw_bottom_data_memory_pd, bottom[0], this));
+    bwdw_bottom_data ->name = "bwdw_bottom_data   @ " + this->layer_param_.name();
+    bwdw_bottom_data_primitive = bwdw_bottom_data->create_input(false);
+
+    bwdd_top_diff.reset(new MKLDNNDiff<Dtype>(usr_top_data_memory_pd, prv_bwdd_top_diff_memory_pd, top[0], this));
+    bwdd_top_diff    ->name = "bwdd_top_diff      @ " + this->layer_param_.name();
+    bwdd_top_diff_primitive = bwdd_top_diff->create_input(false);
+    bwdw_top_diff.reset(new MKLDNNDiff<Dtype>(usr_top_data_memory_pd, prv_bwdw_top_diff_memory_pd, top[0], this));
+    bwdw_top_diff    ->name = "bwdw_top_diff      @ " + this->layer_param_.name();
+    bwdw_top_diff_primitive = bwdw_top_diff->create_input(false);
+
+    bwdd_weights_data.reset(new MKLDNNData<Dtype>(usr_weights_data_memory_pd, prv_bwdd_weights_data_memory_pd, this->blobs_[0].get(), this));
+    bwdd_weights_data->name = "bwdd_weights_data  @ " + this->layer_param_.name();
+    bwdd_weights_data_primitive = bwdd_weights_data->create_input(false);
+    bwdw_weights_diff.reset(new MKLDNNDiff<Dtype>(usr_weights_data_memory_pd, prv_bwdw_weights_diff_memory_pd, this->blobs_[0].get(), this));
+    bwdw_weights_diff->name = "bwdw_weights_diff  @ " + this->layer_param_.name();
+    bwdw_weights_diff_memory = bwdw_weights_diff->create_output_memory();
+
+    if (this->bias_term_) {
+        shared_ptr<MemPD> prv_bwdw_bias_diff_memory_pd(new MemPD(convBwdWeights_pd->diff_bias_primitive_desc()));
+        bwdw_bias_diff.reset(new MKLDNNDiff<Dtype>(usr_bias_data_memory_pd, prv_bwdw_bias_diff_memory_pd, this->blobs_[1].get(), this));
+        bwdw_bias_diff->name = "bwdw_bias_diff     @ " + this->layer_param_.name();
+        bwdw_bias_diff_memory = bwdw_bias_diff->create_output_memory();
+
+        convBwdWeights.reset(new convolution_backward_weights(*convBwdWeights_pd
+                        , *bwdw_bottom_data_primitive, *bwdw_top_diff_primitive
+                        , *bwdw_weights_diff_memory, *bwdw_bias_diff_memory));
+
+        bwdw_bias_diff->set_mkldnn_primitive(convBwdWeights);
+    } else {
+        convBwdWeights.reset(new convolution_backward_weights(*convBwdWeights_pd
+                        , *bwdw_bottom_data_primitive, *bwdw_top_diff_primitive
+                        , *bwdw_weights_diff_memory));
+    }
+
+    convBwdData.reset(new convolution_backward_data(*convBwdData_pd
+                    , *bwdd_top_diff_primitive, *bwdd_weights_data_primitive
+                    , *bwdd_bottom_diff_memory));
+
+    bwdd_bottom_diff->set_mkldnn_primitive(convBwdData);
+
+    bwdd_top_diff->set_mkldnn_primitive(convBwdData);         //Wrong passed primitive! (TODO: Checking!)
+    //MKLDNNPrimitive<Dtype> bwdd_top_diff_primitive_transfer(bwdd_top_diff_primitive);
+    //bwdd_top_diff->set_mkldnn_primitive(bwdd_top_diff_primitive_transfer);
+
+    //bwdd_weights_data->set_mkldnn_primitive(convBwdData);     //Wrong passed primitive! (For sure!)
+    MKLDNNPrimitive<Dtype> bwdd_weights_data_primitive_transfer(bwdd_weights_data_primitive);
+    bwdd_weights_data->set_mkldnn_primitive(bwdd_weights_data_primitive_transfer);
+
+
+    bwdw_bottom_data->set_mkldnn_primitive(convBwdWeights);   //Wrong passed primitive! (TODO: Checking!)
+    //MKLDNNPrimitive<Dtype> bwdw_bottom_data_primitive_transfer(bwdw_bottom_data_primitive);
+    //bwdw_bottom_data->set_mkldnn_primitive(bwdw_bottom_data_primitive_transfer);
+
+    bwdw_top_diff->set_mkldnn_primitive(convBwdWeights);      //Wrong passed primitive! (TODO: Checking!)
+    //MKLDNNPrimitive<Dtype> bwdw_top_diff_primitive_transfer(bwdw_top_diff_primitive);
+    //bwdw_top_diff->set_mkldnn_primitive(bwdw_top_diff_primitive_transfer);
+
+    bwdw_weights_diff->set_mkldnn_primitive(convBwdWeights);
+
+    // Names are for debugging purposes only.
+}
+
+
+template <typename Dtype>
+void MKLDNNConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top
+                                                , const vector<bool>& propagate_down
+                                                , const vector<Blob<Dtype>*>& bottom)
+{
+    VLOG(1) << "MKLDNNConvolutionLayer<Dtype>::Backward_cpu: " << this->layer_param_.name();
+    if( convBwdData_pd == NULL)
+        InitConvolutionBwd(top, propagate_down, bottom);
+    if (propagate_down[0]) {
+        // making reorders if needed.
+        bwdd_top_diff->sync_before_read();
+        bwdd_weights_data->sync_before_read();
+        bwdd_bottom_diff->sync_before_write();
+
+        PERFORMANCE_EVENT_ID_INIT(perf_id_bw_, PERFORMANCE_MKLDNN_NAME("BW"));
+        PERFORMANCE_MEASUREMENT_BEGIN();
+#ifdef DEBUG
+        if (bottom[0]->prv_data() != NULL)
+        {
+            LOG(INFO) << "Debug: Bottom prv data: " << *bottom[0]->prv_data();
         }
-        fwd_bias_data->set_mkldnn_primitive(convFwd);
-    } else {
-        if(relu) {
-          convFwd.reset(new convolution_relu_forward(*convReluFwd_pd
-                          , *fwd_bottom_data_primitive, *fwd_weights_data_primitive
-                          , *fwd_top_data_memory));
-        } else {
-          convFwd.reset(new convolution_forward(*convFwd_pd
-                          , *fwd_bottom_data_primitive, *fwd_weights_data_primitive
-                          , *fwd_top_data_memory));
+        else
+        {
+            LOG(INFO) << "Debug: Bottom prv data is NULL!";
+            //LOG(INFO) << "Debug: Bottom cpu data: " << *bottom[0]->cpu_data();
         }
-    }
-    fwd_bottom_data->set_mkldnn_primitive(convFwd);
-    fwd_top_data->set_mkldnn_primitive(convFwd);
-    fwd_weights_data->set_mkldnn_primitive(convFwd);
-
-    // Names are for debugging purposes only.
-}
-
-template <typename Dtype>
-void MKLDNNConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom
-                                                , const vector<Blob<Dtype>*>& top)
-{
-    VLOG(1) << "MKLDNNConvolutionLayer<Dtype>::Forward_cpu: " << this->layer_param_.name();
-    if( convFwd_pd == NULL)
-        InitConvolutionFwd(bottom, top);
-    // making reorders if needed.
-    fwd_bottom_data->sync_before_read();
-    fwd_weights_data->sync_before_read();
-    if (this->bias_term_)
-        fwd_bias_data->sync_before_read();
-    // update top that head at prv
-    fwd_top_data->sync_before_write();
-
-    PERFORMANCE_EVENT_ID_INIT(perf_id_fw_, PERFORMANCE_MKLDNN_NAME("FW"));
-    PERFORMANCE_MEASUREMENT_BEGIN();
-    convFwd.submit();
-    PERFORMANCE_MEASUREMENT_END_ID(perf_id_fw_);
-}
 
-
-template <typename Dtype>
-void MKLDNNConvolutionLayer<Dtype>::InitConvolutionBwd(const vector<Blob<Dtype>*>& top
-                                                    , const vector<bool>& propagate_down
-                                                    , const vector<Blob<Dtype>*>& bottom)
-{
-    if (std::is_same<Dtype, double>::value)   NOT_IMPLEMENTED;
-
-    int32_t g  = std::max(this->group_, 1);
-    int32_t n  = this->num_;
-    int32_t iw = this->width_;
-    int32_t ih = this->height_;
-    int32_t ic = this->channels_;
-
-    int32_t ow = this->width_out_;
-    int32_t oh = this->height_out_;
-    int32_t oc = this->num_output_;
-
-    int32_t kw = this->kernel_w_;
-    int32_t kh = this->kernel_h_;
-
-    memory::dims convolutionStrides {this->stride_h_, this->stride_w_};
-    memory::dims padding {this->pad_h_, this->pad_w_};
-
-    // ---- Initialize memory descriptors (fromat = any) to create convolution descriptor -------------
-    memory::data_type mpcsn = memory::data_type::f32;
-    memory::format mfmt_any = memory::format::any;
-
-    memory::dims bottom_tz = {n, ic, ih, iw};
-    memory::dims bias_tz = {oc};
-    memory::dims top_tz = {n, oc, oh, ow};
-    memory::dims weights_tz = ( g!= 1) ? memory::dims{g, oc/g, ic/g, kh, kw} : memory::dims{oc, ic, kh, kw};
-
-    // ---- Memory descriptors for initializing of convolution primitive descriptor -------------
-    memory::desc init_bottom_md({bottom_tz}, mpcsn, mfmt_any);
-    memory::desc init_bias_md({bias_tz}, mpcsn, mfmt_any);
-    memory::desc init_top_md({top_tz}, mpcsn, mfmt_any);
-    memory::desc init_weights_md({weights_tz}, mpcsn, mfmt_any);
-
-    // ---- Initialize convolution primitive descriptor -------------
-    shared_ptr<convolution_backward_data::desc> convBwdData_desc;
-    shared_ptr<convolution_backward_weights::desc> convBwdWeights_desc;
-    if (this->bias_term_) {
-        convBwdWeights_desc.reset(new convolution_backward_weights::desc(algorithm::convolution_direct
-                            , init_bottom_md, init_weights_md, init_bias_md, init_top_md
-                            , convolutionStrides, padding, padding, padding_kind::zero));
-    } else {
-        convBwdWeights_desc.reset(new convolution_backward_weights::desc(algorithm::convolution_direct
-                            , init_bottom_md, init_weights_md, init_top_md
-                            , convolutionStrides, padding, padding, padding_kind::zero));
-    }
-
-    convBwdData_desc.reset(new convolution_backward_data::desc(algorithm::convolution_direct
-                            , init_bottom_md, init_weights_md, init_top_md
-                            , convolutionStrides, padding, padding, padding_kind::zero));
-
-    // ---- Determining engine to use -----------------------
-    std::string subengines = this->layer_param_.engine();
-    if (subengines == "" || subengines == "MKLDNN")
-      subengines = "MKLDNN:CPU";
-    EngineParser ep(subengines);
-    unsigned subEngineIndex = 0;
-    for(; subEngineIndex < ep.getNumberOfSubEngines(); subEngineIndex++) {
-      try {
-        convBwdData_pd.reset(new convolution_backward_data::primitive_desc(*convBwdData_desc,
-                ep.getMKLDNNSubEngine(subEngineIndex), *convFwd_pd));
-
-        convBwdWeights_pd.reset(new convolution_backward_weights::primitive_desc(*convBwdWeights_desc,
-                ep.getMKLDNNSubEngine(subEngineIndex), *convFwd_pd));
-      }
-      catch(...) {
-        continue;
-      }
-      break;
-    }
-    CHECK(convBwdData_pd);
-    CHECK(convBwdWeights_pd);
-    engine cpu_engine = CpuEngine::Instance().get_engine();
-
-    // ---- Create priv memory primitive descriptors stored as class members -------------
-    typedef typename memory::primitive_desc MemPD; // short name for memory::primitive_desc
-
-    shared_ptr<MemPD> prv_bwdd_bottom_diff_memory_pd(new MemPD(convBwdData_pd->diff_src_primitive_desc()));
-    shared_ptr<MemPD> prv_bwdd_top_diff_memory_pd(new MemPD(convBwdData_pd->diff_dst_primitive_desc()));
-    shared_ptr<MemPD> prv_bwdd_weights_data_memory_pd(new MemPD(convBwdData_pd->weights_primitive_desc()));
-
-    shared_ptr<MemPD> prv_bwdw_bottom_data_memory_pd(new MemPD(convBwdWeights_pd->src_primitive_desc()));
-    shared_ptr<MemPD> prv_bwdw_top_diff_memory_pd(new MemPD(convBwdWeights_pd->diff_dst_primitive_desc()));
-    shared_ptr<MemPD> prv_bwdw_weights_diff_memory_pd(new MemPD(convBwdWeights_pd->diff_weights_primitive_desc()));
-
-    // ---- Create usr memory primitive descriptors -------------
-    memory::format mfmt_nchw = memory::format::nchw;
-    memory::format weights_mfmt = ( g!= 1) ? memory::format::goihw : memory::format::oihw;
-
-    // ???!!! can we use usr memory primitive descrittors for backward??
-    shared_ptr<MemPD> usr_bottom_data_memory_pd(new MemPD({{bottom_tz}, mpcsn, mfmt_nchw}, cpu_engine));
-    shared_ptr<MemPD> usr_bias_data_memory_pd(new MemPD({{bias_tz}, mpcsn, memory::format::x}, cpu_engine));
-    shared_ptr<MemPD> usr_top_data_memory_pd(new MemPD({{top_tz}, mpcsn, mfmt_nchw}, cpu_engine));
-    shared_ptr<MemPD> usr_weights_data_memory_pd(new MemPD({{weights_tz}, mpcsn, weights_mfmt}, cpu_engine));
-
-
-    // ---  init primitive and prv_memory descriptors ----------------------
-    bwdd_bottom_diff.reset(new MKLDNNDiff<Dtype>(usr_bottom_data_memory_pd, prv_bwdd_bottom_diff_memory_pd, bottom[0], this));
-    bwdd_bottom_diff ->name = "bwdd_bottom_diff   @ " + this->layer_param_.name();
-    bwdd_bottom_diff_memory = bwdd_bottom_diff->create_output_memory();
-    bwdw_bottom_data.reset(new MKLDNNData<Dtype>(usr_bottom_data_memory_pd, prv_bwdw_bottom_data_memory_pd, bottom[0], this));
-    bwdw_bottom_data ->name = "bwdw_bottom_data   @ " + this->layer_param_.name();
-    bwdw_bottom_data_primitive = bwdw_bottom_data->create_input(false);
-
-    bwdd_top_diff.reset(new MKLDNNDiff<Dtype>(usr_top_data_memory_pd, prv_bwdd_top_diff_memory_pd, top[0], this));
-    bwdd_top_diff    ->name = "bwdd_top_diff      @ " + this->layer_param_.name();
-    bwdd_top_diff_primitive = bwdd_top_diff->create_input(false);
-    bwdw_top_diff.reset(new MKLDNNDiff<Dtype>(usr_top_data_memory_pd, prv_bwdw_top_diff_memory_pd, top[0], this));
-    bwdw_top_diff    ->name = "bwdw_top_diff      @ " + this->layer_param_.name();
-    bwdw_top_diff_primitive = bwdw_top_diff->create_input(false);
-
-    bwdd_weights_data.reset(new MKLDNNData<Dtype>(usr_weights_data_memory_pd, prv_bwdd_weights_data_memory_pd, this->blobs_[0].get(), this));
-    bwdd_weights_data->name = "bwdd_weights_data  @ " + this->layer_param_.name();
-    bwdd_weights_data_primitive = bwdd_weights_data->create_input(false);
-    bwdw_weights_diff.reset(new MKLDNNDiff<Dtype>(usr_weights_data_memory_pd, prv_bwdw_weights_diff_memory_pd, this->blobs_[0].get(), this));
-    bwdw_weights_diff->name = "bwdw_weights_diff  @ " + this->layer_param_.name();
-    bwdw_weights_diff_memory = bwdw_weights_diff->create_output_memory();
-
-    if (this->bias_term_) {
-        shared_ptr<MemPD> prv_bwdw_bias_diff_memory_pd(new MemPD(convBwdWeights_pd->diff_bias_primitive_desc()));
-        bwdw_bias_diff.reset(new MKLDNNDiff<Dtype>(usr_bias_data_memory_pd, prv_bwdw_bias_diff_memory_pd, this->blobs_[1].get(), this));
-        bwdw_bias_diff->name = "bwdw_bias_diff     @ " + this->layer_param_.name();
-        bwdw_bias_diff_memory = bwdw_bias_diff->create_output_memory();
-
-        convBwdWeights.reset(new convolution_backward_weights(*convBwdWeights_pd
-                        , *bwdw_bottom_data_primitive, *bwdw_top_diff_primitive
-                        , *bwdw_weights_diff_memory, *bwdw_bias_diff_memory));
-
-        bwdw_bias_diff->set_mkldnn_primitive(convBwdWeights);
-    } else {
-        convBwdWeights.reset(new convolution_backward_weights(*convBwdWeights_pd
-                        , *bwdw_bottom_data_primitive, *bwdw_top_diff_primitive
-                        , *bwdw_weights_diff_memory));
-    }
-
-    convBwdData.reset(new convolution_backward_data(*convBwdData_pd
-                    , *bwdd_top_diff_primitive, *bwdd_weights_data_primitive
-                    , *bwdd_bottom_diff_memory));
-
-    bwdd_bottom_diff->set_mkldnn_primitive(convBwdData);
-    bwdd_top_diff->set_mkldnn_primitive(convBwdData);
-    bwdd_weights_data->set_mkldnn_primitive(convBwdData);
-
-    bwdw_bottom_data->set_mkldnn_primitive(convBwdWeights);
-    bwdw_top_diff->set_mkldnn_primitive(convBwdWeights);
-    bwdw_weights_diff->set_mkldnn_primitive(convBwdWeights);
-
-    // Names are for debugging purposes only.
-}
-
-
-template <typename Dtype>
-void MKLDNNConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top
-                                                , const vector<bool>& propagate_down
-                                                , const vector<Blob<Dtype>*>& bottom)
-{
-    VLOG(1) << "MKLDNNConvolutionLayer<Dtype>::Backward_cpu: " << this->layer_param_.name();
-    if( convBwdData_pd == NULL)
-        InitConvolutionBwd(top, propagate_down, bottom);
-    if (propagate_down[0]) {
-        // making reorders if needed.
-        bwdd_top_diff->sync_before_read();
-        bwdd_weights_data->sync_before_read();
-        bwdd_bottom_diff->sync_before_write();
-
-        PERFORMANCE_EVENT_ID_INIT(perf_id_bw_, PERFORMANCE_MKLDNN_NAME("BW"));
-        PERFORMANCE_MEASUREMENT_BEGIN();
-        convBwdData.submit();
-        PERFORMANCE_MEASUREMENT_END_ID(perf_id_bw_);
-    }
-    if (this->param_propagate_down(0)) {
-        // making reorders if needed.
-        bwdw_top_diff->sync_before_read();
-        bwdw_bottom_data->sync_before_read();
-        // update top that head at prv
-        bwdw_weights_diff->sync_before_write();
-        if (this->param_propagate_down(1)) {
-            CHECK(bwdw_bias_diff);
-            bwdw_bias_diff->sync_before_write();
+        if (top[0]->prv_diff() != NULL)
+        {
+            LOG(INFO) << "Debug: Top prv diff: " << *top[0]->prv_diff();
         }
-        PERFORMANCE_EVENT_ID_INIT(perf_id_bw_weights_,
-          PERFORMANCE_MKLDNN_NAME_DETAILED("BW", "_weights"));
-        PERFORMANCE_MEASUREMENT_BEGIN();
-        convBwdWeights.submit();
-        PERFORMANCE_MEASUREMENT_END_ID(perf_id_bw_weights_);
-    }
-}
-
-#ifdef CPU_ONLY
-STUB_GPU(MKLDNNConvolutionLayer);
-#else
-
-template <typename Dtype>
-void MKLDNNConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom
-                                                , const vector<Blob<Dtype>*>& top)
-{
-    NOT_IMPLEMENTED;
-}
-
-template <typename Dtype>
-void MKLDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top
-                                                , const vector<bool>& propagate_down
-                                                , const vector<Blob<Dtype>*>& bottom)
-{
-    NOT_IMPLEMENTED;
-}
-#endif
-
-INSTANTIATE_CLASS(MKLDNNConvolutionLayer);
-
-}  // namespace caffe
-#endif  // #ifdef MKLDNN_SUPPORTED
+        else
+        {
+            LOG(INFO) << "Debug: Top prv diff is NULL!";
+            LOG(INFO) << "Debug: Top cpu diff: " << *top[0]->cpu_diff();
+        }
+
+        if (this->blobs_[0]->prv_data() != NULL)
+        {
+            LOG(INFO) << "Debug: Weights prv data from blobs_[0]: " << *this->blobs_[0]->prv_data();
+        }
+        else
+        {
+            LOG(INFO) << "Debug: Weights prv data is NULL!";
+            LOG(INFO) << "Debug: Weights cpu data: " << *this->blobs_[0]->cpu_data();
+        }
+        //Before submit, so get_prv_ptr() always has the value
+        LOG(INFO) << "Debug: Weights prv data from get_prv_ptr: " << *bwdd_weights_data->get_prv_ptr();
+#endif
+        convBwdData.submit();
+#ifdef DEBUG
+        if (bottom[0]->prv_diff() != NULL)
+        {
+            LOG(INFO) << "Debug: Bottom prv diff: " << *bottom[0]->prv_diff();
+        }
+        else
+        {
+            LOG(INFO) << "Debug: Bottom prv diff is NULL!";
+            LOG(INFO) << "Debug: Bottom cpu diff: " << *bottom[0]->cpu_diff();
+        }
+#endif
+        PERFORMANCE_MEASUREMENT_END_ID(perf_id_bw_);
+    }
+    if (this->param_propagate_down(0)) {
+        // making reorders if needed.
+        bwdw_top_diff->sync_before_read();
+        bwdw_bottom_data->sync_before_read();
+        // update top that head at prv
+        bwdw_weights_diff->sync_before_write();
+        if (this->param_propagate_down(1)) {
+            CHECK(bwdw_bias_diff);
+            bwdw_bias_diff->sync_before_write();
+        }
+        PERFORMANCE_EVENT_ID_INIT(perf_id_bw_weights_,
+          PERFORMANCE_MKLDNN_NAME_DETAILED("BW", "_weights"));
+        PERFORMANCE_MEASUREMENT_BEGIN();
+        convBwdWeights.submit();
+        PERFORMANCE_MEASUREMENT_END_ID(perf_id_bw_weights_);
+    }
+}
+
+#ifdef CPU_ONLY
+STUB_GPU(MKLDNNConvolutionLayer);
+#else
+
+template <typename Dtype>
+void MKLDNNConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom
+                                                , const vector<Blob<Dtype>*>& top)
+{
+    NOT_IMPLEMENTED;
+}
+
+template <typename Dtype>
+void MKLDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top
+                                                , const vector<bool>& propagate_down
+                                                , const vector<Blob<Dtype>*>& bottom)
+{
+    NOT_IMPLEMENTED;
+}
+#endif
+
+INSTANTIATE_CLASS(MKLDNNConvolutionLayer);
+
+}  // namespace caffe
+#endif  // #ifdef MKLDNN_SUPPORTED
diff --git a/src/caffe/layers/mkldnn_eltwise_layer.cpp b/src/caffe/layers/mkldnn_eltwise_layer.cpp
index 96ced307d..2a4a87c79 100644
--- a/src/caffe/layers/mkldnn_eltwise_layer.cpp
+++ b/src/caffe/layers/mkldnn_eltwise_layer.cpp
@@ -182,16 +182,17 @@ void MKLDNNEltwiseLayer<Dtype>::InitEltwiseFwd(const vector<Blob<Dtype>*>& botto
     }
 
     shared_ptr<memory::primitive_desc> usr_top_data_mpd(new memory::primitive_desc(
-        {{n, ic, ih, iw}, mpcsn, mfmt_nchw}, cpu_engine));   
-    shared_ptr<memory::primitive_desc> prv_top_data_mpd(new memory::primitive_desc({{n, ic, ih, iw}, mpcsn, mfmt_nchw}, cpu_engine));
+        {{n, ic, ih, iw}, mpcsn, mfmt_nchw}, cpu_engine));
     
     // ---- Determining engine to use -----------------------
     std::string subengines = this->layer_param_.engine();
     if (subengines == "" || subengines == "MKLDNN")
         subengines = "MKLDNN:CPU";
-    eltwiseFwd_pd.reset(new sum::primitive_desc({{n, ic, ih, iw}, mpcsn, mfmt_nchw}, scale, bottom_data_mpd));
+    eltwiseFwd_pd.reset(new sum::primitive_desc({{n, ic, ih, iw}, mpcsn, memory::format::any}, scale, bottom_data_mpd));
     CHECK(eltwiseFwd_pd);
 
+    shared_ptr<memory::primitive_desc> prv_top_data_mpd(new memory::primitive_desc(eltwiseFwd_pd->dst_primitive_desc()));
+
     fwd_top_data.reset(new MKLDNNData<Dtype>(usr_top_data_mpd, prv_top_data_mpd, top[0], this));
     fwd_top_data->name = "fwd_top_data   @ " + this->layer_param_.name();
     fwd_top_data_memory = fwd_top_data->create_output_memory();
diff --git a/src/caffe/layers/mkldnn_inner_product_layer.cpp b/src/caffe/layers/mkldnn_inner_product_layer.cpp
index c9ee14b81..d2fe6cfaa 100644
--- a/src/caffe/layers/mkldnn_inner_product_layer.cpp
+++ b/src/caffe/layers/mkldnn_inner_product_layer.cpp
@@ -127,7 +127,7 @@ void MKLDNNInnerProductLayer<Dtype>::InitInnerProductFwd(const vector<Blob<Dtype
     int32_t h = this->h_;
     int32_t oc = this->N_;
     int32_t ic = this->K_/h_/w_;
-    bool has_spatial = h > 1 || w > 1;
+    bool has_spatial = (bottom[0]->shape().size() != 2);
 
     // Initialize memory descriptors (fromat = any) to create inner_product descriptor
     memory::data_type mpcsn = memory::data_type::f32;
@@ -138,6 +138,20 @@ void MKLDNNInnerProductLayer<Dtype>::InitInnerProductFwd(const vector<Blob<Dtype
     memory::dims weights_tz = (has_spatial) ? memory::dims {oc, ic, h, w} : memory::dims{oc, ic};
     memory::dims bias_tz = {oc};
 
+#ifdef DEBUG
+    LOG(INFO) << "has_spatial flag value: " << has_spatial;
+    if (has_spatial)
+    {
+        LOG(INFO) << "Dimension of bottom for MKLDNN: " << n << " " << ic << " " << h << " " << w;
+        LOG(INFO) << "Dimension of weights for MKLDNN: " << oc << " " << ic << " " << h << " " << w;
+    }
+    else
+    {
+        LOG(INFO) << "Dimension of bottom for MKLDNN: " << n << " " << ic;
+        LOG(INFO) << "Dimension of weights for MKLDNN: " << oc << " " << ic;
+    }
+#endif
+
     memory::desc init_bottom_md({bottom_tz}, mpcsn, mfmt);
     memory::desc init_top_md({top_tz}, mpcsn, mfmt);
     memory::desc init_weights_md({weights_tz}, mpcsn, mfmt);
@@ -179,8 +193,7 @@ void MKLDNNInnerProductLayer<Dtype>::InitInnerProductFwd(const vector<Blob<Dtype
     shared_ptr<MemPD> prv_fwd_bottom_data_memory_pd(new MemPD(ipFwd_pd->src_primitive_desc()));
     shared_ptr<MemPD> prv_fwd_top_data_memory_pd(new MemPD(ipFwd_pd->dst_primitive_desc()));
     shared_ptr<MemPD> prv_fwd_weights_data_memory_pd(new MemPD(ipFwd_pd->weights_primitive_desc()));
-    shared_ptr<MemPD> prv_fwd_bias_data_memory_pd(new MemPD(ipFwd_pd->bias_primitive_desc()));
-
+ 
     // Create usr memory primitive descriptors stored as class members
     engine cpu_engine = CpuEngine::Instance().get_engine();
     memory::format input_mfmt = has_spatial ? memory::format::nchw : memory::format::nc;
@@ -189,6 +202,10 @@ void MKLDNNInnerProductLayer<Dtype>::InitInnerProductFwd(const vector<Blob<Dtype
     shared_ptr<MemPD> usr_top_data_memory_pd(new MemPD({{top_tz}, mpcsn, memory::format::nc}, cpu_engine));
     memory::format weights_mfmt = has_spatial ? memory::format::oihw : memory::format::oi;
     shared_ptr<MemPD> usr_weights_data_memory_pd(new MemPD({{weights_tz}, mpcsn, weights_mfmt}, cpu_engine));
+#ifdef DEBUG
+    LOG(INFO) << "Memory format of usr_bottom_data_memory_pd: " << input_mfmt;
+    LOG(INFO) << "Memory format of usr_weights_data_memory_pd: " << weights_mfmt;
+#endif
 
     // ---  init primitive and prv_memory descriptors ----------------------
     fwd_bottom_data.reset(new MKLDNNData<Dtype>(usr_bottom_data_memory_pd, prv_fwd_bottom_data_memory_pd, bottom[0], this));
@@ -204,6 +221,7 @@ void MKLDNNInnerProductLayer<Dtype>::InitInnerProductFwd(const vector<Blob<Dtype
     fwd_weights_data_primitive = fwd_weights_data->create_input(true);
 
     if (this->bias_term_) {
+        shared_ptr<MemPD> prv_fwd_bias_data_memory_pd(new MemPD(ipFwd_pd->bias_primitive_desc()));
         fwd_bias_data.reset(new MKLDNNData<Dtype>(usr_bias_data_memory_pd, prv_fwd_bias_data_memory_pd, this->blobs_[1].get(), this));
         fwd_bias_data   ->name = "fwd_bias_data     @ " + this->layer_param_.name();
         fwd_bias_data_primitive = fwd_bias_data->create_input(true);
@@ -215,10 +233,20 @@ void MKLDNNInnerProductLayer<Dtype>::InitInnerProductFwd(const vector<Blob<Dtype
                             , *fwd_bottom_data_primitive, *fwd_weights_data_primitive
                             , *fwd_top_data_memory));
     }
-    fwd_bottom_data->set_mkldnn_primitive(ipFwd);
+    
+    //Because the inputs of inner product layer always come from user memory, so will not trigger the wrong reorder from extprv to prv
+    fwd_bottom_data->set_mkldnn_primitive(ipFwd);     //Wrong passed primitive! (TODO: Checking!)
+    //MKLDNNPrimitive<Dtype> fwd_bottom_data_primitive_transfer(fwd_bottom_data_primitive);
+    //fwd_bottom_data->set_mkldnn_primitive(fwd_bottom_data_primitive_transfer);
+
     fwd_top_data->set_mkldnn_primitive(ipFwd);
-    fwd_weights_data->set_mkldnn_primitive(ipFwd);
-    fwd_bias_data->set_mkldnn_primitive(ipFwd);
+    
+    fwd_weights_data->set_mkldnn_primitive(ipFwd);    //Wrong passed primitive! (TODO: Checking!)
+    //MKLDNNPrimitive<Dtype> fwd_weights_data_primitive_transfer(fwd_weights_data_primitive);    
+    //fwd_weights_data->set_mkldnn_primitive(fwd_weights_data_primitive_transfer);
+
+    if (this->bias_term_) 
+      fwd_bias_data->set_mkldnn_primitive(ipFwd);
 }
 
 template <typename Dtype>
@@ -226,12 +254,17 @@ void MKLDNNInnerProductLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bot
                                                 , const vector<Blob<Dtype>*>& top)
 {
     VLOG(1) << "MKLDNNInnerProductLayer<Dtype>::Forward_cpu: " << this->layer_param_.name();
+#ifdef DEBUG
+    LOG(INFO) << "MKLDNNInnerProductLayer<Dtype>::Forward_cpu: " << this->layer_param_.name();
+#endif
+
     if( ipFwd_pd == NULL)
         InitInnerProductFwd(bottom, top);
     // making reorders if needed.
     fwd_bottom_data->sync_before_read();
     fwd_weights_data->sync_before_read();
-    fwd_bias_data->sync_before_read();
+    if (this->bias_term_) 
+      fwd_bias_data->sync_before_read();
     // update top that head at prv
     fwd_top_data->sync_before_write();
 
@@ -253,7 +286,7 @@ void MKLDNNInnerProductLayer<Dtype>::InitInnerProductBwd(const vector<Blob<Dtype
     int32_t h = this->h_;
     int32_t oc = this->N_;
     int32_t ic = this->K_/h_/w_;
-    bool has_spatial = h > 1 || w > 1;
+    bool has_spatial = (bottom[0]->shape().size() != 2);
 
     // Initialize memory descriptors (format = any) to create inner_product descriptor
     memory::data_type mpcsn = memory::data_type::f32;
@@ -264,6 +297,20 @@ void MKLDNNInnerProductLayer<Dtype>::InitInnerProductBwd(const vector<Blob<Dtype
     memory::dims weights_tz = (has_spatial) ? memory::dims {oc, ic, h, w} : memory::dims{oc, ic};
     memory::dims bias_tz = {oc};
 
+#ifdef DEBUG
+    LOG(INFO) << "has_spatial flag value: " << has_spatial;
+    if (has_spatial)
+    {
+        LOG(INFO) << "Dimension of bottom for MKLDNN: " << n << " " << ic << " " << h << " " << w;
+        LOG(INFO) << "Dimension of weights for MKLDNN: " << oc << " " << ic << " " << h << " " << w;
+    }
+    else
+    {
+        LOG(INFO) << "Dimension of bottom for MKLDNN: " << n << " " << ic;
+        LOG(INFO) << "Dimension of weights for MKLDNN: " << oc << " " << ic;
+    }
+#endif
+
     memory::desc init_bottom_md({bottom_tz}, mpcsn, mfmt);
     memory::desc init_top_md({top_tz}, mpcsn, mfmt);
     memory::desc init_weights_md({weights_tz}, mpcsn, mfmt);
@@ -272,9 +319,13 @@ void MKLDNNInnerProductLayer<Dtype>::InitInnerProductBwd(const vector<Blob<Dtype
     // Initialize inner_product primitive descriptor
     shared_ptr<inner_product_backward_data::desc> ipBwdData_desc;
     shared_ptr<inner_product_backward_weights::desc> ipBwdWeights_desc;
- 
+ if (this->bias_term_)
     ipBwdWeights_desc.reset(new inner_product_backward_weights::desc(init_bottom_md, init_weights_md
                         , init_bias_md, init_top_md));
+ else
+    ipBwdWeights_desc.reset(new inner_product_backward_weights::desc(init_bottom_md, init_weights_md
+                        , init_top_md));
+    
     ipBwdData_desc.reset(new inner_product_backward_data::desc(init_bottom_md, init_weights_md, init_top_md));
 
     // ---- Determining engine to use -----------------------
@@ -310,16 +361,19 @@ void MKLDNNInnerProductLayer<Dtype>::InitInnerProductBwd(const vector<Blob<Dtype
     shared_ptr<MemPD> prv_bwdw_bottom_data_memory_pd(new MemPD(ipBwdWeights_pd->src_primitive_desc()));
     shared_ptr<MemPD> prv_bwdw_top_diff_memory_pd(new MemPD(ipBwdWeights_pd->diff_dst_primitive_desc()));
     shared_ptr<MemPD> prv_bwdw_weights_diff_memory_pd(new MemPD(ipBwdWeights_pd->diff_weights_primitive_desc()));
-    shared_ptr<MemPD> prv_bwdw_bias_diff_memory_pd(new MemPD(ipBwdWeights_pd->diff_bias_primitive_desc()));
 
     // Create usr memory primitive descriptors stored as class members
     engine cpu_engine = CpuEngine::Instance().get_engine();
-    memory::format input_mfmt = has_spatial ? memory::format::nchw : memory::format::nc;
+    memory::format input_mfmt = has_spatial ? memory::format::nchw : memory::format::nc;    
     shared_ptr<MemPD> usr_bottom_data_memory_pd(new MemPD({{bottom_tz}, mpcsn, input_mfmt}, cpu_engine));
     shared_ptr<MemPD> usr_bias_data_memory_pd(new MemPD({{bias_tz}, mpcsn, memory::format::x}, cpu_engine));
     shared_ptr<MemPD> usr_top_data_memory_pd(new MemPD({{top_tz}, mpcsn, memory::format::nc}, cpu_engine));
     memory::format weights_mfmt = has_spatial ? memory::format::oihw : memory::format::oi;
     shared_ptr<MemPD> usr_weights_data_memory_pd(new MemPD({{weights_tz}, mpcsn, weights_mfmt}, cpu_engine));
+#ifdef DEBUG
+    LOG(INFO) << "Memory format of usr_bottom_data_memory_pd: " << input_mfmt;
+    LOG(INFO) << "Memory format of usr_weights_data_memory_pd: " << weights_mfmt;
+#endif
 
     // ---  init primitive and prv_memory descriptors ----------------------
     bwdd_bottom_diff.reset(new MKLDNNDiff<Dtype>(usr_bottom_data_memory_pd, prv_bwdd_bottom_diff_memory_pd, bottom[0], this));
@@ -343,8 +397,8 @@ void MKLDNNInnerProductLayer<Dtype>::InitInnerProductBwd(const vector<Blob<Dtype
     bwdw_weights_diff->name = "bwdw_weights_diff  @ " + this->layer_param_.name();
     bwdw_weights_diff_memory = bwdw_weights_diff->create_output_memory();
 
-
     if (this->bias_term_) {
+        shared_ptr<MemPD> prv_bwdw_bias_diff_memory_pd(new MemPD(ipBwdWeights_pd->diff_bias_primitive_desc()));
         bwdw_bias_diff.reset(new MKLDNNDiff<Dtype>(usr_bias_data_memory_pd, prv_bwdw_bias_diff_memory_pd, this->blobs_[1].get(), this));
         bwdw_bias_diff   ->name = "bwdw_bias_diff     @ " + this->layer_param_.name();
         bwdw_bias_diff_memory = bwdw_bias_diff->create_output_memory();
@@ -363,13 +417,28 @@ void MKLDNNInnerProductLayer<Dtype>::InitInnerProductBwd(const vector<Blob<Dtype
                     , *bwdd_bottom_diff_memory));
 
     bwdd_bottom_diff->set_mkldnn_primitive(ipBwdData);
-    bwdd_top_diff->set_mkldnn_primitive(ipBwdData);
-    bwdd_weights_data->set_mkldnn_primitive(ipBwdData);
+    
+    bwdd_top_diff->set_mkldnn_primitive(ipBwdData);           //Wrong passed primitive! (TODO: Checking!)
+    //MKLDNNPrimitive<Dtype> bwdd_top_diff_primitive_transfer(bwdd_top_diff_primitive);
+    //bwdd_top_diff->set_mkldnn_primitive(bwdd_top_diff_primitive_transfer);
+
+    bwdd_weights_data->set_mkldnn_primitive(ipBwdData);       //Wrong passed primitive! (TODO: Checking!)
+    //MKLDNNPrimitive<Dtype> bwdd_weights_data_primitive_transfer(bwdd_weights_data_primitive);
+    //bwdd_weights_data->set_mkldnn_primitive(bwdd_weights_data_primitive_transfer);
+
+
+    bwdw_bottom_data->set_mkldnn_primitive(ipBwdWeights);     //Wrong passed primitive! (TODO: Checking!)
+    //MKLDNNPrimitive<Dtype> bwdw_bottom_data_primitive_transfer(bwdw_bottom_data_primitive);
+    //bwdw_bottom_data->set_mkldnn_primitive(bwdw_bottom_data_primitive_transfer);
+
+    bwdw_top_diff->set_mkldnn_primitive(ipBwdWeights);        //Wrong passed primitive! (TODO: Checking!)
+    //MKLDNNPrimitive<Dtype> bwdw_top_diff_primitive_transfer(bwdw_top_diff_primitive);
+    //bwdw_top_diff->set_mkldnn_primitive(bwdw_top_diff_primitive_transfer);
 
-    bwdw_bottom_data->set_mkldnn_primitive(ipBwdWeights);
-    bwdw_top_diff->set_mkldnn_primitive(ipBwdWeights);
     bwdw_weights_diff->set_mkldnn_primitive(ipBwdWeights);
-    bwdw_bias_diff->set_mkldnn_primitive(ipBwdWeights);
+
+    if (this->bias_term_)
+        bwdw_bias_diff->set_mkldnn_primitive(ipBwdWeights);
 }
 
 
@@ -380,6 +449,10 @@ void MKLDNNInnerProductLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& to
                                                 , const vector<Blob<Dtype>*>& bottom)
 {
     VLOG(1) << "MKLDNNInnerProductLayer<Dtype>::Backward_cpu: " << this->layer_param_.name();
+#ifdef DEBUG
+    LOG(INFO) << "MKLDNNInnerProductLayer<Dtype>::Backward_cpu: " << this->layer_param_.name();
+#endif
+
     if( ipBwdData_pd == NULL)
         InitInnerProductBwd(top, propagate_down, bottom);
     if (propagate_down[0]) {
@@ -390,7 +463,52 @@ void MKLDNNInnerProductLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& to
 
         PERFORMANCE_EVENT_ID_INIT(perf_id_bw_, PERFORMANCE_MKLDNN_NAME("BW"));
         PERFORMANCE_MEASUREMENT_BEGIN();
+#ifdef DEBUG
+        if (bottom[0]->prv_data() != NULL)
+        {
+            LOG(INFO) << "Debug: Bottom prv data: " << *bottom[0]->prv_data();
+        }
+        else
+        {
+            LOG(INFO) << "Debug: Bottom prv data is NULL!";
+            //LOG(INFO) << "Debug: Bottom cpu data: " << *bottom[0]->cpu_data();
+            //Chong: if don't have this LOG print, will cause: this->_cpu_ptr == cpu_ptr crash, without the fix in dropout_layer.cpp
+        }
+
+        if (top[0]->prv_diff() != NULL)
+        {
+            LOG(INFO) << "Debug: Top prv diff: " << *top[0]->prv_diff();
+        }
+        else
+        {
+            LOG(INFO) << "Debug: Top prv diff is NULL!";
+            LOG(INFO) << "Debug: Top cpu diff: " << *top[0]->cpu_diff();
+        }
+
+        if (this->blobs_[0]->prv_data() != NULL)
+        {
+            LOG(INFO) << "Debug: Weights prv data from blobs_[0]: " << *this->blobs_[0]->prv_data();
+        }
+        else
+        {
+            LOG(INFO) << "Debug: Weights prv data is NULL!";
+            LOG(INFO) << "Debug: Weights cpu data: " << *this->blobs_[0]->cpu_data();
+        }
+        //Before submit, so get_prv_ptr() always has the value
+        LOG(INFO) << "Debug: Weights prv data from get_prv_ptr: " << *bwdd_weights_data->get_prv_ptr();
+#endif
         ipBwdData.submit();
+#ifdef DEBUG
+        if (bottom[0]->prv_diff() != NULL)
+        {
+            LOG(INFO) << "Debug: Bottom prv diff: " << *bottom[0]->prv_diff();
+        }
+        else
+        {
+            LOG(INFO) << "Debug: Bottom prv diff is NULL!";
+            LOG(INFO) << "Debug: Bottom cpu diff: " << *bottom[0]->cpu_diff();
+        }
+#endif
         PERFORMANCE_MEASUREMENT_END_ID(perf_id_bw_);
     }
     if (this->param_propagate_down(0)) {
diff --git a/src/caffe/layers/mkldnn_lrn_layer.cpp b/src/caffe/layers/mkldnn_lrn_layer.cpp
index ae118de7c..48ea4e884 100644
--- a/src/caffe/layers/mkldnn_lrn_layer.cpp
+++ b/src/caffe/layers/mkldnn_lrn_layer.cpp
@@ -138,7 +138,7 @@ void MKLDNNLRNLayer<Dtype>::InitLRNFwd(const vector<Blob<Dtype>*>& bottom, const
     memory::data_type mpcsn = memory::data_type::f32;
     // ---- Initialize memory descriptors -------------
     memory::dims tz = {n, ic, ih, iw};
-    shared_ptr<memory::desc> bottom_md, top_md;
+    shared_ptr<memory::desc> top_md;
     shared_ptr<memory::primitive_desc> usr_mpd, prv_mpd;
     if (bottom_data_is_prv) {
         shared_ptr<MKLDNNMemoryDescriptor<Dtype, false> > mem_descr
@@ -266,7 +266,7 @@ void MKLDNNLRNLayer<Dtype>::InitLRNBwd(const vector<Blob<Dtype>*>& top
     bottom_diff_md = top_diff_md;
 
     // ---- Initialize LRN primitive descriptor -------------
-    lrn_backward::desc lrnBwd_desc(lrn_algorithm, *bottom_diff_md, *top_diff_md,
+    lrn_backward::desc lrnBwd_desc(lrn_algorithm, *bottom_md, *top_diff_md,
                         size_, alpha_, beta_);
     // ---- Determining engine to use -----------------------
     std::string subengines = this->layer_param_.engine();
diff --git a/src/caffe/layers/mkldnn_pooling_layer.cpp b/src/caffe/layers/mkldnn_pooling_layer.cpp
index fe1ae7d03..6bce42a1c 100644
--- a/src/caffe/layers/mkldnn_pooling_layer.cpp
+++ b/src/caffe/layers/mkldnn_pooling_layer.cpp
@@ -289,6 +289,10 @@ void MKLDNNPoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom
                                             ,const vector<Blob<Dtype>*>& top)
 {
     VLOG(1) << "MKLDNNPoolingLayer<Dtype>::Forward_cpu: " << this->layer_param_.name();
+#ifdef DEBUG
+    LOG(INFO) << "MKLDNNPoolingLayer<Dtype>::Forward_cpu: " << this->layer_param_.name();
+#endif
+
     if (NULL == poolingFwd_pd)
         InitPoolingFwd(bottom, top);
     // making reorders if needed.
@@ -422,6 +426,10 @@ void MKLDNNPoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top
                                             , const vector<Blob<Dtype>*>& bottom)
 {
     VLOG(1) << "MKLDNNPoolingLayer<Dtype>::Backward_cpu: " << this->layer_param_.name();
+#ifdef DEBUG
+    LOG(INFO) << "MKLDNNPoolingLayer<Dtype>::Backward_cpu: " << this->layer_param_.name();
+#endif
+
     if (!propagate_down[0]) {
         return;
     }
@@ -433,7 +441,38 @@ void MKLDNNPoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top
 
     PERFORMANCE_EVENT_ID_INIT(perf_id_bw_, PERFORMANCE_MKLDNN_NAME("BW"));
     PERFORMANCE_MEASUREMENT_BEGIN();
+#ifdef DEBUG
+    if (bottom[0]->prv_data() != NULL)
+    {
+        LOG(INFO) << "Debug: Bottom prv data: " << *bottom[0]->prv_data();
+    }
+    else
+    {
+        LOG(INFO) << "Debug: Bottom prv data is NULL!";
+        //LOG(INFO) << "Debug: Bottom cpu data: " << *bottom[0]->cpu_data();
+    }
+
+    if (top[0]->prv_diff() != NULL)
+    {
+        LOG(INFO) << "Debug: Top prv diff: " << *top[0]->prv_diff();
+    }
+    else
+    {
+        LOG(INFO) << "Debug: Top prv diff is NULL!";
+        //LOG(INFO) << "Debug: Top cpu diff: " << *top[0]->cpu_diff();
+    }
+#endif
     poolingBwd.submit();
+#ifdef DEBUG
+    if (bottom[0]->prv_diff() != NULL)
+    {
+        LOG(INFO) << "Debug: Bottom prv diff: " << *bottom[0]->prv_diff();
+    }
+    else
+    {
+        LOG(INFO) << "Debug: Bottom prv diff is NULL!";
+    }
+#endif
     PERFORMANCE_MEASUREMENT_END_ID(perf_id_bw_);
 }
 
diff --git a/src/caffe/layers/mkldnn_relu_layer.cpp b/src/caffe/layers/mkldnn_relu_layer.cpp
index 541684fbc..7b6cb2e06 100644
--- a/src/caffe/layers/mkldnn_relu_layer.cpp
+++ b/src/caffe/layers/mkldnn_relu_layer.cpp
@@ -140,6 +140,10 @@ void MKLDNNReLULayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom
                                         ,const vector<Blob<Dtype>*>& top)
 {
     VLOG(1) << "MKLDNNReLULayer<Dtype>::Forward_cpu: " << this->layer_param_.name();
+#ifdef DEBUG
+    LOG(INFO) << "MKLDNNReLULayer<Dtype>::Forward_cpu: " << this->layer_param_.name();
+#endif
+
     bool inplace = (bottom[0] == top[0]);
     if( reluFwd_pd == NULL)
         InitReLUFwd(bottom, top);
@@ -184,10 +188,31 @@ void MKLDNNReLULayer<Dtype>::InitReLUBwd(const vector<Blob<Dtype>*>& top
     if (top_diff_is_prv) {
       shared_ptr<MKLDNNMemoryDescriptor<Dtype, /* is_diff */ true> > mem_descr
         = get_mkldnn_prv_descriptor<Dtype, /* is_diff */ true>(top[0]);
+#ifdef DEBUG
+      memory::format bwd_prv_top_diff_mfmt = static_cast<memory::format>(mem_descr->prv_memory_pd()->desc().data.format);
+      LOG(INFO) << "MKLDNNReLULayer<Dtype>::InitReLUBwd: memory format of prv top diff is: " << bwd_prv_top_diff_mfmt;
+#endif
       top_diff_md.reset(new memory::desc(mem_descr->prv_memory_pd()->desc()));
       usr_diff_mpd = mem_descr->usr_memory_pd();
       prv_diff_mpd = mem_descr->prv_memory_pd();
     } else {
+      bool bottom_data_is_prv = (const_cast<Dtype*>(bottom[0]->prv_data()) != NULL);
+      if (bottom_data_is_prv) {
+          shared_ptr<MKLDNNMemoryDescriptor<Dtype, false> > mem_descr
+              = get_mkldnn_prv_descriptor<Dtype, false>(bottom[0]);
+#ifdef DEBUG
+          memory::format fwd_prv_bottom_data_mfmt = static_cast<memory::format>(mem_descr->prv_memory_pd()->desc().data.format);
+          LOG(INFO) << "MKLDNNReLULayer<Dtype>::InitReLUBwd: memory format of prv bottom data is: " << fwd_prv_bottom_data_mfmt;
+          LOG(INFO) << "MKLDNNReLULayer<Dtype>::InitReLUBwd: Reorder the usr top diff to the format of prv bottom data! (Performance consideration)";              
+#endif
+          prv_diff_mpd = mem_descr->prv_memory_pd();
+          //top[0]->prv_data() is empty, however top[0]->get_prv_diff_descriptor() has value.
+          //Find root cause in the mkldnn_memory: create_output_memory() and sync_before_write() functions.
+          //But that a major fix, will lead the nan in the AlexNet training.
+          //So need investigation further, however, this will fix ICL-84.
+          top[0]->set_prv_diff_descriptor(NULL);
+      }
+
       top_diff_md.reset(new memory::desc({{n, ic, ih, iw}}, mpcsn, memory::format::nchw));
       usr_diff_mpd.reset(new memory::primitive_desc(*top_diff_md, cpu_engine));
     }
@@ -236,6 +261,10 @@ void MKLDNNReLULayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top
                                           , const vector<Blob<Dtype>*>& bottom)
 {
     VLOG(1) << "MKLDNNReLULayer<Dtype>::Backward_cpu: " << this->layer_param_.name();
+#ifdef DEBUG
+    LOG(INFO) << "MKLDNNReLULayer<Dtype>::Backward_cpu: " << this->layer_param_.name();
+#endif
+
     bool inplace = (bottom[0] == top[0]);
     if (!propagate_down[0]) {
         return;
@@ -249,7 +278,36 @@ void MKLDNNReLULayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top
 
     PERFORMANCE_EVENT_ID_INIT(perf_id_bw_, PERFORMANCE_MKLDNN_NAME("BW"));
     PERFORMANCE_MEASUREMENT_BEGIN();
+#ifdef DEBUG
+    if (bottom[0]->prv_data() != NULL)
+    {
+        LOG(INFO) << "Debug: Bottom prv data: " << *bottom[0]->prv_data();
+    }
+    else
+    {
+        LOG(INFO) << "Debug: Bottom prv data is NULL!";
+    }
+
+    if (top[0]->prv_diff() != NULL)
+    {
+        LOG(INFO) << "Debug: Top prv diff: " << *top[0]->prv_diff();
+    }
+    else
+    {
+        LOG(INFO) << "Debug: Top prv diff is NULL!";
+    }
+#endif
     reluBwd.submit();
+#ifdef DEBUG
+    if (bottom[0]->prv_diff() != NULL)
+    {
+        LOG(INFO) << "Debug: Bottom prv diff: " << *bottom[0]->prv_diff();
+    }
+    else
+    {
+        LOG(INFO) << "Debug: Bottom prv diff is NULL!";
+    }
+#endif
     PERFORMANCE_MEASUREMENT_END_ID(perf_id_bw_);
 }
 
diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp
index 70908d54c..0aeab4ee4 100644
--- a/src/caffe/layers/softmax_loss_layer.cpp
+++ b/src/caffe/layers/softmax_loss_layer.cpp
@@ -132,24 +132,73 @@ void SoftmaxWithLossLayer<Dtype>::Forward_cpu(
   int dim = prob_.count() / outer_num_;
   int count = 0;
   Dtype loss = 0;
-  for (int i = 0; i < outer_num_; ++i) {
-    for (int j = 0; j < inner_num_; j++) {
-      const int label_value = static_cast<int>(label[i * inner_num_ + j]);
-      if (has_ignore_label_ && label_value == ignore_label_) {
-        continue;
+  if (bottom.size() == 3) {
+      const Dtype* weights = bottom[2]->cpu_data();
+      Dtype weighted_sum = 0;
+      Dtype weighted_sum_local = 0;
+      Dtype loss_local = 0;
+
+      for (int i = 0; i < outer_num_; ++i) {
+        weighted_sum_local = 0;
+        loss_local = 0;
+
+        #ifdef _OPENMP
+        #pragma omp parallel for reduction(+: loss_local, weighted_sum_local) if(inner_num_ > 1)
+        #endif
+        for (int j = 0; j < inner_num_; j++) {
+          const int label_value = static_cast<int>(label[i * inner_num_ + j]);
+          if (has_ignore_label_ && label_value == ignore_label_) {
+            continue;
+          }
+
+          DCHECK_GE(label_value, 0);
+          DCHECK_LT(label_value, prob_.shape(softmax_axis_));
+          Dtype p = prob_data[i * dim + label_value * inner_num_ + j];
+          loss_local += weights[i * inner_num_ + j] * log(std::max(Dtype(FLT_MIN), std::min(p, Dtype(1.0 - FLT_MIN))));
+          weighted_sum_local += weights[i * inner_num_ + j];
+        }
+
+        weighted_sum += weighted_sum_local;
+        loss -= loss_local;
+      }
+
+      top[0]->mutable_cpu_data()[0] = loss / weighted_sum;
+      if (top.size() == 2) {
+        top[1]->ShareData(prob_);
+      }
+  } else {
+      int count_local = 0;
+      Dtype loss_local = 0;
+
+      for (int i = 0; i < outer_num_; ++i) {
+        count_local = 0;
+        loss_local = 0;
+
+        #ifdef _OPENMP
+        #pragma omp parallel for reduction(+: loss_local, count_local) if(inner_num_ > 1)
+        #endif
+        for (int j = 0; j < inner_num_; j++) {
+          const int label_value = static_cast<int>(label[i * inner_num_ + j]);
+          if (has_ignore_label_ && label_value == ignore_label_) {
+            continue;
+          }
+
+          DCHECK_GE(label_value, 0);
+          DCHECK_LT(label_value, prob_.shape(softmax_axis_));
+          Dtype p = prob_data[i * dim + label_value * inner_num_ + j];
+          loss_local += log(std::max(Dtype(FLT_MIN), std::min(p, Dtype(1.0 - FLT_MIN))));
+          ++count_local;
+        }
+
+        count += count_local;
+        loss -= loss_local;
+      }
+
+      Dtype normalizer = LossLayer<Dtype>::GetNormalizer(normalization_, outer_num_, inner_num_, count);
+      top[0]->mutable_cpu_data()[0] = loss / normalizer;
+      if (top.size() == 2) {
+        top[1]->ShareData(prob_);
       }
-      DCHECK_GE(label_value, 0);
-      DCHECK_LT(label_value, prob_.shape(softmax_axis_));
-      loss -= log(std::max(prob_data[i * dim + label_value * inner_num_ + j],
-                           Dtype(FLT_MIN)));
-      ++count;
-    }
-  }
-  Dtype normalizer = LossLayer<Dtype>::GetNormalizer(
-      normalization_, outer_num_, inner_num_, count);
-  top[0]->mutable_cpu_data()[0] = loss / normalizer;
-  if (top.size() == 2) {
-    top[1]->ShareData(prob_);
   }
 }
 
@@ -161,30 +210,58 @@ void SoftmaxWithLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
                << " Layer cannot backpropagate to label inputs.";
   }
   if (propagate_down[0]) {
-    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-    const Dtype* prob_data = prob_.cpu_data();
-    caffe_copy(prob_.count(), prob_data, bottom_diff);
-    const Dtype* label = bottom[1]->cpu_data();
-    int dim = prob_.count() / outer_num_;
-    int count = 0;
-    for (int i = 0; i < outer_num_; ++i) {
-      for (int j = 0; j < inner_num_; ++j) {
-        const int label_value = static_cast<int>(label[i * inner_num_ + j]);
-        if (has_ignore_label_ && label_value == ignore_label_) {
-          for (int c = 0; c < bottom[0]->shape(softmax_axis_); ++c) {
-            bottom_diff[i * dim + c * inner_num_ + j] = 0;
+    if (bottom.size() == 3) {
+        Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+        const Dtype* prob_data = prob_.cpu_data();
+        caffe_copy(prob_.count(), prob_data, bottom_diff);
+        const Dtype* label = bottom[1]->cpu_data();
+        int dim = prob_.count() / outer_num_;
+        Dtype weight_sum = Dtype(0);
+        const Dtype* weights = bottom[2]->cpu_data();
+        for (int i = 0; i < outer_num_; ++i) {
+          for (int j = 0; j < inner_num_; ++j) {
+            const int label_value = static_cast<int>(label[i * inner_num_ + j]);
+            if (has_ignore_label_ && label_value == ignore_label_) {
+              for (int c = 0; c < bottom[0]->shape(softmax_axis_); ++c) {
+                bottom_diff[i * dim + c * inner_num_ + j] = 0;
+              }
+            } else {
+              bottom_diff[i * dim + label_value * inner_num_ + j] -= 1;
+              for (int c = 0; c < bottom[0]->shape(1); ++c) {
+                bottom_diff[i * dim + c * inner_num_ + j] *= weights[i * inner_num_ + j];
+              }
+              weight_sum += weights[i * inner_num_ + j];
+            }
           }
-        } else {
-          bottom_diff[i * dim + label_value * inner_num_ + j] -= 1;
-          ++count;
         }
-      }
+
+        Dtype loss_weight = top[0]->cpu_diff()[0] / weight_sum;
+        caffe_scal(prob_.count(), loss_weight, bottom_diff);
+    } else {
+        Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+        const Dtype* prob_data = prob_.cpu_data();
+        caffe_copy(prob_.count(), prob_data, bottom_diff);
+        const Dtype* label = bottom[1]->cpu_data();
+        int dim = prob_.count() / outer_num_;
+        int count = 0;
+        for (int i = 0; i < outer_num_; ++i) {
+          for (int j = 0; j < inner_num_; ++j) {
+            const int label_value = static_cast<int>(label[i * inner_num_ + j]);
+            if (has_ignore_label_ && label_value == ignore_label_) {
+              for (int c = 0; c < bottom[0]->shape(softmax_axis_); ++c) {
+                bottom_diff[i * dim + c * inner_num_ + j] = 0;
+              }
+            } else {
+              bottom_diff[i * dim + label_value * inner_num_ + j] -= 1;
+              ++count;
+            }
+          }
+        }
+        // Scale gradient
+        Dtype normalizer = LossLayer<Dtype>::GetNormalizer(normalization_, outer_num_, inner_num_, count);
+        Dtype loss_weight = top[0]->cpu_diff()[0] / normalizer;
+        caffe_scal(prob_.count(), loss_weight, bottom_diff);
     }
-    // Scale gradient
-    Dtype normalizer = LossLayer<Dtype>::GetNormalizer(
-        normalization_, outer_num_, inner_num_, count);
-    Dtype loss_weight = top[0]->cpu_diff()[0] / normalizer;
-    caffe_scal(prob_.count(), loss_weight, bottom_diff);
   }
 }
 
diff --git a/src/caffe/mkldnn_memory.cpp b/src/caffe/mkldnn_memory.cpp
index a14ef5084..ddad67f57 100644
--- a/src/caffe/mkldnn_memory.cpp
+++ b/src/caffe/mkldnn_memory.cpp
@@ -81,6 +81,7 @@ void MKLDNNMemoryDescriptorBase<Dtype>::create_reorder_descriptors()
     if ( *_usr_memory_pd != *_prv_memory_pd) {
         _reorder_usr2prv_pd = shared_ptr<reorder::primitive_desc>(
                 new reorder::primitive_desc(*_usr_memory_pd, *_prv_memory_pd));
+
         _reorder_prv2usr_pd = shared_ptr<reorder::primitive_desc>(
                 new reorder::primitive_desc(*_prv_memory_pd, *_usr_memory_pd));
     }
@@ -98,9 +99,17 @@ template <typename Dtype, bool is_diff>
         : MKLDNNMemoryDescriptorBase<Dtype>(usr_memory_pd, prv_memory_pd, blob, mkldnn_layer)
 {
     const Dtype* prv_ptr = is_diff ?  blob->prv_diff() : blob->prv_data();
+
     if (prv_ptr != NULL) {
         shared_ptr<MKLDNNMemoryDescriptor<Dtype, is_diff> > blob_prv_mkldnn_mem_descr = get_mkldnn_prv_descriptor<Dtype, is_diff>(blob);
+#ifdef DEBUG        
+        LOG(INFO) << "Format of blob-prv-memory-pd: " << blob_prv_mkldnn_mem_descr->prv_memory_pd()->desc().data.format;
+        LOG(INFO) << "Format of this-prv-memory-pd: " << this->prv_memory_pd()->desc().data.format;
+#endif
         if (*blob_prv_mkldnn_mem_descr->prv_memory_pd() !=  *this->prv_memory_pd()) {
+#ifdef DEBUG
+            LOG(INFO) << "Formats of blob-prv-memory-pd and this-prv-memory-pd are not equal !";
+#endif
             this->set_extprv_memory_pd(blob_prv_mkldnn_mem_descr->prv_memory_pd());
         }
     }
@@ -126,10 +135,18 @@ void MKLDNNMemoryDescriptor<Dtype, is_diff>::create_reorder_to_prv(void* cpu_ptr
 template <typename Dtype, bool is_diff>
 void MKLDNNMemoryDescriptor<Dtype, is_diff>::convert_to_prv(void* cpu_ptr)
 {
+#ifdef DEBUG
+    LOG(INFO) << "--- MKLDNNMemoryDescriptorBase<Dtype>::convert_to_prv --- " << this->name;
+#endif
     CHECK(cpu_ptr);
     CHECK_EQ(this->_cpu_ptr, cpu_ptr);
     create_reorder_to_prv(cpu_ptr);
     VLOG(1) << "--- MKLDNNMemoryDescriptorBase<Dtype>::convert_to_prv --- " << this->name;
+#ifdef DEBUG
+    LOG(INFO) << "Reorder: from usr to prv.";
+    LOG(INFO) << "Format of _usr_memory_pd: " << this->_usr_memory_pd->desc().data.format;
+    LOG(INFO) << "Format of _prv_memory_pd: " << this->_prv_memory_pd->desc().data.format;
+#endif
     PERFORMANCE_MEASUREMENT_BEGIN();
     this->_reorder_usr2prv.submit();
     PERFORMANCE_MEASUREMENT_END_STATIC("mkldnn_conversion");
@@ -157,11 +174,19 @@ void MKLDNNMemoryDescriptor<Dtype, is_diff>::create_reorder_from_prv(void* cpu_p
 template <typename Dtype, bool is_diff>
 void MKLDNNMemoryDescriptor<Dtype, is_diff>::convert_from_prv(void* cpu_ptr)
 {
+#ifdef DEBUG
+    LOG(INFO) << "--- MKLDNNMemoryDescriptorBase<Dtype>::convert_from_prv --- " << this->name;
+#endif
     CHECK(cpu_ptr);
     if(this->_reorder_prv2usr_pd == NULL)
         return;
     create_reorder_from_prv(cpu_ptr);
     VLOG(1) << "--- MKLDNNMemoryDescriptorBase<Dtype>::convert_from_prv --- " << this->name;
+#ifdef DEBUG
+    LOG(INFO) << "Reorder: from prv to usr.";
+    LOG(INFO) << "Format of _prv_memory_pd: " << this->_prv_memory_pd->desc().data.format;
+    LOG(INFO) << "Format of _usr_memory_pd: " << this->_usr_memory_pd->desc().data.format;
+#endif
     PERFORMANCE_MEASUREMENT_BEGIN();
     this->_reorder_prv2usr.submit();
     PERFORMANCE_MEASUREMENT_END_STATIC("mkldnn_conversion");
@@ -181,11 +206,26 @@ void MKLDNNMemoryDescriptor<Dtype, is_diff>::create_reorder_from_extprv(shared_p
 template <typename Dtype, bool is_diff>
 void MKLDNNMemoryDescriptor<Dtype, is_diff>::convert_from_extprv(shared_ptr<primitive> aprimitive)
 {
+#ifdef DEBUG
+    LOG(INFO) << "--- MKLDNNMemoryDescriptorBase<Dtype>::convert_from_extprv --- " << this->name;
+#endif
     CHECK(aprimitive);
     if(this->_reorder_extprv2prv_pd == NULL)
         return;
+    if (this->_extprv_memory_pd->desc().data.format == this->_prv_memory_pd->desc().data.format)
+    {
+#ifdef DEBUG
+        LOG(INFO) << "The format of _extprv_memory_pd and _prv_memory_pd is same, no need do conversion.";
+#endif
+        return;
+    }
     create_reorder_from_extprv(aprimitive);
     VLOG(1) << "--- MKLDNNMemoryDescriptorBase<Dtype>::convert_from_extprv --- " << this->name;
+#ifdef DEBUG
+    LOG(INFO) << "Reorder: from extprv to prv.";
+    LOG(INFO) << "Format of _extprv_memory_pd: " << this->_extprv_memory_pd->desc().data.format;
+    LOG(INFO) << "Format of _prv_memory_pd: " << this->_prv_memory_pd->desc().data.format;
+#endif
     PERFORMANCE_MEASUREMENT_BEGIN();
     this->_reorder_extprv2prv.submit();
     PERFORMANCE_MEASUREMENT_END_STATIC("mkldnn_conversion");
@@ -290,14 +330,22 @@ void MKLDNNMemoryDescriptor<Dtype, is_diff>::sync_before_read()
         // if blob has not prv descriptor then set it to avoid conversions on next iterations
         if (is_diff) {
             this->_blob->set_prv_diff_descriptor(this->get_shared_ptr(), false);
+            // Original:
             // below line designated to set correspondent SyncedMemory->_head to HEAD_AT_CPU
             // TODO: need to optimize
-            this->_blob->set_prv_diff_descriptor(NULL);
+            //this->_blob->set_prv_diff_descriptor(NULL);
+            // It will lead the performance drop in two aspects:
+            // 1. FWD Conv: Reorder of weights from oihw to OIhw16i16o is executed for every iteration. This should be happening only once per convolution layer including all iterations.
+            // 2. BWD Conv: Reorder of weights is happening from oihw to OIhw16o16i format, where as expected, the reorder should happen from OIhw16i16o to OIhw16o16i for better performance.
         } else {
-            this->_blob->set_prv_data_descriptor(this->get_shared_ptr(), false);
+            this->_blob->set_prv_data_descriptor(this->get_shared_ptr(), true);     //Change from false to true, suggested by Czaja, Jacek
+            // Original:
             // below line designated to set correspondent SyncedMemory->_head to HEAD_AT_CPU
             // TODO: need to optimize
-            this->_blob->set_prv_data_descriptor(NULL);
+            //this->_blob->set_prv_data_descriptor(NULL);
+            // It will lead the performance drop in two aspects:
+            // 1. FWD Conv: Reorder of weights from oihw to OIhw16i16o is executed for every iteration. This should be happening only once per convolution layer including all iterations.
+            // 2. BWD Conv: Reorder of weights is happening from oihw to OIhw16o16i format, where as expected, the reorder should happen from OIhw16i16o to OIhw16o16i for better performance.
         }
     } else {
         shared_ptr<MKLDNNMemoryDescriptor<Dtype, is_diff> > blob_prv_mkldnn_mem_descr = get_mkldnn_prv_descriptor<Dtype, is_diff>(this->_blob);
@@ -326,6 +374,16 @@ void MKLDNNMemoryDescriptor<Dtype, is_diff>::sync_before_write(bool inplace)
             this->_blob->set_prv_data_descriptor(this->get_shared_ptr(), this->conversion_needed() ? false : true);
         }
     }
+    //Fix me: this->conversion_needed() == false means diff/data is in the CPU, no need to set the prv_diff/data_descriptor
+    /*
+    if ((!inplace) && (this->conversion_needed())) {
+        if (is_diff) {
+            this->_blob->set_prv_diff_descriptor(this->get_shared_ptr(), false);
+        } else {
+            this->_blob->set_prv_data_descriptor(this->get_shared_ptr(), false);
+        }
+    }
+    */
 }
 
 template <typename Dtype, bool is_diff>
@@ -372,7 +430,7 @@ shared_ptr<primitive> MKLDNNMemoryDescriptor<Dtype, is_diff>::create_input(bool
 template <typename Dtype, bool is_diff>
 shared_ptr<memory> MKLDNNMemoryDescriptor<Dtype, is_diff>::create_output_memory(bool inplace)
 {
-    // TODO: need to iptimize code
+    // TODO: need to optimize code
     shared_ptr<memory> omem = create_output_memory(this->_blob);
     if(!inplace) {
         if(is_diff) {
@@ -381,6 +439,16 @@ shared_ptr<memory> MKLDNNMemoryDescriptor<Dtype, is_diff>::create_output_memory(
             this->_blob->set_prv_data_descriptor(this->get_shared_ptr(), this->conversion_needed() ? false : true);
         }
     }
+    /*
+    //Fix me: this->conversion_needed() == false means diff/data is in the CPU, no need to set the prv_diff/data_descriptor
+    if ((!inplace) && (this->conversion_needed())) {
+        if (is_diff) {
+            this->_blob->set_prv_diff_descriptor(this->get_shared_ptr(), false);
+        } else {
+            this->_blob->set_prv_data_descriptor(this->get_shared_ptr(), false);
+        }
+    }
+    */
     return omem;
 }
 
diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp
index 6455a2989..25e978856 100644
--- a/src/caffe/syncedmem.cpp
+++ b/src/caffe/syncedmem.cpp
@@ -92,8 +92,10 @@ inline void SyncedMemory::to_cpu() {
   case SYNCED_PRV:
   case HEAD_AT_CPU:
     if (prv_descriptor_.get()) {
-        if ( prv_descriptor_->on_to_cpu())
-            head_ = SYNCED;
+        if (prv_descriptor_->on_to_cpu())
+            //Fix: head_ = SYNCED means for caffe that CPU and GPU are in sync,
+            //as we do not have GPU setting, head_ to SYNCED will cause problems.
+            head_ = SYNCED_PRV;
     }
     break;
   case SYNCED:
diff --git a/src/caffe/test/test_hdf5_output_layer.cpp b/src/caffe/test/test_hdf5_output_layer.cpp
index 4ed88d1ed..2e49dd86a 100644
--- a/src/caffe/test/test_hdf5_output_layer.cpp
+++ b/src/caffe/test/test_hdf5_output_layer.cpp
@@ -153,6 +153,8 @@ TYPED_TEST(HDF5OutputLayerTest, TestForward) {
   status = H5Fclose(file_id);
   EXPECT_GE(status, 0) << "Failed to close HDF5 file " <<
       this->output_file_name_;
+
+  delete blob_label;
 }
 
 }  // namespace caffe
diff --git a/src/caffe/test/test_inner_product_layer.cpp b/src/caffe/test/test_inner_product_layer.cpp
index 2e42c0382..278b1308b 100644
--- a/src/caffe/test/test_inner_product_layer.cpp
+++ b/src/caffe/test/test_inner_product_layer.cpp
@@ -243,6 +243,8 @@ TYPED_TEST(InnerProductLayerTest, TestForwardTranspose) {
     for (int i = 0; i < count; ++i) {
       EXPECT_FLOAT_EQ(data[i], data_t[i]);
     }
+    delete top;
+    delete top_t;
   } else {
     LOG(ERROR) << "Skipping test due to old architecture.";
   }
@@ -425,6 +427,10 @@ TYPED_TEST(InnerProductLayerTest, TestBackwardTranspose) {
       EXPECT_NE(Dtype(0.), data[i]);
       EXPECT_FLOAT_EQ(data[i], data_t[i]);
     }
+    delete bottom_diff;
+    delete diff;
+    delete w;
+    delete top;
   } else {
     LOG(ERROR) << "Skipping test due to old architecture.";
   }
diff --git a/src/caffe/test/test_mkldnn_convolution_layer.cpp b/src/caffe/test/test_mkldnn_convolution_layer.cpp
index 14e4fd76e..551ab7bd2 100644
--- a/src/caffe/test/test_mkldnn_convolution_layer.cpp
+++ b/src/caffe/test/test_mkldnn_convolution_layer.cpp
@@ -1,1003 +1,1051 @@
-/*
-All modification made by Intel Corporation: © 2016 Intel Corporation
-
-All contributions by the University of California:
-Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
-All rights reserved.
-
-All other contributions:
-Copyright (c) 2014, 2015, the respective contributors
-All rights reserved.
-For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md
-
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-      this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    * Neither the name of Intel Corporation nor the names of its contributors
-      may be used to endorse or promote products derived from this software
-      without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#ifdef MKLDNN_SUPPORTED
-#include <vector>
-
-#include "gtest/gtest.h"
-
-#include "caffe/blob.hpp"
-#include "caffe/common.hpp"
-#include "caffe/filler.hpp"
-#include "caffe/layers/mkldnn_layers.hpp"
-
-#include "caffe/test/test_caffe_main.hpp"
-#include "caffe/test/test_gradient_check_util.hpp"
-
-namespace caffe {
-
-// Reference convolution for checking results:
-// accumulate through explicit loops over input, output, and filters.
-template <typename Dtype>
-void caffe_conv(const Blob<Dtype>* in, ConvolutionParameter* conv_param,
-    const vector<shared_ptr<Blob<Dtype> > >& weights,
-    Blob<Dtype>* out) {
-  const bool has_depth = (out->num_axes() == 5);
-  if (!has_depth) { CHECK_EQ(4, out->num_axes()); }
-  // Kernel size, stride, and pad
-  int kernel_h, kernel_w;
-  if (conv_param->has_kernel_h() || conv_param->has_kernel_w()) {
-    kernel_h = conv_param->kernel_h();
-    kernel_w = conv_param->kernel_w();
-  } else {
-    kernel_h = kernel_w = conv_param->kernel_size(0);
-  }
-  int pad_h, pad_w;
-  if (conv_param->has_pad_h() || conv_param->has_pad_w()) {
-    pad_h = conv_param->pad_h();
-    pad_w = conv_param->pad_w();
-  } else {
-    pad_h = pad_w = conv_param->pad_size() ? conv_param->pad(0) : 0;
-  }
-  int stride_h, stride_w;
-  if (conv_param->has_stride_h() || conv_param->has_stride_w()) {
-    stride_h = conv_param->stride_h();
-    stride_w = conv_param->stride_w();
-  } else {
-    stride_h = stride_w = conv_param->stride_size() ? conv_param->stride(0) : 1;
-  }
-  int dilation_h, dilation_w;
-  dilation_h = dilation_w = conv_param->dilation_size() ?
-                            conv_param->dilation(0) : 1;
-  int kernel_d, pad_d, stride_d, dilation_d;
-  if (has_depth) {
-    kernel_d = kernel_h;
-    stride_d = stride_h;
-    pad_d = pad_h;
-    dilation_d = dilation_h;
-  } else {
-    kernel_d = stride_d = dilation_d = 1;
-    pad_d = 0;
-  }
-  // Groups
-  int groups = conv_param->group();
-  int o_g = out->shape(1) / groups;
-  int k_g = in->shape(1) / groups;
-  int o_head, k_head;
-  // Convolution
-  vector<int> weight_offset(4 + has_depth);
-  vector<int> in_offset(4 + has_depth);
-  vector<int> out_offset(4 + has_depth);
-  Dtype* out_data = out->mutable_cpu_data();
-  for (int n = 0; n < out->shape(0); n++) {
-    for (int g = 0; g < groups; g++) {
-      o_head = o_g * g;
-      k_head = k_g * g;
-      for (int o = 0; o < o_g; o++) {
-        for (int k = 0; k < k_g; k++) {
-          for (int z = 0; z < (has_depth ? out->shape(2) : 1); z++) {
-            for (int y = 0; y < out->shape(2 + has_depth); y++) {
-              for (int x = 0; x < out->shape(3 + has_depth); x++) {
-                for (int r = 0; r < kernel_d; r++) {
-                  for (int p = 0; p < kernel_h; p++) {
-                    for (int q = 0; q < kernel_w; q++) {
-                      int in_z = z * stride_d - pad_d + r * dilation_d;
-                      int in_y = y * stride_h - pad_h + p * dilation_h;
-                      int in_x = x * stride_w - pad_w + q * dilation_w;
-                      if (in_z >= 0 && in_z < (has_depth ? in->shape(2) : 1)
-                          && in_y >= 0 && in_y < in->shape(2 + has_depth)
-                          && in_x >= 0 && in_x < in->shape(3 + has_depth)) {
-                        weight_offset[0] = o + o_head;
-                        weight_offset[1] = k;
-                        if (has_depth) { weight_offset[2] = r; }
-                        weight_offset[2 + has_depth] = p;
-                        weight_offset[3 + has_depth] = q;
-                        in_offset[0] = n;
-                        in_offset[1] = k + k_head;
-                        if (has_depth) { in_offset[2] = in_z; }
-                        in_offset[2 + has_depth] = in_y;
-                        in_offset[3 + has_depth] = in_x;
-                        out_offset[0] = n;
-                        out_offset[1] = o + o_head;
-                        if (has_depth) { out_offset[2] = z; }
-                        out_offset[2 + has_depth] = y;
-                        out_offset[3 + has_depth] = x;
-                        out_data[out->offset(out_offset)] +=
-                            in->data_at(in_offset)
-                            * weights[0]->data_at(weight_offset);
-                      }
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-  // Bias
-  if (conv_param->bias_term()) {
-    const Dtype* bias_data = weights[1]->cpu_data();
-    for (int n = 0; n < out->shape(0); n++) {
-      for (int o = 0; o < out->shape(1); o++) {
-        for (int z = 0; z < (has_depth ? out->shape(2) : 1); z++) {
-          for (int y = 0; y < out->shape(2 + has_depth); y++) {
-            for (int x = 0; x < out->shape(3 + has_depth); x++) {
-              out_offset[0] = n;
-              out_offset[1] = o;
-              if (has_depth) { out_offset[2] = z; }
-              out_offset[2 + has_depth] = y;
-              out_offset[3 + has_depth] = x;
-              out_data[out->offset(out_offset)] += bias_data[o];
-            }
-          }
-        }
-      }
-    }
-  }
-  //relu
-  if (conv_param->relu()){
-    for (int n = 0; n < out->shape(0); n++) {
-      for (int o = 0; o < out->shape(1); o++) {
-        for (int z = 0; z < (has_depth ? out->shape(2) : 1); z++) {
-          for (int y = 0; y < out->shape(2 + has_depth); y++) {
-            for (int x = 0; x < out->shape(3 + has_depth); x++) {
-              out_offset[0] = n;
-              out_offset[1] = o;
-              if (has_depth) { out_offset[2] = z; }
-              out_offset[2 + has_depth] = y;
-              out_offset[3 + has_depth] = x;
-              if(out_data[out->offset(out_offset)] < 0) out_data[out->offset(out_offset)] = 0;
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-template void caffe_conv(const Blob<float>* in,
-    ConvolutionParameter* conv_param,
-    const vector<shared_ptr<Blob<float> > >& weights,
-    Blob<float>* out);
-template void caffe_conv(const Blob<double>* in,
-    ConvolutionParameter* conv_param,
-    const vector<shared_ptr<Blob<double> > >& weights,
-    Blob<double>* out);
-
-template <typename TypeParam>
-class MKLDNNConvolutionLayerTest : public MultiDeviceTest<TypeParam> {
-  typedef typename TypeParam::Dtype Dtype;
-
-
-#define MB 2
-#define IC 8
-#define OC 8
-#define IH 5
-#define IW 5
-#define OH 5
-#define OW 5
-#define KH 3
-#define KW 3
-#define CS 1
-#define GR 2
-#define PD 1
-
- protected:
-  MKLDNNConvolutionLayerTest()
-      : blob_bottom_(new Blob<Dtype>(MB, IC, IH, IW)),
-        blob_bottom_2_(new Blob<Dtype>(MB, IC, IH, IW)),
-        blob_top_(new Blob<Dtype>()),
-        blob_top_2_(new Blob<Dtype>()) {}
-  virtual void SetUp() {
-    // fill the values
-    FillerParameter filler_param;
-    filler_param.set_value(1.);
-    GaussianFiller<Dtype> filler(filler_param);
-    filler.Fill(this->blob_bottom_);
-    filler.Fill(this->blob_bottom_2_);
-    blob_bottom_vec_.push_back(blob_bottom_);
-    blob_top_vec_.push_back(blob_top_);
-  }
-
-  virtual ~MKLDNNConvolutionLayerTest() {
-    delete blob_bottom_;
-    delete blob_bottom_2_;
-    delete blob_top_;
-    delete blob_top_2_;
-  }
-
-  virtual Blob<Dtype>* MakeReferenceTop(Blob<Dtype>* top) {
-    this->ref_blob_top_.reset(new Blob<Dtype>());
-    this->ref_blob_top_->ReshapeLike(*top);
-    return this->ref_blob_top_.get();
-  }
-
-  Blob<Dtype>* const blob_bottom_;
-  Blob<Dtype>* const blob_bottom_2_;
-  Blob<Dtype>* const blob_top_;
-  Blob<Dtype>* const blob_top_2_;
-  shared_ptr<Blob<Dtype> > ref_blob_top_;
-  vector<Blob<Dtype>*> blob_bottom_vec_;
-  vector<Blob<Dtype>*> blob_top_vec_;
-};
-
-typedef ::testing::Types<CPUDevice<float>
-//                        ,CPUDevice<double>
-                        > TestDtypesCPU;
-
-TYPED_TEST_CASE(MKLDNNConvolutionLayerTest, TestDtypesCPU);
-
-TYPED_TEST(MKLDNNConvolutionLayerTest, TestSetupMKLDNN) {
-  typedef typename TypeParam::Dtype Dtype;
-  LayerParameter layer_param;
-  ConvolutionParameter* convolution_param =
-      layer_param.mutable_convolution_param();
-  convolution_param->add_kernel_size(KH);
-  convolution_param->add_stride(CS);
-  convolution_param->set_num_output(OC);
-  convolution_param->add_pad(PD);
-  this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
-  this->blob_top_vec_.push_back(this->blob_top_2_);
-  shared_ptr<Layer<Dtype> > layer(
-      new MKLDNNConvolutionLayer<Dtype>(layer_param));
-  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-  EXPECT_EQ(this->blob_top_->num(), MB);
-  EXPECT_EQ(this->blob_top_->channels(), OC);
-  EXPECT_EQ(this->blob_top_->height(), OH);
-  EXPECT_EQ(this->blob_top_->width(), OW);
-  EXPECT_EQ(this->blob_top_2_->num(), MB);
-  EXPECT_EQ(this->blob_top_2_->channels(), OC );
-  EXPECT_EQ(this->blob_top_2_->height(), OH);
-  EXPECT_EQ(this->blob_top_2_->width(), OW);
-  // setting group should not change the shape
-  convolution_param->set_num_output(OC);
-  convolution_param->set_group(GR);
-  layer.reset(new MKLDNNConvolutionLayer<Dtype>(layer_param));
-  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-  EXPECT_EQ(this->blob_top_->num(), MB);
-  EXPECT_EQ(this->blob_top_->channels(), OC);
-  EXPECT_EQ(this->blob_top_->height(), OH);
-  EXPECT_EQ(this->blob_top_->width(), OW);
-  EXPECT_EQ(this->blob_top_2_->num(), MB);
-  EXPECT_EQ(this->blob_top_2_->channels(), OC);
-  EXPECT_EQ(this->blob_top_2_->height(), OH);
-  EXPECT_EQ(this->blob_top_2_->width(), OW);
-}
-
-TYPED_TEST(MKLDNNConvolutionLayerTest, DISABLED_TestSimpleConvolutionMKLDNN) {
-  typedef typename TypeParam::Dtype Dtype;
-  this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
-  this->blob_top_vec_.push_back(this->blob_top_2_);
-  LayerParameter layer_param;
-  ConvolutionParameter* convolution_param =
-      layer_param.mutable_convolution_param();
-  convolution_param->add_kernel_size(KH);
-  convolution_param->add_stride(CS);
-  convolution_param->set_num_output(OC);
-  convolution_param->add_pad(PD);
-  convolution_param->mutable_weight_filler()->set_type("gaussian");
-  convolution_param->mutable_bias_filler()->set_type("constant");
-  convolution_param->mutable_bias_filler()->set_value(0.1);
-  shared_ptr<Layer<Dtype> > layer(
-      new MKLDNNConvolutionLayer<Dtype>(layer_param));
-  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
-  // Check against reference convolution.
-  const Dtype* top_data;
-  const Dtype* ref_top_data;
-  caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(),
-      this->MakeReferenceTop(this->blob_top_));
-  top_data = this->blob_top_->cpu_data();
-  ref_top_data = this->ref_blob_top_->cpu_data();
-  for (int i = 0; i < this->blob_top_->count(); ++i) {
-    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
-  }
-
-#if 0   // TODO: improve conv so that it runs on all buffers in bottom vector
-  caffe_conv(this->blob_bottom_2_, convolution_param, layer->blobs(),
-      this->MakeReferenceTop(this->blob_top_2_));
-  top_data = this->blob_top_2_->cpu_data();
-  ref_top_data = this->ref_blob_top_->cpu_data();
-  for (int i = 0; i < this->blob_top_->count(); ++i) {
-    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
-  }
-#endif
-}
-
-TYPED_TEST(MKLDNNConvolutionLayerTest, DISABLED_TestSimpleConvolutionReLUMKLDNN) {
-  typedef typename TypeParam::Dtype Dtype;
-  this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
-  this->blob_top_vec_.push_back(this->blob_top_2_);
-  LayerParameter layer_param;
-  ConvolutionParameter* convolution_param =
-      layer_param.mutable_convolution_param();
-  convolution_param->add_kernel_size(3);
-  convolution_param->add_stride(2);
-  convolution_param->set_num_output(OC);
-  convolution_param->set_relu(true);
-  convolution_param->mutable_weight_filler()->set_type("gaussian");
-  convolution_param->mutable_bias_filler()->set_type("constant");
-  convolution_param->mutable_bias_filler()->set_value(0.1);
-  shared_ptr<Layer<Dtype> > layer(
-      new MKLDNNConvolutionLayer<Dtype>(layer_param));
-  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
-  // Check against reference convolution.
-  const Dtype* top_data;
-  const Dtype* ref_top_data;
-  caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(),
-      this->MakeReferenceTop(this->blob_top_));
-  top_data = this->blob_top_->cpu_data();
-  ref_top_data = this->ref_blob_top_->cpu_data();
-  for (int i = 0; i < this->blob_top_->count(); ++i) {
-    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
-  }
-}
-
-#if 0
-TYPED_TEST(MKLDNNConvolutionLayerTest, TestDilatedConvolutionMKLDNN) {
-  typedef typename TypeParam::Dtype Dtype;
-  vector<int> bottom_shape;
-  bottom_shape.push_back(2);
-  bottom_shape.push_back(3);
-  bottom_shape.push_back(8);
-  bottom_shape.push_back(7);
-  this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
-  this->blob_top_vec_.push_back(this->blob_top_2_);
-  for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) {
-    this->blob_bottom_vec_[i]->Reshape(bottom_shape);
-  }
-  LayerParameter layer_param;
-  ConvolutionParameter* convolution_param =
-      layer_param.mutable_convolution_param();
-  convolution_param->add_kernel_size(3);
-  convolution_param->add_dilation(2);
-  convolution_param->set_num_output(4);
-  convolution_param->mutable_weight_filler()->set_type("gaussian");
-  convolution_param->mutable_bias_filler()->set_type("constant");
-  convolution_param->mutable_bias_filler()->set_value(0.1);
-  shared_ptr<Layer<Dtype> > layer(
-      new MKLDNNConvolutionLayer<Dtype>(layer_param));
-  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
-  // Check against reference convolution.
-  const Dtype* top_data;
-  const Dtype* ref_top_data;
-  caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(),
-             this->MakeReferenceTop(this->blob_top_));
-  top_data = this->blob_top_->cpu_data();
-  ref_top_data = this->ref_blob_top_->cpu_data();
-  for (int i = 0; i < this->blob_top_->count(); ++i) {
-    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
-  }
-#if 0   // TODO: improve conv so that it runs on all buffers in bottom vector
-  caffe_conv(this->blob_bottom_2_, convolution_param, layer->blobs(),
-             this->MakeReferenceTop(this->blob_top_2_));
-  top_data = this->blob_top_2_->cpu_data();
-  ref_top_data = this->ref_blob_top_->cpu_data();
-  for (int i = 0; i < this->blob_top_->count(); ++i) {
-    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
-  }
-#endif
-}
-#endif
-
-#if 0
-TYPED_TEST(MKLDNNConvolutionLayerTest, Test0DConvolutionMKLDNN) {
-  typedef typename TypeParam::Dtype Dtype;
-  LayerParameter layer_param;
-  ConvolutionParameter* convolution_param =
-      layer_param.mutable_convolution_param();
-  const int kNumOutput = 3;
-  convolution_param->set_num_output(kNumOutput);
-  convolution_param->set_axis(3);
-  convolution_param->mutable_weight_filler()->set_type("gaussian");
-  convolution_param->mutable_bias_filler()->set_type("gaussian");
-  shared_ptr<Layer<Dtype> > layer(
-      new MKLDNNConvolutionLayer<Dtype>(layer_param));
-  vector<int> top_shape = this->blob_bottom_->shape();
-  top_shape[3] = kNumOutput;
-  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-  EXPECT_EQ(top_shape, this->blob_top_->shape());
-  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
-  // Check against reference convolution.
-  vector<int> weight_offset(2);
-  const Blob<Dtype>* weight = layer->blobs()[0].get();
-  const Blob<Dtype>* bias = layer->blobs()[1].get();
-  const int num = this->blob_top_->count(3);
-  const int dim = this->blob_top_->shape(3);
-  const int bottom_dim = this->blob_bottom_->shape(3);
-  for (int n = 0; n < num; ++n) {
-    for (int d = 0; d < dim; ++d) {
-      weight_offset[0] = d;
-      Dtype value = bias->cpu_data()[d];
-      for (int bottom_d = 0; bottom_d < bottom_dim; ++bottom_d) {
-        weight_offset[1] = bottom_d;
-        value += weight->data_at(weight_offset) *
-                 this->blob_bottom_->cpu_data()[n * bottom_dim + bottom_d];
-      }
-      EXPECT_NEAR(value, this->blob_top_->cpu_data()[n * dim + d], 1e-4);
-    }
-  }
-}
-#endif
-
-#if 0
-TYPED_TEST(MKLDNNConvolutionLayerTest, TestSimple3DConvolution) {
-  typedef typename TypeParam::Dtype Dtype;
-  this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
-  this->blob_top_vec_.push_back(this->blob_top_2_);
-  vector<int> bottom_shape(5);
-  bottom_shape[0] = this->blob_bottom_vec_[0]->shape(0);
-  bottom_shape[1] = this->blob_bottom_vec_[0]->shape(1);
-  bottom_shape[2] = 5;
-  bottom_shape[3] = this->blob_bottom_vec_[0]->shape(2);
-  bottom_shape[4] = this->blob_bottom_vec_[0]->shape(3);
-  FillerParameter filler_param;
-  GaussianFiller<Dtype> filler(filler_param);
-  for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) {
-    this->blob_bottom_vec_[i]->Reshape(bottom_shape);
-    filler.Fill(this->blob_bottom_vec_[i]);
-  }
-  LayerParameter layer_param;
-  ConvolutionParameter* convolution_param =
-      layer_param.mutable_convolution_param();
-  convolution_param->add_kernel_size(3);
-  convolution_param->add_stride(2);
-  convolution_param->set_num_output(4);
-  convolution_param->mutable_weight_filler()->set_type("gaussian");
-  convolution_param->mutable_bias_filler()->set_type("gaussian");
-  shared_ptr<Layer<Dtype> > layer(
-      new MKLDNNConvolutionLayer<Dtype>(layer_param));
-  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
-  // Check against reference convolution.
-  const Dtype* top_data;
-  const Dtype* ref_top_data;
-  caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(),
-      this->MakeReferenceTop(this->blob_top_));
-  top_data = this->blob_top_->cpu_data();
-  ref_top_data = this->ref_blob_top_->cpu_data();
-  for (int i = 0; i < this->blob_top_->count(); ++i) {
-    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
-  }
-
-#if 0   // TODO: improve conv so that it runs on all buffers in bottom vector
-  caffe_conv(this->blob_bottom_2_, convolution_param, layer->blobs(),
-      this->MakeReferenceTop(this->blob_top_2_));
-  top_data = this->blob_top_2_->cpu_data();
-  ref_top_data = this->ref_blob_top_->cpu_data();
-  for (int i = 0; i < this->blob_top_->count(); ++i) {
-    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
-  }
-#endif
-}
-#endif
-
-#if 0
-TYPED_TEST(MKLDNNConvolutionLayerTest, TestDilated3DConvolution) {
-  typedef typename TypeParam::Dtype Dtype;
-  this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
-  this->blob_top_vec_.push_back(this->blob_top_2_);
-  vector<int> bottom_shape(5);
-  bottom_shape[0] = this->blob_bottom_vec_[0]->shape(0);
-  bottom_shape[1] = this->blob_bottom_vec_[0]->shape(1);
-  bottom_shape[2] = 6;
-  bottom_shape[3] = 7;
-  bottom_shape[4] = 8;
-  FillerParameter filler_param;
-  GaussianFiller<Dtype> filler(filler_param);
-  for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) {
-    this->blob_bottom_vec_[i]->Reshape(bottom_shape);
-    filler.Fill(this->blob_bottom_vec_[i]);
-  }
-  LayerParameter layer_param;
-  ConvolutionParameter* convolution_param =
-      layer_param.mutable_convolution_param();
-  convolution_param->add_kernel_size(3);
-  convolution_param->add_dilation(2);
-  convolution_param->set_num_output(4);
-  convolution_param->mutable_weight_filler()->set_type("gaussian");
-  convolution_param->mutable_bias_filler()->set_type("gaussian");
-  shared_ptr<Layer<Dtype> > layer(
-      new MKLDNNConvolutionLayer<Dtype>(layer_param));
-  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
-  // Check against reference convolution.
-  const Dtype* top_data;
-  const Dtype* ref_top_data;
-  caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(),
-             this->MakeReferenceTop(this->blob_top_));
-  top_data = this->blob_top_->cpu_data();
-  ref_top_data = this->ref_blob_top_->cpu_data();
-  for (int i = 0; i < this->blob_top_->count(); ++i) {
-    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
-  }
-  caffe_conv(this->blob_bottom_2_, convolution_param, layer->blobs(),
-             this->MakeReferenceTop(this->blob_top_2_));
-  top_data = this->blob_top_2_->cpu_data();
-  ref_top_data = this->ref_blob_top_->cpu_data();
-  for (int i = 0; i < this->blob_top_->count(); ++i) {
-    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
-  }
-}
-#endif
-
-TYPED_TEST(MKLDNNConvolutionLayerTest, DISABLED_Test1x1Convolution) {
-  typedef typename TypeParam::Dtype Dtype;
-  LayerParameter layer_param;
-  ConvolutionParameter* convolution_param =
-      layer_param.mutable_convolution_param();
-  convolution_param->add_kernel_size(1);
-  convolution_param->add_stride(1);
-  convolution_param->set_num_output(OC);
-  convolution_param->mutable_weight_filler()->set_type("gaussian");
-  convolution_param->mutable_bias_filler()->set_type("constant");
-  convolution_param->mutable_bias_filler()->set_value(0.1);
-  shared_ptr<Layer<Dtype> > layer(
-      new MKLDNNConvolutionLayer<Dtype>(layer_param));
-  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
-  // Check against reference convolution.
-  const Dtype* top_data;
-  const Dtype* ref_top_data;
-  caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(),
-      this->MakeReferenceTop(this->blob_top_));
-  top_data = this->blob_top_->cpu_data();
-  ref_top_data = this->ref_blob_top_->cpu_data();
-  for (int i = 0; i < this->blob_top_->count(); ++i) {
-    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
-  }
-}
-
-TYPED_TEST(MKLDNNConvolutionLayerTest, DISABLED_Test1x1ConvolutionReLU) {
-  typedef typename TypeParam::Dtype Dtype;
-  LayerParameter layer_param;
-  ConvolutionParameter* convolution_param =
-      layer_param.mutable_convolution_param();
-  convolution_param->add_kernel_size(1);
-  convolution_param->add_stride(1);
-  convolution_param->set_num_output(OC);
-  convolution_param->set_relu(true);
-  convolution_param->mutable_weight_filler()->set_type("gaussian");
-  convolution_param->mutable_bias_filler()->set_type("constant");
-  convolution_param->mutable_bias_filler()->set_value(0.1);
-  shared_ptr<Layer<Dtype> > layer(
-      new MKLDNNConvolutionLayer<Dtype>(layer_param));
-  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
-  // Check against reference convolution.
-  const Dtype* top_data;
-  const Dtype* ref_top_data;
-  caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(),
-      this->MakeReferenceTop(this->blob_top_));
-  top_data = this->blob_top_->cpu_data();
-  ref_top_data = this->ref_blob_top_->cpu_data();
-  for (int i = 0; i < this->blob_top_->count(); ++i) {
-    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
-  }
-}
-
-TYPED_TEST(MKLDNNConvolutionLayerTest, DISABLED_TestSimpleConvolutionGroup) {
-  typedef typename TypeParam::Dtype Dtype;
-  LayerParameter layer_param;
-  ConvolutionParameter* convolution_param =
-      layer_param.mutable_convolution_param();
-  convolution_param->add_kernel_size(KH);
-  convolution_param->add_stride(CS);
-  convolution_param->set_num_output(OC);
-  convolution_param->set_group(GR);
-  convolution_param->add_pad(PD);
-  convolution_param->mutable_weight_filler()->set_type("gaussian");
-  convolution_param->mutable_bias_filler()->set_type("constant");
-  convolution_param->mutable_bias_filler()->set_value(0.1);
-  shared_ptr<Layer<Dtype> > layer(
-      new MKLDNNConvolutionLayer<Dtype>(layer_param));
-  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
-  // Check against reference convolution.
-  const Dtype* top_data;
-  const Dtype* ref_top_data;
-  caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(),
-      this->MakeReferenceTop(this->blob_top_));
-  top_data = this->blob_top_->cpu_data();
-  ref_top_data = this->ref_blob_top_->cpu_data();
-  for (int i = 0; i < this->blob_top_->count(); ++i) {
-    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
-  }
-}
-
-TYPED_TEST(MKLDNNConvolutionLayerTest, DISABLED_TestSimpleConvolutionReLUGroup) {
-  typedef typename TypeParam::Dtype Dtype;
-  LayerParameter layer_param;
-  ConvolutionParameter* convolution_param =
-      layer_param.mutable_convolution_param();
-  convolution_param->add_kernel_size(3);
-  convolution_param->add_stride(2);
-  convolution_param->set_num_output(OC);
-  convolution_param->set_relu(true);
-  convolution_param->set_group(GR);
-  convolution_param->mutable_weight_filler()->set_type("gaussian");
-  convolution_param->mutable_bias_filler()->set_type("constant");
-  convolution_param->mutable_bias_filler()->set_value(0.1);
-  shared_ptr<Layer<Dtype> > layer(
-      new MKLDNNConvolutionLayer<Dtype>(layer_param));
-  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
-  // Check against reference convolution.
-  const Dtype* top_data;
-  const Dtype* ref_top_data;
-  caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(),
-      this->MakeReferenceTop(this->blob_top_));
-  top_data = this->blob_top_->cpu_data();
-  ref_top_data = this->ref_blob_top_->cpu_data();
-  for (int i = 0; i < this->blob_top_->count(); ++i) {
-    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
-  }
-}
-
-#if 0
-TYPED_TEST(MKLDNNConvolutionLayerTest, TestSobelConvolution) {
-  // Test separable convolution by computing the Sobel operator
-  // as a single filter then comparing the result
-  // as the convolution of two rectangular filters.
-  typedef typename TypeParam::Dtype Dtype;
-  // Fill bottoms with identical Gaussian noise.
-  shared_ptr<GaussianFiller<Dtype> > filler;
-  FillerParameter filler_param;
-  filler_param.set_value(1.);
-  filler.reset(new GaussianFiller<Dtype>(filler_param));
-  filler->Fill(this->blob_bottom_);
-  this->blob_bottom_2_->CopyFrom(*this->blob_bottom_);
-  // Compute Sobel G_x operator as 3 x 3 convolution.
-  LayerParameter layer_param;
-  ConvolutionParameter* convolution_param =
-      layer_param.mutable_convolution_param();
-  convolution_param->add_kernel_size(3);
-  convolution_param->add_stride(2);
-  convolution_param->set_num_output(1);
-  convolution_param->set_bias_term(false);
-  shared_ptr<Layer<Dtype> > layer(
-      new MKLDNNConvolutionLayer<Dtype>(layer_param));
-  layer->blobs().resize(1);
-  layer->blobs()[0].reset(new Blob<Dtype>(1, 3, 3, 3));
-  Dtype* weights = layer->blobs()[0]->mutable_cpu_data();
-  for (int c = 0; c < 3; ++c) {
-    int i = c * 9;  // 3 x 3 filter
-    weights[i +  0] = -1;
-    weights[i +  1] =  0;
-    weights[i +  2] =  1;
-    weights[i +  3] = -2;
-    weights[i +  4] =  0;
-    weights[i +  5] =  2;
-    weights[i +  6] = -1;
-    weights[i +  7] =  0;
-    weights[i +  8] =  1;
-  }
-  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
-  // Compute Sobel G_x operator as separable 3 x 1 and 1 x 3 convolutions.
-  // (1) the [1 2 1] column filter
-  vector<Blob<Dtype>*> sep_blob_bottom_vec;
-  vector<Blob<Dtype>*> sep_blob_top_vec;
-  shared_ptr<Blob<Dtype> > blob_sep(new Blob<Dtype>());
-  sep_blob_bottom_vec.push_back(this->blob_bottom_2_);
-  sep_blob_top_vec.push_back(this->blob_top_2_);
-  convolution_param->clear_kernel_size();
-  convolution_param->clear_stride();
-  convolution_param->set_kernel_h(3);
-  convolution_param->set_kernel_w(1);
-  convolution_param->set_stride_h(2);
-  convolution_param->set_stride_w(1);
-  convolution_param->set_num_output(1);
-  convolution_param->set_bias_term(false);
-  layer.reset(new MKLDNNConvolutionLayer<Dtype>(layer_param));
-  layer->blobs().resize(1);
-  layer->blobs()[0].reset(new Blob<Dtype>(1, 3, 3, 1));
-  Dtype* weights_1 = layer->blobs()[0]->mutable_cpu_data();
-  for (int c = 0; c < 3; ++c) {
-    int i = c * 3;  // 3 x 1 filter
-    weights_1[i +  0] = 1;
-    weights_1[i +  1] = 2;
-    weights_1[i +  2] = 1;
-  }
-  layer->SetUp(sep_blob_bottom_vec, sep_blob_top_vec);
-  layer->Forward(sep_blob_bottom_vec, sep_blob_top_vec);
-  // (2) the [-1 0 1] row filter
-  blob_sep->CopyFrom(*this->blob_top_2_, false, true);
-  sep_blob_bottom_vec.clear();
-  sep_blob_bottom_vec.push_back(blob_sep.get());
-  convolution_param->set_kernel_h(1);
-  convolution_param->set_kernel_w(3);
-  convolution_param->set_stride_h(1);
-  convolution_param->set_stride_w(2);
-  convolution_param->set_num_output(1);
-  convolution_param->set_bias_term(false);
-  layer.reset(new MKLDNNConvolutionLayer<Dtype>(layer_param));
-  layer->blobs().resize(1);
-  layer->blobs()[0].reset(new Blob<Dtype>(1, 1, 1, 3));
-  Dtype* weights_2 = layer->blobs()[0]->mutable_cpu_data();
-  weights_2[0] = -1;
-  weights_2[1] =  0;
-  weights_2[2] =  1;
-  layer->SetUp(sep_blob_bottom_vec, sep_blob_top_vec);
-  layer->Forward(sep_blob_bottom_vec, sep_blob_top_vec);
-  // Test equivalence of full and separable filters.
-  const Dtype* top_data = this->blob_top_->cpu_data();
-  const Dtype* sep_top_data = this->blob_top_2_->cpu_data();
-  for (int i = 0; i < this->blob_top_->count(); ++i) {
-    EXPECT_NEAR(top_data[i], sep_top_data[i], 1e-4);
-  }
-}
-#endif
-
-#if 0
-TYPED_TEST(MKLDNNConvolutionLayerTest, TestNDAgainst2D) {
-  typedef typename TypeParam::Dtype Dtype;
-  const int kernel_h = 11;
-  const int kernel_w = 13;
-  vector<int> bottom_shape(4);
-  bottom_shape[0] = 15;
-  bottom_shape[1] = 18;
-  bottom_shape[2] = kernel_h * 2;
-  bottom_shape[3] = kernel_w * 2;
-  FillerParameter filler_param;
-  GaussianFiller<Dtype> filler(filler_param);
-  for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) {
-    this->blob_bottom_vec_[i]->Reshape(bottom_shape);
-    filler.Fill(this->blob_bottom_vec_[i]);
-  }
-  LayerParameter layer_param;
-  ConvolutionParameter* convolution_param =
-      layer_param.mutable_convolution_param();
-  convolution_param->set_num_output(12);
-  convolution_param->set_bias_term(false);
-  convolution_param->set_group(6);
-  convolution_param->set_kernel_h(kernel_h);
-  convolution_param->set_kernel_w(kernel_w);
-  convolution_param->mutable_weight_filler()->set_type("gaussian");
-  Blob<Dtype> weights;
-  Blob<Dtype> top_diff;
-  // Shape and fill weights and top_diff.
-  bool copy_diff;
-  bool reshape;
-  {
-    MKLDNNConvolutionLayer<Dtype> layer(layer_param);
-    layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-    top_diff.ReshapeLike(*this->blob_top_);
-    filler.Fill(&top_diff);
-    ASSERT_EQ(1, layer.blobs().size());
-    copy_diff = false; reshape = true;
-    weights.CopyFrom(*layer.blobs()[0], copy_diff, reshape);
-  }
-  vector<bool> propagate_down(1, true);
-  Blob<Dtype> result_2d;
-  Blob<Dtype> backward_result_2d;
-  Blob<Dtype> backward_weight_result_2d;
-  // Test with 2D im2col
-  {
-    caffe_set(this->blob_top_->count(), Dtype(0),
-              this->blob_top_->mutable_cpu_data());
-    caffe_set(this->blob_bottom_->count(), Dtype(0),
-              this->blob_bottom_->mutable_cpu_diff());
-    caffe_set(weights.count(), Dtype(0), weights.mutable_cpu_diff());
-    // Do SetUp and Forward; save Forward result in result_2d.
-    convolution_param->set_force_nd_im2col(false);
-    MKLDNNConvolutionLayer<Dtype> layer_2d(layer_param);
-    layer_2d.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-    ASSERT_EQ(1, layer_2d.blobs().size());
-    copy_diff = false; reshape = false;
-    layer_2d.blobs()[0]->CopyFrom(weights, copy_diff, reshape);
-    layer_2d.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
-    copy_diff = false; reshape = true;
-    result_2d.CopyFrom(*this->blob_top_, copy_diff, reshape);
-    // Copy pre-generated top diff into actual top diff;
-    // do Backward and save result in backward_result_2d.
-    ASSERT_EQ(this->blob_top_->shape(), top_diff.shape());
-    caffe_copy(top_diff.count(), top_diff.cpu_data(),
-               this->blob_top_->mutable_cpu_diff());
-    layer_2d.Backward(this->blob_top_vec_, propagate_down,
-                      this->blob_bottom_vec_);
-    copy_diff = true; reshape = true;
-    backward_result_2d.CopyFrom(*this->blob_bottom_, copy_diff, reshape);
-    backward_weight_result_2d.CopyFrom(weights, copy_diff, reshape);
-  }
-  Blob<Dtype> result_nd;
-  Blob<Dtype> backward_result_nd;
-  Blob<Dtype> backward_weight_result_nd;
-  // Test with ND im2col
-  {
-    caffe_set(this->blob_top_->count(), Dtype(0),
-              this->blob_top_->mutable_cpu_data());
-    caffe_set(this->blob_bottom_->count(), Dtype(0),
-              this->blob_bottom_->mutable_cpu_diff());
-    caffe_set(weights.count(), Dtype(0), weights.mutable_cpu_diff());
-    // Do SetUp and Forward; save Forward result in result_nd.
-    convolution_param->set_force_nd_im2col(true);
-    MKLDNNConvolutionLayer<Dtype> layer_nd(layer_param);
-    layer_nd.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-    ASSERT_EQ(1, layer_nd.blobs().size());
-    copy_diff = false; reshape = false;
-    layer_nd.blobs()[0]->CopyFrom(weights, copy_diff, reshape);
-    layer_nd.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
-    copy_diff = false; reshape = true;
-    result_nd.CopyFrom(*this->blob_top_, copy_diff, reshape);
-    // Copy pre-generated top diff into actual top diff;
-    // do Backward and save result in backward_result_nd.
-    ASSERT_EQ(this->blob_top_->shape(), top_diff.shape());
-    caffe_copy(top_diff.count(), top_diff.cpu_data(),
-               this->blob_top_->mutable_cpu_diff());
-    layer_nd.Backward(this->blob_top_vec_, propagate_down,
-                      this->blob_bottom_vec_);
-    copy_diff = true; reshape = true;
-    backward_result_nd.CopyFrom(*this->blob_bottom_, copy_diff, reshape);
-    backward_weight_result_nd.CopyFrom(weights, copy_diff, reshape);
-  }
-  ASSERT_EQ(result_nd.count(), result_2d.count());
-  for (int i = 0; i < result_2d.count(); ++i)  {
-    EXPECT_EQ(result_2d.cpu_data()[i], result_nd.cpu_data()[i]);
-  }
-  ASSERT_EQ(backward_result_nd.count(), backward_result_2d.count());
-  for (int i = 0; i < backward_result_2d.count(); ++i) {
-    EXPECT_EQ(backward_result_2d.cpu_diff()[i],
-              backward_result_nd.cpu_diff()[i]);
-  }
-  ASSERT_EQ(backward_weight_result_nd.count(),
-            backward_weight_result_2d.count());
-  for (int i = 0; i < backward_weight_result_2d.count(); ++i) {
-    EXPECT_EQ(backward_weight_result_2d.cpu_diff()[i],
-              backward_weight_result_nd.cpu_diff()[i]);
-  }
-}
-#endif
-
-TYPED_TEST(MKLDNNConvolutionLayerTest, DISABLED_TestGradient) {
-  typedef typename TypeParam::Dtype Dtype;
-  LayerParameter layer_param;
-  ConvolutionParameter* convolution_param =
-      layer_param.mutable_convolution_param();
-
-// TODO: improve conv so that it runs on all buffers in bottom vector
-  this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
-  this->blob_top_vec_.push_back(this->blob_top_2_);
-  convolution_param->add_kernel_size(KH);
-  convolution_param->add_stride(CS);
-  convolution_param->set_num_output(OC);
-  convolution_param->add_pad(PD);
-  convolution_param->mutable_weight_filler()->set_type("gaussian");
-  convolution_param->mutable_bias_filler()->set_type("gaussian");
-  MKLDNNConvolutionLayer<Dtype> layer(layer_param);
-  GradientChecker<Dtype> checker(1e-2, 1e-3);
-  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
-      this->blob_top_vec_);
-}
-
-#if 0
-TYPED_TEST(MKLDNNConvolutionLayerTest, TestDilatedGradient) {
-  typedef typename TypeParam::Dtype Dtype;
-  LayerParameter layer_param;
-  ConvolutionParameter* convolution_param =
-      layer_param.mutable_convolution_param();
-  vector<int> bottom_shape;
-  bottom_shape.push_back(2);
-  bottom_shape.push_back(3);
-  bottom_shape.push_back(5);
-  bottom_shape.push_back(6);
-  for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) {
-    this->blob_bottom_vec_[i]->Reshape(bottom_shape);
-  }
-  convolution_param->add_kernel_size(3);
-  convolution_param->add_dilation(2);
-  convolution_param->set_num_output(2);
-  convolution_param->mutable_weight_filler()->set_type("gaussian");
-  convolution_param->mutable_bias_filler()->set_type("gaussian");
-  MKLDNNConvolutionLayer<Dtype> layer(layer_param);
-  GradientChecker<Dtype> checker(1e-2, 1e-3);
-  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
-                                  this->blob_top_vec_);
-}
-#endif
-
-#if 0
-TYPED_TEST(MKLDNNConvolutionLayerTest, TestGradient3D) {
-  typedef typename TypeParam::Dtype Dtype;
-  LayerParameter layer_param;
-  ConvolutionParameter* convolution_param =
-      layer_param.mutable_convolution_param();
-  vector<int> bottom_shape(5);
-  bottom_shape[0] = this->blob_bottom_vec_[0]->shape(0);
-  bottom_shape[1] = this->blob_bottom_vec_[0]->shape(1);
-  bottom_shape[2] = 5;
-  bottom_shape[3] = this->blob_bottom_vec_[0]->shape(2);
-  bottom_shape[4] = this->blob_bottom_vec_[0]->shape(3);
-  FillerParameter filler_param;
-  GaussianFiller<Dtype> filler(filler_param);
-  for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) {
-    this->blob_bottom_vec_[i]->Reshape(bottom_shape);
-    filler.Fill(this->blob_bottom_vec_[i]);
-  }
-  convolution_param->add_kernel_size(3);
-  convolution_param->add_stride(2);
-  convolution_param->set_num_output(2);
-  convolution_param->mutable_weight_filler()->set_type("gaussian");
-  convolution_param->mutable_bias_filler()->set_type("gaussian");
-  MKLDNNConvolutionLayer<Dtype> layer(layer_param);
-  GradientChecker<Dtype> checker(1e-2, 1e-3);
-  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
-      this->blob_top_vec_);
-}
-#endif
-
-TYPED_TEST(MKLDNNConvolutionLayerTest, DISABLED_Test1x1Gradient) {
-  typedef typename TypeParam::Dtype Dtype;
-  LayerParameter layer_param;
-  ConvolutionParameter* convolution_param =
-      layer_param.mutable_convolution_param();
-  this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
-  this->blob_top_vec_.push_back(this->blob_top_2_);
-  convolution_param->add_kernel_size(1);
-  convolution_param->add_stride(1);
-  convolution_param->set_num_output(2);
-  convolution_param->mutable_weight_filler()->set_type("gaussian");
-  convolution_param->mutable_bias_filler()->set_type("gaussian");
-  MKLDNNConvolutionLayer<Dtype> layer(layer_param);
-  GradientChecker<Dtype> checker(1e-2, 1e-3);
-  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
-      this->blob_top_vec_);
-}
-
-TYPED_TEST(MKLDNNConvolutionLayerTest, DISABLED_TestGradientGroup) {
-  typedef typename TypeParam::Dtype Dtype;
-  LayerParameter layer_param;
-  ConvolutionParameter* convolution_param =
-      layer_param.mutable_convolution_param();
-  convolution_param->add_kernel_size(3);
-  convolution_param->add_stride(2);
-  convolution_param->set_num_output(2);
-  convolution_param->set_group(GR);
-  convolution_param->mutable_weight_filler()->set_type("gaussian");
-  convolution_param->mutable_bias_filler()->set_type("gaussian");
-  MKLDNNConvolutionLayer<Dtype> layer(layer_param);
-  GradientChecker<Dtype> checker(1e-2, 1e-3);
-  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
-      this->blob_top_vec_);
-}
-
-}  // namespace caffe
-#endif  // #ifdef MKLDNN_SUPPORTED
+/*
+All modification made by Intel Corporation: © 2016 Intel Corporation
+
+All contributions by the University of California:
+Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
+All rights reserved.
+
+All other contributions:
+Copyright (c) 2014, 2015, the respective contributors
+All rights reserved.
+For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md
+
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef MKLDNN_SUPPORTED
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layers/mkldnn_layers.hpp"
+
+#include "caffe/test/test_caffe_main.hpp"
+#include "caffe/test/test_gradient_check_util.hpp"
+
+namespace caffe {
+
+// Reference convolution for checking results:
+// accumulate through explicit loops over input, output, and filters.
+template <typename Dtype>
+void caffe_conv(const Blob<Dtype>* in, ConvolutionParameter* conv_param,
+    const vector<shared_ptr<Blob<Dtype> > >& weights,
+    Blob<Dtype>* out) {
+  const bool has_depth = (out->num_axes() == 5);
+  if (!has_depth) { CHECK_EQ(4, out->num_axes()); }
+  // Kernel size, stride, and pad
+  int kernel_h, kernel_w;
+  if (conv_param->has_kernel_h() || conv_param->has_kernel_w()) {
+    kernel_h = conv_param->kernel_h();
+    kernel_w = conv_param->kernel_w();
+  } else {
+    kernel_h = kernel_w = conv_param->kernel_size(0);
+  }
+  int pad_h, pad_w;
+  if (conv_param->has_pad_h() || conv_param->has_pad_w()) {
+    pad_h = conv_param->pad_h();
+    pad_w = conv_param->pad_w();
+  } else {
+    pad_h = pad_w = conv_param->pad_size() ? conv_param->pad(0) : 0;
+  }
+  int stride_h, stride_w;
+  if (conv_param->has_stride_h() || conv_param->has_stride_w()) {
+    stride_h = conv_param->stride_h();
+    stride_w = conv_param->stride_w();
+  } else {
+    stride_h = stride_w = conv_param->stride_size() ? conv_param->stride(0) : 1;
+  }
+  int dilation_h, dilation_w;
+  dilation_h = dilation_w = conv_param->dilation_size() ?
+                            conv_param->dilation(0) : 1;
+  int kernel_d, pad_d, stride_d, dilation_d;
+  if (has_depth) {
+    kernel_d = kernel_h;
+    stride_d = stride_h;
+    pad_d = pad_h;
+    dilation_d = dilation_h;
+  } else {
+    kernel_d = stride_d = dilation_d = 1;
+    pad_d = 0;
+  }
+  // Groups
+  int groups = conv_param->group();
+  int o_g = out->shape(1) / groups;
+  int k_g = in->shape(1) / groups;
+  int o_head, k_head;
+  // Convolution
+  vector<int> weight_offset(4 + has_depth);
+  vector<int> in_offset(4 + has_depth);
+  vector<int> out_offset(4 + has_depth);
+  Dtype* out_data = out->mutable_cpu_data();
+  for (int n = 0; n < out->shape(0); n++) {
+    for (int g = 0; g < groups; g++) {
+      o_head = o_g * g;
+      k_head = k_g * g;
+      for (int o = 0; o < o_g; o++) {
+        for (int k = 0; k < k_g; k++) {
+          for (int z = 0; z < (has_depth ? out->shape(2) : 1); z++) {
+            for (int y = 0; y < out->shape(2 + has_depth); y++) {
+              for (int x = 0; x < out->shape(3 + has_depth); x++) {
+                for (int r = 0; r < kernel_d; r++) {
+                  for (int p = 0; p < kernel_h; p++) {
+                    for (int q = 0; q < kernel_w; q++) {
+                      int in_z = z * stride_d - pad_d + r * dilation_d;
+                      int in_y = y * stride_h - pad_h + p * dilation_h;
+                      int in_x = x * stride_w - pad_w + q * dilation_w;
+                      if (in_z >= 0 && in_z < (has_depth ? in->shape(2) : 1)
+                          && in_y >= 0 && in_y < in->shape(2 + has_depth)
+                          && in_x >= 0 && in_x < in->shape(3 + has_depth)) {
+                        weight_offset[0] = o + o_head;
+                        weight_offset[1] = k;
+                        if (has_depth) { weight_offset[2] = r; }
+                        weight_offset[2 + has_depth] = p;
+                        weight_offset[3 + has_depth] = q;
+                        in_offset[0] = n;
+                        in_offset[1] = k + k_head;
+                        if (has_depth) { in_offset[2] = in_z; }
+                        in_offset[2 + has_depth] = in_y;
+                        in_offset[3 + has_depth] = in_x;
+                        out_offset[0] = n;
+                        out_offset[1] = o + o_head;
+                        if (has_depth) { out_offset[2] = z; }
+                        out_offset[2 + has_depth] = y;
+                        out_offset[3 + has_depth] = x;
+                        out_data[out->offset(out_offset)] +=
+                            in->data_at(in_offset)
+                            * weights[0]->data_at(weight_offset);
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  // Bias
+  if (conv_param->bias_term()) {
+    const Dtype* bias_data = weights[1]->cpu_data();
+    for (int n = 0; n < out->shape(0); n++) {
+      for (int o = 0; o < out->shape(1); o++) {
+        for (int z = 0; z < (has_depth ? out->shape(2) : 1); z++) {
+          for (int y = 0; y < out->shape(2 + has_depth); y++) {
+            for (int x = 0; x < out->shape(3 + has_depth); x++) {
+              out_offset[0] = n;
+              out_offset[1] = o;
+              if (has_depth) { out_offset[2] = z; }
+              out_offset[2 + has_depth] = y;
+              out_offset[3 + has_depth] = x;
+              out_data[out->offset(out_offset)] += bias_data[o];
+            }
+          }
+        }
+      }
+    }
+  }
+  //relu
+  if (conv_param->relu()){
+    for (int n = 0; n < out->shape(0); n++) {
+      for (int o = 0; o < out->shape(1); o++) {
+        for (int z = 0; z < (has_depth ? out->shape(2) : 1); z++) {
+          for (int y = 0; y < out->shape(2 + has_depth); y++) {
+            for (int x = 0; x < out->shape(3 + has_depth); x++) {
+              out_offset[0] = n;
+              out_offset[1] = o;
+              if (has_depth) { out_offset[2] = z; }
+              out_offset[2 + has_depth] = y;
+              out_offset[3 + has_depth] = x;
+              if(out_data[out->offset(out_offset)] < 0) out_data[out->offset(out_offset)] = 0;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+template void caffe_conv(const Blob<float>* in,
+    ConvolutionParameter* conv_param,
+    const vector<shared_ptr<Blob<float> > >& weights,
+    Blob<float>* out);
+template void caffe_conv(const Blob<double>* in,
+    ConvolutionParameter* conv_param,
+    const vector<shared_ptr<Blob<double> > >& weights,
+    Blob<double>* out);
+
+template <typename TypeParam>
+class MKLDNNConvolutionLayerTest : public MultiDeviceTest<TypeParam> {
+  typedef typename TypeParam::Dtype Dtype;
+
+
+#define MB 2
+#define IC 8
+#define OC 8
+#define IH 5
+#define IW 5
+#define OH 5
+#define OW 5
+#define KH 3
+#define KW 3
+#define CS 1
+#define GR 2
+#define PD 1
+
+ protected:
+  MKLDNNConvolutionLayerTest()
+      : blob_bottom_(new Blob<Dtype>(MB, IC, IH, IW)),
+        blob_bottom_2_(new Blob<Dtype>(MB, IC, IH, IW)),
+        blob_top_(new Blob<Dtype>()),
+        blob_top_2_(new Blob<Dtype>()) {}
+  virtual void SetUp() {
+    // fill the values
+    FillerParameter filler_param;
+    filler_param.set_value(1.);
+    GaussianFiller<Dtype> filler(filler_param);
+    filler.Fill(this->blob_bottom_);
+    filler.Fill(this->blob_bottom_2_);
+    blob_bottom_vec_.push_back(blob_bottom_);
+    blob_top_vec_.push_back(blob_top_);
+  }
+
+  virtual ~MKLDNNConvolutionLayerTest() {
+    delete blob_bottom_;
+    delete blob_bottom_2_;
+    delete blob_top_;
+    delete blob_top_2_;
+  }
+
+  virtual Blob<Dtype>* MakeReferenceTop(Blob<Dtype>* top) {
+    this->ref_blob_top_.reset(new Blob<Dtype>());
+    this->ref_blob_top_->ReshapeLike(*top);
+    return this->ref_blob_top_.get();
+  }
+
+  Blob<Dtype>* const blob_bottom_;
+  Blob<Dtype>* const blob_bottom_2_;
+  Blob<Dtype>* const blob_top_;
+  Blob<Dtype>* const blob_top_2_;
+  shared_ptr<Blob<Dtype> > ref_blob_top_;
+  vector<Blob<Dtype>*> blob_bottom_vec_;
+  vector<Blob<Dtype>*> blob_top_vec_;
+};
+
+typedef ::testing::Types<CPUDevice<float>
+//                        ,CPUDevice<double>
+                        > TestDtypesCPU;
+
+TYPED_TEST_CASE(MKLDNNConvolutionLayerTest, TestDtypesCPU);
+
+TYPED_TEST(MKLDNNConvolutionLayerTest, TestSetupMKLDNN) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  convolution_param->add_kernel_size(KH);
+  convolution_param->add_stride(CS);
+  convolution_param->set_num_output(OC);
+  convolution_param->add_pad(PD);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
+  this->blob_top_vec_.push_back(this->blob_top_2_);
+  shared_ptr<Layer<Dtype> > layer(
+      new MKLDNNConvolutionLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  EXPECT_EQ(this->blob_top_->num(), MB);
+  EXPECT_EQ(this->blob_top_->channels(), OC);
+  EXPECT_EQ(this->blob_top_->height(), OH);
+  EXPECT_EQ(this->blob_top_->width(), OW);
+  EXPECT_EQ(this->blob_top_2_->num(), MB);
+  EXPECT_EQ(this->blob_top_2_->channels(), OC );
+  EXPECT_EQ(this->blob_top_2_->height(), OH);
+  EXPECT_EQ(this->blob_top_2_->width(), OW);
+  // setting group should not change the shape
+  convolution_param->set_num_output(OC);
+  convolution_param->set_group(GR);
+  layer.reset(new MKLDNNConvolutionLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  EXPECT_EQ(this->blob_top_->num(), MB);
+  EXPECT_EQ(this->blob_top_->channels(), OC);
+  EXPECT_EQ(this->blob_top_->height(), OH);
+  EXPECT_EQ(this->blob_top_->width(), OW);
+  EXPECT_EQ(this->blob_top_2_->num(), MB);
+  EXPECT_EQ(this->blob_top_2_->channels(), OC);
+  EXPECT_EQ(this->blob_top_2_->height(), OH);
+  EXPECT_EQ(this->blob_top_2_->width(), OW);
+}
+
+TYPED_TEST(MKLDNNConvolutionLayerTest, TestSetupMKLDNNWithRectangeKernelStridePad) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  convolution_param->set_kernel_h(4);
+  convolution_param->set_kernel_w(1);
+  convolution_param->set_stride_h(3);
+  convolution_param->set_stride_w(1);
+  convolution_param->set_num_output(OC);
+  convolution_param->set_pad_h(2);
+  convolution_param->set_pad_w(1);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
+  this->blob_top_vec_.push_back(this->blob_top_2_);
+  shared_ptr<MKLDNNConvolutionLayer<Dtype> > layer(
+      new MKLDNNConvolutionLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  EXPECT_EQ(convolution_param->kernel_h(), 4);
+  EXPECT_EQ(layer->GetKernelHeight(), 4);
+  EXPECT_EQ(convolution_param->kernel_w(), 1);
+  EXPECT_EQ(layer->GetKernelWidth(), 1);
+  EXPECT_EQ(convolution_param->stride_h(), 3);
+  EXPECT_EQ(layer->GetStrideHeight(), 3);
+  EXPECT_EQ(convolution_param->stride_w(), 1);
+  EXPECT_EQ(layer->GetStrideWidth(), 1);
+  EXPECT_EQ(convolution_param->pad_h(), 2);
+  EXPECT_EQ(layer->GetPadHeight(), 2);
+  EXPECT_EQ(convolution_param->pad_w(), 1);
+  EXPECT_EQ(layer->GetPadWidth(), 1);
+  // setting group should not change the shape
+  convolution_param->set_num_output(OC);
+  convolution_param->set_group(GR);
+  layer.reset(new MKLDNNConvolutionLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  EXPECT_EQ(convolution_param->kernel_h(), 4);
+  EXPECT_EQ(layer->GetKernelHeight(), 4);
+  EXPECT_EQ(convolution_param->kernel_w(), 1);
+  EXPECT_EQ(layer->GetKernelWidth(), 1);
+  EXPECT_EQ(convolution_param->stride_h(), 3);
+  EXPECT_EQ(layer->GetStrideHeight(), 3);
+  EXPECT_EQ(convolution_param->stride_w(), 1);
+  EXPECT_EQ(layer->GetStrideWidth(), 1);
+  EXPECT_EQ(convolution_param->pad_h(), 2);
+  EXPECT_EQ(layer->GetPadHeight(), 2);
+  EXPECT_EQ(convolution_param->pad_w(), 1);
+  EXPECT_EQ(layer->GetPadWidth(), 1);
+}
+
+TYPED_TEST(MKLDNNConvolutionLayerTest, TestSimpleConvolutionMKLDNN) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
+  this->blob_top_vec_.push_back(this->blob_top_2_);
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  convolution_param->add_kernel_size(KH);
+  convolution_param->add_stride(CS);
+  convolution_param->set_num_output(OC);
+  convolution_param->add_pad(PD);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  convolution_param->mutable_bias_filler()->set_type("constant");
+  convolution_param->mutable_bias_filler()->set_value(0.1);
+  shared_ptr<Layer<Dtype> > layer(
+      new MKLDNNConvolutionLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Check against reference convolution.
+  const Dtype* top_data;
+  const Dtype* ref_top_data;
+  caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(),
+      this->MakeReferenceTop(this->blob_top_));
+  top_data = this->blob_top_->cpu_data();
+  ref_top_data = this->ref_blob_top_->cpu_data();
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+  }
+
+#if 0   // TODO: improve conv so that it runs on all buffers in bottom vector
+  caffe_conv(this->blob_bottom_2_, convolution_param, layer->blobs(),
+      this->MakeReferenceTop(this->blob_top_2_));
+  top_data = this->blob_top_2_->cpu_data();
+  ref_top_data = this->ref_blob_top_->cpu_data();
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+  }
+#endif
+}
+
+TYPED_TEST(MKLDNNConvolutionLayerTest, TestSimpleConvolutionReLUMKLDNN) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
+  this->blob_top_vec_.push_back(this->blob_top_2_);
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
+  convolution_param->set_num_output(OC);
+  convolution_param->set_relu(true);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  convolution_param->mutable_bias_filler()->set_type("constant");
+  convolution_param->mutable_bias_filler()->set_value(0.1);
+  shared_ptr<Layer<Dtype> > layer(
+      new MKLDNNConvolutionLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Check against reference convolution.
+  const Dtype* top_data;
+  const Dtype* ref_top_data;
+  caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(),
+      this->MakeReferenceTop(this->blob_top_));
+  top_data = this->blob_top_->cpu_data();
+  ref_top_data = this->ref_blob_top_->cpu_data();
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+  }
+}
+
+#if 0
+TYPED_TEST(MKLDNNConvolutionLayerTest, TestDilatedConvolutionMKLDNN) {
+  typedef typename TypeParam::Dtype Dtype;
+  vector<int> bottom_shape;
+  bottom_shape.push_back(2);
+  bottom_shape.push_back(3);
+  bottom_shape.push_back(8);
+  bottom_shape.push_back(7);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
+  this->blob_top_vec_.push_back(this->blob_top_2_);
+  for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) {
+    this->blob_bottom_vec_[i]->Reshape(bottom_shape);
+  }
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_dilation(2);
+  convolution_param->set_num_output(4);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  convolution_param->mutable_bias_filler()->set_type("constant");
+  convolution_param->mutable_bias_filler()->set_value(0.1);
+  shared_ptr<Layer<Dtype> > layer(
+      new MKLDNNConvolutionLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Check against reference convolution.
+  const Dtype* top_data;
+  const Dtype* ref_top_data;
+  caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(),
+             this->MakeReferenceTop(this->blob_top_));
+  top_data = this->blob_top_->cpu_data();
+  ref_top_data = this->ref_blob_top_->cpu_data();
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+  }
+#if 0   // TODO: improve conv so that it runs on all buffers in bottom vector
+  caffe_conv(this->blob_bottom_2_, convolution_param, layer->blobs(),
+             this->MakeReferenceTop(this->blob_top_2_));
+  top_data = this->blob_top_2_->cpu_data();
+  ref_top_data = this->ref_blob_top_->cpu_data();
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+  }
+#endif
+}
+#endif
+
+#if 0
+TYPED_TEST(MKLDNNConvolutionLayerTest, Test0DConvolutionMKLDNN) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  const int kNumOutput = 3;
+  convolution_param->set_num_output(kNumOutput);
+  convolution_param->set_axis(3);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  convolution_param->mutable_bias_filler()->set_type("gaussian");
+  shared_ptr<Layer<Dtype> > layer(
+      new MKLDNNConvolutionLayer<Dtype>(layer_param));
+  vector<int> top_shape = this->blob_bottom_->shape();
+  top_shape[3] = kNumOutput;
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  EXPECT_EQ(top_shape, this->blob_top_->shape());
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Check against reference convolution.
+  vector<int> weight_offset(2);
+  const Blob<Dtype>* weight = layer->blobs()[0].get();
+  const Blob<Dtype>* bias = layer->blobs()[1].get();
+  const int num = this->blob_top_->count(3);
+  const int dim = this->blob_top_->shape(3);
+  const int bottom_dim = this->blob_bottom_->shape(3);
+  for (int n = 0; n < num; ++n) {
+    for (int d = 0; d < dim; ++d) {
+      weight_offset[0] = d;
+      Dtype value = bias->cpu_data()[d];
+      for (int bottom_d = 0; bottom_d < bottom_dim; ++bottom_d) {
+        weight_offset[1] = bottom_d;
+        value += weight->data_at(weight_offset) *
+                 this->blob_bottom_->cpu_data()[n * bottom_dim + bottom_d];
+      }
+      EXPECT_NEAR(value, this->blob_top_->cpu_data()[n * dim + d], 1e-4);
+    }
+  }
+}
+#endif
+
+#if 0
+TYPED_TEST(MKLDNNConvolutionLayerTest, TestSimple3DConvolution) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
+  this->blob_top_vec_.push_back(this->blob_top_2_);
+  vector<int> bottom_shape(5);
+  bottom_shape[0] = this->blob_bottom_vec_[0]->shape(0);
+  bottom_shape[1] = this->blob_bottom_vec_[0]->shape(1);
+  bottom_shape[2] = 5;
+  bottom_shape[3] = this->blob_bottom_vec_[0]->shape(2);
+  bottom_shape[4] = this->blob_bottom_vec_[0]->shape(3);
+  FillerParameter filler_param;
+  GaussianFiller<Dtype> filler(filler_param);
+  for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) {
+    this->blob_bottom_vec_[i]->Reshape(bottom_shape);
+    filler.Fill(this->blob_bottom_vec_[i]);
+  }
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
+  convolution_param->set_num_output(4);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  convolution_param->mutable_bias_filler()->set_type("gaussian");
+  shared_ptr<Layer<Dtype> > layer(
+      new MKLDNNConvolutionLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Check against reference convolution.
+  const Dtype* top_data;
+  const Dtype* ref_top_data;
+  caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(),
+      this->MakeReferenceTop(this->blob_top_));
+  top_data = this->blob_top_->cpu_data();
+  ref_top_data = this->ref_blob_top_->cpu_data();
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+  }
+
+#if 0   // TODO: improve conv so that it runs on all buffers in bottom vector
+  caffe_conv(this->blob_bottom_2_, convolution_param, layer->blobs(),
+      this->MakeReferenceTop(this->blob_top_2_));
+  top_data = this->blob_top_2_->cpu_data();
+  ref_top_data = this->ref_blob_top_->cpu_data();
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+  }
+#endif
+}
+#endif
+
+#if 0
+TYPED_TEST(MKLDNNConvolutionLayerTest, TestDilated3DConvolution) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
+  this->blob_top_vec_.push_back(this->blob_top_2_);
+  vector<int> bottom_shape(5);
+  bottom_shape[0] = this->blob_bottom_vec_[0]->shape(0);
+  bottom_shape[1] = this->blob_bottom_vec_[0]->shape(1);
+  bottom_shape[2] = 6;
+  bottom_shape[3] = 7;
+  bottom_shape[4] = 8;
+  FillerParameter filler_param;
+  GaussianFiller<Dtype> filler(filler_param);
+  for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) {
+    this->blob_bottom_vec_[i]->Reshape(bottom_shape);
+    filler.Fill(this->blob_bottom_vec_[i]);
+  }
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_dilation(2);
+  convolution_param->set_num_output(4);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  convolution_param->mutable_bias_filler()->set_type("gaussian");
+  shared_ptr<Layer<Dtype> > layer(
+      new MKLDNNConvolutionLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Check against reference convolution.
+  const Dtype* top_data;
+  const Dtype* ref_top_data;
+  caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(),
+             this->MakeReferenceTop(this->blob_top_));
+  top_data = this->blob_top_->cpu_data();
+  ref_top_data = this->ref_blob_top_->cpu_data();
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+  }
+  caffe_conv(this->blob_bottom_2_, convolution_param, layer->blobs(),
+             this->MakeReferenceTop(this->blob_top_2_));
+  top_data = this->blob_top_2_->cpu_data();
+  ref_top_data = this->ref_blob_top_->cpu_data();
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+  }
+}
+#endif
+
+TYPED_TEST(MKLDNNConvolutionLayerTest, Test1x1Convolution) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  convolution_param->add_kernel_size(1);
+  convolution_param->add_stride(1);
+  convolution_param->set_num_output(OC);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  convolution_param->mutable_bias_filler()->set_type("constant");
+  convolution_param->mutable_bias_filler()->set_value(0.1);
+  shared_ptr<Layer<Dtype> > layer(
+      new MKLDNNConvolutionLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Check against reference convolution.
+  const Dtype* top_data;
+  const Dtype* ref_top_data;
+  caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(),
+      this->MakeReferenceTop(this->blob_top_));
+  top_data = this->blob_top_->cpu_data();
+  ref_top_data = this->ref_blob_top_->cpu_data();
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+  }
+}
+
+TYPED_TEST(MKLDNNConvolutionLayerTest, Test1x1ConvolutionReLU) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  convolution_param->add_kernel_size(1);
+  convolution_param->add_stride(1);
+  convolution_param->set_num_output(OC);
+  convolution_param->set_relu(true);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  convolution_param->mutable_bias_filler()->set_type("constant");
+  convolution_param->mutable_bias_filler()->set_value(0.1);
+  shared_ptr<Layer<Dtype> > layer(
+      new MKLDNNConvolutionLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Check against reference convolution.
+  const Dtype* top_data;
+  const Dtype* ref_top_data;
+  caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(),
+      this->MakeReferenceTop(this->blob_top_));
+  top_data = this->blob_top_->cpu_data();
+  ref_top_data = this->ref_blob_top_->cpu_data();
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+  }
+}
+
+TYPED_TEST(MKLDNNConvolutionLayerTest, TestSimpleConvolutionGroup) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  convolution_param->add_kernel_size(KH);
+  convolution_param->add_stride(CS);
+  convolution_param->set_num_output(OC);
+  convolution_param->set_group(GR);
+  convolution_param->add_pad(PD);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  convolution_param->mutable_bias_filler()->set_type("constant");
+  convolution_param->mutable_bias_filler()->set_value(0.1);
+  shared_ptr<Layer<Dtype> > layer(
+      new MKLDNNConvolutionLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Check against reference convolution.
+  const Dtype* top_data;
+  const Dtype* ref_top_data;
+  caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(),
+      this->MakeReferenceTop(this->blob_top_));
+  top_data = this->blob_top_->cpu_data();
+  ref_top_data = this->ref_blob_top_->cpu_data();
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+  }
+}
+
+TYPED_TEST(MKLDNNConvolutionLayerTest, TestSimpleConvolutionReLUGroup) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
+  convolution_param->set_num_output(OC);
+  convolution_param->set_relu(true);
+  convolution_param->set_group(GR);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  convolution_param->mutable_bias_filler()->set_type("constant");
+  convolution_param->mutable_bias_filler()->set_value(0.1);
+  shared_ptr<Layer<Dtype> > layer(
+      new MKLDNNConvolutionLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Check against reference convolution.
+  const Dtype* top_data;
+  const Dtype* ref_top_data;
+  caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(),
+      this->MakeReferenceTop(this->blob_top_));
+  top_data = this->blob_top_->cpu_data();
+  ref_top_data = this->ref_blob_top_->cpu_data();
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+  }
+}
+
+#if 0
+TYPED_TEST(MKLDNNConvolutionLayerTest, TestSobelConvolution) {
+  // Test separable convolution by computing the Sobel operator
+  // as a single filter then comparing the result
+  // as the convolution of two rectangular filters.
+  typedef typename TypeParam::Dtype Dtype;
+  // Fill bottoms with identical Gaussian noise.
+  shared_ptr<GaussianFiller<Dtype> > filler;
+  FillerParameter filler_param;
+  filler_param.set_value(1.);
+  filler.reset(new GaussianFiller<Dtype>(filler_param));
+  filler->Fill(this->blob_bottom_);
+  this->blob_bottom_2_->CopyFrom(*this->blob_bottom_);
+  // Compute Sobel G_x operator as 3 x 3 convolution.
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
+  convolution_param->set_num_output(1);
+  convolution_param->set_bias_term(false);
+  shared_ptr<Layer<Dtype> > layer(
+      new MKLDNNConvolutionLayer<Dtype>(layer_param));
+  layer->blobs().resize(1);
+  layer->blobs()[0].reset(new Blob<Dtype>(1, 3, 3, 3));
+  Dtype* weights = layer->blobs()[0]->mutable_cpu_data();
+  for (int c = 0; c < 3; ++c) {
+    int i = c * 9;  // 3 x 3 filter
+    weights[i +  0] = -1;
+    weights[i +  1] =  0;
+    weights[i +  2] =  1;
+    weights[i +  3] = -2;
+    weights[i +  4] =  0;
+    weights[i +  5] =  2;
+    weights[i +  6] = -1;
+    weights[i +  7] =  0;
+    weights[i +  8] =  1;
+  }
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Compute Sobel G_x operator as separable 3 x 1 and 1 x 3 convolutions.
+  // (1) the [1 2 1] column filter
+  vector<Blob<Dtype>*> sep_blob_bottom_vec;
+  vector<Blob<Dtype>*> sep_blob_top_vec;
+  shared_ptr<Blob<Dtype> > blob_sep(new Blob<Dtype>());
+  sep_blob_bottom_vec.push_back(this->blob_bottom_2_);
+  sep_blob_top_vec.push_back(this->blob_top_2_);
+  convolution_param->clear_kernel_size();
+  convolution_param->clear_stride();
+  convolution_param->set_kernel_h(3);
+  convolution_param->set_kernel_w(1);
+  convolution_param->set_stride_h(2);
+  convolution_param->set_stride_w(1);
+  convolution_param->set_num_output(1);
+  convolution_param->set_bias_term(false);
+  layer.reset(new MKLDNNConvolutionLayer<Dtype>(layer_param));
+  layer->blobs().resize(1);
+  layer->blobs()[0].reset(new Blob<Dtype>(1, 3, 3, 1));
+  Dtype* weights_1 = layer->blobs()[0]->mutable_cpu_data();
+  for (int c = 0; c < 3; ++c) {
+    int i = c * 3;  // 3 x 1 filter
+    weights_1[i +  0] = 1;
+    weights_1[i +  1] = 2;
+    weights_1[i +  2] = 1;
+  }
+  layer->SetUp(sep_blob_bottom_vec, sep_blob_top_vec);
+  layer->Forward(sep_blob_bottom_vec, sep_blob_top_vec);
+  // (2) the [-1 0 1] row filter
+  blob_sep->CopyFrom(*this->blob_top_2_, false, true);
+  sep_blob_bottom_vec.clear();
+  sep_blob_bottom_vec.push_back(blob_sep.get());
+  convolution_param->set_kernel_h(1);
+  convolution_param->set_kernel_w(3);
+  convolution_param->set_stride_h(1);
+  convolution_param->set_stride_w(2);
+  convolution_param->set_num_output(1);
+  convolution_param->set_bias_term(false);
+  layer.reset(new MKLDNNConvolutionLayer<Dtype>(layer_param));
+  layer->blobs().resize(1);
+  layer->blobs()[0].reset(new Blob<Dtype>(1, 1, 1, 3));
+  Dtype* weights_2 = layer->blobs()[0]->mutable_cpu_data();
+  weights_2[0] = -1;
+  weights_2[1] =  0;
+  weights_2[2] =  1;
+  layer->SetUp(sep_blob_bottom_vec, sep_blob_top_vec);
+  layer->Forward(sep_blob_bottom_vec, sep_blob_top_vec);
+  // Test equivalence of full and separable filters.
+  const Dtype* top_data = this->blob_top_->cpu_data();
+  const Dtype* sep_top_data = this->blob_top_2_->cpu_data();
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    EXPECT_NEAR(top_data[i], sep_top_data[i], 1e-4);
+  }
+}
+#endif
+
+#if 0
+TYPED_TEST(MKLDNNConvolutionLayerTest, TestNDAgainst2D) {
+  typedef typename TypeParam::Dtype Dtype;
+  const int kernel_h = 11;
+  const int kernel_w = 13;
+  vector<int> bottom_shape(4);
+  bottom_shape[0] = 15;
+  bottom_shape[1] = 18;
+  bottom_shape[2] = kernel_h * 2;
+  bottom_shape[3] = kernel_w * 2;
+  FillerParameter filler_param;
+  GaussianFiller<Dtype> filler(filler_param);
+  for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) {
+    this->blob_bottom_vec_[i]->Reshape(bottom_shape);
+    filler.Fill(this->blob_bottom_vec_[i]);
+  }
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  convolution_param->set_num_output(12);
+  convolution_param->set_bias_term(false);
+  convolution_param->set_group(6);
+  convolution_param->set_kernel_h(kernel_h);
+  convolution_param->set_kernel_w(kernel_w);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  Blob<Dtype> weights;
+  Blob<Dtype> top_diff;
+  // Shape and fill weights and top_diff.
+  bool copy_diff;
+  bool reshape;
+  {
+    MKLDNNConvolutionLayer<Dtype> layer(layer_param);
+    layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+    top_diff.ReshapeLike(*this->blob_top_);
+    filler.Fill(&top_diff);
+    ASSERT_EQ(1, layer.blobs().size());
+    copy_diff = false; reshape = true;
+    weights.CopyFrom(*layer.blobs()[0], copy_diff, reshape);
+  }
+  vector<bool> propagate_down(1, true);
+  Blob<Dtype> result_2d;
+  Blob<Dtype> backward_result_2d;
+  Blob<Dtype> backward_weight_result_2d;
+  // Test with 2D im2col
+  {
+    caffe_set(this->blob_top_->count(), Dtype(0),
+              this->blob_top_->mutable_cpu_data());
+    caffe_set(this->blob_bottom_->count(), Dtype(0),
+              this->blob_bottom_->mutable_cpu_diff());
+    caffe_set(weights.count(), Dtype(0), weights.mutable_cpu_diff());
+    // Do SetUp and Forward; save Forward result in result_2d.
+    convolution_param->set_force_nd_im2col(false);
+    MKLDNNConvolutionLayer<Dtype> layer_2d(layer_param);
+    layer_2d.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+    ASSERT_EQ(1, layer_2d.blobs().size());
+    copy_diff = false; reshape = false;
+    layer_2d.blobs()[0]->CopyFrom(weights, copy_diff, reshape);
+    layer_2d.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+    copy_diff = false; reshape = true;
+    result_2d.CopyFrom(*this->blob_top_, copy_diff, reshape);
+    // Copy pre-generated top diff into actual top diff;
+    // do Backward and save result in backward_result_2d.
+    ASSERT_EQ(this->blob_top_->shape(), top_diff.shape());
+    caffe_copy(top_diff.count(), top_diff.cpu_data(),
+               this->blob_top_->mutable_cpu_diff());
+    layer_2d.Backward(this->blob_top_vec_, propagate_down,
+                      this->blob_bottom_vec_);
+    copy_diff = true; reshape = true;
+    backward_result_2d.CopyFrom(*this->blob_bottom_, copy_diff, reshape);
+    backward_weight_result_2d.CopyFrom(weights, copy_diff, reshape);
+  }
+  Blob<Dtype> result_nd;
+  Blob<Dtype> backward_result_nd;
+  Blob<Dtype> backward_weight_result_nd;
+  // Test with ND im2col
+  {
+    caffe_set(this->blob_top_->count(), Dtype(0),
+              this->blob_top_->mutable_cpu_data());
+    caffe_set(this->blob_bottom_->count(), Dtype(0),
+              this->blob_bottom_->mutable_cpu_diff());
+    caffe_set(weights.count(), Dtype(0), weights.mutable_cpu_diff());
+    // Do SetUp and Forward; save Forward result in result_nd.
+    convolution_param->set_force_nd_im2col(true);
+    MKLDNNConvolutionLayer<Dtype> layer_nd(layer_param);
+    layer_nd.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+    ASSERT_EQ(1, layer_nd.blobs().size());
+    copy_diff = false; reshape = false;
+    layer_nd.blobs()[0]->CopyFrom(weights, copy_diff, reshape);
+    layer_nd.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+    copy_diff = false; reshape = true;
+    result_nd.CopyFrom(*this->blob_top_, copy_diff, reshape);
+    // Copy pre-generated top diff into actual top diff;
+    // do Backward and save result in backward_result_nd.
+    ASSERT_EQ(this->blob_top_->shape(), top_diff.shape());
+    caffe_copy(top_diff.count(), top_diff.cpu_data(),
+               this->blob_top_->mutable_cpu_diff());
+    layer_nd.Backward(this->blob_top_vec_, propagate_down,
+                      this->blob_bottom_vec_);
+    copy_diff = true; reshape = true;
+    backward_result_nd.CopyFrom(*this->blob_bottom_, copy_diff, reshape);
+    backward_weight_result_nd.CopyFrom(weights, copy_diff, reshape);
+  }
+  ASSERT_EQ(result_nd.count(), result_2d.count());
+  for (int i = 0; i < result_2d.count(); ++i)  {
+    EXPECT_EQ(result_2d.cpu_data()[i], result_nd.cpu_data()[i]);
+  }
+  ASSERT_EQ(backward_result_nd.count(), backward_result_2d.count());
+  for (int i = 0; i < backward_result_2d.count(); ++i) {
+    EXPECT_EQ(backward_result_2d.cpu_diff()[i],
+              backward_result_nd.cpu_diff()[i]);
+  }
+  ASSERT_EQ(backward_weight_result_nd.count(),
+            backward_weight_result_2d.count());
+  for (int i = 0; i < backward_weight_result_2d.count(); ++i) {
+    EXPECT_EQ(backward_weight_result_2d.cpu_diff()[i],
+              backward_weight_result_nd.cpu_diff()[i]);
+  }
+}
+#endif
+
+TYPED_TEST(MKLDNNConvolutionLayerTest, DISABLED_TestGradient) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+
+// TODO: improve conv so that it runs on all buffers in bottom vector
+  this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
+  this->blob_top_vec_.push_back(this->blob_top_2_);
+  convolution_param->add_kernel_size(KH);
+  convolution_param->add_stride(CS);
+  convolution_param->set_num_output(OC);
+  convolution_param->add_pad(PD);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  convolution_param->mutable_bias_filler()->set_type("gaussian");
+  MKLDNNConvolutionLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+#if 0
+TYPED_TEST(MKLDNNConvolutionLayerTest, TestDilatedGradient) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  vector<int> bottom_shape;
+  bottom_shape.push_back(2);
+  bottom_shape.push_back(3);
+  bottom_shape.push_back(5);
+  bottom_shape.push_back(6);
+  for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) {
+    this->blob_bottom_vec_[i]->Reshape(bottom_shape);
+  }
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_dilation(2);
+  convolution_param->set_num_output(2);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  convolution_param->mutable_bias_filler()->set_type("gaussian");
+  MKLDNNConvolutionLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+                                  this->blob_top_vec_);
+}
+#endif
+
+#if 0
+TYPED_TEST(MKLDNNConvolutionLayerTest, TestGradient3D) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  vector<int> bottom_shape(5);
+  bottom_shape[0] = this->blob_bottom_vec_[0]->shape(0);
+  bottom_shape[1] = this->blob_bottom_vec_[0]->shape(1);
+  bottom_shape[2] = 5;
+  bottom_shape[3] = this->blob_bottom_vec_[0]->shape(2);
+  bottom_shape[4] = this->blob_bottom_vec_[0]->shape(3);
+  FillerParameter filler_param;
+  GaussianFiller<Dtype> filler(filler_param);
+  for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) {
+    this->blob_bottom_vec_[i]->Reshape(bottom_shape);
+    filler.Fill(this->blob_bottom_vec_[i]);
+  }
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
+  convolution_param->set_num_output(2);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  convolution_param->mutable_bias_filler()->set_type("gaussian");
+  MKLDNNConvolutionLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+#endif
+
+TYPED_TEST(MKLDNNConvolutionLayerTest, Test1x1Gradient) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
+  this->blob_top_vec_.push_back(this->blob_top_2_);
+  convolution_param->add_kernel_size(1);
+  convolution_param->add_stride(1);
+  convolution_param->set_num_output(2);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  convolution_param->mutable_bias_filler()->set_type("gaussian");
+  MKLDNNConvolutionLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+TYPED_TEST(MKLDNNConvolutionLayerTest, TestGradientGroup) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
+  convolution_param->set_num_output(2);
+  convolution_param->set_group(GR);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  convolution_param->mutable_bias_filler()->set_type("gaussian");
+  MKLDNNConvolutionLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+}  // namespace caffe
+#endif  // #ifdef MKLDNN_SUPPORTED
diff --git a/src/caffe/test/test_mkldnn_inner_product_layer.cpp b/src/caffe/test/test_mkldnn_inner_product_layer.cpp
index 084e10a6d..a45d94b32 100644
--- a/src/caffe/test/test_mkldnn_inner_product_layer.cpp
+++ b/src/caffe/test/test_mkldnn_inner_product_layer.cpp
@@ -165,6 +165,26 @@ TYPED_TEST(MKLDNNInnerProductLayerTest, TestForward) {
   }
 }
 
+TYPED_TEST(MKLDNNInnerProductLayerTest, TestForwardNoBias) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_);
+  LayerParameter layer_param;
+  InnerProductParameter* inner_product_param =
+      layer_param.mutable_inner_product_param();
+  inner_product_param->set_num_output(10);
+  inner_product_param->mutable_weight_filler()->set_type("uniform");
+  inner_product_param->set_bias_term(false);
+  shared_ptr<InnerProductLayer<Dtype> > layer(
+      new MKLDNNInnerProductLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Dtype* data = this->blob_top_->cpu_data();
+  const int count = this->blob_top_->count();
+  for (int i = 0; i < count; ++i) {
+    EXPECT_GE(data[i], 1.);
+  }
+}
+
 // TODO: add support for transposed weights in MKLDNNInnerProduct 
 // layer and then enable following test (check if it was ported properly)
 #if 0
@@ -273,6 +293,22 @@ TYPED_TEST(MKLDNNInnerProductLayerTest, TestGradient) {
         this->blob_top_vec_);
 }
 
+TYPED_TEST(MKLDNNInnerProductLayerTest, TestGradientNoBias) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_);
+    LayerParameter layer_param;
+    InnerProductParameter* inner_product_param =
+        layer_param.mutable_inner_product_param();
+    inner_product_param->set_num_output(10);
+    inner_product_param->mutable_weight_filler()->set_type("gaussian");
+    inner_product_param->set_bias_term(false);
+    shared_ptr<InnerProductLayer<Dtype> > layer(
+      new MKLDNNInnerProductLayer<Dtype>(layer_param));
+    GradientChecker<Dtype> checker(1e-2, 1e-3);
+    checker.CheckGradientExhaustive(layer.get(), this->blob_bottom_vec_,
+        this->blob_top_vec_);
+}
+
 TYPED_TEST(MKLDNNInnerProductLayerTest, TestGradientTranspose) {
   typedef typename TypeParam::Dtype Dtype;
   this->blob_bottom_vec_.push_back(this->blob_bottom_);
diff --git a/src/caffe/test/test_mkldnn_pooling_layer.cpp b/src/caffe/test/test_mkldnn_pooling_layer.cpp
index f1d3ff4d9..288c114a6 100644
--- a/src/caffe/test/test_mkldnn_pooling_layer.cpp
+++ b/src/caffe/test/test_mkldnn_pooling_layer.cpp
@@ -586,7 +586,7 @@ TYPED_TEST(MKLDNNPoolingLayerTest, TestGradientMaxTopMask) {
 #endif
   
 // Average Pooling
-TYPED_TEST(MKLDNNPoolingLayerTest, TestForwardAve) {
+TYPED_TEST(MKLDNNPoolingLayerTest, DISABLED_TestForwardAve) {
   typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
   PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
diff --git a/src/caffe/test/test_spp_layer.cpp b/src/caffe/test/test_spp_layer.cpp
index 708886e22..66660fccf 100644
--- a/src/caffe/test/test_spp_layer.cpp
+++ b/src/caffe/test/test_spp_layer.cpp
@@ -78,7 +78,12 @@ class SPPLayerTest : public MultiDeviceTest<TypeParam> {
     blob_bottom_vec_3_.push_back(blob_bottom_3_);
     blob_top_vec_.push_back(blob_top_);
   }
-  virtual ~SPPLayerTest() { delete blob_bottom_; delete blob_top_; }
+  virtual ~SPPLayerTest() {
+    delete blob_bottom_;
+    delete blob_top_;
+    delete blob_bottom_2_;
+    delete blob_bottom_3_;
+  }
 
   Blob<Dtype>* const blob_bottom_;
   Blob<Dtype>* const blob_bottom_2_;
diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp
old mode 100644
new mode 100755
index f7e0b2ce7..43f07cbf8
--- a/src/caffe/util/im2col.cpp
+++ b/src/caffe/util/im2col.cpp
@@ -59,6 +59,7 @@ void im2col_cpu(const Dtype* data_im, const int channels,
     const int stride_h, const int stride_w,
     const int dilation_h, const int dilation_w,
     Dtype* data_col) {
+#if 0
   const int output_h = (height + 2 * pad_h -
     (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
   const int output_w = (width + 2 * pad_w -
@@ -89,6 +90,85 @@ void im2col_cpu(const Dtype* data_im, const int channels,
       }
     }
   }
+#else
+  int dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
+  int dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
+  int height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
+  int channels_col = channels * kernel_h * kernel_w;
+  #ifdef _OPENMP
+  #pragma omp parallel for
+  #endif
+  for (int c = 0; c < channels_col; ++c) {
+    int w_offset = c % kernel_w;
+    int h_offset = (c / kernel_w) % kernel_h;
+    int c_im = c / kernel_h / kernel_w;
+
+    const int hc0 = h_offset * dilation_h - pad_h;
+    const int wc0 = w_offset * dilation_w - pad_w;
+    for (int h = 0; h < height_col; ++h) {
+      int h_pad = h * stride_h + hc0;
+
+      const int row_offset = (c * height_col + h) * width_col;
+      const int srow_offset = (c_im * height + h_pad) * width;
+      for (int w = 0; w < width_col; ++w) {
+        int w_pad = w * stride_w + wc0;
+        if ((((unsigned)h_pad) < ((unsigned)height)) && (((unsigned)w_pad) < ((unsigned)width)))
+          data_col[row_offset + w] = data_im[srow_offset + w_pad];
+        else {
+          data_col[row_offset + w] = 0.;
+        }
+      }
+    }
+  }
+#endif
+}
+
+template <typename Dtype>
+void im3d2col_cpu(const Dtype* data_im, const int channels,
+    const int depth, const int height, const int width,
+    const int kernel_d, const int kernel_h, const int kernel_w,
+    const int pad_d, const int pad_h, const int pad_w,
+    const int stride_d, const int stride_h, const int stride_w,
+    const int dilation_d, const int dilation_h, const int dilation_w,
+    Dtype* data_col) {
+  // LOG(ERROR) << "image size: " << depth << ", " << height << ", " << width;
+  // LOG(ERROR) << "kernel size: " << kernel_d << ", " << kernel_h << ", " << kernel_w;
+
+  // Implicit dilated kernel size
+  long dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
+  long dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
+  long dil_kernel_d = (kernel_d - 1) * dilation_d + 1;
+  long height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1;
+  long width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
+  long depth_col = (depth + 2 * pad_d - dil_kernel_d) / stride_d + 1;
+  long channels_col = channels * kernel_h * kernel_w * kernel_d;
+  #ifdef _OPENMP
+  #pragma omp parallel for
+  #endif
+  for (long c = 0; c < channels_col; ++c) {
+    long w_offset = c % kernel_w;
+    long h_offset = (c / kernel_w) % kernel_h;
+    long d_offset = (c / kernel_w / kernel_h) % kernel_d;
+    long c_im = c / kernel_h / kernel_w / kernel_d;
+    for (int d = 0; d < depth_col; ++d) {
+      long d_pad = d * stride_d - pad_d + d_offset * dilation_d;
+      for (long h = 0; h < height_col; ++h) {
+        long h_pad = h * stride_h - pad_h + h_offset * dilation_h;
+        for (long w = 0; w < width_col; ++w) {
+          long w_pad = w * stride_w - pad_w + w_offset * dilation_w;
+          if (((unsigned long)h_pad < (unsigned long)height) &&
+              ((unsigned long)w_pad < (unsigned long)width) &&
+              ((unsigned long)d_pad < (unsigned long)depth)) {
+            data_col[((c * depth_col + d) * height_col + h) * width_col + w] =
+              data_im[((c_im * depth + d_pad) * height + h_pad) * width + w_pad];
+          } else {
+            data_col[((c * depth_col + d) * height_col + h) * width_col + w] = 0.;
+          }
+        }
+      }
+    }
+  }
 }
 
 // Explicit instantiation
@@ -102,6 +182,20 @@ template void im2col_cpu<double>(const double* data_im, const int channels,
     const int pad_h, const int pad_w, const int stride_h,
     const int stride_w, const int dilation_h, const int dilation_w,
     double* data_col);
+template void im3d2col_cpu<float>(const float* data_im, const int channels,
+    const int depth, const int height, const int width,
+    const int kernel_d, const int kernel_h, const int kernel_w,
+    const int pad_d, const int pad_h, const int pad_w,
+    const int stride_d, const int stride_h, const int stride_w,
+    const int dilation_d, const int dilation_h, const int dilation_w,
+    float* data_col);
+template void im3d2col_cpu<double>(const double* data_im, const int channels,
+    const int depth, const int height, const int width,
+    const int kernel_d, const int kernel_h, const int kernel_w,
+    const int pad_d, const int pad_h, const int pad_w,
+    const int stride_d, const int stride_h, const int stride_w,
+    const int dilation_d, const int dilation_h, const int dilation_w,
+    double* data_col);
 
 template <typename Dtype>
 inline void im2col_nd_core_cpu(const Dtype* data_input, const bool im2col,
@@ -203,6 +297,7 @@ void col2im_cpu(const Dtype* data_col, const int channels,
     const int stride_h, const int stride_w,
     const int dilation_h, const int dilation_w,
     Dtype* data_im) {
+#if 0
   caffe_set(height * width * channels, Dtype(0), data_im);
   const int output_h = (height + 2 * pad_h -
     (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
@@ -231,6 +326,93 @@ void col2im_cpu(const Dtype* data_col, const int channels,
       }
     }
   }
+#else
+  int dil_patch_h = (kernel_h - 1) * dilation_h + 1;
+  int dil_patch_w = (kernel_w - 1) * dilation_w + 1;
+  int height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1;
+  long chunk_len = kernel_h * kernel_w;
+
+  caffe_set(height * width * channels, Dtype(0), data_im);
+
+  #ifdef _OPENMP
+  #pragma omp parallel for if (channels > 1)
+  #endif 
+  for (int idx = 0; idx < channels; ++idx) {
+    for (int inner_idx = 0; inner_idx < chunk_len; ++inner_idx) {
+      int c = idx * chunk_len + inner_idx;
+      int w_offset = c % kernel_w;
+      int h_offset = (c / kernel_w) % kernel_h;
+      int c_im = c / kernel_h / kernel_w;
+
+      const int hc0 = h_offset * dilation_h - pad_h;
+      const int wc0 = w_offset * dilation_w - pad_w;
+      for (int h = 0; h < height_col; ++h) {
+        for (int w = 0; w < width_col; ++w) {
+          int h_pad = h * stride_h + hc0;
+          const int srow_offset = (c_im * height + h_pad) * width;
+          const int row_offset = (c * height_col + h) * width_col;
+          int w_pad = w * stride_w + wc0;
+          if ((((unsigned)h_pad) < ((unsigned)height)) && (((unsigned)w_pad) < ((unsigned)width))) {
+            data_im[srow_offset + w_pad] += data_col[row_offset + w];
+          }
+        }
+      }
+    }
+  }
+#endif
+}
+
+template <typename Dtype>
+void col2im3d_cpu(const Dtype* data_col, const int channels,
+    const int depth, const int height, const int width,
+    const int kernel_d, const int kernel_h, const int kernel_w,
+    const int pad_d, const int pad_h, const int pad_w,
+    const int stride_d, const int stride_h, const int stride_w,
+    const int dilation_d, const int dilation_h, const int dilation_w,
+    Dtype* data_im) {
+  // Implicit dilated patch
+  long dil_patch_h = (kernel_h - 1) * dilation_h + 1;
+  long dil_patch_w = (kernel_w - 1) * dilation_w + 1;
+  long dil_patch_d = (kernel_d - 1) * dilation_d + 1;
+  long height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
+  long width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1;
+  long depth_col = (depth + 2 * pad_d - dil_patch_d) / stride_d + 1;
+  long num_kernels = channels * height * width * depth;
+  long chunk_len = kernel_h * kernel_w * kernel_d;
+
+  caffe_set(num_kernels, Dtype(0), data_im);
+
+  #ifdef _OPENMP
+  #pragma omp parallel for if (channels > 1)
+  #endif
+  for (long c_im = 0; c_im < channels; ++c_im) {
+    for (long c = c_im * chunk_len; c < chunk_len * (c_im + 1); ++c) {
+      long w_offset = c % kernel_w;
+      long h_offset = (c / kernel_w) % kernel_h;
+      long d_offset = (c / kernel_w / kernel_h) % kernel_d;
+ 
+      long dc0 = d_offset * dilation_d - pad_d;
+      long hc0 = h_offset * dilation_h - pad_h;
+      long wc0 = w_offset * dilation_w - pad_w;
+      for (long d = 0; d < depth_col; ++d) {
+        long d_pad = d * stride_d + dc0;
+        for (long h = 0; h < height_col; ++h) {
+          long h_pad = h * stride_h + hc0;
+          for (long w = 0; w < width_col; ++w) {
+            long w_pad = w * stride_w + wc0;
+
+            if (((unsigned long)h_pad < (unsigned long)height) &&
+                ((unsigned long)w_pad < (unsigned long)width) &&
+                ((unsigned long)d_pad < (unsigned long)depth)) {
+              data_im[((c_im * depth + d_pad) * height + h_pad) * width + w_pad] +=
+                data_col[((c * depth_col + d) * height_col + h) * width_col + w];
+            }
+          }
+        }
+      }
+    }
+  }
 }
 
 // Explicit instantiation
@@ -244,6 +426,20 @@ template void col2im_cpu<double>(const double* data_col, const int channels,
     const int pad_h, const int pad_w, const int stride_h,
     const int stride_w, const int dilation_h, const int dilation_w,
     double* data_im);
+template void col2im3d_cpu<float>(const float* data_col, const int channels,
+    const int depth, const int height, const int width,
+    const int kernel_d, const int kernel_h, const int kernel_w,
+    const int pad_d, const int pad_h, const int pad_w,
+    const int stride_d, const int stride_h, const int stride_w,
+    const int dilation_d, const int dilation_h, const int dilation_w,
+    float* data_im);
+template void col2im3d_cpu<double>(const double* data_col, const int channels,
+    const int depth, const int height, const int width,
+    const int kernel_d, const int kernel_h, const int kernel_w,
+    const int pad_d, const int pad_h, const int pad_w,
+    const int stride_d, const int stride_h, const int stride_w,
+    const int dilation_d, const int dilation_h, const int dilation_w,
+    double* data_im);
 
 template <typename Dtype>
 void col2im_nd_cpu(const Dtype* data_col, const int num_spatial_axes,