diff --git a/examples/faster-rcnn/license.txt b/examples/faster-rcnn/license.txt
index 617c1e140..31b20cd2a 100644
--- a/examples/faster-rcnn/license.txt
+++ b/examples/faster-rcnn/license.txt
@@ -1,8 +1,83 @@
-The MIT License
-Copyright (c) <year> <copyright holders>
+Faster R-CNN
 
-Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+The MIT License (MIT)
+
+Copyright (c) 2015 Microsoft Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+************************************************************************
+
+THIRD-PARTY SOFTWARE NOTICES AND INFORMATION
+
+This project, Faster R-CNN, incorporates material from the project(s)
+listed below (collectively, "Third Party Code").  Microsoft is not the
+original author of the Third Party Code.  The original copyright notice
+and license under which Microsoft received such Third Party Code are set
+out below. This Third Party Code is licensed to you under their original
+license terms set forth below.  Microsoft reserves all other rights not
+expressly granted, whether by implication, estoppel or otherwise.
+
+1.	Caffe, (https://github.com/BVLC/caffe/)
+
+COPYRIGHT
+
+All contributions by the University of California:
+Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
+All rights reserved.
+
+All other contributions:
+Copyright (c) 2014, 2015, the respective contributors
+All rights reserved.
+
+Caffe uses a shared copyright model: each contributor holds copyright
+over their contributions to Caffe. The project versioning records all
+such contribution and copyright details. If a contributor wants to
+further mark their specific copyright on a particular contribution,
+they should indicate their copyright solely in the commit message of
+the change when it is committed.
+
+The BSD 2-Clause License
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+************END OF THIRD-PARTY SOFTWARE NOTICES AND INFORMATION**********
 
- The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/examples/rfcn/license.txt b/examples/rfcn/license.txt
index 617c1e140..8d734011c 100644
--- a/examples/rfcn/license.txt
+++ b/examples/rfcn/license.txt
@@ -1,8 +1,23 @@
-The MIT License
-Copyright (c) <year> <copyright holders>
+MIT License
 
-Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+Copyright (c) 2016 Yuwen Xiong
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
 
- The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/include/caffe/layers/base_conv_layer.hpp b/include/caffe/layers/base_conv_layer.hpp
index 59cb1bf28..40ad8afb8 100755
--- a/include/caffe/layers/base_conv_layer.hpp
+++ b/include/caffe/layers/base_conv_layer.hpp
@@ -221,12 +221,12 @@ class BaseConvolutionLayer : public Layer<Dtype> {
   }
 #endif
 
-  int num_kernels_im2col_;
-  int num_kernels_col2im_;
-  int conv_out_channels_;
-  int conv_in_channels_;
-  int conv_out_spatial_dim_;
-  int kernel_dim_;
+  size_t num_kernels_im2col_;
+  size_t num_kernels_col2im_;
+  size_t conv_out_channels_;
+  size_t conv_in_channels_;
+  size_t conv_out_spatial_dim_;
+  size_t kernel_dim_;
   size_t col_offset_;
   size_t output_offset_;
 
diff --git a/include/caffe/layers/mkldnn_layers.hpp b/include/caffe/layers/mkldnn_layers.hpp
index aca64c362..dae314208 100644
--- a/include/caffe/layers/mkldnn_layers.hpp
+++ b/include/caffe/layers/mkldnn_layers.hpp
@@ -141,7 +141,7 @@ class MKLDNNBatchNormLayer : public MKLDNNLayer<Dtype>, public Layer<Dtype> {
 
     shared_ptr<primitive> input_primitive, bwd_top_diff_primitive;
 
-    int32_t num_, width_, height_, channels_;
+    vector<int> shape_;
     Dtype eps_, moving_average_fraction_;
     bool use_weight_bias_, bias_term_, use_global_stats_;
     int num_stats_batches_;
@@ -402,7 +402,7 @@ class MKLDNNReLULayer : public MKLDNNLayer<Dtype> , public NeuronLayer<Dtype>  {
     , reluFwd_pd(), reluBwd_pd()
     , fwd_top_data_memory(), bwd_bottom_diff_memory()
     , fwd_bottom_data_primitive(), bwd_top_diff_primitive()
-    , num_(0), width_(0), height_(0), channels_(0)
+    , shape_(0)
   {
     PERFORMANCE_EVENT_ID_RESET(perf_id_fw_);
     PERFORMANCE_EVENT_ID_RESET(perf_id_bw_);
@@ -431,7 +431,7 @@ class MKLDNNReLULayer : public MKLDNNLayer<Dtype> , public NeuronLayer<Dtype>  {
     MKLDNNPrimitive<Dtype> reluFwd, reluBwd;
     shared_ptr<memory> fwd_top_data_memory, bwd_bottom_diff_memory;
     shared_ptr<primitive> fwd_bottom_data_primitive, bwd_top_diff_primitive, bwd_bottom_data_primitive;
-    int32_t num_, width_, height_, channels_;
+    vector<int> shape_;
 
     PERFORMANCE_EVENT_ID_DECL(perf_id_fw_);
     PERFORMANCE_EVENT_ID_DECL(perf_id_bw_);
@@ -480,7 +480,8 @@ class MKLDNNConcatLayer : public MKLDNNLayer<Dtype> , public Layer<Dtype> {
     vector<int> split_dims;
     bool in_place_;
 
-    int32_t num_, width_, height_, channels_, num_concats_;
+    int32_t num_concats_;
+    vector<int> shape_;
     int concat_dimension;
 
     PERFORMANCE_EVENT_ID_DECL(perf_id_fw_);
@@ -537,7 +538,7 @@ class MKLDNNEltwiseLayer : public MKLDNNLayer<Dtype> , public Layer<Dtype>  {
     , eltwiseFwd_pd()
     , fwd_top_data_memory()
     , fwd_bottom_data_primitives_()
-    , num_(0), width_(0), height_(0), channels_(0)
+    , shape_(0)
     , num_bottoms_(0)
   {
     PERFORMANCE_EVENT_ID_RESET(perf_id_fw_);
@@ -573,7 +574,7 @@ class MKLDNNEltwiseLayer : public MKLDNNLayer<Dtype> , public Layer<Dtype>  {
     EltwiseParameter_EltwiseOp op_;
     vector<Dtype> coeffs_;
     Blob<int> max_idx_;
-    int32_t num_, width_, height_, channels_;
+    vector<int> shape_;
     int32_t num_bottoms_;
     bool stable_prod_grad_;
 
diff --git a/src/caffe/layers/mkldnn_batch_norm_layer.cpp b/src/caffe/layers/mkldnn_batch_norm_layer.cpp
index d3f76bfbd..b19ab7bab 100644
--- a/src/caffe/layers/mkldnn_batch_norm_layer.cpp
+++ b/src/caffe/layers/mkldnn_batch_norm_layer.cpp
@@ -64,10 +64,8 @@ void MKLDNNBatchNormLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom
 
     Layer<Dtype>::LayerSetUp(bottom, top);
 
-    channels_ = bottom[0]->channels();
-    height_   = bottom[0]->height();
-    width_    = bottom[0]->width();
-    num_      = bottom[0]->num();
+    shape_ = bottom[0]->shape();
+    const int channels = shape_[1];
 
     eps_ = this->layer_param_.batch_norm_param().eps();
     use_weight_bias_ = this->layer_param_.batch_norm_param().use_weight_bias();
@@ -77,12 +75,12 @@ void MKLDNNBatchNormLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom
     if (this->layer_param_.batch_norm_param().has_use_global_stats())
       use_global_stats_ = this->layer_param_.batch_norm_param().use_global_stats();
 
-    InitStatsBatchVars(num_);
+    InitStatsBatchVars(shape_[0]);
 
     this->blobs_.resize(3 + (use_weight_bias_ ? 1:0) + (use_weight_bias_ && bias_term_ ? 1:0));
 
     vector<int> sz;
-    sz.push_back(channels_);
+    sz.push_back(channels);
     this->blobs_[0].reset(new Blob<Dtype>(sz));
     this->blobs_[1].reset(new Blob<Dtype>(sz));
     sz[0]=1;
@@ -96,7 +94,7 @@ void MKLDNNBatchNormLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom
     //Optimization: use the temp blob to combine the scale and shift together. Avoid the additional copies.
     // Initialize scale and shift combination blob
     vector<int> scaleshift_blob_shape(1);
-    scaleshift_blob_shape[0] = 2*channels_;
+    scaleshift_blob_shape[0] = 2*channels;
     scaleshift_blob_.reset(new Blob<Dtype>(scaleshift_blob_shape));
     //Should initialize the scaleshift_blob_ buffer to 0, because when bias_term_ == false, need to pass zero bias to MKLDNN
     caffe_set(scaleshift_blob_shape[0], static_cast<Dtype>(0),
@@ -111,8 +109,8 @@ void MKLDNNBatchNormLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom
     if (use_weight_bias_) {
         // Initialize scale and shift
         vector<int> scaleshift_shape(1);
-        scaleshift_shape[0] = channels_;
-        VLOG(1) << "MKLDNNBatchNormLayer<Dtype>::LayerSetUp: channels_  = " << channels_;
+        scaleshift_shape[0] = channels;
+        VLOG(1) << "MKLDNNBatchNormLayer<Dtype>::LayerSetUp: channels_  = " << channels;
 
         this->blobs_[3].reset(new Blob<Dtype>(scaleshift_shape));
         this->blobs_[3]->set_cpu_data(scaleshift_blob_->mutable_cpu_data());
@@ -128,8 +126,8 @@ void MKLDNNBatchNormLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom
 
         if (bias_term_) {
             this->blobs_[4].reset(new Blob<Dtype>(scaleshift_shape));
-            this->blobs_[4]->set_cpu_data(scaleshift_blob_->mutable_cpu_data() + scaleshift_blob_->offset(channels_));
-            this->blobs_[4]->set_cpu_diff(scaleshift_diff_blob->mutable_cpu_diff() + scaleshift_blob_->offset(channels_));
+            this->blobs_[4]->set_cpu_data(scaleshift_blob_->mutable_cpu_data() + scaleshift_blob_->offset(channels));
+            this->blobs_[4]->set_cpu_diff(scaleshift_diff_blob->mutable_cpu_diff() + scaleshift_blob_->offset(channels));
             FillerParameter bias_filler_param(this->layer_param_.batch_norm_param().bias_filler());
             if (!this->layer_param_.batch_norm_param().has_bias_filler()) {
                 bias_filler_param.set_type("constant");
@@ -161,17 +159,10 @@ void MKLDNNBatchNormLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom
 {
     VLOG(1) << "MKLDNNBatchNormLayer<Dtype>::Reshape: " << this->layer_param_.name();
 
-    this->reshape = (this->width_ == bottom[0]->width() &&
-                     this->height_ == bottom[0]->height() &&
-                     this->channels_ == bottom[0]->channels() &&
-                     this->num_ == bottom[0]->num()) ? false : true;
+    this->reshape = (this->shape_ == bottom[0]->shape()) ? false : true;
+    this->shape_ = bottom[0]->shape();
 
-    this->width_ = bottom[0]->width();
-    this->height_ = bottom[0]->height();
-    this->num_ = bottom[0]->num();
-    this->channels_ = bottom[0]->channels();
-
-    InitStatsBatchVars(this->num_);
+    InitStatsBatchVars(this->shape_[0]);
 
     //Fix: should reshape the top blob with the real size of bottom blob
     //top[0]->Reshape(this->num_, this->channels_, this->height_, this->width_);
@@ -194,10 +185,19 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNorm(const vector<Blob<Dtype>*>& bott
     if (use_weight_bias_) flags |= use_scale_shift;
     if (use_global_stats_) flags |= use_global_stats;
 
-    int32_t n  = this->num_;
-    int32_t iw = this->width_;
-    int32_t ih = this->height_;
-    int32_t ic = this->channels_;    
+    memory::format src_mfmt;
+    auto tensor_size = this->shape_.size();
+    memory::dims dim = this->shape_;
+    if(tensor_size == 5) {
+        src_mfmt = memory::format::ncdhw;
+    } else {
+        CHECK_LE(tensor_size, 4)
+            << "mkldnn batch normalization layer only supports dim size <= 5!";
+        if (tensor_size < 4) dim.resize(4, 1); // extend to nchw with dim 1 to match mkldnn format
+        src_mfmt = memory::format::nchw;
+    }
+
+    const int channels = this->shape_[1];
 
     bool bottom_data_is_prv = (const_cast<Dtype*>(bottom[0]->prv_data()) != NULL);
 
@@ -216,13 +216,13 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNorm(const vector<Blob<Dtype>*>& bott
         usr_mpd = mem_descr->usr_memory_pd();
         prv_mpd = mem_descr->prv_memory_pd();
     } else {
-        input_md.reset(new memory::desc({{n, ic, ih, iw}}, mpcsn, memory::format::nchw));   //MKLDNN batch norm only support 4D memory descriptor!
+        input_md.reset(new memory::desc({dim}, mpcsn, src_mfmt));
         usr_mpd.reset(new memory::primitive_desc(*input_md, cpu_engine));
     }
     output_md = input_md;
     input_stats_md.reset(new memory::desc(*input_md));
     CHECK(input_stats_md->data.ndims > 0 &&
-          input_stats_md->data.dims[0] == this->num_);
+          input_stats_md->data.dims[0] == this->shape_[0]);
     input_stats_md->data.dims[0] = stats_batch_size_;
 
     // ---- Initialize BatchNorm primitive descriptor -------------
@@ -262,7 +262,7 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNorm(const vector<Blob<Dtype>*>& bott
     if (use_weight_bias_) {
         //For test in train, memory address of blobs_[3] and blobs_[4] will be changed when share data from train net. If the address
         // of blobs_[3] and blobs_[4] are continued, we will use them immediately, otherwise we will copy them to scaleshift_blob_ in Forward.
-        if((this->blobs_[3]->mutable_cpu_data() + this->blobs_[3]->offset(channels_)) == this->blobs_[4]->mutable_cpu_data()){
+        if((this->blobs_[3]->mutable_cpu_data() + this->blobs_[3]->offset(channels)) == this->blobs_[4]->mutable_cpu_data()){
             scaleshift_memory.reset(new memory(BatchNormFwd_pd->weights_primitive_desc(), this->blobs_[3]->mutable_cpu_data()));
         }else {
             scaleshift_memory.reset(new memory(BatchNormFwd_pd->weights_primitive_desc(), this->scaleshift_blob_->mutable_cpu_data()));
@@ -309,8 +309,8 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNorm(const vector<Blob<Dtype>*>& bott
         LOG(INFO) << "MKLDNN batch norm only support 4D memory descriptor! Use 4D for calculation and reshape to 2D for output!";
 #endif
         vector<int> top_shape;
-        top_shape.push_back(bottom[0]->num());
-        top_shape.push_back(bottom[0]->channels());
+        top_shape.push_back(bottom[0]->shape(0));
+        top_shape.push_back(bottom[0]->shape(1));
         top[0]->Reshape(top_shape);
     }
 }
@@ -319,12 +319,15 @@ template <typename Dtype>
 template <bool diff>
 shared_ptr<memory> MKLDNNBatchNormLayer<Dtype>::GetStatsBatchMemory(
   shared_ptr<MKLDNNMemoryDescriptor<Dtype, diff> > mkldnn_mem, int idx) {
-    long data_offset =
-      idx * stats_batch_size_ * this->channels_ * this->width_ * this->height_;
+    int length = this->shape_[1];
+    for(int i=2;i<this->shape_.size();i++)
+        length *= this->shape_[i];
+
+    long data_offset =  idx * stats_batch_size_ * length;
     engine cpu_engine = CpuEngine::Instance().get_engine();
     shared_ptr<memory::desc> stats_md = mkldnn_mem->get_memory_desc();
     CHECK(stats_md->data.ndims > 0 &&
-          stats_md->data.dims[0] == this->num_);
+          stats_md->data.dims[0] == this->shape_[0]);
     stats_md->data.dims[0] = stats_batch_size_;
     shared_ptr<memory::primitive_desc> stats_mpd(
       new memory::primitive_desc(*stats_md, cpu_engine));
@@ -338,6 +341,8 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNormFwdPrimitive(int idx) {
     input_stats[idx] = GetStatsBatchMemory<false>(fwd_bottom_data, idx);
     output_stats[idx] = GetStatsBatchMemory<false>(fwd_top_data, idx);
 
+    const int channels = this->shape_[1];
+
     // ---- Create BatchNorm --------------------
     if (this->phase_ == TEST && !use_global_stats_) {
         if (use_weight_bias_) {
@@ -353,9 +358,9 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNormFwdPrimitive(int idx) {
         variance_memory[idx].reset(new memory(BatchNormFwd_pd->variance_primitive_desc()));
 
         if (use_global_stats_) {
-            caffe_copy<Dtype>(this->channels_, this->blobs_[0]->cpu_data(),
+            caffe_copy<Dtype>(channels, this->blobs_[0]->cpu_data(),
                 static_cast<Dtype *>(mean_memory[idx]->get_data_handle()));
-            caffe_copy<Dtype>(this->channels_, this->blobs_[1]->cpu_data(),
+            caffe_copy<Dtype>(channels, this->blobs_[1]->cpu_data(),
                static_cast<Dtype *>(variance_memory[idx]->get_data_handle()));
             if (use_weight_bias_) {
                 BatchNormFwd[idx].reset(new batch_normalization_forward(*BatchNormFwd_pd,
@@ -398,9 +403,11 @@ void MKLDNNBatchNormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom
     // update top that head at prv
     fwd_top_data->sync_before_write();
 
-    if((this->blobs_[3]->mutable_cpu_data() + this->blobs_[3]->offset(channels_)) != this->blobs_[4]->mutable_cpu_data()){
-        caffe_copy(channels_, this->blobs_[3]->cpu_data(), this->scaleshift_blob_->mutable_cpu_data());
-        caffe_copy(channels_, this->blobs_[4]->cpu_data(), this->scaleshift_blob_->mutable_cpu_data() + scaleshift_blob_->offset(channels_));
+    const int channels = this->shape_[1];
+
+    if((this->blobs_[3]->mutable_cpu_data() + this->blobs_[3]->offset(channels)) != this->blobs_[4]->mutable_cpu_data()){
+        caffe_copy(channels, this->blobs_[3]->cpu_data(), this->scaleshift_blob_->mutable_cpu_data());
+        caffe_copy(channels, this->blobs_[4]->cpu_data(), this->scaleshift_blob_->mutable_cpu_data() + scaleshift_blob_->offset(channels));
     }
 
     for (int stats_batch_idx = 0; stats_batch_idx < num_stats_batches_; stats_batch_idx++) {
@@ -429,11 +436,11 @@ void MKLDNNBatchNormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom
         Dtype *variance_buffer_ = (Dtype *)(variance_memory[stats_batch_idx]->get_data_handle());
         this->blobs_[2]->mutable_cpu_data()[0] *= moving_average_fraction_;
         this->blobs_[2]->mutable_cpu_data()[0] += 1;
-        caffe_cpu_axpby<Dtype>(this->channels_, Dtype(1), mean_buffer_,
+        caffe_cpu_axpby<Dtype>(channels, Dtype(1), mean_buffer_,
             moving_average_fraction_, this->blobs_[0]->mutable_cpu_data());
-        int m = bottom[0]->count()/num_stats_batches_/channels_;
+        int m = bottom[0]->count()/num_stats_batches_/channels;
         Dtype bias_correction_factor = m > 1 ? Dtype(m)/(m-1) : 1;
-        caffe_cpu_axpby<Dtype>(this->channels_, bias_correction_factor,
+        caffe_cpu_axpby<Dtype>(channels, bias_correction_factor,
             variance_buffer_, moving_average_fraction_,
             this->blobs_[1]->mutable_cpu_data());
       }
@@ -450,10 +457,17 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNormBwd(
 {
     if (std::is_same<Dtype, double>::value) NOT_IMPLEMENTED;
 
-    int32_t n = this->num_;
-    int32_t w = this->width_;
-    int32_t h = this->height_;
-    int32_t c = this->channels_;
+    memory::format src_mfmt;
+    auto tensor_size = this->shape_.size();
+    memory::dims dim = this->shape_;
+    if(tensor_size == 5) {
+        src_mfmt = memory::format::ncdhw;
+    } else {
+        CHECK_LE(tensor_size, 4)
+            << "mkldnn batch normalization layer only supports dim size <= 5!";
+        if (tensor_size < 4) dim.resize(4, 1); // extend to nchw with dim 1 to match mkldnn format
+        src_mfmt = memory::format::nchw;
+    }
 
     unsigned flags = 0;
     if (use_weight_bias_) flags |= use_scale_shift;
@@ -475,16 +489,16 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNormBwd(
         usr_diff_mpd = mem_descr->usr_memory_pd();
         prv_diff_mpd = mem_descr->prv_memory_pd();
     } else {
-        top_diff_md.reset(new memory::desc({{n, c, h, w}}, mpcsn, memory::format::nchw));   //MKLDNN batch norm only support 4D memory descriptor!
+        top_diff_md.reset(new memory::desc({dim}, mpcsn, src_mfmt));   //MKLDNN batch norm only support 4D memory descriptor!
         usr_diff_mpd.reset(new memory::primitive_desc(*top_diff_md, cpu_engine));
     }
     top_diff_stats_md.reset(new memory::desc(*top_diff_md));
     CHECK(top_diff_stats_md->data.ndims > 0 &&
-          top_diff_stats_md->data.dims[0] == this->num_);
+          top_diff_stats_md->data.dims[0] == this->shape_[0]);
     top_diff_stats_md->data.dims[0] = stats_batch_size_;
     output_stats_md.reset(new memory::desc(output_memory->get_primitive_desc().desc()));
     CHECK(output_stats_md->data.ndims > 0 &&
-          output_stats_md->data.dims[0] == this->num_);
+          output_stats_md->data.dims[0] == this->shape_[0]);
     output_stats_md->data.dims[0] = stats_batch_size_;
 
     // ---- Initialize bnrm primitive descriptor -------------
diff --git a/src/caffe/layers/mkldnn_concat_layer.cpp b/src/caffe/layers/mkldnn_concat_layer.cpp
index 9299f9cea..6143f57ae 100644
--- a/src/caffe/layers/mkldnn_concat_layer.cpp
+++ b/src/caffe/layers/mkldnn_concat_layer.cpp
@@ -73,96 +73,33 @@ void MKLDNNConcatLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
     concat_dimension = bottom[0]->CanonicalAxisIndex(concat_param.axis());
   }
 
+  vector<int> bottom_0_shape = bottom[0]->shape();
+  bottom_0_shape[concat_dimension] = 0;
   for (auto i = 1; i < num_concats_; ++i) {
-    if (concat_dimension == 0)
-    {
-      CHECK_EQ(bottom[0]->channels(), bottom[i]->channels());
-      CHECK_EQ(bottom[0]->height(), bottom[i]->height());
-      CHECK_EQ(bottom[0]->width(), bottom[i]->width());
-      break;
-    }
-    else if (concat_dimension == 1)
-    {
-      CHECK_EQ(bottom[0]->num(), bottom[i]->num());
-      if (!concat_param.per_fla_fuse()){
-        CHECK_EQ(bottom[0]->height(), bottom[i]->height());
-        CHECK_EQ(bottom[0]->width(), bottom[i]->width());
-      }
-      break;
-    }
-    else if (concat_dimension == 2)
-    {
-      CHECK_EQ(bottom[0]->num(), bottom[i]->num());
-      CHECK_EQ(bottom[0]->channels(), bottom[i]->channels());
-      CHECK_EQ(bottom[0]->width(), bottom[i]->width());
-      break;
-    }
-    else if (concat_dimension == 3)
-    {
-      CHECK_EQ(bottom[0]->num(), bottom[i]->num());
-      CHECK_EQ(bottom[0]->channels(), bottom[i]->channels());
-      CHECK_EQ(bottom[0]->height(), bottom[i]->height());
-      break;
+    vector<int> bottom_i_shape = bottom[i]->shape();
+    bottom_i_shape[concat_dimension] = 0;
+    if (concat_dimension == 1 && concat_param.per_fla_fuse()) {
+      for(int i = 0; i < concat_dimension + 1; i++) CHECK_EQ(bottom_i_shape[i] == bottom_0_shape[i], true);
+      continue;
     }
+    CHECK_EQ(bottom_0_shape == bottom_i_shape, true);
   }
 
   split_dims.reserve(num_concats_);
-  if (concat_dimension == 0)
-  {
-    num_ = 0;
-    channels_ = bottom[0]->channels();
-    height_ = bottom[0]->height();
-    width_ = bottom[0]->width();
+  shape_ = bottom[0]->shape();
+  shape_[concat_dimension] = 0;
+  if (concat_dimension == 1 && concat_param.per_fla_fuse()) {
+    for(int i= concat_dimension + 1; i < shape_.size(); i++) shape_[i] = 1;
     for (auto i = 0; i < num_concats_; ++i) {
       CHECK_EQ(dim_src, bottom[i]->shape().size());
-      split_dims[i] = bottom[i]->num();
-      num_ += split_dims[i];
+      split_dims[i] = bottom[i]->count(concat_dimension);
+      shape_[concat_dimension] += split_dims[i];
     }
-  }
-  else if (concat_dimension == 1)
-  {
-    num_ = bottom[0]->num();
-    channels_ = 0;
-    height_ = bottom[0]->height();
-    width_ = bottom[0]->width();
-    if (concat_param.per_fla_fuse()){
-      height_ = 1;
-      width_ = 1;
-      for (auto i = 0; i < num_concats_; ++i) {
-        CHECK_EQ(dim_src, bottom[i]->shape().size());
-        split_dims[i] = bottom[i]->channels()*bottom[i]->height()*bottom[i]->width();
-        channels_ += split_dims[i];
-      }
-    } else{
-      for (auto i = 0; i < num_concats_; ++i) {
-        CHECK_EQ(dim_src, bottom[i]->shape().size());
-        split_dims[i] = bottom[i]->channels();
-        channels_ += split_dims[i];
-      }
-    }
-  }
-  else if (concat_dimension == 2)
-  {
-    num_ = bottom[0]->num();
-    channels_ = bottom[0]->channels();
-    height_ = 0;
-    width_ = bottom[0]->width();
-    for (auto i = 0; i < num_concats_; ++i) {
-      CHECK_EQ(dim_src, bottom[i]->shape().size());
-      split_dims[i] = bottom[i]->height();
-      height_ += split_dims[i];
-    }
-  }
-  else if (concat_dimension == 3)
-  {
-    num_ = bottom[0]->num();
-    channels_ = bottom[0]->channels();
-    height_ = bottom[0]->height();
-    width_ = 0;
+  } else {
     for (auto i = 0; i < num_concats_; ++i) {
       CHECK_EQ(dim_src, bottom[i]->shape().size());
-      split_dims[i] = bottom[i]->width();
-      width_ += split_dims[i];
+      split_dims[i] = bottom[i]->shape(concat_dimension);
+      shape_[concat_dimension] += split_dims[i];
     }
   }
 }
@@ -172,100 +109,31 @@ void MKLDNNConcatLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
   VLOG(1) << "MKLDNNConcatLayer<Dtype>::Reshape: "  << this->layer_param_.name();
   const ConcatParameter& concat_param = this->layer_param_.concat_param();
-  if (concat_dimension == 0)
-  {
-    //Need to re-calculate the shape duo to the change of batch size
-    num_ = 0;
-    channels_ = bottom[0]->channels();
-    height_ = bottom[0]->height();
-    width_ = bottom[0]->width();
-    //Also need to reshape the concat dim, in case the concat dim is just be reshaped by batch size
-    for (auto i = 0; i < num_concats_; ++i) {
-        split_dims[i] = bottom[i]->num();
-        num_ += split_dims[i];
-    }
 
-    if (this->channels_ == bottom[0]->channels() &&
-        this->height_ == bottom[0]->height() &&
-        this->width_ == bottom[0]->width()) {
-      this->reshape = false;
-    } else {
-      this->reshape = true;
-    }
-  }
-  else if (concat_dimension == 1)
-  {
-    num_ = bottom[0]->num();
-    channels_ = 0;
-    height_ = bottom[0]->height();
-    width_ = bottom[0]->width();
-    if (concat_param.per_fla_fuse()){
-      height_ = 1;
-      width_ = 1;
-      for (auto i = 0; i < num_concats_; ++i) {
-          split_dims[i] = bottom[i]->channels()*bottom[i]->height()*bottom[i]->width();
-          channels_ += split_dims[i];
-      }
-      if (this->num_ == bottom[0]->num()) { 
-        this->reshape = false;
-      } else {
-        this->reshape = true;
-      }
+  vector<int> dim = bottom[0]->shape();
+  int dim_src = bottom[0]->shape().size();
 
-    } else{
-      for (auto i = 0; i < num_concats_; ++i) {
-        split_dims[i] = bottom[i]->channels();
-        channels_ += split_dims[i];
-      }
-      if (this->num_ == bottom[0]->num() &&
-          this->height_ == bottom[0]->height() &&
-          this->width_ == bottom[0]->width()) {
-        this->reshape = false;
-      } else {
-        this->reshape = true;
-      }
-    }
-  }
-  else if (concat_dimension == 2)
-  {
-    num_ = bottom[0]->num();
-    channels_ = bottom[0]->channels();
-    height_ = 0;
-    width_ = bottom[0]->width();
+  split_dims.clear();
+  dim[concat_dimension] = 0;
+  if (concat_dimension == 1 && concat_param.per_fla_fuse()) {
+    for(int i= concat_dimension + 1; i < shape_.size(); i++) dim[i] = 1;
     for (auto i = 0; i < num_concats_; ++i) {
-        split_dims[i] = bottom[i]->height();
-        height_ += split_dims[i];
-    }
-
-    if (this->num_ == bottom[0]->num() &&
-        this->channels_ == bottom[0]->channels() &&
-        this->width_ == bottom[0]->width()) {
-      this->reshape = false;
-    } else {
-      this->reshape = true;
+      CHECK_EQ(dim_src, bottom[i]->shape().size());
+      split_dims[i] = bottom[i]->count(concat_dimension);
+      dim[concat_dimension] += split_dims[i];
     }
-  }
-  else if (concat_dimension == 3)
-  {
-    num_ = bottom[0]->num();
-    channels_ = bottom[0]->channels();
-    height_ = bottom[0]->height();
-    width_ = 0;
+  } else {
     for (auto i = 0; i < num_concats_; ++i) {
-        split_dims[i] = bottom[i]->width();
-        width_ += split_dims[i];
+      CHECK_EQ(dim_src, bottom[i]->shape().size());
+      split_dims[i] = bottom[i]->shape(concat_dimension);
+      dim[concat_dimension] += split_dims[i];
     }
+  } 
 
-    if (this->num_ == bottom[0]->num() &&
-        this->channels_ == bottom[0]->channels() &&
-        this->height_ == bottom[0]->height()) {
-      this->reshape = false;
-    } else {
-      this->reshape = true;
-    }
-  }
+  this->reshape = (dim != shape_);
+  shape_ = dim;
 
-  top[0]->Reshape(num_, channels_, height_, width_);
+  top[0]->Reshape(shape_);
 }
 
 template <typename Dtype>
@@ -273,37 +141,20 @@ void MKLDNNConcatLayer<Dtype>::InitConcatFwd(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
   if (std::is_same<Dtype, double>::value)  NOT_IMPLEMENTED;
 
-  //Fix: MKLDNN concat layer should use 4D blob as input! Reshape the 2D input blob into 4D for calculation!
-  bool has_spatial = (bottom[0]->shape().size() != 2);
-#ifdef DEBUG
-  LOG(INFO) << "has_spatial flag value: " << has_spatial;
-#endif
-  if (has_spatial == false)
-  {
-#ifdef DEBUG
-      LOG(INFO) << "size of bottom blob: " << bottom[0]->shape().size();
-      LOG(INFO) << "size of top blob: " << top[0]->shape().size();
-      LOG(INFO) << "MKLDNN concat layer only support 4D blob as input! Reshape the 2D input blob into 4D for calculation!";
-#endif
-      for (auto i = 0; i < num_concats_; i++)
-      {
-          vector<int> bottom_4D_shape;
-          int bottom_4D_height = 1;
-          int bottom_4D_width = 1;
-          bottom_4D_shape.push_back(bottom[i]->num());
-          bottom_4D_shape.push_back(bottom[i]->channels());
-          bottom_4D_shape.push_back(bottom_4D_height);
-          bottom_4D_shape.push_back(bottom_4D_width);
-          bottom[i]->Reshape(bottom_4D_shape, false);
-      }      
-  }
   engine cpu_engine = CpuEngine::Instance().get_engine();
   memory::data_type usr_dt = memory::data_type::f32;
   memory::data_type prv_dt = usr_dt;
   // memory::format mfmt_any = memory::format::any;
-  memory::format mfmt_nchw = memory::format::nchw;
+  memory::format mfmt_out;
+  if(this->shape_.size() == 5) {
+    mfmt_out = memory::format::ncdhw;
+  } else {
+    CHECK_LE(this->shape_.size(), 4) << "mkldnn concat layer doesn't support this dim size!";
+    mfmt_out = memory::format::nchw;
+  }
 
-  memory::dims output_tz = {num_, channels_, height_, width_};
+  memory::dims output_tz = this->shape_;
+  if(output_tz.size() < 4) output_tz.resize(4, 1); // resize to nchw with dim 1
   std::vector<memory::primitive_desc> srcs_mpd;
   std::vector<primitive::at> srcs;
   fwd_bottom_data.clear();
@@ -345,28 +196,14 @@ void MKLDNNConcatLayer<Dtype>::InitConcatFwd(const vector<Blob<Dtype>*>& bottom,
     fwd_bottom_data.push_back(boost::shared_ptr<MKLDNNData<Dtype> >());
     mem_descr.push_back(boost::shared_ptr<MKLDNNMemoryDescriptor<Dtype, false>>());
 
-    memory::dims input_tz = {0, 0, 0, 0};
-    if (concat_dimension == 0)
-    {
-      input_tz = {split_dims[i], channels_, height_, width_};
-    }
-    else if (concat_dimension == 1)
-    {
-      input_tz = {num_, split_dims[i], height_, width_};
-    }
-    else if (concat_dimension == 2)
-    {
-      input_tz = {num_, channels_, split_dims[i], width_};
-    }
-    else if (concat_dimension == 3)
-    {
-      input_tz = {num_, channels_, height_, split_dims[i]};
-    }
+    memory::dims input_tz = this->shape_;
+    if(input_tz.size() < 4) input_tz.resize(4, 1); // resize to nchw with dim 1
+    input_tz[concat_dimension] = split_dims[i];
 
-    memory::format src_mfmt = mfmt_nchw;
+    memory::format src_mfmt = mfmt_out;
     shared_ptr<memory::primitive_desc> prv_src_mpd;
     shared_ptr<memory::primitive_desc> usr_src_mpd(
-        new memory::primitive_desc({input_tz, usr_dt, mfmt_nchw}, cpu_engine));
+        new memory::primitive_desc({input_tz, usr_dt, mfmt_out}, cpu_engine));
  
     if (const_cast<Dtype*>(bottom[i]->prv_data()) != NULL) {
       scale = 1.;
@@ -396,7 +233,7 @@ void MKLDNNConcatLayer<Dtype>::InitConcatFwd(const vector<Blob<Dtype>*>& bottom,
   }
 
   shared_ptr<memory::primitive_desc> usr_dst_mpd(new memory::primitive_desc(
-        {output_tz, usr_dt, mfmt_nchw}, cpu_engine));
+        {output_tz, usr_dt, mfmt_out}, cpu_engine));
 
   concatFwd_pd.reset(new concat::primitive_desc(concat_dimension, srcs_mpd));
 
@@ -414,7 +251,7 @@ void MKLDNNConcatLayer<Dtype>::InitConcatFwd(const vector<Blob<Dtype>*>& bottom,
  
   fwd_output_memory = fwd_top_data->create_output_memory();
 
-  memory::format base_mfmt = mfmt_nchw;
+  memory::format base_mfmt = mfmt_out;
   float base_scale = 1.;
   this->in_place_ = true;
 
@@ -423,7 +260,7 @@ void MKLDNNConcatLayer<Dtype>::InitConcatFwd(const vector<Blob<Dtype>*>& bottom,
       base_mfmt = src_mfmts[i];
       base_scale = bottom_scales[i];
     }
-    else if((concat_dimension != 0 && bottom[i]->shape()[concat_dimension - 1] != 1) || base_mfmt != src_mfmts[i] || fabs(base_scale-bottom_scales[i]) > FLT_MIN || different_input_dt) {
+    else if((concat_dimension != 0 && bottom[i]->shape()[concat_dimension - 1] != 1) || base_mfmt != src_mfmts[i] || fabs(base_scale-bottom_scales[i]) > FLT_MIN || different_input_dt || bottom[i]->prv_data() == NULL) {
       this->in_place_ = false;
       break;
     }
@@ -468,15 +305,24 @@ void MKLDNNConcatLayer<Dtype>::InitConcatBwd(const vector<Blob<Dtype>*>& top,
   engine cpu_engine = CpuEngine::Instance().get_engine();
   memory::data_type data_type = memory::data_type::f32;
   // memory::format mfmt_any = memory::format::any;
-  memory::format mfmt_nchw = memory::format::nchw;
-  memory::format diff_dst_mfmt = mfmt_nchw;
+  memory::format mfmt_out;
+  memory::dims offsets;
+  if(this->shape_.size() == 5) {
+    mfmt_out =  memory::format::ncdhw;
+    offsets = {0, 0, 0, 0, 0};
+  } else {
+    mfmt_out =  memory::format::nchw;
+    offsets = {0, 0, 0, 0};
+  }
+
+  memory::format diff_dst_mfmt = mfmt_out;
 
-  memory::dims input_tz = {num_, channels_, height_, width_};
-  memory::dims offsets = {0, 0, 0, 0};
+  memory::dims input_tz = this->shape_;
+  if (input_tz.size() < 4) input_tz.resize(4, 1);
 
   shared_ptr<memory::primitive_desc> prv_diff_dst_mpd;
   shared_ptr<memory::primitive_desc> usr_diff_dst_mpd(
-    new memory::primitive_desc({input_tz, data_type, mfmt_nchw},
+    new memory::primitive_desc({input_tz, data_type, mfmt_out},
         cpu_engine));
 
   bool top_diff_is_prv = (const_cast<Dtype*>(top[0]->prv_diff()) != NULL);
@@ -502,26 +348,12 @@ void MKLDNNConcatLayer<Dtype>::InitConcatBwd(const vector<Blob<Dtype>*>& top,
     bwd_bottom_diff.push_back(boost::shared_ptr<MKLDNNDiff<Dtype> >());
     reorders.push_back(MKLDNNPrimitive<Dtype>());
 
-    memory::dims dims = {0, 0, 0, 0};
-    if (concat_dimension == 0)
-    {
-      dims = {split_dims[i], channels_, height_, width_};
-    }
-    else if (concat_dimension == 1)
-    {
-      dims = {num_, split_dims[i], height_, width_};
-    }
-    else if (concat_dimension == 2)
-    {
-      dims = {num_, channels_, split_dims[i], width_};
-    }
-    else if (concat_dimension == 3)
-    {
-      dims = {num_, channels_, height_, split_dims[i]};
-    }
+    memory::dims dims = this->shape_;
+    if (dims.size() < 4) dims.resize(4, 1);
+    dims[concat_dimension] = split_dims[i];
 
     shared_ptr<memory::primitive_desc> usr_diff_src_mpd(
-      new memory::primitive_desc({dims, data_type, mfmt_nchw},
+      new memory::primitive_desc({dims, data_type, mfmt_out},
           cpu_engine));
     shared_ptr<memory::primitive_desc> prv_diff_src_mpd(
       new memory::primitive_desc({dims, data_type, diff_dst_mfmt},
@@ -587,8 +419,24 @@ void MKLDNNConcatLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top
   LOG(INFO) << "MKLDNNConcatLayer<Dtype>::Backward_cpu: " << this->layer_param_.name();
 #endif
 
-  if ((reorders.size() == 0) || (true == this->reshape))
+  if ((reorders.size() == 0) || (true == this->reshape)) {
+    bool concat_axis = true;
+    for (auto i = 0; i < num_concats_; i++) {
+      if (bottom[i]->shape()[concat_dimension] % 16 != 0) {
+        concat_axis = false;
+        break;
+      }
+    }
+    // mkldnn view primitive creation has restriction if viewed area (offset or size) is not
+    // aligned on block size (would assert if such case happens).
+    // This case usually would be triggered when the bottom blob's dim along concat axis is
+    // not 16-dividable and the top blob's block size becomes 16 due to the following mkldnn
+    // layer's reorder.
+    // To avoid such assertion, we have to explicitly convert the top blob state from prv to cpu state.
+    if (top[0]->prv_diff() != NULL && !concat_axis)
+      top[0]->mutable_cpu_diff();
     InitConcatBwd(top, propagate_down, bottom);
+  }
   bwd_top_diff->sync_before_read();
   for (auto i = 0; i < num_concats_; ++i) {
     bwd_bottom_diff[i]->sync_before_write();
diff --git a/src/caffe/layers/mkldnn_eltwise_layer.cpp b/src/caffe/layers/mkldnn_eltwise_layer.cpp
index aed8db34f..b67ce4052 100644
--- a/src/caffe/layers/mkldnn_eltwise_layer.cpp
+++ b/src/caffe/layers/mkldnn_eltwise_layer.cpp
@@ -78,15 +78,10 @@ template <typename Dtype>
 void MKLDNNEltwiseLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top)
 {
     VLOG(1) << "MKLDNNEltwiseLayer<Dtype>::Reshape: " << this->layer_param_.name();
-    this->reshape = (this->width_ == bottom[0]->width() &&
-                     this->height_ == bottom[0]->height() &&
-                     this->channels_ == bottom[0]->channels() &&
-                     this->num_ == bottom[0]->num()) ? false : true;
-
-    this->width_ = bottom[0]->width();
-    this->height_ = bottom[0]->height();
-    this->num_ = bottom[0]->num();
-    this->channels_ = bottom[0]->channels();
+    this->reshape = (this->shape_ == bottom[0]->shape()) ? false : true;
+    this->shape_ = bottom[0]->shape();
+    CHECK_LE(this->shape_.size(), 5)
+      << "Tensor dimension must be less than 6";
 
     switch (op_)
     {
@@ -129,11 +124,6 @@ void MKLDNNEltwiseLayer<Dtype>::InitEltwiseFwd(const vector<Blob<Dtype>*>& botto
 {
     if (std::is_same<Dtype, double>::value) NOT_IMPLEMENTED;
     
-    int32_t n  = this->num_;
-    int32_t iw = this->width_;
-    int32_t ih = this->height_;
-    int32_t ic = this->channels_;
-
     // If we just do simple adding, scale is 1.0 for all inputs we have
     std::vector<float> scale(num_bottoms_, 1.0);
     //Eltwise layer is supporting multiplication coefficient and this scale value can be used for that.
@@ -144,7 +134,17 @@ void MKLDNNEltwiseLayer<Dtype>::InitEltwiseFwd(const vector<Blob<Dtype>*>& botto
 
     engine cpu_engine = CpuEngine::Instance().get_engine();
     memory::data_type mpcsn = memory::data_type::f32;
-    memory::format mfmt_nchw = memory::format::nchw;
+
+    memory::format src_mfmt;
+    auto tensor_size = this->shape_.size();
+    memory::dims dim = this->shape_;
+    if(tensor_size == 5) {
+        src_mfmt = memory::format::ncdhw;
+    } else {
+        CHECK_LE(tensor_size, 4) << "The mkldnn eltwise layer only supports dim size <= 5!";
+        if (tensor_size < 4) dim.resize(4, 1);
+        src_mfmt = memory::format::nchw;
+    }
 
     // ---- Initialize memory descriptors -------------
     std::vector<memory::data_type> prv_dt(num_bottoms_, memory::data_type::f32);
@@ -177,10 +177,10 @@ void MKLDNNEltwiseLayer<Dtype>::InitEltwiseFwd(const vector<Blob<Dtype>*>& botto
     for (auto i = 0; i < num_bottoms_; i++) 
     {
         fwd_bottom_data.push_back(boost::shared_ptr<MKLDNNData<Dtype> >());
-        memory::format bottom_data_mfmt = mfmt_nchw;
+        memory::format bottom_data_mfmt = src_mfmt;
         shared_ptr<memory::primitive_desc> prv_bottom_data_mpd;
         shared_ptr<memory::primitive_desc> usr_bottom_data_mpd(
-            new memory::primitive_desc({{n, ic, ih, iw}, mpcsn, mfmt_nchw}, cpu_engine));
+            new memory::primitive_desc({dim, mpcsn, src_mfmt}, cpu_engine));
 
         bool bottom_data_is_prv = (const_cast<Dtype*>(bottom[i]->prv_data()) != NULL);
         if (bottom_data_is_prv)
@@ -194,11 +194,11 @@ void MKLDNNEltwiseLayer<Dtype>::InitEltwiseFwd(const vector<Blob<Dtype>*>& botto
                     mem_descr->prv_memory_pd()->desc().data.data_type);
             }
             prv_bottom_data_mpd.reset(new memory::primitive_desc(
-              {{n, ic, ih, iw}, bottom_data_dt, bottom_data_mfmt}, cpu_engine));
+              {dim, bottom_data_dt, bottom_data_mfmt}, cpu_engine));
         }
 
         bottom_data_mpd.push_back(memory::primitive_desc(
-            {{n, ic, ih, iw}, bottom_data_dt, bottom_data_mfmt}, cpu_engine));
+            {dim, bottom_data_dt, bottom_data_mfmt}, cpu_engine));
 
         fwd_bottom_data[i].reset(new MKLDNNData<Dtype>(
             usr_bottom_data_mpd, prv_bottom_data_mpd, bottom[i], this));        
@@ -208,13 +208,13 @@ void MKLDNNEltwiseLayer<Dtype>::InitEltwiseFwd(const vector<Blob<Dtype>*>& botto
     }
 
     shared_ptr<memory::primitive_desc> usr_top_data_mpd(new memory::primitive_desc(
-        {{n, ic, ih, iw}, mpcsn, mfmt_nchw}, cpu_engine));
+        {dim, mpcsn, src_mfmt}, cpu_engine));
     
     // ---- Determining engine to use -----------------------
     std::string subengines = this->layer_param_.engine();
     if (subengines.find("MKLDNN") == std::string::npos || subengines == "MKLDNN")
         subengines = "MKLDNN:CPU";
-    eltwiseFwd_pd.reset(new sum::primitive_desc({{n, ic, ih, iw}, bottom_data_dt, memory::format::any}, scale, bottom_data_mpd));
+    eltwiseFwd_pd.reset(new sum::primitive_desc({dim, bottom_data_dt, memory::format::any}, scale, bottom_data_mpd));
     CHECK(eltwiseFwd_pd);
 
     shared_ptr<memory::primitive_desc> prv_top_data_mpd(new memory::primitive_desc(eltwiseFwd_pd->dst_primitive_desc()));
diff --git a/src/caffe/layers/mkldnn_relu_layer.cpp b/src/caffe/layers/mkldnn_relu_layer.cpp
index 4c5f83456..78e5125c6 100644
--- a/src/caffe/layers/mkldnn_relu_layer.cpp
+++ b/src/caffe/layers/mkldnn_relu_layer.cpp
@@ -60,14 +60,12 @@ void MKLDNNReLULayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom
 
     NeuronLayer<Dtype>::Reshape(bottom, top);
 
-    this->reshape = (this->width_ == bottom[0]->width() &&
-                     this->height_ == bottom[0]->height() &&
-                     this->channels_ == bottom[0]->channels() &&
-                     this->num_ == bottom[0]->num()) ? false : true;
-    this->width_ = bottom[0]->width();
-    this->height_ = bottom[0]->height();
-    this->num_ = bottom[0]->num();
-    this->channels_ = bottom[0]->channels();
+    this->reshape = (this->shape_ == bottom[0]->shape()) ? false : true;
+
+    this->shape_ = bottom[0]->shape();
+
+    CHECK_LE(this->shape_.size(), 5)
+      << "Tensor dimension must be less than 6.";
 
 }
 
@@ -76,10 +74,6 @@ void MKLDNNReLULayer<Dtype>::InitReLUFwd(const vector<Blob<Dtype>*>& bottom, con
 {
     if (std::is_same<Dtype, double>::value) NOT_IMPLEMENTED;
     auto propagation = this->phase_ == TEST ? prop_kind::forward_scoring : prop_kind::forward_training;
-    int32_t n  = this->num_;
-    int32_t iw = this->width_;
-    int32_t ih = this->height_;
-    int32_t ic = this->channels_;
 
     Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
     bool bottom_data_is_prv = (const_cast<Dtype*>(bottom[0]->prv_data()) != NULL);
@@ -92,7 +86,18 @@ void MKLDNNReLULayer<Dtype>::InitReLUFwd(const vector<Blob<Dtype>*>& bottom, con
     shared_ptr<memory::primitive_desc> usr_data_mpd(NULL), prv_data_mpd(NULL), top_data_mpd(NULL);
     memory::data_type src_dt = memory::data_type::f32;
     memory::data_type top_dt = memory::data_type::f32;
-    memory::format src_mfmt = memory::format::nchw;
+
+    memory::format src_mfmt;
+    auto tensor_size = this->shape_.size();
+    memory::dims dim = this->shape_;
+    if(tensor_size == 5) {
+        src_mfmt = memory::format::ncdhw;
+    } else {
+        CHECK_LE(tensor_size, 4) << "The mkldnn relu layer only supports dim size <= 5!";
+        if (tensor_size < 4) dim.resize(4, 1); // extend to nchw with dim 1 to match mkldnn format
+        src_mfmt = memory::format::nchw;
+    }
+
     //bottom_data_is_prv = false;
     std::vector<float> scale;
     if (bottom_data_is_prv) {
@@ -105,13 +110,13 @@ void MKLDNNReLULayer<Dtype>::InitReLUFwd(const vector<Blob<Dtype>*>& bottom, con
         src_dt = static_cast<memory::data_type>(mem_descr->prv_memory_pd()->desc().data.data_type);
         src_mfmt = static_cast<memory::format>(mem_descr->prv_memory_pd()->desc().data.format);
     } else {
-        bottom_data_md.reset(new memory::desc({{n, ic, ih, iw}}, mpcsn, memory::format::nchw));
+        bottom_data_md.reset(new memory::desc({dim}, mpcsn, src_mfmt));
         usr_data_mpd.reset(new memory::primitive_desc(*bottom_data_md, cpu_engine));
         prv_data_mpd.reset(new memory::primitive_desc(*bottom_data_md, cpu_engine));
         scale.push_back(1.);
     }
     top_dt = src_dt;
-    top_data_mpd.reset(new memory::primitive_desc({{n,ic,ih,iw}, top_dt, src_mfmt}, cpu_engine));
+    top_data_mpd.reset(new memory::primitive_desc({dim, top_dt, src_mfmt}, cpu_engine));
 
     // ---- Initialize relu primitive descriptor -------------
     //relu_forward::desc reluFwd_desc(propagation, *bottom_data_md, negative_slope);
@@ -195,11 +200,6 @@ void MKLDNNReLULayer<Dtype>::InitReLUBwd(const vector<Blob<Dtype>*>& top
 {
     if (std::is_same<Dtype, double>::value) NOT_IMPLEMENTED;
 
-    int32_t n  = this->num_;
-    int32_t iw = this->width_;
-    int32_t ih = this->height_;
-    int32_t ic = this->channels_;
-
     Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
     bool top_diff_is_prv = top[0]->prv_diff() != NULL;
     bool inplace = (bottom[0] == top[0]);
@@ -220,13 +220,24 @@ void MKLDNNReLULayer<Dtype>::InitReLUBwd(const vector<Blob<Dtype>*>& top
     // ---- Initialize memory descriptors -------------
     shared_ptr<memory::desc> bottom_data_md;
     shared_ptr<memory::primitive_desc> usr_data_mpd(NULL), prv_data_mpd(NULL);
+
+    memory::format mfmt;
+    auto tensor_size = this->shape_.size();
+    memory::dims dim = this->shape_;
+    if(tensor_size == 5) {
+        mfmt = memory::format::ncdhw;
+    } else {
+        CHECK_LE(tensor_size, 4) << "The mkldnn relu layer only supports dim size <= 5!";
+        if (tensor_size < 4) dim.resize(4, 1); // extend to nchw with dim 1 to match mkldnn format
+        mfmt = memory::format::nchw;
+    }
     if (bottom_data_is_prv) {
         shared_ptr<MKLDNNMemoryDescriptor<Dtype, false> > mem_descr
             = get_mkldnn_prv_descriptor<Dtype, false>(bottom[0]);
         usr_data_mpd = mem_descr->usr_memory_pd();
         prv_data_mpd = mem_descr->prv_memory_pd();
     } else {
-        bottom_data_md.reset(new memory::desc({{n, ic, ih, iw}}, mpcsn, memory::format::nchw));
+        bottom_data_md.reset(new memory::desc(dim, mpcsn, mfmt));
         usr_data_mpd.reset(new memory::primitive_desc(*bottom_data_md, cpu_engine));
     }
 
@@ -276,7 +287,7 @@ void MKLDNNReLULayer<Dtype>::InitReLUBwd(const vector<Blob<Dtype>*>& top
           top[0]->set_prv_diff_descriptor(NULL);
       }
 
-      top_diff_md.reset(new memory::desc({{n, ic, ih, iw}}, mpcsn, memory::format::nchw));
+      top_diff_md.reset(new memory::desc(dim, mpcsn, mfmt));
       usr_diff_mpd.reset(new memory::primitive_desc(*top_diff_md, cpu_engine));
     }
     
diff --git a/src/caffe/layers/mvn_layer.cpp b/src/caffe/layers/mvn_layer.cpp
index 09542b105..3f9f4804d 100644
--- a/src/caffe/layers/mvn_layer.cpp
+++ b/src/caffe/layers/mvn_layer.cpp
@@ -45,19 +45,21 @@ namespace caffe {
 template <typename Dtype>
 void MVNLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
-  top[0]->Reshape(bottom[0]->num(), bottom[0]->channels(),
-      bottom[0]->height(), bottom[0]->width());
-  mean_.Reshape(bottom[0]->num(), bottom[0]->channels(),
-      1, 1);
-  variance_.Reshape(bottom[0]->num(), bottom[0]->channels(),
-      1, 1);
-  temp_.Reshape(bottom[0]->num(), bottom[0]->channels(),
-      bottom[0]->height(), bottom[0]->width());
+  top[0]->Reshape(bottom[0]->shape());
+  vector<int> temp_shape = bottom[0]->shape();
+  for (int i = 2; i < temp_shape.size(); i++)
+    temp_shape[i] = 1;
+  mean_.Reshape(temp_shape);
+  variance_.Reshape(temp_shape);
+  temp_.Reshape(bottom[0]->shape());
+
+  vector<int> shape = bottom[0]->shape();
+  shape[0] = 1;
   if ( this->layer_param_.mvn_param().across_channels() ) {
-    sum_multiplier_.Reshape(1, bottom[0]->channels(), bottom[0]->height(),
-                            bottom[0]->width());
+    sum_multiplier_.Reshape(shape);
   } else {
-    sum_multiplier_.Reshape(1, 1, bottom[0]->height(), bottom[0]->width());
+    shape[1] = 1;
+    sum_multiplier_.Reshape(shape);
   }
   Dtype* multiplier_data = sum_multiplier_.mutable_cpu_data();
   caffe_set(sum_multiplier_.count(), Dtype(1), multiplier_data);
@@ -71,9 +73,9 @@ void MVNLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
   Dtype* top_data = top[0]->mutable_cpu_data();
   int num;
   if (this->layer_param_.mvn_param().across_channels())
-    num = bottom[0]->num();
+    num = bottom[0]->shape(0);
   else
-    num = bottom[0]->num() * bottom[0]->channels();
+    num = bottom[0]->shape(0) * bottom[0]->shape(1);
 
   int dim = bottom[0]->count() / num;
 
@@ -118,9 +120,9 @@ void MVNLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 
   int num;
   if (this->layer_param_.mvn_param().across_channels())
-    num = bottom[0]->num();
+    num = bottom[0]->shape(0);
   else
-    num = bottom[0]->num() * bottom[0]->channels();
+    num = bottom[0]->shape(0) * bottom[0]->shape(1);
 
   int dim = bottom[0]->count() / num;
 
diff --git a/src/caffe/layers/mvn_layer.cu b/src/caffe/layers/mvn_layer.cu
index 739293be0..1df728d08 100644
--- a/src/caffe/layers/mvn_layer.cu
+++ b/src/caffe/layers/mvn_layer.cu
@@ -12,9 +12,9 @@ void MVNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   Dtype* top_data = top[0]->mutable_gpu_data();
   int num;
   if (this->layer_param_.mvn_param().across_channels())
-    num = bottom[0]->num();
+    num = bottom[0]->shape(0);
   else
-    num = bottom[0]->num() * bottom[0]->channels();
+    num = bottom[0]->shape(0) * bottom[0]->shape(1);
 
   int dim = bottom[0]->count() / num;
 
@@ -60,9 +60,9 @@ void MVNLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 
   int num;
   if (this->layer_param_.mvn_param().across_channels())
-    num = bottom[0]->num();
+    num = bottom[0]->shape(0);
   else
-    num = bottom[0]->num() * bottom[0]->channels();
+    num = bottom[0]->shape(0) * bottom[0]->shape(1);
 
   int dim = bottom[0]->count() / num;
 
diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp
index 43f07cbf8..90b8e67c2 100755
--- a/src/caffe/util/im2col.cpp
+++ b/src/caffe/util/im2col.cpp
@@ -203,9 +203,9 @@ inline void im2col_nd_core_cpu(const Dtype* data_input, const bool im2col,
     const int* kernel_shape, const int* pad, const int* stride,
     const int* dilation, Dtype* data_output) {
   if (!im2col) {
-    int im_size = im_shape[0];
+    size_t im_size = im_shape[0];
     for (int i = 0; i < num_spatial_axes; ++i) {
-      im_size *= im_shape[1 + i];
+      im_size *= (size_t)im_shape[1 + i];
     }
     caffe_set(im_size, Dtype(0), data_output);
   }
@@ -333,7 +333,7 @@ void col2im_cpu(const Dtype* data_col, const int channels,
   int width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1;
   long chunk_len = kernel_h * kernel_w;
 
-  caffe_set(height * width * channels, Dtype(0), data_im);
+  caffe_set((size_t)height * (size_t)width * (size_t)channels, Dtype(0), data_im);
 
   #ifdef _OPENMP
   #pragma omp parallel for if (channels > 1)
@@ -378,7 +378,7 @@ void col2im3d_cpu(const Dtype* data_col, const int channels,
   long height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
   long width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1;
   long depth_col = (depth + 2 * pad_d - dil_patch_d) / stride_d + 1;
-  long num_kernels = channels * height * width * depth;
+  long num_kernels = (size_t)channels * (size_t)height * (size_t)width * (size_t)depth;
   long chunk_len = kernel_h * kernel_w * kernel_d;
 
   caffe_set(num_kernels, Dtype(0), data_im);