From d626ff6ab396cf401eb2a75753bc12740b0967c6 Mon Sep 17 00:00:00 2001 From: fzou1 Date: Tue, 1 Aug 2017 15:32:50 +0800 Subject: [PATCH 01/38] Merge pull request #106 for fixing crash issue of classification/batch_classification --- examples/cpp_classification/batch_classification.cpp | 4 ++++ examples/cpp_classification/classification.cpp | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/examples/cpp_classification/batch_classification.cpp b/examples/cpp_classification/batch_classification.cpp index 374671baa..8295bf4e5 100644 --- a/examples/cpp_classification/batch_classification.cpp +++ b/examples/cpp_classification/batch_classification.cpp @@ -422,6 +422,10 @@ int main(int argc, char** argv) { cout<<"Use mean file: "< Date: Tue, 1 Aug 2017 16:31:12 +0800 Subject: [PATCH 02/38] Fix the python typo --- examples/pycaffe/tune_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pycaffe/tune_model.py b/examples/pycaffe/tune_model.py index 8305b081b..628adf9c0 100644 --- a/examples/pycaffe/tune_model.py +++ b/examples/pycaffe/tune_model.py @@ -23,7 +23,7 @@ def tuneModelDefinition(model_path, iteration): caffe_path = os.path.join(working_dir, "..", "..", "build", "tools", "caffe") if not os.path.exists(caffe_path): print "Caffe binary does not exist; please build Caffe binary first." - sys,exit(1) + sys.exit(1) base_model_name = os.path.basename(model_path) model_dir = os.path.dirname(model_path) From 96ee2c6990519107d5fc36a5e9d230d0e5fb488a Mon Sep 17 00:00:00 2001 From: "Haihao.Shen" Date: Tue, 1 Aug 2017 16:36:56 +0800 Subject: [PATCH 03/38] Upgrade MKLML version from 0425 to 0720 --- external/mkl/prepare_mkl.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/external/mkl/prepare_mkl.sh b/external/mkl/prepare_mkl.sh index b68bc7aec..09284dc41 100755 --- a/external/mkl/prepare_mkl.sh +++ b/external/mkl/prepare_mkl.sh @@ -74,10 +74,10 @@ echo $VERSION_LINE # Return Version Line # MKL DST=`dirname $0` OMP=0 -VERSION_MATCH=20170425 -ARCHIVE_BASENAME=mklml_lnx_2018.0.20170425.tgz +VERSION_MATCH=20170720 +ARCHIVE_BASENAME=mklml_lnx_2018.0.20170720.tgz MKL_CONTENT_DIR=`echo $ARCHIVE_BASENAME | rev | cut -d "." -f 2- | rev` -GITHUB_RELEASE_TAG=1.0.0 +GITHUB_RELEASE_TAG=1.0.2 MKLURL="https://github.com/intel/caffe/releases/download/$GITHUB_RELEASE_TAG/$ARCHIVE_BASENAME" # there are diffrent MKL lib to be used for GCC and for ICC From aaba8d24d851100db954ed12cb1655b8622fad56 Mon Sep 17 00:00:00 2001 From: Feng Tian Date: Tue, 1 Aug 2017 16:25:14 +0800 Subject: [PATCH 04/38] Fix the fusion bug for MKLDNN relu and conv layer. As MKLDNN currently doesn't support dilation convolution, we couldn't fuse them but use caffe engine to calculate. 
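In outline, the gate added to net.cpp looks like this (a sketch only; it mirrors the hunk below, with the parameter lookup abbreviated to conv_param):

    // Skip the conv+ReLU merge when any dilation exceeds 1: MKLDNN has no
    // dilated-convolution primitive yet, so such layers stay unfused and
    // fall back to the Caffe engine.
    bool dilation = false;
    for (int i = 0; i < conv_param.dilation_size(); ++i) {
      if (conv_param.dilation(i) > 1) {
        dilation = true;
        break;
      }
    }
    // ...the trailing ReLU is merged into this convolution only if !dilation.
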
Change-Id: Icfbac285ac98e4fcefd540791bcccf6692849904 --- src/caffe/layer_factory.cpp | 15 +++++++------ src/caffe/net.cpp | 43 ++++++++++++++++++++++++------------- 2 files changed, 36 insertions(+), 22 deletions(-) diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp index 2b52007cc..0a6f83a21 100644 --- a/src/caffe/layer_factory.cpp +++ b/src/caffe/layer_factory.cpp @@ -92,6 +92,7 @@ shared_ptr > GetConvolutionLayer( for (int i = 0; i < conv_param.dilation_size(); ++i) { if (conv_param.dilation(i) > 1) { use_dilation = true; + break; } } #endif @@ -589,10 +590,10 @@ shared_ptr > GetEltwiseLayer(const LayerParameter& param) { #if defined(MKL2017_SUPPORTED) else if (ep.isEngine("MKL2017")) engine = EltwiseParameter_Engine_MKL2017; -#endif -#if defined(MKLDNN_SUPPORTED) - else if (ep.isEngine("MKLDNN")) - engine = EltwiseParameter_Engine_MKLDNN; +#endif +#if defined(MKLDNN_SUPPORTED) + else if (ep.isEngine("MKLDNN")) + engine = EltwiseParameter_Engine_MKLDNN; #endif } @@ -605,9 +606,9 @@ shared_ptr > GetEltwiseLayer(const LayerParameter& param) { } else if (engine == EltwiseParameter_Engine_MKL2017) { return shared_ptr >(new MKLEltwiseLayer(param)); #endif -#ifdef MKLDNN_SUPPORTED - } else if (engine == EltwiseParameter_Engine_MKLDNN) { - return shared_ptr >(new MKLDNNEltwiseLayer(param)); +#ifdef MKLDNN_SUPPORTED + } else if (engine == EltwiseParameter_Engine_MKLDNN) { + return shared_ptr >(new MKLDNNEltwiseLayer(param)); #endif } else { LOG(FATAL) << "Layer " << param.name() << " has unknow engine."; diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 0a8aeb981..98395e82a 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -628,13 +628,24 @@ void Net::CompilationRuleTwo(const NetParameter& param, // Note: Currently merging of convolution and relu layers is feasible // If current layer is Convolution of MKLDNN engine.. if ((layer_param->type().compare("Convolution") == 0) && - ((layer_param->convolution_param().engine() == ConvolutionParameter_Engine_MKLDNN) - || (((layer_param->convolution_param().engine() == ConvolutionParameter_Engine_DEFAULT) && - (param.engine().compare(0, 6, "MKLDNN") == 0 - && param.engine().find(":DLA", 6) == string::npos)) || - (param.engine() == "" && - layer_param->engine().compare(0, 6, "MKLDNN") == 0 && - layer_param->engine().find(":DLA", 6) == string::npos)))) { + ((layer_param->convolution_param().engine() == ConvolutionParameter_Engine_MKLDNN) || + ((layer_param->convolution_param().engine() == ConvolutionParameter_Engine_DEFAULT) && + (layer_param->engine().compare(0, 6, "MKLDNN") == 0) && + (layer_param->engine().find(":DLA", 6) == string::npos)) || + ((layer_param->convolution_param().engine() == ConvolutionParameter_Engine_DEFAULT) && + (layer_param->engine() == "") && + (param.engine().compare(0, 6, "MKLDNN") == 0 && + param.engine().find(":DLA", 6) == string::npos)))) { + // check if Dialation is larger than 1. if yes, don't fuse the following Relu layer with this conv layer + // as MKLDNN doesn't support dilation convolution yet. + bool dilation = false; + for (int i = 0; i < layer_param->convolution_param().dilation_size(); ++i) { + if (layer_param->convolution_param().dilation(i) > 1) { + dilation = true; + break; + } + } + std::vector consumer_layer_params; GetBlobConsumers(consumer_layer_params, layer_param->top(0), param, i+1 < param.layer_size() ? 
i+1 : i); @@ -644,14 +655,16 @@ void Net::CompilationRuleTwo(const NetParameter& param, // Consumer layer of blob produced by Conv // has to be ReLU layer with one Input Blob - if ((consumer_layer_param.type().compare("ReLU") == 0) && - ((consumer_layer_param.relu_param().engine() == ReLUParameter_Engine_MKLDNN) - || (((consumer_layer_param.relu_param().engine() == ReLUParameter_Engine_DEFAULT) && - (param.engine().compare(0, 6, "MKLDNN") == 0 - && param.engine().find(":DLA", 6) == string::npos)) || - (param.engine() == "" && - layer_param->engine().compare(0, 6, "MKLDNN") == 0 && - layer_param->engine().find(":DLA", 6) == string::npos)))) { + if (!dilation && + (consumer_layer_param.type().compare("ReLU") == 0) && + ((consumer_layer_param.relu_param().engine() == ReLUParameter_Engine_MKLDNN) || + ((consumer_layer_param.relu_param().engine() == ReLUParameter_Engine_DEFAULT) && + (consumer_layer_param.engine().compare(0, 6, "MKLDNN") == 0 && + consumer_layer_param.engine().find(":DLA", 6) == string::npos)) || + ((consumer_layer_param.relu_param().engine() == ReLUParameter_Engine_DEFAULT) && + (consumer_layer_param.engine() == "") && + (param.engine().compare(0, 6, "MKLDNN") == 0 && + param.engine().find(":DLA", 6) == string::npos)))) { string& convolution_top_blob_name = const_cast(layer_param->top(0)); From 84fae0eb7b4052b2ab769d99cf46d43e661ae832 Mon Sep 17 00:00:00 2001 From: Feng Tian Date: Wed, 2 Aug 2017 09:22:44 +0800 Subject: [PATCH 05/38] fix crash due to null pointer dereference. This issue was found during enabling debug_info option in solver.prototxt with mkldnn engine. --- src/caffe/blob.cpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp index dd5546bde..dddb0f2db 100644 --- a/src/caffe/blob.cpp +++ b/src/caffe/blob.cpp @@ -369,7 +369,13 @@ Dtype Blob::asum_diff() const { switch (diff_->head()) { case SyncedMemory::SYNCED_PRV: case SyncedMemory::HEAD_AT_PRV: - return caffe_cpu_asum( prv_diff_count(), prv_diff()); + { + const Dtype* prv_ptr = prv_diff(); + if (prv_ptr == NULL) + return caffe_cpu_asum(count_, cpu_diff()); + else + return caffe_cpu_asum(prv_diff_count(), prv_diff()); + } case SyncedMemory::HEAD_AT_CPU: return caffe_cpu_asum(count_, cpu_diff()); case SyncedMemory::HEAD_AT_GPU: @@ -462,7 +468,11 @@ Dtype Blob::sumsq_diff() const { case SyncedMemory::SYNCED_PRV: case SyncedMemory::HEAD_AT_PRV: diff = prv_diff(); - sumsq = caffe_cpu_dot(prv_diff_count(), diff, diff); + if (diff == NULL) { + diff = cpu_diff(); + sumsq = caffe_cpu_dot(count_, diff, diff); + } else + sumsq = caffe_cpu_dot(prv_diff_count(), diff, diff); break; case SyncedMemory::HEAD_AT_CPU: diff = cpu_diff(); From 79e05ccd5c7225a892d1d30fea86caee38b97d7a Mon Sep 17 00:00:00 2001 From: "Zhang, Guoming" Date: Wed, 2 Aug 2017 20:11:55 +0800 Subject: [PATCH 06/38] Fix for the issue that Intel caffe couldn't converge on Resnet-50. --- src/caffe/net.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 98395e82a..10621c531 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -728,11 +728,12 @@ void Net::CompilationRuleThree(const NetParameter& param, // If current layer is BatchNorm of MKL2017 engine.. 
if (((layer_param->type().compare("BatchNorm") == 0) && ((layer_param->batch_norm_param().engine() == - BatchNormParameter_Engine_MKL2017) + BatchNormParameter_Engine_MKL2017 || layer_param->batch_norm_param().engine() == + BatchNormParameter_Engine_MKLDNN) || ((layer_param->batch_norm_param().engine() == BatchNormParameter_Engine_DEFAULT) && - param.engine().compare("MKL2017") == 0))) && - (layer_param->top(0) == layer_param->bottom(0) )) { + (param.engine().compare("MKL2017") == 0 || param.engine().compare("MKLDNN") == 0)))) && + (layer_param->top(0) == layer_param->bottom(0))) { std::string& batch_norm_top = const_cast(layer_param->top(0)); std::vector consumer_layer_params; GetBlobConsumers(consumer_layer_params, From f787234872adf1de7192663aa1ae0a872b5a5176 Mon Sep 17 00:00:00 2001 From: fzou1 Date: Fri, 4 Aug 2017 11:16:19 +0800 Subject: [PATCH 07/38] add random resizing param for test net Change-Id: I0f1e7e2b758e666f6eec8c8c71a1cd905de7b1e1 --- .../resnet_50_256_nodes_8k_batch/train_val.prototxt | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/models/intel_optimized_models/multinode/resnet_50_256_nodes_8k_batch/train_val.prototxt b/models/intel_optimized_models/multinode/resnet_50_256_nodes_8k_batch/train_val.prototxt index e5c7a9128..d98323ed6 100644 --- a/models/intel_optimized_models/multinode/resnet_50_256_nodes_8k_batch/train_val.prototxt +++ b/models/intel_optimized_models/multinode/resnet_50_256_nodes_8k_batch/train_val.prototxt @@ -14,8 +14,10 @@ layer { mean_value: 104 mean_value: 117 mean_value: 123 - random_resize_param { - min_size: 256 max_size: 480 + random_aspect_ratio_param { + min_area_ratio: 0.08 + max_area_ratio: 1 + aspect_ratio_change: 0.75 resize_param { interp_mode: CUBIC } @@ -44,6 +46,13 @@ layer { mean_value: 104 mean_value: 117 mean_value: 123 + random_resize_param { + min_size: 256 + max_size: 256 + resize_param { + interp_mode: CUBIC + } + } } data_param { source: "examples/imagenet/ilsvrc12_val_lmdb" From 70ef5231c261795faf8dc99abe60bc756068c89b Mon Sep 17 00:00:00 2001 From: fzou1 Date: Mon, 7 Aug 2017 14:11:14 +0800 Subject: [PATCH 08/38] Remove additional normalization overhead with iter_size 1 for multi-node and make it consistent with single node. 
loss is divided during setting up --- include/caffe/layer.hpp | 5 +++++ src/caffe/solvers/sgd_solver.cpp | 12 ------------ 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp index 45d65c799..9dc4d557b 100644 --- a/include/caffe/layer.hpp +++ b/include/caffe/layer.hpp @@ -521,7 +521,12 @@ class Layer { CHECK_EQ(top.size(), num_loss_weights) << "loss_weight must be " "unspecified or specified once per top blob."; for (int top_id = 0; top_id < top.size(); ++top_id) { +#ifdef USE_MLSL + const Dtype loss_weight = layer_param_.loss_weight(top_id) / + GetDistribution().get_data_parts(); +#else const Dtype loss_weight = layer_param_.loss_weight(top_id); +#endif if (loss_weight == Dtype(0)) { continue; } this->set_loss(top_id, loss_weight); const int count = top[top_id]->count(); diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index 264ac954f..df5fbcd26 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -226,23 +226,11 @@ void SGDSolver::ApplyUpdate(int param_id) { template void SGDSolver::Normalize(int param_id) { - -#ifdef USE_MLSL - if ((this->param_.iter_size() == 1) && !mn::is_multinode()) { - return; - } -#else /* !USE_MLSL */ if (this->param_.iter_size() == 1) { return; } -#endif /* USE_MLSL */ // Scale gradient to counterbalance accumulation. const vector*>& net_params = this->net_->learnable_params(); - -#ifdef USE_MLSL - const Dtype accum_normalization = Dtype(1.) / (this->param_.iter_size() * mn::get_nodes_count()); -#else /* !USE_MLSL */ const Dtype accum_normalization = Dtype(1.) / this->param_.iter_size(); -#endif /* USE_MLSL */ switch (Caffe::mode()) { case Caffe::CPU: { From cd36cd9fe2aa2c7983185af87a5dc6b7a5c45457 Mon Sep 17 00:00:00 2001 From: "Yu, Chong" Date: Mon, 7 Aug 2017 18:48:24 +0800 Subject: [PATCH 09/38] SGD update optimization by fusion. Need ICC build. --- Makefile | 6 + Makefile.config.example | 3 + include/caffe/sgd_solvers.hpp | 5 + src/caffe/solvers/sgd_solver.cpp | 386 ++++++++++++++++++++++++++++++- 4 files changed, 397 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index fd6e78bc8..46d259a37 100644 --- a/Makefile +++ b/Makefile @@ -547,6 +547,12 @@ LIBRARY_DIRS += $(LIB_BUILD_DIR) # Automatic dependency generation (nvcc is handled separately) CXXFLAGS += -MMD -MP +##########SGD FUSION####################### +ifeq ($(ENABLE_SGD_FUSION), 1) + COMMON_FLAGS += -DENABLE_SGD_FUSION +endif +########################################### +# # Complete build flags. COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) CXXFLAGS += -std=c++11 -pthread -fPIC $(COMMON_FLAGS) $(WARNINGS) diff --git a/Makefile.config.example b/Makefile.config.example index 8bfcc57a3..539a00a67 100644 --- a/Makefile.config.example +++ b/Makefile.config.example @@ -170,5 +170,8 @@ DISTRIBUTE_DIR := distribute # The ID of the GPU that 'make runtest' will use to run unit tests. 
TEST_GPUID := 0 +# Uncomment for enabling SGD fusion +# ENABLE_SGD_FUSION := 1 + # enable pretty build (comment to see full commands) Q ?= @ diff --git a/include/caffe/sgd_solvers.hpp b/include/caffe/sgd_solvers.hpp index a11da89de..9741ef212 100644 --- a/include/caffe/sgd_solvers.hpp +++ b/include/caffe/sgd_solvers.hpp @@ -81,6 +81,11 @@ class SGDSolver : public Solver { // of gradients/updates and is not needed in snapshots vector > > history_, update_, temp_; +#ifdef ENABLE_SGD_FUSION + //Fuse the Normalize, Regularize and ComputeUpdateValue process together + void Normalize_Regularize_ComputeUpdateValue_Fusion(int param_id, Dtype rate); +#endif /* ENABLE_SGD_FUSION */ + // loss history for 'plateau' LR policy (should be stored in snapshots) Dtype minimum_loss_; int iter_last_event_; diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index df5fbcd26..1480d3005 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -42,6 +42,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "caffe/util/hdf5.hpp" #include "caffe/util/io.hpp" #include "caffe/util/upgrade_proto.hpp" +#include + namespace caffe { template @@ -208,13 +210,38 @@ void SGDSolver::ApplyUpdate(int param_id) { return; } +#ifdef ENABLE_SGD_FUSION + switch (Caffe::mode()) { + case Caffe::CPU: { + //VLOG(1) << "Use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD"; + //LOG(INFO) << "Use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD"; + Normalize_Regularize_ComputeUpdateValue_Fusion(param_id, rate); + break; + } + case Caffe::GPU: { +#ifndef CPU_ONLY + //VLOG(1) << "Currently we do not support use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD in GPU mode."; + //LOG(INFO) << "Currently we do not support use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD in GPU mode."; +#else + NO_GPU; +#endif + break; + } + default: + LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); + } +#else /* !ENABLE_SGD_FUSION */ + //LOG(INFO) << "No Fusion: Param_id: " << param_id; Normalize(param_id); + LOG_PARAM_BLOB(this->net_->learnable_params()[param_id], diff, param_id, "ApplyUpdate: delwt after Normalize:"); Regularize(param_id); LOG_PARAM_BLOB(this->net_->learnable_params()[param_id], diff, param_id, "ApplyUpdate: delwt after Regularize:"); ComputeUpdateValue(param_id, rate); +#endif /* ENABLE_SGD_FUSION */ + LOG_PARAM_BLOB(this->net_->learnable_params()[param_id], diff, param_id, "ApplyUpdate: wtinc:"); LOG_PARAM_BLOB(this->net_->learnable_params()[param_id], data, param_id, "ApplyUpdate: weight before update:"); @@ -224,12 +251,359 @@ void SGDSolver::ApplyUpdate(int param_id) { LOG_PARAM_BLOB(this->net_->learnable_params()[param_id], data, param_id, "ApplyUpdate: weight after update:"); } +#ifdef ENABLE_SGD_FUSION +//Math function for fusion +template +void axpy_axpby_copy(size_t count, const Dtype decay, const Dtype* net_params_data, Dtype *net_params_diff, + const Dtype rate, const Dtype momentum, Dtype* history_data); + +template <> +void axpy_axpby_copy(size_t count, const float decay, const float* net_params_data, float *net_params_diff, + const float rate, const float momentum, float* history_data) +{ + float temp_result = 0.; +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (size_t i = 0; i < count; ++i) { + temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i]; + history_data[i] = temp_result; + net_params_diff[i] = temp_result; + } +} + +template <> +void 
axpy_axpby_copy(size_t count, const double decay, const double* net_params_data, double *net_params_diff, + const double rate, const double momentum, double* history_data) +{ + double temp_result = 0.; +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (size_t i = 0; i < count; ++i) { + temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i]; + history_data[i] = temp_result; + net_params_diff[i] = temp_result; + } +} + +template +void avx512_axpy_axpby_copy(size_t count, const Dtype decay, const Dtype* net_params_data, Dtype *net_params_diff, + const Dtype rate, const Dtype momentum, Dtype* history_data); + +template <> +void avx512_axpy_axpby_copy(size_t count, const float decay, const float* net_params_data, float *net_params_diff, + const float rate, const float momentum, float* history_data) +{ + // If count is smaller than 16 we use non-avx512 implementation + // 16 is the element number which one avx512 register can hold + if (count < 16) { + return axpy_axpby_copy(count, decay, net_params_data, net_params_diff, + rate, momentum, history_data); + } + + // If count can't be divided by 16, we handle tailing remainder + // with non-avx512 imeplementation + if (count % 16 != 0) { + size_t remainder = count % 16; + count -= remainder; + axpy_axpby_copy(remainder, decay, net_params_data+count, net_params_diff+count, + rate, momentum, history_data+count); + } + + size_t group_size = 16; +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (size_t idx = 0; idx < count; idx += group_size) { + const float *fnet_params_data = net_params_data + idx; + float *fnet_params_diff = net_params_diff + idx; + float *fhistory_data = history_data + idx; + __m512 operand1_v = _mm512_loadu_ps(fnet_params_data); + __m512 operand2_v = _mm512_loadu_ps(fnet_params_diff); + __m512 operand3_v = _mm512_loadu_ps(fhistory_data); + __m512 decay_operand_v = _mm512_set1_ps(decay); + __m512 rate_operand_v = _mm512_set1_ps(rate); + __m512 momentum_operand_v = _mm512_set1_ps(momentum); + __m512 decay_result = _mm512_mul_ps(decay_operand_v, operand1_v); + __m512 axpy_result = _mm512_add_ps(decay_result, operand2_v); + __m512 rate_result = _mm512_mul_ps(rate_operand_v, axpy_result); + __m512 momentum_result = _mm512_mul_ps(momentum_operand_v, operand3_v); + __m512 axpby_result = _mm512_add_ps(rate_result, momentum_result); + _mm512_storeu_ps(fhistory_data, axpby_result); + _mm512_storeu_ps(fnet_params_diff, axpby_result); + } +} + +template <> +void avx512_axpy_axpby_copy(size_t count, const double decay, const double* net_params_data, double* net_params_diff, + const double rate, const double momentum, double* history_data) +{ + // If count is smaller than 8 we use non-avx512 implementation + // 8 is the element number which one avx512 register can hold + if (count < 8) { + return axpy_axpby_copy(count, decay, net_params_data, net_params_diff, + rate, momentum, history_data); + } + + // If count can't be divided by 8, we handle tailing remainder + // with non-avx512 imeplementation + if (count % 8 != 0) { + size_t remainder = count % 8; + count -= remainder; + axpy_axpby_copy(remainder, decay, net_params_data+count, net_params_diff+count, + rate, momentum, history_data+count); + } + + size_t group_size = 8; +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (size_t idx = 0; idx < count; idx += group_size) { + const double *fnet_params_data = net_params_data + idx; + double *fnet_params_diff = net_params_diff + idx; + double *fhistory_data = history_data + idx; + 
__m512 operand1_v = _mm512_loadu_pd(fnet_params_data); + __m512 operand2_v = _mm512_loadu_pd(fnet_params_diff); + __m512 operand3_v = _mm512_loadu_pd(fhistory_data); + __m512 decay_operand_v = _mm512_set1_pd(decay); + __m512 rate_operand_v = _mm512_set1_pd(rate); + __m512 momentum_operand_v = _mm512_set1_pd(momentum); + __m512 decay_result = _mm512_mul_pd(decay_operand_v, operand1_v); + __m512 axpy_result = _mm512_add_pd(decay_result, operand2_v); + __m512 rate_result = _mm512_mul_pd(rate_operand_v, axpy_result); + __m512 momentum_result = _mm512_mul_pd(momentum_operand_v, operand3_v); + __m512 axpby_result = _mm512_add_pd(rate_result, momentum_result); + _mm512_storeu_pd(fhistory_data, axpby_result); + _mm512_storeu_pd(fnet_params_diff, axpby_result); + } +} + + +template +void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_id, Dtype rate) { +//LOG(INFO) << "Fusion: Param_id: " << param_id; + +//#pragma region 1. Common initialization + //Normalize initialization + bool skip_Normalize_stage_flag = false; + if (this->param_.iter_size() == 1) { skip_Normalize_stage_flag = true; } + + // Scale gradient to counterbalance accumulation. + const vector*>& net_params = this->net_->learnable_params(); + + //Regularize initialization + const vector& net_params_weight_decay = + this->net_->params_weight_decay(); + Dtype weight_decay = this->param_.weight_decay(); + string regularization_type = this->param_.regularization_type(); + Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; + + //ComputeUpdateValue initialization + const vector& net_params_lr = this->net_->params_lr(); + Dtype momentum = this->param_.momentum(); + Dtype local_rate = rate * net_params_lr[param_id]; +//#pragma endregion + +//#pragma region 2. Common condition judgement + bool prv_diff_condition_flag = false; + if (net_params[param_id]->prv_diff() + && (net_params[param_id]->prv_diff_count() + == net_params[param_id]->count())) { + prv_diff_condition_flag = true; + //LOG(INFO) << "Common condition judgement: prv_diff_condition_flag = true."; + } + else + { + //LOG(INFO) << "Common condition judgement: prv_diff_condition_flag = false."; + } +//#pragma endregion + +//#pragma region 3. Normalize stage + if (skip_Normalize_stage_flag == false) + { + //LOG(INFO) << "Normalize stage: Normalize stage is not skipped."; + + const Dtype accum_normalization = Dtype(1.) 
/ this->param_.iter_size(); + + if (prv_diff_condition_flag) { + //LOG(INFO) << "Normalize stage: prv_diff_condition_flag = true."; + caffe_scal(net_params[param_id]->count(), accum_normalization, + net_params[param_id]->mutable_prv_diff()); + } + else { + //LOG(INFO) << "Normalize stage: prv_diff_condition_flag = false."; + caffe_scal(net_params[param_id]->count(), accum_normalization, + net_params[param_id]->mutable_cpu_diff()); + } + } + else + { + //LOG(INFO) << "Normalize stage: Normalize stage is skipped."; + } +//#pragma endregion + +//For POR topologies from BVLC, all skipped the Normalize stage, and use L2 regularization +//If prv_diff_condition_flag == true, then prv_data_condition_flag == true (1) +//If prv_diff_condition_flag == false, then prv_data_condition_flag == false (2) +//Another case is local_decay == 0, prv_diff_condition_flag == false (3) +//So only need to consider the fusion in situations (1) and (2), set execute_separate_ComputeUpdateValue_stage_flag to false value + bool execute_separate_ComputeUpdateValue_stage_flag = true; + //Regularize stage (Fused ComputeUpdateValue_stage in some situations) + if (local_decay) { + if (regularization_type == "L2") { + //LOG(INFO) << "Regularize stage: regularization_type == L2."; + // add weight decay + if (net_params[param_id]->prv_data() + && (net_params[param_id]->prv_data_count() + == net_params[param_id]->count())) { + //LOG(INFO) << "Regularize stage: prv_data_condition_flag = true."; + CHECK_EQ(true, + net_params[param_id]->get_prv_data_descriptor()->layout_compare( + net_params[param_id]->get_prv_diff_descriptor())); + /* + caffe_axpy(net_params[param_id]->count(), + local_decay, + net_params[param_id]->prv_data(), + net_params[param_id]->mutable_prv_diff()); + */ + if (prv_diff_condition_flag) { + //situation (1) + //LOG(INFO) << "Fused ComputeUpdateValue stage: prv_diff_condition_flag = true."; + /* + caffe_cpu_axpby(net_params[param_id]->count(), local_rate, + net_params[param_id]->prv_diff(), momentum, + history_[param_id]->mutable_cpu_data()); + + caffe_copy(net_params[param_id]->count(), + history_[param_id]->cpu_data(), + net_params[param_id]->mutable_prv_diff()); + */ + + avx512_axpy_axpby_copy(net_params[param_id]->count(), local_decay, + net_params[param_id]->prv_data(), net_params[param_id]->mutable_prv_diff(), + local_rate, momentum, history_[param_id]->mutable_cpu_data()); + + execute_separate_ComputeUpdateValue_stage_flag = false; + } + else + { + //Will not happen! 
+ //LOG(INFO) << "Cannot Fused ComputeUpdateValue stage: prv_diff_condition_flag = false."; + caffe_cpu_axpby(net_params[param_id]->count(), local_rate, + net_params[param_id]->cpu_diff(), momentum, + history_[param_id]->mutable_cpu_data()); + + caffe_copy(net_params[param_id]->count(), + history_[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); + + execute_separate_ComputeUpdateValue_stage_flag = false; + //You can set the flag to true value, and not execute caffe_cpu_axpby and caffe_copy + //But set to false value and execute caffe_cpu_axpby and caffe_copy inside will save one condition judgement time + } + } else { + //LOG(INFO) << "Regularize stage: prv_data_condition_flag = false."; + /* + caffe_axpy(net_params[param_id]->count(), + local_decay, + net_params[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); + */ + if (!prv_diff_condition_flag) + { + //situation (2) + //LOG(INFO) << "Fused ComputeUpdateValue stage: prv_diff_condition_flag = false."; + /* + caffe_cpu_axpby(net_params[param_id]->count(), local_rate, + net_params[param_id]->cpu_diff(), momentum, + history_[param_id]->mutable_cpu_data()); + + caffe_copy(net_params[param_id]->count(), + history_[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); + */ + + avx512_axpy_axpby_copy(net_params[param_id]->count(), local_decay, + net_params[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff(), + local_rate, momentum, history_[param_id]->mutable_cpu_data()); + + execute_separate_ComputeUpdateValue_stage_flag = false; + } + else + { + //Will not happen! + //LOG(INFO) << "Cannot Fused ComputeUpdateValue stage: prv_diff_condition_flag = true."; + caffe_cpu_axpby(net_params[param_id]->count(), local_rate, + net_params[param_id]->prv_diff(), momentum, + history_[param_id]->mutable_cpu_data()); + + caffe_copy(net_params[param_id]->count(), + history_[param_id]->cpu_data(), + net_params[param_id]->mutable_prv_diff()); + + execute_separate_ComputeUpdateValue_stage_flag = false; + //You can set the flag to true value, and not execute caffe_cpu_axpby and caffe_copy + //But set to false value and execute caffe_cpu_axpby and caffe_copy inside will save one condition judgement time + } + } + } else if (regularization_type == "L1") { + //LOG(INFO) << "Regularize stage: regularization_type == L1."; + caffe_cpu_sign(net_params[param_id]->count(), + net_params[param_id]->cpu_data(), + temp_[param_id]->mutable_cpu_data()); + caffe_axpy(net_params[param_id]->count(), + local_decay, + temp_[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); + } else { + LOG(FATAL) << "Unknown regularization type: " << regularization_type; + } + } + + //ComputeUpdateValue stage (separate) + if (execute_separate_ComputeUpdateValue_stage_flag == true) + { + //Include the situation: regularization_type == "L1"/"Unknown" + //Include situations (3): local_decay == 0 + //No Regularize stage, only ComputeUpdateValue stage + //ComputeUpdateValue stage + if (prv_diff_condition_flag) { + //LOG(INFO) << "ComputeUpdateValue stage: prv_diff_condition_flag = true."; + caffe_cpu_axpby(net_params[param_id]->count(), local_rate, + net_params[param_id]->prv_diff(), momentum, + history_[param_id]->mutable_cpu_data()); + + caffe_copy(net_params[param_id]->count(), + history_[param_id]->cpu_data(), + net_params[param_id]->mutable_prv_diff()); + } else { + //LOG(INFO) << "ComputeUpdateValue stage: prv_diff_condition_flag = false."; + caffe_cpu_axpby(net_params[param_id]->count(), local_rate, + 
net_params[param_id]->cpu_diff(), momentum, + history_[param_id]->mutable_cpu_data()); + + caffe_copy(net_params[param_id]->count(), + history_[param_id]->cpu_data(), + net_params[param_id]->mutable_cpu_diff()); + } + } + +} +#endif /* ENABLE_SGD_FUSION */ + template void SGDSolver::Normalize(int param_id) { - if (this->param_.iter_size() == 1) { return; } + if (this->param_.iter_size() == 1) { + //LOG(INFO) << "Normalize stage: Normalize stage is skipped."; + return; + } + + //LOG(INFO) << "Normalize stage: Normalize stage is not skipped."; // Scale gradient to counterbalance accumulation. const vector*>& net_params = this->net_->learnable_params(); + const Dtype accum_normalization = Dtype(1.) / this->param_.iter_size(); switch (Caffe::mode()) { @@ -238,11 +612,12 @@ void SGDSolver::Normalize(int param_id) { if (net_params[param_id]->prv_diff() && (net_params[param_id]->prv_diff_count() == net_params[param_id]->count())) { - + //LOG(INFO) << "Normalize stage: prv_diff_condition_flag = true."; caffe_scal(net_params[param_id]->count(), accum_normalization, net_params[param_id]->mutable_prv_diff()); } else { + //LOG(INFO) << "Normalize stage: prv_diff_condition_flag = false."; caffe_scal(net_params[param_id]->count(), accum_normalization, net_params[param_id]->mutable_cpu_diff()); } @@ -275,10 +650,12 @@ void SGDSolver::Regularize(int param_id) { case Caffe::CPU: { if (local_decay) { if (regularization_type == "L2") { + //LOG(INFO) << "Regularize stage: regularization_type == L2."; // add weight decay if (net_params[param_id]->prv_data() && (net_params[param_id]->prv_data_count() == net_params[param_id]->count())) { + //LOG(INFO) << "Regularize stage: prv_data_condition_flag = true."; CHECK_EQ(true, net_params[param_id]->get_prv_data_descriptor()->layout_compare( net_params[param_id]->get_prv_diff_descriptor())); @@ -288,12 +665,14 @@ void SGDSolver::Regularize(int param_id) { net_params[param_id]->prv_data(), net_params[param_id]->mutable_prv_diff()); } else { + //LOG(INFO) << "Regularize stage: prv_data_condition_flag = false."; caffe_axpy(net_params[param_id]->count(), local_decay, net_params[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff()); } } else if (regularization_type == "L1") { + //LOG(INFO) << "Regularize stage: regularization_type == L1."; caffe_cpu_sign(net_params[param_id]->count(), net_params[param_id]->cpu_data(), temp_[param_id]->mutable_cpu_data()); @@ -364,7 +743,7 @@ void SGDSolver::ComputeUpdateValue(int param_id, Dtype rate) { if (net_params[param_id]->prv_diff() && (net_params[param_id]->prv_diff_count() == net_params[param_id]->count())) { - + //LOG(INFO) << "ComputeUpdateValue stage: prv_diff_condition_flag = true."; caffe_cpu_axpby(net_params[param_id]->count(), local_rate, net_params[param_id]->prv_diff(), momentum, history_[param_id]->mutable_cpu_data()); @@ -373,6 +752,7 @@ void SGDSolver::ComputeUpdateValue(int param_id, Dtype rate) { history_[param_id]->cpu_data(), net_params[param_id]->mutable_prv_diff()); } else { + //LOG(INFO) << "ComputeUpdateValue stage: prv_diff_condition_flag = false."; caffe_cpu_axpby(net_params[param_id]->count(), local_rate, net_params[param_id]->cpu_diff(), momentum, history_[param_id]->mutable_cpu_data()); From a1d000d85378d8afee4204098c05f06b409e9c42 Mon Sep 17 00:00:00 2001 From: "Yu, Chong" Date: Tue, 8 Aug 2017 23:54:34 +0800 Subject: [PATCH 10/38] 1. Use Macro for header file include. 2. Support L1 Regulization fusion optimization. 
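In outline, the L1 path now reuses the fused helper introduced in PATCH 09 (a sketch with abbreviated names: w, dw and h stand for the parameter data, diff and history blobs):

    // sign(w) is materialized into temp_, then a single fused pass computes
    //   h = momentum * h + lr * (decay * sign(w) + dw);  dw = h
    caffe_cpu_sign(count, w, temp);
    avx512_axpy_axpby_copy(count, local_decay, temp, dw, local_rate,
                           momentum, h);

so the separate ComputeUpdateValue pass is skipped for L1 as well.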
--- src/caffe/solvers/sgd_solver.cpp | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index 1480d3005..f8709412b 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -42,8 +42,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "caffe/util/hdf5.hpp" #include "caffe/util/io.hpp" #include "caffe/util/upgrade_proto.hpp" -#include +#ifdef ENABLE_SGD_FUSION +#include +#endif /* ENABLE_SGD_FUSION */ namespace caffe { template @@ -443,11 +445,12 @@ void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_ } //#pragma endregion -//For POR topologies from BVLC, all skipped the Normalize stage, and use L2 regularization +//For most common topologies from BVLC, all skipped the Normalize stage, and use L2 regularization //If prv_diff_condition_flag == true, then prv_data_condition_flag == true (1) //If prv_diff_condition_flag == false, then prv_data_condition_flag == false (2) //Another case is local_decay == 0, prv_diff_condition_flag == false (3) //So only need to consider the fusion in situations (1) and (2), set execute_separate_ComputeUpdateValue_stage_flag to false value +//We can extend the fusion in L1 regularization bool execute_separate_ComputeUpdateValue_stage_flag = true; //Regularize stage (Fused ComputeUpdateValue_stage in some situations) if (local_decay) { @@ -552,10 +555,19 @@ void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_ caffe_cpu_sign(net_params[param_id]->count(), net_params[param_id]->cpu_data(), temp_[param_id]->mutable_cpu_data()); + + /* caffe_axpy(net_params[param_id]->count(), local_decay, temp_[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff()); + */ + + avx512_axpy_axpby_copy(net_params[param_id]->count(), local_decay, + temp_[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff(), + local_rate, momentum, history_[param_id]->mutable_cpu_data()); + + execute_separate_ComputeUpdateValue_stage_flag = false; } else { LOG(FATAL) << "Unknown regularization type: " << regularization_type; } @@ -564,7 +576,7 @@ void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_ //ComputeUpdateValue stage (separate) if (execute_separate_ComputeUpdateValue_stage_flag == true) { - //Include the situation: regularization_type == "L1"/"Unknown" + //Include the situation: regularization_type == "Unknown" //Include situations (3): local_decay == 0 //No Regularize stage, only ComputeUpdateValue stage //ComputeUpdateValue stage From 29c8bd382822a68588935c5ffe00bd023c792f35 Mon Sep 17 00:00:00 2001 From: "Yu, Chong" Date: Wed, 9 Aug 2017 09:36:36 +0800 Subject: [PATCH 11/38] Check whether the machine support the avx512 command before using the SGD fusion. 
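In outline, ApplyUpdate now gates the fused path as follows (a sketch of the hunk below; _may_i_use_cpu_feature and the _FEATURE_* masks are ICC intrinsics, which is why this check requires an ICC build):

    const unsigned long avx512_features = (_FEATURE_AVX512F | _FEATURE_AVX512CD);
    if (_may_i_use_cpu_feature(avx512_features)) {
      // AVX512 is available: run the fused update and return early.
      Normalize_Regularize_ComputeUpdateValue_Fusion(param_id, rate);
      this->net_->learnable_params()[param_id]->Update();
      return;
    }
    // Otherwise fall through to the separate Normalize / Regularize /
    // ComputeUpdateValue steps.
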
Change-Id: I768bd16c5aadd5a17a78e7d4b72fbd0e05685994 --- src/caffe/solvers/sgd_solver.cpp | 40 ++++++++++++++++---------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index f8709412b..378fecf9b 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -213,36 +213,36 @@ void SGDSolver::ApplyUpdate(int param_id) { } #ifdef ENABLE_SGD_FUSION - switch (Caffe::mode()) { - case Caffe::CPU: { - //VLOG(1) << "Use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD"; - //LOG(INFO) << "Use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD"; - Normalize_Regularize_ComputeUpdateValue_Fusion(param_id, rate); - break; - } - case Caffe::GPU: { -#ifndef CPU_ONLY - //VLOG(1) << "Currently we do not support use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD in GPU mode."; - //LOG(INFO) << "Currently we do not support use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD in GPU mode."; -#else - NO_GPU; -#endif - break; - } - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); + if (Caffe::mode() == Caffe::CPU) + { + const unsigned long avx512_features = (_FEATURE_AVX512F | _FEATURE_AVX512CD); + bool avx512_enabled_ = _may_i_use_cpu_feature(avx512_features); + if (avx512_enabled_) + { + //LOG(INFO) << "Avx512 command is supported!"; + //VLOG(1) << "Use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD"; + //LOG(INFO) << "Use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD"; + Normalize_Regularize_ComputeUpdateValue_Fusion(param_id, rate); + this->net_->learnable_params()[param_id]->Update(); + return; + } + else + { + //LOG(INFO) << "Avx512 command is not supported, so cannot use the SGD fusion!"; + } } -#else /* !ENABLE_SGD_FUSION */ +#endif /* ENABLE_SGD_FUSION */ + //LOG(INFO) << "No Fusion: Param_id: " << param_id; Normalize(param_id); LOG_PARAM_BLOB(this->net_->learnable_params()[param_id], diff, param_id, "ApplyUpdate: delwt after Normalize:"); Regularize(param_id); + LOG_PARAM_BLOB(this->net_->learnable_params()[param_id], diff, param_id, "ApplyUpdate: delwt after Regularize:"); ComputeUpdateValue(param_id, rate); -#endif /* ENABLE_SGD_FUSION */ LOG_PARAM_BLOB(this->net_->learnable_params()[param_id], diff, param_id, "ApplyUpdate: wtinc:"); From 98dc3fd24e4acdeb4ef9a9fdb2911c7bd9bacb45 Mon Sep 17 00:00:00 2001 From: xiaolil1 Date: Wed, 9 Aug 2017 11:47:09 +0800 Subject: [PATCH 12/38] Fix mkldnn split layer for accuracy issue --- src/caffe/layers/mkldnn_split_layer.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/caffe/layers/mkldnn_split_layer.cpp b/src/caffe/layers/mkldnn_split_layer.cpp index ab2c5156a..12359c141 100644 --- a/src/caffe/layers/mkldnn_split_layer.cpp +++ b/src/caffe/layers/mkldnn_split_layer.cpp @@ -94,10 +94,15 @@ void MKLDNNSplitLayer::InitSplitBwd(const vector*>& bottom, // Dimensions of bottom and top blobs. 
There is a number of
// top blobs each of the same size as the bottom one
- memory::dims bottom_tz = {static_cast<int>(this->sizes_src_[0]),
- static_cast<int>(this->sizes_src_[1]),
- static_cast<int>(this->sizes_src_[2]),
- static_cast<int>(this->sizes_src_[3])};
+ memory::dims bottom_tz;
+ bottom_tz.resize(4);
+ for(int i=0; i<4; i++) {
+ if(i < this->sizes_src_.size()) {
+ bottom_tz[i] = static_cast<int>(this->sizes_src_[i]);
+ } else {
+ bottom_tz[i] = 1;
+ }
+ }
shared_ptr<memory::primitive_desc> prv_diff_dst_mpd;
shared_ptr<memory::primitive_desc> usr_diff_dst_mpd(
From 872ac8a5c9252701435b2c396ac6e48acd5747bd Mon Sep 17 00:00:00 2001
From: Feng Tian
Date: Wed, 9 Aug 2017 14:04:44 +0800
Subject: [PATCH 13/38] add 64-bit blob size support
---
include/caffe/blob.hpp | 6 +++---
include/caffe/layer.hpp | 8 ++++----
python/caffe/_caffe.cpp | 2 +-
src/caffe/blob.cpp | 2 +-
4 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp
index 3295f7ab1..47d0d751c 100644
--- a/include/caffe/blob.hpp
+++ b/include/caffe/blob.hpp
@@ -109,7 +109,7 @@ class Blob {
return shape_[CanonicalAxisIndex(index)];
}
inline int num_axes() const { return shape_.size(); }
- inline int count() const { return count_; }
+ inline long count() const { return count_; }
/**
* @brief Compute the volume of a slice; i.e., the product of dimensions
@@ -332,8 +332,8 @@ class Blob {
shared_ptr<SyncedMemory> shape_data_;
#endif
vector<int> shape_;
- int count_;
- int capacity_;
+ long count_;
+ long capacity_;
DISABLE_COPY_AND_ASSIGN(Blob);
}; // class Blob
diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp
index 9dc4d557b..5a95a7730 100644
--- a/include/caffe/layer.hpp
+++ b/include/caffe/layer.hpp
@@ -55,8 +55,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define LOG_BLOB(layer, blob, part, blob_id, description) \
do \
{ \
- int elems_to_log = std::min(MAX_ELEMS_TO_LOG, blob->count()); \
- for (int idx = 0; idx < elems_to_log; idx++) \
+ long elems_to_log = std::min(static_cast<long>(MAX_ELEMS_TO_LOG), blob->count()); \
+ for (long idx = 0; idx < elems_to_log; idx++) \
{ \
LOG_LAYER(layer) << description \
<< ", blob_id " << blob_id \
#define LOG_PARAM_BLOB(blob, part, blob_id, description) \
do \
{ \
- int elems_to_log = std::min(MAX_ELEMS_TO_LOG, blob->count()); \
- for (int idx = 0; idx < elems_to_log; idx++) \
+ long elems_to_log = std::min(static_cast<long>(MAX_ELEMS_TO_LOG), blob->count()); \
+ for (long idx = 0; idx < elems_to_log; idx++) \
{ \
DLOG(INFO) << description \
<< ", blob_id " << blob_id \
diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp
index b9dc23e24..3b02f509b 100644
--- a/python/caffe/_caffe.cpp
+++ b/python/caffe/_caffe.cpp
@@ -411,7 +411,7 @@ BOOST_PYTHON_MODULE(_caffe) {
.add_property("channels", &Blob<Dtype>::channels)
.add_property("height", &Blob<Dtype>::height)
.add_property("width", &Blob<Dtype>::width)
- .add_property("count", static_cast<int (Blob<Dtype>::*)() const>(
+ .add_property("count", static_cast<long (Blob<Dtype>::*)() const>(
&Blob<Dtype>::count))
.def("reshape", bp::raw_function(&Blob_Reshape))
.add_property("data", bp::make_function(&Blob<Dtype>::mutable_cpu_data,
diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp
index dddb0f2db..48ae68dc7 100644
--- a/src/caffe/blob.cpp
+++ b/src/caffe/blob.cpp
@@ -72,7 +72,7 @@ void Blob<Dtype>::Reshape(const vector<int>& shape) {
for (int i = 0; i < shape.size(); ++i) {
CHECK_GE(shape[i], 0);
if (count_ != 0) {
- CHECK_LE(shape[i], INT_MAX / count_) << "blob size exceeds INT_MAX";
+ CHECK_LE(shape[i], LONG_MAX / count_) << "blob size exceeds LONG_MAX";
}
count_ *= shape[i];
if (shape_[i] != shape[i]) {
From 6200f6ba848254f661f4eea41e46c8faccb1b9e0 Mon Sep 17 00:00:00 2001
From: fzou1
Date: Wed, 9 Aug 2017 23:43:19 +0800
Subject: [PATCH 14/38] fix convergence issue of forward overlapping optimization by moving ClearParamDiffs after WaitGradientComm; and enable it by default
Change-Id: I4dac71a49720cd72b6df2eb14047ad5ad1fd1098
---
Makefile | 2 +-
cmake/Dependencies.cmake | 2 +-
include/caffe/multinode/multi_sync.hpp | 4 ----
src/caffe/multinode/multi_solver.cpp | 18 +++++++++++++-----
4 files changed, 15 insertions(+), 11 deletions(-)
diff --git a/Makefile b/Makefile
index 46d259a37..f7144b7db 100644
--- a/Makefile
+++ b/Makefile
@@ -80,7 +80,7 @@ ifeq ($(CAFFE_MLSL_SHUFFLE), 1)
COMMON_FLAGS += -DCAFFE_MLSL_SHUFFLE
endif
-ifeq ($(FW_OVERLAP_OPT), 1)
+ifneq ($(FW_OVERLAP_OPT), 0)
COMMON_FLAGS += -DFW_OVERLAP_OPT
endif
endif
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 67adf4ba7..b8c5577c6 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -122,7 +122,7 @@ if(USE_MLSL)
if(CAFFE_MLSL_SHUFFLE)
add_definitions("-DCAFFE_MLSL_SHUFFLE")
endif()
- if(FW_OVERLAP_OPT)
+ if(FW_OVERLAP_OPT OR NOT DEFINED FW_OVERLAP_OPT)
message(STATUS "Forward overlapping optimization is enabled!")
add_definitions("-DFW_OVERLAP_OPT")
endif()
diff --git a/include/caffe/multinode/multi_sync.hpp b/include/caffe/multinode/multi_sync.hpp
index 6300c4876..905d9fce7 100644
--- a/include/caffe/multinode/multi_sync.hpp
+++ b/include/caffe/multinode/multi_sync.hpp
@@ -215,10 +215,6 @@ namespace caffe {
}
void on_iter_finished(int layer_id) {
-#ifdef FW_OVERLAP_OPT
- solver->set_layer_finished_flag(layer_id, false);
-#endif
-
boost::shared_ptr<Layer<Dtype>> &layer = layers[layer_id];
if (layer->layerOp == nullptr) {
return;
diff --git a/src/caffe/multinode/multi_solver.cpp b/src/caffe/multinode/multi_solver.cpp
index 13ad8da2b..54e256631 100644
--- a/src/caffe/multinode/multi_solver.cpp
+++ b/src/caffe/multinode/multi_solver.cpp
@@ -105,12 +105,13 @@ Dtype MultiSolver<Dtype>::ForwardBackwardImpl(bool first, bool last) {
for (int i = 0; i < layers.size(); ++i) {
#ifdef FW_OVERLAP_OPT
- if (first && IsSkipWaitGradient(i) ==
false) { + if (first && (IsSkipWaitGradient(i) == false)) { while (layer_finished_flags_[i] == false) { WaitAndUpdateGradient(i); if (layer_finished_flags_[i]) break; + // wait and update gradient for next layers for (int k=i+1; k::ForwardBackwardImpl(bool first, bool last) { break; } } + layer_finished_flags_[i] = false; } #endif @@ -129,6 +131,11 @@ Dtype MultiSolver::ForwardBackwardImpl(bool first, bool last) { LAYER_TIMING_STOP(forward, i); } + // Clear parameter diffs after communication is finished (that is, after + // calling WaitGradientComm) + if (first) + root_solver_->net()->ClearParamDiffs(); + for (int i = layers.size() - 1; i >= 0; --i) { if (!layer_need_backward[i]) { continue; @@ -160,6 +167,11 @@ Dtype MultiSolver::ForwardBackwardImpl(bool first, bool last) { if (last) { #endif for (int i = 0; i < layers.size(); ++i) { +#ifdef FW_OVERLAP_OPT + if (layer_finished_flags_[i]) + continue; +#endif + if (IsSkipWaitGradient(i)) { #ifdef FW_OVERLAP_OPT finished_count++; @@ -167,10 +179,6 @@ Dtype MultiSolver::ForwardBackwardImpl(bool first, bool last) { #endif continue; } -#ifdef FW_OVERLAP_OPT - if (layer_finished_flags_[i]) - continue; -#endif WaitAndUpdateGradient(i); #ifdef FW_OVERLAP_OPT From 5522134cca276eba46ddee6cf194d7182e39d679 Mon Sep 17 00:00:00 2001 From: "Yu, Chong" Date: Thu, 10 Aug 2017 14:17:35 +0800 Subject: [PATCH 15/38] Take out the dependency of ICC. --- src/caffe/solvers/sgd_solver.cpp | 107 ++----------------------------- 1 file changed, 7 insertions(+), 100 deletions(-) diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index 378fecf9b..e32307dda 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -43,9 +43,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "caffe/util/io.hpp" #include "caffe/util/upgrade_proto.hpp" -#ifdef ENABLE_SGD_FUSION -#include -#endif /* ENABLE_SGD_FUSION */ namespace caffe { template @@ -265,7 +262,8 @@ void axpy_axpby_copy(size_t count, const float decay, const float* net_pa { float temp_result = 0.; #ifdef _OPENMP -#pragma omp parallel for +//#pragma omp parallel for +#pragma omp parallel for simd schedule(static) #endif for (size_t i = 0; i < count; ++i) { temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i]; @@ -280,7 +278,8 @@ void axpy_axpby_copy(size_t count, const double decay, const double* net { double temp_result = 0.; #ifdef _OPENMP -#pragma omp parallel for +//#pragma omp parallel for +#pragma omp parallel for simd schedule(static) #endif for (size_t i = 0; i < count; ++i) { temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i]; @@ -289,98 +288,6 @@ void axpy_axpby_copy(size_t count, const double decay, const double* net } } -template -void avx512_axpy_axpby_copy(size_t count, const Dtype decay, const Dtype* net_params_data, Dtype *net_params_diff, - const Dtype rate, const Dtype momentum, Dtype* history_data); - -template <> -void avx512_axpy_axpby_copy(size_t count, const float decay, const float* net_params_data, float *net_params_diff, - const float rate, const float momentum, float* history_data) -{ - // If count is smaller than 16 we use non-avx512 implementation - // 16 is the element number which one avx512 register can hold - if (count < 16) { - return axpy_axpby_copy(count, decay, net_params_data, net_params_diff, - rate, momentum, history_data); - } - - // If count can't be divided by 16, we handle tailing remainder - // with non-avx512 imeplementation - if (count % 16 != 0) { - size_t remainder = count % 16; - count -= remainder; - axpy_axpby_copy(remainder, decay, net_params_data+count, net_params_diff+count, - rate, momentum, history_data+count); - } - - size_t group_size = 16; -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (size_t idx = 0; idx < count; idx += group_size) { - const float *fnet_params_data = net_params_data + idx; - float *fnet_params_diff = net_params_diff + idx; - float *fhistory_data = history_data + idx; - __m512 operand1_v = _mm512_loadu_ps(fnet_params_data); - __m512 operand2_v = _mm512_loadu_ps(fnet_params_diff); - __m512 operand3_v = _mm512_loadu_ps(fhistory_data); - __m512 decay_operand_v = _mm512_set1_ps(decay); - __m512 rate_operand_v = _mm512_set1_ps(rate); - __m512 momentum_operand_v = _mm512_set1_ps(momentum); - __m512 decay_result = _mm512_mul_ps(decay_operand_v, operand1_v); - __m512 axpy_result = _mm512_add_ps(decay_result, operand2_v); - __m512 rate_result = _mm512_mul_ps(rate_operand_v, axpy_result); - __m512 momentum_result = _mm512_mul_ps(momentum_operand_v, operand3_v); - __m512 axpby_result = _mm512_add_ps(rate_result, momentum_result); - _mm512_storeu_ps(fhistory_data, axpby_result); - _mm512_storeu_ps(fnet_params_diff, axpby_result); - } -} - -template <> -void avx512_axpy_axpby_copy(size_t count, const double decay, const double* net_params_data, double* net_params_diff, - const double rate, const double momentum, double* history_data) -{ - // If count is smaller than 8 we use non-avx512 implementation - // 8 is the element number which one avx512 register can hold - if (count < 8) { - return axpy_axpby_copy(count, decay, net_params_data, net_params_diff, - rate, momentum, history_data); - } - - // If count can't be divided by 8, we handle 
tailing remainder - // with non-avx512 imeplementation - if (count % 8 != 0) { - size_t remainder = count % 8; - count -= remainder; - axpy_axpby_copy(remainder, decay, net_params_data+count, net_params_diff+count, - rate, momentum, history_data+count); - } - - size_t group_size = 8; -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (size_t idx = 0; idx < count; idx += group_size) { - const double *fnet_params_data = net_params_data + idx; - double *fnet_params_diff = net_params_diff + idx; - double *fhistory_data = history_data + idx; - __m512 operand1_v = _mm512_loadu_pd(fnet_params_data); - __m512 operand2_v = _mm512_loadu_pd(fnet_params_diff); - __m512 operand3_v = _mm512_loadu_pd(fhistory_data); - __m512 decay_operand_v = _mm512_set1_pd(decay); - __m512 rate_operand_v = _mm512_set1_pd(rate); - __m512 momentum_operand_v = _mm512_set1_pd(momentum); - __m512 decay_result = _mm512_mul_pd(decay_operand_v, operand1_v); - __m512 axpy_result = _mm512_add_pd(decay_result, operand2_v); - __m512 rate_result = _mm512_mul_pd(rate_operand_v, axpy_result); - __m512 momentum_result = _mm512_mul_pd(momentum_operand_v, operand3_v); - __m512 axpby_result = _mm512_add_pd(rate_result, momentum_result); - _mm512_storeu_pd(fhistory_data, axpby_result); - _mm512_storeu_pd(fnet_params_diff, axpby_result); - } -} - template void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_id, Dtype rate) { @@ -483,7 +390,7 @@ void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_ net_params[param_id]->mutable_prv_diff()); */ - avx512_axpy_axpby_copy(net_params[param_id]->count(), local_decay, + axpy_axpby_copy(net_params[param_id]->count(), local_decay, net_params[param_id]->prv_data(), net_params[param_id]->mutable_prv_diff(), local_rate, momentum, history_[param_id]->mutable_cpu_data()); @@ -527,7 +434,7 @@ void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_ net_params[param_id]->mutable_cpu_diff()); */ - avx512_axpy_axpby_copy(net_params[param_id]->count(), local_decay, + axpy_axpby_copy(net_params[param_id]->count(), local_decay, net_params[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff(), local_rate, momentum, history_[param_id]->mutable_cpu_data()); @@ -563,7 +470,7 @@ void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_ net_params[param_id]->mutable_cpu_diff()); */ - avx512_axpy_axpby_copy(net_params[param_id]->count(), local_decay, + axpy_axpby_copy(net_params[param_id]->count(), local_decay, temp_[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff(), local_rate, momentum, history_[param_id]->mutable_cpu_data()); From 1fb45f844c229655e8ed2554c54467661a4a24fd Mon Sep 17 00:00:00 2001 From: fzou1 Date: Thu, 10 Aug 2017 15:42:23 +0800 Subject: [PATCH 16/38] fix regression by removing duplicated ClearParamDiffs call; and correct loss for displaying by multiplying NO of nodes for data parallelism --- src/caffe/multinode/multi_solver.cpp | 1 - src/caffe/solver.cpp | 11 ++++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/caffe/multinode/multi_solver.cpp b/src/caffe/multinode/multi_solver.cpp index 54e256631..d21fb5580 100644 --- a/src/caffe/multinode/multi_solver.cpp +++ b/src/caffe/multinode/multi_solver.cpp @@ -198,7 +198,6 @@ Dtype MultiSolver::ForwardBackwardImpl(bool first, bool last) { template Dtype MultiSolver::ForwardBackward() { Dtype loss = 0; - root_solver_->net()->ClearParamDiffs(); for (int i = 0; i < iter_size; ++i) { loss += ForwardBackwardImpl( (i == 0), (i + 1 == 
iter_size)); diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 3c8d1e66b..f7e7ac1cd 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -321,7 +321,12 @@ void Solver::Step(int iters) { const string& output_name = net_->blob_names()[net_->output_blob_indices()[j]]; const Dtype loss_weight = - net_->blob_loss_weights()[net_->output_blob_indices()[j]]; + net_->blob_loss_weights()[net_->output_blob_indices()[j]] +#ifdef USE_MLSL + * mn::get_distrib()->get_data_parts() +#endif + ; + for (int k = 0; k < result[j]->count(); ++k) { ostringstream loss_msg_stream; if (loss_weight) { @@ -928,6 +933,10 @@ void Solver::Restore(const char* state_file) { template void Solver::UpdateSmoothedLoss(Dtype loss, int start_iter, int average_loss) { +#ifdef USE_MLSL + loss *= mn::get_distrib()->get_data_parts(); +#endif + if (losses_.size() < average_loss) { losses_.push_back(loss); int size = losses_.size(); From 63c3bb5c84c812230ab869e08f4efa337a67a3b7 Mon Sep 17 00:00:00 2001 From: "Yu, Chong" Date: Thu, 10 Aug 2017 15:47:59 +0800 Subject: [PATCH 17/38] Fix the error to build with gcc 4.8. --- src/caffe/solvers/sgd_solver.cpp | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index e32307dda..8b1a70e87 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -212,21 +212,11 @@ void SGDSolver::ApplyUpdate(int param_id) { #ifdef ENABLE_SGD_FUSION if (Caffe::mode() == Caffe::CPU) { - const unsigned long avx512_features = (_FEATURE_AVX512F | _FEATURE_AVX512CD); - bool avx512_enabled_ = _may_i_use_cpu_feature(avx512_features); - if (avx512_enabled_) - { - //LOG(INFO) << "Avx512 command is supported!"; - //VLOG(1) << "Use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD"; - //LOG(INFO) << "Use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD"; - Normalize_Regularize_ComputeUpdateValue_Fusion(param_id, rate); - this->net_->learnable_params()[param_id]->Update(); - return; - } - else - { - //LOG(INFO) << "Avx512 command is not supported, so cannot use the SGD fusion!"; - } + //VLOG(1) << "Use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD"; + //LOG(INFO) << "Use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD"; + Normalize_Regularize_ComputeUpdateValue_Fusion(param_id, rate); + this->net_->learnable_params()[param_id]->Update(); + return; } #endif /* ENABLE_SGD_FUSION */ @@ -262,8 +252,9 @@ void axpy_axpby_copy(size_t count, const float decay, const float* net_pa { float temp_result = 0.; #ifdef _OPENMP -//#pragma omp parallel for -#pragma omp parallel for simd schedule(static) +//#pragma omp parallel for simd schedule(static) //Not work for GCC 4.8 +#pragma omp parallel for schedule(static) +#pragma simd #endif for (size_t i = 0; i < count; ++i) { temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i]; @@ -278,8 +269,9 @@ void axpy_axpby_copy(size_t count, const double decay, const double* net { double temp_result = 0.; #ifdef _OPENMP -//#pragma omp parallel for -#pragma omp parallel for simd schedule(static) +//#pragma omp parallel for simd schedule(static) //Not work for GCC 4.8 +#pragma omp parallel for schedule(static) +#pragma simd #endif for (size_t i = 0; i < count; ++i) { temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i]; From 02e0829418e23863dd9b06ce694859dbaf43290e Mon Sep 17 00:00:00 2001 From: fzou1 Date: Fri, 11 
Aug 2017 10:28:43 +0800
Subject: [PATCH 18/38] fix hang issue during testing by moving the
 layer-finished flag check after IsSkipWaitGradient

---
 src/caffe/multinode/multi_solver.cpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/caffe/multinode/multi_solver.cpp b/src/caffe/multinode/multi_solver.cpp
index d21fb5580..ad5a7066b 100644
--- a/src/caffe/multinode/multi_solver.cpp
+++ b/src/caffe/multinode/multi_solver.cpp
@@ -167,11 +167,6 @@ Dtype MultiSolver<Dtype>::ForwardBackwardImpl(bool first, bool last) {
   if (last) {
 #endif
   for (int i = 0; i < layers.size(); ++i) {
-#ifdef FW_OVERLAP_OPT
-    if (layer_finished_flags_[i])
-      continue;
-#endif
-
     if (IsSkipWaitGradient(i)) {
 #ifdef FW_OVERLAP_OPT
       finished_count++;
@@ -179,6 +174,10 @@
 #endif
       continue;
     }
+#ifdef FW_OVERLAP_OPT
+    if (layer_finished_flags_[i])
+      continue;
+#endif
 
     WaitAndUpdateGradient(i);
 #ifdef FW_OVERLAP_OPT

From 668599e62a877f5382bcb6c068e0fc9943891c5c Mon Sep 17 00:00:00 2001
From: linxinan
Date: Mon, 14 Aug 2017 12:33:10 +0800
Subject: [PATCH 19/38] Add 5 warm-up iterations in caffe time, because the
 first several iteration times have huge variance on some machines.

---
 tools/caffe.cpp | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tools/caffe.cpp b/tools/caffe.cpp
index 231209127..5d0ea7f49 100644
--- a/tools/caffe.cpp
+++ b/tools/caffe.cpp
@@ -547,6 +547,22 @@ int time() {
   const vector<vector<Blob<float>*> >& top_vecs = caffe_net.top_vecs();
   const vector<vector<bool> >& bottom_need_backward =
       caffe_net.bottom_need_backward();
+
+  // Warm up 5 iterations here, because the first several iteration times
+  // have huge variance in some machines.
+  int warmup_iterations = 5;
+  for (int j = 0; j < warmup_iterations; ++j) {
+    for (int i = 0; i < layers.size(); ++i) {
+      layers[i]->Forward(bottom_vecs[i], top_vecs[i]);
+    }
+    if (!FLAGS_forward_only) {
+      for (int i = layers.size() - 1; i >= 0; --i) {
+        layers[i]->Backward(top_vecs[i], bottom_need_backward[i],
+                            bottom_vecs[i]);
+      }
+    }
+  }
+
   LOG(INFO) << "*** Benchmark begins ***";
   LOG(INFO) << "Testing for " << FLAGS_iterations << " iterations.";
   Timer total_timer;

From 5fb759e3e7134bcb2d9efcb2b01d9bfa59f64cb1 Mon Sep 17 00:00:00 2001
From: fzou1
Date: Mon, 14 Aug 2017 16:38:11 +0800
Subject: [PATCH 20/38] fix accuracy issue

---
 src/caffe/multinode/multi_solver.cpp | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/caffe/multinode/multi_solver.cpp b/src/caffe/multinode/multi_solver.cpp
index ad5a7066b..59eec8c7c 100644
--- a/src/caffe/multinode/multi_solver.cpp
+++ b/src/caffe/multinode/multi_solver.cpp
@@ -105,8 +105,10 @@ Dtype MultiSolver<Dtype>::ForwardBackwardImpl(bool first, bool last) {
   for (int i = 0; i < layers.size(); ++i) {
 #ifdef FW_OVERLAP_OPT
-    if (first && (IsSkipWaitGradient(i) == false)) {
+    if (first) {
       while (layer_finished_flags_[i] == false) {
+        if (IsSkipWaitGradient(i))
+          break;
         WaitAndUpdateGradient(i);
         if (layer_finished_flags_[i])
           break;
@@ -167,6 +169,10 @@
   if (last) {
 #endif
   for (int i = 0; i < layers.size(); ++i) {
+#ifdef FW_OVERLAP_OPT
+    if (layer_finished_flags_[i])
+      continue;
+#endif
     if (IsSkipWaitGradient(i)) {
 #ifdef FW_OVERLAP_OPT
       finished_count++;
@@ -174,10 +180,6 @@
 #endif
       continue;
     }
-#ifdef FW_OVERLAP_OPT
-    if (layer_finished_flags_[i])
-      continue;
-#endif
 
     WaitAndUpdateGradient(i);
 #ifdef FW_OVERLAP_OPT

From
1f9468a0b0f1c72ab45c8d02ce51194a74c98871 Mon Sep 17 00:00:00 2001 From: "Yu, Chong" Date: Mon, 14 Aug 2017 23:47:30 +0800 Subject: [PATCH 21/38] Fuse the Update stage together in SGD update process. --- include/caffe/sgd_solvers.hpp | 4 +- src/caffe/solvers/sgd_solver.cpp | 135 ++++++++++++++++++++----------- 2 files changed, 88 insertions(+), 51 deletions(-) diff --git a/include/caffe/sgd_solvers.hpp b/include/caffe/sgd_solvers.hpp index 9741ef212..09f6ff26e 100644 --- a/include/caffe/sgd_solvers.hpp +++ b/include/caffe/sgd_solvers.hpp @@ -82,8 +82,8 @@ class SGDSolver : public Solver { vector > > history_, update_, temp_; #ifdef ENABLE_SGD_FUSION - //Fuse the Normalize, Regularize and ComputeUpdateValue process together - void Normalize_Regularize_ComputeUpdateValue_Fusion(int param_id, Dtype rate); + //Fuse the Normalize, Regularize, ComputeUpdateValue and Update process together + void SGDFusion(int param_id, Dtype rate); #endif /* ENABLE_SGD_FUSION */ // loss history for 'plateau' LR policy (should be stored in snapshots) diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index 8b1a70e87..fafe8a418 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -212,10 +212,9 @@ void SGDSolver::ApplyUpdate(int param_id) { #ifdef ENABLE_SGD_FUSION if (Caffe::mode() == Caffe::CPU) { - //VLOG(1) << "Use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD"; - //LOG(INFO) << "Use Normalize_Regularize_ComputeUpdateValue_Fusion for SGD"; - Normalize_Regularize_ComputeUpdateValue_Fusion(param_id, rate); - this->net_->learnable_params()[param_id]->Update(); + //VLOG(1) << "Use Normalize_Regularize_ComputeUpdateValue_Update_Fusion for SGD"; + //LOG(INFO) << "Use Normalize_Regularize_ComputeUpdateValue_Update_Fusion for SGD"; + SGDFusion(param_id, rate); return; } #endif /* ENABLE_SGD_FUSION */ @@ -242,6 +241,8 @@ void SGDSolver::ApplyUpdate(int param_id) { #ifdef ENABLE_SGD_FUSION //Math function for fusion +//Function 1: axpy_axpby_copy +//Start: For L1 Regularize_ComputeUpdateValue_Fusion template void axpy_axpby_copy(size_t count, const Dtype decay, const Dtype* net_params_data, Dtype *net_params_diff, const Dtype rate, const Dtype momentum, Dtype* history_data); @@ -253,13 +254,13 @@ void axpy_axpby_copy(size_t count, const float decay, const float* net_pa float temp_result = 0.; #ifdef _OPENMP //#pragma omp parallel for simd schedule(static) //Not work for GCC 4.8 -#pragma omp parallel for schedule(static) +#pragma omp parallel for schedule(static) #pragma simd #endif for (size_t i = 0; i < count; ++i) { temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i]; - history_data[i] = temp_result; - net_params_diff[i] = temp_result; + history_data[i] = temp_result; + net_params_diff[i] = temp_result; } } @@ -270,19 +271,62 @@ void axpy_axpby_copy(size_t count, const double decay, const double* net double temp_result = 0.; #ifdef _OPENMP //#pragma omp parallel for simd schedule(static) //Not work for GCC 4.8 -#pragma omp parallel for schedule(static) +#pragma omp parallel for schedule(static) +#pragma simd +#endif + for (size_t i = 0; i < count; ++i) { + temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i]; + history_data[i] = temp_result; + net_params_diff[i] = temp_result; + } +} +//End: For L1 Regularize_ComputeUpdateValue_Fusion + +//Function 2: axpy_axpby_copy_axpy +//Start: For L2 Regularize_ComputeUpdateValue_Update_Fusion +template +void 
axpy_axpby_copy_axpy(size_t count, const Dtype decay, Dtype* net_params_data, Dtype *net_params_diff, + const Dtype rate, const Dtype momentum, Dtype* history_data, const Dtype update_param); + +template <> +void axpy_axpby_copy_axpy(size_t count, const float decay, float* net_params_data, float *net_params_diff, + const float rate, const float momentum, float* history_data, const float update_param) +{ + float temp_result = 0.; +#ifdef _OPENMP +//#pragma omp parallel for simd schedule(static) //Not work for GCC 4.8 +#pragma omp parallel for schedule(static) #pragma simd #endif for (size_t i = 0; i < count; ++i) { temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i]; history_data[i] = temp_result; - net_params_diff[i] = temp_result; + net_params_diff[i] = temp_result; + net_params_data[i] = update_param * temp_result + net_params_data[i]; } } +template <> +void axpy_axpby_copy_axpy(size_t count, const double decay, double* net_params_data, double *net_params_diff, + const double rate, const double momentum, double* history_data, const double update_param) +{ + double temp_result = 0.; +#ifdef _OPENMP +//#pragma omp parallel for simd schedule(static) //Not work for GCC 4.8 +#pragma omp parallel for schedule(static) +#pragma simd +#endif + for (size_t i = 0; i < count; ++i) { + temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i]; + net_params_diff[i] = temp_result; + net_params_data[i] = update_param * temp_result + net_params_data[i]; + } +} +//End: For L2 Regularize_ComputeUpdateValue_Update_Fusion + template -void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_id, Dtype rate) { +void SGDSolver::SGDFusion(int param_id, Dtype rate) { //LOG(INFO) << "Fusion: Param_id: " << param_id; //#pragma region 1. 
Common initialization @@ -310,7 +354,7 @@ void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_ bool prv_diff_condition_flag = false; if (net_params[param_id]->prv_diff() && (net_params[param_id]->prv_diff_count() - == net_params[param_id]->count())) { + == net_params[param_id]->prv_data_count())) { prv_diff_condition_flag = true; //LOG(INFO) << "Common condition judgement: prv_diff_condition_flag = true."; } @@ -329,7 +373,7 @@ void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_ if (prv_diff_condition_flag) { //LOG(INFO) << "Normalize stage: prv_diff_condition_flag = true."; - caffe_scal(net_params[param_id]->count(), accum_normalization, + caffe_scal(net_params[param_id]->prv_data_count(), accum_normalization, net_params[param_id]->mutable_prv_diff()); } else { @@ -349,8 +393,10 @@ void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_ //If prv_diff_condition_flag == false, then prv_data_condition_flag == false (2) //Another case is local_decay == 0, prv_diff_condition_flag == false (3) //So only need to consider the fusion in situations (1) and (2), set execute_separate_ComputeUpdateValue_stage_flag to false value -//We can extend the fusion in L1 regularization - bool execute_separate_ComputeUpdateValue_stage_flag = true; +//We can extend the fusion in L1 regularization by axpy_axpby_copy +//We extend the fusion of Update stage in L2 regularization by axpy_axpby_copy_axpy, +//then need to change execute_separate_ComputeUpdateValue_stage_flag to execute_separate_ComputeUpdateValue_Update_stage_flag + bool execute_separate_ComputeUpdateValue_Update_stage_flag = true; //Regularize stage (Fused ComputeUpdateValue_stage in some situations) if (local_decay) { if (regularization_type == "L2") { @@ -381,28 +427,20 @@ void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_ history_[param_id]->cpu_data(), net_params[param_id]->mutable_prv_diff()); */ + + if(net_params[param_id]->prv_data_count() != history_[param_id]->count()) + history_[param_id]->Reshape(net_params[param_id]->shape()); - axpy_axpby_copy(net_params[param_id]->count(), local_decay, - net_params[param_id]->prv_data(), net_params[param_id]->mutable_prv_diff(), - local_rate, momentum, history_[param_id]->mutable_cpu_data()); + axpy_axpby_copy_axpy(net_params[param_id]->prv_data_count(), local_decay, + net_params[param_id]->mutable_prv_data(), net_params[param_id]->mutable_prv_diff(), + local_rate, momentum, history_[param_id]->mutable_cpu_data(), Dtype(-1)); - execute_separate_ComputeUpdateValue_stage_flag = false; + execute_separate_ComputeUpdateValue_Update_stage_flag = false; } else { //Will not happen! 
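
// A minimal scalar sketch of what the fused prv branch above computes per
// element: Regularize, ComputeUpdateValue and Blob::Update() collapse into a
// single sweep over memory, and passing update_param = Dtype(-1) turns the
// trailing axpy into the weight update itself. The helper name is
// hypothetical, not part of this patch series:
static void sgd_fused_step_sketch(size_t n, float decay, float rate,
                                  float momentum, float* w, float* dw,
                                  float* hist) {
  for (size_t i = 0; i < n; ++i) {
    float t = rate * (decay * w[i] + dw[i]) + momentum * hist[i];
    hist[i] = t;   // history keeps the momentum state for the next iteration
    dw[i] = t;     // the diff now holds the final update value
    w[i] -= t;     // update_param == -1 folds in the Update stage
  }
}
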
//LOG(INFO) << "Cannot Fused ComputeUpdateValue stage: prv_diff_condition_flag = false."; - caffe_cpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->cpu_diff(), momentum, - history_[param_id]->mutable_cpu_data()); - - caffe_copy(net_params[param_id]->count(), - history_[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); - - execute_separate_ComputeUpdateValue_stage_flag = false; - //You can set the flag to true value, and not execute caffe_cpu_axpby and caffe_copy - //But set to false value and execute caffe_cpu_axpby and caffe_copy inside will save one condition judgement time } } else { //LOG(INFO) << "Regularize stage: prv_data_condition_flag = false."; @@ -426,27 +464,18 @@ void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_ net_params[param_id]->mutable_cpu_diff()); */ - axpy_axpby_copy(net_params[param_id]->count(), local_decay, - net_params[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff(), - local_rate, momentum, history_[param_id]->mutable_cpu_data()); + axpy_axpby_copy_axpy(net_params[param_id]->count(), local_decay, + net_params[param_id]->mutable_cpu_data(), net_params[param_id]->mutable_cpu_diff(), + local_rate, momentum, history_[param_id]->mutable_cpu_data(), Dtype(-1)); - execute_separate_ComputeUpdateValue_stage_flag = false; + execute_separate_ComputeUpdateValue_Update_stage_flag = false; } else { //Will not happen! //LOG(INFO) << "Cannot Fused ComputeUpdateValue stage: prv_diff_condition_flag = true."; - caffe_cpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->prv_diff(), momentum, - history_[param_id]->mutable_cpu_data()); - - caffe_copy(net_params[param_id]->count(), - history_[param_id]->cpu_data(), - net_params[param_id]->mutable_prv_diff()); - - execute_separate_ComputeUpdateValue_stage_flag = false; - //You can set the flag to true value, and not execute caffe_cpu_axpby and caffe_copy - //But set to false value and execute caffe_cpu_axpby and caffe_copy inside will save one condition judgement time + if(net_params[param_id]->prv_data_count() != history_[param_id]->count()) + history_[param_id]->Reshape(net_params[param_id]->shape()); } } } else if (regularization_type == "L1") { @@ -466,14 +495,17 @@ void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_ temp_[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff(), local_rate, momentum, history_[param_id]->mutable_cpu_data()); - execute_separate_ComputeUpdateValue_stage_flag = false; + execute_separate_ComputeUpdateValue_Update_stage_flag = false; + + //Update stage (separate) + net_params[param_id]->Update(); } else { LOG(FATAL) << "Unknown regularization type: " << regularization_type; } } - //ComputeUpdateValue stage (separate) - if (execute_separate_ComputeUpdateValue_stage_flag == true) + //ComputeUpdateValue_Update stage (separate) + if (execute_separate_ComputeUpdateValue_Update_stage_flag == true) { //Include the situation: regularization_type == "Unknown" //Include situations (3): local_decay == 0 @@ -481,11 +513,13 @@ void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_ //ComputeUpdateValue stage if (prv_diff_condition_flag) { //LOG(INFO) << "ComputeUpdateValue stage: prv_diff_condition_flag = true."; - caffe_cpu_axpby(net_params[param_id]->count(), local_rate, + if(net_params[param_id]->prv_data_count() != history_[param_id]->count()) + history_[param_id]->Reshape(net_params[param_id]->shape()); + caffe_cpu_axpby(net_params[param_id]->prv_data_count(), 
local_rate, net_params[param_id]->prv_diff(), momentum, history_[param_id]->mutable_cpu_data()); - caffe_copy(net_params[param_id]->count(), + caffe_copy(net_params[param_id]->prv_data_count(), history_[param_id]->cpu_data(), net_params[param_id]->mutable_prv_diff()); } else { @@ -498,6 +532,9 @@ void SGDSolver::Normalize_Regularize_ComputeUpdateValue_Fusion(int param_ history_[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff()); } + + //Update stage (separate) + net_params[param_id]->Update(); } } From ad50db99b6c654db39b84b62121f3136beb87a3d Mon Sep 17 00:00:00 2001 From: "Yu, Chong" Date: Tue, 15 Aug 2017 10:29:27 +0800 Subject: [PATCH 22/38] Simplify the flag name. --- src/caffe/solvers/sgd_solver.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index fafe8a418..929ff050f 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -396,7 +396,8 @@ void SGDSolver::SGDFusion(int param_id, Dtype rate) { //We can extend the fusion in L1 regularization by axpy_axpby_copy //We extend the fusion of Update stage in L2 regularization by axpy_axpby_copy_axpy, //then need to change execute_separate_ComputeUpdateValue_stage_flag to execute_separate_ComputeUpdateValue_Update_stage_flag - bool execute_separate_ComputeUpdateValue_Update_stage_flag = true; +//Simplify the execute_separate_ComputeUpdateValue_Update_stage_flag to is_separate_ComputeUpdateValue_Update + bool is_separate_ComputeUpdateValue_Update = true; //Regularize stage (Fused ComputeUpdateValue_stage in some situations) if (local_decay) { if (regularization_type == "L2") { @@ -435,7 +436,7 @@ void SGDSolver::SGDFusion(int param_id, Dtype rate) { net_params[param_id]->mutable_prv_data(), net_params[param_id]->mutable_prv_diff(), local_rate, momentum, history_[param_id]->mutable_cpu_data(), Dtype(-1)); - execute_separate_ComputeUpdateValue_Update_stage_flag = false; + is_separate_ComputeUpdateValue_Update = false; } else { @@ -468,7 +469,7 @@ void SGDSolver::SGDFusion(int param_id, Dtype rate) { net_params[param_id]->mutable_cpu_data(), net_params[param_id]->mutable_cpu_diff(), local_rate, momentum, history_[param_id]->mutable_cpu_data(), Dtype(-1)); - execute_separate_ComputeUpdateValue_Update_stage_flag = false; + is_separate_ComputeUpdateValue_Update = false; } else { @@ -495,7 +496,7 @@ void SGDSolver::SGDFusion(int param_id, Dtype rate) { temp_[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff(), local_rate, momentum, history_[param_id]->mutable_cpu_data()); - execute_separate_ComputeUpdateValue_Update_stage_flag = false; + is_separate_ComputeUpdateValue_Update = false; //Update stage (separate) net_params[param_id]->Update(); @@ -505,7 +506,7 @@ void SGDSolver::SGDFusion(int param_id, Dtype rate) { } //ComputeUpdateValue_Update stage (separate) - if (execute_separate_ComputeUpdateValue_Update_stage_flag == true) + if (is_separate_ComputeUpdateValue_Update == true) { //Include the situation: regularization_type == "Unknown" //Include situations (3): local_decay == 0 From 149b4a9be9f2305cae080a03443a9bba067b11fc Mon Sep 17 00:00:00 2001 From: "Gong, Jiong" Date: Fri, 18 Aug 2017 00:57:46 +0800 Subject: [PATCH 23/38] Enable bn stats batch size in caffe engine --- include/caffe/layers/batch_norm_layer.hpp | 8 ++ .../caffe/util/apply_bn_stats_batch_size.hpp | 45 +++++++++ src/caffe/layers/batch_norm_layer.cpp | 93 ++++++++++++------- src/caffe/net.cpp | 7 ++ 
src/caffe/proto/caffe.proto | 5 + src/caffe/util/apply_bn_stats_batch_size.cpp | 57 ++++++++++++ 6 files changed, 184 insertions(+), 31 deletions(-) create mode 100644 include/caffe/util/apply_bn_stats_batch_size.hpp create mode 100644 src/caffe/util/apply_bn_stats_batch_size.cpp diff --git a/include/caffe/layers/batch_norm_layer.hpp b/include/caffe/layers/batch_norm_layer.hpp index e83bab953..c777de30c 100644 --- a/include/caffe/layers/batch_norm_layer.hpp +++ b/include/caffe/layers/batch_norm_layer.hpp @@ -117,11 +117,19 @@ class BatchNormLayer : public Layer { const Dtype* data_to_be_replicated, FuncTy op_func); + void ForwardStatsBatch_cpu(const vector*>& bottom, + const vector*>& top, int stats_batch_idx); + void BackwardStatsBatch_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom, + int stats_batch_idx); + Blob mean_, variance_, temp_, x_norm_; bool use_global_stats_; Dtype moving_average_fraction_; int channels_; Dtype eps_; + int num_stats_batches_; + int stats_batch_size_; // extra temporarary variables is used to carry out sums/broadcasting // using BLAS diff --git a/include/caffe/util/apply_bn_stats_batch_size.hpp b/include/caffe/util/apply_bn_stats_batch_size.hpp new file mode 100644 index 000000000..872b2c5bf --- /dev/null +++ b/include/caffe/util/apply_bn_stats_batch_size.hpp @@ -0,0 +1,45 @@ +/* +All modification made by Intel Corporation: © 2017 Intel Corporation + +All contributions by the University of California: +Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +All rights reserved. + +All other contributions: +Copyright (c) 2014, 2015, the respective contributors +All rights reserved. +For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ +#ifndef APPLY_BN_STATS_BATCH_SIZE_HPP_ +#define APPLY_BN_STATS_BATCH_SIZE_HPP_ +#include "caffe/proto/caffe.pb.h" + +namespace caffe { +void ApplyBnStatsBatchSize(const NetParameter& param, + NetParameter* param_with_stats_batch_size); +} +#endif // APPLY_BN_STATS_BATCH_SIZE_HPP_ diff --git a/src/caffe/layers/batch_norm_layer.cpp b/src/caffe/layers/batch_norm_layer.cpp index b7746d988..dada5873d 100644 --- a/src/caffe/layers/batch_norm_layer.cpp +++ b/src/caffe/layers/batch_norm_layer.cpp @@ -81,13 +81,22 @@ void BatchNormLayer::Reshape(const vector*>& bottom, CHECK_EQ(bottom[0]->shape(1), channels_); top[0]->ReshapeLike(*bottom[0]); + num_stats_batches_ = 1; + stats_batch_size_ = bottom[0]->shape(0); + BatchNormParameter param = this->layer_param_.batch_norm_param(); + if (!use_global_stats_ && param.stats_batch_size() > 0) { + CHECK_EQ(bottom[0]->shape(0) % param.stats_batch_size(), 0); + num_stats_batches_ = bottom[0]->shape(0) / param.stats_batch_size(); + stats_batch_size_ = param.stats_batch_size(); + } + vector sz; sz.push_back(channels_); mean_.Reshape(sz); variance_.Reshape(sz); temp_.ReshapeLike(*bottom[0]); x_norm_.ReshapeLike(*bottom[0]); - sz[0]=bottom[0]->shape(0); + sz[0]=stats_batch_size_; batch_sum_multiplier_.Reshape(sz); int spatial_dim = bottom[0]->count(2); @@ -99,7 +108,7 @@ void BatchNormLayer::Reshape(const vector*>& bottom, caffe_set(spatial_sum_multiplier_.count(), Dtype(1), multiplier_data); } - int numbychans = channels_*bottom[0]->shape(0); + int numbychans = channels_*stats_batch_size_; if (num_by_chans_.num_axes() == 0 || num_by_chans_.shape(0) != numbychans) { sz[0] = numbychans; @@ -149,18 +158,20 @@ void BatchNormLayer::replicate_to_op(Dtype* buffer_to_write, } } - - template -void BatchNormLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); - Dtype* top_data = top[0]->mutable_cpu_data(); - int num = bottom[0]->shape(0); +void BatchNormLayer::ForwardStatsBatch_cpu(const vector*>& bottom, + const vector*>& top, int stats_batch_idx) { + long data_stats_count = stats_batch_size_ * bottom[0]->count(1); + long data_offset = stats_batch_idx * data_stats_count; + const Dtype* bottom_data = bottom[0]->cpu_data() + data_offset; + Dtype* top_data = top[0]->mutable_cpu_data() + data_offset; + Dtype* temp_data = temp_.mutable_cpu_data() + data_offset; + Dtype* x_norm_data = x_norm_.mutable_cpu_data() + data_offset; + int num = stats_batch_size_; int spatial_dim = bottom[0]->count()/(bottom[0]->shape(0)*channels_); if (bottom[0] != top[0]) { - caffe_copy(bottom[0]->count(), bottom_data, top_data); + caffe_copy(data_stats_count, bottom_data, top_data); } if (use_global_stats_) { @@ -192,10 +203,10 @@ void BatchNormLayer::Forward_cpu(const vector*>& bottom, if (!use_global_stats_) { // compute variance using var(X) = E((X-EX)^2) - caffe_powx(top[0]->count(), top_data, Dtype(2), - temp_.mutable_cpu_data()); // (X-EX)^2 + caffe_powx(data_stats_count, top_data, Dtype(2), + temp_data); // (X-EX)^2 caffe_cpu_gemv(CblasNoTrans, channels_ * num, spatial_dim, - 1. / (num * spatial_dim), temp_.cpu_data(), + 1. 
/ (num * spatial_dim), temp_data, spatial_sum_multiplier_.cpu_data(), 0., num_by_chans_.mutable_cpu_data()); caffe_cpu_gemv(CblasTrans, num, channels_, 1., @@ -220,37 +231,40 @@ void BatchNormLayer::Forward_cpu(const vector*>& bottom, variance_.mutable_cpu_data()); // replicate variance to input size - this->replicate(temp_.mutable_cpu_data(), + this->replicate(temp_data, num, spatial_dim*channels_, spatial_dim, variance_.cpu_data()); - caffe_div(temp_.count(), top_data, temp_.cpu_data(), top_data); + caffe_div(data_stats_count, top_data, temp_data, top_data); // TODO(cdoersch): The caching is only needed because later in-place layers // might clobber the data. Can we skip this if they won't? - caffe_copy(x_norm_.count(), top_data, - x_norm_.mutable_cpu_data()); + caffe_copy(data_stats_count, top_data, + x_norm_data); } template -void BatchNormLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { +void BatchNormLayer::BackwardStatsBatch_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom, + int stats_batch_idx) { + long data_stats_count = stats_batch_size_ * bottom[0]->count(1); + long data_offset = stats_batch_idx * data_stats_count; const Dtype* top_diff; if (bottom[0] != top[0]) { - top_diff = top[0]->cpu_diff(); + top_diff = top[0]->cpu_diff() + data_offset; } else { - caffe_copy(x_norm_.count(), top[0]->cpu_diff(), x_norm_.mutable_cpu_diff()); - top_diff = x_norm_.cpu_diff(); + caffe_copy(data_stats_count, top[0]->cpu_diff() + data_offset, + x_norm_.mutable_cpu_diff() + data_offset); + top_diff = x_norm_.cpu_diff() + data_offset; } - Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff() + data_offset; if (use_global_stats_) { - caffe_div(temp_.count(), top_diff, temp_.cpu_data(), bottom_diff); + caffe_div(data_stats_count, top_diff, temp_.cpu_data() + data_offset, bottom_diff); return; } - const Dtype* top_data = x_norm_.cpu_data(); - int num = bottom[0]->shape()[0]; + const Dtype* top_data = x_norm_.cpu_data() + data_offset; + int num = stats_batch_size_; int spatial_dim = bottom[0]->count()/(bottom[0]->shape(0)*channels_); // if Y = (X-mean(X))/(sqrt(var(X)+eps)), then // @@ -265,7 +279,7 @@ void BatchNormLayer::Backward_cpu(const vector*>& top, // dimensions except the channels dimension where required. // sum(dE/dY \cdot Y) - caffe_mul(temp_.count(), top_data, top_diff, bottom_diff); + caffe_mul(data_stats_count, top_data, top_diff, bottom_diff); caffe_cpu_gemv(CblasNoTrans, channels_ * num, spatial_dim, 1., bottom_diff, spatial_sum_multiplier_.cpu_data(), 0., num_by_chans_.mutable_cpu_data()); @@ -280,7 +294,7 @@ void BatchNormLayer::Backward_cpu(const vector*>& top, mean_.cpu_data()); // sum(dE/dY \cdot Y) \cdot Y - caffe_mul(temp_.count(), top_data, bottom_diff, bottom_diff); + caffe_mul(data_stats_count, top_data, bottom_diff, bottom_diff); // sum(dE/dY)-sum(dE/dY \cdot Y) \cdot Y caffe_cpu_gemv(CblasNoTrans, channels_ * num, spatial_dim, 1., @@ -300,12 +314,29 @@ void BatchNormLayer::Backward_cpu(const vector*>& top, std::plus()); // dE/dY - mean(dE/dY)-mean(dE/dY \cdot Y) \cdot Y - caffe_cpu_axpby(temp_.count(), Dtype(1), top_diff, + caffe_cpu_axpby(data_stats_count, Dtype(1), top_diff, Dtype(-1. / (num * spatial_dim)), bottom_diff); // note: temp_ still contains sqrt(var(X)+eps), computed during the forward // pass. 
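
// Each *StatsBatch_cpu helper above addresses its slice of the NCHW blob
// purely by pointer offset; nothing is copied. A minimal sketch of that
// arithmetic, assuming a float blob whose per-image size is
// count(1) == C*H*W (the helper name is illustrative only):
static const float* stats_batch_slice_sketch(const float* blob_data,
                                             int stats_batch_idx,
                                             int stats_batch_size, long chw) {
  long data_stats_count = (long)stats_batch_size * chw;  // elements per slice
  return blob_data + stats_batch_idx * data_stats_count; // == data_offset
}
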
- caffe_div(temp_.count(), bottom_diff, temp_.cpu_data(), bottom_diff); + caffe_div(data_stats_count, bottom_diff, temp_.cpu_data() + data_offset, bottom_diff); +} + +template +void BatchNormLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { + for (int i = 0; i < num_stats_batches_; i++) { + ForwardStatsBatch_cpu(bottom, top, i); + } +} + +template +void BatchNormLayer::Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + for (int i = 0; i < num_stats_batches_; i++) { + BackwardStatsBatch_cpu(top, propagate_down, bottom, i); + } } diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 10621c531..9fda127c6 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -62,6 +62,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "caffe/multinode/mlsl.hpp" #include "caffe/multinode/apply_mn_param.hpp" #include "caffe/util/remove_batch_norm.hpp" +#include "caffe/util/apply_bn_stats_batch_size.hpp" PERFORMANCE_CREATE_MONITOR(); @@ -147,6 +148,12 @@ void Net::Init(const NetParameter& in_param) { this->kept_bn_layers_.push_back(param.compile_net_state().kept_bn_layers(idx)); } + NetParameter param_with_stats_batch_size; + if (param.has_bn_stats_batch_size()) { + ApplyBnStatsBatchSize(param, ¶m_with_stats_batch_size); + param = param_with_stats_batch_size; + } + #ifdef USE_MLSL NetParameter param_with_mn; if (mn::is_multinode()) { diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index eaf9b6e6b..cd6cb761f 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -208,6 +208,9 @@ message NetParameter { optional string engine = 9 [default = ""]; + // Batch size used for BatchNorm statistics, 0 would use the batch size of bottom blob + optional uint32 bn_stats_batch_size = 11 [default = 0]; + // The layers that make up the net. Each of their configurations, including // connectivity and behavior, is specified as a LayerParameter. repeated LayerParameter layer = 100; // ID 100 so layers are printed last. @@ -900,6 +903,8 @@ message BatchNormParameter { optional bool bias_term = 6 [default = true]; // whether to have bias terms optional FillerParameter filler = 7; // The filler for the weight optional FillerParameter bias_filler = 8; // The filler for the bias + // Batch size used for statistics, 0 would use the batch size of bottom blob + optional uint32 stats_batch_size = 9 [default = 0]; } message SplitParameter { diff --git a/src/caffe/util/apply_bn_stats_batch_size.cpp b/src/caffe/util/apply_bn_stats_batch_size.cpp new file mode 100644 index 000000000..078cf6bc5 --- /dev/null +++ b/src/caffe/util/apply_bn_stats_batch_size.cpp @@ -0,0 +1,57 @@ +/* +All modification made by Intel Corporation: © 2017 Intel Corporation + +All contributions by the University of California: +Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +All rights reserved. + +All other contributions: +Copyright (c) 2014, 2015, the respective contributors +All rights reserved. +For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include
+#include "caffe/common.hpp"
+#include "caffe/util/apply_bn_stats_batch_size.hpp"
+
+namespace caffe {
+void ApplyBnStatsBatchSize(const NetParameter& param,
+    NetParameter* param_with_stats_batch_size) {
+  CHECK(param.has_bn_stats_batch_size());
+  param_with_stats_batch_size->CopyFrom(param);
+  param_with_stats_batch_size->clear_layer();
+  int bn_stats_batch_size = param.bn_stats_batch_size();
+  for (int i = 0; i < param.layer_size(); i++) {
+    LayerParameter *layer_param = param_with_stats_batch_size->add_layer();
+    layer_param->CopyFrom(param.layer(i));
+    if (layer_param->type() == "BatchNorm") {
+      layer_param->mutable_batch_norm_param()->set_stats_batch_size(bn_stats_batch_size);
+    }
+  }
+}
+}

From 22cba68429089db02e0efc1a6167f7b452b76659 Mon Sep 17 00:00:00 2001
From: xinanlin
Date: Fri, 18 Aug 2017 13:14:32 +0800
Subject: [PATCH 24/38] change MKLDNN LD path from relative path to absolute
 path when using raw Makefile

---
 Makefile.mkldnn | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Makefile.mkldnn b/Makefile.mkldnn
index ec1a70bc5..d113a8923 100644
--- a/Makefile.mkldnn
+++ b/Makefile.mkldnn
@@ -1,5 +1,5 @@
 CAFFE_ROOTDIR := $(shell pwd)
-MKLDNN_ROOTDIR := external/mkldnn
+MKLDNN_ROOTDIR := $(CAFFE_ROOTDIR)/external/mkldnn
 MKLDNN_TMPDIR := $(MKLDNN_ROOTDIR)/tmp
 MKLDNN_SRCDIR := $(MKLDNN_ROOTDIR)/src
 MKLDNN_BUILDDIR := $(MKLDNN_ROOTDIR)/build
@@ -22,7 +22,7 @@ ifneq (,$(findstring ccache,$(CC)))
 endif
 
 MKLDNN_GITHUB := https://github.com/01org/mkl-dnn.git
-MKLDNN_CMAKE_FLAGS += $(MKLDNN_SRCDIR) -DCMAKE_INSTALL_PREFIX=$(CAFFE_ROOTDIR)/$(MKLDNN_INSTALLDIR) -DMKLROOT=${MKL_ROOTDIR} -B$(CAFFE_ROOTDIR)/$(MKLDNN_BUILDDIR) -DCMAKE_CXX_COMPILER="$(MKLDNN_CXX)" -DCMAKE_C_COMPILER="$(MKLDNN_CC)"
+MKLDNN_CMAKE_FLAGS += $(MKLDNN_SRCDIR) -DCMAKE_INSTALL_PREFIX=$(MKLDNN_INSTALLDIR) -DMKLROOT=${MKL_ROOTDIR} -B$(MKLDNN_BUILDDIR) -DCMAKE_CXX_COMPILER="$(MKLDNN_CXX)" -DCMAKE_C_COMPILER="$(MKLDNN_CC)"
 
 ifeq ("$(wildcard $(MKLDNN_INSTALLDIR)/include/mkldnn.hpp)", "")
 mkldnn_download:
@@ -32,8 +32,8 @@ mkldnn_download:
 mkldnn_build: mkldnn_download
 	cmake $(MKLDNN_CMAKE_FLAGS)
-	make -C $(CAFFE_ROOTDIR)/$(MKLDNN_BUILDDIR) -j$(shell cat /proc/cpuinfo |grep 'processor'|wc -l)
-	make -C $(CAFFE_ROOTDIR)/$(MKLDNN_BUILDDIR) install
+	make -C $(MKLDNN_BUILDDIR) -j$(shell cat /proc/cpuinfo |grep 'processor'|wc -l)
+	make -C
$(MKLDNN_BUILDDIR) install else mkldnn_download: mkldnn_build: From 1e238296b85dbf7a7b8f57ef6e331245b78597fd Mon Sep 17 00:00:00 2001 From: "Gong, Jiong" Date: Fri, 18 Aug 2017 20:55:33 +0800 Subject: [PATCH 25/38] support bn stats batch size in mkl bn --- include/caffe/layers/mkl_layers.hpp | 18 ++- src/caffe/layers/batch_norm_layer.cpp | 2 +- src/caffe/layers/mkl_batch_norm_layer.cpp | 143 ++++++++++++++-------- 3 files changed, 109 insertions(+), 54 deletions(-) diff --git a/include/caffe/layers/mkl_layers.hpp b/include/caffe/layers/mkl_layers.hpp index 0d5d66416..c9806daee 100644 --- a/include/caffe/layers/mkl_layers.hpp +++ b/include/caffe/layers/mkl_layers.hpp @@ -481,12 +481,12 @@ class MKLBatchNormLayer : public Layer { batchNormFwd(static_cast(NULL)), batchNormFwdInference(static_cast(NULL)), batchNormBwd(static_cast(NULL)), - mean_buffer_(static_cast(NULL)), - variance_buffer_(static_cast(NULL)), scaleShift_buffer_(static_cast(NULL)), diffScaleShift_buffer_(static_cast(NULL)), layout_usr_(static_cast(NULL)), - use_global_stats_(false) + use_global_stats_(false), + num_stats_batches_(1), + stats_batch_size_(0) { PERFORMANCE_EVENT_ID_RESET(perf_id_fw_); PERFORMANCE_EVENT_ID_RESET(perf_id_bw_); @@ -515,6 +515,12 @@ class MKLBatchNormLayer : public Layer { virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); + void ForwardStatsBatch_cpu(const vector*>& bottom, + const vector*>& top, int stats_batch_idx); + void BackwardStatsBatch_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom, + int stats_batch_idx); + void Init(const vector*>& bottom, const vector*>& top); @@ -534,12 +540,14 @@ class MKLBatchNormLayer : public Layer { shared_ptr > bwd_bottom_diff; Blob temp_; dnnPrimitive_t batchNormFwd, batchNormFwdInference, batchNormBwd; - Dtype *mean_buffer_; - Dtype *variance_buffer_; + vector mean_buffers_; + vector variance_buffers_; Dtype *scaleShift_buffer_; Dtype *diffScaleShift_buffer_; dnnLayout_t layout_usr_; bool use_global_stats_; + int num_stats_batches_; + int stats_batch_size_; PERFORMANCE_EVENT_ID_DECL(perf_id_fw_); PERFORMANCE_EVENT_ID_DECL(perf_id_bw_); diff --git a/src/caffe/layers/batch_norm_layer.cpp b/src/caffe/layers/batch_norm_layer.cpp index dada5873d..8331dd7d7 100644 --- a/src/caffe/layers/batch_norm_layer.cpp +++ b/src/caffe/layers/batch_norm_layer.cpp @@ -218,7 +218,7 @@ void BatchNormLayer::ForwardStatsBatch_cpu(const vector*>& bo this->blobs_[2]->mutable_cpu_data()[0] += 1; caffe_cpu_axpby(mean_.count(), Dtype(1), mean_.cpu_data(), moving_average_fraction_, this->blobs_[0]->mutable_cpu_data()); - int m = bottom[0]->count()/channels_; + int m = bottom[0]->count()/num_stats_batches_/channels_; Dtype bias_correction_factor = m > 1 ? 
Dtype(m)/(m-1) : 1; caffe_cpu_axpby(variance_.count(), bias_correction_factor, variance_.cpu_data(), moving_average_fraction_, diff --git a/src/caffe/layers/mkl_batch_norm_layer.cpp b/src/caffe/layers/mkl_batch_norm_layer.cpp index 6dce50243..a24500c69 100755 --- a/src/caffe/layers/mkl_batch_norm_layer.cpp +++ b/src/caffe/layers/mkl_batch_norm_layer.cpp @@ -52,8 +52,12 @@ MKLBatchNormLayer::~MKLBatchNormLayer() { dnnDelete(batchNormFwdInference); dnnDelete(batchNormBwd); dnnLayoutDelete(layout_usr_); - dnnReleaseBuffer(mean_buffer_); - dnnReleaseBuffer(variance_buffer_); + for (int i = 0; i < mean_buffers_.size(); i++) { + dnnReleaseBuffer(mean_buffers_[i]); + } + for (int i = 0; i < variance_buffers_.size(); i++) { + dnnReleaseBuffer(variance_buffers_[i]); + } dnnReleaseBuffer(scaleShift_buffer_); dnnReleaseBuffer(diffScaleShift_buffer_); } @@ -71,6 +75,15 @@ void MKLBatchNormLayer::Init(const vector*>& bottom, if (this->layer_param_.batch_norm_param().has_use_global_stats()) use_global_stats_ = this->layer_param_.batch_norm_param().use_global_stats(); + num_stats_batches_ = 1; + stats_batch_size_ = bottom[0]->shape(0); + BatchNormParameter param = this->layer_param_.batch_norm_param(); + if (!use_global_stats_ && param.stats_batch_size() > 0) { + CHECK_EQ(bottom[0]->shape(0) % param.stats_batch_size(), 0); + num_stats_batches_ = bottom[0]->shape(0) / param.stats_batch_size(); + stats_batch_size_ = param.stats_batch_size(); + } + CHECK(use_weight_bias_) << "BatchNorm without scaling have not supported yet"; size_t dim = 4, sizes[4], strides[4]; @@ -99,18 +112,25 @@ void MKLBatchNormLayer::Init(const vector*>& bottom, // TODO: Make a cleanup routine to avoid // copy of following code in the Destructor - dnnError_t e; - dnnLayoutDelete(layout_usr_); - e = dnnLayoutCreate(&layout_usr_, dim, sizes, strides); - CHECK_EQ(e, E_SUCCESS); - fwd_bottom_data->create_user_layout(dim, sizes, strides, false); fwd_top_data ->create_user_layout(dim, sizes, strides, false); bwd_bottom_diff->create_user_layout(dim, sizes, strides, false); bwd_top_diff ->create_user_layout(dim, sizes, strides, false); - dnnReleaseBuffer(mean_buffer_); - dnnReleaseBuffer(variance_buffer_); + sizes[3] /= num_stats_batches_; + dnnError_t e; + dnnLayoutDelete(layout_usr_); + e = dnnLayoutCreate(&layout_usr_, dim, sizes, strides); + CHECK_EQ(e, E_SUCCESS); + + for (int i = 0; i < mean_buffers_.size(); i++) { + dnnReleaseBuffer(mean_buffers_[i]); + } + for (int i = 0; i < variance_buffers_.size(); i++) { + dnnReleaseBuffer(variance_buffers_[i]); + } + mean_buffers_.resize(num_stats_batches_, NULL); + variance_buffers_.resize(num_stats_batches_, NULL); dnnReleaseBuffer(scaleShift_buffer_); dnnReleaseBuffer(diffScaleShift_buffer_); @@ -223,26 +243,30 @@ void MKLBatchNormLayer::Reshape(const vector*>& bottom, strides[2] = sizes[0]*sizes[1]; strides[3] = sizes[0]*sizes[1]*sizes[2]; - dnnError_t e; - dnnLayoutDelete(layout_usr_); - e = dnnLayoutCreate(&layout_usr_, dim, sizes, strides); - CHECK_EQ(e, E_SUCCESS); fwd_bottom_data->create_user_layout(dim, sizes, strides, false); fwd_top_data ->create_user_layout(dim, sizes, strides, false); bwd_bottom_diff->create_user_layout(dim, sizes, strides, false); bwd_top_diff ->create_user_layout(dim, sizes, strides, false); + + sizes[3] /= num_stats_batches_; + dnnError_t e; + dnnLayoutDelete(layout_usr_); + e = dnnLayoutCreate(&layout_usr_, dim, sizes, strides); + CHECK_EQ(e, E_SUCCESS); } } template -void MKLBatchNormLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { +void 
MKLBatchNormLayer::ForwardStatsBatch_cpu(const vector*>& bottom, + const vector*>& top, int stats_batch_idx) { + long data_offset = stats_batch_idx * stats_batch_size_ * bottom[0]->count(1); void* bottom_data = reinterpret_cast(const_cast(bottom[0]->prv_data())); int is_first_pass = 0; - unsigned int amount_to_copy =0; + long amount_to_copy =0; - if (NULL != bottom_data) { + // TODO: support private memory with num_stats_batches_ > 1 + if (NULL != bottom_data && num_stats_batches_ == 1) { amount_to_copy = bottom[0]->prv_data_count(); // Is it the first pass? Create a primitive. if (batchNormFwd == NULL) { @@ -311,7 +335,7 @@ void MKLBatchNormLayer::Forward_cpu( } bottom_data = reinterpret_cast(const_cast(bottom[0]->cpu_data())); - amount_to_copy = bottom[0]->count(); + amount_to_copy = bottom[0]->count() / num_stats_batches_; } if (is_first_pass == 1) { dnnError_t e; @@ -319,18 +343,22 @@ void MKLBatchNormLayer::Forward_cpu( e = dnnLayoutCreateFromPrimitive( &mean_buffer_l, batchNormFwd, dnnResourceMean); CHECK_EQ(e, E_SUCCESS); - e = dnnAllocateBuffer( - reinterpret_cast(&mean_buffer_), mean_buffer_l); - CHECK_EQ(e, E_SUCCESS); + for (int i = 0; i < num_stats_batches_; i++) { + e = dnnAllocateBuffer( + reinterpret_cast(&mean_buffers_[i]), mean_buffer_l); + CHECK_EQ(e, E_SUCCESS); + } dnnLayoutDelete(mean_buffer_l); dnnLayout_t variance_buffer_l = NULL; e = dnnLayoutCreateFromPrimitive( &variance_buffer_l, batchNormFwd, dnnResourceVariance); CHECK_EQ(e, E_SUCCESS); - e = dnnAllocateBuffer( - reinterpret_cast(&variance_buffer_), variance_buffer_l); - CHECK_EQ(e, E_SUCCESS); + for (int i = 0; i < num_stats_batches_; i++) { + e = dnnAllocateBuffer( + reinterpret_cast(&variance_buffers_[i]), variance_buffer_l); + CHECK_EQ(e, E_SUCCESS); + } dnnLayoutDelete(variance_buffer_l); dnnLayout_t diffScaleShift_buffer_l = NULL; @@ -374,8 +402,8 @@ void MKLBatchNormLayer::Forward_cpu( // Note that this is only necessary for Backward; we skip this if not // doing Backward // TODO: make a caffe_coppy working on blobs - caffe_copy(amount_to_copy, static_cast(bottom_data), - temp_.mutable_cpu_data()); + caffe_copy(amount_to_copy, static_cast(bottom_data) + data_offset, + temp_.mutable_cpu_data() + data_offset); } if (use_global_stats_) { @@ -383,24 +411,25 @@ void MKLBatchNormLayer::Forward_cpu( const Dtype scale_factor = this->blobs_[2]->cpu_data()[0] == 0 ? 
0 : 1 / this->blobs_[2]->cpu_data()[0]; caffe_cpu_scale(this->blobs_[0]->count(), scale_factor, - this->blobs_[0]->cpu_data(), mean_buffer_); + this->blobs_[0]->cpu_data(), mean_buffers_[stats_batch_idx]); caffe_cpu_scale(this->blobs_[1]->count(), scale_factor, - this->blobs_[1]->cpu_data(), variance_buffer_); + this->blobs_[1]->cpu_data(), variance_buffers_[stats_batch_idx]); } dnnError_t e; void* BatchNorm_res[dnnResourceNumber]; - BatchNorm_res[dnnResourceMean] = mean_buffer_; - BatchNorm_res[dnnResourceVariance] = variance_buffer_; - BatchNorm_res[dnnResourceSrc] = bottom_data; + BatchNorm_res[dnnResourceMean] = mean_buffers_[stats_batch_idx]; + BatchNorm_res[dnnResourceVariance] = variance_buffers_[stats_batch_idx]; + BatchNorm_res[dnnResourceSrc] = (Dtype*)bottom_data + data_offset; BatchNorm_res[dnnResourceScaleShift] = scaleShift_buffer_; if (fwd_top_data->conversion_needed()) { top[0]->set_prv_data_descriptor(fwd_top_data); + data_offset = stats_batch_idx * (top[0]->prv_data_count() / num_stats_batches_); BatchNorm_res[dnnResourceDst] = - reinterpret_cast(top[0]->mutable_prv_data()); + reinterpret_cast(top[0]->mutable_prv_data() + data_offset); } else { BatchNorm_res[dnnResourceDst] = - reinterpret_cast(top[0]->mutable_cpu_data()); + reinterpret_cast(top[0]->mutable_cpu_data() + data_offset); DLOG(INFO) << "Using cpu_data for top in DnnBatchNorm."; } @@ -415,20 +444,21 @@ void MKLBatchNormLayer::Forward_cpu( // compute and save moving average this->blobs_[2]->mutable_cpu_data()[0] *= moving_average_fraction_; this->blobs_[2]->mutable_cpu_data()[0] += 1; - caffe_cpu_axpby(this->blobs_[0]->count(), Dtype(1), mean_buffer_, + caffe_cpu_axpby(this->blobs_[0]->count(), Dtype(1), mean_buffers_[stats_batch_idx], moving_average_fraction_, this->blobs_[0]->mutable_cpu_data()); - int m = bottom[0]->count()/channels_; + int m = bottom[0]->count()/num_stats_batches_/channels_; Dtype bias_correction_factor = m > 1 ? 
Dtype(m)/(m-1) : 1; caffe_cpu_axpby(this->blobs_[1]->count(), bias_correction_factor, - variance_buffer_, moving_average_fraction_, + variance_buffers_[stats_batch_idx], moving_average_fraction_, this->blobs_[1]->mutable_cpu_data()); } } template -void MKLBatchNormLayer::Backward_cpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { +void MKLBatchNormLayer::BackwardStatsBatch_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom, + int stats_batch_idx) { + long data_offset = stats_batch_idx * stats_batch_size_ * bottom[0]->count(1); void *bottom_data = NULL; if (bottom[0] == top[0]) { bottom_data = reinterpret_cast( @@ -437,7 +467,7 @@ void MKLBatchNormLayer::Backward_cpu( bottom_data = reinterpret_cast( const_cast(bottom[0]->prv_data())); - if (NULL == bottom_data) + if (NULL == bottom_data || num_stats_batches_ > 1) bottom_data = reinterpret_cast( const_cast(bottom[0]->cpu_data())); @@ -445,19 +475,19 @@ void MKLBatchNormLayer::Backward_cpu( dnnError_t e; void* BatchNorm_res[dnnResourceNumber]; - BatchNorm_res[dnnResourceMean] = mean_buffer_; - BatchNorm_res[dnnResourceVariance] = variance_buffer_; - BatchNorm_res[dnnResourceSrc] = bottom_data; + BatchNorm_res[dnnResourceMean] = mean_buffers_[stats_batch_idx]; + BatchNorm_res[dnnResourceVariance] = variance_buffers_[stats_batch_idx]; + BatchNorm_res[dnnResourceSrc] = (Dtype*)bottom_data + data_offset; BatchNorm_res[dnnResourceScaleShift] = scaleShift_buffer_; BatchNorm_res[dnnResourceDiffScaleShift] = diffScaleShift_buffer_; - - BatchNorm_res[dnnResourceDiffDst] = bwd_top_diff->get_converted_prv(top[0], - true); + BatchNorm_res[dnnResourceDiffDst] = + bwd_top_diff->get_converted_prv(top[0], true) + data_offset; if (bwd_bottom_diff->conversion_needed()) { bottom[0]->set_prv_diff_descriptor(bwd_bottom_diff); - BatchNorm_res[dnnResourceDiffSrc] = bottom[0]->mutable_prv_diff(); + data_offset = stats_batch_idx * (bottom[0]->prv_diff_count() / num_stats_batches_); + BatchNorm_res[dnnResourceDiffSrc] = bottom[0]->mutable_prv_diff() + data_offset; } else { - BatchNorm_res[dnnResourceDiffSrc] = bottom[0]->mutable_cpu_diff(); + BatchNorm_res[dnnResourceDiffSrc] = bottom[0]->mutable_cpu_diff() + data_offset; } PERFORMANCE_EVENT_ID_INIT(perf_id_bw_, PERFORMANCE_MKL_NAME("BW")); @@ -479,6 +509,23 @@ void MKLBatchNormLayer::Backward_cpu( } } +template +void MKLBatchNormLayer::Forward_cpu( + const vector*>& bottom, const vector*>& top) { + for (int i = 0; i < num_stats_batches_; i++) { + ForwardStatsBatch_cpu(bottom, top, i); + } +} + +template +void MKLBatchNormLayer::Backward_cpu( + const vector*>& top, const vector& propagate_down, + const vector*>& bottom) { + for (int i = 0; i < num_stats_batches_; i++) { + BackwardStatsBatch_cpu(top, propagate_down, bottom, i); + } +} + #ifdef CPU_ONLY STUB_GPU(MKLBatchNormLayer); From f4c0f7758f81c613d7be388aa88000bb10654ef1 Mon Sep 17 00:00:00 2001 From: "Gong, Jiong" Date: Sat, 19 Aug 2017 19:15:36 +0800 Subject: [PATCH 26/38] enable bn stats batch size in mkldnn --- include/caffe/layers/mkldnn_layers.hpp | 12 +- include/caffe/mkldnn_memory.hpp | 3 + src/caffe/layers/mkldnn_batch_norm_layer.cpp | 266 ++++++++++++------- src/caffe/mkldnn_memory.cpp | 26 ++ 4 files changed, 209 insertions(+), 98 deletions(-) diff --git a/include/caffe/layers/mkldnn_layers.hpp b/include/caffe/layers/mkldnn_layers.hpp index f63301e2a..bf23438bd 100644 --- a/include/caffe/layers/mkldnn_layers.hpp +++ b/include/caffe/layers/mkldnn_layers.hpp @@ -68,7 +68,6 @@ class 
MKLDNNBatchNormLayer : public MKLDNNLayer, public Layer { , fwd_top_data(), fwd_bottom_data() , bwd_top_diff(), bwd_bottom_diff() , BatchNormFwd_pd(), BatchNormBwd_pd() - , mean_memory(), variance_memory() , scaleshift_memory(), bwd_scaleshift_diff_memory() , output_memory(), bwd_bottom_diff_memory() , input_primitive(), bwd_top_diff_primitive() @@ -96,22 +95,29 @@ class MKLDNNBatchNormLayer : public MKLDNNLayer, public Layer { void InitBatchNormBwd(const vector*>& top, const vector& propagate_down, const vector*>& bottom); + void InitBatchNormFwdPrimitive(int stats_batch_idx); + void InitBatchNormBwdPrimitive(int stats_batch_idx); + template shared_ptr GetStatsBatchMemory( + shared_ptr > mkldnn_data, int idx); shared_ptr > fwd_top_data, fwd_bottom_data; shared_ptr > bwd_top_diff, bwd_bottom_diff; shared_ptr BatchNormFwd_pd; shared_ptr BatchNormBwd_pd; - MKLDNNPrimitive BatchNormFwd, BatchNormBwd; - shared_ptr mean_memory, variance_memory; + vector > BatchNormFwd, BatchNormBwd; + vector > mean_memory, variance_memory; shared_ptr scaleshift_memory, bwd_scaleshift_diff_memory; shared_ptr output_memory, bwd_bottom_diff_memory; + vector > input_stats, output_stats, top_diff_stats, bottom_diff_stats; shared_ptr input_primitive, bwd_top_diff_primitive; int32_t num_, width_, height_, channels_; Dtype eps_, moving_average_fraction_; bool use_weight_bias_, bias_term_, use_global_stats_; + int num_stats_batches_; + int stats_batch_size_; PERFORMANCE_EVENT_ID_DECL(perf_id_fw_); PERFORMANCE_EVENT_ID_DECL(perf_id_bw_); diff --git a/include/caffe/mkldnn_memory.hpp b/include/caffe/mkldnn_memory.hpp index a59ce6e12..3b1a1c6ad 100644 --- a/include/caffe/mkldnn_memory.hpp +++ b/include/caffe/mkldnn_memory.hpp @@ -94,6 +94,7 @@ class MKLDNNMemoryDescriptorBase : public PrvMemDescr if (_prv_memory == NULL) allocate(); return _internal_ptr; } + shared_ptr reorder_usr2prv() { return _reorder_usr2prv.aprimitive; } shared_ptr reorder_prv2usr() { return _reorder_prv2usr.aprimitive; } shared_ptr reorder_extprv2prv() { return _reorder_extprv2prv.aprimitive; } @@ -201,6 +202,8 @@ class MKLDNNMemoryDescriptor : public MKLDNNMemoryDescriptorBase { shared_ptr create_output_memory(Blob * blob, bool inplace = false); shared_ptr create_input(bool set_prv_ptr); shared_ptr create_output_memory(bool inplace = false); + Dtype* get_memory_ptr(long offset = 0); + shared_ptr get_memory_desc(); void set_mkldnn_primitive(MKLDNNPrimitive& mprimitive) { CHECK(mprimitive.aprimitive); _mkldnn_primitive = mprimitive; } MKLDNNPrimitive& mkldnn_primitive() { return _mkldnn_primitive; } diff --git a/src/caffe/layers/mkldnn_batch_norm_layer.cpp b/src/caffe/layers/mkldnn_batch_norm_layer.cpp index 4db92b943..6688f8584 100644 --- a/src/caffe/layers/mkldnn_batch_norm_layer.cpp +++ b/src/caffe/layers/mkldnn_batch_norm_layer.cpp @@ -130,6 +130,15 @@ void MKLDNNBatchNormLayer::Reshape(const vector*>& bottom this->num_ = bottom[0]->num(); this->channels_ = bottom[0]->channels(); + num_stats_batches_ = 1; + stats_batch_size_ = bottom[0]->shape(0); + BatchNormParameter param = this->layer_param_.batch_norm_param(); + if (!use_global_stats_ && param.stats_batch_size() > 0) { + CHECK_EQ(bottom[0]->shape(0) % param.stats_batch_size(), 0); + num_stats_batches_ = bottom[0]->shape(0) / param.stats_batch_size(); + stats_batch_size_ = param.stats_batch_size(); + } + //Fix: should reshape the top blob with the real size of bottom blob //top[0]->Reshape(this->num_, this->channels_, this->height_, this->width_); #ifdef DEBUG @@ -159,8 +168,9 @@ void 
MKLDNNBatchNormLayer::InitBatchNorm(const vector*>& bott memory::data_type mpcsn = memory::data_type::f32; // ---- Initialize memory descriptors ------------- - shared_ptr input_md, output_md, scaleshift_md; - shared_ptr usr_mpd, prv_mpd, scaleshift_mpd; + shared_ptr input_md, input_stats_md, output_md, scaleshift_md; + shared_ptr usr_mpd, prv_mpd; + shared_ptr scaleshift_mpd; if (bottom_data_is_prv) { shared_ptr > mem_descr = get_mkldnn_prv_descriptor(bottom[0]); @@ -172,9 +182,13 @@ void MKLDNNBatchNormLayer::InitBatchNorm(const vector*>& bott usr_mpd.reset(new memory::primitive_desc(*input_md, cpu_engine)); } output_md = input_md; + input_stats_md.reset(new memory::desc(*input_md)); + CHECK(input_stats_md->data.ndims > 0 && + input_stats_md->data.dims[0] == this->num_); + input_stats_md->data.dims[0] = stats_batch_size_; // ---- Initialize BatchNorm primitive descriptor ------------- - batch_normalization_forward::desc BatchNormFwd_desc(propagation, *input_md, eps_, flags); + batch_normalization_forward::desc BatchNormFwd_desc(propagation, *input_stats_md, eps_, flags); // ---- Determining engine to use ----------------------- std::string subengines = this->layer_param_.engine(); if (subengines == "" || subengines == "MKLDNN") @@ -206,44 +220,13 @@ void MKLDNNBatchNormLayer::InitBatchNorm(const vector*>& bott fwd_top_data.reset(new MKLDNNData(usr_mpd, prv_mpd, top[0], this)); output_memory = fwd_top_data->create_output_memory(); - // ---- Create BatchNorm -------------------- - if (this->phase_ == TEST && !use_global_stats_) { - if (use_weight_bias_) { - BatchNormFwd.reset(new batch_normalization_forward(*BatchNormFwd_pd, - *input_primitive, *scaleshift_memory, *output_memory)); - } else { - BatchNormFwd.reset(new batch_normalization_forward(*BatchNormFwd_pd, - *input_primitive, *output_memory)); - } - } else { - mean_memory.reset(new memory(BatchNormFwd_pd->mean_primitive_desc())); - variance_memory.reset(new memory(BatchNormFwd_pd->variance_primitive_desc())); - - if (use_global_stats_) { - caffe_copy(this->channels_, this->blobs_[0]->cpu_data(), - static_cast(mean_memory->get_data_handle())); - caffe_copy(this->channels_, this->blobs_[1]->cpu_data(), - static_cast(variance_memory->get_data_handle())); - if (use_weight_bias_) { - BatchNormFwd.reset(new batch_normalization_forward(*BatchNormFwd_pd, - *input_primitive, (const primitive::at)*mean_memory, - (const primitive::at)*variance_memory, *scaleshift_memory, - *output_memory)); - } else { - BatchNormFwd.reset(new batch_normalization_forward(*BatchNormFwd_pd, - *input_primitive, (const primitive::at)*mean_memory, - (const primitive::at)*variance_memory, *output_memory)); - } - } else { - if (use_weight_bias_) { - BatchNormFwd.reset(new batch_normalization_forward(*BatchNormFwd_pd, - *input_primitive, *scaleshift_memory, *output_memory, - *mean_memory, *variance_memory)); - } else { - BatchNormFwd.reset(new batch_normalization_forward(*BatchNormFwd_pd, - *input_primitive, *output_memory, *mean_memory, *variance_memory)); - } - } + mean_memory.resize(num_stats_batches_); + variance_memory.resize(num_stats_batches_); + input_stats.resize(num_stats_batches_); + output_stats.resize(num_stats_batches_); + BatchNormFwd.resize(num_stats_batches_); + for (int i = 0; i < num_stats_batches_; i++) { + InitBatchNormFwdPrimitive(i); } //fwd_bottom_data->set_mkldnn_primitive(BatchNormFwd); //Wrong passed primitive! (TODO: Checking!) 
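
// The stats descriptors built above are ordinary MKL-DNN descriptors whose
// batch dimension is overwritten before the primitive is created. A
// self-contained sketch with made-up sizes (N = 32 split into stats batches
// of 8), using only constructors that already appear in this patch; the
// helper name and shapes are illustrative, not part of the patch:
static mkldnn::memory stats_batch_view_sketch(float* base_ptr, int idx) {
  using namespace mkldnn;
  engine cpu_engine(engine::cpu, 0);
  memory::desc md({32, 64, 28, 28}, memory::data_type::f32,
                  memory::format::nchw);  // full-minibatch layout, N = 32
  md.data.dims[0] = 8;                    // view one stats batch instead
  memory::primitive_desc mpd(md, cpu_engine);
  // Alias slice idx of the user buffer, as GetStatsBatchMemory below does.
  return memory(mpd, base_ptr + (long)idx * 8 * 64 * 28 * 28);
}
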
@@ -272,6 +255,70 @@ void MKLDNNBatchNormLayer::InitBatchNorm(const vector*>& bott } } +template +template +shared_ptr MKLDNNBatchNormLayer::GetStatsBatchMemory( + shared_ptr > mkldnn_mem, int idx) { + long data_offset = + idx * stats_batch_size_ * this->channels_ * this->width_ * this->height_; + engine cpu_engine = CpuEngine::Instance().get_engine(); + shared_ptr stats_md = mkldnn_mem->get_memory_desc(); + CHECK(stats_md->data.ndims > 0 && + stats_md->data.dims[0] == this->num_); + stats_md->data.dims[0] = stats_batch_size_; + shared_ptr stats_mpd( + new memory::primitive_desc(*stats_md, cpu_engine)); + shared_ptr stats( + new memory(*stats_mpd, mkldnn_mem->get_memory_ptr(data_offset))); + return stats; +} + +template +void MKLDNNBatchNormLayer::InitBatchNormFwdPrimitive(int idx) { + input_stats[idx] = GetStatsBatchMemory(fwd_bottom_data, idx); + output_stats[idx] = GetStatsBatchMemory(fwd_top_data, idx); + + // ---- Create BatchNorm -------------------- + if (this->phase_ == TEST && !use_global_stats_) { + if (use_weight_bias_) { + BatchNormFwd[idx].reset(new batch_normalization_forward(*BatchNormFwd_pd, + *input_stats[idx], *scaleshift_memory, + *output_stats[idx])); + } else { + BatchNormFwd[idx].reset(new batch_normalization_forward(*BatchNormFwd_pd, + *input_stats[idx], *output_stats[idx])); + } + } else { + mean_memory[idx].reset(new memory(BatchNormFwd_pd->mean_primitive_desc())); + variance_memory[idx].reset(new memory(BatchNormFwd_pd->variance_primitive_desc())); + + if (use_global_stats_) { + caffe_copy(this->channels_, this->blobs_[0]->cpu_data(), + static_cast(mean_memory[idx]->get_data_handle())); + caffe_copy(this->channels_, this->blobs_[1]->cpu_data(), + static_cast(variance_memory[idx]->get_data_handle())); + if (use_weight_bias_) { + BatchNormFwd[idx].reset(new batch_normalization_forward(*BatchNormFwd_pd, + *input_stats[idx], *mean_memory[idx], + *variance_memory[idx], *scaleshift_memory, + *output_stats[idx])); + } else { + BatchNormFwd[idx].reset(new batch_normalization_forward(*BatchNormFwd_pd, + *input_stats[idx], *mean_memory[idx], + *variance_memory[idx], *output_stats[idx])); + } + } else { + if (use_weight_bias_) { + BatchNormFwd[idx].reset(new batch_normalization_forward(*BatchNormFwd_pd, + *input_stats[idx], *scaleshift_memory, *output_stats[idx], + *mean_memory[idx], *variance_memory[idx])); + } else { + BatchNormFwd[idx].reset(new batch_normalization_forward(*BatchNormFwd_pd, + *input_stats[idx], *output_stats[idx], *mean_memory[idx], *variance_memory[idx])); + } + } + } +} template void MKLDNNBatchNormLayer::Forward_cpu(const vector*>& bottom @@ -289,20 +336,21 @@ void MKLDNNBatchNormLayer::Forward_cpu(const vector*>& bottom // update top that head at prv fwd_top_data->sync_before_write(); - if (use_global_stats_) { + for (int stats_batch_idx = 0; stats_batch_idx < num_stats_batches_; stats_batch_idx++) { + if (use_global_stats_) { // use the stored mean/variance estimates. const Dtype scale_factor = this->blobs_[2]->cpu_data()[0] == 0 ? 
0 : 1 / this->blobs_[2]->cpu_data()[0]; - Dtype *mean_buffer_ = (Dtype *)(mean_memory->get_data_handle()); - Dtype *variance_buffer_ = (Dtype *)(variance_memory->get_data_handle()); + Dtype *mean_buffer_ = (Dtype *)(mean_memory[stats_batch_idx]->get_data_handle()); + Dtype *variance_buffer_ = (Dtype *)(variance_memory[stats_batch_idx]->get_data_handle()); //TODO: optimize, do this operation in the InitBatchNorm, so no need to calculate each time caffe_cpu_scale(this->blobs_[0]->count(), scale_factor, this->blobs_[0]->cpu_data(), mean_buffer_); caffe_cpu_scale(this->blobs_[1]->count(), scale_factor, this->blobs_[1]->cpu_data(), variance_buffer_); - } - if (use_weight_bias_) { + } + if (use_weight_bias_) { Dtype* scaleShift_buffer_ = (Dtype *)(scaleshift_memory->get_data_handle()); // Fill ScaleShift buffer for (int i = 0; i < this->channels_; i++) { @@ -312,26 +360,27 @@ void MKLDNNBatchNormLayer::Forward_cpu(const vector*>& bottom scaleShift_buffer_[channels_ + i] = this->blobs_[4]->cpu_data()[i]; } } - } + } - PERFORMANCE_EVENT_ID_INIT(perf_id_fw_, PERFORMANCE_MKLDNN_NAME("FW")); - PERFORMANCE_MEASUREMENT_BEGIN(); - BatchNormFwd.submit(); - PERFORMANCE_MEASUREMENT_END_ID(perf_id_fw_); + PERFORMANCE_EVENT_ID_INIT(perf_id_fw_, PERFORMANCE_MKLDNN_NAME("FW")); + PERFORMANCE_MEASUREMENT_BEGIN(); + BatchNormFwd[stats_batch_idx].submit(); + PERFORMANCE_MEASUREMENT_END_ID(perf_id_fw_); - if (this->phase_ == TRAIN && !use_global_stats_) { + if (this->phase_ == TRAIN && !use_global_stats_) { // compute and save moving average - Dtype *mean_buffer_ = (Dtype *)(mean_memory->get_data_handle()); - Dtype *variance_buffer_ = (Dtype *)(variance_memory->get_data_handle()); + Dtype *mean_buffer_ = (Dtype *)(mean_memory[stats_batch_idx]->get_data_handle()); + Dtype *variance_buffer_ = (Dtype *)(variance_memory[stats_batch_idx]->get_data_handle()); this->blobs_[2]->mutable_cpu_data()[0] *= moving_average_fraction_; this->blobs_[2]->mutable_cpu_data()[0] += 1; caffe_cpu_axpby(this->channels_, Dtype(1), mean_buffer_, moving_average_fraction_, this->blobs_[0]->mutable_cpu_data()); - int m = bottom[0]->count()/channels_; + int m = bottom[0]->count()/num_stats_batches_/channels_; Dtype bias_correction_factor = m > 1 ? Dtype(m)/(m-1) : 1; caffe_cpu_axpby(this->channels_, bias_correction_factor, variance_buffer_, moving_average_fraction_, this->blobs_[1]->mutable_cpu_data()); + } } } @@ -359,7 +408,7 @@ void MKLDNNBatchNormLayer::InitBatchNormBwd( memory::data_type mpcsn = memory::data_type::f32; // ---- Initialize memory descriptors ------------- - shared_ptr top_diff_md, top_data_md; + shared_ptr top_diff_md, top_diff_stats_md, top_data_md, output_stats_md; shared_ptr usr_diff_mpd(NULL), prv_diff_mpd(NULL); if (top_diff_is_prv) { shared_ptr > mem_descr @@ -371,10 +420,18 @@ void MKLDNNBatchNormLayer::InitBatchNormBwd( top_diff_md.reset(new memory::desc({{n, c, h, w}}, mpcsn, memory::format::nchw)); //MKLDNN batch norm only support 4D memory descriptor! 
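With the stats-batch loop in Forward_cpu, the running mean and variance are updated once per group, and m is now the per-group sample count (bottom[0]->count() / num_stats_batches_ / channels_). A sketch of what one iteration of that update computes, equivalent to the caffe_cpu_axpby calls above (plain C++, illustrative names):

// One running-statistics update; caffe_cpu_axpby(n, a, x, b, y) computes
// y = a*x + b*y, which is exactly the per-channel loop below.
void update_running_stats(float* running_mean, float* running_var,
                          const float* batch_mean, const float* batch_var,
                          int channels, int m, float moving_average_fraction) {
  // m samples per channel in one stats batch; m/(m-1) converts the biased
  // batch variance into an unbiased estimate before it enters the average.
  const float bias_correction = (m > 1) ? static_cast<float>(m) / (m - 1) : 1.0f;
  for (int c = 0; c < channels; ++c) {
    running_mean[c] = batch_mean[c] + moving_average_fraction * running_mean[c];
    running_var[c] = bias_correction * batch_var[c]
                     + moving_average_fraction * running_var[c];
  }
}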
usr_diff_mpd.reset(new memory::primitive_desc(*top_diff_md, cpu_engine)); } + top_diff_stats_md.reset(new memory::desc(*top_diff_md)); + CHECK(top_diff_stats_md->data.ndims > 0 && + top_diff_stats_md->data.dims[0] == this->num_); + top_diff_stats_md->data.dims[0] = stats_batch_size_; + output_stats_md.reset(new memory::desc(output_memory->get_primitive_desc().desc())); + CHECK(output_stats_md->data.ndims > 0 && + output_stats_md->data.dims[0] == this->num_); + output_stats_md->data.dims[0] = stats_batch_size_; // ---- Initialize bnrm primitive descriptor ------------- batch_normalization_backward::desc BatchNormBwd_desc(prop_kind::backward, - *top_diff_md, output_memory->get_primitive_desc().desc(), eps_, + *top_diff_stats_md, *output_stats_md, eps_, flags); // ---- Determining engine to use ----------------------- std::string subengines = this->layer_param_.engine(); @@ -396,6 +453,11 @@ void MKLDNNBatchNormLayer::InitBatchNormBwd( CHECK(BatchNormBwd_pd); + if (use_weight_bias_) { + bwd_scaleshift_diff_memory.reset(new memory( + BatchNormFwd_pd->weights_primitive_desc())); + } + // --- init primitive and prv_memory descriptors ---------------------- bwd_top_diff.reset(new MKLDNNDiff(usr_diff_mpd, prv_diff_mpd, top[0], this)); bwd_top_diff->name = "bwd_top_diff_data @ " + this->layer_param_.name(); @@ -405,17 +467,11 @@ void MKLDNNBatchNormLayer::InitBatchNormBwd( bwd_bottom_diff->name = "bwd_bottom_diff_data @ " + this->layer_param_.name(); bwd_bottom_diff_memory = bwd_bottom_diff->create_output_memory(inplace); - if (use_weight_bias_) { - bwd_scaleshift_diff_memory.reset(new memory( - BatchNormFwd_pd->weights_primitive_desc())); - BatchNormBwd.reset(new batch_normalization_backward(*BatchNormBwd_pd, - *input_primitive, *mean_memory, *variance_memory, - *bwd_top_diff_primitive, *scaleshift_memory, - *bwd_bottom_diff_memory, *bwd_scaleshift_diff_memory)); - } else { - BatchNormBwd.reset(new batch_normalization_backward(*BatchNormBwd_pd, - *input_primitive, *mean_memory, *variance_memory, - *bwd_top_diff_primitive, *bwd_bottom_diff_memory)); + top_diff_stats.resize(num_stats_batches_); + bottom_diff_stats.resize(num_stats_batches_); + BatchNormBwd.resize(num_stats_batches_); + for (int i = 0; i < num_stats_batches_; i++) { + InitBatchNormBwdPrimitive(i); } //bwd_top_diff->set_mkldnn_primitive(BatchNormBwd); //Wrong passed primitive! (TODO: Checking!) 
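Note that bwd_scaleshift_diff_memory is now allocated once, before the per-group backward primitives are built, so every BatchNormBwd[i] writes its scale/shift gradients into the same 2*channels buffer; the Backward_cpu hunk further on accordingly switches the weight diffs from assignment to accumulation. A sketch of that pattern, where submit_backward is a hypothetical stand-in for submitting BatchNormBwd[i]:

#include <vector>

// Illustrative accumulation across stats batches: each submit overwrites
// the shared scale/shift gradient buffer, so its contents are folded into
// the persistent weight diffs before the next group runs.
void accumulate_scaleshift_diffs(int num_stats_batches, int channels,
                                 float* scale_diff, float* shift_diff,
                                 void (*submit_backward)(int, float*)) {
  std::vector<float> scaleshift_diff(2 * channels);
  for (int i = 0; i < num_stats_batches; ++i) {
    submit_backward(i, scaleshift_diff.data());  // runs one backward group
    for (int c = 0; c < channels; ++c) {
      scale_diff[c] += scaleshift_diff[c];             // blobs_[3] diff
      shift_diff[c] += scaleshift_diff[channels + c];  // blobs_[4] diff
    }
  }
}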
@@ -427,6 +483,23 @@ void MKLDNNBatchNormLayer::InitBatchNormBwd( bwd_bottom_diff->set_mkldnn_primitive(bwd_bottom_diff_memory_transfer); } +template +void MKLDNNBatchNormLayer::InitBatchNormBwdPrimitive(int idx) { + top_diff_stats[idx] = GetStatsBatchMemory(bwd_top_diff, idx); + bottom_diff_stats[idx] = GetStatsBatchMemory(bwd_bottom_diff, idx); + + if (use_weight_bias_) { + BatchNormBwd[idx].reset(new batch_normalization_backward(*BatchNormBwd_pd, + *input_stats[idx], *mean_memory[idx], *variance_memory[idx], + *top_diff_stats[idx], *scaleshift_memory, + *bottom_diff_stats[idx], *bwd_scaleshift_diff_memory)); + } else { + BatchNormBwd[idx].reset(new batch_normalization_backward(*BatchNormBwd_pd, + *input_stats[idx], *mean_memory[idx], *variance_memory[idx], + *top_diff_stats[idx], *bottom_diff_stats[idx])); + } +} + template void MKLDNNBatchNormLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) @@ -443,53 +516,56 @@ void MKLDNNBatchNormLayer::Backward_cpu(const vector*>& top, // update bottom that head at prv bwd_bottom_diff->sync_before_write(); - PERFORMANCE_EVENT_ID_INIT(perf_id_bw_, PERFORMANCE_MKLDNN_NAME("BW")); - PERFORMANCE_MEASUREMENT_BEGIN(); + for (int stats_batch_idx = 0; stats_batch_idx < num_stats_batches_; stats_batch_idx++) { + + PERFORMANCE_EVENT_ID_INIT(perf_id_bw_, PERFORMANCE_MKLDNN_NAME("BW")); + PERFORMANCE_MEASUREMENT_BEGIN(); #ifdef DEBUG - if (bottom[0]->prv_data() != NULL) - { + if (bottom[0]->prv_data() != NULL) + { LOG(INFO) << "Debug: Bottom prv data: " << *bottom[0]->prv_data(); - } - else - { + } + else + { LOG(INFO) << "Debug: Bottom prv data is NULL!"; - } - - if (top[0]->prv_diff() != NULL) - { + } + + if (top[0]->prv_diff() != NULL) + { LOG(INFO) << "Debug: Top prv diff: " << *top[0]->prv_diff(); - } - else - { + } + else + { LOG(INFO) << "Debug: Top prv diff is NULL!"; LOG(INFO) << "Debug: Top cpu diff: " << *top[0]->cpu_diff(); - } + } #endif - BatchNormBwd.submit(); + BatchNormBwd[stats_batch_idx].submit(); #ifdef DEBUG - if (bottom[0]->prv_diff() != NULL) - { + if (bottom[0]->prv_diff() != NULL) + { LOG(INFO) << "Debug: Bottom prv diff: " << *bottom[0]->prv_diff(); - } - else - { + } + else + { LOG(INFO) << "Debug: Bottom prv diff is NULL!"; LOG(INFO) << "Debug: Bottom cpu diff: " << *bottom[0]->cpu_diff(); - } + } #endif - PERFORMANCE_MEASUREMENT_END_ID(perf_id_bw_); + PERFORMANCE_MEASUREMENT_END_ID(perf_id_bw_); - /* FIXME: this wouldn't work with lazy stream */ - if (use_weight_bias_) { + /* FIXME: this wouldn't work with lazy stream */ + if (use_weight_bias_) { Dtype* dw = (Dtype *)(bwd_scaleshift_diff_memory->get_data_handle()); for (int i = 0; i < this->channels_; i++) - this->blobs_[3]->mutable_cpu_diff()[i] = dw[i]; + this->blobs_[3]->mutable_cpu_diff()[i] += dw[i]; if (bias_term_) { dw += channels_; for (int i = 0; i < this->channels_; i++) - this->blobs_[4]->mutable_cpu_diff()[i] = dw[i]; + this->blobs_[4]->mutable_cpu_diff()[i] += dw[i]; } + } } } diff --git a/src/caffe/mkldnn_memory.cpp b/src/caffe/mkldnn_memory.cpp index bacb6ae61..6e42e691d 100644 --- a/src/caffe/mkldnn_memory.cpp +++ b/src/caffe/mkldnn_memory.cpp @@ -453,6 +453,32 @@ shared_ptr MKLDNNMemoryDescriptor::create_output_memory( return omem; } +template +Dtype* MKLDNNMemoryDescriptor::get_memory_ptr(long offset) { + if (this->conversion_needed()) { + // TODO: support DFP16 offset + if (this->prv_ptr() != NULL) return (Dtype*)this->prv_ptr() + offset; + // when _internal_ptr is null, having same private layout as _blob + else 
return is_diff ?
+          (Dtype*)this->_blob->prv_diff() + offset :
+          (Dtype*)this->_blob->prv_data() + offset;
+    } else {
+        return const_cast<Dtype*>(
+          is_diff ? this->_blob->cpu_diff() + offset : this->_blob->cpu_data() + offset);
+    }
+}
+
+template <typename Dtype, bool is_diff>
+shared_ptr<memory::desc> MKLDNNMemoryDescriptor<Dtype, is_diff>::get_memory_desc() {
+    shared_ptr<memory::desc> desc;
+    if (this->conversion_needed()) {
+        desc.reset(new memory::desc(this->prv_memory_pd()->desc()));
+    } else {
+        desc.reset(new memory::desc(this->usr_memory_pd()->desc()));
+    }
+    return desc;
+}
+
 template <typename Dtype, bool is_diff>
 shared_ptr<MKLDNNMemoryDescriptor<Dtype, is_diff> > get_mkldnn_prv_descriptor(Blob<Dtype>* blob)
 {

From 4e4aecb2d9391056a60449271a3f3845eaf4cb6d Mon Sep 17 00:00:00 2001
From: "Yu, Chong"
Date: Mon, 21 Aug 2017 11:10:23 +0800
Subject: [PATCH 27/38] Update MKLDNN version to 27420a241b2efd8d88f1e003635434194fdfb1b8

---
 mkldnn.commit | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mkldnn.commit b/mkldnn.commit
index 7eb0167ed..4c279b216 100644
--- a/mkldnn.commit
+++ b/mkldnn.commit
@@ -1 +1 @@
-171572a205c71f5bbb08657de5660c9d06cf2d8f
+27420a241b2efd8d88f1e003635434194fdfb1b8

From 445a396817ccea903ad538e066a84f5252a82cd0 Mon Sep 17 00:00:00 2001
From: linxinan
Date: Tue, 22 Aug 2017 20:27:03 +0800
Subject: [PATCH 28/38] add vgg_16_8nodes solver prototxt

---
 .../multinode/vgg_16_8nodes/solver.prototxt   |  14 +
 .../vgg_16_8nodes/train_val.prototxt          | 612 ++++++++++++++++++
 2 files changed, 626 insertions(+)
 create mode 100644 models/intel_optimized_models/multinode/vgg_16_8nodes/solver.prototxt
 create mode 100644 models/intel_optimized_models/multinode/vgg_16_8nodes/train_val.prototxt

diff --git a/models/intel_optimized_models/multinode/vgg_16_8nodes/solver.prototxt b/models/intel_optimized_models/multinode/vgg_16_8nodes/solver.prototxt
new file mode 100644
index 000000000..1b55e4c7d
--- /dev/null
+++ b/models/intel_optimized_models/multinode/vgg_16_8nodes/solver.prototxt
@@ -0,0 +1,14 @@
+net: "models/intel_optimized_models/multinode/vgg_16_8nodes/train_val.prototxt"
+test_iter: 1563
+test_interval: 10000
+momentum: 0.9
+weight_decay: 0.0005
+base_lr: 0.01
+lr_policy: "poly"
+power: 2
+max_iter: 300000
+display: 40
+snapshot: 100000
+solver_mode: CPU
+snapshot_prefix: "models/intel_optimized_models/multinode/vgg_16_8nodes"
+
diff --git a/models/intel_optimized_models/multinode/vgg_16_8nodes/train_val.prototxt b/models/intel_optimized_models/multinode/vgg_16_8nodes/train_val.prototxt
new file mode 100644
index 000000000..5571737db
--- /dev/null
+++ b/models/intel_optimized_models/multinode/vgg_16_8nodes/train_val.prototxt
@@ -0,0 +1,612 @@
+name: "VGG_ILSVRC_16_layer"
+layer {
+  name: "data"
+  type: "Data"
+  include {
+    phase: TRAIN
+  }
+  transform_param {
+    crop_size: 224
+    mean_value: 104
+    mean_value: 117
+    mean_value: 124
+    mirror: true
+  }
+  data_param {
+    source: "examples/imagenet/ilsvrc12_train_lmdb"
+    batch_size: 32
+    backend: LMDB
+  }
+  top: "data"
+  top: "label"
+}
+layer {
+  name: "data"
+  type: "Data"
+  include {
+    phase: TEST
+  }
+  transform_param {
+    crop_size: 224
+    mean_value: 104
+    mean_value: 117
+    mean_value: 124
+    mirror: false
+  }
+  data_param {
+    source: "examples/imagenet/ilsvrc12_val_lmdb"
+    batch_size: 50
+    backend: LMDB
+  }
+  top: "data"
+  top: "label"
+}
+layer {
+  name: "conv1_1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1_1"
+  convolution_param {
+
+    num_output: 64
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+
+
+}
+layer {
+  bottom: "conv1_1"
+  top: "conv1_1"
+  name: "relu1_1"
+  type: "ReLU"
+
relu_param { + + } +} +layer { + bottom: "conv1_1" + top: "conv1_2" + name: "conv1_2" + type: "Convolution" + convolution_param { + + num_output: 64 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } + + +} +layer { + bottom: "conv1_2" + top: "conv1_2" + name: "relu1_2" + type: "ReLU" + relu_param { + + } +} +layer { + bottom: "conv1_2" + top: "pool1" + name: "pool1" + type: "Pooling" + pooling_param { + + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + bottom: "pool1" + top: "conv2_1" + name: "conv2_1" + type: "Convolution" + convolution_param { + + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } + + +} +layer { + bottom: "conv2_1" + top: "conv2_1" + name: "relu2_1" + type: "ReLU" + relu_param { + + } +} +layer { + bottom: "conv2_1" + top: "conv2_2" + name: "conv2_2" + type: "Convolution" + convolution_param { + + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } + + +} +layer { + bottom: "conv2_2" + top: "conv2_2" + name: "relu2_2" + type: "ReLU" + relu_param { + + } +} +layer { + bottom: "conv2_2" + top: "pool2" + name: "pool2" + type: "Pooling" + pooling_param { + + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + bottom: "pool2" + top: "conv3_1" + name: "conv3_1" + type: "Convolution" + convolution_param { + + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } + + +} +layer { + bottom: "conv3_1" + top: "conv3_1" + name: "relu3_1" + type: "ReLU" + relu_param { + + } +} +layer { + bottom: "conv3_1" + top: "conv3_2" + name: "conv3_2" + type: "Convolution" + convolution_param { + + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } + + +} +layer { + bottom: "conv3_2" + top: "conv3_2" + name: "relu3_2" + type: "ReLU" + relu_param { + + } +} +layer { + bottom: "conv3_2" + top: "conv3_3" + name: "conv3_3" + type: "Convolution" + convolution_param { + + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } + + +} +layer { + bottom: "conv3_3" + top: "conv3_3" + name: "relu3_3" + type: "ReLU" + relu_param { + + } +} +layer { + bottom: "conv3_3" + top: "pool3" + name: "pool3" + type: "Pooling" + pooling_param { + + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + bottom: "pool3" + top: "conv4_1" + name: "conv4_1" + type: "Convolution" + convolution_param { + + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } + + +} +layer { + bottom: "conv4_1" + top: "conv4_1" + name: "relu4_1" + type: "ReLU" + relu_param { + + } +} +layer { + bottom: "conv4_1" + top: "conv4_2" + name: "conv4_2" + type: "Convolution" + convolution_param { + + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } + + +} +layer { + bottom: "conv4_2" + top: "conv4_2" + name: "relu4_2" + type: "ReLU" + relu_param { + + } +} +layer { + bottom: "conv4_2" + top: "conv4_3" + name: "conv4_3" + type: "Convolution" + convolution_param { + + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + 
} + + +} +layer { + bottom: "conv4_3" + top: "conv4_3" + name: "relu4_3" + type: "ReLU" + relu_param { + + } +} +layer { + bottom: "conv4_3" + top: "pool4" + name: "pool4" + type: "Pooling" + pooling_param { + + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + bottom: "pool4" + top: "conv5_1" + name: "conv5_1" + type: "Convolution" + convolution_param { + + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } + + +} +layer { + bottom: "conv5_1" + top: "conv5_1" + name: "relu5_1" + type: "ReLU" + relu_param { + + } +} +layer { + bottom: "conv5_1" + top: "conv5_2" + name: "conv5_2" + type: "Convolution" + convolution_param { + + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } + + +} +layer { + bottom: "conv5_2" + top: "conv5_2" + name: "relu5_2" + type: "ReLU" + relu_param { + + } +} +layer { + bottom: "conv5_2" + top: "conv5_3" + name: "conv5_3" + type: "Convolution" + convolution_param { + + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.0 + } + } + + +} +layer { + bottom: "conv5_3" + top: "conv5_3" + name: "relu5_3" + type: "ReLU" + relu_param { + + } +} +layer { + bottom: "conv5_3" + top: "pool5" + name: "pool5" + type: "Pooling" + pooling_param { + + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + bottom: "pool5" + top: "fc6" + name: "fc6" + type: "InnerProduct" + inner_product_param { + num_output: 4096 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.1 + } + } + + +} +layer { + bottom: "fc6" + top: "fc6" + name: "relu6" + type: "ReLU" + relu_param { + + } +} +layer { + bottom: "fc6" + top: "fc6" + name: "drop6" + type: "Dropout" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + bottom: "fc6" + top: "fc7" + name: "fc7" + type: "InnerProduct" + inner_product_param { + num_output: 4096 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.1 + } + } + + +} +layer { + bottom: "fc7" + top: "fc7" + name: "relu7" + type: "ReLU" + relu_param { + + } +} +layer { + bottom: "fc7" + top: "fc7" + name: "drop7" + type: "Dropout" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + name: "fc8" + bottom: "fc7" + top: "fc8" + type: "InnerProduct" + inner_product_param { + num_output: 1000 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0.1 + } + } + + +} +layer { + name: "loss" + type: "SoftmaxWithLoss" + bottom: "fc8" + bottom: "label" + top: "loss/loss" +} +layer { + name: "accuracy/top1" + type: "Accuracy" + bottom: "fc8" + bottom: "label" + top: "accuracy@1" + include: { phase: TEST } + accuracy_param { + top_k: 1 + } +} +layer { + name: "accuracy/top5" + type: "Accuracy" + bottom: "fc8" + bottom: "label" + top: "accuracy@5" + include: { phase: TEST } + accuracy_param { + top_k: 5 + } +} From d3ec15c34388d4f40659f243bf5ae482e3a73252 Mon Sep 17 00:00:00 2001 From: "Gong, Jiong" Date: Tue, 22 Aug 2017 22:37:09 +0800 Subject: [PATCH 29/38] fix icl197 and a regression in bn --- src/caffe/layers/mkldnn_batch_norm_layer.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/caffe/layers/mkldnn_batch_norm_layer.cpp b/src/caffe/layers/mkldnn_batch_norm_layer.cpp index 6688f8584..91c753bba 100644 --- a/src/caffe/layers/mkldnn_batch_norm_layer.cpp +++ b/src/caffe/layers/mkldnn_batch_norm_layer.cpp @@ -62,6 
+62,8 @@ void MKLDNNBatchNormLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom
   bias_term_ = this->layer_param_.batch_norm_param().bias_term();
   moving_average_fraction_ = this->layer_param_.batch_norm_param().moving_average_fraction();
   use_global_stats_ = this->phase_ == TEST;
+  if (this->layer_param_.batch_norm_param().has_use_global_stats())
+    use_global_stats_ = this->layer_param_.batch_norm_param().use_global_stats();

   this->blobs_.resize(3 + (use_weight_bias_ ? 1:0) + (use_weight_bias_ && bias_term_ ? 1:0));
@@ -299,13 +301,13 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNormFwdPrimitive(int idx) {
                  static_cast<Dtype*>(variance_memory[idx]->get_data_handle()));
       if (use_weight_bias_) {
         BatchNormFwd[idx].reset(new batch_normalization_forward(*BatchNormFwd_pd,
-                *input_stats[idx], *mean_memory[idx],
-                *variance_memory[idx], *scaleshift_memory,
+                *input_stats[idx], (const primitive::at)*mean_memory[idx],
+                (const primitive::at)*variance_memory[idx], *scaleshift_memory,
                 *output_stats[idx]));
       } else {
         BatchNormFwd[idx].reset(new batch_normalization_forward(*BatchNormFwd_pd,
-                *input_stats[idx], *mean_memory[idx],
-                *variance_memory[idx], *output_stats[idx]));
+                *input_stats[idx], (const primitive::at)*mean_memory[idx],
+                (const primitive::at)*variance_memory[idx], *output_stats[idx]));
       }
     } else {
       if (use_weight_bias_) {

From 3fb9b9acb6bbf776119eb37b7f0455ad22501d71 Mon Sep 17 00:00:00 2001
From: "Yu, Chong"
Date: Wed, 23 Aug 2017 22:24:30 +0800
Subject: [PATCH 30/38] Avoid unnecessary scale and shift copying back and forth in BatchNorm.

---
 include/caffe/layers/mkldnn_layers.hpp       |  2 +
 src/caffe/layers/mkldnn_batch_norm_layer.cpp | 49 +++++++++-----------
 2 files changed, 23 insertions(+), 28 deletions(-)

diff --git a/include/caffe/layers/mkldnn_layers.hpp b/include/caffe/layers/mkldnn_layers.hpp
index bf23438bd..7d5e0dbed 100644
--- a/include/caffe/layers/mkldnn_layers.hpp
+++ b/include/caffe/layers/mkldnn_layers.hpp
@@ -71,6 +71,7 @@ class MKLDNNBatchNormLayer : public MKLDNNLayer<Dtype>, public Layer<Dtype> {
         , scaleshift_memory(), bwd_scaleshift_diff_memory()
         , output_memory(), bwd_bottom_diff_memory()
         , input_primitive(), bwd_top_diff_primitive()
+        , scaleshift_combination()
     {
       PERFORMANCE_EVENT_ID_RESET(perf_id_fw_);
       PERFORMANCE_EVENT_ID_RESET(perf_id_bw_);
@@ -118,6 +119,7 @@ class MKLDNNBatchNormLayer : public MKLDNNLayer<Dtype>, public Layer<Dtype> {
     bool use_weight_bias_, bias_term_, use_global_stats_;
     int num_stats_batches_;
     int stats_batch_size_;
+    shared_ptr<Blob<Dtype>> scaleshift_combination;

     PERFORMANCE_EVENT_ID_DECL(perf_id_fw_);
     PERFORMANCE_EVENT_ID_DECL(perf_id_bw_);

diff --git a/src/caffe/layers/mkldnn_batch_norm_layer.cpp b/src/caffe/layers/mkldnn_batch_norm_layer.cpp
index 91c753bba..0b9bb2343 100644
--- a/src/caffe/layers/mkldnn_batch_norm_layer.cpp
+++ b/src/caffe/layers/mkldnn_batch_norm_layer.cpp
@@ -78,6 +78,19 @@ void MKLDNNBatchNormLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom
                this->blobs_[i]->mutable_cpu_data());
   }

+  //IntelCaffe treat scale and shift as different blobs, so current MKL-DNN integration has additional copies from Caffe to MKL-DNN buffer on fwd pass and from MKL-DNN to Caffe buffer on bwd pass.
+  //Optimization: use the temp blob to combine the scale and shift together. Avoid the additional copies.
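Concretely, the optimization backs blobs_[3] (scale) and blobs_[4] (shift) with one contiguous buffer of 2*channels values that the MKL-DNN weights memory wraps directly. A schematic sketch of the layout, assuming plain std::vector storage rather than the Blob API:

#include <vector>

// Schematic of the combined scale/shift layout: scale occupies elements
// [0, C) and shift occupies [C, 2C), so a single pointer can back the
// MKL-DNN weights memory with no per-iteration copying.
struct ScaleShift {
  int channels;
  std::vector<float> buf;  // handed to the MKL-DNN weights memory
  explicit ScaleShift(int c) : channels(c), buf(2 * c, 0.0f) {}
  float* scale() { return buf.data(); }             // aliased as blobs_[3]
  float* shift() { return buf.data() + channels; }  // aliased as blobs_[4]
};

Because the buffer is zero-initialized, the shift half already holds the zero bias that MKL-DNN expects when bias_term_ is false.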
+ // Initialize scale and shift combination blob + vector scaleshift_combination_shape(1); + scaleshift_combination_shape[0] = 2*channels_; + this->scaleshift_combination.reset(new Blob(scaleshift_combination_shape)); + //Should initialize the scaleshift_combine buffer to 0, because when bias_term_ == false, need to pass zero bias to MKLDNN + caffe_set(scaleshift_combination_shape[0], static_cast(0), + scaleshift_combination->mutable_cpu_data()); + //Not so necessary, because the diff will initialize to 0 automatically + caffe_set(scaleshift_combination_shape[0], static_cast(0), + scaleshift_combination->mutable_cpu_diff()); + if (use_weight_bias_) { // Initialize scale and shift vector scaleshift_shape(1); @@ -85,6 +98,8 @@ void MKLDNNBatchNormLayer::LayerSetUp(const vector*>& bottom VLOG(1) << "MKLDNNBatchNormLayer::LayerSetUp: channels_ = " << channels_; this->blobs_[3].reset(new Blob(scaleshift_shape)); + this->blobs_[3]->set_cpu_data(scaleshift_combination->mutable_cpu_data()); + this->blobs_[3]->set_cpu_diff(scaleshift_combination->mutable_cpu_diff()); FillerParameter filler_param(this->layer_param_.batch_norm_param().filler()); if (!this->layer_param_.batch_norm_param().has_filler()) { filler_param.set_type("constant"); @@ -94,8 +109,10 @@ void MKLDNNBatchNormLayer::LayerSetUp(const vector*>& bottom VLOG(1) << "MKLDNNBatchNormLayer::LayerSetUp: scaleshift " << __LINE__ << ":" << this->layer_param_.name(); filler->Fill(this->blobs_[3].get()); - if ( bias_term_ ) { + if (bias_term_) { this->blobs_[4].reset(new Blob(scaleshift_shape)); + this->blobs_[4]->set_cpu_data(scaleshift_combination->mutable_cpu_data() + scaleshift_combination->offset(channels_)); + this->blobs_[4]->set_cpu_diff(scaleshift_combination->mutable_cpu_diff() + scaleshift_combination->offset(channels_)); FillerParameter bias_filler_param(this->layer_param_.batch_norm_param().bias_filler()); if (!this->layer_param_.batch_norm_param().has_bias_filler()) { bias_filler_param.set_type("constant"); @@ -212,7 +229,7 @@ void MKLDNNBatchNormLayer::InitBatchNorm(const vector*>& bott // ---- Create memory --------------------- if (use_weight_bias_) { - scaleshift_memory.reset(new memory(BatchNormFwd_pd->weights_primitive_desc())); + scaleshift_memory.reset(new memory(BatchNormFwd_pd->weights_primitive_desc(), this->scaleshift_combination->mutable_cpu_data())); } // --- init primitive and prv_memory descriptors ---------------------- @@ -352,18 +369,7 @@ void MKLDNNBatchNormLayer::Forward_cpu(const vector*>& bottom caffe_cpu_scale(this->blobs_[1]->count(), scale_factor, this->blobs_[1]->cpu_data(), variance_buffer_); } - if (use_weight_bias_) { - Dtype* scaleShift_buffer_ = (Dtype *)(scaleshift_memory->get_data_handle()); - // Fill ScaleShift buffer - for (int i = 0; i < this->channels_; i++) { - scaleShift_buffer_[i] = this->blobs_[3]->cpu_data()[i]; - scaleShift_buffer_[channels_ + i] = 0; - if (bias_term_) { - scaleShift_buffer_[channels_ + i] = this->blobs_[4]->cpu_data()[i]; - } - } - } - + PERFORMANCE_EVENT_ID_INIT(perf_id_fw_, PERFORMANCE_MKLDNN_NAME("FW")); PERFORMANCE_MEASUREMENT_BEGIN(); BatchNormFwd[stats_batch_idx].submit(); @@ -457,7 +463,7 @@ void MKLDNNBatchNormLayer::InitBatchNormBwd( if (use_weight_bias_) { bwd_scaleshift_diff_memory.reset(new memory( - BatchNormFwd_pd->weights_primitive_desc())); + BatchNormFwd_pd->weights_primitive_desc(), this->scaleshift_combination->mutable_cpu_diff())); } // --- init primitive and prv_memory descriptors ---------------------- @@ -555,19 +561,6 @@ void 
MKLDNNBatchNormLayer::Backward_cpu(const vector*>& top, } #endif PERFORMANCE_MEASUREMENT_END_ID(perf_id_bw_); - - /* FIXME: this wouldn't work with lazy stream */ - if (use_weight_bias_) { - Dtype* dw = (Dtype *)(bwd_scaleshift_diff_memory->get_data_handle()); - for (int i = 0; i < this->channels_; i++) - this->blobs_[3]->mutable_cpu_diff()[i] += dw[i]; - - if (bias_term_) { - dw += channels_; - for (int i = 0; i < this->channels_; i++) - this->blobs_[4]->mutable_cpu_diff()[i] += dw[i]; - } - } } } From bf824c47654bc22b12f5f5f757b049bda7611ada Mon Sep 17 00:00:00 2001 From: "Gong, Jiong" Date: Fri, 25 Aug 2017 22:21:41 +0800 Subject: [PATCH 31/38] support scaleshift accum with stats batch size>1 Change-Id: I3b1a16dae1a6a2965b43ce61109d0a58b70e9093 --- include/caffe/layers/mkldnn_layers.hpp | 5 +- src/caffe/layers/mkldnn_batch_norm_layer.cpp | 63 +++++++++++++------- 2 files changed, 43 insertions(+), 25 deletions(-) diff --git a/include/caffe/layers/mkldnn_layers.hpp b/include/caffe/layers/mkldnn_layers.hpp index 7d5e0dbed..f7ce1062e 100644 --- a/include/caffe/layers/mkldnn_layers.hpp +++ b/include/caffe/layers/mkldnn_layers.hpp @@ -71,7 +71,6 @@ class MKLDNNBatchNormLayer : public MKLDNNLayer, public Layer { , scaleshift_memory(), bwd_scaleshift_diff_memory() , output_memory(), bwd_bottom_diff_memory() , input_primitive(), bwd_top_diff_primitive() - , scaleshift_combination() { PERFORMANCE_EVENT_ID_RESET(perf_id_fw_); PERFORMANCE_EVENT_ID_RESET(perf_id_bw_); @@ -100,6 +99,7 @@ class MKLDNNBatchNormLayer : public MKLDNNLayer, public Layer { void InitBatchNormBwdPrimitive(int stats_batch_idx); template shared_ptr GetStatsBatchMemory( shared_ptr > mkldnn_data, int idx); + void InitStatsBatchVars(int batch_size); shared_ptr > fwd_top_data, fwd_bottom_data; shared_ptr > bwd_top_diff, bwd_bottom_diff; shared_ptr BatchNormFwd_pd; @@ -119,7 +119,8 @@ class MKLDNNBatchNormLayer : public MKLDNNLayer, public Layer { bool use_weight_bias_, bias_term_, use_global_stats_; int num_stats_batches_; int stats_batch_size_; - shared_ptr> scaleshift_combination; + shared_ptr > scaleshift_blob_; + shared_ptr > scaleshift_acc_; PERFORMANCE_EVENT_ID_DECL(perf_id_fw_); PERFORMANCE_EVENT_ID_DECL(perf_id_bw_); diff --git a/src/caffe/layers/mkldnn_batch_norm_layer.cpp b/src/caffe/layers/mkldnn_batch_norm_layer.cpp index 0b9bb2343..f1edfebd4 100644 --- a/src/caffe/layers/mkldnn_batch_norm_layer.cpp +++ b/src/caffe/layers/mkldnn_batch_norm_layer.cpp @@ -44,6 +44,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace caffe { +template +void MKLDNNBatchNormLayer::InitStatsBatchVars(int batch_size) { + num_stats_batches_ = 1; + stats_batch_size_ = batch_size; + BatchNormParameter param = this->layer_param_.batch_norm_param(); + if (!use_global_stats_ && param.stats_batch_size() > 0) { + CHECK_EQ(batch_size % param.stats_batch_size(), 0); + num_stats_batches_ = batch_size / param.stats_batch_size(); + stats_batch_size_ = param.stats_batch_size(); + } +} + template void MKLDNNBatchNormLayer::LayerSetUp(const vector*>& bottom ,const vector*>& top) @@ -65,6 +77,8 @@ void MKLDNNBatchNormLayer::LayerSetUp(const vector*>& bottom if (this->layer_param_.batch_norm_param().has_use_global_stats()) use_global_stats_ = this->layer_param_.batch_norm_param().use_global_stats(); + InitStatsBatchVars(num_); + this->blobs_.resize(3 + (use_weight_bias_ ? 1:0) + (use_weight_bias_ && bias_term_ ? 
1:0)); vector sz; @@ -81,15 +95,18 @@ void MKLDNNBatchNormLayer::LayerSetUp(const vector*>& bottom //IntelCaffe treat scale and shift as different blobs, so current MKL-DNN integration has additional copies from Caffe to MKL-DNN buffer on fwd pass and from MKL-DNN to Caffe buffer on bwd pass. //Optimization: use the temp blob to combine the scale and shift together. Avoid the additional copies. // Initialize scale and shift combination blob - vector scaleshift_combination_shape(1); - scaleshift_combination_shape[0] = 2*channels_; - this->scaleshift_combination.reset(new Blob(scaleshift_combination_shape)); - //Should initialize the scaleshift_combine buffer to 0, because when bias_term_ == false, need to pass zero bias to MKLDNN - caffe_set(scaleshift_combination_shape[0], static_cast(0), - scaleshift_combination->mutable_cpu_data()); - //Not so necessary, because the diff will initialize to 0 automatically - caffe_set(scaleshift_combination_shape[0], static_cast(0), - scaleshift_combination->mutable_cpu_diff()); + vector scaleshift_blob_shape(1); + scaleshift_blob_shape[0] = 2*channels_; + scaleshift_blob_.reset(new Blob(scaleshift_blob_shape)); + //Should initialize the scaleshift_blob_ buffer to 0, because when bias_term_ == false, need to pass zero bias to MKLDNN + caffe_set(scaleshift_blob_shape[0], static_cast(0), + scaleshift_blob_->mutable_cpu_data()); + shared_ptr > scaleshift_diff_blob = scaleshift_blob_; + scaleshift_acc_ = scaleshift_blob_; + if (num_stats_batches_ > 1) { + this->scaleshift_acc_.reset(new Blob(scaleshift_blob_shape)); + scaleshift_diff_blob = scaleshift_acc_; + } if (use_weight_bias_) { // Initialize scale and shift @@ -98,8 +115,8 @@ void MKLDNNBatchNormLayer::LayerSetUp(const vector*>& bottom VLOG(1) << "MKLDNNBatchNormLayer::LayerSetUp: channels_ = " << channels_; this->blobs_[3].reset(new Blob(scaleshift_shape)); - this->blobs_[3]->set_cpu_data(scaleshift_combination->mutable_cpu_data()); - this->blobs_[3]->set_cpu_diff(scaleshift_combination->mutable_cpu_diff()); + this->blobs_[3]->set_cpu_data(scaleshift_blob_->mutable_cpu_data()); + this->blobs_[3]->set_cpu_diff(scaleshift_diff_blob->mutable_cpu_diff()); FillerParameter filler_param(this->layer_param_.batch_norm_param().filler()); if (!this->layer_param_.batch_norm_param().has_filler()) { filler_param.set_type("constant"); @@ -111,8 +128,8 @@ void MKLDNNBatchNormLayer::LayerSetUp(const vector*>& bottom if (bias_term_) { this->blobs_[4].reset(new Blob(scaleshift_shape)); - this->blobs_[4]->set_cpu_data(scaleshift_combination->mutable_cpu_data() + scaleshift_combination->offset(channels_)); - this->blobs_[4]->set_cpu_diff(scaleshift_combination->mutable_cpu_diff() + scaleshift_combination->offset(channels_)); + this->blobs_[4]->set_cpu_data(scaleshift_blob_->mutable_cpu_data() + scaleshift_blob_->offset(channels_)); + this->blobs_[4]->set_cpu_diff(scaleshift_diff_blob->mutable_cpu_diff() + scaleshift_blob_->offset(channels_)); FillerParameter bias_filler_param(this->layer_param_.batch_norm_param().bias_filler()); if (!this->layer_param_.batch_norm_param().has_bias_filler()) { bias_filler_param.set_type("constant"); @@ -149,14 +166,7 @@ void MKLDNNBatchNormLayer::Reshape(const vector*>& bottom this->num_ = bottom[0]->num(); this->channels_ = bottom[0]->channels(); - num_stats_batches_ = 1; - stats_batch_size_ = bottom[0]->shape(0); - BatchNormParameter param = this->layer_param_.batch_norm_param(); - if (!use_global_stats_ && param.stats_batch_size() > 0) { - CHECK_EQ(bottom[0]->shape(0) % 
param.stats_batch_size(), 0); - num_stats_batches_ = bottom[0]->shape(0) / param.stats_batch_size(); - stats_batch_size_ = param.stats_batch_size(); - } + InitStatsBatchVars(this->num_); //Fix: should reshape the top blob with the real size of bottom blob //top[0]->Reshape(this->num_, this->channels_, this->height_, this->width_); @@ -229,7 +239,7 @@ void MKLDNNBatchNormLayer::InitBatchNorm(const vector*>& bott // ---- Create memory --------------------- if (use_weight_bias_) { - scaleshift_memory.reset(new memory(BatchNormFwd_pd->weights_primitive_desc(), this->scaleshift_combination->mutable_cpu_data())); + scaleshift_memory.reset(new memory(BatchNormFwd_pd->weights_primitive_desc(), this->scaleshift_blob_->mutable_cpu_data())); } // --- init primitive and prv_memory descriptors ---------------------- @@ -463,7 +473,7 @@ void MKLDNNBatchNormLayer::InitBatchNormBwd( if (use_weight_bias_) { bwd_scaleshift_diff_memory.reset(new memory( - BatchNormFwd_pd->weights_primitive_desc(), this->scaleshift_combination->mutable_cpu_diff())); + BatchNormFwd_pd->weights_primitive_desc(), this->scaleshift_blob_->mutable_cpu_diff())); } // --- init primitive and prv_memory descriptors ---------------------- @@ -561,6 +571,13 @@ void MKLDNNBatchNormLayer::Backward_cpu(const vector*>& top, } #endif PERFORMANCE_MEASUREMENT_END_ID(perf_id_bw_); + if (num_stats_batches_ > 1) { + CHECK(scaleshift_blob_ != scaleshift_acc_); + CHECK(scaleshift_blob_->count() == scaleshift_acc_->count()); + caffe_cpu_axpby(scaleshift_acc_->count(), Dtype(1), + scaleshift_blob_->mutable_cpu_diff(), + Dtype(1), scaleshift_acc_->mutable_cpu_diff()); + } } } From 28c46874c16e7c7e10cc8325331ce77b47c83edb Mon Sep 17 00:00:00 2001 From: "Gong, Jiong" Date: Sat, 26 Aug 2017 02:30:56 +0800 Subject: [PATCH 32/38] add resnet 64 node prototxt Change-Id: I57b497a6a2f028d998301f23b64965b3ab24edff --- .../solver.prototxt | 19 + .../train_val.prototxt | 3322 +++++++++++++++++ 2 files changed, 3341 insertions(+) create mode 100644 models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/solver.prototxt create mode 100644 models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/train_val.prototxt diff --git a/models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/solver.prototxt b/models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/solver.prototxt new file mode 100644 index 000000000..4f4f21a93 --- /dev/null +++ b/models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/solver.prototxt @@ -0,0 +1,19 @@ +net: "models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/train_val.prototxt" +test_iter: 1000 +test_interval: 156 +test_initialization: false +display: 40 +base_lr: 3.2 +lr_policy: "multistep" +stepvalue:4680 +stepvalue:9360 +stepvalue:12480 +gamma: 0.1 +max_iter: 14075 +warmup_iter: 780 # 1281167 / 8192 * 5 epochs +warmup_start_lr: 0.1 +momentum: 0.9 +weight_decay: 0.0001 +snapshot: 156 +snapshot_prefix: "models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/resnet_50_64_nodes_8k" +solver_mode: CPU diff --git a/models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/train_val.prototxt b/models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/train_val.prototxt new file mode 100644 index 000000000..3dd57aaac --- /dev/null +++ b/models/intel_optimized_models/multinode/resnet_50_64_nodes_8k_batch/train_val.prototxt @@ -0,0 +1,3322 @@ +name: "ResNet-50" +bn_stats_batch_size: 32 +layer { + name: "data" + type: "Data" + top: "data" + top: 
"label" + include { + phase: TRAIN + } + transform_param { + mirror: true + crop_size: 224 + scale: 0.0078125 + mean_value: 104 + mean_value: 117 + mean_value: 123 + random_aspect_ratio_param { + min_area_ratio: 0.08 + max_area_ratio: 1 + aspect_ratio_change: 0.75 + resize_param { + interp_mode: CUBIC + } + } + } + data_param { + source: "examples/imagenet/ilsvrc12_train_lmdb" + batch_size: 128 + backend: LMDB + prefetch: 2 + shuffle: true + } +} +layer { + name: "data" + type: "Data" + top: "data" + top: "label" + include { + phase: TEST + } + transform_param { + mirror: false + crop_size: 224 + scale: 0.0078125 + mean_value: 104 + mean_value: 117 + mean_value: 123 + random_resize_param { + min_size: 256 + max_size: 256 + resize_param { + interp_mode: CUBIC + } + } + } + data_param { + source: "examples/imagenet/ilsvrc12_val_lmdb" + batch_size: 50 + backend: LMDB + } +} + +layer { + bottom: "data" + top: "conv1" + name: "conv1" + type: "Convolution" + convolution_param { + num_output: 64 + kernel_size: 7 + pad: 3 + stride: 2 + weight_filler { + type: "msra" + variance_norm: FAN_OUT + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "conv1" + top: "conv1" + name: "bn_conv1" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "conv1" + top: "conv1" + name: "scale_conv1" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "conv1" + top: "conv1" + name: "conv1_relu" + type: "ReLU" + relu_param { + } +} + +layer { + bottom: "conv1" + top: "pool1" + name: "pool1" + type: "Pooling" + pooling_param { + kernel_size: 3 + stride: 2 + pool: MAX + } +} + +layer { + bottom: "pool1" + top: "res2a_branch1" + name: "res2a_branch1" + type: "Convolution" + convolution_param { + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res2a_branch1" + top: "res2a_branch1" + name: "bn2a_branch1" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res2a_branch1" + top: "res2a_branch1" + name: "scale2a_branch1" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "pool1" + top: "res2a_branch2a" + name: "res2a_branch2a" + type: "Convolution" + convolution_param { + + num_output: 64 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res2a_branch2a" + top: "res2a_branch2a" + name: "bn2a_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res2a_branch2a" + top: "res2a_branch2a" + name: "scale2a_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2a_branch2a" + top: "res2a_branch2a" + name: "res2a_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res2a_branch2a" + top: "res2a_branch2b" + name: "res2a_branch2b" + type: "Convolution" + 
convolution_param { + num_output: 64 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res2a_branch2b" + top: "res2a_branch2b" + name: "bn2a_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res2a_branch2b" + top: "res2a_branch2b" + name: "scale2a_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2a_branch2b" + top: "res2a_branch2b" + name: "res2a_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res2a_branch2b" + top: "res2a_branch2c" + name: "res2a_branch2c" + type: "Convolution" + convolution_param { + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res2a_branch2c" + top: "res2a_branch2c" + name: "bn2a_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res2a_branch2c" + top: "res2a_branch2c" + name: "scale2a_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2a_branch1" + bottom: "res2a_branch2c" + top: "res2a" + name: "res2a" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res2a" + top: "res2a" + name: "res2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res2a" + top: "res2b_branch2a" + name: "res2b_branch2a" + type: "Convolution" + convolution_param { + num_output: 64 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res2b_branch2a" + top: "res2b_branch2a" + name: "bn2b_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res2b_branch2a" + top: "res2b_branch2a" + name: "scale2b_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2b_branch2a" + top: "res2b_branch2a" + name: "res2b_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res2b_branch2a" + top: "res2b_branch2b" + name: "res2b_branch2b" + type: "Convolution" + convolution_param { + num_output: 64 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res2b_branch2b" + top: "res2b_branch2b" + name: "bn2b_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res2b_branch2b" + top: "res2b_branch2b" + name: "scale2b_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2b_branch2b" + top: "res2b_branch2b" + name: "res2b_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: 
"res2b_branch2b" + top: "res2b_branch2c" + name: "res2b_branch2c" + type: "Convolution" + convolution_param { + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res2b_branch2c" + top: "res2b_branch2c" + name: "bn2b_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res2b_branch2c" + top: "res2b_branch2c" + name: "scale2b_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2a" + bottom: "res2b_branch2c" + top: "res2b" + name: "res2b" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res2b" + top: "res2b" + name: "res2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res2b" + top: "res2c_branch2a" + name: "res2c_branch2a" + type: "Convolution" + convolution_param { + + num_output: 64 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res2c_branch2a" + top: "res2c_branch2a" + name: "bn2c_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res2c_branch2a" + top: "res2c_branch2a" + name: "scale2c_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2c_branch2a" + top: "res2c_branch2a" + name: "res2c_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res2c_branch2a" + top: "res2c_branch2b" + name: "res2c_branch2b" + type: "Convolution" + convolution_param { + num_output: 64 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res2c_branch2b" + top: "res2c_branch2b" + name: "bn2c_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res2c_branch2b" + top: "res2c_branch2b" + name: "scale2c_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2c_branch2b" + top: "res2c_branch2b" + name: "res2c_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res2c_branch2b" + top: "res2c_branch2c" + name: "res2c_branch2c" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res2c_branch2c" + top: "res2c_branch2c" + name: "bn2c_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 0 } + } +} + +layer { + bottom: "res2c_branch2c" + top: "res2c_branch2c" + name: "scale2c_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2b" + bottom: "res2c_branch2c" + top: 
"res2c" + name: "res2c" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res2c" + top: "res2c" + name: "res2c_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res2c" + top: "res3a_branch1" + name: "res3a_branch1" + type: "Convolution" + convolution_param { + num_output: 512 + kernel_size: 1 + pad: 0 + stride: 2 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3a_branch1" + top: "res3a_branch1" + name: "bn3a_branch1" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3a_branch1" + top: "res3a_branch1" + name: "scale3a_branch1" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res2c" + top: "res3a_branch2a" + name: "res3a_branch2a" + type: "Convolution" + convolution_param { + + num_output: 128 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3a_branch2a" + top: "res3a_branch2a" + name: "bn3a_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3a_branch2a" + top: "res3a_branch2a" + name: "scale3a_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3a_branch2a" + top: "res3a_branch2a" + name: "res3a_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3a_branch2a" + top: "res3a_branch2b" + name: "res3a_branch2b" + type: "Convolution" + convolution_param { + + num_output: 128 + kernel_size: 3 + pad: 1 + stride: 2 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3a_branch2b" + top: "res3a_branch2b" + name: "bn3a_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3a_branch2b" + top: "res3a_branch2b" + name: "scale3a_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3a_branch2b" + top: "res3a_branch2b" + name: "res3a_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3a_branch2b" + top: "res3a_branch2c" + name: "res3a_branch2c" + type: "Convolution" + convolution_param { + + num_output: 512 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3a_branch2c" + top: "res3a_branch2c" + name: "bn3a_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3a_branch2c" + top: "res3a_branch2c" + name: "scale3a_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3a_branch1" + bottom: "res3a_branch2c" + top: "res3a" + name: "res3a" + type: "Eltwise" + 
eltwise_param { + + } +} + +layer { + bottom: "res3a" + top: "res3a" + name: "res3a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3a" + top: "res3b_branch2a" + name: "res3b_branch2a" + type: "Convolution" + convolution_param { + + num_output: 128 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3b_branch2a" + top: "res3b_branch2a" + name: "bn3b_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3b_branch2a" + top: "res3b_branch2a" + name: "scale3b_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3b_branch2a" + top: "res3b_branch2a" + name: "res3b_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3b_branch2a" + top: "res3b_branch2b" + name: "res3b_branch2b" + type: "Convolution" + convolution_param { + + num_output: 128 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3b_branch2b" + top: "res3b_branch2b" + name: "bn3b_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3b_branch2b" + top: "res3b_branch2b" + name: "scale3b_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3b_branch2b" + top: "res3b_branch2b" + name: "res3b_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3b_branch2b" + top: "res3b_branch2c" + name: "res3b_branch2c" + type: "Convolution" + convolution_param { + + num_output: 512 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3b_branch2c" + top: "res3b_branch2c" + name: "bn3b_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3b_branch2c" + top: "res3b_branch2c" + name: "scale3b_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3a" + bottom: "res3b_branch2c" + top: "res3b" + name: "res3b" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res3b" + top: "res3b" + name: "res3b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3b" + top: "res3c_branch2a" + name: "res3c_branch2a" + type: "Convolution" + convolution_param { + + num_output: 128 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3c_branch2a" + top: "res3c_branch2a" + name: "bn3c_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3c_branch2a" + top: "res3c_branch2a" + name: "scale3c_branch2a" + type: "Scale" + param { 
decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3c_branch2a" + top: "res3c_branch2a" + name: "res3c_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3c_branch2a" + top: "res3c_branch2b" + name: "res3c_branch2b" + type: "Convolution" + convolution_param { + + num_output: 128 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3c_branch2b" + top: "res3c_branch2b" + name: "bn3c_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3c_branch2b" + top: "res3c_branch2b" + name: "scale3c_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3c_branch2b" + top: "res3c_branch2b" + name: "res3c_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3c_branch2b" + top: "res3c_branch2c" + name: "res3c_branch2c" + type: "Convolution" + convolution_param { + + num_output: 512 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3c_branch2c" + top: "res3c_branch2c" + name: "bn3c_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3c_branch2c" + top: "res3c_branch2c" + name: "scale3c_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3b" + bottom: "res3c_branch2c" + top: "res3c" + name: "res3c" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res3c" + top: "res3c" + name: "res3c_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3c" + top: "res3d_branch2a" + name: "res3d_branch2a" + type: "Convolution" + convolution_param { + num_output: 128 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3d_branch2a" + top: "res3d_branch2a" + name: "bn3d_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res3d_branch2a" + top: "res3d_branch2a" + name: "scale3d_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3d_branch2a" + top: "res3d_branch2a" + name: "res3d_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3d_branch2a" + top: "res3d_branch2b" + name: "res3d_branch2b" + type: "Convolution" + convolution_param { + num_output: 128 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3d_branch2b" + top: "res3d_branch2b" + name: "bn3d_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: 
"res3d_branch2b" + top: "res3d_branch2b" + name: "scale3d_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3d_branch2b" + top: "res3d_branch2b" + name: "res3d_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3d_branch2b" + top: "res3d_branch2c" + name: "res3d_branch2c" + type: "Convolution" + convolution_param { + + num_output: 512 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res3d_branch2c" + top: "res3d_branch2c" + name: "bn3d_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 0 } + } +} + +layer { + bottom: "res3d_branch2c" + top: "res3d_branch2c" + name: "scale3d_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3c" + bottom: "res3d_branch2c" + top: "res3d" + name: "res3d" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res3d" + top: "res3d" + name: "res3d_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res3d" + top: "res4a_branch1" + name: "res4a_branch1" + type: "Convolution" + convolution_param { + + num_output: 1024 + kernel_size: 1 + pad: 0 + stride: 2 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4a_branch1" + top: "res4a_branch1" + name: "bn4a_branch1" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4a_branch1" + top: "res4a_branch1" + name: "scale4a_branch1" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res3d" + top: "res4a_branch2a" + name: "res4a_branch2a" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4a_branch2a" + top: "res4a_branch2a" + name: "bn4a_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4a_branch2a" + top: "res4a_branch2a" + name: "scale4a_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4a_branch2a" + top: "res4a_branch2a" + name: "res4a_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4a_branch2a" + top: "res4a_branch2b" + name: "res4a_branch2b" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 3 + pad: 1 + stride: 2 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4a_branch2b" + top: "res4a_branch2b" + name: "bn4a_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4a_branch2b" + top: "res4a_branch2b" + name: 
"scale4a_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4a_branch2b" + top: "res4a_branch2b" + name: "res4a_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4a_branch2b" + top: "res4a_branch2c" + name: "res4a_branch2c" + type: "Convolution" + convolution_param { + + num_output: 1024 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4a_branch2c" + top: "res4a_branch2c" + name: "bn4a_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4a_branch2c" + top: "res4a_branch2c" + name: "scale4a_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4a_branch1" + bottom: "res4a_branch2c" + top: "res4a" + name: "res4a" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res4a" + top: "res4a" + name: "res4a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4a" + top: "res4b_branch2a" + name: "res4b_branch2a" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4b_branch2a" + top: "res4b_branch2a" + name: "bn4b_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4b_branch2a" + top: "res4b_branch2a" + name: "scale4b_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4b_branch2a" + top: "res4b_branch2a" + name: "res4b_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4b_branch2a" + top: "res4b_branch2b" + name: "res4b_branch2b" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4b_branch2b" + top: "res4b_branch2b" + name: "bn4b_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4b_branch2b" + top: "res4b_branch2b" + name: "scale4b_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4b_branch2b" + top: "res4b_branch2b" + name: "res4b_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4b_branch2b" + top: "res4b_branch2c" + name: "res4b_branch2c" + type: "Convolution" + convolution_param { + + num_output: 1024 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4b_branch2c" + top: "res4b_branch2c" + name: "bn4b_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 
+ filler { value: 1 } + } +} + +layer { + bottom: "res4b_branch2c" + top: "res4b_branch2c" + name: "scale4b_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4a" + bottom: "res4b_branch2c" + top: "res4b" + name: "res4b" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res4b" + top: "res4b" + name: "res4b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4b" + top: "res4c_branch2a" + name: "res4c_branch2a" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4c_branch2a" + top: "res4c_branch2a" + name: "bn4c_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4c_branch2a" + top: "res4c_branch2a" + name: "scale4c_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4c_branch2a" + top: "res4c_branch2a" + name: "res4c_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4c_branch2a" + top: "res4c_branch2b" + name: "res4c_branch2b" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4c_branch2b" + top: "res4c_branch2b" + name: "bn4c_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4c_branch2b" + top: "res4c_branch2b" + name: "scale4c_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4c_branch2b" + top: "res4c_branch2b" + name: "res4c_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4c_branch2b" + top: "res4c_branch2c" + name: "res4c_branch2c" + type: "Convolution" + convolution_param { + + num_output: 1024 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4c_branch2c" + top: "res4c_branch2c" + name: "bn4c_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4c_branch2c" + top: "res4c_branch2c" + name: "scale4c_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4b" + bottom: "res4c_branch2c" + top: "res4c" + name: "res4c" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res4c" + top: "res4c" + name: "res4c_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4c" + top: "res4d_branch2a" + name: "res4d_branch2a" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4d_branch2a" + 
top: "res4d_branch2a" + name: "bn4d_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4d_branch2a" + top: "res4d_branch2a" + name: "scale4d_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4d_branch2a" + top: "res4d_branch2a" + name: "res4d_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4d_branch2a" + top: "res4d_branch2b" + name: "res4d_branch2b" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4d_branch2b" + top: "res4d_branch2b" + name: "bn4d_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4d_branch2b" + top: "res4d_branch2b" + name: "scale4d_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4d_branch2b" + top: "res4d_branch2b" + name: "res4d_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4d_branch2b" + top: "res4d_branch2c" + name: "res4d_branch2c" + type: "Convolution" + convolution_param { + + num_output: 1024 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4d_branch2c" + top: "res4d_branch2c" + name: "bn4d_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4d_branch2c" + top: "res4d_branch2c" + name: "scale4d_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4c" + bottom: "res4d_branch2c" + top: "res4d" + name: "res4d" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res4d" + top: "res4d" + name: "res4d_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4d" + top: "res4e_branch2a" + name: "res4e_branch2a" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4e_branch2a" + top: "res4e_branch2a" + name: "bn4e_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4e_branch2a" + top: "res4e_branch2a" + name: "scale4e_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4e_branch2a" + top: "res4e_branch2a" + name: "res4e_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4e_branch2a" + top: "res4e_branch2b" + name: "res4e_branch2b" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + 
bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4e_branch2b" + top: "res4e_branch2b" + name: "bn4e_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4e_branch2b" + top: "res4e_branch2b" + name: "scale4e_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4e_branch2b" + top: "res4e_branch2b" + name: "res4e_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4e_branch2b" + top: "res4e_branch2c" + name: "res4e_branch2c" + type: "Convolution" + convolution_param { + + num_output: 1024 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4e_branch2c" + top: "res4e_branch2c" + name: "bn4e_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4e_branch2c" + top: "res4e_branch2c" + name: "scale4e_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4d" + bottom: "res4e_branch2c" + top: "res4e" + name: "res4e" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res4e" + top: "res4e" + name: "res4e_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4e" + top: "res4f_branch2a" + name: "res4f_branch2a" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4f_branch2a" + top: "res4f_branch2a" + name: "bn4f_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4f_branch2a" + top: "res4f_branch2a" + name: "scale4f_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4f_branch2a" + top: "res4f_branch2a" + name: "res4f_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4f_branch2a" + top: "res4f_branch2b" + name: "res4f_branch2b" + type: "Convolution" + convolution_param { + + num_output: 256 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4f_branch2b" + top: "res4f_branch2b" + name: "bn4f_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res4f_branch2b" + top: "res4f_branch2b" + name: "scale4f_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4f_branch2b" + top: "res4f_branch2b" + name: "res4f_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4f_branch2b" + top: "res4f_branch2c" + name: "res4f_branch2c" + type: "Convolution" + convolution_param { + + num_output: 1024 + 
kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res4f_branch2c" + top: "res4f_branch2c" + name: "bn4f_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 0 } + } +} + +layer { + bottom: "res4f_branch2c" + top: "res4f_branch2c" + name: "scale4f_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4e" + bottom: "res4f_branch2c" + top: "res4f" + name: "res4f" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res4f" + top: "res4f" + name: "res4f_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res4f" + top: "res5a_branch1" + name: "res5a_branch1" + type: "Convolution" + convolution_param { + + num_output: 2048 + kernel_size: 1 + pad: 0 + stride: 2 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5a_branch1" + top: "res5a_branch1" + name: "bn5a_branch1" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res5a_branch1" + top: "res5a_branch1" + name: "scale5a_branch1" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res4f" + top: "res5a_branch2a" + name: "res5a_branch2a" + type: "Convolution" + convolution_param { + + num_output: 512 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5a_branch2a" + top: "res5a_branch2a" + name: "bn5a_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res5a_branch2a" + top: "res5a_branch2a" + name: "scale5a_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res5a_branch2a" + top: "res5a_branch2a" + name: "res5a_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res5a_branch2a" + top: "res5a_branch2b" + name: "res5a_branch2b" + type: "Convolution" + convolution_param { + + num_output: 512 + kernel_size: 3 + pad: 1 + stride: 2 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5a_branch2b" + top: "res5a_branch2b" + name: "bn5a_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res5a_branch2b" + top: "res5a_branch2b" + name: "scale5a_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res5a_branch2b" + top: "res5a_branch2b" + name: "res5a_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res5a_branch2b" + top: "res5a_branch2c" + name: "res5a_branch2c" + type: "Convolution" + convolution_param { + + num_output: 2048 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + 
weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5a_branch2c" + top: "res5a_branch2c" + name: "bn5a_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res5a_branch2c" + top: "res5a_branch2c" + name: "scale5a_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res5a_branch1" + bottom: "res5a_branch2c" + top: "res5a" + name: "res5a" + type: "Eltwise" + eltwise_param { + + } +} + +layer { + bottom: "res5a" + top: "res5a" + name: "res5a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res5a" + top: "res5b_branch2a" + name: "res5b_branch2a" + type: "Convolution" + convolution_param { + + num_output: 512 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5b_branch2a" + top: "res5b_branch2a" + name: "bn5b_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res5b_branch2a" + top: "res5b_branch2a" + name: "scale5b_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res5b_branch2a" + top: "res5b_branch2a" + name: "res5b_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res5b_branch2a" + top: "res5b_branch2b" + name: "res5b_branch2b" + type: "Convolution" + convolution_param { + + num_output: 512 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5b_branch2b" + top: "res5b_branch2b" + name: "bn5b_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res5b_branch2b" + top: "res5b_branch2b" + name: "scale5b_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res5b_branch2b" + top: "res5b_branch2b" + name: "res5b_branch2b_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res5b_branch2b" + top: "res5b_branch2c" + name: "res5b_branch2c" + type: "Convolution" + convolution_param { + + num_output: 2048 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5b_branch2c" + top: "res5b_branch2c" + name: "bn5b_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res5b_branch2c" + top: "res5b_branch2c" + name: "scale5b_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res5a" + bottom: "res5b_branch2c" + top: "res5b" + name: "res5b" + type: "Eltwise" + eltwise_param { + } +} + +layer { + bottom: "res5b" + top: "res5b" + name: "res5b_relu" + type: "ReLU" + relu_param { + + } +} + +layer 
{ + bottom: "res5b" + top: "res5c_branch2a" + name: "res5c_branch2a" + type: "Convolution" + convolution_param { + num_output: 512 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5c_branch2a" + top: "res5c_branch2a" + name: "bn5c_branch2a" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res5c_branch2a" + top: "res5c_branch2a" + name: "scale5c_branch2a" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res5c_branch2a" + top: "res5c_branch2a" + name: "res5c_branch2a_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res5c_branch2a" + top: "res5c_branch2b" + name: "res5c_branch2b" + type: "Convolution" + convolution_param { + num_output: 512 + kernel_size: 3 + pad: 1 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5c_branch2b" + top: "res5c_branch2b" + name: "bn5c_branch2b" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 1 } + } +} + +layer { + bottom: "res5c_branch2b" + top: "res5c_branch2b" + name: "scale5c_branch2b" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res5c_branch2b" + top: "res5c_branch2b" + name: "res5c_branch2b_relu" + type: "ReLU" + relu_param { + } +} + +layer { + bottom: "res5c_branch2b" + top: "res5c_branch2c" + name: "res5c_branch2c" + type: "Convolution" + convolution_param { + num_output: 2048 + kernel_size: 1 + pad: 0 + stride: 1 + bias_term: false + weight_filler { + type: "msra" + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "res5c_branch2c" + top: "res5c_branch2c" + name: "bn5c_branch2c" + type: "BatchNorm" + param { lr_mult: 0 } + param { lr_mult: 0 } + param { lr_mult: 0 } + batch_norm_param { + moving_average_fraction: 0.9 + filler { value: 0 } + } +} + +layer { + bottom: "res5c_branch2c" + top: "res5c_branch2c" + name: "scale5c_branch2c" + type: "Scale" + param { decay_mult: 0 } + param { decay_mult: 0 } + scale_param { + bias_term: true + } +} + +layer { + bottom: "res5b" + bottom: "res5c_branch2c" + top: "res5c" + name: "res5c" + type: "Eltwise" + eltwise_param { + } +} + +layer { + bottom: "res5c" + top: "res5c" + name: "res5c_relu" + type: "ReLU" + relu_param { + + } +} + +layer { + bottom: "res5c" + top: "pool5" + name: "pool5" + type: "Pooling" + pooling_param { + kernel_size: 7 + stride: 1 + pool: AVE + } +} + +layer { + bottom: "pool5" + top: "fc1000" + name: "fc1000" + type: "InnerProduct" + inner_product_param { + num_output: 1000 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} + +layer { + bottom: "fc1000" + bottom: "label" + top: "loss" + name: "prob" + type: "SoftmaxWithLoss" +} +layer { + name: "loss3/top-1" + type: "Accuracy" + bottom: "fc1000" + bottom: "label" + top: "loss3/top-1" +} +layer { + name: "loss3/top-5" + type: "Accuracy" + bottom: "fc1000" + bottom: "label" + top: "loss3/top-5" + accuracy_param { + top_k: 5 + } +} From c55cac361e62ef1d99b9974d1d4d51665ad26e66 Mon Sep 17 00:00:00 2001 
From: Haihao Shen Date: Sat, 26 Aug 2017 06:42:13 +0800 Subject: [PATCH 33/38] Support padded layout --- src/caffe/mkldnn_memory.cpp | 3 +- src/caffe/solvers/sgd_solver.cpp | 107 ++++--------------------------- 2 files changed, 13 insertions(+), 97 deletions(-) diff --git a/src/caffe/mkldnn_memory.cpp b/src/caffe/mkldnn_memory.cpp index 6e42e691d..c53cff7ff 100644 --- a/src/caffe/mkldnn_memory.cpp +++ b/src/caffe/mkldnn_memory.cpp @@ -212,8 +212,7 @@ void MKLDNNMemoryDescriptor::convert_from_extprv(shared_ptr_reorder_extprv2prv_pd == NULL) return; - if (this->_extprv_memory_pd->desc().data.format == this->_prv_memory_pd->desc().data.format && - this->_extprv_memory_pd->desc().data.data_type == this->_prv_memory_pd->desc().data.data_type) + if (*this->_extprv_memory_pd == *this->_prv_memory_pd) { #ifdef DEBUG LOG(INFO) << "The format and data_type of _extprv_memory_pd and _prv_memory_pd is same, no need do conversion."; diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index 929ff050f..6a7e2ca43 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -354,38 +354,25 @@ void SGDSolver::SGDFusion(int param_id, Dtype rate) { bool prv_diff_condition_flag = false; if (net_params[param_id]->prv_diff() && (net_params[param_id]->prv_diff_count() - == net_params[param_id]->prv_data_count())) { + == net_params[param_id]->count())) { prv_diff_condition_flag = true; - //LOG(INFO) << "Common condition judgement: prv_diff_condition_flag = true."; - } - else - { - //LOG(INFO) << "Common condition judgement: prv_diff_condition_flag = false."; } //#pragma endregion //#pragma region 3. Normalize stage if (skip_Normalize_stage_flag == false) { - //LOG(INFO) << "Normalize stage: Normalize stage is not skipped."; - const Dtype accum_normalization = Dtype(1.) 
/ this->param_.iter_size(); if (prv_diff_condition_flag) { - //LOG(INFO) << "Normalize stage: prv_diff_condition_flag = true."; - caffe_scal(net_params[param_id]->prv_data_count(), accum_normalization, + caffe_scal(net_params[param_id]->prv_diff_count(), accum_normalization, net_params[param_id]->mutable_prv_diff()); } else { - //LOG(INFO) << "Normalize stage: prv_diff_condition_flag = false."; caffe_scal(net_params[param_id]->count(), accum_normalization, net_params[param_id]->mutable_cpu_diff()); } } - else - { - //LOG(INFO) << "Normalize stage: Normalize stage is skipped."; - } //#pragma endregion //For most common topologies from BVLC, all skipped the Normalize stage, and use L2 regularization @@ -401,97 +388,35 @@ void SGDSolver::SGDFusion(int param_id, Dtype rate) { //Regularize stage (Fused ComputeUpdateValue_stage in some situations) if (local_decay) { if (regularization_type == "L2") { - //LOG(INFO) << "Regularize stage: regularization_type == L2."; // add weight decay if (net_params[param_id]->prv_data() && (net_params[param_id]->prv_data_count() == net_params[param_id]->count())) { - //LOG(INFO) << "Regularize stage: prv_data_condition_flag = true."; CHECK_EQ(true, net_params[param_id]->get_prv_data_descriptor()->layout_compare( net_params[param_id]->get_prv_diff_descriptor())); - /* - caffe_axpy(net_params[param_id]->count(), - local_decay, - net_params[param_id]->prv_data(), - net_params[param_id]->mutable_prv_diff()); - */ if (prv_diff_condition_flag) { - //situation (1) - //LOG(INFO) << "Fused ComputeUpdateValue stage: prv_diff_condition_flag = true."; - /* - caffe_cpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->prv_diff(), momentum, - history_[param_id]->mutable_cpu_data()); - - caffe_copy(net_params[param_id]->count(), - history_[param_id]->cpu_data(), - net_params[param_id]->mutable_prv_diff()); - */ - - if(net_params[param_id]->prv_data_count() != history_[param_id]->count()) - history_[param_id]->Reshape(net_params[param_id]->shape()); - axpy_axpby_copy_axpy(net_params[param_id]->prv_data_count(), local_decay, net_params[param_id]->mutable_prv_data(), net_params[param_id]->mutable_prv_diff(), local_rate, momentum, history_[param_id]->mutable_cpu_data(), Dtype(-1)); is_separate_ComputeUpdateValue_Update = false; } - else - { - //Will not happen! - //LOG(INFO) << "Cannot Fused ComputeUpdateValue stage: prv_diff_condition_flag = false."; - } } else { - //LOG(INFO) << "Regularize stage: prv_data_condition_flag = false."; - /* - caffe_axpy(net_params[param_id]->count(), - local_decay, - net_params[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); - */ if (!prv_diff_condition_flag) { - //situation (2) - //LOG(INFO) << "Fused ComputeUpdateValue stage: prv_diff_condition_flag = false."; - /* - caffe_cpu_axpby(net_params[param_id]->count(), local_rate, - net_params[param_id]->cpu_diff(), momentum, - history_[param_id]->mutable_cpu_data()); - - caffe_copy(net_params[param_id]->count(), - history_[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); - */ - axpy_axpby_copy_axpy(net_params[param_id]->count(), local_decay, net_params[param_id]->mutable_cpu_data(), net_params[param_id]->mutable_cpu_diff(), local_rate, momentum, history_[param_id]->mutable_cpu_data(), Dtype(-1)); is_separate_ComputeUpdateValue_Update = false; } - else - { - //Will not happen! 
- //LOG(INFO) << "Cannot Fused ComputeUpdateValue stage: prv_diff_condition_flag = true."; - if(net_params[param_id]->prv_data_count() != history_[param_id]->count()) - history_[param_id]->Reshape(net_params[param_id]->shape()); - } } } else if (regularization_type == "L1") { - //LOG(INFO) << "Regularize stage: regularization_type == L1."; caffe_cpu_sign(net_params[param_id]->count(), net_params[param_id]->cpu_data(), temp_[param_id]->mutable_cpu_data()); - /* - caffe_axpy(net_params[param_id]->count(), - local_decay, - temp_[param_id]->cpu_data(), - net_params[param_id]->mutable_cpu_diff()); - */ - axpy_axpby_copy(net_params[param_id]->count(), local_decay, temp_[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff(), local_rate, momentum, history_[param_id]->mutable_cpu_data()); @@ -513,18 +438,14 @@ void SGDSolver::SGDFusion(int param_id, Dtype rate) { //No Regularize stage, only ComputeUpdateValue stage //ComputeUpdateValue stage if (prv_diff_condition_flag) { - //LOG(INFO) << "ComputeUpdateValue stage: prv_diff_condition_flag = true."; - if(net_params[param_id]->prv_data_count() != history_[param_id]->count()) - history_[param_id]->Reshape(net_params[param_id]->shape()); - caffe_cpu_axpby(net_params[param_id]->prv_data_count(), local_rate, + caffe_cpu_axpby(net_params[param_id]->prv_diff_count(), local_rate, net_params[param_id]->prv_diff(), momentum, history_[param_id]->mutable_cpu_data()); - caffe_copy(net_params[param_id]->prv_data_count(), + caffe_copy(net_params[param_id]->count(), history_[param_id]->cpu_data(), net_params[param_id]->mutable_prv_diff()); } else { - //LOG(INFO) << "ComputeUpdateValue stage: prv_diff_condition_flag = false."; caffe_cpu_axpby(net_params[param_id]->count(), local_rate, net_params[param_id]->cpu_diff(), momentum, history_[param_id]->mutable_cpu_data()); @@ -537,7 +458,6 @@ void SGDSolver::SGDFusion(int param_id, Dtype rate) { //Update stage (separate) net_params[param_id]->Update(); } - } #endif /* ENABLE_SGD_FUSION */ @@ -561,12 +481,10 @@ void SGDSolver::Normalize(int param_id) { if (net_params[param_id]->prv_diff() && (net_params[param_id]->prv_diff_count() == net_params[param_id]->count())) { - //LOG(INFO) << "Normalize stage: prv_diff_condition_flag = true."; - caffe_scal(net_params[param_id]->count(), accum_normalization, + caffe_scal(net_params[param_id]->prv_diff_count(), accum_normalization, net_params[param_id]->mutable_prv_diff()); } else { - //LOG(INFO) << "Normalize stage: prv_diff_condition_flag = false."; caffe_scal(net_params[param_id]->count(), accum_normalization, net_params[param_id]->mutable_cpu_diff()); } @@ -599,29 +517,25 @@ void SGDSolver::Regularize(int param_id) { case Caffe::CPU: { if (local_decay) { if (regularization_type == "L2") { - //LOG(INFO) << "Regularize stage: regularization_type == L2."; // add weight decay if (net_params[param_id]->prv_data() && (net_params[param_id]->prv_data_count() == net_params[param_id]->count())) { - //LOG(INFO) << "Regularize stage: prv_data_condition_flag = true."; CHECK_EQ(true, net_params[param_id]->get_prv_data_descriptor()->layout_compare( net_params[param_id]->get_prv_diff_descriptor())); - caffe_axpy(net_params[param_id]->count(), + caffe_axpy(net_params[param_id]->prv_data_count(), local_decay, net_params[param_id]->prv_data(), net_params[param_id]->mutable_prv_diff()); } else { - //LOG(INFO) << "Regularize stage: prv_data_condition_flag = false."; caffe_axpy(net_params[param_id]->count(), local_decay, net_params[param_id]->cpu_data(), 
net_params[param_id]->mutable_cpu_diff()); } } else if (regularization_type == "L1") { - //LOG(INFO) << "Regularize stage: regularization_type == L1."; caffe_cpu_sign(net_params[param_id]->count(), net_params[param_id]->cpu_data(), temp_[param_id]->mutable_cpu_data()); @@ -692,8 +606,7 @@ void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) { if (net_params[param_id]->prv_diff() && (net_params[param_id]->prv_diff_count() == net_params[param_id]->count())) { - //LOG(INFO) << "ComputeUpdateValue stage: prv_diff_condition_flag = true."; - caffe_cpu_axpby(net_params[param_id]->count(), local_rate, + caffe_cpu_axpby(net_params[param_id]->prv_diff_count(), local_rate, net_params[param_id]->prv_diff(), momentum, history_[param_id]->mutable_cpu_data()); @@ -701,7 +614,6 @@ void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) { history_[param_id]->cpu_data(), net_params[param_id]->mutable_prv_diff()); } else { - //LOG(INFO) << "ComputeUpdateValue stage: prv_diff_condition_flag = false."; caffe_cpu_axpby(net_params[param_id]->count(), local_rate, net_params[param_id]->cpu_diff(), momentum, history_[param_id]->mutable_cpu_data()); @@ -709,6 +621,11 @@ void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) { caffe_copy(net_params[param_id]->count(), history_[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff()); + + if (net_params[param_id]->prv_diff_count() + != net_params[param_id]->count()) { + net_params[param_id]->mutable_prv_diff(); + } } break; } From d5789e76dc7c413dfc22dc6d71d05d81335c93d2 Mon Sep 17 00:00:00 2001 From: Haihao Shen Date: Tue, 29 Aug 2017 07:50:39 +0800 Subject: [PATCH 34/38] Fix the issue of padded layout under MKL2017 --- src/caffe/solvers/sgd_solver.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index 6a7e2ca43..5347dcdf7 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -622,8 +622,9 @@ void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) { history_[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff()); - if (net_params[param_id]->prv_diff_count() - != net_params[param_id]->count()) { + if (net_params[param_id]->prv_diff() + && (net_params[param_id]->prv_diff_count() + != net_params[param_id]->count())) { net_params[param_id]->mutable_prv_diff(); } } From 5440cd4f693e81c11eb74841f2c0bd903e268a75 Mon Sep 17 00:00:00 2001 From: fzou1 Date: Thu, 31 Aug 2017 12:42:37 +0800 Subject: [PATCH 35/38] add script for running caffe on single node and multiple nodes with Intel CPUs Change-Id: I0299102309bd6e18794f6e454002faa5db63613e --- scripts/run_intelcaffe.sh | 604 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 604 insertions(+) create mode 100755 scripts/run_intelcaffe.sh diff --git a/scripts/run_intelcaffe.sh b/scripts/run_intelcaffe.sh new file mode 100755 index 000000000..29a5309ab --- /dev/null +++ b/scripts/run_intelcaffe.sh @@ -0,0 +1,604 @@ +#!/bin/bash +set -x + +benchmark_mode="all" + +# time/train/resume_train +mode="train" + +# it's assigned by detect_cpu +cpu_model=skx + +# a list of nodes +host_file="" + +# network parameters +network="opa" +tcp_netmask="" + +# specify number of MLSL ep servers via command line +num_mlsl_servers=-1 + +# parameters for caffe time +iteration=0 +model_file="" + +# parameters for resuming training +snapshot="" + +# parameters for training +solver_file="" + +# specify engine for running caffe +engine="MKL2017" + +result_dir="" +debug="off" + +function usage +{
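+ # Example invocation (the host file name and solver path below are illustrative; adjust to your cluster): + # ./scripts/run_intelcaffe.sh --host hosts.txt --network opa \ + # --solver models/intel_optimized_models/multinode/googlenet_4nodes/solver.prototxt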
+ script_name=$0 + echo "Usage:" + echo " $script_name --host host_file [--solver solver_file]" + echo " [--network opa/tcp] [--netmask tcp_netmask] [--debug on/off]" + echo " [--mode train/resume_train/time/none] [--benchmark all/qperf/mpi/none]" + echo " [--iteration iter] [--model_file deploy.prototxt]" + echo " [--snapshot snapshot.caffemodel]" + echo " [--num_mlsl_servers num_mlsl_servers]" + echo " [--output output_folder]" + echo "" + echo " Parameters:" + echo " host: host file that includes the list of nodes." + echo "" + echo " Optional parameters:" + echo " solver: specify solver file if mode is train/resume_train" + echo " network: opa(default), tcp" + echo " netmask: only used if network is tcp" + echo " debug: off(default). MLSL debug information is output if it's on" + echo " mode: train(default), resume_train, time, none(do not run caffe)" + echo " benchmark: all(default). Includes qperf, all-reduce performance" + echo " Dependency: user needs to install qperf, IMB-MPI1," + echo " and add them to the system path." + echo " iteration and model_file: only used if mode is time (caffe time)" + echo " snapshot: only used if mode is resume_train" + echo " num_mlsl_servers: number of MLSL ep servers" + echo " output_folder: output folder for storing results" +} + +declare -a cpu_list=("Intel Xeon E5-26xx (Broadwell)" "Intel Xeon Phi 72xx (Knight Landing)" + "Intel Xeon Platinum 8180 (Skylake)" "Intel Xeon 6148 (Skylake)") + +function detect_cpu +{ + # detect cpu model + model_string=`lscpu | grep "Model name" | awk -F ':' '{print $2}'` + if [[ $model_string == *"72"* ]]; then + cpu_model=knl + elif [[ $model_string == *"8180"* ]]; then + cpu_model=skx + elif [[ $model_string == *"6148"* ]]; then + cpu_model=skx + elif [[ $model_string == *"E5-26"* ]]; then + cpu_model=bdw + else + echo "CPU model: $model_string" + echo " Use default settings, which may not be optimal." + fi +} + +function set_numa_node +{ + # detect numa mode: cache and flat mode for KNL + numa_node=($(numactl -H | grep "available" | awk -F ' ' '{print $2}')) + if [ $numa_node -eq 1 ]; then + echo "Cache mode." + # cache mode, use numa node 0 + numanode=0 + else + echo "Flat mode." + numanode=1 + fi +} + + +function check_dependency +{ + dep=$1 + which $dep >/dev/null 2>&1 + if [ $? -ne 0 ]; then + echo "Warning: cannot find $dep" + return 1 + fi + return 0 +} + + +function init_mpi_envs +{ + # IMPI configuration + if [ "$network" == "opa" ]; then + export I_MPI_FABRICS=tmi + export I_MPI_TMI_PROVIDER=psm2 + if [ "$cpu_model" == "knl" ]; then + # PSM2 configuration + export PSM2_MQ_RNDV_HFI_WINDOW=4194304 #2097152 # to work around PSM2 bug in IFS 10.2 and 10.3 + export PSM2_MQ_EAGER_SDMA_SZ=65536 + export PSM2_MQ_RNDV_HFI_THRESH=200000 + fi + + export PSM2_IDENTIFY=1 # for debug + elif [ "$network" == "tcp" ]; then + export I_MPI_FABRICS=tcp + export I_MPI_TCP_NETMASK=$tcp_netmask + else + echo "Invalid network: $network" + exit 1 + fi + + export I_MPI_FALLBACK=0 + export I_MPI_DEBUG=6 +} + + +function clear_shm +{ + clear_command="rm -rf /dev/shm/*" + check_shm_command="df -h | grep shm" + + # TODO: check if 50G is the minimum shm size?
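+ # Clear /dev/shm on every node and require a minimum amount of free space there; the multi-node run keeps shared-memory files in /dev/shm, so too little space can make it fail.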
+ min_shm_size=50 + shm_unit="G" + + for node in "${nodenames[@]}" + do + ssh ${node} "$clear_command" + shm_line=`ssh ${node} "$check_shm_command"` + shm_string=`echo $shm_line | awk -F ' ' '{print $(NF-2)}'` + unit="${shm_string:(-1)}" + shm_size=${shm_string::-1} + if [ "$unit" == "$shm_unit" ] && [ $shm_size -ge ${min_shm_size} ]; then + continue + else + echo "Error: /dev/shm size = ${shm_size}${unit}, on node: ${node}." + echo " It's less than minimum size: ${min_shm_size}${shm_unit}." + echo " Please clean or enlarge it." + exit 1 + fi + done +} + +function kill_zombie_processes +{ + kill_command="for process in ep_server caffe mpiexec.hydra; do for i in \$(ps -e | grep -w \$process | awk -F ' ' '{print \$1}'); do kill -9 \$i; echo \"\$process \$i killed.\"; done; done" + for node in "${nodenames[@]}" + do + ssh ${node} "$kill_command" + done +} + +function clear_envs +{ + clear_shm + kill_zombie_processes +} + +function set_mlsl_vars +{ + if [ "${num_mlsl_servers}" -eq -1 ]; then + if [ ${numnodes} -eq 1 ]; then + numservers=0 + else + if [ ${cpu_model} == knl ]; then + numservers=4 + else + numservers=2 + fi + fi + else + numservers=$((num_mlsl_servers)) + fi + + echo "MLSL_NUM_SERVERS: $numservers" + export MLSL_NUM_SERVERS=${numservers} + + if [ ${numservers} -gt 0 ]; then + if [ ${cpu_model} == knl ]; then + listep=6,7,8,9,10,11,12,13 + else + listep=6,7,8,9 + fi + export MLSL_SERVER_AFFINITY="${listep}" + echo "MLSL_SERVER_AFFINITY: ${listep}" + fi + + # MLSL configuration + if [ "$debug" == "on" ]; then + export MLSL_LOG_LEVEL=3 + else + export MLSL_LOG_LEVEL=0 + fi +} + +function set_env_vars +{ + set_mlsl_vars + + ppncpu=1 + threadspercore=1 + + cores=`lscpu | grep "Core(s) per socket:" | awk '{print $4}'` + sockets=`lscpu | grep "Socket(s)" | awk '{print $2}'` + maxcores=$((cores*sockets)) + + numthreads=$(((maxcores-numservers)*threadspercore)) + numthreads_per_proc=$((numthreads/ppncpu)) + + export OMP_NUM_THREADS=${numthreads_per_proc} + + # OMP configuration + # threadspercore=1 + affinitystr="proclist=[0-5,$((5+numservers+1))-$((maxcores-1))],granularity=thread,explicit" + export KMP_HW_SUBSET=1t + export KMP_AFFINITY=$affinitystr +} + +function execute_command +{ + local xeonbin_=$1 + local result_dir_=$2 + + if [ ${cpu_model} == knl ]; then + exec_command="numactl --preferred=$numanode $xeonbin_" + else + exec_command="$xeonbin_" + fi + + if [ ${numnodes} -gt 1 ]; then + # Produce the configuration file for mpiexec. + # Each line of the config file contains a host, environment, and binary name. + cfile_=nodeconfig-${cpu_model}-${numnodes}.txt + rm -f $cfile_ + + for node in "${nodenames[@]}" + do + echo "-host ${node} -n $ppncpu $exec_command" >> $cfile_ + done + fi + + clear_envs + log_file=outputCluster-${cpu_model}-${numnodes}.txt + + sensors_bin="sensors" + check_dependency $sensors_bin + has_sensors=$?
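+ # When lm-sensors is available, record sensor readings before and after the run, e.g. to help spot thermal throttling when comparing results.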
+ if [ $has_sensors -eq 0 ]; then + sensor_log_file=sensors-${cpu_model}-${numnodes}-start.log + $sensors_bin >$sensor_log_file + mv $sensor_log_file $result_dir_/ + fi + + if [ ${numnodes} -eq 1 ]; then + time GLOG_minloglevel=0 $exec_command >${log_file} 2>&1 + else + init_mpi_envs + exec_command="-l -configfile $cfile_" + time GLOG_minloglevel=0 mpiexec.hydra $exec_command >${log_file} 2>&1 + fi + + if [ $has_sensors -eq 0 ]; then + sensor_log_file=sensors-${cpu_model}-${numnodes}-end.log + $sensors_bin >$sensor_log_file + mv $sensor_log_file $result_dir_/ + fi + mv $log_file $cfile_ $result_dir_/ +} + +function run_qperf_bench +{ + qperf_bin="qperf" + check_dependency $qperf_bin + if [ $? -ne 0 ]; then + echo "Skip qperf benchmark." + return + fi + + # measure bandwidth and latency + qperf_result_log="qperf_bench_result.log" + rm -f $qperf_result_log + + server_node="" + port=1234567 + qperf_param="-lp $port -oo msg_size:1024:512M:*2 -vu tcp_bw tcp_lat" + + for ((i=0; i<numnodes-1; i++)) + do + server_node=${nodenames[$i]} + ssh -f ${server_node} "$qperf_bin -lp $port" + echo "qperf server: ${server_node}" >> $qperf_result_log + echo >>$qperf_result_log + + for ((j=i+1; j<numnodes; j++)) + do + client_node=${nodenames[$j]} + ssh ${client_node} "$qperf_bin ${server_node} $qperf_param" >>$qperf_result_log + done + done + + mv $qperf_result_log $result_dir/ +} + +function run_mpi_bench +{ + # MPI benchmark + mpibench_bin="IMB-MPI1" + check_dependency $mpibench_bin + if [ $? -ne 0 ]; then + echo "Skip MPI benchmark..." + return + fi + + xeonbin="$mpibench_bin allreduce" + + declare -a adjust_values=(1 2 3 5 7 8 9 0) + declare -a collective_values=('tmi' 'none') + + echo "Start mpi bench..." + for ((i=0; i<${#adjust_values[@]}; i++)) + do + for ((j=0; j<${#collective_values[@]}; j++)) + do + if [ ${adjust_values[$i]} -eq 0 ]; then + unset I_MPI_ADJUST_ALLREDUCE + else + export I_MPI_ADJUST_ALLREDUCE=${adjust_values[$i]} + fi + + if [ "${collective_values[$j]}" == "none" ]; then + unset I_MPI_COLLECTIVE_DEFAULTS + else + export I_MPI_COLLECTIVE_DEFAULTS=${collective_values[$j]} + fi + echo "iteration $i, ${j}..." + echo "I_MPI_ADJUST_ALLREDUCE=$I_MPI_ADJUST_ALLREDUCE" + echo "I_MPI_COLLECTIVE_DEFAULTS=$I_MPI_COLLECTIVE_DEFAULTS" + + test_result_dir=$result_dir/mpibench-${adjust_values[$i]}-${collective_values[$j]} + mkdir -p $test_result_dir + execute_command "$xeonbin" $test_result_dir + done + done + + # TODO: analyze the report and select the best algorithm and setting + unset I_MPI_COLLECTIVE_DEFAULTS + unset I_MPI_ADJUST_ALLREDUCE + + echo "Finished." +} + +function run_benchmark +{ + echo "Run benchmark with ${numnodes} nodes..." + if [ $numnodes -gt 1 ]; then + if [ "$benchmark_mode" == "all" ] || [ "$benchmark_mode" == "qperf" ]; then + run_qperf_bench + fi + + if [ "$benchmark_mode" == "all" ] || [ "$benchmark_mode" == "mpi" ]; then + set_env_vars + run_mpi_bench + fi + fi +} + +function run_caffe +{ + echo "Run caffe with ${numnodes} nodes..."
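+ # Build the caffe command line: "caffe time" benchmarks a model for a fixed number of iterations; otherwise "caffe train" is used, optionally resuming from a snapshot.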
+ + if [ ${mode} == "time" ]; then + xeonbin="$caffe_bin time --iterations $iteration --model $model_file -engine=$engine" + else + xeonbin="$caffe_bin train --solver $solver_file -engine=$engine" + if [ ${mode} == "resume_train" ]; then + xeonbin+=" --snapshot=${snapshot}" + fi + fi + + set_env_vars + execute_command "$xeonbin" $result_dir +} + + +if [ $# -le 1 ]; then + usage + exit 0 +fi + +root_dir=$(cd $(dirname $(dirname $0)); pwd) +result_dir=${root_dir}/"result-`date +%Y%m%d%H%M%S`" + +while [[ $# -gt 1 ]] +do + key="$1" + case $key in + --solver) + solver_file="$2" + shift + ;; + --host) + host_file="$2" + shift + ;; + --network) + network="$2" + shift + ;; + --netmask) + tcp_netmask="$2" + shift + ;; + --debug) + debug="$2" + shift + ;; + --num_mlsl_servers) + num_mlsl_servers=$2 + shift + ;; + --mode) + mode=$2 + shift + ;; + --iteration) + iteration=$2 + shift + ;; + --model_file) + model_file=$2 + shift + ;; + --snapshot) + snapshot=$2 + shift + ;; + --engine) + engine=$2 + shift + ;; + --benchmark) + benchmark_mode=$2 + shift + ;; + --output) + result_dir=$2 + shift + ;; + *) + echo "Unknown option: $key" + usage + exit 1 + ;; + esac + shift +done + +# check parameters +if [ "$host_file" == "" ]; then + echo "Error: host file is NOT specified." + exit 1 +fi +if [ ! -f $host_file ]; then + echo "Error: host file does NOT exist." + exit 1 +fi + +echo "" +echo "CPUs with optimal settings:" +for ((i=0; i<${#cpu_list[@]}; i++)) +do + echo " ${cpu_list[$i]}" +done +echo "" +echo "Settings:" +echo " Host file: $host_file" +echo " Running mode: $mode" +echo " Benchmark: $benchmark_mode" +echo " Debug option: $debug" +echo " Engine: $engine" +echo " Number of MLSL servers: $num_mlsl_servers" +echo " -1: selected automatically according to CPU model." +echo " BDW/SKX: 2, KNL: 4" + + +if [ "$mode" == "train" ] || [ "$mode" == "resume_train" ]; then + if [ "$solver_file" == "" ]; then + echo "Error: solver file is NOT specified." + exit 1 + fi + if [ ! -f $solver_file ]; then + echo "Error: solver file does NOT exist." + exit 1 + fi + + echo " Solver file: $solver_file" + + if [ "$mode" == "resume_train" ]; then + if [ "$snapshot" == "" ]; then + echo "Error: snapshot is NOT specified." + exit 1 + fi + if [ ! -f $snapshot ]; then + echo "Error: snapshot file does NOT exist." + exit 1 + fi + echo " Snapshot for resuming train: $snapshot" + fi +fi + +if [ "$mode" == "time" ]; then + if [ "$model_file" == "" ]; then + echo "Error: model file is NOT specified." + exit 1 + fi + if [ ! -f $model_file ]; then + echo "Error: model file does NOT exist." + exit 1 + fi + + if [ $iteration -le 0 ]; then + echo "Error: iteration ($iteration) <= 0." + exit 1 + fi + echo " Iteration for running caffe time: $iteration" + echo " Model file for running caffe time: $model_file" +fi + +echo " Network: $network" +if [ "$network" == "tcp" ]; then + if [ "$tcp_netmask" == "" ]; then + echo "Error: TCP netmask is NOT specified." + exit 1 + fi + echo " Netmask for TCP network: $tcp_netmask" +fi + +# Read the node list from the host file +nodenames=( `cat $host_file | sort | uniq ` ) +if [ ${#nodenames[@]} -eq 0 ]; then + echo "Error: empty host file! Exit." + exit 1 +fi +numnodes=${#nodenames[@]} +echo "Number of nodes: $numnodes" + +detect_cpu + +if [ $cpu_model == knl ]; then + set_numa_node +fi + +if [ !
-d $result_dir ]; then + echo "Create result directory: $result_dir" + mkdir -p $result_dir +fi + +if [ "${benchmark_mode}" != "none" ]; then + run_benchmark +fi + +if [ "${mode}" != "none" ]; then + caffe_bin="./build/tools/caffe" + check_dependency $caffe_bin + if [ $? -ne 0 ]; then + echo "Exit." + exit 0 + fi + + run_caffe +fi + +echo "Result folder: $result_dir" From 29faeaee6dbb3d67242a7d2efbf0f52016568ad4 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Tue, 5 Sep 2017 00:14:42 +0800 Subject: [PATCH 36/38] update mkldnn version to b01e3a55a07be62172e713bcd2644c5176360212 --- mkldnn.commit | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkldnn.commit b/mkldnn.commit index 4c279b216..9abcb727c 100644 --- a/mkldnn.commit +++ b/mkldnn.commit @@ -1 +1 @@ -27420a241b2efd8d88f1e003635434194fdfb1b8 +b01e3a55a07be62172e713bcd2644c5176360212 From 7605ad37f69d9a4dcf35f3efe714b59f4d87c0c3 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Tue, 5 Sep 2017 00:30:18 +0800 Subject: [PATCH 37/38] change googlenet_4node max_iteration to 450000 --- .../multinode/googlenet_4nodes/solver.prototxt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/intel_optimized_models/multinode/googlenet_4nodes/solver.prototxt b/models/intel_optimized_models/multinode/googlenet_4nodes/solver.prototxt index 589971c10..773a61852 100644 --- a/models/intel_optimized_models/multinode/googlenet_4nodes/solver.prototxt +++ b/models/intel_optimized_models/multinode/googlenet_4nodes/solver.prototxt @@ -19,7 +19,7 @@ average_loss: 40 base_lr: 0.04 lr_policy: "poly" power: 0.5 -max_iter: 350000 +max_iter: 450000 momentum: 0.9 weight_decay: 0.0002 snapshot: 50000 From d9d52b7e2c972310a09c8e84766067576fc4bd75 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Tue, 5 Sep 2017 00:35:35 +0800 Subject: [PATCH 38/38] add googlenet v2 8 nodes solver --- .../googlenet_16nodes/solver.prototxt | 27 - .../googlenet_16nodes/train_val.prototxt | 2434 ----------------- .../googlenet_v2_4nodes/solver.prototxt | 24 - .../googlenet_v2_8nodes/solver.prototxt | 15 + .../train_val.prototxt | 0 .../resnet_50_16_nodes/solver.prototxt | 15 - .../resnet_50_16_nodes/train_val.prototxt | 2306 ---------------- 7 files changed, 15 insertions(+), 4806 deletions(-) delete mode 100644 models/intel_optimized_models/multinode/googlenet_16nodes/solver.prototxt delete mode 100644 models/intel_optimized_models/multinode/googlenet_16nodes/train_val.prototxt delete mode 100644 models/intel_optimized_models/multinode/googlenet_v2_4nodes/solver.prototxt create mode 100644 models/intel_optimized_models/multinode/googlenet_v2_8nodes/solver.prototxt rename models/intel_optimized_models/multinode/{googlenet_v2_4nodes => googlenet_v2_8nodes}/train_val.prototxt (100%) delete mode 100644 models/intel_optimized_models/multinode/resnet_50_16_nodes/solver.prototxt delete mode 100644 models/intel_optimized_models/multinode/resnet_50_16_nodes/train_val.prototxt diff --git a/models/intel_optimized_models/multinode/googlenet_16nodes/solver.prototxt b/models/intel_optimized_models/multinode/googlenet_16nodes/solver.prototxt deleted file mode 100644 index 4c9b59fc4..000000000 --- a/models/intel_optimized_models/multinode/googlenet_16nodes/solver.prototxt +++ /dev/null @@ -1,27 +0,0 @@ -#This is Intel(R) optimized (in terms of time to train) version of solver for model described in the [GoogLeNet](http://arxiv.org/abs/1409.4842) publication. -#Original solver.prototxt can be found in /models/bvlc_googlenet/ directory of this repository. 
-#Differences: -#- base_lr is set to 0.065 -#- max_iter is set to 100000 -# -#- bias_filler value changed to 0.1 -# -#Top-5 and Top-1 results achieved with this version of solver: -#Top-5: 88.74% -#Top-1: 68.35% -#Training was performed using server equipped with Intel(R) Xeon Phi(TM) CPU 7250 processor. -net: "models/intel_optimized_models/multinode/googlenet_16nodes/train_val.prototxt" -#test_iter: 1000 -#test_interval: 10000 -#test_initialization: false -display: 40 -average_loss: 40 -base_lr: 0.065 -lr_policy: "poly" -power: 0.5 -max_iter: 100000 -momentum: 0.9 -weight_decay: 0.0002 -snapshot: 50000 -snapshot_prefix: "models/intel_optimized_models/multinode/googlenet_16nodes/googlenet" -solver_mode: CPU diff --git a/models/intel_optimized_models/multinode/googlenet_16nodes/train_val.prototxt b/models/intel_optimized_models/multinode/googlenet_16nodes/train_val.prototxt deleted file mode 100644 index f5276ab97..000000000 --- a/models/intel_optimized_models/multinode/googlenet_16nodes/train_val.prototxt +++ /dev/null @@ -1,2434 +0,0 @@ -name: "GoogleNet" -layer { - name: "data" - type: "Data" - top: "data" - top: "label" - include { - phase: TRAIN - } - transform_param { - mirror: true - crop_size: 224 - mean_value: 104 - mean_value: 117 - mean_value: 123 - } - data_param { - source: "examples/imagenet/ilsvrc12_train_lmdb" - batch_size: 64 - backend: LMDB - shuffle: true - } -} -layer { - name: "data" - type: "Data" - top: "data" - top: "label" - include { - phase: TEST - } - transform_param { - mirror: false - crop_size: 224 - mean_value: 104 - mean_value: 117 - mean_value: 123 - } - data_param { - source: "examples/imagenet/ilsvrc12_val_lmdb" - batch_size: 50 - backend: LMDB - } -} -layer { - name: "conv1/7x7_s2" - type: "Convolution" - bottom: "data" - top: "conv1/7x7_s2" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - pad: 3 - kernel_size: 7 - stride: 2 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "conv1/relu_7x7" - type: "ReLU" - bottom: "conv1/7x7_s2" - top: "conv1/7x7_s2" -} -layer { - name: "pool1/3x3_s2" - type: "Pooling" - bottom: "conv1/7x7_s2" - top: "pool1/3x3_s2" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "pool1/norm1" - type: "LRN" - bottom: "pool1/3x3_s2" - top: "pool1/norm1" - lrn_param { - local_size: 5 - alpha: 0.0001 - beta: 0.75 - } -} -layer { - name: "conv2/3x3_reduce" - type: "Convolution" - bottom: "pool1/norm1" - top: "conv2/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "conv2/relu_3x3_reduce" - type: "ReLU" - bottom: "conv2/3x3_reduce" - top: "conv2/3x3_reduce" -} -layer { - name: "conv2/3x3" - type: "Convolution" - bottom: "conv2/3x3_reduce" - top: "conv2/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 192 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "conv2/relu_3x3" - type: "ReLU" - bottom: "conv2/3x3" - top: "conv2/3x3" -} -layer { - name: "conv2/norm2" - type: "LRN" - bottom: "conv2/3x3" - top: "conv2/norm2" - lrn_param { - local_size: 5 - alpha: 0.0001 - beta: 0.75 - } -} -layer { 
- name: "pool2/3x3_s2" - type: "Pooling" - bottom: "conv2/norm2" - top: "pool2/3x3_s2" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "inception_3a/1x1" - type: "Convolution" - bottom: "pool2/3x3_s2" - top: "inception_3a/1x1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_3a/relu_1x1" - type: "ReLU" - bottom: "inception_3a/1x1" - top: "inception_3a/1x1" -} -layer { - name: "inception_3a/3x3_reduce" - type: "Convolution" - bottom: "pool2/3x3_s2" - top: "inception_3a/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 96 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_3a/relu_3x3_reduce" - type: "ReLU" - bottom: "inception_3a/3x3_reduce" - top: "inception_3a/3x3_reduce" -} -layer { - name: "inception_3a/3x3" - type: "Convolution" - bottom: "inception_3a/3x3_reduce" - top: "inception_3a/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_3a/relu_3x3" - type: "ReLU" - bottom: "inception_3a/3x3" - top: "inception_3a/3x3" -} -layer { - name: "inception_3a/5x5_reduce" - type: "Convolution" - bottom: "pool2/3x3_s2" - top: "inception_3a/5x5_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 16 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_3a/relu_5x5_reduce" - type: "ReLU" - bottom: "inception_3a/5x5_reduce" - top: "inception_3a/5x5_reduce" -} -layer { - name: "inception_3a/5x5" - type: "Convolution" - bottom: "inception_3a/5x5_reduce" - top: "inception_3a/5x5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 32 - pad: 2 - kernel_size: 5 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_3a/relu_5x5" - type: "ReLU" - bottom: "inception_3a/5x5" - top: "inception_3a/5x5" -} -layer { - name: "inception_3a/pool" - type: "Pooling" - bottom: "pool2/3x3_s2" - top: "inception_3a/pool" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 1 - pad: 1 - } -} -layer { - name: "inception_3a/pool_proj" - type: "Convolution" - bottom: "inception_3a/pool" - top: "inception_3a/pool_proj" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 32 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_3a/relu_pool_proj" - type: "ReLU" - bottom: "inception_3a/pool_proj" - top: "inception_3a/pool_proj" -} -layer { - name: "inception_3a/output" - type: "Concat" - bottom: "inception_3a/1x1" - bottom: "inception_3a/3x3" - bottom: "inception_3a/5x5" - bottom: "inception_3a/pool_proj" - top: "inception_3a/output" -} -layer { - name: "inception_3b/1x1" - type: "Convolution" - bottom: 
"inception_3a/output" - top: "inception_3b/1x1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_3b/relu_1x1" - type: "ReLU" - bottom: "inception_3b/1x1" - top: "inception_3b/1x1" -} -layer { - name: "inception_3b/3x3_reduce" - type: "Convolution" - bottom: "inception_3a/output" - top: "inception_3b/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_3b/relu_3x3_reduce" - type: "ReLU" - bottom: "inception_3b/3x3_reduce" - top: "inception_3b/3x3_reduce" -} -layer { - name: "inception_3b/3x3" - type: "Convolution" - bottom: "inception_3b/3x3_reduce" - top: "inception_3b/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 192 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_3b/relu_3x3" - type: "ReLU" - bottom: "inception_3b/3x3" - top: "inception_3b/3x3" -} -layer { - name: "inception_3b/5x5_reduce" - type: "Convolution" - bottom: "inception_3a/output" - top: "inception_3b/5x5_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 32 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_3b/relu_5x5_reduce" - type: "ReLU" - bottom: "inception_3b/5x5_reduce" - top: "inception_3b/5x5_reduce" -} -layer { - name: "inception_3b/5x5" - type: "Convolution" - bottom: "inception_3b/5x5_reduce" - top: "inception_3b/5x5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 96 - pad: 2 - kernel_size: 5 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_3b/relu_5x5" - type: "ReLU" - bottom: "inception_3b/5x5" - top: "inception_3b/5x5" -} -layer { - name: "inception_3b/pool" - type: "Pooling" - bottom: "inception_3a/output" - top: "inception_3b/pool" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 1 - pad: 1 - } -} -layer { - name: "inception_3b/pool_proj" - type: "Convolution" - bottom: "inception_3b/pool" - top: "inception_3b/pool_proj" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_3b/relu_pool_proj" - type: "ReLU" - bottom: "inception_3b/pool_proj" - top: "inception_3b/pool_proj" -} -layer { - name: "inception_3b/output" - type: "Concat" - bottom: "inception_3b/1x1" - bottom: "inception_3b/3x3" - bottom: "inception_3b/5x5" - bottom: "inception_3b/pool_proj" - top: "inception_3b/output" -} -layer { - name: "pool3/3x3_s2" - type: "Pooling" - bottom: "inception_3b/output" - top: "pool3/3x3_s2" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "inception_4a/1x1" - type: "Convolution" - bottom: "pool3/3x3_s2" - top: 
"inception_4a/1x1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 192 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4a/relu_1x1" - type: "ReLU" - bottom: "inception_4a/1x1" - top: "inception_4a/1x1" -} -layer { - name: "inception_4a/3x3_reduce" - type: "Convolution" - bottom: "pool3/3x3_s2" - top: "inception_4a/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 96 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4a/relu_3x3_reduce" - type: "ReLU" - bottom: "inception_4a/3x3_reduce" - top: "inception_4a/3x3_reduce" -} -layer { - name: "inception_4a/3x3" - type: "Convolution" - bottom: "inception_4a/3x3_reduce" - top: "inception_4a/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 208 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4a/relu_3x3" - type: "ReLU" - bottom: "inception_4a/3x3" - top: "inception_4a/3x3" -} -layer { - name: "inception_4a/5x5_reduce" - type: "Convolution" - bottom: "pool3/3x3_s2" - top: "inception_4a/5x5_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 16 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4a/relu_5x5_reduce" - type: "ReLU" - bottom: "inception_4a/5x5_reduce" - top: "inception_4a/5x5_reduce" -} -layer { - name: "inception_4a/5x5" - type: "Convolution" - bottom: "inception_4a/5x5_reduce" - top: "inception_4a/5x5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 48 - pad: 2 - kernel_size: 5 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4a/relu_5x5" - type: "ReLU" - bottom: "inception_4a/5x5" - top: "inception_4a/5x5" -} -layer { - name: "inception_4a/pool" - type: "Pooling" - bottom: "pool3/3x3_s2" - top: "inception_4a/pool" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 1 - pad: 1 - } -} -layer { - name: "inception_4a/pool_proj" - type: "Convolution" - bottom: "inception_4a/pool" - top: "inception_4a/pool_proj" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4a/relu_pool_proj" - type: "ReLU" - bottom: "inception_4a/pool_proj" - top: "inception_4a/pool_proj" -} -layer { - name: "inception_4a/output" - type: "Concat" - bottom: "inception_4a/1x1" - bottom: "inception_4a/3x3" - bottom: "inception_4a/5x5" - bottom: "inception_4a/pool_proj" - top: "inception_4a/output" -} -layer { - name: "loss1/ave_pool" - type: "Pooling" - bottom: "inception_4a/output" - top: "loss1/ave_pool" - pooling_param { - pool: AVE - kernel_size: 5 - stride: 3 - } -} -layer { - name: "loss1/conv" - type: "Convolution" - bottom: "loss1/ave_pool" - top: "loss1/conv" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - 
lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "loss1/relu_conv" - type: "ReLU" - bottom: "loss1/conv" - top: "loss1/conv" -} -layer { - name: "loss1/fc" - type: "InnerProduct" - bottom: "loss1/conv" - top: "loss1/fc" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 1024 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "loss1/relu_fc" - type: "ReLU" - bottom: "loss1/fc" - top: "loss1/fc" -} -layer { - name: "loss1/drop_fc" - type: "Dropout" - bottom: "loss1/fc" - top: "loss1/fc" - dropout_param { - dropout_ratio: 0.7 - } -} -layer { - name: "loss1/classifier" - type: "InnerProduct" - bottom: "loss1/fc" - top: "loss1/classifier" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 1000 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "loss1/loss" - type: "SoftmaxWithLoss" - bottom: "loss1/classifier" - bottom: "label" - top: "loss1/loss1" - loss_weight: 0.3 -} -layer { - name: "loss1/top-1" - type: "Accuracy" - bottom: "loss1/classifier" - bottom: "label" - top: "loss1/top-1" - include { - phase: TEST - } -} -layer { - name: "loss1/top-5" - type: "Accuracy" - bottom: "loss1/classifier" - bottom: "label" - top: "loss1/top-5" - include { - phase: TEST - } - accuracy_param { - top_k: 5 - } -} -layer { - name: "inception_4b/1x1" - type: "Convolution" - bottom: "inception_4a/output" - top: "inception_4b/1x1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 160 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4b/relu_1x1" - type: "ReLU" - bottom: "inception_4b/1x1" - top: "inception_4b/1x1" -} -layer { - name: "inception_4b/3x3_reduce" - type: "Convolution" - bottom: "inception_4a/output" - top: "inception_4b/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 112 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4b/relu_3x3_reduce" - type: "ReLU" - bottom: "inception_4b/3x3_reduce" - top: "inception_4b/3x3_reduce" -} -layer { - name: "inception_4b/3x3" - type: "Convolution" - bottom: "inception_4b/3x3_reduce" - top: "inception_4b/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 224 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4b/relu_3x3" - type: "ReLU" - bottom: "inception_4b/3x3" - top: "inception_4b/3x3" -} -layer { - name: "inception_4b/5x5_reduce" - type: "Convolution" - bottom: "inception_4a/output" - top: "inception_4b/5x5_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 24 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4b/relu_5x5_reduce" - type: "ReLU" - bottom: 
"inception_4b/5x5_reduce" - top: "inception_4b/5x5_reduce" -} -layer { - name: "inception_4b/5x5" - type: "Convolution" - bottom: "inception_4b/5x5_reduce" - top: "inception_4b/5x5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - pad: 2 - kernel_size: 5 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4b/relu_5x5" - type: "ReLU" - bottom: "inception_4b/5x5" - top: "inception_4b/5x5" -} -layer { - name: "inception_4b/pool" - type: "Pooling" - bottom: "inception_4a/output" - top: "inception_4b/pool" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 1 - pad: 1 - } -} -layer { - name: "inception_4b/pool_proj" - type: "Convolution" - bottom: "inception_4b/pool" - top: "inception_4b/pool_proj" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4b/relu_pool_proj" - type: "ReLU" - bottom: "inception_4b/pool_proj" - top: "inception_4b/pool_proj" -} -layer { - name: "inception_4b/output" - type: "Concat" - bottom: "inception_4b/1x1" - bottom: "inception_4b/3x3" - bottom: "inception_4b/5x5" - bottom: "inception_4b/pool_proj" - top: "inception_4b/output" -} -layer { - name: "inception_4c/1x1" - type: "Convolution" - bottom: "inception_4b/output" - top: "inception_4c/1x1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4c/relu_1x1" - type: "ReLU" - bottom: "inception_4c/1x1" - top: "inception_4c/1x1" -} -layer { - name: "inception_4c/3x3_reduce" - type: "Convolution" - bottom: "inception_4b/output" - top: "inception_4c/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4c/relu_3x3_reduce" - type: "ReLU" - bottom: "inception_4c/3x3_reduce" - top: "inception_4c/3x3_reduce" -} -layer { - name: "inception_4c/3x3" - type: "Convolution" - bottom: "inception_4c/3x3_reduce" - top: "inception_4c/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4c/relu_3x3" - type: "ReLU" - bottom: "inception_4c/3x3" - top: "inception_4c/3x3" -} -layer { - name: "inception_4c/5x5_reduce" - type: "Convolution" - bottom: "inception_4b/output" - top: "inception_4c/5x5_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 24 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4c/relu_5x5_reduce" - type: "ReLU" - bottom: "inception_4c/5x5_reduce" - top: "inception_4c/5x5_reduce" -} -layer { - name: "inception_4c/5x5" - type: "Convolution" - bottom: "inception_4c/5x5_reduce" - top: "inception_4c/5x5" - param { - lr_mult: 1 - 
decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - pad: 2 - kernel_size: 5 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4c/relu_5x5" - type: "ReLU" - bottom: "inception_4c/5x5" - top: "inception_4c/5x5" -} -layer { - name: "inception_4c/pool" - type: "Pooling" - bottom: "inception_4b/output" - top: "inception_4c/pool" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 1 - pad: 1 - } -} -layer { - name: "inception_4c/pool_proj" - type: "Convolution" - bottom: "inception_4c/pool" - top: "inception_4c/pool_proj" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4c/relu_pool_proj" - type: "ReLU" - bottom: "inception_4c/pool_proj" - top: "inception_4c/pool_proj" -} -layer { - name: "inception_4c/output" - type: "Concat" - bottom: "inception_4c/1x1" - bottom: "inception_4c/3x3" - bottom: "inception_4c/5x5" - bottom: "inception_4c/pool_proj" - top: "inception_4c/output" -} -layer { - name: "inception_4d/1x1" - type: "Convolution" - bottom: "inception_4c/output" - top: "inception_4d/1x1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 112 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4d/relu_1x1" - type: "ReLU" - bottom: "inception_4d/1x1" - top: "inception_4d/1x1" -} -layer { - name: "inception_4d/3x3_reduce" - type: "Convolution" - bottom: "inception_4c/output" - top: "inception_4d/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 144 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4d/relu_3x3_reduce" - type: "ReLU" - bottom: "inception_4d/3x3_reduce" - top: "inception_4d/3x3_reduce" -} -layer { - name: "inception_4d/3x3" - type: "Convolution" - bottom: "inception_4d/3x3_reduce" - top: "inception_4d/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 288 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4d/relu_3x3" - type: "ReLU" - bottom: "inception_4d/3x3" - top: "inception_4d/3x3" -} -layer { - name: "inception_4d/5x5_reduce" - type: "Convolution" - bottom: "inception_4c/output" - top: "inception_4d/5x5_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 32 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4d/relu_5x5_reduce" - type: "ReLU" - bottom: "inception_4d/5x5_reduce" - top: "inception_4d/5x5_reduce" -} -layer { - name: "inception_4d/5x5" - type: "Convolution" - bottom: "inception_4d/5x5_reduce" - top: "inception_4d/5x5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - pad: 2 - kernel_size: 5 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 
0.1 - } - } -} -layer { - name: "inception_4d/relu_5x5" - type: "ReLU" - bottom: "inception_4d/5x5" - top: "inception_4d/5x5" -} -layer { - name: "inception_4d/pool" - type: "Pooling" - bottom: "inception_4c/output" - top: "inception_4d/pool" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 1 - pad: 1 - } -} -layer { - name: "inception_4d/pool_proj" - type: "Convolution" - bottom: "inception_4d/pool" - top: "inception_4d/pool_proj" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4d/relu_pool_proj" - type: "ReLU" - bottom: "inception_4d/pool_proj" - top: "inception_4d/pool_proj" -} -layer { - name: "inception_4d/output" - type: "Concat" - bottom: "inception_4d/1x1" - bottom: "inception_4d/3x3" - bottom: "inception_4d/5x5" - bottom: "inception_4d/pool_proj" - top: "inception_4d/output" -} -layer { - name: "loss2/ave_pool" - type: "Pooling" - bottom: "inception_4d/output" - top: "loss2/ave_pool" - pooling_param { - pool: AVE - kernel_size: 5 - stride: 3 - } -} -layer { - name: "loss2/conv" - type: "Convolution" - bottom: "loss2/ave_pool" - top: "loss2/conv" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "loss2/relu_conv" - type: "ReLU" - bottom: "loss2/conv" - top: "loss2/conv" -} -layer { - name: "loss2/fc" - type: "InnerProduct" - bottom: "loss2/conv" - top: "loss2/fc" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 1024 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "loss2/relu_fc" - type: "ReLU" - bottom: "loss2/fc" - top: "loss2/fc" -} -layer { - name: "loss2/drop_fc" - type: "Dropout" - bottom: "loss2/fc" - top: "loss2/fc" - dropout_param { - dropout_ratio: 0.7 - } -} -layer { - name: "loss2/classifier" - type: "InnerProduct" - bottom: "loss2/fc" - top: "loss2/classifier" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 1000 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "loss2/loss" - type: "SoftmaxWithLoss" - bottom: "loss2/classifier" - bottom: "label" - top: "loss2/loss1" - loss_weight: 0.3 -} -layer { - name: "loss2/top-1" - type: "Accuracy" - bottom: "loss2/classifier" - bottom: "label" - top: "loss2/top-1" - include { - phase: TEST - } -} -layer { - name: "loss2/top-5" - type: "Accuracy" - bottom: "loss2/classifier" - bottom: "label" - top: "loss2/top-5" - include { - phase: TEST - } - accuracy_param { - top_k: 5 - } -} -layer { - name: "inception_4e/1x1" - type: "Convolution" - bottom: "inception_4d/output" - top: "inception_4e/1x1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4e/relu_1x1" - type: "ReLU" - bottom: "inception_4e/1x1" - top: "inception_4e/1x1" -} -layer { - name: "inception_4e/3x3_reduce" - type: "Convolution" - bottom: 
"inception_4d/output" - top: "inception_4e/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 160 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4e/relu_3x3_reduce" - type: "ReLU" - bottom: "inception_4e/3x3_reduce" - top: "inception_4e/3x3_reduce" -} -layer { - name: "inception_4e/3x3" - type: "Convolution" - bottom: "inception_4e/3x3_reduce" - top: "inception_4e/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 320 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4e/relu_3x3" - type: "ReLU" - bottom: "inception_4e/3x3" - top: "inception_4e/3x3" -} -layer { - name: "inception_4e/5x5_reduce" - type: "Convolution" - bottom: "inception_4d/output" - top: "inception_4e/5x5_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 32 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4e/relu_5x5_reduce" - type: "ReLU" - bottom: "inception_4e/5x5_reduce" - top: "inception_4e/5x5_reduce" -} -layer { - name: "inception_4e/5x5" - type: "Convolution" - bottom: "inception_4e/5x5_reduce" - top: "inception_4e/5x5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - pad: 2 - kernel_size: 5 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4e/relu_5x5" - type: "ReLU" - bottom: "inception_4e/5x5" - top: "inception_4e/5x5" -} -layer { - name: "inception_4e/pool" - type: "Pooling" - bottom: "inception_4d/output" - top: "inception_4e/pool" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 1 - pad: 1 - } -} -layer { - name: "inception_4e/pool_proj" - type: "Convolution" - bottom: "inception_4e/pool" - top: "inception_4e/pool_proj" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_4e/relu_pool_proj" - type: "ReLU" - bottom: "inception_4e/pool_proj" - top: "inception_4e/pool_proj" -} -layer { - name: "inception_4e/output" - type: "Concat" - bottom: "inception_4e/1x1" - bottom: "inception_4e/3x3" - bottom: "inception_4e/5x5" - bottom: "inception_4e/pool_proj" - top: "inception_4e/output" -} -layer { - name: "pool4/3x3_s2" - type: "Pooling" - bottom: "inception_4e/output" - top: "pool4/3x3_s2" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "inception_5a/1x1" - type: "Convolution" - bottom: "pool4/3x3_s2" - top: "inception_5a/1x1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_5a/relu_1x1" - type: "ReLU" - bottom: "inception_5a/1x1" - top: "inception_5a/1x1" -} -layer { - name: "inception_5a/3x3_reduce" - type: "Convolution" - bottom: "pool4/3x3_s2" - top: 
"inception_5a/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 160 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_5a/relu_3x3_reduce" - type: "ReLU" - bottom: "inception_5a/3x3_reduce" - top: "inception_5a/3x3_reduce" -} -layer { - name: "inception_5a/3x3" - type: "Convolution" - bottom: "inception_5a/3x3_reduce" - top: "inception_5a/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 320 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_5a/relu_3x3" - type: "ReLU" - bottom: "inception_5a/3x3" - top: "inception_5a/3x3" -} -layer { - name: "inception_5a/5x5_reduce" - type: "Convolution" - bottom: "pool4/3x3_s2" - top: "inception_5a/5x5_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 32 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_5a/relu_5x5_reduce" - type: "ReLU" - bottom: "inception_5a/5x5_reduce" - top: "inception_5a/5x5_reduce" -} -layer { - name: "inception_5a/5x5" - type: "Convolution" - bottom: "inception_5a/5x5_reduce" - top: "inception_5a/5x5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - pad: 2 - kernel_size: 5 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_5a/relu_5x5" - type: "ReLU" - bottom: "inception_5a/5x5" - top: "inception_5a/5x5" -} -layer { - name: "inception_5a/pool" - type: "Pooling" - bottom: "pool4/3x3_s2" - top: "inception_5a/pool" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 1 - pad: 1 - } -} -layer { - name: "inception_5a/pool_proj" - type: "Convolution" - bottom: "inception_5a/pool" - top: "inception_5a/pool_proj" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_5a/relu_pool_proj" - type: "ReLU" - bottom: "inception_5a/pool_proj" - top: "inception_5a/pool_proj" -} -layer { - name: "inception_5a/output" - type: "Concat" - bottom: "inception_5a/1x1" - bottom: "inception_5a/3x3" - bottom: "inception_5a/5x5" - bottom: "inception_5a/pool_proj" - top: "inception_5a/output" -} -layer { - name: "inception_5b/1x1" - type: "Convolution" - bottom: "inception_5a/output" - top: "inception_5b/1x1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 384 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_5b/relu_1x1" - type: "ReLU" - bottom: "inception_5b/1x1" - top: "inception_5b/1x1" -} -layer { - name: "inception_5b/3x3_reduce" - type: "Convolution" - bottom: "inception_5a/output" - top: "inception_5b/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 192 - kernel_size: 1 - weight_filler { - type: "xavier" - } - 
bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_5b/relu_3x3_reduce" - type: "ReLU" - bottom: "inception_5b/3x3_reduce" - top: "inception_5b/3x3_reduce" -} -layer { - name: "inception_5b/3x3" - type: "Convolution" - bottom: "inception_5b/3x3_reduce" - top: "inception_5b/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 384 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_5b/relu_3x3" - type: "ReLU" - bottom: "inception_5b/3x3" - top: "inception_5b/3x3" -} -layer { - name: "inception_5b/5x5_reduce" - type: "Convolution" - bottom: "inception_5a/output" - top: "inception_5b/5x5_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 48 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_5b/relu_5x5_reduce" - type: "ReLU" - bottom: "inception_5b/5x5_reduce" - top: "inception_5b/5x5_reduce" -} -layer { - name: "inception_5b/5x5" - type: "Convolution" - bottom: "inception_5b/5x5_reduce" - top: "inception_5b/5x5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - pad: 2 - kernel_size: 5 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_5b/relu_5x5" - type: "ReLU" - bottom: "inception_5b/5x5" - top: "inception_5b/5x5" -} -layer { - name: "inception_5b/pool" - type: "Pooling" - bottom: "inception_5a/output" - top: "inception_5b/pool" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 1 - pad: 1 - } -} -layer { - name: "inception_5b/pool_proj" - type: "Convolution" - bottom: "inception_5b/pool" - top: "inception_5b/pool_proj" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "inception_5b/relu_pool_proj" - type: "ReLU" - bottom: "inception_5b/pool_proj" - top: "inception_5b/pool_proj" -} -layer { - name: "inception_5b/output" - type: "Concat" - bottom: "inception_5b/1x1" - bottom: "inception_5b/3x3" - bottom: "inception_5b/5x5" - bottom: "inception_5b/pool_proj" - top: "inception_5b/output" -} -layer { - name: "pool5/7x7_s1" - type: "Pooling" - bottom: "inception_5b/output" - top: "pool5/7x7_s1" - pooling_param { - pool: AVE - kernel_size: 7 - stride: 1 - } -} -layer { - name: "pool5/drop_7x7_s1" - type: "Dropout" - bottom: "pool5/7x7_s1" - top: "pool5/7x7_s1" - dropout_param { - dropout_ratio: 0.4 - } -} -layer { - name: "loss3/classifier" - type: "InnerProduct" - bottom: "pool5/7x7_s1" - top: "loss3/classifier" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 1000 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "loss3/loss3" - type: "SoftmaxWithLoss" - bottom: "loss3/classifier" - bottom: "label" - top: "loss3/loss3" - loss_weight: 1 -} -layer { - name: "loss3/top-1" - type: "Accuracy" - bottom: "loss3/classifier" - bottom: "label" - top: "loss3/top-1" - include { - phase: TEST - } -} -layer { - name: "loss3/top-5" - type: 
"Accuracy" - bottom: "loss3/classifier" - bottom: "label" - top: "loss3/top-5" - include { - phase: TEST - } - accuracy_param { - top_k: 5 - } -} diff --git a/models/intel_optimized_models/multinode/googlenet_v2_4nodes/solver.prototxt b/models/intel_optimized_models/multinode/googlenet_v2_4nodes/solver.prototxt deleted file mode 100644 index dda5240f3..000000000 --- a/models/intel_optimized_models/multinode/googlenet_v2_4nodes/solver.prototxt +++ /dev/null @@ -1,24 +0,0 @@ -#This is Intel(R) optimized (in terms of time to train) version of solver for model GoogLeNet v2. -#Original solver.prototxt can be found in /models/default_resnet_50/ directory of this repository. -#Differences: -#- lr_policy is set to poly instead of step -#- base_lr is set to 0.05 -#- max_iter is decreased to 100000 -# -#Top-5 and Top-1 results achieved with this version of solver: -#Top-5: 89.40% -#Top-1: 69.02% -#Training was performed using server equipped with Intel(R) Xeon Phi(TM) CPU 7250 processor. - -net: "models/intel_optimized_models/multinode/googlenet_v2_4nodes/train_val.prototxt" -base_lr: 0.05 -display: 40 -max_iter: 100000 -lr_policy: "poly" -power: 0.5 -momentum: 0.9 -weight_decay: 0.0002 -snapshot: 10000 -snapshot_prefix: "models/intel_optimized_models/multinode/googlenet_v2_4nodes/default_googlenet_v2" -solver_mode: CPU -average_loss: 40 \ No newline at end of file diff --git a/models/intel_optimized_models/multinode/googlenet_v2_8nodes/solver.prototxt b/models/intel_optimized_models/multinode/googlenet_v2_8nodes/solver.prototxt new file mode 100644 index 000000000..a39aedfe5 --- /dev/null +++ b/models/intel_optimized_models/multinode/googlenet_v2_8nodes/solver.prototxt @@ -0,0 +1,15 @@ +net: "models/intel_optimized_models/multinode/googlenet_v2_8nodes/train_val.prototxt" +test_iter: 1000 +test_interval: 10000 +test_initialization: false +base_lr: 0.06 +display: 40 +max_iter: 182000 +lr_policy: "poly" +power: 0.5 +momentum: 0.9 +weight_decay: 0.0002 +snapshot: 10000 +snapshot_prefix: "models/intel_optimized_models/multinode/googlenet_v2_8nodes/default_googlenet_v2" +solver_mode: CPU +average_loss: 40 diff --git a/models/intel_optimized_models/multinode/googlenet_v2_4nodes/train_val.prototxt b/models/intel_optimized_models/multinode/googlenet_v2_8nodes/train_val.prototxt similarity index 100% rename from models/intel_optimized_models/multinode/googlenet_v2_4nodes/train_val.prototxt rename to models/intel_optimized_models/multinode/googlenet_v2_8nodes/train_val.prototxt diff --git a/models/intel_optimized_models/multinode/resnet_50_16_nodes/solver.prototxt b/models/intel_optimized_models/multinode/resnet_50_16_nodes/solver.prototxt deleted file mode 100644 index a66f60dfa..000000000 --- a/models/intel_optimized_models/multinode/resnet_50_16_nodes/solver.prototxt +++ /dev/null @@ -1,15 +0,0 @@ -#This solver is described by Computer Vision Group Jena (CVGJ) in [ImageNet pre-trained models with batch normalization] (https://arxiv.org/pdf/1612.01452.pdf) -net: "models/intel_optimized_models/multinode/resnet_50_16_nodes/train_val.prototxt" -#test_iter: 5000 -#test_interval: 15000 -#test_initialization: false -base_lr: 0.1 -display: 20 -max_iter: 320000 -lr_policy: "poly" -power: 1 -momentum: 0.9 -weight_decay: 0.0001 -snapshot: 30000 -snapshot_prefix: "caffe-resnet50" -solver_mode: CPU diff --git a/models/intel_optimized_models/multinode/resnet_50_16_nodes/train_val.prototxt b/models/intel_optimized_models/multinode/resnet_50_16_nodes/train_val.prototxt deleted file mode 100644 index 71b07d00a..000000000 
--- a/models/intel_optimized_models/multinode/resnet_50_16_nodes/train_val.prototxt
+++ /dev/null
@@ -1,2306 +0,0 @@
-#This is Intel(R) optimized (in terms of time to train) version of topology described in the [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) publication.
-#
-#Top-5 and Top-1 results achieved with this topology:
-#Top-5: 92%
-#Top-1: 73.9%
-#Training was performed using server equipped with Intel(R) Xeon Phi(TM) CPU 7250 processor.
-
-layer {
-name: "data"
-type: "Data"
-top: "data"
-top: "label"
-include {
- phase: TRAIN
-}
-transform_param {
- scale: 0.0078125
- mirror: true
- crop_size: 224
- mean_value: 104
- mean_value: 117
- mean_value: 123
-}
- data_param {
- source: "examples/imagenet/ilsvrc12_train_lmdb"
- batch_size: 16
- backend: LMDB
- shuffle: true
- }
-
-}
-layer {
-name: "data"
-type: "Data"
-top: "data"
-top: "label"
-include {
- phase: TEST
-}
-transform_param {
- scale: 0.0078125
- mirror: false
- crop_size: 224
- mean_value: 104
- mean_value: 117
- mean_value: 123
-}
- data_param {
- source: "examples/imagenet/ilsvrc12_val_lmdb"
- batch_size: 10
- backend: LMDB
- }
-
-}
-
-layer {
-name: "conv1"
-type: "Convolution"
-bottom: "data"
-top: "conv1"
-param {
- lr_mult: 1.0
- decay_mult: 1.0
-}
-param {
- lr_mult: 2.0
- decay_mult: 1.0
-}
-convolution_param {
- num_output: 64
- pad: 3
- kernel_size: 7
- stride: 2
- weight_filler {
- type: "msra"
- variance_norm: FAN_OUT
- }
- bias_filler {
- type: "constant"
- value: 0.0
- }
-}
-
-}
-layer {
-name: "conv1_bn"
-type: "BatchNorm"
-bottom: "conv1"
-top: "conv1_pcs_arm_sim"
- batch_norm_param {
- }
-}
-layer {
-name: "conv1_relu"
-type: "ReLU"
-bottom: "conv1_pcs_arm_sim"
-top: "conv1_pcs_arm_sim"
-
-}
-layer {
-name: "conv1_pool"
-type: "Pooling"
-bottom: "conv1_pcs_arm_sim"
-top: "conv1_pool"
-pooling_param {
- kernel_size: 3
- stride: 2
-}
-
-}
-layer {
-name: "layer_64_1_conv1"
-type: "Convolution"
-bottom: "conv1_pool"
-top: "layer_64_1_conv1"
-param {
- lr_mult: 1.0
- decay_mult: 1.0
-}
-convolution_param {
- num_output: 64
- bias_term: false
- pad: 0
- kernel_size: 1
- stride: 1
- weight_filler {
- type: "msra"
- }
- bias_filler {
- type: "constant"
- value: 0.0
- }
-}
-
-}
-layer {
-name: "layer_64_1_bn2"
-type: "BatchNorm"
-bottom: "layer_64_1_conv1"
-top: "layer_64_1_conv1_pcs_arm_sim"
- batch_norm_param {
- }
-}
-layer {
-name: "layer_64_1_relu2"
-type: "ReLU"
-bottom: "layer_64_1_conv1_pcs_arm_sim"
-top: "layer_64_1_conv1_pcs_arm_sim"
-
-}
-layer {
-name: "layer_64_1_conv2"
-type: "Convolution"
-bottom: "layer_64_1_conv1_pcs_arm_sim"
-top: "layer_64_1_conv2"
-param {
- lr_mult: 1.0
- decay_mult: 1.0
-}
-convolution_param {
- num_output: 64
- bias_term: false
- pad: 1
- kernel_size: 3
- stride: 1
- weight_filler {
- type: "msra"
- }
- bias_filler {
- type: "constant"
- value: 0.0
- }
-}
-
-}
-layer {
-name: "layer_64_1_bn3"
-type: "BatchNorm"
-bottom: "layer_64_1_conv2"
-top: "layer_64_1_conv2_pcs_arm_sim"
- batch_norm_param {
- }
-}
-layer {
-name: "layer_64_1_relu3"
-type: "ReLU"
-bottom: "layer_64_1_conv2_pcs_arm_sim"
-top: "layer_64_1_conv2_pcs_arm_sim"
-
-}
-layer {
-name: "layer_64_1_conv3"
-type: "Convolution"
-bottom: "layer_64_1_conv2_pcs_arm_sim"
-top: "layer_64_1_conv3"
-param {
- lr_mult: 1.0
- decay_mult: 1.0
-}
-convolution_param {
- num_output: 256
- bias_term: false
- pad: 0
- kernel_size: 1
- stride: 1
- weight_filler {
- type: "msra"
- }
- bias_filler {
- type: "constant"
- value: 0.0
- }
-}
-
-}
-layer {
-name: "layer_64_1_conv_expand"
-type:
"Convolution" -bottom: "layer_64_1_conv1_pcs_arm_sim" -top: "layer_64_1_conv_expand" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_64_1_sum" -type: "Eltwise" -bottom: "layer_64_1_conv3" -bottom: "layer_64_1_conv_expand" -top: "layer_64_1_sum" - -} -layer { -name: "layer_64_2_bn1" -type: "BatchNorm" -bottom: "layer_64_1_sum" -top: "layer_64_2_bn1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_64_2_relu1" -type: "ReLU" -bottom: "layer_64_2_bn1_pcs_arm_sim" -top: "layer_64_2_bn1_pcs_arm_sim" - -} -layer { -name: "layer_64_2_conv1" -type: "Convolution" -bottom: "layer_64_2_bn1_pcs_arm_sim" -top: "layer_64_2_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 64 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_64_2_bn2" -type: "BatchNorm" -bottom: "layer_64_2_conv1" -top: "layer_64_2_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_64_2_relu2" -type: "ReLU" -bottom: "layer_64_2_conv1_pcs_arm_sim" -top: "layer_64_2_conv1_pcs_arm_sim" - -} -layer { -name: "layer_64_2_conv2" -type: "Convolution" -bottom: "layer_64_2_conv1_pcs_arm_sim" -top: "layer_64_2_conv2" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 64 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_64_2_bn3" -type: "BatchNorm" -bottom: "layer_64_2_conv2" -top: "layer_64_2_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_64_2_relu3" -type: "ReLU" -bottom: "layer_64_2_conv2_pcs_arm_sim" -top: "layer_64_2_conv2_pcs_arm_sim" - -} -layer { -name: "layer_64_2_conv3" -type: "Convolution" -bottom: "layer_64_2_conv2_pcs_arm_sim" -top: "layer_64_2_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_64_2_sum" -type: "Eltwise" -bottom: "layer_64_2_conv3" -bottom: "layer_64_1_sum" -top: "layer_64_2_sum" - -} -layer { -name: "layer_64_3_bn1" -type: "BatchNorm" -bottom: "layer_64_2_sum" -top: "layer_64_3_bn1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_64_3_relu1" -type: "ReLU" -bottom: "layer_64_3_bn1_pcs_arm_sim" -top: "layer_64_3_bn1_pcs_arm_sim" - -} -layer { -name: "layer_64_3_conv1" -type: "Convolution" -bottom: "layer_64_3_bn1_pcs_arm_sim" -top: "layer_64_3_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 64 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_64_3_bn2" -type: "BatchNorm" -bottom: "layer_64_3_conv1" -top: "layer_64_3_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_64_3_relu2" -type: "ReLU" -bottom: "layer_64_3_conv1_pcs_arm_sim" -top: "layer_64_3_conv1_pcs_arm_sim" - -} -layer { -name: "layer_64_3_conv2" -type: "Convolution" -bottom: "layer_64_3_conv1_pcs_arm_sim" -top: "layer_64_3_conv2" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} 
-convolution_param { - num_output: 64 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_64_3_bn3" -type: "BatchNorm" -bottom: "layer_64_3_conv2" -top: "layer_64_3_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_64_3_relu3" -type: "ReLU" -bottom: "layer_64_3_conv2_pcs_arm_sim" -top: "layer_64_3_conv2_pcs_arm_sim" - -} -layer { -name: "layer_64_3_conv3" -type: "Convolution" -bottom: "layer_64_3_conv2_pcs_arm_sim" -top: "layer_64_3_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_64_3_sum" -type: "Eltwise" -bottom: "layer_64_3_conv3" -bottom: "layer_64_2_sum" -top: "layer_64_3_sum" - -} -layer { -name: "layer_128_1_bn1" -type: "BatchNorm" -bottom: "layer_64_3_sum" -top: "layer_128_1_bn1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_128_1_relu1" -type: "ReLU" -bottom: "layer_128_1_bn1_pcs_arm_sim" -top: "layer_128_1_bn1_pcs_arm_sim" - -} -layer { -name: "layer_128_1_conv1" -type: "Convolution" -bottom: "layer_128_1_bn1_pcs_arm_sim" -top: "layer_128_1_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 128 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_128_1_bn2" -type: "BatchNorm" -bottom: "layer_128_1_conv1" -top: "layer_128_1_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_128_1_relu2" -type: "ReLU" -bottom: "layer_128_1_conv1_pcs_arm_sim" -top: "layer_128_1_conv1_pcs_arm_sim" - -} -layer { -name: "layer_128_1_conv2" -type: "Convolution" -bottom: "layer_128_1_conv1_pcs_arm_sim" -top: "layer_128_1_conv2" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 128 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 2 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_128_1_bn3" -type: "BatchNorm" -bottom: "layer_128_1_conv2" -top: "layer_128_1_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_128_1_relu3" -type: "ReLU" -bottom: "layer_128_1_conv2_pcs_arm_sim" -top: "layer_128_1_conv2_pcs_arm_sim" - -} -layer { -name: "layer_128_1_conv3" -type: "Convolution" -bottom: "layer_128_1_conv2_pcs_arm_sim" -top: "layer_128_1_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 512 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_128_1_conv_expand" -type: "Convolution" -bottom: "layer_128_1_bn1_pcs_arm_sim" -top: "layer_128_1_conv_expand" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 512 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 2 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_128_1_sum" -type: "Eltwise" -bottom: "layer_128_1_conv3" -bottom: "layer_128_1_conv_expand" -top: "layer_128_1_sum" - -} -layer { -name: "layer_128_2_bn1" -type: "BatchNorm" -bottom: "layer_128_1_sum" -top: "layer_128_2_bn1_pcs_arm_sim" - batch_norm_param { - } -} -layer { 
-name: "layer_128_2_relu1" -type: "ReLU" -bottom: "layer_128_2_bn1_pcs_arm_sim" -top: "layer_128_2_bn1_pcs_arm_sim" - -} -layer { -name: "layer_128_2_conv1" -type: "Convolution" -bottom: "layer_128_2_bn1_pcs_arm_sim" -top: "layer_128_2_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 128 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_128_2_bn2" -type: "BatchNorm" -bottom: "layer_128_2_conv1" -top: "layer_128_2_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_128_2_relu2" -type: "ReLU" -bottom: "layer_128_2_conv1_pcs_arm_sim" -top: "layer_128_2_conv1_pcs_arm_sim" - -} -layer { -name: "layer_128_2_conv2" -type: "Convolution" -bottom: "layer_128_2_conv1_pcs_arm_sim" -top: "layer_128_2_conv2" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 128 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_128_2_bn3" -type: "BatchNorm" -bottom: "layer_128_2_conv2" -top: "layer_128_2_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_128_2_relu3" -type: "ReLU" -bottom: "layer_128_2_conv2_pcs_arm_sim" -top: "layer_128_2_conv2_pcs_arm_sim" - -} -layer { -name: "layer_128_2_conv3" -type: "Convolution" -bottom: "layer_128_2_conv2_pcs_arm_sim" -top: "layer_128_2_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 512 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_128_2_sum" -type: "Eltwise" -bottom: "layer_128_2_conv3" -bottom: "layer_128_1_sum" -top: "layer_128_2_sum" - -} -layer { -name: "layer_128_3_bn1" -type: "BatchNorm" -bottom: "layer_128_2_sum" -top: "layer_128_3_bn1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_128_3_relu1" -type: "ReLU" -bottom: "layer_128_3_bn1_pcs_arm_sim" -top: "layer_128_3_bn1_pcs_arm_sim" - -} -layer { -name: "layer_128_3_conv1" -type: "Convolution" -bottom: "layer_128_3_bn1_pcs_arm_sim" -top: "layer_128_3_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 128 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_128_3_bn2" -type: "BatchNorm" -bottom: "layer_128_3_conv1" -top: "layer_128_3_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_128_3_relu2" -type: "ReLU" -bottom: "layer_128_3_conv1_pcs_arm_sim" -top: "layer_128_3_conv1_pcs_arm_sim" - -} -layer { -name: "layer_128_3_conv2" -type: "Convolution" -bottom: "layer_128_3_conv1_pcs_arm_sim" -top: "layer_128_3_conv2" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 128 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_128_3_bn3" -type: "BatchNorm" -bottom: "layer_128_3_conv2" -top: "layer_128_3_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_128_3_relu3" -type: "ReLU" -bottom: "layer_128_3_conv2_pcs_arm_sim" -top: "layer_128_3_conv2_pcs_arm_sim" - -} -layer { -name: "layer_128_3_conv3" -type: "Convolution" -bottom: 
"layer_128_3_conv2_pcs_arm_sim" -top: "layer_128_3_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 512 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_128_3_sum" -type: "Eltwise" -bottom: "layer_128_3_conv3" -bottom: "layer_128_2_sum" -top: "layer_128_3_sum" - -} -layer { -name: "layer_128_4_bn1" -type: "BatchNorm" -bottom: "layer_128_3_sum" -top: "layer_128_4_bn1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_128_4_relu1" -type: "ReLU" -bottom: "layer_128_4_bn1_pcs_arm_sim" -top: "layer_128_4_bn1_pcs_arm_sim" - -} -layer { -name: "layer_128_4_conv1" -type: "Convolution" -bottom: "layer_128_4_bn1_pcs_arm_sim" -top: "layer_128_4_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 128 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_128_4_bn2" -type: "BatchNorm" -bottom: "layer_128_4_conv1" -top: "layer_128_4_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_128_4_relu2" -type: "ReLU" -bottom: "layer_128_4_conv1_pcs_arm_sim" -top: "layer_128_4_conv1_pcs_arm_sim" - -} -layer { -name: "layer_128_4_conv2" -type: "Convolution" -bottom: "layer_128_4_conv1_pcs_arm_sim" -top: "layer_128_4_conv2" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 128 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_128_4_bn3" -type: "BatchNorm" -bottom: "layer_128_4_conv2" -top: "layer_128_4_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_128_4_relu3" -type: "ReLU" -bottom: "layer_128_4_conv2_pcs_arm_sim" -top: "layer_128_4_conv2_pcs_arm_sim" - -} -layer { -name: "layer_128_4_conv3" -type: "Convolution" -bottom: "layer_128_4_conv2_pcs_arm_sim" -top: "layer_128_4_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 512 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_128_4_sum" -type: "Eltwise" -bottom: "layer_128_4_conv3" -bottom: "layer_128_3_sum" -top: "layer_128_4_sum" - -} -layer { -name: "layer_256_1_bn1" -type: "BatchNorm" -bottom: "layer_128_4_sum" -top: "layer_256_1_bn1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_1_relu1" -type: "ReLU" -bottom: "layer_256_1_bn1_pcs_arm_sim" -top: "layer_256_1_bn1_pcs_arm_sim" - -} -layer { -name: "layer_256_1_conv1" -type: "Convolution" -bottom: "layer_256_1_bn1_pcs_arm_sim" -top: "layer_256_1_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_1_bn2" -type: "BatchNorm" -bottom: "layer_256_1_conv1" -top: "layer_256_1_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_1_relu2" -type: "ReLU" -bottom: "layer_256_1_conv1_pcs_arm_sim" -top: "layer_256_1_conv1_pcs_arm_sim" - -} -layer { -name: "layer_256_1_conv2" -type: "Convolution" -bottom: "layer_256_1_conv1_pcs_arm_sim" -top: "layer_256_1_conv2" -param { - lr_mult: 1.0 
- decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 2 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_1_bn3" -type: "BatchNorm" -bottom: "layer_256_1_conv2" -top: "layer_256_1_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_1_relu3" -type: "ReLU" -bottom: "layer_256_1_conv2_pcs_arm_sim" -top: "layer_256_1_conv2_pcs_arm_sim" - -} -layer { -name: "layer_256_1_conv3" -type: "Convolution" -bottom: "layer_256_1_conv2_pcs_arm_sim" -top: "layer_256_1_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 1024 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_1_conv_expand" -type: "Convolution" -bottom: "layer_256_1_bn1_pcs_arm_sim" -top: "layer_256_1_conv_expand" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 1024 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 2 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_1_sum" -type: "Eltwise" -bottom: "layer_256_1_conv3" -bottom: "layer_256_1_conv_expand" -top: "layer_256_1_sum" - -} -layer { -name: "layer_256_2_bn1" -type: "BatchNorm" -bottom: "layer_256_1_sum" -top: "layer_256_2_bn1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_2_relu1" -type: "ReLU" -bottom: "layer_256_2_bn1_pcs_arm_sim" -top: "layer_256_2_bn1_pcs_arm_sim" - -} -layer { -name: "layer_256_2_conv1" -type: "Convolution" -bottom: "layer_256_2_bn1_pcs_arm_sim" -top: "layer_256_2_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_2_bn2" -type: "BatchNorm" -bottom: "layer_256_2_conv1" -top: "layer_256_2_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_2_relu2" -type: "ReLU" -bottom: "layer_256_2_conv1_pcs_arm_sim" -top: "layer_256_2_conv1_pcs_arm_sim" - -} -layer { -name: "layer_256_2_conv2" -type: "Convolution" -bottom: "layer_256_2_conv1_pcs_arm_sim" -top: "layer_256_2_conv2" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_2_bn3" -type: "BatchNorm" -bottom: "layer_256_2_conv2" -top: "layer_256_2_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_2_relu3" -type: "ReLU" -bottom: "layer_256_2_conv2_pcs_arm_sim" -top: "layer_256_2_conv2_pcs_arm_sim" - -} -layer { -name: "layer_256_2_conv3" -type: "Convolution" -bottom: "layer_256_2_conv2_pcs_arm_sim" -top: "layer_256_2_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 1024 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_2_sum" -type: "Eltwise" -bottom: "layer_256_2_conv3" -bottom: "layer_256_1_sum" -top: "layer_256_2_sum" - -} -layer { -name: "layer_256_3_bn1" -type: "BatchNorm" -bottom: "layer_256_2_sum" -top: "layer_256_3_bn1_pcs_arm_sim" - 
batch_norm_param { - } -} -layer { -name: "layer_256_3_relu1" -type: "ReLU" -bottom: "layer_256_3_bn1_pcs_arm_sim" -top: "layer_256_3_bn1_pcs_arm_sim" - -} -layer { -name: "layer_256_3_conv1" -type: "Convolution" -bottom: "layer_256_3_bn1_pcs_arm_sim" -top: "layer_256_3_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_3_bn2" -type: "BatchNorm" -bottom: "layer_256_3_conv1" -top: "layer_256_3_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_3_relu2" -type: "ReLU" -bottom: "layer_256_3_conv1_pcs_arm_sim" -top: "layer_256_3_conv1_pcs_arm_sim" - -} -layer { -name: "layer_256_3_conv2" -type: "Convolution" -bottom: "layer_256_3_conv1_pcs_arm_sim" -top: "layer_256_3_conv2" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_3_bn3" -type: "BatchNorm" -bottom: "layer_256_3_conv2" -top: "layer_256_3_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_3_relu3" -type: "ReLU" -bottom: "layer_256_3_conv2_pcs_arm_sim" -top: "layer_256_3_conv2_pcs_arm_sim" - -} -layer { -name: "layer_256_3_conv3" -type: "Convolution" -bottom: "layer_256_3_conv2_pcs_arm_sim" -top: "layer_256_3_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 1024 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_3_sum" -type: "Eltwise" -bottom: "layer_256_3_conv3" -bottom: "layer_256_2_sum" -top: "layer_256_3_sum" - -} -layer { -name: "layer_256_4_bn1" -type: "BatchNorm" -bottom: "layer_256_3_sum" -top: "layer_256_4_bn1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_4_relu1" -type: "ReLU" -bottom: "layer_256_4_bn1_pcs_arm_sim" -top: "layer_256_4_bn1_pcs_arm_sim" - -} -layer { -name: "layer_256_4_conv1" -type: "Convolution" -bottom: "layer_256_4_bn1_pcs_arm_sim" -top: "layer_256_4_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_4_bn2" -type: "BatchNorm" -bottom: "layer_256_4_conv1" -top: "layer_256_4_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_4_relu2" -type: "ReLU" -bottom: "layer_256_4_conv1_pcs_arm_sim" -top: "layer_256_4_conv1_pcs_arm_sim" - -} -layer { -name: "layer_256_4_conv2" -type: "Convolution" -bottom: "layer_256_4_conv1_pcs_arm_sim" -top: "layer_256_4_conv2" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_4_bn3" -type: "BatchNorm" -bottom: "layer_256_4_conv2" -top: "layer_256_4_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_4_relu3" -type: "ReLU" -bottom: "layer_256_4_conv2_pcs_arm_sim" -top: "layer_256_4_conv2_pcs_arm_sim" - -} -layer { -name: "layer_256_4_conv3" -type: 
"Convolution" -bottom: "layer_256_4_conv2_pcs_arm_sim" -top: "layer_256_4_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 1024 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_4_sum" -type: "Eltwise" -bottom: "layer_256_4_conv3" -bottom: "layer_256_3_sum" -top: "layer_256_4_sum" - -} -layer { -name: "layer_256_5_bn1" -type: "BatchNorm" -bottom: "layer_256_4_sum" -top: "layer_256_5_bn1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_5_relu1" -type: "ReLU" -bottom: "layer_256_5_bn1_pcs_arm_sim" -top: "layer_256_5_bn1_pcs_arm_sim" - -} -layer { -name: "layer_256_5_conv1" -type: "Convolution" -bottom: "layer_256_5_bn1_pcs_arm_sim" -top: "layer_256_5_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_5_bn2" -type: "BatchNorm" -bottom: "layer_256_5_conv1" -top: "layer_256_5_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_5_relu2" -type: "ReLU" -bottom: "layer_256_5_conv1_pcs_arm_sim" -top: "layer_256_5_conv1_pcs_arm_sim" - -} -layer { -name: "layer_256_5_conv2" -type: "Convolution" -bottom: "layer_256_5_conv1_pcs_arm_sim" -top: "layer_256_5_conv2" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_5_bn3" -type: "BatchNorm" -bottom: "layer_256_5_conv2" -top: "layer_256_5_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_5_relu3" -type: "ReLU" -bottom: "layer_256_5_conv2_pcs_arm_sim" -top: "layer_256_5_conv2_pcs_arm_sim" - -} -layer { -name: "layer_256_5_conv3" -type: "Convolution" -bottom: "layer_256_5_conv2_pcs_arm_sim" -top: "layer_256_5_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 1024 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_5_sum" -type: "Eltwise" -bottom: "layer_256_5_conv3" -bottom: "layer_256_4_sum" -top: "layer_256_5_sum" - -} -layer { -name: "layer_256_6_bn1" -type: "BatchNorm" -bottom: "layer_256_5_sum" -top: "layer_256_6_bn1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_6_relu1" -type: "ReLU" -bottom: "layer_256_6_bn1_pcs_arm_sim" -top: "layer_256_6_bn1_pcs_arm_sim" - -} -layer { -name: "layer_256_6_conv1" -type: "Convolution" -bottom: "layer_256_6_bn1_pcs_arm_sim" -top: "layer_256_6_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_6_bn2" -type: "BatchNorm" -bottom: "layer_256_6_conv1" -top: "layer_256_6_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_6_relu2" -type: "ReLU" -bottom: "layer_256_6_conv1_pcs_arm_sim" -top: "layer_256_6_conv1_pcs_arm_sim" - -} -layer { -name: "layer_256_6_conv2" -type: "Convolution" -bottom: "layer_256_6_conv1_pcs_arm_sim" -top: "layer_256_6_conv2" 
-param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 256 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_6_bn3" -type: "BatchNorm" -bottom: "layer_256_6_conv2" -top: "layer_256_6_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_256_6_relu3" -type: "ReLU" -bottom: "layer_256_6_conv2_pcs_arm_sim" -top: "layer_256_6_conv2_pcs_arm_sim" - -} -layer { -name: "layer_256_6_conv3" -type: "Convolution" -bottom: "layer_256_6_conv2_pcs_arm_sim" -top: "layer_256_6_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 1024 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_256_6_sum" -type: "Eltwise" -bottom: "layer_256_6_conv3" -bottom: "layer_256_5_sum" -top: "layer_256_6_sum" - -} -layer { -name: "layer_512_1_bn1" -type: "BatchNorm" -bottom: "layer_256_6_sum" -top: "layer_512_1_bn1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_512_1_relu1" -type: "ReLU" -bottom: "layer_512_1_bn1_pcs_arm_sim" -top: "layer_512_1_bn1_pcs_arm_sim" - -} -layer { -name: "layer_512_1_conv1" -type: "Convolution" -bottom: "layer_512_1_bn1_pcs_arm_sim" -top: "layer_512_1_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 512 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_512_1_bn2" -type: "BatchNorm" -bottom: "layer_512_1_conv1" -top: "layer_512_1_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_512_1_relu2" -type: "ReLU" -bottom: "layer_512_1_conv1_pcs_arm_sim" -top: "layer_512_1_conv1_pcs_arm_sim" - -} -layer { -name: "layer_512_1_conv2" -type: "Convolution" -bottom: "layer_512_1_conv1_pcs_arm_sim" -top: "layer_512_1_conv2" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 512 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 2 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_512_1_bn3" -type: "BatchNorm" -bottom: "layer_512_1_conv2" -top: "layer_512_1_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_512_1_relu3" -type: "ReLU" -bottom: "layer_512_1_conv2_pcs_arm_sim" -top: "layer_512_1_conv2_pcs_arm_sim" - -} -layer { -name: "layer_512_1_conv3" -type: "Convolution" -bottom: "layer_512_1_conv2_pcs_arm_sim" -top: "layer_512_1_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 2048 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_512_1_conv_expand" -type: "Convolution" -bottom: "layer_512_1_bn1_pcs_arm_sim" -top: "layer_512_1_conv_expand" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 2048 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 2 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_512_1_sum" -type: "Eltwise" -bottom: "layer_512_1_conv3" -bottom: "layer_512_1_conv_expand" -top: "layer_512_1_sum" - -} -layer { -name: "layer_512_2_bn1" -type: "BatchNorm" -bottom: "layer_512_1_sum" -top: 
"layer_512_2_bn1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_512_2_relu1" -type: "ReLU" -bottom: "layer_512_2_bn1_pcs_arm_sim" -top: "layer_512_2_bn1_pcs_arm_sim" - -} -layer { -name: "layer_512_2_conv1" -type: "Convolution" -bottom: "layer_512_2_bn1_pcs_arm_sim" -top: "layer_512_2_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 512 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_512_2_bn2" -type: "BatchNorm" -bottom: "layer_512_2_conv1" -top: "layer_512_2_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_512_2_relu2" -type: "ReLU" -bottom: "layer_512_2_conv1_pcs_arm_sim" -top: "layer_512_2_conv1_pcs_arm_sim" - -} -layer { -name: "layer_512_2_conv2" -type: "Convolution" -bottom: "layer_512_2_conv1_pcs_arm_sim" -top: "layer_512_2_conv2" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 512 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_512_2_bn3" -type: "BatchNorm" -bottom: "layer_512_2_conv2" -top: "layer_512_2_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_512_2_relu3" -type: "ReLU" -bottom: "layer_512_2_conv2_pcs_arm_sim" -top: "layer_512_2_conv2_pcs_arm_sim" - -} -layer { -name: "layer_512_2_conv3" -type: "Convolution" -bottom: "layer_512_2_conv2_pcs_arm_sim" -top: "layer_512_2_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 2048 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_512_2_sum" -type: "Eltwise" -bottom: "layer_512_2_conv3" -bottom: "layer_512_1_sum" -top: "layer_512_2_sum" - -} -layer { -name: "layer_512_3_bn1" -type: "BatchNorm" -bottom: "layer_512_2_sum" -top: "layer_512_3_bn1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_512_3_relu1" -type: "ReLU" -bottom: "layer_512_3_bn1_pcs_arm_sim" -top: "layer_512_3_bn1_pcs_arm_sim" - -} -layer { -name: "layer_512_3_conv1" -type: "Convolution" -bottom: "layer_512_3_bn1_pcs_arm_sim" -top: "layer_512_3_conv1" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 512 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_512_3_bn2" -type: "BatchNorm" -bottom: "layer_512_3_conv1" -top: "layer_512_3_conv1_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_512_3_relu2" -type: "ReLU" -bottom: "layer_512_3_conv1_pcs_arm_sim" -top: "layer_512_3_conv1_pcs_arm_sim" - -} -layer { -name: "layer_512_3_conv2" -type: "Convolution" -bottom: "layer_512_3_conv1_pcs_arm_sim" -top: "layer_512_3_conv2" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 512 - bias_term: false - pad: 1 - kernel_size: 3 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_512_3_bn3" -type: "BatchNorm" -bottom: "layer_512_3_conv2" -top: "layer_512_3_conv2_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "layer_512_3_relu3" -type: "ReLU" -bottom: "layer_512_3_conv2_pcs_arm_sim" -top: "layer_512_3_conv2_pcs_arm_sim" - -} -layer { -name: 
"layer_512_3_conv3" -type: "Convolution" -bottom: "layer_512_3_conv2_pcs_arm_sim" -top: "layer_512_3_conv3" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -convolution_param { - num_output: 2048 - bias_term: false - pad: 0 - kernel_size: 1 - stride: 1 - weight_filler { - type: "msra" - } - bias_filler { - type: "constant" - value: 0.0 - } -} - -} -layer { -name: "layer_512_3_sum" -type: "Eltwise" -bottom: "layer_512_3_conv3" -bottom: "layer_512_2_sum" -top: "layer_512_3_sum" - -} -layer { -name: "last_bn" -type: "BatchNorm" -bottom: "layer_512_3_sum" -top: "layer_512_3_sum_pcs_arm_sim" - batch_norm_param { - } -} -layer { -name: "last_relu" -type: "ReLU" -bottom: "layer_512_3_sum_pcs_arm_sim" -top: "layer_512_3_sum_pcs_arm_sim" - -} -layer { -name: "global_pool" -type: "Pooling" -bottom: "layer_512_3_sum_pcs_arm_sim" -top: "global_pool" -pooling_param { - pool: AVE - global_pooling: true -} - -} -layer { -name: "score" -type: "InnerProduct" -bottom: "global_pool" -top: "score" -param { - lr_mult: 1.0 - decay_mult: 1.0 -} -param { - lr_mult: 2.0 - decay_mult: 1.0 -} -inner_product_param { - num_output: 1000 -} - -} -layer { -name: "loss" -type: "SoftmaxWithLoss" -bottom: "score" -bottom: "label" -top: "loss" - -} -layer { -name: "accuracy" -type: "Accuracy" -bottom: "score" -bottom: "label" -top: "accuracy" -include { - phase: TEST -} - -}