Commit 7f278db

Hotfix issue 751 (#769)
* [TEST] support scale and bias input; fix uint8 read error
* [TEST] fix blob dump problem
* [TOOLS][QUANT] support reverse channel
* [TOOLS][QUANT] skip average pooling quantization
* [ARM] fix int8 inner product error
* [TOOLS][QUANT] update doc
* [TOOLS][QUANT] simplify reverse channel
* [ARM] inner product init tmp buffer to zero

1 parent e62dddf commit 7f278db

16 files changed: +181 lines, -43 lines

doc/cn/user/quantization.md (2 additions, 1 deletion)

@@ -18,7 +18,7 @@ cd <path_to_tnn>/platforms/linux/
 ## III. Using the Quantization Tool
 ### 1. Command
 ```
-./quantization_cmd [-h] [-p] [-m] [-i] [-b] [-w] [-n] [-s] [-c] <param>
+./quantization_cmd [-h] [-p] [-m] [-i] [-b] [-w] [-n] [-s] [-r] [-t] <param>
 ```
 ### 2. Parameter Description

@@ -32,6 +32,7 @@ cd <path_to_tnn>/platforms/linux/
 |-w, --weight_method| ||Specify the weights quantization method:<br>&bull; 0 Min-Max method (default)<br>&bull; 1 ADMM method|
 |-n, --bias | ||Pre-processing, only effective when the input is an image. Applies a bias to each channel of the input data; parameter format: 0.0,0.0,0.0|
 |-s, --scale | ||Pre-processing, only effective when the input is an image. Applies a scale to each channel of the input data; parameter format: 1.0,1.0,1.0|
+|-r, --reverse_channel| ||Pre-processing, only effective when the input is an image:<br>&bull; 0 use RGB order (default)<br>&bull; 1 use BGR order|
 |-t, --merge_type| ||Whether to quantize per-tensor or per-channel:<br>&bull; 0 Per-Channel method (default)<br>&bull; 1 Mixed method: weights use Per-Channel, blobs use Per-Tensor<br>&bull; 2 Per-Tensor method|

 ### 3. Quantization Input

doc/en/user/quantization_en.md (3 additions, 3 deletions)

@@ -19,7 +19,7 @@ cd <path_to_tnn>/platforms/linux/
 ## III. Usage
 ### 1. Command
 ```
-./quantization_cmd [-h] [-p] [-m] [-i] [-b] [-w] [-n] [-s] [-c] <param>
+./quantization_cmd [-h] [-p] [-m] [-i] [-b] [-w] [-n] [-s] [-r] [-t] <param>
 ```
 ### 2. Parameter Description

@@ -31,9 +31,9 @@ cd <path_to_tnn>/platforms/linux/
 |-i, --input_path |&radic; |&radic;|Specify the path of the quantization input folder. The currently supported formats are:<br>&bull; Text file (the file suffix is .txt)<br>&bull; Common picture format files (file suffix is .jpg .jpeg .png .bmp)<br>All files under this directory will be used as input.|
 |-b, --blob_method | |&radic;|Specify the feature map quantization method:<br>&bull; 0 Min-Max method (default)<br>&bull; 2 KL method|
 |-w, --weight_method| |&radic;|Specify the quantization method of weights:<br>&bull; 0 Min-Max method (default)<br>&bull; 1 ADMM method|
-|-n, --mean | |&radic;|
-Pre-processing, mean operation on each channel of input data, parameter format: 0.0, 0.0, 0.0|
+|-n, --mean | |&radic;|Pre-processing, mean operation on each channel of input data, parameter format: 0.0, 0.0, 0.0|
 |-s, --scale | |&radic;|Pre-processing, scale the input data channels, the parameter format is: 1.0, 1.0, 1.0|
+|-r, --reverse_channel| |&radic;|Pre-processing, valid for picture format files:<br>&bull; 0 use RGB order (default)<br>&bull; 1 use BGR order|
 |-t, --merge_type| |&radic;|Whether to use the per-tensor or per-channel method when quantizing:<br>&bull; 0 per-channel method (default)<br>&bull; 1 mixed method, weights: per-channel, blob: per-tensor<br>&bull; 2 per-tensor method|

 ### 3. Quantization Input
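
The -t/--merge_type option chooses how the quantization scales are grouped. Below is a toy sketch of the difference between per-tensor and per-channel symmetric Min-Max int8 scales; it illustrates the general idea only, not TNN's exact implementation, and all weight values are made up.

```
#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
    const int oc = 2, k = 3;
    float weights[oc][k] = {{0.5f, -1.0f, 0.25f}, {8.0f, -6.0f, 2.0f}};

    // per-tensor: a single scale derived from the global max magnitude
    float global_max = 0.f;
    for (int c = 0; c < oc; ++c)
        for (int i = 0; i < k; ++i)
            global_max = std::max(global_max, std::fabs(weights[c][i]));
    printf("per-tensor scale: %f\n", global_max / 127.f);

    // per-channel: one scale per output channel, so the small channel keeps precision
    for (int c = 0; c < oc; ++c) {
        float cmax = 0.f;
        for (int i = 0; i < k; ++i) cmax = std::max(cmax, std::fabs(weights[c][i]));
        printf("channel %d scale: %f\n", c, cmax / 127.f);
    }
    return 0;
}
```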

source/tnn/core/default_network.cc (2 additions, 3 deletions)

@@ -274,8 +274,6 @@ Status DefaultNetwork::UpdateBlobPrecision(std::shared_ptr<LayerInfo> layer_info
     if (device_->GetDeviceType() != DEVICE_ARM && device_->GetDeviceType() != DEVICE_NAIVE) {
         return TNN_OK;
     }
-    static bool cpu_support_fp16 = CpuUtils::CpuSupportFp16();
-    LOGD("support fp 16: %d\n", cpu_support_fp16 ? 1 : 0);

     auto &desc = (*blob)->GetBlobDesc();
     auto layer_type = layer_info->type;
@@ -287,9 +285,10 @@ Status DefaultNetwork::UpdateBlobPrecision(std::shared_ptr<LayerInfo> layer_info
             RETURN_ON_NEQ(GenerateInt8Blob(name, net_resource, blob), TNN_OK);
         }
     } else {
-        bool layer_implemented_fp16 = device_->GetImplementedPrecision(layer_type)->fp16_implemented;
         // update blob of non-quantized network by config precision and enabled precision
         if (config_.precision == PRECISION_NORMAL || config_.precision == PRECISION_AUTO) {
+            static bool cpu_support_fp16 = CpuUtils::CpuSupportFp16();
+            bool layer_implemented_fp16 = device_->GetImplementedPrecision(layer_type)->fp16_implemented;
             desc.data_type = (cpu_support_fp16 && layer_implemented_fp16) ? DATA_TYPE_HALF : DATA_TYPE_FLOAT;
         } else if (config_.precision == PRECISION_LOW) {
             desc.data_type = DATA_TYPE_BFP16;
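
This hunk moves the fp16 capability probe into the branch that actually uses it: CpuUtils::CpuSupportFp16() is now queried (once, via the static local) only when the configured precision is PRECISION_NORMAL or PRECISION_AUTO, alongside the per-layer fp16 flag, and the LOGD trace of the probe is dropped. A minimal sketch of the resulting selection logic follows; the enum values and the standalone function are simplified stand-ins, not the TNN definitions.

```
#include <cstdio>

enum Precision { PRECISION_AUTO, PRECISION_NORMAL, PRECISION_HIGH, PRECISION_LOW };
enum DataType { DATA_TYPE_FLOAT, DATA_TYPE_HALF, DATA_TYPE_BFP16 };

// Stand-in for the blob-precision update of a non-quantized layer.
DataType PickBlobDataType(Precision precision, bool cpu_support_fp16, bool layer_implemented_fp16) {
    if (precision == PRECISION_NORMAL || precision == PRECISION_AUTO) {
        // fp16 only when both the CPU and this layer's implementation support it
        return (cpu_support_fp16 && layer_implemented_fp16) ? DATA_TYPE_HALF : DATA_TYPE_FLOAT;
    } else if (precision == PRECISION_LOW) {
        return DATA_TYPE_BFP16;
    }
    return DATA_TYPE_FLOAT;
}

int main() {
    printf("%d\n", PickBlobDataType(PRECISION_AUTO, true, true));   // 1 -> DATA_TYPE_HALF
    printf("%d\n", PickBlobDataType(PRECISION_AUTO, true, false));  // 0 -> DATA_TYPE_FLOAT
    return 0;
}
```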

source/tnn/device/arm/acc/arm_inner_product_layer_acc.cc (30 additions, 13 deletions)

@@ -238,24 +238,41 @@ Status ArmInnerProductLayerAcc::Exec<int8_t>(const std::vector<Blob *> &inputs,
     InnerProductLayerParam *fc_param = dynamic_cast<InnerProductLayerParam *>(param_);
     auto dims_input = inputs[0]->GetBlobDesc().dims;
     auto dims_output = outputs[0]->GetBlobDesc().dims;
-    auto ic = dims_input[3] * dims_input[2] * ROUND_UP(dims_input[1], 4);
-    auto ic_r4 = ROUND_UP(ic, 4);
-    auto oc_r4 = ROUND_UP(dims_output[1], 4);
-
     auto input_origin = reinterpret_cast<int8_t *>(GetBlobHandlePtr(inputs[0]->GetHandle()));
     auto output_origin = reinterpret_cast<int8_t *>(GetBlobHandlePtr(outputs[0]->GetHandle()));
+
+    auto ic = dims_input[1];
+    auto ic_r4 = ROUND_UP(ic, 4);
+    auto hw = dims_input[2] * dims_input[3];
+    auto ik = ic * hw;
+    auto ik_r8 = ROUND_UP(ik, 8);
+    auto oc_r4 = ROUND_UP(dims_output[1], 4);
+
+    int8_t *tmp_ptr = (int8_t *)context_->GetSharedWorkSpace(ik_r8);
+    for (int k = ik; k < ik_r8; ++k) {
+        tmp_ptr[k] = 0;
+    }
+
     for (int n = 0; n < dims_output[0]; n++) {
-        auto input_ptr = input_origin + n * ic_r4;
+        auto input_ptr = input_origin + n * ic_r4 * hw;
         auto output_ptr = output_origin + n * oc_r4;
-        auto ic_r8 = ROUND_UP(ic_r4, 8);
-        if (ic_r4 != ic_r8) {
-            int8_t *tmp_ptr = (int8_t *)context_->GetSharedWorkSpace(ic_r8);
-            memcpy(tmp_ptr, input_ptr, ic_r4);
-            *(int32_t *)(tmp_ptr + ic_r4) = 0;
-            input_ptr = tmp_ptr;
+
+        if (hw == 1) {
+            if (ic_r4 != ik_r8) {
+                memcpy(tmp_ptr, input_ptr, ic_r4);
+            } else {
+                tmp_ptr = input_ptr;
+            }
+        } else if (ic == 1) {
+            for (int k = 0; k < ik; ++k) {
+                tmp_ptr[k] = input_ptr[k<<2];
+            }
+        } else {
+            UnpackHWC4ToCHW(tmp_ptr, input_ptr, ic, hw);
         }
-        GemvInt8(output_ptr, input_ptr, buffer_weight_.force_to<int8_t *>(), buffer_bias_.force_to<int32_t *>(),
-                 buffer_scale_.force_to<float *>(), ROUND_UP(ic_r4, 8), oc_r4);
+
+        GemvInt8(output_ptr, tmp_ptr, buffer_weight_.force_to<int8_t *>(), buffer_bias_.force_to<int32_t *>(),
+                 buffer_scale_.force_to<float *>(), ik_r8, oc_r4);
     }

     return TNN_OK;
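
The rewritten int8 path repacks each batch's packed NC4HW4 input into a dense CHW scratch buffer of length ik = ic * hw (via memcpy, a stride-4 gather, or UnpackHWC4ToCHW depending on the shape), rounds the GEMV length up to a multiple of 8, and zeroes the padded tail once before the batch loop ("init tmp buffer to zero" in the commit message) so GemvInt8 never reads undefined bytes. A standalone sketch of that padding arithmetic, with RoundUp as a hypothetical stand-in for TNN's ROUND_UP macro:

```
#include <cstdio>
#include <vector>

// Hypothetical stand-in for TNN's ROUND_UP: round x up to a multiple of n.
static inline int RoundUp(int x, int n) { return ((x + n - 1) / n) * n; }

int main() {
    int ic = 3, h = 2, w = 2;      // example shape: 3 channels, 2x2 spatial
    int ik    = ic * h * w;        // 12 dense int8 inputs per batch
    int ik_r8 = RoundUp(ik, 8);    // 16: the GEMV kernel consumes blocks of 8

    std::vector<int8_t> tmp(ik_r8);
    for (int k = ik; k < ik_r8; ++k) tmp[k] = 0;   // zero the padded tail, mirroring the loop above

    // ... repack the NC4HW4 input into tmp[0..ik), then hand tmp to the
    // int8 GEMV with length ik_r8 ...
    printf("ik=%d ik_r8=%d padded=%d\n", ik, ik_r8, ik_r8 - ik);
    return 0;
}
```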

source/tnn/device/arm/arm_util.cc (14 additions, 0 deletions)

@@ -437,6 +437,20 @@ int UnpackC4WithStride(float *dst, const float *src, size_t ih, size_t iw, size_
     return 0;
 }

+int UnpackHWC4ToCHW(int8_t *dst, const int8_t *src, size_t channel, size_t hw) {
+    auto c_r4 = ROUND_UP(channel, 4);
+
+    for (int c = 0; c < channel; ++c) {
+        auto src_c = src + c;
+        auto dst_c = dst + c * hw;
+        for (int z = 0; z < hw; ++z) {
+            dst_c[z] = src_c[z * c_r4];
+        }
+    }
+
+    return 0;
+}
+
 #define ConvertWeightsPreparation \
     const int goc = output_channel / group; \
     const int gic = input_channel / group; \
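
UnpackHWC4ToCHW reads from a channel-packed source in which every spatial position holds ROUND_UP(channel, 4) slots and writes a dense channel-major destination, so element (c, z) moves from src[z * c_r4 + c] to dst[c * hw + z]. A self-contained toy run of the same loop on a 2-channel, 3-position buffer (the values are made up):

```
#include <cstdio>
#include <vector>

int main() {
    const int channel = 2, hw = 3, c_r4 = 4;   // channels rounded up to 4 per position
    // packed source: [c0 c1 pad pad] for each of the 3 spatial positions
    std::vector<int8_t> src = {10, 20, 0, 0, 11, 21, 0, 0, 12, 22, 0, 0};
    std::vector<int8_t> dst(channel * hw, 0);

    for (int c = 0; c < channel; ++c)
        for (int z = 0; z < hw; ++z)
            dst[c * hw + z] = src[z * c_r4 + c];

    for (int i = 0; i < channel * hw; ++i) printf("%d ", dst[i]);  // 10 11 12 20 21 22
    printf("\n");
    return 0;
}
```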

source/tnn/device/arm/arm_util.h (2 additions, 0 deletions)

@@ -74,6 +74,8 @@ int UnpackC4WithStride(float *dst, const float *src, size_t ih, size_t iw, size_

 int UnpackAndDequant(float *dst, const int8_t *src, size_t hw, size_t channel, float *scale, float *bias);

+int UnpackHWC4ToCHW(int8_t *dst, const int8_t *src, size_t channel, size_t hw);
+
 template <typename T>
 int ConvertWeightsC4ToC8(T *weight, int ic, int oc);

test/flags.cc (4 additions, 0 deletions)

@@ -54,4 +54,8 @@ DEFINE_string(is, "", input_shape_message);

 DEFINE_bool(et, false, enable_tune_message);

+DEFINE_string(sc, "", scale_message);
+
+DEFINE_string(bi, "", bias_message);
+
 } // namespace TNN_NS

test/flags.h (8 additions, 0 deletions)

@@ -63,6 +63,10 @@ static const char network_type_message[] = "network type: NAIVE, NPU, COREML, SN

 static const char enable_tune_message[] = "enable tune kernel(default false)";

+static const char scale_message[] = "input scale: s0,s1,s2,...";
+
+static const char bias_message[] = "input bias: b0,b1,b2,...";
+
 DECLARE_bool(h);

 DECLARE_string(mt);
@@ -101,6 +105,10 @@ DECLARE_string(is);

 DECLARE_bool(et);

+DECLARE_string(sc);
+
+DECLARE_string(bi);
+
 } // namespace TNN_NS

 #endif // TNN_TEST_FLAGS_H_

test/test.cc (57 additions, 10 deletions)

@@ -90,12 +90,12 @@ namespace test {
         MatMap input_mat_map = CreateBlobMatMap(input_blob_map, FLAGS_it);
         InitInputMatMap(input_mat_map);
         auto input_converters_map = CreateBlobConverterMap(input_blob_map);
-        auto input_params_map = CreateConvertParamMap(input_mat_map);
+        auto input_params_map = CreateConvertParamMap(input_mat_map, true);

         //mat format NCHW_FLOAT
         MatMap output_mat_map = CreateBlobMatMap(output_blob_map, 0);
         auto output_converters_map = CreateBlobConverterMap(output_blob_map);
-        auto output_params_map = CreateConvertParamMap(output_mat_map);
+        auto output_params_map = CreateConvertParamMap(output_mat_map, false);

         for (int i = 0; i < FLAGS_wc; ++i) {
             for(auto element : input_converters_map) {
@@ -132,7 +132,11 @@ namespace test {
                 return ret;
             }
         }
+#if DUMP_INPUT_BLOB || DUMP_OUTPUT_BLOB
+        ret = instance->Forward();
+#else
         ret = instance->ForwardAsync(nullptr);
+#endif
         if (!CheckResult("Forward", ret)) {
             return ret;
         }
@@ -204,6 +208,8 @@ namespace test {
         printf(" -fc \"<format for compare>\t%s \n", output_format_cmp_message);
         printf(" -nt \"<network type>\t%s \n", output_format_cmp_message);
         printf(" -et \"<enable tune>\t%s \n", enable_tune_message);
+        printf(" -sc \"<input scale>\t%s \n", scale_message);
+        printf(" -bi \"<input bias>\t%s \n", bias_message);
     }

     void SetCpuAffinity() {
@@ -386,7 +392,9 @@ namespace test {
             if (mat_type == NCHW_FLOAT) {
                 input_stream >> reinterpret_cast<float*>(mat_data)[i];
             } else {
-                input_stream >> reinterpret_cast<uint8_t*>(mat_data)[i];
+                int val;
+                input_stream >> val;
+                reinterpret_cast<uint8_t*>(mat_data)[i] = (uint8_t)val;
             }
         }
     }
@@ -402,21 +410,60 @@ namespace test {
         return converter_map;
     }

-    std::map<std::string, MatConvertParam> CreateConvertParamMap(MatMap& mat_map) {
+    static void SetScaleOrBias(std::vector<float> &param, const std::string &message) {
+        std::string delimiter = ",";
+        std::vector<float> fval;
+        std::ptrdiff_t p1 = 0, p2;
+        while (true) {
+            p2 = message.find(delimiter, p1);
+            if (p2 != std::string::npos) {
+                fval.push_back(atof(message.substr(p1, p2 - p1).c_str()));
+                p1 = p2 + 1;
+            } else {
+                fval.push_back(atof(message.substr(p1, message.length() - p1).c_str()));
+                break;
+            }
+        }
+        if (fval.size() > param.size()) {
+            param = fval;
+        } else {
+            for (int i = 0; i < fval.size(); ++i) {
+                param[i] = fval[i];
+            }
+        }
+    }
+
+    std::map<std::string, MatConvertParam> CreateConvertParamMap(MatMap& mat_map, bool is_input) {
         std::map<std::string, MatConvertParam> param_map;
         for(auto iter : mat_map) {
             MatConvertParam param;
             auto name = iter.first;
             auto mat = iter.second;
             auto mat_type = mat->GetMatType();
             auto dims = mat->GetDims();
-            if(mat_type != NCHW_FLOAT) {
-                std::fill(param.scale.begin(), param.scale.end(), 1.0f / 255.0f);
-                std::fill(param.bias.begin(), param.bias.end(), 0);
-            } else if(dims[1] > 4) {
-                param.scale = std::vector<float>(dims[1], 1);
-                param.bias = std::vector<float>(dims[1], 0);
+
+            // scale
+            if(is_input && !FLAGS_sc.empty()) {
+                SetScaleOrBias(param.scale, FLAGS_sc);
+            } else {
+                if(mat_type != NCHW_FLOAT) {
+                    std::fill(param.scale.begin(), param.scale.end(), 1.0f / 255.0f);
+                } else if(dims[1] > 4) {
+                    param.scale = std::vector<float>(dims[1], 1);
+                }
+            }
+
+            // bias
+            if(is_input && !FLAGS_bi.empty()) {
+                SetScaleOrBias(param.bias, FLAGS_bi);
+            } else {
+                if(mat_type != NCHW_FLOAT) {
+                    std::fill(param.bias.begin(), param.bias.end(), 0);
+                } else if(dims[1] > 4) {
+                    param.bias = std::vector<float>(dims[1], 0);
+                }
             }
+
             param_map[name] = param;
         }
         return param_map;
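
SetScaleOrBias splits a comma-separated flag value into floats and either replaces the default MatConvertParam vector (when more values are given than it holds) or overwrites only its leading entries. A standalone sketch of the same parsing and merge rule, using only the standard library; the flag value and the four-entry default below are illustrative assumptions, not taken from the commit:

```
#include <cstdio>
#include <cstdlib>
#include <string>
#include <vector>

static std::vector<float> ParseFloats(const std::string &message) {
    std::vector<float> fval;
    std::string::size_type p1 = 0, p2;
    while ((p2 = message.find(',', p1)) != std::string::npos) {
        fval.push_back(atof(message.substr(p1, p2 - p1).c_str()));
        p1 = p2 + 1;
    }
    fval.push_back(atof(message.substr(p1).c_str()));   // last (or only) element
    return fval;
}

int main() {
    std::vector<float> scale = {1.0f, 1.0f, 1.0f, 1.0f};           // assumed 4-entry default
    std::vector<float> parsed = ParseFloats("0.017,0.017,0.017");  // e.g. -sc 0.017,0.017,0.017
    if (parsed.size() > scale.size()) {
        scale = parsed;                                            // more values given: replace
    } else {
        for (size_t i = 0; i < parsed.size(); ++i) scale[i] = parsed[i];  // patch the front
    }
    for (float v : scale) printf("%g ", v);   // 0.017 0.017 0.017 1
    printf("\n");
    return 0;
}
```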

test/test.h (1 addition, 1 deletion)

@@ -48,7 +48,7 @@ namespace test {

 std::map<std::string, std::shared_ptr<BlobConverter>> CreateBlobConverterMap(BlobMap& blob_map);

-std::map<std::string, MatConvertParam> CreateConvertParamMap(MatMap& mat_map);
+std::map<std::string, MatConvertParam> CreateConvertParamMap(MatMap& mat_map, bool is_input);

 void WriteOutput(MatMap& outputs);
