Commit 7f278db

Hotfix issue 751 (#769)
* [TEST] support scale and bias input; fix uint8 read error
* [TEST] fix blob dump problem
* [TOOLS][QUANT] support reverse channel
* [TOOLS][QUANT] skip average pooling quantization
* [ARM] fix int8 inner product error
* [TOOLS][QUANT] update doc
* [TOOLS][QUANT] simplify reverse channel
* [ARM] inner product init tmp buffer to zero

1 parent e62dddf commit 7f278db

16 files changed: +181 lines, -43 lines

doc/cn/user/quantization.md (2 additions, 1 deletion)

@@ -18,7 +18,7 @@ cd <path_to_tnn>/platforms/linux/
 ## III. Using the Quantization Tool
 ### 1. Command
 ```
-./quantization_cmd [-h] [-p] [-m] [-i] [-b] [-w] [-n] [-s] [-c] <param>
+./quantization_cmd [-h] [-p] [-m] [-i] [-b] [-w] [-n] [-s] [-r] [-t] <param>
 ```
 ### 2. Parameter Description

@@ -32,6 +32,7 @@ cd <path_to_tnn>/platforms/linux/
 |-w, --weight_method| ||Specify the weights quantization method:<br>&bull; 0 Min-Max method (default)<br>&bull; 1 ADMM method|
 |-n, --bias | ||Pre-processing, only effective when the input is an image. Applies a bias to each channel of the input data; parameter format: 0.0,0.0,0.0|
 |-s, --scale | ||Pre-processing, only effective when the input is an image. Applies a scale to each channel of the input data; parameter format: 1.0,1.0,1.0|
+|-r, --reverse_channel| ||Pre-processing, only effective when the input is an image:<br>&bull; 0 use RGB order (default)<br>&bull; 1 use BGR order|
 |-t, --merge_type| ||Whether to quantize per-tensor or per-channel:<br>&bull; 0 Per-Channel method (default)<br>&bull; 1 Mixed method: weights use Per-Channel, blobs use Per-Tensor<br>&bull; 2 Per-Tensor method|

 ### 3. Quantization Input

doc/en/user/quantization_en.md (3 additions, 3 deletions)

@@ -19,7 +19,7 @@ cd <path_to_tnn>/platforms/linux/
 ## III. Usage
 ### 1. Command
 ```
-./quantization_cmd [-h] [-p] [-m] [-i] [-b] [-w] [-n] [-s] [-c] <param>
+./quantization_cmd [-h] [-p] [-m] [-i] [-b] [-w] [-n] [-s] [-r] [-t] <param>
 ```
 ### 2. Parameter Description

@@ -31,9 +31,9 @@ cd <path_to_tnn>/platforms/linux/
 |-i, --input_path |&radic; |&radic;|Specify the path of the quantization input folder. The currently supported formats are:<br>&bull; Text file (the file suffix is .txt)<br>&bull; Common picture format files (file suffix is .jpg .jpeg .png .bmp)<br>All files under this directory will be used as input.|
 |-b, --blob_method | |&radic;|Specify the feature map quantization method:<br>&bull; 0 Min-Max method (default)<br>&bull; 2 KL method|
 |-w, --weight_method| |&radic;|Specify the quantization method of weights:<br>&bull; 0 Min-Max method (default)<br>&bull; 1 ADMM method|
-|-n, --mean | |&radic;|
-Pre-processing, mean operation on each channel of input data, parameter format: 0.0, 0.0, 0.0|
+|-n, --mean | |&radic;|Pre-processing, mean operation on each channel of input data, parameter format: 0.0, 0.0, 0.0|
 |-s, --scale | |&radic;|Pre-processing, scale the input data channels, the parameter format is: 1.0, 1.0, 1.0|
+|-r, --reverse_channel| |&radic;|Pre-processing, valid for picture format files:<br>&bull; 0 use RGB order (default)<br>&bull; 1 use BGR order|
 |-t, --merge_type| |&radic;|Whether to use the per-tensor or per-channel method when quantizing:<br>&bull; 0 per-channel method (default)<br>&bull; 1 mixed method, weights: per-channel, blob: per-tensor<br>&bull; 2 per-tensor method|

 ### 3. Quantization Input
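
The -t/--merge_type option chooses how the quantization scales are grouped. Below is a toy sketch of the difference between per-tensor and per-channel symmetric Min-Max int8 scales; it illustrates the general idea only, not TNN's exact implementation, and all weight values are made up.

```
#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
    const int oc = 2, k = 3;
    float weights[oc][k] = {{0.5f, -1.0f, 0.25f}, {8.0f, -6.0f, 2.0f}};

    // per-tensor: a single scale derived from the global max magnitude
    float global_max = 0.f;
    for (int c = 0; c < oc; ++c)
        for (int i = 0; i < k; ++i)
            global_max = std::max(global_max, std::fabs(weights[c][i]));
    printf("per-tensor scale: %f\n", global_max / 127.f);

    // per-channel: one scale per output channel, so the small channel keeps precision
    for (int c = 0; c < oc; ++c) {
        float cmax = 0.f;
        for (int i = 0; i < k; ++i) cmax = std::max(cmax, std::fabs(weights[c][i]));
        printf("channel %d scale: %f\n", c, cmax / 127.f);
    }
    return 0;
}
```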

source/tnn/core/default_network.cc (2 additions, 3 deletions)

@@ -274,8 +274,6 @@ Status DefaultNetwork::UpdateBlobPrecision(std::shared_ptr<LayerInfo> layer_info
     if (device_->GetDeviceType() != DEVICE_ARM && device_->GetDeviceType() != DEVICE_NAIVE) {
         return TNN_OK;
     }
-    static bool cpu_support_fp16 = CpuUtils::CpuSupportFp16();
-    LOGD("support fp 16: %d\n", cpu_support_fp16 ? 1 : 0);

     auto &desc = (*blob)->GetBlobDesc();
     auto layer_type = layer_info->type;
@@ -287,9 +285,10 @@ Status DefaultNetwork::UpdateBlobPrecision(std::shared_ptr<LayerInfo> layer_info
             RETURN_ON_NEQ(GenerateInt8Blob(name, net_resource, blob), TNN_OK);
         }
     } else {
-        bool layer_implemented_fp16 = device_->GetImplementedPrecision(layer_type)->fp16_implemented;
         // update blob of non-quantized network by config precision and enabled precision
         if (config_.precision == PRECISION_NORMAL || config_.precision == PRECISION_AUTO) {
+            static bool cpu_support_fp16 = CpuUtils::CpuSupportFp16();
+            bool layer_implemented_fp16 = device_->GetImplementedPrecision(layer_type)->fp16_implemented;
             desc.data_type = (cpu_support_fp16 && layer_implemented_fp16) ? DATA_TYPE_HALF : DATA_TYPE_FLOAT;
         } else if (config_.precision == PRECISION_LOW) {
             desc.data_type = DATA_TYPE_BFP16;
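
This hunk moves the fp16 capability probe into the branch that actually uses it: CpuUtils::CpuSupportFp16() is now queried (once, via the static local) only when the configured precision is PRECISION_NORMAL or PRECISION_AUTO, alongside the per-layer fp16 flag, and the LOGD trace of the probe is dropped. A minimal sketch of the resulting selection logic follows; the enum values and the standalone function are simplified stand-ins, not the TNN definitions.

```
#include <cstdio>

enum Precision { PRECISION_AUTO, PRECISION_NORMAL, PRECISION_HIGH, PRECISION_LOW };
enum DataType { DATA_TYPE_FLOAT, DATA_TYPE_HALF, DATA_TYPE_BFP16 };

// Stand-in for the blob-precision update of a non-quantized layer.
DataType PickBlobDataType(Precision precision, bool cpu_support_fp16, bool layer_implemented_fp16) {
    if (precision == PRECISION_NORMAL || precision == PRECISION_AUTO) {
        // fp16 only when both the CPU and this layer's implementation support it
        return (cpu_support_fp16 && layer_implemented_fp16) ? DATA_TYPE_HALF : DATA_TYPE_FLOAT;
    } else if (precision == PRECISION_LOW) {
        return DATA_TYPE_BFP16;
    }
    return DATA_TYPE_FLOAT;
}

int main() {
    printf("%d\n", PickBlobDataType(PRECISION_AUTO, true, true));   // 1 -> DATA_TYPE_HALF
    printf("%d\n", PickBlobDataType(PRECISION_AUTO, true, false));  // 0 -> DATA_TYPE_FLOAT
    return 0;
}
```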

source/tnn/device/arm/acc/arm_inner_product_layer_acc.cc (30 additions, 13 deletions)

@@ -238,24 +238,41 @@ Status ArmInnerProductLayerAcc::Exec<int8_t>(const std::vector<Blob *> &inputs,
     InnerProductLayerParam *fc_param = dynamic_cast<InnerProductLayerParam *>(param_);
     auto dims_input = inputs[0]->GetBlobDesc().dims;
     auto dims_output = outputs[0]->GetBlobDesc().dims;
-    auto ic = dims_input[3] * dims_input[2] * ROUND_UP(dims_input[1], 4);
-    auto ic_r4 = ROUND_UP(ic, 4);
-    auto oc_r4 = ROUND_UP(dims_output[1], 4);
-
     auto input_origin = reinterpret_cast<int8_t *>(GetBlobHandlePtr(inputs[0]->GetHandle()));
     auto output_origin = reinterpret_cast<int8_t *>(GetBlobHandlePtr(outputs[0]->GetHandle()));
+
+    auto ic = dims_input[1];
+    auto ic_r4 = ROUND_UP(ic, 4);
+    auto hw = dims_input[2] * dims_input[3];
+    auto ik = ic * hw;
+    auto ik_r8 = ROUND_UP(ik, 8);
+    auto oc_r4 = ROUND_UP(dims_output[1], 4);
+
+    int8_t *tmp_ptr = (int8_t *)context_->GetSharedWorkSpace(ik_r8);
+    for (int k = ik; k < ik_r8; ++k) {
+        tmp_ptr[k] = 0;
+    }
+
     for (int n = 0; n < dims_output[0]; n++) {
-        auto input_ptr = input_origin + n * ic_r4;
+        auto input_ptr = input_origin + n * ic_r4 * hw;
         auto output_ptr = output_origin + n * oc_r4;
-        auto ic_r8 = ROUND_UP(ic_r4, 8);
-        if (ic_r4 != ic_r8) {
-            int8_t *tmp_ptr = (int8_t *)context_->GetSharedWorkSpace(ic_r8);
-            memcpy(tmp_ptr, input_ptr, ic_r4);
-            *(int32_t *)(tmp_ptr + ic_r4) = 0;
-            input_ptr = tmp_ptr;
+
+        if (hw == 1) {
+            if (ic_r4 != ik_r8) {
+                memcpy(tmp_ptr, input_ptr, ic_r4);
+            } else {
+                tmp_ptr = input_ptr;
+            }
+        } else if (ic == 1) {
+            for (int k = 0; k < ik; ++k) {
+                tmp_ptr[k] = input_ptr[k<<2];
+            }
+        } else {
+            UnpackHWC4ToCHW(tmp_ptr, input_ptr, ic, hw);
         }
-        GemvInt8(output_ptr, input_ptr, buffer_weight_.force_to<int8_t *>(), buffer_bias_.force_to<int32_t *>(),
-                 buffer_scale_.force_to<float *>(), ROUND_UP(ic_r4, 8), oc_r4);
+
+        GemvInt8(output_ptr, tmp_ptr, buffer_weight_.force_to<int8_t *>(), buffer_bias_.force_to<int32_t *>(),
+                 buffer_scale_.force_to<float *>(), ik_r8, oc_r4);
     }

     return TNN_OK;
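
The rewritten int8 path repacks each batch's packed NC4HW4 input into a dense CHW scratch buffer of length ik = ic * hw (via memcpy, a stride-4 gather, or UnpackHWC4ToCHW depending on the shape), rounds the GEMV length up to a multiple of 8, and zeroes the padded tail once before the batch loop ("init tmp buffer to zero" in the commit message) so GemvInt8 never reads undefined bytes. A standalone sketch of that padding arithmetic, with RoundUp as a hypothetical stand-in for TNN's ROUND_UP macro:

```
#include <cstdio>
#include <vector>

// Hypothetical stand-in for TNN's ROUND_UP: round x up to a multiple of n.
static inline int RoundUp(int x, int n) { return ((x + n - 1) / n) * n; }

int main() {
    int ic = 3, h = 2, w = 2;      // example shape: 3 channels, 2x2 spatial
    int ik    = ic * h * w;        // 12 dense int8 inputs per batch
    int ik_r8 = RoundUp(ik, 8);    // 16: the GEMV kernel consumes blocks of 8

    std::vector<int8_t> tmp(ik_r8);
    for (int k = ik; k < ik_r8; ++k) tmp[k] = 0;   // zero the padded tail, mirroring the loop above

    // ... repack the NC4HW4 input into tmp[0..ik), then hand tmp to the
    // int8 GEMV with length ik_r8 ...
    printf("ik=%d ik_r8=%d padded=%d\n", ik, ik_r8, ik_r8 - ik);
    return 0;
}
```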

source/tnn/device/arm/arm_util.cc (14 additions, 0 deletions)

@@ -437,6 +437,20 @@ int UnpackC4WithStride(float *dst, const float *src, size_t ih, size_t iw, size_
     return 0;
 }

+int UnpackHWC4ToCHW(int8_t *dst, const int8_t *src, size_t channel, size_t hw) {
+    auto c_r4 = ROUND_UP(channel, 4);
+
+    for (int c = 0; c < channel; ++c) {
+        auto src_c = src + c;
+        auto dst_c = dst + c * hw;
+        for (int z = 0; z < hw; ++z) {
+            dst_c[z] = src_c[z * c_r4];
+        }
+    }
+
+    return 0;
+}
+
 #define ConvertWeightsPreparation \
     const int goc = output_channel / group; \
     const int gic = input_channel / group; \
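
UnpackHWC4ToCHW reads from a channel-packed source in which every spatial position holds ROUND_UP(channel, 4) slots and writes a dense channel-major destination, so element (c, z) moves from src[z * c_r4 + c] to dst[c * hw + z]. A self-contained toy run of the same loop on a 2-channel, 3-position buffer (the values are made up):

```
#include <cstdio>
#include <vector>

int main() {
    const int channel = 2, hw = 3, c_r4 = 4;   // channels rounded up to 4 per position
    // packed source: [c0 c1 pad pad] for each of the 3 spatial positions
    std::vector<int8_t> src = {10, 20, 0, 0, 11, 21, 0, 0, 12, 22, 0, 0};
    std::vector<int8_t> dst(channel * hw, 0);

    for (int c = 0; c < channel; ++c)
        for (int z = 0; z < hw; ++z)
            dst[c * hw + z] = src[z * c_r4 + c];

    for (int i = 0; i < channel * hw; ++i) printf("%d ", dst[i]);  // 10 11 12 20 21 22
    printf("\n");
    return 0;
}
```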

source/tnn/device/arm/arm_util.h (2 additions, 0 deletions)

@@ -74,6 +74,8 @@ int UnpackC4WithStride(float *dst, const float *src, size_t ih, size_t iw, size_

 int UnpackAndDequant(float *dst, const int8_t *src, size_t hw, size_t channel, float *scale, float *bias);

+int UnpackHWC4ToCHW(int8_t *dst, const int8_t *src, size_t channel, size_t hw);
+
 template <typename T>
 int ConvertWeightsC4ToC8(T *weight, int ic, int oc);

test/flags.cc (4 additions, 0 deletions)

@@ -54,4 +54,8 @@ DEFINE_string(is, "", input_shape_message);

 DEFINE_bool(et, false, enable_tune_message);

+DEFINE_string(sc, "", scale_message);
+
+DEFINE_string(bi, "", bias_message);
+
 } // namespace TNN_NS

test/flags.h (8 additions, 0 deletions)

@@ -63,6 +63,10 @@ static const char network_type_message[] = "network type: NAIVE, NPU, COREML, SN

 static const char enable_tune_message[] = "enable tune kernel(default false)";

+static const char scale_message[] = "input scale: s0,s1,s2,...";
+
+static const char bias_message[] = "input bias: b0,b1,b2,...";
+
 DECLARE_bool(h);

 DECLARE_string(mt);
@@ -101,6 +105,10 @@ DECLARE_string(is);

 DECLARE_bool(et);

+DECLARE_string(sc);
+
+DECLARE_string(bi);
+
 } // namespace TNN_NS

 #endif // TNN_TEST_FLAGS_H_

test/test.cc (57 additions, 10 deletions)

@@ -90,12 +90,12 @@ namespace test {
         MatMap input_mat_map = CreateBlobMatMap(input_blob_map, FLAGS_it);
         InitInputMatMap(input_mat_map);
         auto input_converters_map = CreateBlobConverterMap(input_blob_map);
-        auto input_params_map = CreateConvertParamMap(input_mat_map);
+        auto input_params_map = CreateConvertParamMap(input_mat_map, true);

         //mat format NCHW_FLOAT
         MatMap output_mat_map = CreateBlobMatMap(output_blob_map, 0);
         auto output_converters_map = CreateBlobConverterMap(output_blob_map);
-        auto output_params_map = CreateConvertParamMap(output_mat_map);
+        auto output_params_map = CreateConvertParamMap(output_mat_map, false);

         for (int i = 0; i < FLAGS_wc; ++i) {
             for(auto element : input_converters_map) {
@@ -132,7 +132,11 @@ namespace test {
                 return ret;
             }
         }
+#if DUMP_INPUT_BLOB || DUMP_OUTPUT_BLOB
+        ret = instance->Forward();
+#else
         ret = instance->ForwardAsync(nullptr);
+#endif
         if (!CheckResult("Forward", ret)) {
             return ret;
         }
@@ -204,6 +208,8 @@ namespace test {
         printf(" -fc \"<format for compare>\t%s \n", output_format_cmp_message);
         printf(" -nt \"<network type>\t%s \n", output_format_cmp_message);
         printf(" -et \"<enable tune>\t%s \n", enable_tune_message);
+        printf(" -sc \"<input scale>\t%s \n", scale_message);
+        printf(" -bi \"<input bias>\t%s \n", bias_message);
     }

     void SetCpuAffinity() {
@@ -386,7 +392,9 @@ namespace test {
             if (mat_type == NCHW_FLOAT) {
                 input_stream >> reinterpret_cast<float*>(mat_data)[i];
             } else {
-                input_stream >> reinterpret_cast<uint8_t*>(mat_data)[i];
+                int val;
+                input_stream >> val;
+                reinterpret_cast<uint8_t*>(mat_data)[i] = (uint8_t)val;
             }
         }
     }
@@ -402,21 +410,60 @@ namespace test {
         return converter_map;
     }

-    std::map<std::string, MatConvertParam> CreateConvertParamMap(MatMap& mat_map) {
+    static void SetScaleOrBias(std::vector<float> &param, const std::string &message) {
+        std::string delimiter = ",";
+        std::vector<float> fval;
+        std::ptrdiff_t p1 = 0, p2;
+        while (true) {
+            p2 = message.find(delimiter, p1);
+            if (p2 != std::string::npos) {
+                fval.push_back(atof(message.substr(p1, p2 - p1).c_str()));
+                p1 = p2 + 1;
+            } else {
+                fval.push_back(atof(message.substr(p1, message.length() - p1).c_str()));
+                break;
+            }
+        }
+        if (fval.size() > param.size()) {
+            param = fval;
+        } else {
+            for (int i = 0; i < fval.size(); ++i) {
+                param[i] = fval[i];
+            }
+        }
+    }
+
+    std::map<std::string, MatConvertParam> CreateConvertParamMap(MatMap& mat_map, bool is_input) {
         std::map<std::string, MatConvertParam> param_map;
         for(auto iter : mat_map) {
             MatConvertParam param;
             auto name = iter.first;
             auto mat = iter.second;
             auto mat_type = mat->GetMatType();
             auto dims = mat->GetDims();
-            if(mat_type != NCHW_FLOAT) {
-                std::fill(param.scale.begin(), param.scale.end(), 1.0f / 255.0f);
-                std::fill(param.bias.begin(), param.bias.end(), 0);
-            } else if(dims[1] > 4) {
-                param.scale = std::vector<float>(dims[1], 1);
-                param.bias = std::vector<float>(dims[1], 0);
+
+            // scale
+            if(is_input && !FLAGS_sc.empty()) {
+                SetScaleOrBias(param.scale, FLAGS_sc);
+            } else {
+                if(mat_type != NCHW_FLOAT) {
+                    std::fill(param.scale.begin(), param.scale.end(), 1.0f / 255.0f);
+                } else if(dims[1] > 4) {
+                    param.scale = std::vector<float>(dims[1], 1);
+                }
+            }
+
+            // bias
+            if(is_input && !FLAGS_bi.empty()) {
+                SetScaleOrBias(param.bias, FLAGS_bi);
+            } else {
+                if(mat_type != NCHW_FLOAT) {
+                    std::fill(param.bias.begin(), param.bias.end(), 0);
+                } else if(dims[1] > 4) {
+                    param.bias = std::vector<float>(dims[1], 0);
+                }
             }
+
             param_map[name] = param;
         }
         return param_map;
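
SetScaleOrBias splits a comma-separated flag value into floats and either replaces the default MatConvertParam vector (when more values are given than it holds) or overwrites only its leading entries. A standalone sketch of the same parsing and merge rule, using only the standard library; the flag value and the four-entry default below are illustrative assumptions, not taken from the commit:

```
#include <cstdio>
#include <cstdlib>
#include <string>
#include <vector>

static std::vector<float> ParseFloats(const std::string &message) {
    std::vector<float> fval;
    std::string::size_type p1 = 0, p2;
    while ((p2 = message.find(',', p1)) != std::string::npos) {
        fval.push_back(atof(message.substr(p1, p2 - p1).c_str()));
        p1 = p2 + 1;
    }
    fval.push_back(atof(message.substr(p1).c_str()));   // last (or only) element
    return fval;
}

int main() {
    std::vector<float> scale = {1.0f, 1.0f, 1.0f, 1.0f};           // assumed 4-entry default
    std::vector<float> parsed = ParseFloats("0.017,0.017,0.017");  // e.g. -sc 0.017,0.017,0.017
    if (parsed.size() > scale.size()) {
        scale = parsed;                                            // more values given: replace
    } else {
        for (size_t i = 0; i < parsed.size(); ++i) scale[i] = parsed[i];  // patch the front
    }
    for (float v : scale) printf("%g ", v);   // 0.017 0.017 0.017 1
    printf("\n");
    return 0;
}
```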

test/test.h (1 addition, 1 deletion)

@@ -48,7 +48,7 @@ namespace test {

 std::map<std::string, std::shared_ptr<BlobConverter>> CreateBlobConverterMap(BlobMap& blob_map);

-std::map<std::string, MatConvertParam> CreateConvertParamMap(MatMap& mat_map);
+std::map<std::string, MatConvertParam> CreateConvertParamMap(MatMap& mat_map, bool is_input);

 void WriteOutput(MatMap& outputs);
