Skip to content

Commit 4a70be4

Browse files
authored
fix requantize pack4to8 (Tencent#5893)
1 parent ff5b554 commit 4a70be4

File tree

8 files changed

+408
-126
lines changed

8 files changed

+408
-126
lines changed

src/layer/arm/convolution_arm.cpp

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1376,15 +1376,22 @@ int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, con
13761376
#if __ARM_NEON
13771377
if (opt.use_packing_layout)
13781378
{
1379-
#if NCNN_ARM82
1380-
if (ncnn::cpu_support_arm_asimdhp() && opt.use_fp16_arithmetic)
1379+
if (use_int8_requantize)
13811380
{
1382-
out_elempack_int32 = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
1381+
out_elempack_int32 = num_output % 8 == 0 ? 8 : 1;
13831382
}
13841383
else
1385-
#endif // NCNN_ARM82
13861384
{
1387-
out_elempack_int32 = num_output % 4 == 0 ? 4 : 1;
1385+
#if NCNN_ARM82
1386+
if (ncnn::cpu_support_arm_asimdhp() && opt.use_fp16_arithmetic)
1387+
{
1388+
out_elempack_int32 = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
1389+
}
1390+
else
1391+
#endif // NCNN_ARM82
1392+
{
1393+
out_elempack_int32 = num_output % 4 == 0 ? 4 : 1;
1394+
}
13881395
}
13891396
}
13901397
#endif // __ARM_NEON

src/layer/arm/requantize_arm.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -120,8 +120,8 @@ static void requantize_relu(const int* intptr, signed char* ptr, const Mat& scal
120120
for (; i < size; i++)
121121
{
122122
float v = *intptr * scale;
123+
if (v < 0) v = 0;
123124
*ptr = float2int8(v);
124-
if (*ptr < 0) *ptr = 0;
125125
intptr++;
126126
ptr++;
127127
}
@@ -190,8 +190,8 @@ static void requantize_relu(const int* intptr, signed char* ptr, const Mat& scal
190190
for (; i < size; i++)
191191
{
192192
float v = *intptr * scale + bias;
193+
if (v < 0) v = 0;
193194
*ptr = float2int8(v);
194-
if (*ptr < 0) *ptr = 0;
195195
intptr++;
196196
ptr++;
197197
}
@@ -288,8 +288,8 @@ static void requantize_leakyrelu(const int* intptr, signed char* ptr, const Mat&
288288
for (; i < size; i++)
289289
{
290290
float v = *intptr * scale;
291+
if (v < 0) v *= slope;
291292
*ptr = float2int8(v);
292-
if (*ptr < 0) *ptr *= slope;
293293
intptr++;
294294
ptr++;
295295
}
@@ -358,8 +358,8 @@ static void requantize_leakyrelu(const int* intptr, signed char* ptr, const Mat&
358358
for (; i < size; i++)
359359
{
360360
float v = *intptr * scale + bias;
361+
if (v < 0) v *= slope;
361362
*ptr = float2int8(v);
362-
if (*ptr < 0) *ptr *= slope;
363363
intptr++;
364364
ptr++;
365365
}

src/layer/loongarch/requantize_loongarch.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -120,8 +120,8 @@ static void requantize_relu(const int* intptr, signed char* ptr, const Mat& scal
120120
for (; i < size; i++)
121121
{
122122
float v = *intptr * scale;
123+
if (v < 0) v = 0;
123124
*ptr = float2int8(v);
124-
if (*ptr < 0) *ptr = 0;
125125
intptr++;
126126
ptr++;
127127
}
@@ -182,8 +182,8 @@ static void requantize_relu(const int* intptr, signed char* ptr, const Mat& scal
182182
for (; i < size; i++)
183183
{
184184
float v = *intptr * scale + bias;
185+
if (v < 0) v = 0;
185186
*ptr = float2int8(v);
186-
if (*ptr < 0) *ptr = 0;
187187
intptr++;
188188
ptr++;
189189
}
@@ -281,8 +281,8 @@ static void requantize_leakyrelu(const int* intptr, signed char* ptr, const Mat&
281281
for (; i < size; i++)
282282
{
283283
float v = *intptr * scale;
284+
if (v < 0) v *= slope;
284285
*ptr = float2int8(v);
285-
if (*ptr < 0) *ptr *= slope;
286286
intptr++;
287287
ptr++;
288288
}
@@ -343,8 +343,8 @@ static void requantize_leakyrelu(const int* intptr, signed char* ptr, const Mat&
343343
for (; i < size; i++)
344344
{
345345
float v = *intptr * scale + bias;
346+
if (v < 0) v *= slope;
346347
*ptr = float2int8(v);
347-
if (*ptr < 0) *ptr *= slope;
348348
intptr++;
349349
ptr++;
350350
}

src/layer/mips/requantize_mips.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -120,8 +120,8 @@ static void requantize_relu(const int* intptr, signed char* ptr, const Mat& scal
120120
for (; i < size; i++)
121121
{
122122
float v = *intptr * scale;
123+
if (v < 0) v = 0;
123124
*ptr = float2int8(v);
124-
if (*ptr < 0) *ptr = 0;
125125
intptr++;
126126
ptr++;
127127
}
@@ -182,8 +182,8 @@ static void requantize_relu(const int* intptr, signed char* ptr, const Mat& scal
182182
for (; i < size; i++)
183183
{
184184
float v = *intptr * scale + bias;
185+
if (v < 0) v = 0;
185186
*ptr = float2int8(v);
186-
if (*ptr < 0) *ptr = 0;
187187
intptr++;
188188
ptr++;
189189
}
@@ -281,8 +281,8 @@ static void requantize_leakyrelu(const int* intptr, signed char* ptr, const Mat&
281281
for (; i < size; i++)
282282
{
283283
float v = *intptr * scale;
284+
if (v < 0) v *= slope;
284285
*ptr = float2int8(v);
285-
if (*ptr < 0) *ptr *= slope;
286286
intptr++;
287287
ptr++;
288288
}
@@ -343,8 +343,8 @@ static void requantize_leakyrelu(const int* intptr, signed char* ptr, const Mat&
343343
for (; i < size; i++)
344344
{
345345
float v = *intptr * scale + bias;
346+
if (v < 0) v *= slope;
346347
*ptr = float2int8(v);
347-
if (*ptr < 0) *ptr *= slope;
348348
intptr++;
349349
ptr++;
350350
}

src/layer/x86/convolution_x86.cpp

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -993,7 +993,18 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con
993993
#if __SSE2__
994994
if (opt.use_packing_layout)
995995
{
996-
out_elempack_int32 = num_output % 4 == 0 ? 4 : 1;
996+
if (use_int8_requantize)
997+
{
998+
#if __AVX__
999+
out_elempack_int32 = num_output % 8 == 0 ? 8 : 1;
1000+
#else
1001+
out_elempack_int32 = num_output % 8 == 0 ? 4 : 1;
1002+
#endif
1003+
}
1004+
else
1005+
{
1006+
out_elempack_int32 = num_output % 4 == 0 ? 4 : 1;
1007+
}
9971008
}
9981009
#endif // __SSE2__
9991010

src/layer/x86/requantize_x86.cpp

Lines changed: 171 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -330,18 +330,103 @@ static void requantize(const int* intptr, signed char* ptr, const Mat& scale_in_
330330
}
331331
}
332332

333+
#if __SSE2__
334+
#if !__AVX__
335+
static void requantize_pack4to8(const int* intptr0, const int* intptr1, signed char* ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, int activation_type, const Mat& activation_params, int elemcount)
336+
{
337+
const int scale_in_data_size = scale_in_data.w;
338+
const int bias_data_size = bias_data.w;
339+
const int scale_out_data_size = scale_out_data.w;
340+
341+
// NCNN_LOGE("requantize_pack4to8 %d %d %d %d", scale_in_data_size, bias_data_size, scale_out_data_size, elemcount);
342+
343+
__m128 _scale_in0 = _mm_set1_ps(scale_in_data[0]);
344+
__m128 _scale_in1 = _scale_in0;
345+
if (scale_in_data_size > 1)
346+
{
347+
_scale_in0 = _mm_loadu_ps((const float*)scale_in_data);
348+
_scale_in1 = _mm_loadu_ps((const float*)scale_in_data + 4);
349+
}
350+
351+
__m128 _scale_out0 = _mm_set1_ps(scale_out_data[0]);
352+
__m128 _scale_out1 = _scale_out0;
353+
if (scale_out_data_size > 1)
354+
{
355+
_scale_out0 = _mm_loadu_ps((const float*)scale_out_data);
356+
_scale_out1 = _mm_loadu_ps((const float*)scale_out_data + 4);
357+
}
358+
359+
if (bias_data_size == 0)
360+
{
361+
int i = 0;
362+
for (; i < elemcount; i++)
363+
{
364+
__m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr0));
365+
__m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr1));
366+
_v0 = _mm_mul_ps(_v0, _scale_in0);
367+
_v1 = _mm_mul_ps(_v1, _scale_in1);
368+
_v0 = activation_sse(_v0, activation_type, activation_params);
369+
_v1 = activation_sse(_v1, activation_type, activation_params);
370+
_v0 = _mm_mul_ps(_v0, _scale_out0);
371+
_v1 = _mm_mul_ps(_v1, _scale_out1);
372+
*(int64_t*)ptr = float2int8_sse(_v0, _v1);
373+
intptr0 += 4;
374+
intptr1 += 4;
375+
ptr += 8;
376+
}
377+
}
378+
else
379+
{
380+
__m128 _bias0 = _mm_set1_ps(bias_data[0]);
381+
__m128 _bias1 = _bias0;
382+
if (bias_data_size > 1)
383+
{
384+
_bias0 = _mm_loadu_ps((const float*)bias_data);
385+
_bias1 = _mm_loadu_ps((const float*)bias_data + 4);
386+
}
387+
388+
int i = 0;
389+
for (; i < elemcount; i++)
390+
{
391+
__m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr0));
392+
__m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr1));
393+
_v0 = _mm_comp_fmadd_ps(_v0, _scale_in0, _bias0);
394+
_v1 = _mm_comp_fmadd_ps(_v1, _scale_in1, _bias1);
395+
_v0 = activation_sse(_v0, activation_type, activation_params);
396+
_v1 = activation_sse(_v1, activation_type, activation_params);
397+
_v0 = _mm_mul_ps(_v0, _scale_out0);
398+
_v1 = _mm_mul_ps(_v1, _scale_out1);
399+
*(int64_t*)ptr = float2int8_sse(_v0, _v1);
400+
intptr0 += 4;
401+
intptr1 += 4;
402+
ptr += 8;
403+
}
404+
}
405+
}
406+
#endif // !__AVX__
407+
#endif // __SSE2__
408+
333409
int Requantize_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
334410
{
335411
const int dims = bottom_blob.dims;
336412
const int w = bottom_blob.w;
337413
const int h = bottom_blob.h;
338414
const int channels = bottom_blob.c;
339415
const int elempack = bottom_blob.elempack;
340-
const size_t out_elemsize = elempack * 1u;
341416

342417
if (dims == 1)
343418
{
344-
top_blob.create(w, out_elemsize, elempack, opt.blob_allocator);
419+
int out_elempack = 1;
420+
#if __SSE2__
421+
if (opt.use_packing_layout)
422+
{
423+
out_elempack = w * elempack % 8 == 0 ? 8 : 1;
424+
}
425+
#endif
426+
const int outw = w * elempack / out_elempack;
427+
const size_t out_elemsize = out_elempack * 1u;
428+
429+
top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator);
345430
if (top_blob.empty())
346431
return -100;
347432

@@ -368,41 +453,107 @@ int Requantize_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option&
368453

369454
if (dims == 2)
370455
{
371-
top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator);
456+
int out_elempack = 1;
457+
#if __SSE2__
458+
if (opt.use_packing_layout)
459+
{
460+
out_elempack = h * elempack % 8 == 0 ? 8 : 1;
461+
}
462+
#endif
463+
const int outh = h * elempack / out_elempack;
464+
const size_t out_elemsize = out_elempack * 1u;
465+
466+
top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator);
372467
if (top_blob.empty())
373468
return -100;
374469

375-
#pragma omp parallel for num_threads(opt.num_threads)
376-
for (int i = 0; i < h; i++)
470+
#if __SSE2__
471+
#if !__AVX__
472+
if (elempack == 4 && out_elempack == 8)
377473
{
378-
const int* intptr = bottom_blob.row<const int>(i);
379-
signed char* ptr = top_blob.row<signed char>(i);
474+
#pragma omp parallel for num_threads(opt.num_threads)
475+
for (int i = 0; i < outh; i++)
476+
{
477+
const int* intptr0 = bottom_blob.row<const int>(i * 2);
478+
const int* intptr1 = bottom_blob.row<const int>(i * 2 + 1);
479+
signed char* ptr = top_blob.row<signed char>(i);
380480

381-
const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * elempack, elempack) : scale_in_data;
382-
const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data;
383-
const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * elempack, elempack) : scale_out_data;
481+
const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * out_elempack, out_elempack) : scale_in_data;
482+
const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * out_elempack, out_elempack) : bias_data;
483+
const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * out_elempack, out_elempack) : scale_out_data;
384484

385-
requantize(intptr, ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w, elempack);
485+
requantize_pack4to8(intptr0, intptr1, ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w);
486+
}
487+
}
488+
#endif // !__AVX__
489+
#endif // __SSE2__
490+
if (elempack == out_elempack)
491+
{
492+
#pragma omp parallel for num_threads(opt.num_threads)
493+
for (int i = 0; i < h; i++)
494+
{
495+
const int* intptr = bottom_blob.row<const int>(i);
496+
signed char* ptr = top_blob.row<signed char>(i);
497+
498+
const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * elempack, elempack) : scale_in_data;
499+
const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data;
500+
const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * elempack, elempack) : scale_out_data;
501+
502+
requantize(intptr, ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w, elempack);
503+
}
386504
}
387505
}
388506

389507
if (dims == 3)
390508
{
391-
top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
509+
int out_elempack = 1;
510+
#if __SSE2__
511+
if (opt.use_packing_layout)
512+
{
513+
out_elempack = channels * elempack % 8 == 0 ? 8 : 1;
514+
}
515+
#endif
516+
const int outc = channels * elempack / out_elempack;
517+
const size_t out_elemsize = out_elempack * 1u;
518+
519+
top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
392520
if (top_blob.empty())
393521
return -100;
394522

395-
#pragma omp parallel for num_threads(opt.num_threads)
396-
for (int q = 0; q < channels; q++)
523+
#if __SSE2__
524+
#if !__AVX__
525+
if (elempack == 4 && out_elempack == 8)
526+
{
527+
#pragma omp parallel for num_threads(opt.num_threads)
528+
for (int q = 0; q < outc; q++)
529+
{
530+
const int* intptr0 = bottom_blob.channel(q * 2);
531+
const int* intptr1 = bottom_blob.channel(q * 2 + 1);
532+
signed char* ptr = top_blob.channel(q);
533+
534+
const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * out_elempack, out_elempack) : scale_in_data;
535+
const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * out_elempack, out_elempack) : bias_data;
536+
const Mat scale_out_data_q = scale_out_data_size > 1 ? scale_out_data.range(q * out_elempack, out_elempack) : scale_out_data;
537+
538+
requantize_pack4to8(intptr0, intptr1, ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h);
539+
}
540+
}
541+
#endif // !__AVX__
542+
#endif // __SSE2__
543+
if (elempack == out_elempack)
397544
{
398-
const int* intptr = bottom_blob.channel(q);
399-
signed char* ptr = top_blob.channel(q);
545+
#pragma omp parallel for num_threads(opt.num_threads)
546+
for (int q = 0; q < channels; q++)
547+
{
548+
const int* intptr = bottom_blob.channel(q);
549+
signed char* ptr = top_blob.channel(q);
400550

401-
const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * elempack, elempack) : scale_in_data;
402-
const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data;
403-
const Mat scale_out_data_q = scale_out_data_size > 1 ? scale_out_data.range(q * elempack, elempack) : scale_out_data;
551+
const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * elempack, elempack) : scale_in_data;
552+
const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data;
553+
const Mat scale_out_data_q = scale_out_data_size > 1 ? scale_out_data.range(q * elempack, elempack) : scale_out_data;
404554

405-
requantize(intptr, ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h, elempack);
555+
requantize(intptr, ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h, elempack);
556+
}
406557
}
407558
}
408559

0 commit comments

Comments
 (0)