@@ -330,18 +330,103 @@ static void requantize(const int* intptr, signed char* ptr, const Mat& scale_in_
330330 }
331331}
332332
#if __SSE2__
#if !__AVX__
// Requantize two pack4 int32 streams into a single pack8 int8 stream:
//   out = saturate_to_int8(activation(in * scale_in [+ bias]) * scale_out)
// intptr0 / intptr1 each supply `elemcount` groups of 4 int32 values that
// become the low / high halves of every 8-wide group written to ptr, so ptr
// receives elemcount * 8 signed chars.
// scale_in_data, bias_data and scale_out_data carry either a single value
// (broadcast to all 8 lanes, size == 1) or 8 per-lane values (size > 1);
// an empty bias_data (w == 0) selects the bias-free path.
static void requantize_pack4to8(const int* intptr0, const int* intptr1, signed char* ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, int activation_type, const Mat& activation_params, int elemcount)
{
    const int scale_in_data_size = scale_in_data.w;
    const int bias_data_size = bias_data.w;
    const int scale_out_data_size = scale_out_data.w;

    // NCNN_LOGE("requantize_pack4to8 %d %d %d %d", scale_in_data_size, bias_data_size, scale_out_data_size, elemcount);

    // input (dequantize) scale: broadcast one value, or load 8 per-lane values (4 + 4)
    __m128 _scale_in0 = _mm_set1_ps(scale_in_data[0]);
    __m128 _scale_in1 = _scale_in0;
    if (scale_in_data_size > 1)
    {
        _scale_in0 = _mm_loadu_ps((const float*)scale_in_data);
        _scale_in1 = _mm_loadu_ps((const float*)scale_in_data + 4);
    }

    // output (requantize) scale: broadcast one value, or load 8 per-lane values
    __m128 _scale_out0 = _mm_set1_ps(scale_out_data[0]);
    __m128 _scale_out1 = _scale_out0;
    if (scale_out_data_size > 1)
    {
        _scale_out0 = _mm_loadu_ps((const float*)scale_out_data);
        _scale_out1 = _mm_loadu_ps((const float*)scale_out_data + 4);
    }

    if (bias_data_size == 0)
    {
        // bias-free path: dequantize -> activate -> requantize
        int i = 0;
        for (; i < elemcount; i++)
        {
            __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr0));
            __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr1));
            _v0 = _mm_mul_ps(_v0, _scale_in0);
            _v1 = _mm_mul_ps(_v1, _scale_in1);
            _v0 = activation_sse(_v0, activation_type, activation_params);
            _v1 = activation_sse(_v1, activation_type, activation_params);
            _v0 = _mm_mul_ps(_v0, _scale_out0);
            _v1 = _mm_mul_ps(_v1, _scale_out1);
            // pack both 4-float halves into 8 saturated int8 values, stored as one
            // 64-bit write (x86 tolerates the unaligned store; ncnn-wide idiom)
            *(int64_t*)ptr = float2int8_sse(_v0, _v1);
            intptr0 += 4;
            intptr1 += 4;
            ptr += 8;
        }
    }
    else
    {
        // bias: broadcast one value, or load 8 per-lane values
        __m128 _bias0 = _mm_set1_ps(bias_data[0]);
        __m128 _bias1 = _bias0;
        if (bias_data_size > 1)
        {
            _bias0 = _mm_loadu_ps((const float*)bias_data);
            _bias1 = _mm_loadu_ps((const float*)bias_data + 4);
        }

        int i = 0;
        for (; i < elemcount; i++)
        {
            __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr0));
            __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr1));
            // fused multiply-add: v * scale_in + bias (falls back to mul+add
            // where FMA is unavailable, per the _mm_comp_fmadd_ps wrapper)
            _v0 = _mm_comp_fmadd_ps(_v0, _scale_in0, _bias0);
            _v1 = _mm_comp_fmadd_ps(_v1, _scale_in1, _bias1);
            _v0 = activation_sse(_v0, activation_type, activation_params);
            _v1 = activation_sse(_v1, activation_type, activation_params);
            _v0 = _mm_mul_ps(_v0, _scale_out0);
            _v1 = _mm_mul_ps(_v1, _scale_out1);
            *(int64_t*)ptr = float2int8_sse(_v0, _v1);
            intptr0 += 4;
            intptr1 += 4;
            ptr += 8;
        }
    }
}
#endif // !__AVX__
#endif // __SSE2__
408+
333409int Requantize_x86::forward (const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
334410{
335411 const int dims = bottom_blob.dims ;
336412 const int w = bottom_blob.w ;
337413 const int h = bottom_blob.h ;
338414 const int channels = bottom_blob.c ;
339415 const int elempack = bottom_blob.elempack ;
340- const size_t out_elemsize = elempack * 1u ;
341416
342417 if (dims == 1 )
343418 {
344- top_blob.create (w, out_elemsize, elempack, opt.blob_allocator );
419+ int out_elempack = 1 ;
420+ #if __SSE2__
421+ if (opt.use_packing_layout )
422+ {
423+ out_elempack = w * elempack % 8 == 0 ? 8 : 1 ;
424+ }
425+ #endif
426+ const int outw = w * elempack / out_elempack;
427+ const size_t out_elemsize = out_elempack * 1u ;
428+
429+ top_blob.create (outw, out_elemsize, out_elempack, opt.blob_allocator );
345430 if (top_blob.empty ())
346431 return -100 ;
347432
@@ -368,41 +453,107 @@ int Requantize_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option&
368453
369454 if (dims == 2 )
370455 {
371- top_blob.create (w, h, out_elemsize, elempack, opt.blob_allocator );
456+ int out_elempack = 1 ;
457+ #if __SSE2__
458+ if (opt.use_packing_layout )
459+ {
460+ out_elempack = h * elempack % 8 == 0 ? 8 : 1 ;
461+ }
462+ #endif
463+ const int outh = h * elempack / out_elempack;
464+ const size_t out_elemsize = out_elempack * 1u ;
465+
466+ top_blob.create (w, outh, out_elemsize, out_elempack, opt.blob_allocator );
372467 if (top_blob.empty ())
373468 return -100 ;
374469
375- #pragma omp parallel for num_threads(opt.num_threads)
376- for (int i = 0 ; i < h; i++)
470+ #if __SSE2__
471+ #if !__AVX__
472+ if (elempack == 4 && out_elempack == 8 )
377473 {
378- const int * intptr = bottom_blob.row <const int >(i);
379- signed char * ptr = top_blob.row <signed char >(i);
474+ #pragma omp parallel for num_threads(opt.num_threads)
475+ for (int i = 0 ; i < outh; i++)
476+ {
477+ const int * intptr0 = bottom_blob.row <const int >(i * 2 );
478+ const int * intptr1 = bottom_blob.row <const int >(i * 2 + 1 );
479+ signed char * ptr = top_blob.row <signed char >(i);
380480
381- const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range (i * elempack, elempack ) : scale_in_data;
382- const Mat bias_data_i = bias_data_size > 1 ? bias_data.range (i * elempack, elempack ) : bias_data;
383- const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range (i * elempack, elempack ) : scale_out_data;
481+ const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range (i * out_elempack, out_elempack ) : scale_in_data;
482+ const Mat bias_data_i = bias_data_size > 1 ? bias_data.range (i * out_elempack, out_elempack ) : bias_data;
483+ const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range (i * out_elempack, out_elempack ) : scale_out_data;
384484
385- requantize (intptr, ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w, elempack);
485+ requantize_pack4to8 (intptr0, intptr1, ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w);
486+ }
487+ }
488+ #endif // !__AVX__
489+ #endif // __SSE2__
490+ if (elempack == out_elempack)
491+ {
492+ #pragma omp parallel for num_threads(opt.num_threads)
493+ for (int i = 0 ; i < h; i++)
494+ {
495+ const int * intptr = bottom_blob.row <const int >(i);
496+ signed char * ptr = top_blob.row <signed char >(i);
497+
498+ const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range (i * elempack, elempack) : scale_in_data;
499+ const Mat bias_data_i = bias_data_size > 1 ? bias_data.range (i * elempack, elempack) : bias_data;
500+ const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range (i * elempack, elempack) : scale_out_data;
501+
502+ requantize (intptr, ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w, elempack);
503+ }
386504 }
387505 }
388506
389507 if (dims == 3 )
390508 {
391- top_blob.create (w, h, channels, out_elemsize, elempack, opt.blob_allocator );
509+ int out_elempack = 1 ;
510+ #if __SSE2__
511+ if (opt.use_packing_layout )
512+ {
513+ out_elempack = channels * elempack % 8 == 0 ? 8 : 1 ;
514+ }
515+ #endif
516+ const int outc = channels * elempack / out_elempack;
517+ const size_t out_elemsize = out_elempack * 1u ;
518+
519+ top_blob.create (w, h, outc, out_elemsize, out_elempack, opt.blob_allocator );
392520 if (top_blob.empty ())
393521 return -100 ;
394522
395- #pragma omp parallel for num_threads(opt.num_threads)
396- for (int q = 0 ; q < channels; q++)
523+ #if __SSE2__
524+ #if !__AVX__
525+ if (elempack == 4 && out_elempack == 8 )
526+ {
527+ #pragma omp parallel for num_threads(opt.num_threads)
528+ for (int q = 0 ; q < outc; q++)
529+ {
530+ const int * intptr0 = bottom_blob.channel (q * 2 );
531+ const int * intptr1 = bottom_blob.channel (q * 2 + 1 );
532+ signed char * ptr = top_blob.channel (q);
533+
534+ const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range (q * out_elempack, out_elempack) : scale_in_data;
535+ const Mat bias_data_q = bias_data_size > 1 ? bias_data.range (q * out_elempack, out_elempack) : bias_data;
536+ const Mat scale_out_data_q = scale_out_data_size > 1 ? scale_out_data.range (q * out_elempack, out_elempack) : scale_out_data;
537+
538+ requantize_pack4to8 (intptr0, intptr1, ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h);
539+ }
540+ }
541+ #endif // !__AVX__
542+ #endif // __SSE2__
543+ if (elempack == out_elempack)
397544 {
398- const int * intptr = bottom_blob.channel (q);
399- signed char * ptr = top_blob.channel (q);
545+ #pragma omp parallel for num_threads(opt.num_threads)
546+ for (int q = 0 ; q < channels; q++)
547+ {
548+ const int * intptr = bottom_blob.channel (q);
549+ signed char * ptr = top_blob.channel (q);
400550
401- const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range (q * elempack, elempack) : scale_in_data;
402- const Mat bias_data_q = bias_data_size > 1 ? bias_data.range (q * elempack, elempack) : bias_data;
403- const Mat scale_out_data_q = scale_out_data_size > 1 ? scale_out_data.range (q * elempack, elempack) : scale_out_data;
551+ const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range (q * elempack, elempack) : scale_in_data;
552+ const Mat bias_data_q = bias_data_size > 1 ? bias_data.range (q * elempack, elempack) : bias_data;
553+ const Mat scale_out_data_q = scale_out_data_size > 1 ? scale_out_data.range (q * elempack, elempack) : scale_out_data;
404554
405- requantize (intptr, ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h, elempack);
555+ requantize (intptr, ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h, elempack);
556+ }
406557 }
407558 }
408559
0 commit comments