Skip to content

Commit 383cb5f

Browse files
committed
w--
1 parent b38c5bf commit 383cb5f

File tree

1 file changed

+49
-49
lines changed

1 file changed

+49
-49
lines changed

src/layer/x86/gemm_int8.h

+49 -49
Original file line number | Diff line number | Diff line change
@@ -4464,9 +4464,9 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
44644464
{
44654465
const float* p0 = (const float*)A + k * A_hstep + (i + ii) * elempack;
44664466

4467-
#if __AVX512VNNI__ || __AVXVNNI__
4467+
#if __AVX512VNNI__
44684468
__m128i _v127 = _mm_set1_epi8(127);
4469-
#endif // __AVX512VNNI__ || __AVXVNNI__
4469+
#endif // __AVX512VNNI__
44704470

44714471
const float scale = scales[i + ii];
44724472

@@ -6952,9 +6952,9 @@ static void transpose_pack_B_tile_fp32_to_int8(const Mat& B, Mat& BT, int j, int
69526952
{
69536953
const float* p0 = (const float*)B + k * B_hstep + (j + jj) * elempack;
69546954

6955-
#if __AVX512VNNI__ || __AVXVNNI__
6955+
#if __AVX512VNNI__
69566956
__m128i _v127 = _mm_set1_epi8(127);
6957-
#endif // __AVX512VNNI__ || __AVXVNNI__
6957+
#endif // __AVX512VNNI__
69586958

69596959
#if __SSE2__
69606960
#if __AVX__
@@ -7094,7 +7094,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
70947094

70957095
__m512 _descale = _mm512_loadu_ps((const float*)descales + i + ii);
70967096

7097-
__m512 _c0;
7097+
__m512 _c0 = _mm512_set1_ps(0.f);
70987098
if (pC)
70997099
{
71007100
if (broadcast_type_C == 0)
@@ -7357,7 +7357,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
73577357
_cf = _mm512_loadu_ps(pC + 128 + 112);
73587358
pC += 256;
73597359
}
7360-
if (c_elempack == 8)
7360+
else if (c_elempack == 8)
73617361
{
73627362
__m512 _tmp0 = _mm512_loadu_ps(pC);
73637363
__m512 _tmp1 = _mm512_loadu_ps(pC + 16);
@@ -7395,7 +7395,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
73957395

73967396
pC += 128;
73977397
}
7398-
if (c_elempack == 4)
7398+
else if (c_elempack == 4)
73997399
{
74007400
_c0 = _mm512_loadu_ps(pC);
74017401
_c1 = _mm512_loadu_ps(pC + 16);
@@ -7450,7 +7450,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
74507450

74517451
pC += 64;
74527452
}
7453-
if (c_elempack == 1)
7453+
else // if (c_elempack == 1)
74547454
{
74557455
_c0 = _mm512_loadu_ps(pC);
74567456
_c1 = _mm512_loadu_ps(pC + c_hstep);
@@ -7938,7 +7938,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
79387938
_c7 = _mm512_loadu_ps(pC + 112);
79397939
pC += 128;
79407940
}
7941-
if (c_elempack == 8)
7941+
else if (c_elempack == 8)
79427942
{
79437943
__m512 _tmp0 = _mm512_loadu_ps(pC);
79447944
__m512 _tmp1 = _mm512_loadu_ps(pC + 16);
@@ -7960,7 +7960,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
79607960

79617961
pC += 64;
79627962
}
7963-
if (c_elempack == 4)
7963+
else if (c_elempack == 4)
79647964
{
79657965
_c0 = _mm512_loadu_ps(pC);
79667966
_c1 = _mm512_loadu_ps(pC + 16);
@@ -7991,7 +7991,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
79917991

79927992
pC += 32;
79937993
}
7994-
if (c_elempack == 1)
7994+
else // if (c_elempack == 1)
79957995
{
79967996
__m256 _cc0 = _mm256_loadu_ps(pC);
79977997
__m256 _cc1 = _mm256_loadu_ps(pC + c_hstep);
@@ -8278,7 +8278,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
82788278
_c3 = _mm512_loadu_ps(pC + 48);
82798279
pC += 64;
82808280
}
8281-
if (c_elempack == 8)
8281+
else if (c_elempack == 8)
82828282
{
82838283
__m512 _cc0 = _mm512_loadu_ps(pC);
82848284
__m512 _cc1 = _mm512_loadu_ps(pC + 16);
@@ -8290,7 +8290,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
82908290
_c3 = _mm512_shuffle_f32x4(_cc1, _cc3, _MM_SHUFFLE(3, 2, 3, 2));
82918291
pC += 32;
82928292
}
8293-
if (c_elempack == 4)
8293+
else if (c_elempack == 4)
82948294
{
82958295
_c0 = _mm512_loadu_ps(pC);
82968296
_c1 = _mm512_loadu_ps(pC + c_hstep * 4);
@@ -8306,7 +8306,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
83068306
_c3 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1));
83078307
pC += 16;
83088308
}
8309-
if (c_elempack == 1)
8309+
else // if (c_elempack == 1)
83108310
{
83118311
__m128 _cc0 = _mm_loadu_ps(pC);
83128312
__m128 _cc1 = _mm_loadu_ps(pC + c_hstep);
@@ -8562,15 +8562,15 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
85628562
_c1 = _mm512_loadu_ps(pC + 16);
85638563
pC += 32;
85648564
}
8565-
if (c_elempack == 8)
8565+
else if (c_elempack == 8)
85668566
{
85678567
__m512 _cc0 = _mm512_loadu_ps(pC);
85688568
__m512 _cc1 = _mm512_loadu_ps(pC + c_hstep * 8);
85698569
_c0 = _mm512_shuffle_f32x4(_cc0, _cc1, _MM_SHUFFLE(1, 0, 1, 0));
85708570
_c1 = _mm512_shuffle_f32x4(_cc0, _cc1, _MM_SHUFFLE(3, 2, 3, 2));
85718571
pC += 16;
85728572
}
8573-
if (c_elempack == 4)
8573+
else if (c_elempack == 4)
85748574
{
85758575
__m128 _cc0 = _mm_loadu_ps(pC);
85768576
__m128 _cc1 = _mm_loadu_ps(pC + 4);
@@ -8588,7 +8588,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
85888588
_c1 = _mm512_insertf32x8(_mm512_castps256_ps512(_cc13), _cc57, 1);
85898589
pC += 8;
85908590
}
8591-
if (c_elempack == 1)
8591+
else // if (c_elempack == 1)
85928592
{
85938593
__m512i _vindex = _mm512_mullo_epi32(_mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), _mm512_set1_epi32(c_hstep));
85948594
_c0 = _mm512_i32gather_ps(_vindex, pC, sizeof(float));
@@ -8691,14 +8691,14 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
86918691
_c0 = _mm512_loadu_ps(pC);
86928692
pC += 16;
86938693
}
8694-
if (c_elempack == 8)
8694+
else if (c_elempack == 8)
86958695
{
86968696
__m256 _cc0 = _mm256_loadu_ps(pC);
86978697
__m256 _cc1 = _mm256_loadu_ps(pC + c_hstep * 8);
86988698
_c0 = _mm512_insertf32x8(_mm512_castps256_ps512(_cc0), _cc1, 1);
86998699
pC += 8;
87008700
}
8701-
if (c_elempack == 4)
8701+
else if (c_elempack == 4)
87028702
{
87038703
__m128 _cc0 = _mm_loadu_ps(pC);
87048704
__m128 _cc1 = _mm_loadu_ps(pC + c_hstep * 4);
@@ -8709,7 +8709,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
87098709
_c0 = _mm512_insertf32x8(_mm512_castps256_ps512(_cc01), _cc23, 1);
87108710
pC += 4;
87118711
}
8712-
if (c_elempack == 1)
8712+
else // if (c_elempack == 1)
87138713
{
87148714
__m512i _vindex = _mm512_mullo_epi32(_mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), _mm512_set1_epi32(c_hstep));
87158715
_c0 = _mm512_i32gather_ps(_vindex, pC, sizeof(float));
@@ -8783,9 +8783,9 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
87838783
__m512 _descale_avx512 = _mm512_broadcast_f32x8(_descale);
87848784
#endif
87858785

8786-
__m256 _c0;
8786+
__m256 _c0 = _mm256_set1_ps(0.f);
87878787
#if __AVX512F__
8788-
__m512 _c0_avx512;
8788+
__m512 _c0_avx512 = _mm512_set1_ps(0.f);
87898789
#endif
87908790
if (pC)
87918791
{
@@ -8970,7 +8970,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
89708970

89718971
pC += 128;
89728972
}
8973-
if (c_elempack == 4)
8973+
else if (c_elempack == 4)
89748974
{
89758975
_c0_avx512 = _mm512_loadu_ps(pC);
89768976
_c1_avx512 = _mm512_loadu_ps(pC + 16);
@@ -9010,7 +9010,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
90109010

90119011
pC += 64;
90129012
}
9013-
if (c_elempack == 1)
9013+
else // if (c_elempack == 1)
90149014
{
90159015
_c0_avx512 = _mm512_loadu_ps(pC);
90169016
_c1_avx512 = _mm512_loadu_ps(pC + c_hstep);
@@ -9506,7 +9506,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
95069506
_c7 = _mm256_loadu_ps(pC + 56);
95079507
pC += 64;
95089508
}
9509-
if (c_elempack == 4)
9509+
else if (c_elempack == 4)
95109510
{
95119511
__m256 _tmp0 = _mm256_loadu_ps(pC);
95129512
__m256 _tmp1 = _mm256_loadu_ps(pC + 8);
@@ -9526,7 +9526,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
95269526
_c7 = _mm256_permute2f128_ps(_tmp3, _tmp7, _MM_SHUFFLE(0, 3, 0, 1));
95279527
pC += 32;
95289528
}
9529-
if (c_elempack == 1)
9529+
else // if (c_elempack == 1)
95309530
{
95319531
_c0 = _mm256_loadu_ps(pC);
95329532
_c1 = _mm256_loadu_ps(pC + c_hstep);
@@ -9772,7 +9772,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
97729772
_c3 = _mm256_loadu_ps(pC + 24);
97739773
pC += 32;
97749774
}
9775-
if (c_elempack == 4)
9775+
else if (c_elempack == 4)
97769776
{
97779777
__m256 _cc0 = _mm256_loadu_ps(pC);
97789778
__m256 _cc1 = _mm256_loadu_ps(pC + 8);
@@ -9784,7 +9784,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
97849784
_c3 = _mm256_permute2f128_ps(_cc1, _cc3, _MM_SHUFFLE(0, 3, 0, 1));
97859785
pC += 16;
97869786
}
9787-
if (c_elempack == 1)
9787+
else // if (c_elempack == 1)
97889788
{
97899789
// __m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
97909790
// _c0 = _mm256_i32gather_ps(pC, _vindex, c_hstep * sizeof(float));
@@ -9994,15 +9994,15 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
99949994
_c1 = _mm256_loadu_ps(pC + 8);
99959995
pC += 16;
99969996
}
9997-
if (c_elempack == 4)
9997+
else if (c_elempack == 4)
99989998
{
99999999
__m256 _cc0 = _mm256_loadu_ps(pC);
1000010000
__m256 _cc1 = _mm256_loadu_ps(pC + c_hstep * 4);
1000110001
_c0 = _mm256_permute2f128_ps(_cc0, _cc1, _MM_SHUFFLE(0, 2, 0, 0));
1000210002
_c1 = _mm256_permute2f128_ps(_cc0, _cc1, _MM_SHUFFLE(0, 3, 0, 1));
1000310003
pC += 8;
1000410004
}
10005-
if (c_elempack == 1)
10005+
else // if (c_elempack == 1)
1000610006
{
1000710007
#if __AVX2__
1000810008
__m256i _vindex = _mm256_mullo_epi32(_mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7), _mm256_set1_epi32(c_hstep));
@@ -10126,14 +10126,14 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
1012610126
_c0 = _mm256_loadu_ps(pC);
1012710127
pC += 8;
1012810128
}
10129-
if (c_elempack == 4)
10129+
else if (c_elempack == 4)
1013010130
{
1013110131
__m128 _cc0 = _mm_loadu_ps(pC);
1013210132
__m128 _cc1 = _mm_loadu_ps(pC + c_hstep * 4);
1013310133
_c0 = _mm256_insertf128_ps(_mm256_castps128_ps256(_cc0), _cc1, 1);
1013410134
pC += 4;
1013510135
}
10136-
if (c_elempack == 1)
10136+
else // if (c_elempack == 1)
1013710137
{
1013810138
#if __AVX2__
1013910139
__m256i _vindex = _mm256_mullo_epi32(_mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7), _mm256_set1_epi32(c_hstep));
@@ -10213,9 +10213,9 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
1021310213
__m512 _descale_avx512 = _mm512_broadcast_f32x4(_descale);
1021410214
#endif
1021510215

10216-
__m128 _c0;
10216+
__m128 _c0 = _mm_set1_ps(0.f);
1021710217
#if __AVX512F__
10218-
__m512 _c0_avx512;
10218+
__m512 _c0_avx512 = _mm512_set1_ps(0.f);
1021910219
#endif
1022010220
if (pC)
1022110221
{
@@ -10323,7 +10323,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
1032310323
_c2_avx512 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0));
1032410324
_c3_avx512 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1));
1032510325
}
10326-
if (c_elempack == 1)
10326+
else // if (c_elempack == 1)
1032710327
{
1032810328
_c0_avx512 = _mm512_loadu_ps(pC);
1032910329
_c1_avx512 = _mm512_loadu_ps(pC + c_hstep);
@@ -10596,7 +10596,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
1059610596
_c2 = _mm_loadu_ps(pC + 8);
1059710597
_c3 = _mm_loadu_ps(pC + 12);
1059810598
}
10599-
if (c_elempack == 1)
10599+
else // if (c_elempack == 1)
1060010600
{
1060110601
_c0 = _mm_loadu_ps(pC);
1060210602
_c1 = _mm_loadu_ps(pC + c_hstep);
@@ -10627,7 +10627,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
1062710627
_c3 = _mm_loadu_ps(pC + 28);
1062810628
pC += 32;
1062910629
}
10630-
if (c_elempack == 1)
10630+
else // if (c_elempack == 1)
1063110631
{
1063210632
_c0 = _mm_loadu_ps(pC + 4);
1063310633
_c1 = _mm_loadu_ps(pC + c_hstep + 4);
@@ -10832,7 +10832,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
1083210832
_c3 = _mm_loadu_ps(pC + 12);
1083310833
pC += 16;
1083410834
}
10835-
if (c_elempack == 1)
10835+
else // if (c_elempack == 1)
1083610836
{
1083710837
_c0 = _mm_loadu_ps(pC);
1083810838
_c1 = _mm_loadu_ps(pC + c_hstep);
@@ -10992,7 +10992,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
1099210992
_c1 = _mm_loadu_ps(pC + 4);
1099310993
pC += 8;
1099410994
}
10995-
if (c_elempack == 1)
10995+
else // if (c_elempack == 1)
1099610996
{
1099710997
_c0 = _mm_setr_ps(pC[0], pC[c_hstep], pC[c_hstep * 2], pC[c_hstep * 3]);
1099810998
_c1 = _mm_setr_ps(pC[1], pC[c_hstep + 1], pC[c_hstep * 2 + 1], pC[c_hstep * 3 + 1]);
@@ -11084,7 +11084,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
1108411084
_c0 = _mm_loadu_ps(pC);
1108511085
pC += 4;
1108611086
}
11087-
if (c_elempack == 1)
11087+
else // if (c_elempack == 1)
1108811088
{
1108911089
_c0 = _mm_setr_ps(pC[0], pC[c_hstep], pC[c_hstep * 2], pC[c_hstep * 3]);
1109011090
pC += 1;
@@ -11153,14 +11153,14 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
1115311153
#endif // __AVX512F__
1115411154
#endif
1115511155

11156-
float c0;
11157-
float c1;
11156+
float c0 = 0.f;
11157+
float c1 = 0.f;
1115811158
#if __SSE2__
11159-
__m128 _c0;
11160-
__m128 _c1;
11159+
__m128 _c0 = _mm_set1_ps(0.f);
11160+
__m128 _c1 = _mm_set1_ps(0.f);
1116111161
#if __AVX512F__
11162-
__m512 _c0_avx512;
11163-
__m512 _c1_avx512;
11162+
__m512 _c0_avx512 = _mm512_set1_ps(0.f);
11163+
__m512 _c1_avx512 = _mm512_set1_ps(0.f);
1116411164
#endif // __AVX512F__
1116511165
#endif
1116611166
if (pC)
@@ -11726,11 +11726,11 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
1172611726
#endif // __AVX512F__
1172711727
#endif
1172811728

11729-
float c0;
11729+
float c0 = 0.f;
1173011730
#if __SSE2__
11731-
__m128 _c0;
11731+
__m128 _c0 = _mm_set1_ps(0.f);
1173211732
#if __AVX512F__
11733-
__m512 _c0_avx512;
11733+
__m512 _c0_avx512 = _mm512_set1_ps(0.f);
1173411734
#endif // __AVX512F__
1173511735
#endif
1173611736
if (pC)

0 commit comments

Comments
 (0)