@@ -4464,9 +4464,9 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
4464
4464
{
4465
4465
const float* p0 = (const float*)A + k * A_hstep + (i + ii) * elempack;
4466
4466
4467
- #if __AVX512VNNI__ || __AVXVNNI__
4467
+ #if __AVX512VNNI__
4468
4468
__m128i _v127 = _mm_set1_epi8(127);
4469
- #endif // __AVX512VNNI__ || __AVXVNNI__
4469
+ #endif // __AVX512VNNI__
4470
4470
4471
4471
const float scale = scales[i + ii];
4472
4472
@@ -6952,9 +6952,9 @@ static void transpose_pack_B_tile_fp32_to_int8(const Mat& B, Mat& BT, int j, int
6952
6952
{
6953
6953
const float* p0 = (const float*)B + k * B_hstep + (j + jj) * elempack;
6954
6954
6955
- #if __AVX512VNNI__ || __AVXVNNI__
6955
+ #if __AVX512VNNI__
6956
6956
__m128i _v127 = _mm_set1_epi8(127);
6957
- #endif // __AVX512VNNI__ || __AVXVNNI__
6957
+ #endif // __AVX512VNNI__
6958
6958
6959
6959
#if __SSE2__
6960
6960
#if __AVX__
@@ -7094,7 +7094,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
7094
7094
7095
7095
__m512 _descale = _mm512_loadu_ps((const float*)descales + i + ii);
7096
7096
7097
- __m512 _c0;
7097
+ __m512 _c0 = _mm512_set1_ps(0.f) ;
7098
7098
if (pC)
7099
7099
{
7100
7100
if (broadcast_type_C == 0)
@@ -7357,7 +7357,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
7357
7357
_cf = _mm512_loadu_ps(pC + 128 + 112);
7358
7358
pC += 256;
7359
7359
}
7360
- if (c_elempack == 8)
7360
+ else if (c_elempack == 8)
7361
7361
{
7362
7362
__m512 _tmp0 = _mm512_loadu_ps(pC);
7363
7363
__m512 _tmp1 = _mm512_loadu_ps(pC + 16);
@@ -7395,7 +7395,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
7395
7395
7396
7396
pC += 128;
7397
7397
}
7398
- if (c_elempack == 4)
7398
+ else if (c_elempack == 4)
7399
7399
{
7400
7400
_c0 = _mm512_loadu_ps(pC);
7401
7401
_c1 = _mm512_loadu_ps(pC + 16);
@@ -7450,7 +7450,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
7450
7450
7451
7451
pC += 64;
7452
7452
}
7453
- if (c_elempack == 1)
7453
+ else // if (c_elempack == 1)
7454
7454
{
7455
7455
_c0 = _mm512_loadu_ps(pC);
7456
7456
_c1 = _mm512_loadu_ps(pC + c_hstep);
@@ -7938,7 +7938,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
7938
7938
_c7 = _mm512_loadu_ps(pC + 112);
7939
7939
pC += 128;
7940
7940
}
7941
- if (c_elempack == 8)
7941
+ else if (c_elempack == 8)
7942
7942
{
7943
7943
__m512 _tmp0 = _mm512_loadu_ps(pC);
7944
7944
__m512 _tmp1 = _mm512_loadu_ps(pC + 16);
@@ -7960,7 +7960,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
7960
7960
7961
7961
pC += 64;
7962
7962
}
7963
- if (c_elempack == 4)
7963
+ else if (c_elempack == 4)
7964
7964
{
7965
7965
_c0 = _mm512_loadu_ps(pC);
7966
7966
_c1 = _mm512_loadu_ps(pC + 16);
@@ -7991,7 +7991,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
7991
7991
7992
7992
pC += 32;
7993
7993
}
7994
- if (c_elempack == 1)
7994
+ else // if (c_elempack == 1)
7995
7995
{
7996
7996
__m256 _cc0 = _mm256_loadu_ps(pC);
7997
7997
__m256 _cc1 = _mm256_loadu_ps(pC + c_hstep);
@@ -8278,7 +8278,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
8278
8278
_c3 = _mm512_loadu_ps(pC + 48);
8279
8279
pC += 64;
8280
8280
}
8281
- if (c_elempack == 8)
8281
+ else if (c_elempack == 8)
8282
8282
{
8283
8283
__m512 _cc0 = _mm512_loadu_ps(pC);
8284
8284
__m512 _cc1 = _mm512_loadu_ps(pC + 16);
@@ -8290,7 +8290,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
8290
8290
_c3 = _mm512_shuffle_f32x4(_cc1, _cc3, _MM_SHUFFLE(3, 2, 3, 2));
8291
8291
pC += 32;
8292
8292
}
8293
- if (c_elempack == 4)
8293
+ else if (c_elempack == 4)
8294
8294
{
8295
8295
_c0 = _mm512_loadu_ps(pC);
8296
8296
_c1 = _mm512_loadu_ps(pC + c_hstep * 4);
@@ -8306,7 +8306,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
8306
8306
_c3 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1));
8307
8307
pC += 16;
8308
8308
}
8309
- if (c_elempack == 1)
8309
+ else // if (c_elempack == 1)
8310
8310
{
8311
8311
__m128 _cc0 = _mm_loadu_ps(pC);
8312
8312
__m128 _cc1 = _mm_loadu_ps(pC + c_hstep);
@@ -8562,15 +8562,15 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
8562
8562
_c1 = _mm512_loadu_ps(pC + 16);
8563
8563
pC += 32;
8564
8564
}
8565
- if (c_elempack == 8)
8565
+ else if (c_elempack == 8)
8566
8566
{
8567
8567
__m512 _cc0 = _mm512_loadu_ps(pC);
8568
8568
__m512 _cc1 = _mm512_loadu_ps(pC + c_hstep * 8);
8569
8569
_c0 = _mm512_shuffle_f32x4(_cc0, _cc1, _MM_SHUFFLE(1, 0, 1, 0));
8570
8570
_c1 = _mm512_shuffle_f32x4(_cc0, _cc1, _MM_SHUFFLE(3, 2, 3, 2));
8571
8571
pC += 16;
8572
8572
}
8573
- if (c_elempack == 4)
8573
+ else if (c_elempack == 4)
8574
8574
{
8575
8575
__m128 _cc0 = _mm_loadu_ps(pC);
8576
8576
__m128 _cc1 = _mm_loadu_ps(pC + 4);
@@ -8588,7 +8588,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
8588
8588
_c1 = _mm512_insertf32x8(_mm512_castps256_ps512(_cc13), _cc57, 1);
8589
8589
pC += 8;
8590
8590
}
8591
- if (c_elempack == 1)
8591
+ else // if (c_elempack == 1)
8592
8592
{
8593
8593
__m512i _vindex = _mm512_mullo_epi32(_mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), _mm512_set1_epi32(c_hstep));
8594
8594
_c0 = _mm512_i32gather_ps(_vindex, pC, sizeof(float));
@@ -8691,14 +8691,14 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
8691
8691
_c0 = _mm512_loadu_ps(pC);
8692
8692
pC += 16;
8693
8693
}
8694
- if (c_elempack == 8)
8694
+ else if (c_elempack == 8)
8695
8695
{
8696
8696
__m256 _cc0 = _mm256_loadu_ps(pC);
8697
8697
__m256 _cc1 = _mm256_loadu_ps(pC + c_hstep * 8);
8698
8698
_c0 = _mm512_insertf32x8(_mm512_castps256_ps512(_cc0), _cc1, 1);
8699
8699
pC += 8;
8700
8700
}
8701
- if (c_elempack == 4)
8701
+ else if (c_elempack == 4)
8702
8702
{
8703
8703
__m128 _cc0 = _mm_loadu_ps(pC);
8704
8704
__m128 _cc1 = _mm_loadu_ps(pC + c_hstep * 4);
@@ -8709,7 +8709,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
8709
8709
_c0 = _mm512_insertf32x8(_mm512_castps256_ps512(_cc01), _cc23, 1);
8710
8710
pC += 4;
8711
8711
}
8712
- if (c_elempack == 1)
8712
+ else // if (c_elempack == 1)
8713
8713
{
8714
8714
__m512i _vindex = _mm512_mullo_epi32(_mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), _mm512_set1_epi32(c_hstep));
8715
8715
_c0 = _mm512_i32gather_ps(_vindex, pC, sizeof(float));
@@ -8783,9 +8783,9 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
8783
8783
__m512 _descale_avx512 = _mm512_broadcast_f32x8(_descale);
8784
8784
#endif
8785
8785
8786
- __m256 _c0;
8786
+ __m256 _c0 = _mm256_set1_ps(0.f) ;
8787
8787
#if __AVX512F__
8788
- __m512 _c0_avx512;
8788
+ __m512 _c0_avx512 = _mm512_set1_ps(0.f) ;
8789
8789
#endif
8790
8790
if (pC)
8791
8791
{
@@ -8970,7 +8970,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
8970
8970
8971
8971
pC += 128;
8972
8972
}
8973
- if (c_elempack == 4)
8973
+ else if (c_elempack == 4)
8974
8974
{
8975
8975
_c0_avx512 = _mm512_loadu_ps(pC);
8976
8976
_c1_avx512 = _mm512_loadu_ps(pC + 16);
@@ -9010,7 +9010,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
9010
9010
9011
9011
pC += 64;
9012
9012
}
9013
- if (c_elempack == 1)
9013
+ else // if (c_elempack == 1)
9014
9014
{
9015
9015
_c0_avx512 = _mm512_loadu_ps(pC);
9016
9016
_c1_avx512 = _mm512_loadu_ps(pC + c_hstep);
@@ -9506,7 +9506,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
9506
9506
_c7 = _mm256_loadu_ps(pC + 56);
9507
9507
pC += 64;
9508
9508
}
9509
- if (c_elempack == 4)
9509
+ else if (c_elempack == 4)
9510
9510
{
9511
9511
__m256 _tmp0 = _mm256_loadu_ps(pC);
9512
9512
__m256 _tmp1 = _mm256_loadu_ps(pC + 8);
@@ -9526,7 +9526,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
9526
9526
_c7 = _mm256_permute2f128_ps(_tmp3, _tmp7, _MM_SHUFFLE(0, 3, 0, 1));
9527
9527
pC += 32;
9528
9528
}
9529
- if (c_elempack == 1)
9529
+ else // if (c_elempack == 1)
9530
9530
{
9531
9531
_c0 = _mm256_loadu_ps(pC);
9532
9532
_c1 = _mm256_loadu_ps(pC + c_hstep);
@@ -9772,7 +9772,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
9772
9772
_c3 = _mm256_loadu_ps(pC + 24);
9773
9773
pC += 32;
9774
9774
}
9775
- if (c_elempack == 4)
9775
+ else if (c_elempack == 4)
9776
9776
{
9777
9777
__m256 _cc0 = _mm256_loadu_ps(pC);
9778
9778
__m256 _cc1 = _mm256_loadu_ps(pC + 8);
@@ -9784,7 +9784,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
9784
9784
_c3 = _mm256_permute2f128_ps(_cc1, _cc3, _MM_SHUFFLE(0, 3, 0, 1));
9785
9785
pC += 16;
9786
9786
}
9787
- if (c_elempack == 1)
9787
+ else // if (c_elempack == 1)
9788
9788
{
9789
9789
// __m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
9790
9790
// _c0 = _mm256_i32gather_ps(pC, _vindex, c_hstep * sizeof(float));
@@ -9994,15 +9994,15 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
9994
9994
_c1 = _mm256_loadu_ps(pC + 8);
9995
9995
pC += 16;
9996
9996
}
9997
- if (c_elempack == 4)
9997
+ else if (c_elempack == 4)
9998
9998
{
9999
9999
__m256 _cc0 = _mm256_loadu_ps(pC);
10000
10000
__m256 _cc1 = _mm256_loadu_ps(pC + c_hstep * 4);
10001
10001
_c0 = _mm256_permute2f128_ps(_cc0, _cc1, _MM_SHUFFLE(0, 2, 0, 0));
10002
10002
_c1 = _mm256_permute2f128_ps(_cc0, _cc1, _MM_SHUFFLE(0, 3, 0, 1));
10003
10003
pC += 8;
10004
10004
}
10005
- if (c_elempack == 1)
10005
+ else // if (c_elempack == 1)
10006
10006
{
10007
10007
#if __AVX2__
10008
10008
__m256i _vindex = _mm256_mullo_epi32(_mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7), _mm256_set1_epi32(c_hstep));
@@ -10126,14 +10126,14 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
10126
10126
_c0 = _mm256_loadu_ps(pC);
10127
10127
pC += 8;
10128
10128
}
10129
- if (c_elempack == 4)
10129
+ else if (c_elempack == 4)
10130
10130
{
10131
10131
__m128 _cc0 = _mm_loadu_ps(pC);
10132
10132
__m128 _cc1 = _mm_loadu_ps(pC + c_hstep * 4);
10133
10133
_c0 = _mm256_insertf128_ps(_mm256_castps128_ps256(_cc0), _cc1, 1);
10134
10134
pC += 4;
10135
10135
}
10136
- if (c_elempack == 1)
10136
+ else // if (c_elempack == 1)
10137
10137
{
10138
10138
#if __AVX2__
10139
10139
__m256i _vindex = _mm256_mullo_epi32(_mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7), _mm256_set1_epi32(c_hstep));
@@ -10213,9 +10213,9 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
10213
10213
__m512 _descale_avx512 = _mm512_broadcast_f32x4(_descale);
10214
10214
#endif
10215
10215
10216
- __m128 _c0;
10216
+ __m128 _c0 = _mm_set1_ps(0.f) ;
10217
10217
#if __AVX512F__
10218
- __m512 _c0_avx512;
10218
+ __m512 _c0_avx512 = _mm512_set1_ps(0.f) ;
10219
10219
#endif
10220
10220
if (pC)
10221
10221
{
@@ -10323,7 +10323,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
10323
10323
_c2_avx512 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0));
10324
10324
_c3_avx512 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1));
10325
10325
}
10326
- if (c_elempack == 1)
10326
+ else // if (c_elempack == 1)
10327
10327
{
10328
10328
_c0_avx512 = _mm512_loadu_ps(pC);
10329
10329
_c1_avx512 = _mm512_loadu_ps(pC + c_hstep);
@@ -10596,7 +10596,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
10596
10596
_c2 = _mm_loadu_ps(pC + 8);
10597
10597
_c3 = _mm_loadu_ps(pC + 12);
10598
10598
}
10599
- if (c_elempack == 1)
10599
+ else // if (c_elempack == 1)
10600
10600
{
10601
10601
_c0 = _mm_loadu_ps(pC);
10602
10602
_c1 = _mm_loadu_ps(pC + c_hstep);
@@ -10627,7 +10627,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
10627
10627
_c3 = _mm_loadu_ps(pC + 28);
10628
10628
pC += 32;
10629
10629
}
10630
- if (c_elempack == 1)
10630
+ else // if (c_elempack == 1)
10631
10631
{
10632
10632
_c0 = _mm_loadu_ps(pC + 4);
10633
10633
_c1 = _mm_loadu_ps(pC + c_hstep + 4);
@@ -10832,7 +10832,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
10832
10832
_c3 = _mm_loadu_ps(pC + 12);
10833
10833
pC += 16;
10834
10834
}
10835
- if (c_elempack == 1)
10835
+ else // if (c_elempack == 1)
10836
10836
{
10837
10837
_c0 = _mm_loadu_ps(pC);
10838
10838
_c1 = _mm_loadu_ps(pC + c_hstep);
@@ -10992,7 +10992,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
10992
10992
_c1 = _mm_loadu_ps(pC + 4);
10993
10993
pC += 8;
10994
10994
}
10995
- if (c_elempack == 1)
10995
+ else // if (c_elempack == 1)
10996
10996
{
10997
10997
_c0 = _mm_setr_ps(pC[0], pC[c_hstep], pC[c_hstep * 2], pC[c_hstep * 3]);
10998
10998
_c1 = _mm_setr_ps(pC[1], pC[c_hstep + 1], pC[c_hstep * 2 + 1], pC[c_hstep * 3 + 1]);
@@ -11084,7 +11084,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
11084
11084
_c0 = _mm_loadu_ps(pC);
11085
11085
pC += 4;
11086
11086
}
11087
- if (c_elempack == 1)
11087
+ else // if (c_elempack == 1)
11088
11088
{
11089
11089
_c0 = _mm_setr_ps(pC[0], pC[c_hstep], pC[c_hstep * 2], pC[c_hstep * 3]);
11090
11090
pC += 1;
@@ -11153,14 +11153,14 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
11153
11153
#endif // __AVX512F__
11154
11154
#endif
11155
11155
11156
- float c0;
11157
- float c1;
11156
+ float c0 = 0.f ;
11157
+ float c1 = 0.f ;
11158
11158
#if __SSE2__
11159
- __m128 _c0;
11160
- __m128 _c1;
11159
+ __m128 _c0 = _mm_set1_ps(0.f) ;
11160
+ __m128 _c1 = _mm_set1_ps(0.f) ;
11161
11161
#if __AVX512F__
11162
- __m512 _c0_avx512;
11163
- __m512 _c1_avx512;
11162
+ __m512 _c0_avx512 = _mm512_set1_ps(0.f) ;
11163
+ __m512 _c1_avx512 = _mm512_set1_ps(0.f) ;
11164
11164
#endif // __AVX512F__
11165
11165
#endif
11166
11166
if (pC)
@@ -11726,11 +11726,11 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
11726
11726
#endif // __AVX512F__
11727
11727
#endif
11728
11728
11729
- float c0;
11729
+ float c0 = 0.f ;
11730
11730
#if __SSE2__
11731
- __m128 _c0;
11731
+ __m128 _c0 = _mm_set1_ps(0.f) ;
11732
11732
#if __AVX512F__
11733
- __m512 _c0_avx512;
11733
+ __m512 _c0_avx512 = _mm512_set1_ps(0.f) ;
11734
11734
#endif // __AVX512F__
11735
11735
#endif
11736
11736
if (pC)
0 commit comments