Commit 842ac38

nihuisen.li authored and sen.li committed
fix instruction extension dispatch (Tencent#5427)
1 parent 32167f5 commit 842ac38

8 files changed: +135 −2,682 lines
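Why this fix matters: these files all use ncnn's runtime-dispatch idiom. Each entry point first tries kernels built for newer instruction extensions, where every branch is guarded by NCNN_RUNTIME_CPU, the per-extension build switch (NCNN_ARM84I8MM, NCNN_ARM82DOT, ...), and a !__FEATURE__ test so the branch disappears once that feature is baked into the baseline; control then falls through to the generic path. The old code additionally wrapped all of the runtime branches in a blanket guard such as #if !(__ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD), so baking in any one extension at compile time (say dotprod) also discarded the dispatch to stronger extensions (i8mm) that could still be detected at runtime. The fix drops the blanket guards and instead has each branch exclude exactly the extensions that are already compiled in unconditionally. Below is a minimal, self-contained sketch of the corrected idiom — the kernel and cpu_has_* names are stand-ins, not ncnn's API, and the real guards also test NCNN_RUNTIME_CPU and the build switches:

    // dispatch_sketch.cpp — corrected dispatch idiom; compiles anywhere
    // (undefined feature macros evaluate to 0 inside #if).
    #include <cstdio>

    // Stand-ins for ncnn::cpu_support_arm_i8mm() / cpu_support_arm_asimddp().
    static bool cpu_has_i8mm() { return false; }
    static bool cpu_has_dotprod() { return true; }

    static void kernel_i8mm() { std::puts("i8mm kernel"); }
    static void kernel_asimddp() { std::puts("dotprod kernel"); }
    static void kernel_generic() { std::puts("generic kernel"); }

    void convolution_dispatch()
    {
        // Try the i8mm kernel unless i8mm is already baked in at compile
        // time — in that case the generic path below *is* the i8mm path.
    #if !__ARM_FEATURE_MATMUL_INT8
        if (cpu_has_i8mm())
        {
            kernel_i8mm();
            return;
        }
    #endif

        // Try the dotprod kernel only when neither dotprod nor the stronger
        // i8mm is baked in. Under the old blanket guard
        //     #if !(__ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD)
        // a dotprod-baseline build also lost the i8mm branch above.
    #if !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
        if (cpu_has_dotprod())
        {
            kernel_asimddp();
            return;
        }
    #endif

        kernel_generic(); // built with whatever extensions -march bakes in
    }

    int main()
    {
        convolution_dispatch(); // prints "dotprod kernel" with the stubs above
        return 0;
    }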

src/layer/arm/convolution_im2col_gemm_int8.h (+95 −2,601)

Large diffs are not rendered by default.

src/layer/arm/convolution_packed_int8.h (+3 −9)

@@ -12,21 +12,18 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#if !(__ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD)
 #if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
 void convolution_transform_kernel_packed_int8_i8mm(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h);
 void convolution_packed_int8_i8mm(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt);
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD
+#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
 void convolution_transform_kernel_packed_int8_asimddp(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h);
 void convolution_packed_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt);
 #endif
-#endif
 
 static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
 {
-#if !(__ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD)
 #if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
     if (ncnn::cpu_support_arm_i8mm())
     {
@@ -35,13 +32,12 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
     }
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD
+#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
     if (ncnn::cpu_support_arm_asimddp())
     {
         convolution_transform_kernel_packed_int8_asimddp(kernel, kernel_tm, inch, outch, kernel_w, kernel_h);
         return;
     }
-#endif
 #endif
 
     const int maxk = kernel_w * kernel_h;
@@ -531,7 +527,6 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
 
 static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
 {
-#if !(__ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD)
 #if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
     if (ncnn::cpu_support_arm_i8mm())
     {
@@ -540,13 +535,12 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
     }
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD
+#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
     if (ncnn::cpu_support_arm_asimddp())
     {
         convolution_packed_int8_asimddp(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
         return;
     }
-#endif
 #endif
 
     const int w = bottom_blob.w;

src/layer/arm/innerproduct_fp16s.h (+4 −12)

@@ -12,23 +12,20 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#if !(__ARM_FEATURE_FP16_FML || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
 #if NCNN_RUNTIME_CPU && NCNN_ARM82FP16FML && __aarch64__ && !__ARM_FEATURE_FP16_FML
 void innerproduct_pack4_fp16s_neon_asimdfhm(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt);
 void innerproduct_fp16s_neon_asimdfhm(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt);
 void innerproduct_transform_kernel_fp16s_neon_asimdfhm(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, const Option& opt);
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC && !__ARM_FEATURE_FP16_FML
 void innerproduct_pack4_fp16s_neon_asimdhp(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt);
 void innerproduct_fp16s_neon_asimdhp(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt);
 void innerproduct_transform_kernel_fp16s_neon_asimdhp(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, const Option& opt);
 #endif
-#endif
 
 static void innerproduct_pack4_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt)
 {
-#if !(__ARM_FEATURE_FP16_FML || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
 #if NCNN_RUNTIME_CPU && NCNN_ARM82FP16FML && __aarch64__ && !__ARM_FEATURE_FP16_FML
     if (ncnn::cpu_support_arm_asimdfhm())
     {
@@ -37,13 +34,12 @@ static void innerproduct_pack4_fp16s_neon(const Mat& bottom_blob, Mat& top_blob,
     }
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC && !__ARM_FEATURE_FP16_FML
     if (ncnn::cpu_support_arm_asimdhp())
     {
         innerproduct_pack4_fp16s_neon_asimdhp(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt);
         return;
     }
-#endif
 #endif
 
     const int num_input = bottom_blob.w * bottom_blob.elempack;
@@ -294,7 +290,6 @@ static void innerproduct_pack4_fp16s_neon(const Mat& bottom_blob, Mat& top_blob,
 
 static void innerproduct_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt)
 {
-#if !(__ARM_FEATURE_FP16_FML || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
 #if NCNN_RUNTIME_CPU && NCNN_ARM82FP16FML && __aarch64__ && !__ARM_FEATURE_FP16_FML
     if (ncnn::cpu_support_arm_asimdfhm())
     {
@@ -303,13 +298,12 @@ static void innerproduct_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const
     }
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC && !__ARM_FEATURE_FP16_FML
     if (ncnn::cpu_support_arm_asimdhp())
     {
         innerproduct_fp16s_neon_asimdhp(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt);
         return;
     }
-#endif
 #endif
 
     const int num_input = bottom_blob.w * bottom_blob.elempack;
@@ -516,7 +510,6 @@ static void innerproduct_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const
 
 static void innerproduct_transform_kernel_fp16s_neon(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, const Option& opt)
 {
-#if !(__ARM_FEATURE_FP16_FML || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
 #if NCNN_RUNTIME_CPU && NCNN_ARM82FP16FML && __aarch64__ && !__ARM_FEATURE_FP16_FML
     if (ncnn::cpu_support_arm_asimdfhm())
     {
@@ -525,13 +518,12 @@ static void innerproduct_transform_kernel_fp16s_neon(const Mat& weight_data, Mat
     }
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC && !__ARM_FEATURE_FP16_FML
     if (ncnn::cpu_support_arm_asimdhp())
     {
         innerproduct_transform_kernel_fp16s_neon_asimdhp(weight_data, weight_data_tm, num_input, num_output, opt);
         return;
     }
-#endif
 #endif
 
     int out_elempack = 1;

src/layer/arm/innerproduct_gemm_fp16s.h (+2 −6)

@@ -12,19 +12,16 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#if !(__ARM_FEATURE_FP16_FML || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
 #if NCNN_RUNTIME_CPU && NCNN_ARM82FP16FML && __aarch64__ && !__ARM_FEATURE_FP16_FML
 void innerproduct_gemm_fp16s_neon_asimdfhm(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt);
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC && !__ARM_FEATURE_FP16_FML
 void innerproduct_gemm_fp16s_neon_asimdhp(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt);
 #endif
-#endif
 
 static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt)
 {
-#if !(__ARM_FEATURE_FP16_FML || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
 #if NCNN_RUNTIME_CPU && NCNN_ARM82FP16FML && __aarch64__ && !__ARM_FEATURE_FP16_FML
     if (ncnn::cpu_support_arm_asimdfhm())
     {
@@ -33,13 +30,12 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob,
     }
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC && !__ARM_FEATURE_FP16_FML
     if (ncnn::cpu_support_arm_asimdhp())
     {
         innerproduct_gemm_fp16s_neon_asimdhp(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt);
         return;
     }
-#endif
 #endif
 
     const int num_input = bottom_blob.w;
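The two innerproduct fp16 files receive the same treatment for the fp16 pair: the blanket #if !(__ARM_FEATURE_FP16_FML || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC) wrapper is gone and the asimdhp branch now also excludes __ARM_FEATURE_FP16_FML, so a build with fp16 arithmetic in its baseline keeps the runtime dispatch to the fp16fml (asimdfhm) kernels instead of silently losing it.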

src/layer/x86/cast_bf16.h (+3 −3)

@@ -17,7 +17,7 @@ void cast_fp32_to_bf16_sse_avx512bf16(const Mat& bottom_blob, Mat& top_blob, con
 void cast_bf16_to_fp32_sse_avx512bf16(const Mat& bottom_blob, Mat& top_blob, const Option& opt);
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__
+#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVX512BF16__
 void cast_fp32_to_bf16_sse_avx2(const Mat& bottom_blob, Mat& top_blob, const Option& opt);
 void cast_bf16_to_fp32_sse_avx2(const Mat& bottom_blob, Mat& top_blob, const Option& opt);
 #endif
@@ -32,7 +32,7 @@ static void cast_fp32_to_bf16_sse(const Mat& bottom_blob, Mat& top_blob, const O
     }
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__
+#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVX512BF16__
     if (ncnn::cpu_support_x86_avx2())
     {
         cast_fp32_to_bf16_sse_avx2(bottom_blob, top_blob, opt);
@@ -104,7 +104,7 @@ static void cast_bf16_to_fp32_sse(const Mat& bottom_blob, Mat& top_blob, const O
     }
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__
+#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVX512BF16__
     if (ncnn::cpu_support_x86_avx2())
     {
         cast_bf16_to_fp32_sse_avx2(bottom_blob, top_blob, opt);
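In the x86 bf16 cast the same rule is applied to a two-tier chain: the AVX2 runtime branch is now also compiled out when __AVX512BF16__ is baked in. In practice a -mavx512bf16 build defines __AVX2__ as well, so !__AVX2__ would already drop the branch; spelling out !__AVX512BF16__ presumably keeps the exclusion chains explicit and uniform across files. A quick probe (a standalone sketch, not ncnn code) shows which runtime branches survive preprocessing for a given flag set — compile it with e.g. -mavx, -mavx2, or -mavx512bf16 and compare the output:

    // guard_probe.cpp — standalone sketch mirroring the fixed cast_bf16 guards.
    #include <cstdio>

    int main()
    {
    #if __AVX512F__ && !__AVX512BF16__
        std::puts("kept: avx512bf16 runtime branch");
    #endif
    #if __AVX__ && !__AVX2__ && !__AVX512BF16__
        std::puts("kept: avx2 runtime branch");
    #endif
    #if __AVX512BF16__
        std::puts("bf16 conversion baked into the generic path");
    #endif
        return 0;
    }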

src/layer/x86/convolution_3x3_winograd_int8.h (+11 −21)

@@ -12,29 +12,27 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__)
 #if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__
 void conv3x3s1_winograd23_int8_avx512vnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
 void conv3x3s1_winograd43_int8_avx512vnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__
+#if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
 void conv3x3s1_winograd23_int8_avxvnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
 void conv3x3s1_winograd43_int8_avxvnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__
+#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
 void conv3x3s1_winograd23_transform_kernel_int8_avx2(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt);
 void conv3x3s1_winograd23_int8_avx2(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
 void conv3x3s1_winograd43_transform_kernel_int8_avx2(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt);
 void conv3x3s1_winograd43_int8_avx2(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__
+#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
 void conv3x3s1_winograd23_int8_xop(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
 void conv3x3s1_winograd43_int8_xop(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
 #endif
-#endif
 
 static void pack_A_tile_int8(const Mat& A, Mat& AT, int batch, int max_ii, int max_kk)
 {
@@ -3430,14 +3428,12 @@ static inline void conv3x3s1_winograd23_transform_kernel_tile_int8(const Mat& ke
 
 static void conv3x3s1_winograd23_transform_kernel_int8(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt)
 {
-#if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__)
-#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__
+#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
     if (ncnn::cpu_support_x86_avx2())
     {
         conv3x3s1_winograd23_transform_kernel_int8_avx2(kernel, AT, inch, outch, opt);
         return;
     }
-#endif
 #endif
 
     const int M = outch;
@@ -4430,7 +4426,6 @@ static inline void conv3x3s1_winograd23_transform_output_tile_int8(const Mat& to
 
 static void conv3x3s1_winograd23_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt)
 {
-#if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__)
 #if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__
     if (ncnn::cpu_support_x86_avx512_vnni())
     {
@@ -4439,29 +4434,28 @@ static void conv3x3s1_winograd23_int8(const Mat& bottom_blob, Mat& top_blob, con
     }
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__
+#if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
     if (ncnn::cpu_support_x86_avx_vnni())
     {
         conv3x3s1_winograd23_int8_avxvnni(bottom_blob, top_blob, AT, nT, opt);
         return;
     }
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__
+#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
     if (ncnn::cpu_support_x86_avx2())
     {
         conv3x3s1_winograd23_int8_avx2(bottom_blob, top_blob, AT, nT, opt);
         return;
     }
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__
+#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
     if (ncnn::cpu_support_x86_xop())
     {
         conv3x3s1_winograd23_int8_xop(bottom_blob, top_blob, AT, nT, opt);
         return;
     }
-#endif
 #endif
 
     int outw = top_blob.w;
@@ -4642,14 +4636,12 @@ static inline void conv3x3s1_winograd43_transform_kernel_tile_int8(const Mat& ke
 
 static void conv3x3s1_winograd43_transform_kernel_int8(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt)
 {
-#if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__)
-#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__
+#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
     if (ncnn::cpu_support_x86_avx2())
    {
         conv3x3s1_winograd43_transform_kernel_int8_avx2(kernel, AT, inch, outch, opt);
         return;
     }
-#endif
 #endif
 
     const int M = outch;
@@ -6260,7 +6252,6 @@ static inline void conv3x3s1_winograd43_transform_output_tile_int8(const Mat& to
 
 static void conv3x3s1_winograd43_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt)
 {
-#if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__)
 #if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__
     if (ncnn::cpu_support_x86_avx512_vnni())
     {
@@ -6269,29 +6260,28 @@ static void conv3x3s1_winograd43_int8(const Mat& bottom_blob, Mat& top_blob, con
     }
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__
+#if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
     if (ncnn::cpu_support_x86_avx_vnni())
    {
         conv3x3s1_winograd43_int8_avxvnni(bottom_blob, top_blob, AT, nT, opt);
         return;
     }
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__
+#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
     if (ncnn::cpu_support_x86_avx2())
     {
         conv3x3s1_winograd43_int8_avx2(bottom_blob, top_blob, AT, nT, opt);
         return;
     }
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__
+#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
     if (ncnn::cpu_support_x86_xop())
     {
         conv3x3s1_winograd43_int8_xop(bottom_blob, top_blob, AT, nT, opt);
         return;
     }
-#endif
 #endif
 
     int outw = top_blob.w;
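The winograd file shows the full four-tier chain, priority-ordered AVX512VNNI > AVXVNNI > AVX2 > XOP: each runtime branch now excludes every stronger extension instead of relying on the old blanket guard #if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__). The practical effect cuts both ways: a -mxop build (XOP implies AVX) regains the AVX2 runtime branch that the blanket guard used to discard, while a build with AVX2 or better baked in no longer dispatches down to the XOP kernel, which the compiled-in generic path should already beat. Note also that the two transform_kernel entry points only dispatch to the AVX2 variants — presumably because VNNI and XOP accelerate the int8 dot products of the convolution itself rather than the one-time weight transform, no _avx512vnni/_avxvnni/_xop transform kernels exist.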
