Commit 842ac38

nihuisen.li authored and sen.li committed
fix instruction extension dispatch (Tencent#5427)
1 parent 32167f5 commit 842ac38

8 files changed: +135 −2,682 lines
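Why this fix matters: these files all use ncnn's runtime-dispatch idiom. Each entry point first tries kernels built for newer instruction extensions, where every branch is guarded by NCNN_RUNTIME_CPU, the per-extension build switch (NCNN_ARM84I8MM, NCNN_ARM82DOT, ...), and a !__FEATURE__ test so the branch disappears once that feature is baked into the baseline; control then falls through to the generic path. The old code additionally wrapped all of the runtime branches in a blanket guard such as #if !(__ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD), so baking in any one extension at compile time (say dotprod) also discarded the dispatch to stronger extensions (i8mm) that could still be detected at runtime. The fix drops the blanket guards and instead has each branch exclude exactly the extensions that are already compiled in unconditionally. Below is a minimal, self-contained sketch of the corrected idiom — the kernel and cpu_has_* names are stand-ins, not ncnn's API, and the real guards also test NCNN_RUNTIME_CPU and the build switches:

    // dispatch_sketch.cpp — corrected dispatch idiom; compiles anywhere
    // (undefined feature macros evaluate to 0 inside #if).
    #include <cstdio>

    // Stand-ins for ncnn::cpu_support_arm_i8mm() / cpu_support_arm_asimddp().
    static bool cpu_has_i8mm() { return false; }
    static bool cpu_has_dotprod() { return true; }

    static void kernel_i8mm() { std::puts("i8mm kernel"); }
    static void kernel_asimddp() { std::puts("dotprod kernel"); }
    static void kernel_generic() { std::puts("generic kernel"); }

    void convolution_dispatch()
    {
        // Try the i8mm kernel unless i8mm is already baked in at compile
        // time — in that case the generic path below *is* the i8mm path.
    #if !__ARM_FEATURE_MATMUL_INT8
        if (cpu_has_i8mm())
        {
            kernel_i8mm();
            return;
        }
    #endif

        // Try the dotprod kernel only when neither dotprod nor the stronger
        // i8mm is baked in. Under the old blanket guard
        //     #if !(__ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD)
        // a dotprod-baseline build also lost the i8mm branch above.
    #if !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
        if (cpu_has_dotprod())
        {
            kernel_asimddp();
            return;
        }
    #endif

        kernel_generic(); // built with whatever extensions -march bakes in
    }

    int main()
    {
        convolution_dispatch(); // prints "dotprod kernel" with the stubs above
        return 0;
    }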

src/layer/arm/convolution_im2col_gemm_int8.h (+95 −2,601)

Large diffs are not rendered by default.

src/layer/arm/convolution_packed_int8.h (+3 −9)

@@ -12,21 +12,18 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#if !(__ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD)
 #if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
 void convolution_transform_kernel_packed_int8_i8mm(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h);
 void convolution_packed_int8_i8mm(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt);
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD
+#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
 void convolution_transform_kernel_packed_int8_asimddp(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h);
 void convolution_packed_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt);
 #endif
-#endif
 
 static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
 {
-#if !(__ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD)
 #if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
     if (ncnn::cpu_support_arm_i8mm())
     {
@@ -35,13 +32,12 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
     }
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD
+#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
     if (ncnn::cpu_support_arm_asimddp())
     {
         convolution_transform_kernel_packed_int8_asimddp(kernel, kernel_tm, inch, outch, kernel_w, kernel_h);
         return;
     }
-#endif
 #endif
 
     const int maxk = kernel_w * kernel_h;
@@ -531,7 +527,6 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
 
 static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
 {
-#if !(__ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD)
 #if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
     if (ncnn::cpu_support_arm_i8mm())
     {
@@ -540,13 +535,12 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
     }
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD
+#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
     if (ncnn::cpu_support_arm_asimddp())
     {
         convolution_packed_int8_asimddp(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
         return;
     }
-#endif
 #endif
 
     const int w = bottom_blob.w;

src/layer/arm/innerproduct_fp16s.h (+4 −12)

@@ -12,23 +12,20 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#if !(__ARM_FEATURE_FP16_FML || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
 #if NCNN_RUNTIME_CPU && NCNN_ARM82FP16FML && __aarch64__ && !__ARM_FEATURE_FP16_FML
 void innerproduct_pack4_fp16s_neon_asimdfhm(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt);
 void innerproduct_fp16s_neon_asimdfhm(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt);
 void innerproduct_transform_kernel_fp16s_neon_asimdfhm(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, const Option& opt);
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC && !__ARM_FEATURE_FP16_FML
 void innerproduct_pack4_fp16s_neon_asimdhp(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt);
 void innerproduct_fp16s_neon_asimdhp(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt);
 void innerproduct_transform_kernel_fp16s_neon_asimdhp(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, const Option& opt);
 #endif
-#endif
 
 static void innerproduct_pack4_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt)
 {
-#if !(__ARM_FEATURE_FP16_FML || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
 #if NCNN_RUNTIME_CPU && NCNN_ARM82FP16FML && __aarch64__ && !__ARM_FEATURE_FP16_FML
     if (ncnn::cpu_support_arm_asimdfhm())
     {
@@ -37,13 +34,12 @@ static void innerproduct_pack4_fp16s_neon(const Mat& bottom_blob, Mat& top_blob,
     }
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC && !__ARM_FEATURE_FP16_FML
     if (ncnn::cpu_support_arm_asimdhp())
     {
         innerproduct_pack4_fp16s_neon_asimdhp(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt);
         return;
     }
-#endif
 #endif
 
     const int num_input = bottom_blob.w * bottom_blob.elempack;
@@ -294,7 +290,6 @@ static void innerproduct_pack4_fp16s_neon(const Mat& bottom_blob, Mat& top_blob,
 
 static void innerproduct_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt)
 {
-#if !(__ARM_FEATURE_FP16_FML || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
 #if NCNN_RUNTIME_CPU && NCNN_ARM82FP16FML && __aarch64__ && !__ARM_FEATURE_FP16_FML
     if (ncnn::cpu_support_arm_asimdfhm())
     {
@@ -303,13 +298,12 @@ static void innerproduct_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const
     }
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC && !__ARM_FEATURE_FP16_FML
     if (ncnn::cpu_support_arm_asimdhp())
     {
         innerproduct_fp16s_neon_asimdhp(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt);
         return;
     }
-#endif
 #endif
 
     const int num_input = bottom_blob.w * bottom_blob.elempack;
@@ -516,7 +510,6 @@ static void innerproduct_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const
 
 static void innerproduct_transform_kernel_fp16s_neon(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, const Option& opt)
 {
-#if !(__ARM_FEATURE_FP16_FML || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
 #if NCNN_RUNTIME_CPU && NCNN_ARM82FP16FML && __aarch64__ && !__ARM_FEATURE_FP16_FML
     if (ncnn::cpu_support_arm_asimdfhm())
     {
@@ -525,13 +518,12 @@ static void innerproduct_transform_kernel_fp16s_neon(const Mat& weight_data, Mat
     }
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC && !__ARM_FEATURE_FP16_FML
     if (ncnn::cpu_support_arm_asimdhp())
     {
         innerproduct_transform_kernel_fp16s_neon_asimdhp(weight_data, weight_data_tm, num_input, num_output, opt);
         return;
     }
-#endif
 #endif
 
     int out_elempack = 1;

src/layer/arm/innerproduct_gemm_fp16s.h (+2 −6)

@@ -12,19 +12,16 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#if !(__ARM_FEATURE_FP16_FML || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
 #if NCNN_RUNTIME_CPU && NCNN_ARM82FP16FML && __aarch64__ && !__ARM_FEATURE_FP16_FML
 void innerproduct_gemm_fp16s_neon_asimdfhm(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt);
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC && !__ARM_FEATURE_FP16_FML
 void innerproduct_gemm_fp16s_neon_asimdhp(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt);
 #endif
-#endif
 
 static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt)
 {
-#if !(__ARM_FEATURE_FP16_FML || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
 #if NCNN_RUNTIME_CPU && NCNN_ARM82FP16FML && __aarch64__ && !__ARM_FEATURE_FP16_FML
     if (ncnn::cpu_support_arm_asimdfhm())
     {
@@ -33,13 +30,12 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob,
     }
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC && !__ARM_FEATURE_FP16_FML
     if (ncnn::cpu_support_arm_asimdhp())
     {
         innerproduct_gemm_fp16s_neon_asimdhp(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt);
         return;
     }
-#endif
 #endif
 
     const int num_input = bottom_blob.w;
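The two innerproduct fp16 files receive the same treatment for the fp16 pair: the blanket #if !(__ARM_FEATURE_FP16_FML || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC) wrapper is gone and the asimdhp branch now also excludes __ARM_FEATURE_FP16_FML, so a build with fp16 arithmetic in its baseline keeps the runtime dispatch to the fp16fml (asimdfhm) kernels instead of silently losing it.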

src/layer/x86/cast_bf16.h (+3 −3)

@@ -17,7 +17,7 @@ void cast_fp32_to_bf16_sse_avx512bf16(const Mat& bottom_blob, Mat& top_blob, con
 void cast_bf16_to_fp32_sse_avx512bf16(const Mat& bottom_blob, Mat& top_blob, const Option& opt);
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__
+#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVX512BF16__
 void cast_fp32_to_bf16_sse_avx2(const Mat& bottom_blob, Mat& top_blob, const Option& opt);
 void cast_bf16_to_fp32_sse_avx2(const Mat& bottom_blob, Mat& top_blob, const Option& opt);
 #endif
@@ -32,7 +32,7 @@ static void cast_fp32_to_bf16_sse(const Mat& bottom_blob, Mat& top_blob, const O
     }
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__
+#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVX512BF16__
     if (ncnn::cpu_support_x86_avx2())
     {
         cast_fp32_to_bf16_sse_avx2(bottom_blob, top_blob, opt);
@@ -104,7 +104,7 @@ static void cast_bf16_to_fp32_sse(const Mat& bottom_blob, Mat& top_blob, const O
     }
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__
+#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVX512BF16__
     if (ncnn::cpu_support_x86_avx2())
     {
         cast_bf16_to_fp32_sse_avx2(bottom_blob, top_blob, opt);
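In the x86 bf16 cast the same rule is applied to a two-tier chain: the AVX2 runtime branch is now also compiled out when __AVX512BF16__ is baked in. In practice a -mavx512bf16 build defines __AVX2__ as well, so !__AVX2__ would already drop the branch; spelling out !__AVX512BF16__ presumably keeps the exclusion chains explicit and uniform across files. A quick probe (a standalone sketch, not ncnn code) shows which runtime branches survive preprocessing for a given flag set — compile it with e.g. -mavx, -mavx2, or -mavx512bf16 and compare the output:

    // guard_probe.cpp — standalone sketch mirroring the fixed cast_bf16 guards.
    #include <cstdio>

    int main()
    {
    #if __AVX512F__ && !__AVX512BF16__
        std::puts("kept: avx512bf16 runtime branch");
    #endif
    #if __AVX__ && !__AVX2__ && !__AVX512BF16__
        std::puts("kept: avx2 runtime branch");
    #endif
    #if __AVX512BF16__
        std::puts("bf16 conversion baked into the generic path");
    #endif
        return 0;
    }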

src/layer/x86/convolution_3x3_winograd_int8.h (+11 −21)

@@ -12,29 +12,27 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__)
 #if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__
 void conv3x3s1_winograd23_int8_avx512vnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
 void conv3x3s1_winograd43_int8_avx512vnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__
+#if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
 void conv3x3s1_winograd23_int8_avxvnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
 void conv3x3s1_winograd43_int8_avxvnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__
+#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
 void conv3x3s1_winograd23_transform_kernel_int8_avx2(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt);
 void conv3x3s1_winograd23_int8_avx2(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
 void conv3x3s1_winograd43_transform_kernel_int8_avx2(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt);
 void conv3x3s1_winograd43_int8_avx2(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__
+#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
 void conv3x3s1_winograd23_int8_xop(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
 void conv3x3s1_winograd43_int8_xop(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt);
 #endif
-#endif
 
 static void pack_A_tile_int8(const Mat& A, Mat& AT, int batch, int max_ii, int max_kk)
 {
@@ -3430,14 +3428,12 @@ static inline void conv3x3s1_winograd23_transform_kernel_tile_int8(const Mat& ke
 
 static void conv3x3s1_winograd23_transform_kernel_int8(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt)
 {
-#if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__)
-#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__
+#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
     if (ncnn::cpu_support_x86_avx2())
     {
         conv3x3s1_winograd23_transform_kernel_int8_avx2(kernel, AT, inch, outch, opt);
         return;
     }
-#endif
 #endif
 
     const int M = outch;
@@ -4430,7 +4426,6 @@ static inline void conv3x3s1_winograd23_transform_output_tile_int8(const Mat& to
 
 static void conv3x3s1_winograd23_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt)
 {
-#if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__)
 #if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__
     if (ncnn::cpu_support_x86_avx512_vnni())
     {
@@ -4439,29 +4434,28 @@ static void conv3x3s1_winograd23_int8(const Mat& bottom_blob, Mat& top_blob, con
     }
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__
+#if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
     if (ncnn::cpu_support_x86_avx_vnni())
     {
         conv3x3s1_winograd23_int8_avxvnni(bottom_blob, top_blob, AT, nT, opt);
         return;
     }
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__
+#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
     if (ncnn::cpu_support_x86_avx2())
     {
         conv3x3s1_winograd23_int8_avx2(bottom_blob, top_blob, AT, nT, opt);
         return;
     }
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__
+#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
     if (ncnn::cpu_support_x86_xop())
     {
         conv3x3s1_winograd23_int8_xop(bottom_blob, top_blob, AT, nT, opt);
         return;
     }
-#endif
 #endif
 
     int outw = top_blob.w;
@@ -4642,14 +4636,12 @@ static inline void conv3x3s1_winograd43_transform_kernel_tile_int8(const Mat& ke
 
 static void conv3x3s1_winograd43_transform_kernel_int8(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt)
 {
-#if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__)
-#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__
+#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
     if (ncnn::cpu_support_x86_avx2())
    {
         conv3x3s1_winograd43_transform_kernel_int8_avx2(kernel, AT, inch, outch, opt);
         return;
     }
-#endif
 #endif
 
     const int M = outch;
@@ -6260,7 +6252,6 @@ static inline void conv3x3s1_winograd43_transform_output_tile_int8(const Mat& to
 
 static void conv3x3s1_winograd43_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt)
 {
-#if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__)
 #if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__
     if (ncnn::cpu_support_x86_avx512_vnni())
     {
@@ -6269,29 +6260,28 @@ static void conv3x3s1_winograd43_int8(const Mat& bottom_blob, Mat& top_blob, con
     }
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__
+#if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
     if (ncnn::cpu_support_x86_avx_vnni())
    {
         conv3x3s1_winograd43_int8_avxvnni(bottom_blob, top_blob, AT, nT, opt);
         return;
     }
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__
+#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
     if (ncnn::cpu_support_x86_avx2())
     {
         conv3x3s1_winograd43_int8_avx2(bottom_blob, top_blob, AT, nT, opt);
         return;
     }
 #endif
 
-#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__
+#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ && !__AVX2__ && !__AVXVNNI__ && !__AVX512VNNI__
     if (ncnn::cpu_support_x86_xop())
     {
         conv3x3s1_winograd43_int8_xop(bottom_blob, top_blob, AT, nT, opt);
         return;
     }
-#endif
 #endif
 
     int outw = top_blob.w;
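The winograd file shows the full four-tier chain, priority-ordered AVX512VNNI > AVXVNNI > AVX2 > XOP: each runtime branch now excludes every stronger extension instead of relying on the old blanket guard #if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__). The practical effect cuts both ways: a -mxop build (XOP implies AVX) regains the AVX2 runtime branch that the blanket guard used to discard, while a build with AVX2 or better baked in no longer dispatches down to the XOP kernel, which the compiled-in generic path should already beat. Note also that the two transform_kernel entry points only dispatch to the AVX2 variants — presumably because VNNI and XOP accelerate the int8 dot products of the convolution itself rather than the one-time weight transform, no _avx512vnni/_avxvnni/_xop transform kernels exist.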
