Tencent
diff --git a/src/layer/arm/concat_arm.cpp b/src/layer/arm/concat_arm.cpp
+8
diff --git a/src/layer/arm/convolution_3x3_winograd.h b/src/layer/arm/convolution_3x3_winograd.h
+33 -3
diff --git a/src/layer/arm/convolution_3x3_winograd_bf16s.h b/src/layer/arm/convolution_3x3_winograd_bf16s.h
+33 -3
@@ -159,6 +159,8 @@ int Concat_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
         if (elempack < out_elempack)
         {
             convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
+            if (top_blob.empty())
+                return -100;
         }
     }
 
@@ -284,6 +286,8 @@ int Concat_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
         if (elempack < out_elempack)
         {
             convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
+            if (top_blob.empty())
+                return -100;
         }
     }
 
@@ -617,6 +621,8 @@ int Concat_arm::forward_bf16s_fp16s(const std::vector<Mat>& bottom_blobs, std::v
         if (elempack < out_elempack)
         {
             convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
+            if (top_blob.empty())
+                return -100;
         }
     }
 
@@ -816,6 +822,8 @@ int Concat_arm::forward_bf16s_fp16s(const std::vector<Mat>& bottom_blobs, std::v
         if (elempack < out_elempack)
         {
             convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
+            if (top_blob.empty())
+                return -100;
         }
     }
 
 
@@ -5578,7 +5578,7 @@ static inline void conv3x3s1_winograd23_transform_output_tile(const Mat& top_til
     }
 }
 
-static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
+static int conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
 {
     int outw = top_blob.w;
     int outh = top_blob.h;
@@ -5605,12 +5605,16 @@ static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Ma
     // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);
 
     Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);
+    if (BT.empty())
+        return -100;
 
     const int nn_NK = nn_N * nn_K;
 
     if (nT > 1 && nn_NK < nT)
     {
         Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator);
+        if (B_tile.empty())
+            return -100;
 
         for (int ppjk = 0; ppjk < nn_NK; ppjk++)
         {
@@ -5634,6 +5638,8 @@ static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Ma
     else
     {
         Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator);
+        if (B_tileX.empty())
+            return -100;
 
         #pragma omp parallel for num_threads(nT)
         for (int ppjk = 0; ppjk < nn_NK; ppjk++)
@@ -5659,6 +5665,8 @@ static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Ma
     }
 
     Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);
+    if (top_tileX.empty())
+        return -100;
 
     #pragma omp parallel for num_threads(nT)
     for (int ppj = 0; ppj < nn_M; ppj++)
@@ -5688,6 +5696,8 @@ static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Ma
             conv3x3s1_winograd23_transform_output_tile(top_tile, top_blob, bias, i, max_ii, j, max_jj);
         }
     }
+
+    return 0;
 }
 
 static inline void conv3x3s1_winograd43_transform_kernel_tile(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk)
@@ -7256,7 +7266,7 @@ static inline void conv3x3s1_winograd43_transform_output_tile(const Mat& top_til
     }
 }
 
-static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
+static int conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
 {
     int outw = top_blob.w;
     int outh = top_blob.h;
@@ -7283,12 +7293,16 @@ static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Ma
     // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);
 
     Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);
+    if (BT.empty())
+        return -100;
 
     const int nn_NK = nn_N * nn_K;
 
     if (nT > 1 && nn_NK < nT)
     {
         Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator);
+        if (B_tile.empty())
+            return -100;
 
         for (int ppjk = 0; ppjk < nn_NK; ppjk++)
         {
@@ -7312,6 +7326,8 @@ static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Ma
     else
     {
         Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator);
+        if (B_tileX.empty())
+            return -100;
 
         #pragma omp parallel for num_threads(nT)
         for (int ppjk = 0; ppjk < nn_NK; ppjk++)
@@ -7337,6 +7353,8 @@ static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Ma
     }
 
     Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);
+    if (top_tileX.empty())
+        return -100;
 
     #pragma omp parallel for num_threads(nT)
     for (int ppj = 0; ppj < nn_M; ppj++)
@@ -7366,6 +7384,8 @@ static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Ma
             conv3x3s1_winograd43_transform_output_tile(top_tile, top_blob, bias, i, max_ii, j, max_jj);
         }
     }
+
+    return 0;
 }
 
 static inline void conv3x3s1_winograd63_transform_kernel_tile(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk)
@@ -9292,7 +9312,7 @@ static inline void conv3x3s1_winograd63_transform_output_tile(const Mat& top_til
     }
 }
 
-static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
+static int conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
 {
     int outw = top_blob.w;
     int outh = top_blob.h;
@@ -9319,12 +9339,16 @@ static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Ma
     // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);
 
     Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);
+    if (BT.empty())
+        return -100;
 
     const int nn_NK = nn_N * nn_K;
 
     if (nT > 1 && nn_NK < nT)
     {
         Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator);
+        if (B_tile.empty())
+            return -100;
 
         for (int ppjk = 0; ppjk < nn_NK; ppjk++)
         {
@@ -9348,6 +9372,8 @@ static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Ma
     else
     {
         Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator);
+        if (B_tileX.empty())
+            return -100;
 
         #pragma omp parallel for num_threads(nT)
         for (int ppjk = 0; ppjk < nn_NK; ppjk++)
@@ -9373,6 +9399,8 @@ static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Ma
     }
 
     Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);
+    if (top_tileX.empty())
+        return -100;
 
     #pragma omp parallel for num_threads(nT)
     for (int ppj = 0; ppj < nn_M; ppj++)
@@ -9402,4 +9430,6 @@ static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Ma
             conv3x3s1_winograd63_transform_output_tile(top_tile, top_blob, bias, i, max_ii, j, max_jj);
         }
     }
+
+    return 0;
 }
@@ -920,7 +920,7 @@ static inline void conv3x3s1_winograd23_transform_output_tile_bf16s(const Mat& t
     }
 }
 
-static void conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
+static int conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
 {
     int outw = top_blob.w;
     int outh = top_blob.h;
@@ -947,12 +947,16 @@ static void conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, co
     // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);
 
     Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);
+    if (BT.empty())
+        return -100;
 
     const int nn_NK = nn_N * nn_K;
 
     if (nT > 1 && nn_NK < nT)
     {
         Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator);
+        if (B_tile.empty())
+            return -100;
 
         for (int ppjk = 0; ppjk < nn_NK; ppjk++)
         {
@@ -976,6 +980,8 @@ static void conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, co
     else
     {
         Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator);
+        if (B_tileX.empty())
+            return -100;
 
         #pragma omp parallel for num_threads(nT)
         for (int ppjk = 0; ppjk < nn_NK; ppjk++)
@@ -1001,6 +1007,8 @@ static void conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, co
     }
 
     Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);
+    if (top_tileX.empty())
+        return -100;
 
     #pragma omp parallel for num_threads(nT)
     for (int ppj = 0; ppj < nn_M; ppj++)
@@ -1030,6 +1038,8 @@ static void conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, co
             conv3x3s1_winograd23_transform_output_tile_bf16s(top_tile, top_blob, bias, i, max_ii, j, max_jj);
         }
     }
+
+    return 0;
 }
 
 static inline void conv3x3s1_winograd43_transform_input_tile_bf16s(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int nT)
@@ -2497,7 +2507,7 @@ static inline void conv3x3s1_winograd43_transform_output_tile_bf16s(const Mat& t
     }
 }
 
-static void conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
+static int conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
 {
     int outw = top_blob.w;
     int outh = top_blob.h;
@@ -2524,12 +2534,16 @@ static void conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, co
     // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);
 
     Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);
+    if (BT.empty())
+        return -100;
 
     const int nn_NK = nn_N * nn_K;
 
     if (nT > 1 && nn_NK < nT)
     {
         Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator);
+        if (B_tile.empty())
+            return -100;
 
         for (int ppjk = 0; ppjk < nn_NK; ppjk++)
         {
@@ -2553,6 +2567,8 @@ static void conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, co
     else
     {
         Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator);
+        if (B_tileX.empty())
+            return -100;
 
         #pragma omp parallel for num_threads(nT)
         for (int ppjk = 0; ppjk < nn_NK; ppjk++)
@@ -2578,6 +2594,8 @@ static void conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, co
     }
 
     Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);
+    if (top_tileX.empty())
+        return -100;
 
     #pragma omp parallel for num_threads(nT)
     for (int ppj = 0; ppj < nn_M; ppj++)
@@ -2607,6 +2625,8 @@ static void conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, co
             conv3x3s1_winograd43_transform_output_tile_bf16s(top_tile, top_blob, bias, i, max_ii, j, max_jj);
         }
     }
+
+    return 0;
 }
 
 static inline void conv3x3s1_winograd63_transform_input_tile_bf16s(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int nT)
@@ -4428,7 +4448,7 @@ static inline void conv3x3s1_winograd63_transform_output_tile_bf16s(const Mat& t
     }
 }
 
-static void conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
+static int conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
 {
     int outw = top_blob.w;
     int outh = top_blob.h;
@@ -4455,12 +4475,16 @@ static void conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, co
     // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);
 
     Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);
+    if (BT.empty())
+        return -100;
 
     const int nn_NK = nn_N * nn_K;
 
     if (nT > 1 && nn_NK < nT)
     {
         Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator);
+        if (B_tile.empty())
+            return -100;
 
         for (int ppjk = 0; ppjk < nn_NK; ppjk++)
         {
@@ -4484,6 +4508,8 @@ static void conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, co
     else
     {
         Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator);
+        if (B_tileX.empty())
+            return -100;
 
         #pragma omp parallel for num_threads(nT)
         for (int ppjk = 0; ppjk < nn_NK; ppjk++)
@@ -4509,6 +4535,8 @@ static void conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, co
     }
 
     Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);
+    if (top_tileX.empty())
+        return -100;
 
     #pragma omp parallel for num_threads(nT)
     for (int ppj = 0; ppj < nn_M; ppj++)
@@ -4538,4 +4566,6 @@ static void conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, co
             conv3x3s1_winograd63_transform_output_tile_bf16s(top_tile, top_blob, bias, i, max_ii, j, max_jj);
         }
     }
+
+    return 0;
 }
Original file line number	Diff line number	Diff line change
`@@ -159,6 +159,8 @@ int Concat_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&`
`159`	`159`	`if (elempack < out_elempack)`
`160`	`160`	`{`
`161`	`161`	`convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);`
	`162`	`+ if (top_blob.empty())`
	`163`	`+ return -100;`
`162`	`164`	`}`
`163`	`165`	`}`
`164`	`166`
`@@ -284,6 +286,8 @@ int Concat_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&`
`284`	`286`	`if (elempack < out_elempack)`
`285`	`287`	`{`
`286`	`288`	`convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);`
	`289`	`+ if (top_blob.empty())`
	`290`	`+ return -100;`
`287`	`291`	`}`
`288`	`292`	`}`
`289`	`293`
`@@ -617,6 +621,8 @@ int Concat_arm::forward_bf16s_fp16s(const std::vector<Mat>& bottom_blobs, std::v`
`617`	`621`	`if (elempack < out_elempack)`
`618`	`622`	`{`
`619`	`623`	`convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);`
	`624`	`+ if (top_blob.empty())`
	`625`	`+ return -100;`
`620`	`626`	`}`
`621`	`627`	`}`
`622`	`628`
`@@ -816,6 +822,8 @@ int Concat_arm::forward_bf16s_fp16s(const std::vector<Mat>& bottom_blobs, std::v`
`816`	`822`	`if (elempack < out_elempack)`
`817`	`823`	`{`
`818`	`824`	`convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);`
	`825`	`+ if (top_blob.empty())`
	`826`	`+ return -100;`
`819`	`827`	`}`
`820`	`828`	`}`
`821`	`829`
Original file line number	Diff line number	Diff line change
`@@ -5578,7 +5578,7 @@ static inline void conv3x3s1_winograd23_transform_output_tile(const Mat& top_til`
`5578`	`5578`	`}`
`5579`	`5579`	`}`
`5580`	`5580`
`5581`		`-static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)`
	`5581`	`+static int conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)`
`5582`	`5582`	`{`
`5583`	`5583`	`int outw = top_blob.w;`
`5584`	`5584`	`int outh = top_blob.h;`
`@@ -5605,12 +5605,16 @@ static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Ma`
`5605`	`5605`	`// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);`
`5606`	`5606`
`5607`	`5607`	`Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);`
	`5608`	`+ if (BT.empty())`
	`5609`	`+ return -100;`
`5608`	`5610`
`5609`	`5611`	`const int nn_NK = nn_N * nn_K;`
`5610`	`5612`
`5611`	`5613`	`if (nT > 1 && nn_NK < nT)`
`5612`	`5614`	`{`
`5613`	`5615`	`Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator);`
	`5616`	`+ if (B_tile.empty())`
	`5617`	`+ return -100;`
`5614`	`5618`
`5615`	`5619`	`for (int ppjk = 0; ppjk < nn_NK; ppjk++)`
`5616`	`5620`	`{`
`@@ -5634,6 +5638,8 @@ static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Ma`
`5634`	`5638`	`else`
`5635`	`5639`	`{`
`5636`	`5640`	`Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator);`
	`5641`	`+ if (B_tileX.empty())`
	`5642`	`+ return -100;`
`5637`	`5643`
`5638`	`5644`	`#pragma omp parallel for num_threads(nT)`
`5639`	`5645`	`for (int ppjk = 0; ppjk < nn_NK; ppjk++)`
`@@ -5659,6 +5665,8 @@ static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Ma`
`5659`	`5665`	`}`
`5660`	`5666`
`5661`	`5667`	`Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);`
	`5668`	`+ if (top_tileX.empty())`
	`5669`	`+ return -100;`
`5662`	`5670`
`5663`	`5671`	`#pragma omp parallel for num_threads(nT)`
`5664`	`5672`	`for (int ppj = 0; ppj < nn_M; ppj++)`
`@@ -5688,6 +5696,8 @@ static void conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Ma`
`5688`	`5696`	`conv3x3s1_winograd23_transform_output_tile(top_tile, top_blob, bias, i, max_ii, j, max_jj);`
`5689`	`5697`	`}`
`5690`	`5698`	`}`
	`5699`	`+`
	`5700`	`+ return 0;`
`5691`	`5701`	`}`
`5692`	`5702`
`5693`	`5703`	`static inline void conv3x3s1_winograd43_transform_kernel_tile(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk)`
`@@ -7256,7 +7266,7 @@ static inline void conv3x3s1_winograd43_transform_output_tile(const Mat& top_til`
`7256`	`7266`	`}`
`7257`	`7267`	`}`
`7258`	`7268`
`7259`		`-static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)`
	`7269`	`+static int conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)`
`7260`	`7270`	`{`
`7261`	`7271`	`int outw = top_blob.w;`
`7262`	`7272`	`int outh = top_blob.h;`
`@@ -7283,12 +7293,16 @@ static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Ma`
`7283`	`7293`	`// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);`
`7284`	`7294`
`7285`	`7295`	`Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);`
	`7296`	`+ if (BT.empty())`
	`7297`	`+ return -100;`
`7286`	`7298`
`7287`	`7299`	`const int nn_NK = nn_N * nn_K;`
`7288`	`7300`
`7289`	`7301`	`if (nT > 1 && nn_NK < nT)`
`7290`	`7302`	`{`
`7291`	`7303`	`Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator);`
	`7304`	`+ if (B_tile.empty())`
	`7305`	`+ return -100;`
`7292`	`7306`
`7293`	`7307`	`for (int ppjk = 0; ppjk < nn_NK; ppjk++)`
`7294`	`7308`	`{`
`@@ -7312,6 +7326,8 @@ static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Ma`
`7312`	`7326`	`else`
`7313`	`7327`	`{`
`7314`	`7328`	`Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator);`
	`7329`	`+ if (B_tileX.empty())`
	`7330`	`+ return -100;`
`7315`	`7331`
`7316`	`7332`	`#pragma omp parallel for num_threads(nT)`
`7317`	`7333`	`for (int ppjk = 0; ppjk < nn_NK; ppjk++)`
`@@ -7337,6 +7353,8 @@ static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Ma`
`7337`	`7353`	`}`
`7338`	`7354`
`7339`	`7355`	`Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);`
	`7356`	`+ if (top_tileX.empty())`
	`7357`	`+ return -100;`
`7340`	`7358`
`7341`	`7359`	`#pragma omp parallel for num_threads(nT)`
`7342`	`7360`	`for (int ppj = 0; ppj < nn_M; ppj++)`
`@@ -7366,6 +7384,8 @@ static void conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Ma`
`7366`	`7384`	`conv3x3s1_winograd43_transform_output_tile(top_tile, top_blob, bias, i, max_ii, j, max_jj);`
`7367`	`7385`	`}`
`7368`	`7386`	`}`
	`7387`	`+`
	`7388`	`+ return 0;`
`7369`	`7389`	`}`
`7370`	`7390`
`7371`	`7391`	`static inline void conv3x3s1_winograd63_transform_kernel_tile(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk)`
`@@ -9292,7 +9312,7 @@ static inline void conv3x3s1_winograd63_transform_output_tile(const Mat& top_til`
`9292`	`9312`	`}`
`9293`	`9313`	`}`
`9294`	`9314`
`9295`		`-static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)`
	`9315`	`+static int conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)`
`9296`	`9316`	`{`
`9297`	`9317`	`int outw = top_blob.w;`
`9298`	`9318`	`int outh = top_blob.h;`
`@@ -9319,12 +9339,16 @@ static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Ma`
`9319`	`9339`	`// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);`
`9320`	`9340`
`9321`	`9341`	`Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);`
	`9342`	`+ if (BT.empty())`
	`9343`	`+ return -100;`
`9322`	`9344`
`9323`	`9345`	`const int nn_NK = nn_N * nn_K;`
`9324`	`9346`
`9325`	`9347`	`if (nT > 1 && nn_NK < nT)`
`9326`	`9348`	`{`
`9327`	`9349`	`Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator);`
	`9350`	`+ if (B_tile.empty())`
	`9351`	`+ return -100;`
`9328`	`9352`
`9329`	`9353`	`for (int ppjk = 0; ppjk < nn_NK; ppjk++)`
`9330`	`9354`	`{`
`@@ -9348,6 +9372,8 @@ static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Ma`
`9348`	`9372`	`else`
`9349`	`9373`	`{`
`9350`	`9374`	`Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator);`
	`9375`	`+ if (B_tileX.empty())`
	`9376`	`+ return -100;`
`9351`	`9377`
`9352`	`9378`	`#pragma omp parallel for num_threads(nT)`
`9353`	`9379`	`for (int ppjk = 0; ppjk < nn_NK; ppjk++)`
`@@ -9373,6 +9399,8 @@ static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Ma`
`9373`	`9399`	`}`
`9374`	`9400`
`9375`	`9401`	`Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);`
	`9402`	`+ if (top_tileX.empty())`
	`9403`	`+ return -100;`
`9376`	`9404`
`9377`	`9405`	`#pragma omp parallel for num_threads(nT)`
`9378`	`9406`	`for (int ppj = 0; ppj < nn_M; ppj++)`
`@@ -9402,4 +9430,6 @@ static void conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Ma`
`9402`	`9430`	`conv3x3s1_winograd63_transform_output_tile(top_tile, top_blob, bias, i, max_ii, j, max_jj);`
`9403`	`9431`	`}`
`9404`	`9432`	`}`
	`9433`	`+`
	`9434`	`+ return 0;`
`9405`	`9435`	`}`
Original file line number	Diff line number	Diff line change
`@@ -920,7 +920,7 @@ static inline void conv3x3s1_winograd23_transform_output_tile_bf16s(const Mat& t`
`920`	`920`	`}`
`921`	`921`	`}`
`922`	`922`
`923`		`-static void conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)`
	`923`	`+static int conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)`
`924`	`924`	`{`
`925`	`925`	`int outw = top_blob.w;`
`926`	`926`	`int outh = top_blob.h;`
`@@ -947,12 +947,16 @@ static void conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, co`
`947`	`947`	`// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);`
`948`	`948`
`949`	`949`	`Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);`
	`950`	`+ if (BT.empty())`
	`951`	`+ return -100;`
`950`	`952`
`951`	`953`	`const int nn_NK = nn_N * nn_K;`
`952`	`954`
`953`	`955`	`if (nT > 1 && nn_NK < nT)`
`954`	`956`	`{`
`955`	`957`	`Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator);`
	`958`	`+ if (B_tile.empty())`
	`959`	`+ return -100;`
`956`	`960`
`957`	`961`	`for (int ppjk = 0; ppjk < nn_NK; ppjk++)`
`958`	`962`	`{`
`@@ -976,6 +980,8 @@ static void conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, co`
`976`	`980`	`else`
`977`	`981`	`{`
`978`	`982`	`Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator);`
	`983`	`+ if (B_tileX.empty())`
	`984`	`+ return -100;`
`979`	`985`
`980`	`986`	`#pragma omp parallel for num_threads(nT)`
`981`	`987`	`for (int ppjk = 0; ppjk < nn_NK; ppjk++)`
`@@ -1001,6 +1007,8 @@ static void conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, co`
`1001`	`1007`	`}`
`1002`	`1008`
`1003`	`1009`	`Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);`
	`1010`	`+ if (top_tileX.empty())`
	`1011`	`+ return -100;`
`1004`	`1012`
`1005`	`1013`	`#pragma omp parallel for num_threads(nT)`
`1006`	`1014`	`for (int ppj = 0; ppj < nn_M; ppj++)`
`@@ -1030,6 +1038,8 @@ static void conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, co`
`1030`	`1038`	`conv3x3s1_winograd23_transform_output_tile_bf16s(top_tile, top_blob, bias, i, max_ii, j, max_jj);`
`1031`	`1039`	`}`
`1032`	`1040`	`}`
	`1041`	`+`
	`1042`	`+ return 0;`
`1033`	`1043`	`}`
`1034`	`1044`
`1035`	`1045`	`static inline void conv3x3s1_winograd43_transform_input_tile_bf16s(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int nT)`
`@@ -2497,7 +2507,7 @@ static inline void conv3x3s1_winograd43_transform_output_tile_bf16s(const Mat& t`
`2497`	`2507`	`}`
`2498`	`2508`	`}`
`2499`	`2509`
`2500`		`-static void conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)`
	`2510`	`+static int conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)`
`2501`	`2511`	`{`
`2502`	`2512`	`int outw = top_blob.w;`
`2503`	`2513`	`int outh = top_blob.h;`
`@@ -2524,12 +2534,16 @@ static void conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, co`
`2524`	`2534`	`// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);`
`2525`	`2535`
`2526`	`2536`	`Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);`
	`2537`	`+ if (BT.empty())`
	`2538`	`+ return -100;`
`2527`	`2539`
`2528`	`2540`	`const int nn_NK = nn_N * nn_K;`
`2529`	`2541`
`2530`	`2542`	`if (nT > 1 && nn_NK < nT)`
`2531`	`2543`	`{`
`2532`	`2544`	`Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator);`
	`2545`	`+ if (B_tile.empty())`
	`2546`	`+ return -100;`
`2533`	`2547`
`2534`	`2548`	`for (int ppjk = 0; ppjk < nn_NK; ppjk++)`
`2535`	`2549`	`{`
`@@ -2553,6 +2567,8 @@ static void conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, co`
`2553`	`2567`	`else`
`2554`	`2568`	`{`
`2555`	`2569`	`Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator);`
	`2570`	`+ if (B_tileX.empty())`
	`2571`	`+ return -100;`
`2556`	`2572`
`2557`	`2573`	`#pragma omp parallel for num_threads(nT)`
`2558`	`2574`	`for (int ppjk = 0; ppjk < nn_NK; ppjk++)`
`@@ -2578,6 +2594,8 @@ static void conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, co`
`2578`	`2594`	`}`
`2579`	`2595`
`2580`	`2596`	`Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);`
	`2597`	`+ if (top_tileX.empty())`
	`2598`	`+ return -100;`
`2581`	`2599`
`2582`	`2600`	`#pragma omp parallel for num_threads(nT)`
`2583`	`2601`	`for (int ppj = 0; ppj < nn_M; ppj++)`
`@@ -2607,6 +2625,8 @@ static void conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, co`
`2607`	`2625`	`conv3x3s1_winograd43_transform_output_tile_bf16s(top_tile, top_blob, bias, i, max_ii, j, max_jj);`
`2608`	`2626`	`}`
`2609`	`2627`	`}`
	`2628`	`+`
	`2629`	`+ return 0;`
`2610`	`2630`	`}`
`2611`	`2631`
`2612`	`2632`	`static inline void conv3x3s1_winograd63_transform_input_tile_bf16s(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int nT)`
`@@ -4428,7 +4448,7 @@ static inline void conv3x3s1_winograd63_transform_output_tile_bf16s(const Mat& t`
`4428`	`4448`	`}`
`4429`	`4449`	`}`
`4430`	`4450`
`4431`		`-static void conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)`
	`4451`	`+static int conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)`
`4432`	`4452`	`{`
`4433`	`4453`	`int outw = top_blob.w;`
`4434`	`4454`	`int outh = top_blob.h;`
`@@ -4455,12 +4475,16 @@ static void conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, co`
`4455`	`4475`	`// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);`
`4456`	`4476`
`4457`	`4477`	`Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);`
	`4478`	`+ if (BT.empty())`
	`4479`	`+ return -100;`
`4458`	`4480`
`4459`	`4481`	`const int nn_NK = nn_N * nn_K;`
`4460`	`4482`
`4461`	`4483`	`if (nT > 1 && nn_NK < nT)`
`4462`	`4484`	`{`
`4463`	`4485`	`Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator);`
	`4486`	`+ if (B_tile.empty())`
	`4487`	`+ return -100;`
`4464`	`4488`
`4465`	`4489`	`for (int ppjk = 0; ppjk < nn_NK; ppjk++)`
`4466`	`4490`	`{`
`@@ -4484,6 +4508,8 @@ static void conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, co`
`4484`	`4508`	`else`
`4485`	`4509`	`{`
`4486`	`4510`	`Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator);`
	`4511`	`+ if (B_tileX.empty())`
	`4512`	`+ return -100;`
`4487`	`4513`
`4488`	`4514`	`#pragma omp parallel for num_threads(nT)`
`4489`	`4515`	`for (int ppjk = 0; ppjk < nn_NK; ppjk++)`
`@@ -4509,6 +4535,8 @@ static void conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, co`
`4509`	`4535`	`}`
`4510`	`4536`
`4511`	`4537`	`Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);`
	`4538`	`+ if (top_tileX.empty())`
	`4539`	`+ return -100;`
`4512`	`4540`
`4513`	`4541`	`#pragma omp parallel for num_threads(nT)`
`4514`	`4542`	`for (int ppj = 0; ppj < nn_M; ppj++)`
`@@ -4538,4 +4566,6 @@ static void conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, co`
`4538`	`4566`	`conv3x3s1_winograd63_transform_output_tile_bf16s(top_tile, top_blob, bias, i, max_ii, j, max_jj);`
`4539`	`4567`	`}`
`4540`	`4568`	`}`
	`4569`	`+`
	`4570`	`+ return 0;`
`4541`	`4571`	`}`