@@ -920,7 +920,7 @@ static inline void conv3x3s1_winograd23_transform_output_tile_bf16s(const Mat& t
920
920
}
921
921
}
922
922
923
- static void conv3x3s1_winograd23_bf16s (const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
923
+ static int conv3x3s1_winograd23_bf16s (const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
924
924
{
925
925
int outw = top_blob.w ;
926
926
int outh = top_blob.h ;
@@ -947,12 +947,16 @@ static void conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, co
947
947
// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);
948
948
949
949
Mat BT (TILE_K * TILE_N, B, (K + TILE_K - 1 ) / TILE_K, (N + TILE_N - 1 ) / TILE_N, 4u , opt.workspace_allocator );
950
+ if (BT.empty ())
951
+ return -100 ;
950
952
951
953
const int nn_NK = nn_N * nn_K;
952
954
953
955
if (nT > 1 && nn_NK < nT)
954
956
{
955
957
Mat B_tile (TILE_N * B * TILE_K, 4u , opt.workspace_allocator );
958
+ if (B_tile.empty ())
959
+ return -100 ;
956
960
957
961
for (int ppjk = 0 ; ppjk < nn_NK; ppjk++)
958
962
{
@@ -976,6 +980,8 @@ static void conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, co
976
980
else
977
981
{
978
982
Mat B_tileX (TILE_N * B * TILE_K, 1 , nT, 4u , opt.workspace_allocator );
983
+ if (B_tileX.empty ())
984
+ return -100 ;
979
985
980
986
#pragma omp parallel for num_threads(nT)
981
987
for (int ppjk = 0 ; ppjk < nn_NK; ppjk++)
@@ -1001,6 +1007,8 @@ static void conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, co
1001
1007
}
1002
1008
1003
1009
Mat top_tileX (TILE_N * B * TILE_M, 1 , nT, 4u , opt.workspace_allocator );
1010
+ if (top_tileX.empty ())
1011
+ return -100 ;
1004
1012
1005
1013
#pragma omp parallel for num_threads(nT)
1006
1014
for (int ppj = 0 ; ppj < nn_M; ppj++)
@@ -1030,6 +1038,8 @@ static void conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, co
1030
1038
conv3x3s1_winograd23_transform_output_tile_bf16s (top_tile, top_blob, bias, i, max_ii, j, max_jj);
1031
1039
}
1032
1040
}
1041
+
1042
+ return 0 ;
1033
1043
}
1034
1044
1035
1045
static inline void conv3x3s1_winograd43_transform_input_tile_bf16s (const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int nT)
@@ -2497,7 +2507,7 @@ static inline void conv3x3s1_winograd43_transform_output_tile_bf16s(const Mat& t
2497
2507
}
2498
2508
}
2499
2509
2500
- static void conv3x3s1_winograd43_bf16s (const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
2510
+ static int conv3x3s1_winograd43_bf16s (const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
2501
2511
{
2502
2512
int outw = top_blob.w ;
2503
2513
int outh = top_blob.h ;
@@ -2524,12 +2534,16 @@ static void conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, co
2524
2534
// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);
2525
2535
2526
2536
Mat BT (TILE_K * TILE_N, B, (K + TILE_K - 1 ) / TILE_K, (N + TILE_N - 1 ) / TILE_N, 4u , opt.workspace_allocator );
2537
+ if (BT.empty ())
2538
+ return -100 ;
2527
2539
2528
2540
const int nn_NK = nn_N * nn_K;
2529
2541
2530
2542
if (nT > 1 && nn_NK < nT)
2531
2543
{
2532
2544
Mat B_tile (TILE_N * B * TILE_K, 4u , opt.workspace_allocator );
2545
+ if (B_tile.empty ())
2546
+ return -100 ;
2533
2547
2534
2548
for (int ppjk = 0 ; ppjk < nn_NK; ppjk++)
2535
2549
{
@@ -2553,6 +2567,8 @@ static void conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, co
2553
2567
else
2554
2568
{
2555
2569
Mat B_tileX (TILE_N * B * TILE_K, 1 , nT, 4u , opt.workspace_allocator );
2570
+ if (B_tileX.empty ())
2571
+ return -100 ;
2556
2572
2557
2573
#pragma omp parallel for num_threads(nT)
2558
2574
for (int ppjk = 0 ; ppjk < nn_NK; ppjk++)
@@ -2578,6 +2594,8 @@ static void conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, co
2578
2594
}
2579
2595
2580
2596
Mat top_tileX (TILE_N * B * TILE_M, 1 , nT, 4u , opt.workspace_allocator );
2597
+ if (top_tileX.empty ())
2598
+ return -100 ;
2581
2599
2582
2600
#pragma omp parallel for num_threads(nT)
2583
2601
for (int ppj = 0 ; ppj < nn_M; ppj++)
@@ -2607,6 +2625,8 @@ static void conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, co
2607
2625
conv3x3s1_winograd43_transform_output_tile_bf16s (top_tile, top_blob, bias, i, max_ii, j, max_jj);
2608
2626
}
2609
2627
}
2628
+
2629
+ return 0 ;
2610
2630
}
2611
2631
2612
2632
static inline void conv3x3s1_winograd63_transform_input_tile_bf16s (const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int nT)
@@ -4428,7 +4448,7 @@ static inline void conv3x3s1_winograd63_transform_output_tile_bf16s(const Mat& t
4428
4448
}
4429
4449
}
4430
4450
4431
- static void conv3x3s1_winograd63_bf16s (const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
4451
+ static int conv3x3s1_winograd63_bf16s (const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
4432
4452
{
4433
4453
int outw = top_blob.w ;
4434
4454
int outh = top_blob.h ;
@@ -4455,12 +4475,16 @@ static void conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, co
4455
4475
// NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);
4456
4476
4457
4477
Mat BT (TILE_K * TILE_N, B, (K + TILE_K - 1 ) / TILE_K, (N + TILE_N - 1 ) / TILE_N, 4u , opt.workspace_allocator );
4478
+ if (BT.empty ())
4479
+ return -100 ;
4458
4480
4459
4481
const int nn_NK = nn_N * nn_K;
4460
4482
4461
4483
if (nT > 1 && nn_NK < nT)
4462
4484
{
4463
4485
Mat B_tile (TILE_N * B * TILE_K, 4u , opt.workspace_allocator );
4486
+ if (B_tile.empty ())
4487
+ return -100 ;
4464
4488
4465
4489
for (int ppjk = 0 ; ppjk < nn_NK; ppjk++)
4466
4490
{
@@ -4484,6 +4508,8 @@ static void conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, co
4484
4508
else
4485
4509
{
4486
4510
Mat B_tileX (TILE_N * B * TILE_K, 1 , nT, 4u , opt.workspace_allocator );
4511
+ if (B_tileX.empty ())
4512
+ return -100 ;
4487
4513
4488
4514
#pragma omp parallel for num_threads(nT)
4489
4515
for (int ppjk = 0 ; ppjk < nn_NK; ppjk++)
@@ -4509,6 +4535,8 @@ static void conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, co
4509
4535
}
4510
4536
4511
4537
Mat top_tileX (TILE_N * B * TILE_M, 1 , nT, 4u , opt.workspace_allocator );
4538
+ if (top_tileX.empty ())
4539
+ return -100 ;
4512
4540
4513
4541
#pragma omp parallel for num_threads(nT)
4514
4542
for (int ppj = 0 ; ppj < nn_M; ppj++)
@@ -4538,4 +4566,6 @@ static void conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, co
4538
4566
conv3x3s1_winograd63_transform_output_tile_bf16s (top_tile, top_blob, bias, i, max_ii, j, max_jj);
4539
4567
}
4540
4568
}
4569
+
4570
+ return 0 ;
4541
4571
}
0 commit comments