From 39cf4f6018a49d59deec1ae3133fabe602370131 Mon Sep 17 00:00:00 2001 From: nihui Date: Fri, 3 Jan 2025 18:52:45 +0800 Subject: [PATCH] slim reduction (#5866) --- src/layer/reduction.cpp | 1049 +++++++++++++------------------------- tests/test_reduction.cpp | 361 +++++-------- 2 files changed, 481 insertions(+), 929 deletions(-) diff --git a/src/layer/reduction.cpp b/src/layer/reduction.cpp index 55648f8eaf1..dc51b894fe4 100644 --- a/src/layer/reduction.cpp +++ b/src/layer/reduction.cpp @@ -45,35 +45,261 @@ int Reduction::load_param(const ParamDict& pd) return 0; } -template -static int reduction_op(const Mat& a, Mat& b, float v0, bool reduce_w, bool reduce_h, bool reduce_d, bool reduce_c, int keepdims, const Option& opt) +template +static float reduction(float v0, const float* ptr, int size) { Op op; - Op2 op2; - size_t elemsize = a.elemsize; - int dims = a.dims; + float sum = v0; + for (int i = 0; i < size; i++) + { + sum = op(sum, ptr[i]); + } - if (dims == 1) + return sum; +} + +template +static float reduction(float v0, const float* ptr, int size, int stride) +{ + Op op; + + float sum = v0; + for (int i = 0; i < size; i++) { - int w = a.w; - b.create(1, elemsize, opt.blob_allocator); - const float* ptr = a; + sum = op(sum, *ptr); + ptr += stride; + } + + return sum; +} - float sum = v0; - for (int i = 0; i < w; i++) +template +static float reduction(float v0, const float* ptr, int size0, int size1, int stride1) +{ + Op op; + + float sum = v0; + for (int i = 0; i < size1; i++) + { + for (int j = 0; j < size0; j++) { - sum = op(sum, ptr[i]); + sum = op(sum, ptr[j]); } - b[0] = sum; + ptr += stride1; + } + + return sum; +} + +template +static float reduction(float v0, const float* ptr, int size0, int stride0, int size1, int stride1) +{ + Op op; + + float sum = v0; + for (int i = 0; i < size1; i++) + { + const float* ptr0 = ptr; + for (int j = 0; j < size0; j++) + { + sum = op(sum, *ptr0); + ptr0 += stride0; + } + ptr += stride1; + } + + return sum; +} + +struct reduction_op_add +{ + float operator()(const float& x, const float& y) const + { + return x + y; + } +}; + +struct reduction_op_mul +{ + float operator()(const float& x, const float& y) const + { + return x * y; + } +}; + +struct reduction_op_asum +{ + float operator()(const float& x, const float& y) const + { + return x + fabsf(y); + } +}; + +struct reduction_op_sumsq +{ + float operator()(const float& x, const float& y) const + { + return x + y * y; + } +}; + +struct reduction_op_sumexp +{ + float operator()(const float& x, const float& y) const + { + return x + expf(y); + } +}; + +struct reduction_op_max +{ + float operator()(const float& x, const float& y) const + { + return std::max(x, y); + } +}; + +struct reduction_op_min +{ + float operator()(const float& x, const float& y) const + { + return std::min(x, y); + } +}; + +static float reduction(float v0, const float* ptr, int size, int op_type) +{ + if (op_type == Reduction::ReductionOp_SUM) return reduction(v0, ptr, size); + if (op_type == Reduction::ReductionOp_ASUM) return reduction(v0, ptr, size); + if (op_type == Reduction::ReductionOp_SUMSQ) return reduction(v0, ptr, size); + if (op_type == Reduction::ReductionOp_PROD) return reduction(v0, ptr, size); + if (op_type == Reduction::ReductionOp_MAX) return reduction(v0, ptr, size); + if (op_type == Reduction::ReductionOp_MIN) return reduction(v0, ptr, size); + if (op_type == Reduction::ReductionOp_LogSumExp) return reduction(v0, ptr, size); + + // should never reach here + return v0; +} + +static float reduction(float v0, const float* ptr, int size, int stride, int op_type) +{ + if (op_type == Reduction::ReductionOp_SUM) return reduction(v0, ptr, size, stride); + if (op_type == Reduction::ReductionOp_ASUM) return reduction(v0, ptr, size, stride); + if (op_type == Reduction::ReductionOp_SUMSQ) return reduction(v0, ptr, size, stride); + if (op_type == Reduction::ReductionOp_PROD) return reduction(v0, ptr, size, stride); + if (op_type == Reduction::ReductionOp_MAX) return reduction(v0, ptr, size, stride); + if (op_type == Reduction::ReductionOp_MIN) return reduction(v0, ptr, size, stride); + if (op_type == Reduction::ReductionOp_LogSumExp) return reduction(v0, ptr, size, stride); + + // should never reach here + return v0; +} + +static float reduction(float v0, const float* ptr, int size0, int size1, int stride1, int op_type) +{ + if (op_type == Reduction::ReductionOp_SUM) return reduction(v0, ptr, size0, size1, stride1); + if (op_type == Reduction::ReductionOp_ASUM) return reduction(v0, ptr, size0, size1, stride1); + if (op_type == Reduction::ReductionOp_SUMSQ) return reduction(v0, ptr, size0, size1, stride1); + if (op_type == Reduction::ReductionOp_PROD) return reduction(v0, ptr, size0, size1, stride1); + if (op_type == Reduction::ReductionOp_MAX) return reduction(v0, ptr, size0, size1, stride1); + if (op_type == Reduction::ReductionOp_MIN) return reduction(v0, ptr, size0, size1, stride1); + if (op_type == Reduction::ReductionOp_LogSumExp) return reduction(v0, ptr, size0, size1, stride1); + + // should never reach here + return v0; +} + +static float reduction(float v0, const float* ptr, int size0, int stride0, int size1, int stride1, int op_type) +{ + if (op_type == Reduction::ReductionOp_SUM) return reduction(v0, ptr, size0, stride0, size1, stride1); + if (op_type == Reduction::ReductionOp_ASUM) return reduction(v0, ptr, size0, stride0, size1, stride1); + if (op_type == Reduction::ReductionOp_SUMSQ) return reduction(v0, ptr, size0, stride0, size1, stride1); + if (op_type == Reduction::ReductionOp_PROD) return reduction(v0, ptr, size0, stride0, size1, stride1); + if (op_type == Reduction::ReductionOp_MAX) return reduction(v0, ptr, size0, stride0, size1, stride1); + if (op_type == Reduction::ReductionOp_MIN) return reduction(v0, ptr, size0, stride0, size1, stride1); + if (op_type == Reduction::ReductionOp_LogSumExp) return reduction(v0, ptr, size0, stride0, size1, stride1); + + // should never reach here + return v0; +} + +static int reduction_op(const Mat& a, Mat& b, bool reduce_w, bool reduce_h, bool reduce_d, bool reduce_c, int keepdims, int operation, float coeff, const Option& opt) +{ + int op_type = Reduction::ReductionOp_SUM; + int op2_type = Reduction::ReductionOp_SUM; + float v0 = 0.f; + + switch (operation) + { + case Reduction::ReductionOp_SUM: + case Reduction::ReductionOp_MEAN: + case Reduction::ReductionOp_LogSum: + { + break; + } + case Reduction::ReductionOp_ASUM: + case Reduction::ReductionOp_L1: + { + op_type = Reduction::ReductionOp_ASUM; + break; + } + case Reduction::ReductionOp_SUMSQ: + case Reduction::ReductionOp_L2: + { + op_type = Reduction::ReductionOp_SUMSQ; + break; + } + case Reduction::ReductionOp_MAX: + { + op_type = Reduction::ReductionOp_MAX; + op2_type = Reduction::ReductionOp_MAX; + v0 = -FLT_MAX; + break; + } + case Reduction::ReductionOp_MIN: + { + op_type = Reduction::ReductionOp_MIN; + op2_type = Reduction::ReductionOp_MIN; + v0 = FLT_MAX; + break; + } + case Reduction::ReductionOp_PROD: + { + op_type = Reduction::ReductionOp_PROD; + op2_type = Reduction::ReductionOp_PROD; + v0 = 1.f; + break; + } + case Reduction::ReductionOp_LogSumExp: + { + op_type = Reduction::ReductionOp_LogSumExp; + break; + } + default: + { + // should never reach here + break; + } + } + + const size_t elemsize = a.elemsize; + const int dims = a.dims; - return 0; + // NCNN_LOGE("%d (%d %d %d %d) %d %d %d %d", dims, a.w, a.h, a.d, a.c, reduce_w, reduce_h, reduce_d, reduce_c); + + if (dims == 1) + { + const int w = a.w; + b.create(1, elemsize, opt.blob_allocator); + + b[0] = reduction(v0, a, w, op_type); } if (dims == 2) { - int w = a.w; - int h = a.h; + const int w = a.w; + const int h = a.h; if (reduce_w && reduce_h) { @@ -92,22 +318,10 @@ static int reduction_op(const Mat& a, Mat& b, float v0, bool reduce_w, bool redu { const float* ptr = a.row(i); - float sum = v0; - for (int j = 0; j < w; j++) - { - sum = op(sum, ptr[j]); - } - sums[i] = sum; + sums[i] = reduction(v0, ptr, w, op_type); } - float sum = v0; - for (int i = 0; i < h; i++) - { - sum = op2(sum, sums[i]); - } - b[0] = sum; - - return 0; + b[0] = reduction(v0, sums, h, op2_type); } if (reduce_w && !reduce_h) @@ -123,14 +337,8 @@ static int reduction_op(const Mat& a, Mat& b, float v0, bool reduce_w, bool redu { const float* ptr = a.row(i); - float sum = v0; - for (int j = 0; j < w; j++) - { - sum = op(sum, ptr[j]); - } - b[i] = sum; + b[i] = reduction(v0, ptr, w, op_type); } - return 0; } if (!reduce_w && reduce_h) @@ -140,26 +348,21 @@ static int reduction_op(const Mat& a, Mat& b, float v0, bool reduce_w, bool redu b.create(w, 1, elemsize, opt.blob_allocator); else b.create(w, elemsize, opt.blob_allocator); - b.fill(v0); - for (int i = 0; i < h; i++) + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) { - const float* ptr = a.row(i); - for (int j = 0; j < w; j++) - { - b[j] = op(b[j], ptr[j]); - } + b[i] = reduction(v0, (const float*)a + i, h, a.w, op_type); } - return 0; } } if (dims == 3) { - int w = a.w; - int h = a.h; - int channels = a.c; - int size = w * h; + const int w = a.w; + const int h = a.h; + const int channels = a.c; + const int size = w * h; if (reduce_w && reduce_h && reduce_c) { @@ -177,22 +380,10 @@ static int reduction_op(const Mat& a, Mat& b, float v0, bool reduce_w, bool redu { const float* ptr = a.channel(q); - float sum = v0; - for (int i = 0; i < size; i++) - { - sum = op(sum, ptr[i]); - } - sums[q] = sum; - } - - float sum = v0; - for (int i = 0; i < channels; i++) - { - sum = op2(sum, sums[i]); + sums[q] = reduction(v0, ptr, size, op_type); } - b[0] = sum; - return 0; + b[0] = reduction(v0, sums, channels, op2_type); } if (reduce_w && reduce_h && !reduce_c) @@ -207,20 +398,10 @@ static int reduction_op(const Mat& a, Mat& b, float v0, bool reduce_w, bool redu for (int q = 0; q < channels; q++) { const float* ptr = a.channel(q); + float* outptr = keepdims ? b.channel(q) : (float*)b + q; - float sum = v0; - for (int i = 0; i < size; i++) - { - sum = op(sum, ptr[i]); - } - - if (keepdims) - b.channel(q)[0] = sum; - else - b[q] = sum; + outptr[0] = reduction(v0, ptr, size, op_type); } - - return 0; } if (reduce_w && !reduce_h && reduce_c) @@ -230,42 +411,12 @@ static int reduction_op(const Mat& a, Mat& b, float v0, bool reduce_w, bool redu b.create(1, h, 1, elemsize, opt.blob_allocator); else b.create(h, elemsize, opt.blob_allocator); - Mat mins(1, h, channels, elemsize, opt.workspace_allocator); - if (mins.empty()) - return -100; - - mins.fill(v0); #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr = a.channel(q); - float* mins_ptr = mins.channel(q); - - for (int i = 0; i < h; i++) - { - float sum = v0; - for (int j = 0; j < w; j++) - { - sum = op(sum, ptr[j]); - } - mins_ptr[i] = sum; - ptr += w; - } - } - - b.fill(v0); - - for (int q = 0; q < channels; q++) + for (int i = 0; i < h; i++) { - const float* mins_ptr = mins.channel(q); - for (int i = 0; i < h; i++) - { - b[i] = op2(b[i], mins_ptr[i]); - } + b[i] = reduction(v0, (const float*)a.row(i), w, channels, a.cstep, op_type); } - - return 0; } if (!reduce_w && reduce_h && reduce_c) @@ -276,40 +427,11 @@ static int reduction_op(const Mat& a, Mat& b, float v0, bool reduce_w, bool redu else b.create(w, elemsize, opt.blob_allocator); - Mat mins(w, 1, channels, elemsize, opt.workspace_allocator); - if (mins.empty()) - return -100; - - mins.fill(v0); - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int j = 0; j < w; j++) { - const float* ptr = a.channel(q); - float* mins_ptr = mins.channel(q); - - for (int i = 0; i < h; i++) - { - for (int j = 0; j < w; j++) - { - mins_ptr[j] = op(mins_ptr[j], ptr[j]); - } - ptr += w; - } + b[j] = reduction(v0, (const float*)a + j, h, w, channels, a.cstep, op_type); } - - b.fill(v0); - - for (int q = 0; q < channels; q++) - { - const float* mins_ptr = mins.channel(q); - for (int j = 0; j < w; j++) - { - b[j] = op2(b[j], mins_ptr[j]); - } - } - - return 0; } if (reduce_w && !reduce_h && !reduce_c) @@ -328,17 +450,10 @@ static int reduction_op(const Mat& a, Mat& b, float v0, bool reduce_w, bool redu for (int i = 0; i < h; i++) { - float sum = v0; - for (int j = 0; j < w; j++) - { - sum = op(sum, ptr[j]); - } - outptr[i] = sum; + outptr[i] = reduction(v0, ptr, w, op_type); ptr += w; } } - - return 0; } if (!reduce_w && !reduce_h && reduce_c) @@ -349,19 +464,11 @@ static int reduction_op(const Mat& a, Mat& b, float v0, bool reduce_w, bool redu else b.create(w, h, elemsize, opt.blob_allocator); - b.fill(v0); - - for (int q = 0; q < channels; q++) + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < size; i++) { - const float* ptr = a.channel(q); - - for (int i = 0; i < size; i++) - { - b[i] = op(b[i], ptr[i]); - } + b[i] = reduction(v0, (const float*)a + i, channels, a.cstep, op_type); } - - return 0; } if (!reduce_w && reduce_h && !reduce_c) @@ -372,34 +479,27 @@ static int reduction_op(const Mat& a, Mat& b, float v0, bool reduce_w, bool redu else b.create(w, channels, elemsize, opt.blob_allocator); - b.fill(v0); - #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* ptr = a.channel(q); float* outptr = keepdims ? b.channel(q) : b.row(q); - for (int i = 0; i < h; i++) + for (int j = 0; j < w; j++) { - for (int j = 0; j < w; j++) - { - outptr[j] = op(outptr[j], ptr[j]); - } - ptr += w; + outptr[j] = reduction(v0, ptr + j, h, w, op_type); } } - return 0; } } if (dims == 4) { - int w = a.w; - int h = a.h; - int d = a.d; - int channels = a.c; - int size = w * h * d; + const int w = a.w; + const int h = a.h; + const int d = a.d; + const int channels = a.c; + const int size = w * h * d; if (reduce_w && reduce_h && reduce_d && reduce_c) { @@ -417,22 +517,10 @@ static int reduction_op(const Mat& a, Mat& b, float v0, bool reduce_w, bool redu { const float* ptr = a.channel(q); - float sum = v0; - for (int i = 0; i < size; i++) - { - sum = op(sum, ptr[i]); - } - sums[q] = sum; - } - - float sum = v0; - for (int i = 0; i < channels; i++) - { - sum = op2(sum, sums[i]); + sums[q] = reduction(v0, ptr, size, op_type); } - b[0] = sum; - return 0; + b[0] = reduction(v0, sums, channels, op2_type); } if (reduce_w && reduce_h && reduce_d && !reduce_c) @@ -447,19 +535,10 @@ static int reduction_op(const Mat& a, Mat& b, float v0, bool reduce_w, bool redu for (int q = 0; q < channels; q++) { const float* ptr = a.channel(q); + float* outptr = keepdims ? b.channel(q) : (float*)b + q; - float sum = v0; - for (int i = 0; i < size; i++) - { - sum = op(sum, ptr[i]); - } - if (keepdims) - b.channel(q)[0] = sum; - else - b[q] = sum; + outptr[0] = reduction(v0, ptr, size, op_type); } - - return 0; } if (reduce_w && reduce_h && !reduce_d && reduce_c) @@ -469,42 +548,12 @@ static int reduction_op(const Mat& a, Mat& b, float v0, bool reduce_w, bool redu b.create(1, 1, d, 1, elemsize, opt.blob_allocator); else b.create(d, elemsize, opt.blob_allocator); - Mat mins(1, d, channels, elemsize, opt.workspace_allocator); - if (mins.empty()) - return -100; - - mins.fill(v0); #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr = a.channel(q); - float* mins_ptr = mins.channel(q); - - for (int i = 0; i < d; i++) - { - float sum = v0; - for (int j = 0; j < w * h; j++) - { - sum = op(sum, ptr[j]); - } - mins_ptr[i] = sum; - ptr += w * h; - } - } - - b.fill(v0); - - for (int q = 0; q < channels; q++) + for (int i = 0; i < d; i++) { - const float* mins_ptr = mins.channel(q); - for (int i = 0; i < d; i++) - { - b[i] = op2(b[i], mins_ptr[i]); - } + b[i] = reduction(v0, (const float*)a.depth(i), w * h, channels, a.cstep, op_type); } - - return 0; } if (reduce_w && !reduce_h && reduce_d && reduce_c) @@ -514,43 +563,28 @@ static int reduction_op(const Mat& a, Mat& b, float v0, bool reduce_w, bool redu b.create(1, h, 1, 1, elemsize, opt.blob_allocator); else b.create(h, elemsize, opt.blob_allocator); - Mat mins(1, h, channels, elemsize, opt.workspace_allocator); + Mat mins(h, 1, channels, elemsize, opt.workspace_allocator); if (mins.empty()) return -100; - mins.fill(v0); - #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* ptr = a.channel(q); float* mins_ptr = mins.channel(q); - for (int i = 0; i < d; i++) + for (int j = 0; j < h; j++) { - for (int j = 0; j < h; j++) - { - for (int k = 0; k < w; k++) - { - mins_ptr[j] = op(mins_ptr[j], ptr[k]); - } - ptr += w; - } + mins_ptr[j] = reduction(v0, ptr, w, d, w * h, op_type); + ptr += w; } } - b.fill(v0); - - for (int q = 0; q < channels; q++) + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) { - const float* mins_ptr = mins.channel(q); - for (int i = 0; i < h; i++) - { - b[i] = op2(b[i], mins_ptr[i]); - } + b[i] = reduction(v0, (const float*)mins + i, channels, mins.cstep, op2_type); } - - return 0; } if (!reduce_w && reduce_h && reduce_d && reduce_c) @@ -560,43 +594,12 @@ static int reduction_op(const Mat& a, Mat& b, float v0, bool reduce_w, bool redu b.create(w, 1, 1, 1, elemsize, opt.blob_allocator); else b.create(w, elemsize, opt.blob_allocator); - Mat mins(w, 1, channels, elemsize, opt.workspace_allocator); - if (mins.empty()) - return -100; - - mins.fill(v0); #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int i = 0; i < w; i++) { - const float* ptr = a.channel(q); - float* mins_ptr = mins.channel(q); - - for (int i = 0; i < d; i++) - { - for (int j = 0; j < h; j++) - { - for (int k = 0; k < w; k++) - { - mins_ptr[k] = op(mins_ptr[k], ptr[k]); - } - ptr += w; - } - } + b[i] = reduction(v0, (const float*)a + i, h * d, w, channels, a.cstep, op_type); } - - b.fill(v0); - - for (int q = 0; q < channels; q++) - { - const float* mins_ptr = mins.channel(q); - for (int i = 0; i < w; i++) - { - b[i] = op2(b[i], mins_ptr[i]); - } - } - - return 0; } if (reduce_w && reduce_h && !reduce_d && !reduce_c) @@ -615,17 +618,10 @@ static int reduction_op(const Mat& a, Mat& b, float v0, bool reduce_w, bool redu for (int i = 0; i < d; i++) { - float sum = v0; - for (int j = 0; j < w * h; j++) - { - sum = op(sum, ptr[j]); - } - outptr[i] = sum; + outptr[i] = reduction(v0, ptr, w * h, op_type); ptr += w * h; } } - - return 0; } if (reduce_w && !reduce_h && !reduce_d && reduce_c) @@ -636,49 +632,16 @@ static int reduction_op(const Mat& a, Mat& b, float v0, bool reduce_w, bool redu else b.create(h, d, elemsize, opt.blob_allocator); - Mat mins(h, d, channels, elemsize, opt.workspace_allocator); - if (mins.empty()) - return -100; - - mins.fill(v0); - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int i = 0; i < d; i++) { - const float* ptr = a.channel(q); - Mat minsm = mins.channel(q); - - for (int i = 0; i < d; i++) - { - float* mins_ptr = minsm.row(i); - for (int j = 0; j < h; j++) - { - for (int k = 0; k < w; k++) - { - mins_ptr[j] = op(mins_ptr[j], ptr[k]); - } - ptr += w; - } - } - } + float* bptr = keepdims ? b.depth(i) : b.row(i); - b.fill(v0); - - for (int q = 0; q < channels; q++) - { - const Mat minsm = mins.channel(q); - for (int i = 0; i < d; i++) + for (int j = 0; j < h; j++) { - const float* mins_ptr = minsm.row(i); - float* bptr = keepdims ? b.depth(i) : b.row(i); - for (int j = 0; j < h; j++) - { - bptr[j] = op2(bptr[j], mins_ptr[j]); - } + bptr[j] = reduction(v0, a.depth(i).row(j), w, channels, a.cstep, op_type); } } - - return 0; } if (!reduce_w && !reduce_h && reduce_d && reduce_c) @@ -689,49 +652,16 @@ static int reduction_op(const Mat& a, Mat& b, float v0, bool reduce_w, bool redu else b.create(w, h, elemsize, opt.blob_allocator); - Mat mins(w, h, channels, elemsize, opt.workspace_allocator); - if (mins.empty()) - return -100; - - mins.fill(v0); - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int i = 0; i < h; i++) { - const float* ptr = a.channel(q); - Mat minsm = mins.channel(q); + float* bptr = b.row(i); - for (int i = 0; i < d; i++) - { - for (int j = 0; j < h; j++) - { - float* mins_ptr = minsm.row(j); - for (int k = 0; k < w; k++) - { - mins_ptr[k] = op(mins_ptr[k], ptr[k]); - } - ptr += w; - } - } - } - - b.fill(v0); - - for (int q = 0; q < channels; q++) - { - const Mat minsm = mins.channel(q); - for (int i = 0; i < h; i++) + for (int j = 0; j < w; j++) { - const float* mins_ptr = minsm.row(i); - float* bptr = b.row(i); - for (int j = 0; j < w; j++) - { - bptr[j] = op2(bptr[j], mins_ptr[j]); - } + bptr[j] = reduction(v0, a.row(i) + j, d, w * h, channels, a.cstep, op_type); } } - - return 0; } if (reduce_w && !reduce_h && reduce_d && !reduce_c) @@ -747,25 +677,13 @@ static int reduction_op(const Mat& a, Mat& b, float v0, bool reduce_w, bool redu { const float* ptr = a.channel(q); float* outptr = keepdims ? b.channel(q) : b.row(q); - for (int i = 0; i < h; i++) - { - outptr[i] = v0; - } - for (int i = 0; i < d; i++) + for (int i = 0; i < h; i++) { - for (int j = 0; j < h; j++) - { - for (int k = 0; k < w; k++) - { - outptr[j] = op(outptr[j], ptr[k]); - } - ptr += w; - } + outptr[i] = reduction(v0, ptr, w, d, w * h, op_type); + ptr += w; } } - - return 0; } if (!reduce_w && reduce_h && !reduce_d && reduce_c) @@ -776,49 +694,16 @@ static int reduction_op(const Mat& a, Mat& b, float v0, bool reduce_w, bool redu else b.create(w, d, elemsize, opt.blob_allocator); - Mat mins(w, d, channels, elemsize, opt.workspace_allocator); - if (mins.empty()) - return -100; - - mins.fill(v0); - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int i = 0; i < d; i++) { - const float* ptr = a.channel(q); - Mat minsm = mins.channel(q); - - for (int i = 0; i < d; i++) - { - float* mins_ptr = minsm.row(i); - for (int j = 0; j < h; j++) - { - for (int k = 0; k < w; k++) - { - mins_ptr[k] = op(mins_ptr[k], ptr[k]); - } - ptr += w; - } - } - } + float* bptr = b.row(i); - b.fill(v0); - - for (int q = 0; q < channels; q++) - { - const Mat minsm = mins.channel(q); - for (int i = 0; i < d; i++) + for (int j = 0; j < w; j++) { - const float* mins_ptr = minsm.row(i); - float* bptr = b.row(i); - for (int j = 0; j < w; j++) - { - bptr[j] = op2(bptr[j], mins_ptr[j]); - } + bptr[j] = reduction(v0, (const float*)a.depth(i) + j, h, w, channels, a.cstep, op_type); } } - - return 0; } if (!reduce_w && reduce_h && reduce_d && !reduce_c) @@ -834,25 +719,12 @@ static int reduction_op(const Mat& a, Mat& b, float v0, bool reduce_w, bool redu { const float* ptr = a.channel(q); float* outptr = keepdims ? b.channel(q) : b.row(q); - for (int i = 0; i < w; i++) - { - outptr[i] = v0; - } - for (int i = 0; i < d; i++) + for (int i = 0; i < w; i++) { - for (int j = 0; j < h; j++) - { - for (int k = 0; k < w; k++) - { - outptr[k] = op(outptr[k], ptr[k]); - } - ptr += w; - } + outptr[i] = reduction(v0, ptr + i, h * d, w, op_type); } } - - return 0; } if (reduce_w && !reduce_h && !reduce_d && !reduce_c) @@ -871,17 +743,10 @@ static int reduction_op(const Mat& a, Mat& b, float v0, bool reduce_w, bool redu for (int i = 0; i < d * h; i++) { - float sum = v0; - for (int j = 0; j < w; j++) - { - sum = op(sum, ptr[j]); - } - outptr[i] = sum; + outptr[i] = reduction(v0, ptr, w, op_type); ptr += w; } } - - return 0; } if (!reduce_w && !reduce_h && !reduce_d && reduce_c) @@ -892,28 +757,16 @@ static int reduction_op(const Mat& a, Mat& b, float v0, bool reduce_w, bool redu else b.create(w, h, d, elemsize, opt.blob_allocator); - b.fill(v0); - - for (int q = 0; q < channels; q++) + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < d; i++) { - const float* ptr = a.channel(q); + float* outptr = keepdims ? b.depth(i) : b.channel(i); - for (int i = 0; i < d; i++) + for (int j = 0; j < w * h; j++) { - Mat outm = keepdims ? b.depth(i) : b.channel(i); - for (int j = 0; j < h; j++) - { - float* outptr = outm.row(j); - for (int k = 0; k < w; k++) - { - outptr[k] = op(outptr[k], ptr[k]); - } - ptr += w; - } + outptr[j] = reduction(v0, (const float*)a.depth(i) + j, channels, a.cstep, op_type); } } - - return 0; } if (!reduce_w && reduce_h && !reduce_d && !reduce_c) @@ -927,26 +780,19 @@ static int reduction_op(const Mat& a, Mat& b, float v0, bool reduce_w, bool redu #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { - const float* ptr = a.channel(q); Mat outm = b.channel(q); - outm.fill(v0); - for (int i = 0; i < d; i++) { + const float* ptr = a.channel(q).depth(i); float* outptr = outm.row(i); - for (int j = 0; j < h; j++) + + for (int k = 0; k < w; k++) { - for (int k = 0; k < w; k++) - { - outptr[k] = op(outptr[k], ptr[k]); - } - ptr += w; + outptr[k] = reduction(v0, ptr + k, h, w, op_type); } } } - - return 0; } if (!reduce_w && !reduce_h && reduce_d && !reduce_c) @@ -961,188 +807,84 @@ static int reduction_op(const Mat& a, Mat& b, float v0, bool reduce_w, bool redu for (int q = 0; q < channels; q++) { const float* ptr = a.channel(q); - Mat outm = b.channel(q); - - outm.fill(v0); + float* outptr = b.channel(q); - for (int i = 0; i < d; i++) + for (int j = 0; j < w * h; j++) { - for (int j = 0; j < h; j++) - { - float* outptr = outm.row(j); - for (int k = 0; k < w; k++) - { - outptr[k] = op(outptr[k], ptr[k]); - } - ptr += w; - } + outptr[j] = reduction(v0, ptr + j, d, w * h, op_type); } } - - return 0; } } - return 0; -} - -template -static int reduction_post_process(Mat& a, float coeff, const Option& opt) -{ - MathOp mathop; - - int dims = a.dims; - if (dims == 1) + if (operation == Reduction::ReductionOp_LogSum || operation == Reduction::ReductionOp_LogSumExp) { - int w = a.w; + const int size = b.total(); #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - a[i] = mathop(a[i]) * coeff; + for (int i = 0; i < size; i++) + { + b[i] = logf(b[i]); + } } - else if (dims == 2) + + if (operation == Reduction::ReductionOp_L2) { - int size = a.w * a.h; + const int size = b.total(); #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < size; i++) - a[i] = mathop(a[i]) * coeff; + { + // math optimization will probably generate rsqrt + // that produce -inf on sse with subnormal input + // flush subnormal input to zero as a workaround + // TODO explicit use simd sqrt like unaryop --- nihui + b[i] = sqrtf(b[i] < FLT_MIN ? 0.f : b[i]); + } } - else if (dims == 3 || dims == 4) + + if (operation == Reduction::ReductionOp_MEAN) { - int c = a.c; - int size = a.w * a.h * a.d; - if (c == 1) + int scale = 1; + if (dims == 1) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < size; i++) - a[i] = mathop(a[i]) * coeff; + scale = a.w; } - else + if (dims == 2) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < c; q++) - { - float* outptr = a.channel(q); - for (int i = 0; i < size; i++) - outptr[i] = mathop(outptr[i]) * coeff; - } + if (reduce_w) scale *= a.w; + if (reduce_h) scale *= a.h; + } + if (dims == 3) + { + if (reduce_w) scale *= a.w; + if (reduce_h) scale *= a.h; + if (reduce_c) scale *= a.c; + } + if (dims == 4) + { + if (reduce_w) scale *= a.w; + if (reduce_h) scale *= a.h; + if (reduce_d) scale *= a.d; + if (reduce_c) scale *= a.c; } - } - - return 0; -} - -template -static int reduction(const Mat& a, Mat& b, float v0, bool reduce_w, bool reduce_h, bool reduce_d, bool reduce_c, bool post_process, float coeff, int keepdims, const Option& opt) -{ - int ret = reduction_op(a, b, v0, reduce_w, reduce_h, reduce_d, reduce_c, keepdims, opt); - if (ret != 0) - return -100; - - if (post_process || fabsf(coeff - 1.f) > FLT_EPSILON) - { - ret = reduction_post_process(b, coeff, opt); - if (ret != 0) - return -100; - } - - return 0; -} - -template -struct post_process_identity -{ - T operator()(const T& x) const - { - return x; - } -}; - -template -struct post_process_sqrt -{ - T operator()(const T& x) const - { - // math optimization will probably generate rsqrt - // that produce -inf on sse with subnormal input - // flush subnormal input to zero as a workaround - // TODO explicit use simd sqrt like unaryop --- nihui - return static_cast(sqrtf(x < FLT_MIN ? 0.f : x)); - } -}; - -template -struct post_process_log -{ - T operator()(const T& x) const - { - return static_cast(logf(x)); - } -}; - -template -struct reduction_op_add -{ - T operator()(const T& x, const T& y) const - { - return x + y; - } -}; - -template -struct reduction_op_mul -{ - T operator()(const T& x, const T& y) const - { - return x * y; - } -}; - -template -struct reduction_op_asum -{ - T operator()(const T& x, const T& y) const - { - return static_cast(x + fabsf(y)); - } -}; -template -struct reduction_op_sumsq -{ - T operator()(const T& x, const T& y) const - { - return x + y * y; + coeff = coeff / scale; } -}; -template -struct reduction_op_sumsexp -{ - T operator()(const T& x, const T& y) const + if (coeff != 1.f) { - return static_cast(x + expf(y)); - } -}; + const int size = b.total(); -template -struct reduction_op_max -{ - T operator()(const T& x, const T& y) const - { - return std::max(x, y); + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < size; i++) + { + b[i] = b[i] * coeff; + } } -}; -template -struct reduction_op_min -{ - T operator()(const T& x, const T& y) const - { - return std::min(x, y); - } -}; + return 0; +} int Reduction::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { @@ -1198,68 +940,7 @@ int Reduction::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) } } - if (operation == ReductionOp_SUM) - return reduction, reduction_op_add, post_process_identity >(bottom_blob, top_blob, 0.f, reduce_w, reduce_h, reduce_d, reduce_c, false, coeff, keepdims, opt); - - if (operation == ReductionOp_ASUM) - return reduction, reduction_op_add, post_process_identity >(bottom_blob, top_blob, 0.f, reduce_w, reduce_h, reduce_d, reduce_c, false, coeff, keepdims, opt); - - if (operation == ReductionOp_SUMSQ) - return reduction, reduction_op_add, post_process_identity >(bottom_blob, top_blob, 0.f, reduce_w, reduce_h, reduce_d, reduce_c, false, coeff, keepdims, opt); - - if (operation == ReductionOp_MEAN) - { - int scale = 1; - int dims = bottom_blob.dims; - if (dims == 1) - { - scale = bottom_blob.w; - } - else if (dims == 2) - { - if (reduce_w) scale *= bottom_blob.w; - if (reduce_h) scale *= bottom_blob.h; - } - else if (dims == 3) - { - if (reduce_w) scale *= bottom_blob.w; - if (reduce_h) scale *= bottom_blob.h; - if (reduce_c) scale *= bottom_blob.c; - } - else if (dims == 4) - { - if (reduce_w) scale *= bottom_blob.w; - if (reduce_h) scale *= bottom_blob.h; - if (reduce_d) scale *= bottom_blob.d; - if (reduce_c) scale *= bottom_blob.c; - } - - float coeff_mean = coeff / scale; - return reduction, reduction_op_add, post_process_identity >(bottom_blob, top_blob, 0.f, reduce_w, reduce_h, reduce_d, reduce_c, true, coeff_mean, keepdims, opt); - } - - if (operation == ReductionOp_MAX) - return reduction, reduction_op_max, post_process_identity >(bottom_blob, top_blob, -FLT_MAX, reduce_w, reduce_h, reduce_d, reduce_c, false, coeff, keepdims, opt); - - if (operation == ReductionOp_MIN) - return reduction, reduction_op_min, post_process_identity >(bottom_blob, top_blob, FLT_MAX, reduce_w, reduce_h, reduce_d, reduce_c, false, coeff, keepdims, opt); - - if (operation == ReductionOp_PROD) - return reduction, reduction_op_mul, post_process_identity >(bottom_blob, top_blob, 1.f, reduce_w, reduce_h, reduce_d, reduce_c, false, coeff, keepdims, opt); - - if (operation == ReductionOp_L1) - return reduction, reduction_op_add, post_process_identity >(bottom_blob, top_blob, 0.f, reduce_w, reduce_h, reduce_d, reduce_c, false, 1.f, keepdims, opt); - - if (operation == ReductionOp_L2) - return reduction, reduction_op_add, post_process_sqrt >(bottom_blob, top_blob, 0.f, reduce_w, reduce_h, reduce_d, reduce_c, true, 1.f, keepdims, opt); - - if (operation == ReductionOp_LogSum) - return reduction, reduction_op_add, post_process_log >(bottom_blob, top_blob, 0.f, reduce_w, reduce_h, reduce_d, reduce_c, true, 1.f, keepdims, opt); - - if (operation == ReductionOp_LogSumExp) - return reduction, reduction_op_add, post_process_log >(bottom_blob, top_blob, 0.f, reduce_w, reduce_h, reduce_d, reduce_c, true, 1.f, keepdims, opt); - - return 0; + return reduction_op(bottom_blob, top_blob, reduce_w, reduce_h, reduce_d, reduce_c, keepdims, operation, coeff, opt); } } // namespace ncnn diff --git a/tests/test_reduction.cpp b/tests/test_reduction.cpp index f4ea8e23685..a5e5b638dce 100644 --- a/tests/test_reduction.cpp +++ b/tests/test_reduction.cpp @@ -18,52 +18,46 @@ static int op_type = 0; -static ncnn::Mat IntArrayMat(int a0) +static std::vector IntArray(int a0) { - ncnn::Mat m(1); - int* p = m; - p[0] = a0; + std::vector m(1); + m[0] = a0; return m; } -static ncnn::Mat IntArrayMat(int a0, int a1) +static std::vector IntArray(int a0, int a1) { - ncnn::Mat m(2); - int* p = m; - p[0] = a0; - p[1] = a1; + std::vector m(2); + m[0] = a0; + m[1] = a1; return m; } -static ncnn::Mat IntArrayMat(int a0, int a1, int a2) +static std::vector IntArray(int a0, int a1, int a2) { - ncnn::Mat m(3); - int* p = m; - p[0] = a0; - p[1] = a1; - p[2] = a2; + std::vector m(3); + m[0] = a0; + m[1] = a1; + m[2] = a2; return m; } -static ncnn::Mat IntArrayMat(int a0, int a1, int a2, int a3) +static std::vector IntArray(int a0, int a1, int a2, int a3) { - ncnn::Mat m(4); - int* p = m; - p[0] = a0; - p[1] = a1; - p[2] = a2; - p[3] = a3; + std::vector m(4); + m[0] = a0; + m[1] = a1; + m[2] = a2; + m[3] = a3; return m; } -static void print_int_array(const ncnn::Mat& a) +static void print_int_array(const std::vector& a) { - const int* pa = a; - fprintf(stderr, "["); - for (int i = 0; i < a.w; i++) + for (size_t i = 0; i < a.size(); i++) { - fprintf(stderr, " %d", pa[i]); + fprintf(stderr, " %d", a[i]); } fprintf(stderr, " ]"); } @@ -94,7 +88,7 @@ static int test_reduction(const ncnn::Mat& _a, float coeff, int keepdims) return ret; } -static int test_reduction(const ncnn::Mat& _a, float coeff, int keepdims, const ncnn::Mat& axes) +static int test_reduction(const ncnn::Mat& _a, float coeff, int keepdims, const std::vector& axes_array) { ncnn::Mat a = _a; if (op_type == 9 || op_type == 10) @@ -103,6 +97,15 @@ static int test_reduction(const ncnn::Mat& _a, float coeff, int keepdims, const Randomize(a, 0.001f, 2.f); } + ncnn::Mat axes(axes_array.size()); + { + int* p = axes; + for (size_t i = 0; i < axes_array.size(); i++) + { + p[i] = axes_array[i]; + } + } + ncnn::ParamDict pd; pd.set(0, op_type); pd.set(1, 0); // reduce_all @@ -118,247 +121,115 @@ static int test_reduction(const ncnn::Mat& _a, float coeff, int keepdims, const { fprintf(stderr, "test_reduction failed a.dims=%d a=(%d %d %d %d) op_type=%d coeff=%f keepdims=%d", a.dims, a.w, a.h, a.d, a.c, op_type, coeff, keepdims); fprintf(stderr, " axes="); - print_int_array(axes); + print_int_array(axes_array); fprintf(stderr, "\n"); } return ret; } +static int test_reduction_nd(const ncnn::Mat& a) +{ + int ret1 = 0 + || test_reduction(a, 1.f, 0) + || test_reduction(a, 2.f, 0) + || test_reduction(a, 1.f, 1) + || test_reduction(a, 2.f, 1) + || test_reduction(a, 1.f, 0, IntArray(0)) + || test_reduction(a, 1.f, 1, IntArray(0)); + + if (a.dims == 1 || ret1 != 0) + return ret1; + + int ret2 = 0 + || test_reduction(a, 2.f, 0, IntArray(1)) + || test_reduction(a, 2.f, 1, IntArray(1)) + || test_reduction(a, 1.f, 0, IntArray(0, 1)) + || test_reduction(a, 1.f, 1, IntArray(0, 1)); + + if (a.dims == 2 || ret2 != 0) + return ret2; + + int ret3 = 0 + || test_reduction(a, 1.f, 0, IntArray(2)) + || test_reduction(a, 1.f, 1, IntArray(2)) + || test_reduction(a, 2.f, 0, IntArray(0, 2)) + || test_reduction(a, 2.f, 0, IntArray(1, 2)) + || test_reduction(a, 2.f, 1, IntArray(0, 2)) + || test_reduction(a, 2.f, 1, IntArray(1, 2)) + || test_reduction(a, 1.f, 0, IntArray(0, 1, 2)) + || test_reduction(a, 1.f, 1, IntArray(0, 1, 2)); + + if (a.dims == 3 || ret3 != 0) + return ret3; + + int ret4 = 0 + || test_reduction(a, 2.f, 0, IntArray(3)) + || test_reduction(a, 2.f, 1, IntArray(3)) + || test_reduction(a, 1.f, 0, IntArray(0, 3)) + || test_reduction(a, 1.f, 0, IntArray(1, 3)) + || test_reduction(a, 2.f, 0, IntArray(2, 3)) + || test_reduction(a, 1.f, 1, IntArray(0, 3)) + || test_reduction(a, 1.f, 1, IntArray(1, 3)) + || test_reduction(a, 2.f, 1, IntArray(2, 3)) + || test_reduction(a, 2.f, 0, IntArray(0, 1, 3)) + || test_reduction(a, 1.f, 0, IntArray(0, 2, 3)) + || test_reduction(a, 2.f, 0, IntArray(1, 2, 3)) + || test_reduction(a, 2.f, 1, IntArray(0, 1, 3)) + || test_reduction(a, 1.f, 1, IntArray(0, 2, 3)) + || test_reduction(a, 2.f, 1, IntArray(1, 2, 3)) + || test_reduction(a, 1.f, 0, IntArray(0, 1, 2, 3)) + || test_reduction(a, 1.f, 1, IntArray(0, 1, 2, 3)); + + return ret4; +} + static int test_reduction_0() { + ncnn::Mat a = RandomMat(5, 6, 7, 24); + ncnn::Mat b = RandomMat(7, 8, 9, 12); + ncnn::Mat c = RandomMat(3, 4, 5, 13); + return 0 - || test_reduction(RandomMat(5, 6, 7, 24), 1.f, 0) - || test_reduction(RandomMat(5, 6, 7, 24), 2.f, 0) - || test_reduction(RandomMat(7, 8, 9, 12), 1.f, 0) - || test_reduction(RandomMat(7, 8, 9, 12), 2.f, 0) - || test_reduction(RandomMat(3, 4, 5, 13), 1.f, 0) - || test_reduction(RandomMat(3, 4, 5, 13), 2.f, 0) - - || test_reduction(RandomMat(5, 6, 7, 24), 1.f, 1) - || test_reduction(RandomMat(5, 6, 7, 24), 2.f, 1) - || test_reduction(RandomMat(7, 8, 9, 12), 1.f, 1) - || test_reduction(RandomMat(7, 8, 9, 12), 2.f, 1) - || test_reduction(RandomMat(3, 4, 5, 13), 1.f, 1) - || test_reduction(RandomMat(3, 4, 5, 13), 2.f, 1) - - || test_reduction(RandomMat(5, 6, 7, 24), 1.f, 0, IntArrayMat(0)) - || test_reduction(RandomMat(5, 6, 7, 24), 2.f, 0, IntArrayMat(1)) - || test_reduction(RandomMat(5, 6, 7, 24), 1.f, 0, IntArrayMat(2)) - || test_reduction(RandomMat(5, 6, 7, 24), 2.f, 0, IntArrayMat(3)) - || test_reduction(RandomMat(5, 6, 7, 24), 1.f, 0, IntArrayMat(0, 1)) - || test_reduction(RandomMat(5, 6, 7, 24), 2.f, 0, IntArrayMat(0, 2)) - || test_reduction(RandomMat(5, 6, 7, 24), 1.f, 0, IntArrayMat(0, 3)) - || test_reduction(RandomMat(5, 6, 7, 24), 2.f, 0, IntArrayMat(1, 2)) - || test_reduction(RandomMat(5, 6, 7, 24), 1.f, 0, IntArrayMat(1, 3)) - || test_reduction(RandomMat(5, 6, 7, 24), 2.f, 0, IntArrayMat(2, 3)) - || test_reduction(RandomMat(5, 6, 7, 24), 1.f, 0, IntArrayMat(0, 1, 2)) - || test_reduction(RandomMat(5, 6, 7, 24), 2.f, 0, IntArrayMat(0, 1, 3)) - || test_reduction(RandomMat(5, 6, 7, 24), 1.f, 0, IntArrayMat(0, 2, 3)) - || test_reduction(RandomMat(5, 6, 7, 24), 2.f, 0, IntArrayMat(1, 2, 3)) - || test_reduction(RandomMat(5, 6, 7, 24), 1.f, 0, IntArrayMat(0, 1, 2, 3)) - || test_reduction(RandomMat(7, 8, 9, 12), 1.f, 0, IntArrayMat(0)) - || test_reduction(RandomMat(7, 8, 9, 12), 2.f, 0, IntArrayMat(1)) - || test_reduction(RandomMat(7, 8, 9, 12), 1.f, 0, IntArrayMat(2)) - || test_reduction(RandomMat(7, 8, 9, 12), 2.f, 0, IntArrayMat(3)) - || test_reduction(RandomMat(7, 8, 9, 12), 1.f, 0, IntArrayMat(0, 1)) - || test_reduction(RandomMat(7, 8, 9, 12), 2.f, 0, IntArrayMat(0, 2)) - || test_reduction(RandomMat(7, 8, 9, 12), 1.f, 0, IntArrayMat(0, 3)) - || test_reduction(RandomMat(7, 8, 9, 12), 2.f, 0, IntArrayMat(1, 2)) - || test_reduction(RandomMat(7, 8, 9, 12), 1.f, 0, IntArrayMat(1, 3)) - || test_reduction(RandomMat(7, 8, 9, 12), 2.f, 0, IntArrayMat(2, 3)) - || test_reduction(RandomMat(7, 8, 9, 12), 1.f, 0, IntArrayMat(0, 1, 2)) - || test_reduction(RandomMat(7, 8, 9, 12), 2.f, 0, IntArrayMat(0, 1, 3)) - || test_reduction(RandomMat(7, 8, 9, 12), 1.f, 0, IntArrayMat(0, 2, 3)) - || test_reduction(RandomMat(7, 8, 9, 12), 2.f, 0, IntArrayMat(1, 2, 3)) - || test_reduction(RandomMat(7, 8, 9, 12), 1.f, 0, IntArrayMat(0, 1, 2, 3)) - || test_reduction(RandomMat(3, 4, 5, 13), 1.f, 0, IntArrayMat(0)) - || test_reduction(RandomMat(3, 4, 5, 13), 2.f, 0, IntArrayMat(1)) - || test_reduction(RandomMat(3, 4, 5, 13), 1.f, 0, IntArrayMat(2)) - || test_reduction(RandomMat(3, 4, 5, 13), 2.f, 0, IntArrayMat(3)) - || test_reduction(RandomMat(3, 4, 5, 13), 1.f, 0, IntArrayMat(0, 1)) - || test_reduction(RandomMat(3, 4, 5, 13), 2.f, 0, IntArrayMat(0, 2)) - || test_reduction(RandomMat(3, 4, 5, 13), 1.f, 0, IntArrayMat(0, 3)) - || test_reduction(RandomMat(3, 4, 5, 13), 2.f, 0, IntArrayMat(1, 2)) - || test_reduction(RandomMat(3, 4, 5, 13), 1.f, 0, IntArrayMat(1, 3)) - || test_reduction(RandomMat(3, 4, 5, 13), 2.f, 0, IntArrayMat(2, 3)) - || test_reduction(RandomMat(3, 4, 5, 13), 1.f, 0, IntArrayMat(0, 1, 2)) - || test_reduction(RandomMat(3, 4, 5, 13), 2.f, 0, IntArrayMat(0, 1, 3)) - || test_reduction(RandomMat(3, 4, 5, 13), 1.f, 0, IntArrayMat(0, 2, 3)) - || test_reduction(RandomMat(3, 4, 5, 13), 2.f, 0, IntArrayMat(1, 2, 3)) - || test_reduction(RandomMat(3, 4, 5, 13), 1.f, 0, IntArrayMat(0, 1, 2, 3)) - - || test_reduction(RandomMat(5, 6, 7, 24), 1.f, 1, IntArrayMat(0)) - || test_reduction(RandomMat(5, 6, 7, 24), 2.f, 1, IntArrayMat(1)) - || test_reduction(RandomMat(5, 6, 7, 24), 1.f, 1, IntArrayMat(2)) - || test_reduction(RandomMat(5, 6, 7, 24), 2.f, 1, IntArrayMat(3)) - || test_reduction(RandomMat(5, 6, 7, 24), 1.f, 1, IntArrayMat(0, 1)) - || test_reduction(RandomMat(5, 6, 7, 24), 2.f, 1, IntArrayMat(0, 2)) - || test_reduction(RandomMat(5, 6, 7, 24), 1.f, 1, IntArrayMat(0, 3)) - || test_reduction(RandomMat(5, 6, 7, 24), 2.f, 1, IntArrayMat(1, 2)) - || test_reduction(RandomMat(5, 6, 7, 24), 1.f, 1, IntArrayMat(1, 3)) - || test_reduction(RandomMat(5, 6, 7, 24), 2.f, 1, IntArrayMat(2, 3)) - || test_reduction(RandomMat(5, 6, 7, 24), 1.f, 1, IntArrayMat(0, 1, 2)) - || test_reduction(RandomMat(5, 6, 7, 24), 2.f, 1, IntArrayMat(0, 1, 3)) - || test_reduction(RandomMat(5, 6, 7, 24), 1.f, 1, IntArrayMat(0, 2, 3)) - || test_reduction(RandomMat(5, 6, 7, 24), 2.f, 1, IntArrayMat(1, 2, 3)) - || test_reduction(RandomMat(5, 6, 7, 24), 1.f, 1, IntArrayMat(0, 1, 2, 3)) - || test_reduction(RandomMat(7, 8, 9, 12), 1.f, 1, IntArrayMat(0)) - || test_reduction(RandomMat(7, 8, 9, 12), 2.f, 1, IntArrayMat(1)) - || test_reduction(RandomMat(7, 8, 9, 12), 1.f, 1, IntArrayMat(2)) - || test_reduction(RandomMat(7, 8, 9, 12), 2.f, 1, IntArrayMat(3)) - || test_reduction(RandomMat(7, 8, 9, 12), 1.f, 1, IntArrayMat(0, 1)) - || test_reduction(RandomMat(7, 8, 9, 12), 2.f, 1, IntArrayMat(0, 2)) - || test_reduction(RandomMat(7, 8, 9, 12), 1.f, 1, IntArrayMat(0, 3)) - || test_reduction(RandomMat(7, 8, 9, 12), 2.f, 1, IntArrayMat(1, 2)) - || test_reduction(RandomMat(7, 8, 9, 12), 1.f, 1, IntArrayMat(1, 3)) - || test_reduction(RandomMat(7, 8, 9, 12), 2.f, 1, IntArrayMat(2, 3)) - || test_reduction(RandomMat(7, 8, 9, 12), 1.f, 1, IntArrayMat(0, 1, 2)) - || test_reduction(RandomMat(7, 8, 9, 12), 2.f, 1, IntArrayMat(0, 1, 3)) - || test_reduction(RandomMat(7, 8, 9, 12), 1.f, 1, IntArrayMat(0, 2, 3)) - || test_reduction(RandomMat(7, 8, 9, 12), 2.f, 1, IntArrayMat(1, 2, 3)) - || test_reduction(RandomMat(7, 8, 9, 12), 1.f, 1, IntArrayMat(0, 1, 2, 3)) - || test_reduction(RandomMat(3, 4, 5, 13), 1.f, 1, IntArrayMat(0)) - || test_reduction(RandomMat(3, 4, 5, 13), 2.f, 1, IntArrayMat(1)) - || test_reduction(RandomMat(3, 4, 5, 13), 1.f, 1, IntArrayMat(2)) - || test_reduction(RandomMat(3, 4, 5, 13), 2.f, 1, IntArrayMat(3)) - || test_reduction(RandomMat(3, 4, 5, 13), 1.f, 1, IntArrayMat(0, 1)) - || test_reduction(RandomMat(3, 4, 5, 13), 2.f, 1, IntArrayMat(0, 2)) - || test_reduction(RandomMat(3, 4, 5, 13), 1.f, 1, IntArrayMat(0, 3)) - || test_reduction(RandomMat(3, 4, 5, 13), 2.f, 1, IntArrayMat(1, 2)) - || test_reduction(RandomMat(3, 4, 5, 13), 1.f, 1, IntArrayMat(1, 3)) - || test_reduction(RandomMat(3, 4, 5, 13), 2.f, 1, IntArrayMat(2, 3)) - || test_reduction(RandomMat(3, 4, 5, 13), 1.f, 1, IntArrayMat(0, 1, 2)) - || test_reduction(RandomMat(3, 4, 5, 13), 2.f, 1, IntArrayMat(0, 1, 3)) - || test_reduction(RandomMat(3, 4, 5, 13), 1.f, 1, IntArrayMat(0, 2, 3)) - || test_reduction(RandomMat(3, 4, 5, 13), 2.f, 1, IntArrayMat(1, 2, 3)) - || test_reduction(RandomMat(3, 4, 5, 13), 1.f, 1, IntArrayMat(0, 1, 2, 3)); + || test_reduction_nd(a) + || test_reduction_nd(b) + || test_reduction_nd(c); } static int test_reduction_1() { + ncnn::Mat a = RandomMat(5, 7, 24); + ncnn::Mat b = RandomMat(7, 9, 12); + ncnn::Mat c = RandomMat(3, 5, 13); + return 0 - || test_reduction(RandomMat(5, 7, 24), 1.f, 0) - || test_reduction(RandomMat(5, 7, 24), 2.f, 0) - || test_reduction(RandomMat(7, 9, 12), 1.f, 0) - || test_reduction(RandomMat(7, 9, 12), 2.f, 0) - || test_reduction(RandomMat(3, 5, 13), 1.f, 0) - || test_reduction(RandomMat(3, 5, 13), 2.f, 0) - - || test_reduction(RandomMat(5, 7, 24), 1.f, 1) - || test_reduction(RandomMat(5, 7, 24), 2.f, 1) - || test_reduction(RandomMat(7, 9, 12), 1.f, 1) - || test_reduction(RandomMat(7, 9, 12), 2.f, 1) - || test_reduction(RandomMat(3, 5, 13), 1.f, 1) - || test_reduction(RandomMat(3, 5, 13), 2.f, 1) - - || test_reduction(RandomMat(5, 7, 24), 1.f, 0, IntArrayMat(0)) - || test_reduction(RandomMat(5, 7, 24), 2.f, 0, IntArrayMat(1)) - || test_reduction(RandomMat(5, 7, 24), 1.f, 0, IntArrayMat(0, 1)) - || test_reduction(RandomMat(5, 7, 24), 2.f, 0, IntArrayMat(0, 2)) - || test_reduction(RandomMat(5, 7, 24), 1.f, 0, IntArrayMat(1, 2)) - || test_reduction(RandomMat(5, 7, 24), 2.f, 0, IntArrayMat(0, 1, 2)) - || test_reduction(RandomMat(7, 9, 12), 1.f, 0, IntArrayMat(0)) - || test_reduction(RandomMat(7, 9, 12), 2.f, 0, IntArrayMat(1)) - || test_reduction(RandomMat(7, 9, 12), 1.f, 0, IntArrayMat(0, 1)) - || test_reduction(RandomMat(7, 9, 12), 2.f, 0, IntArrayMat(0, 2)) - || test_reduction(RandomMat(7, 9, 12), 1.f, 0, IntArrayMat(1, 2)) - || test_reduction(RandomMat(7, 9, 12), 2.f, 0, IntArrayMat(0, 1, 2)) - || test_reduction(RandomMat(3, 5, 13), 1.f, 0, IntArrayMat(0)) - || test_reduction(RandomMat(3, 5, 13), 2.f, 0, IntArrayMat(1)) - || test_reduction(RandomMat(3, 5, 13), 1.f, 0, IntArrayMat(0, 1)) - || test_reduction(RandomMat(3, 5, 13), 2.f, 0, IntArrayMat(0, 2)) - || test_reduction(RandomMat(3, 5, 13), 1.f, 0, IntArrayMat(1, 2)) - || test_reduction(RandomMat(3, 5, 13), 2.f, 0, IntArrayMat(0, 1, 2)) - - || test_reduction(RandomMat(5, 7, 24), 1.f, 1, IntArrayMat(0)) - || test_reduction(RandomMat(5, 7, 24), 2.f, 1, IntArrayMat(1)) - || test_reduction(RandomMat(5, 7, 24), 1.f, 1, IntArrayMat(0, 1)) - || test_reduction(RandomMat(5, 7, 24), 2.f, 1, IntArrayMat(0, 2)) - || test_reduction(RandomMat(5, 7, 24), 1.f, 1, IntArrayMat(1, 2)) - || test_reduction(RandomMat(5, 7, 24), 2.f, 1, IntArrayMat(0, 1, 2)) - || test_reduction(RandomMat(7, 9, 12), 1.f, 1, IntArrayMat(0)) - || test_reduction(RandomMat(7, 9, 12), 2.f, 1, IntArrayMat(1)) - || test_reduction(RandomMat(7, 9, 12), 1.f, 1, IntArrayMat(0, 1)) - || test_reduction(RandomMat(7, 9, 12), 2.f, 1, IntArrayMat(0, 2)) - || test_reduction(RandomMat(7, 9, 12), 1.f, 1, IntArrayMat(1, 2)) - || test_reduction(RandomMat(7, 9, 12), 2.f, 1, IntArrayMat(0, 1, 2)) - || test_reduction(RandomMat(3, 5, 13), 1.f, 1, IntArrayMat(0)) - || test_reduction(RandomMat(3, 5, 13), 2.f, 1, IntArrayMat(1)) - || test_reduction(RandomMat(3, 5, 13), 1.f, 1, IntArrayMat(0, 1)) - || test_reduction(RandomMat(3, 5, 13), 2.f, 1, IntArrayMat(0, 2)) - || test_reduction(RandomMat(3, 5, 13), 1.f, 1, IntArrayMat(1, 2)) - || test_reduction(RandomMat(3, 5, 13), 2.f, 1, IntArrayMat(0, 1, 2)); + || test_reduction_nd(a) + || test_reduction_nd(b) + || test_reduction_nd(c); } static int test_reduction_2() { + ncnn::Mat a = RandomMat(15, 24); + ncnn::Mat b = RandomMat(17, 12); + ncnn::Mat c = RandomMat(19, 15); + return 0 - || test_reduction(RandomMat(15, 24), 1.f, 0) - || test_reduction(RandomMat(15, 24), 2.f, 0) - || test_reduction(RandomMat(17, 12), 1.f, 0) - || test_reduction(RandomMat(17, 12), 2.f, 0) - || test_reduction(RandomMat(19, 15), 1.f, 0) - || test_reduction(RandomMat(19, 15), 2.f, 0) - - || test_reduction(RandomMat(15, 24), 1.f, 1) - || test_reduction(RandomMat(15, 24), 2.f, 1) - || test_reduction(RandomMat(17, 12), 1.f, 1) - || test_reduction(RandomMat(17, 12), 2.f, 1) - || test_reduction(RandomMat(19, 15), 1.f, 1) - || test_reduction(RandomMat(19, 15), 2.f, 1) - - || test_reduction(RandomMat(15, 24), 1.f, 0, IntArrayMat(0)) - || test_reduction(RandomMat(15, 24), 2.f, 0, IntArrayMat(1)) - || test_reduction(RandomMat(15, 24), 1.f, 0, IntArrayMat(0, 1)) - || test_reduction(RandomMat(17, 12), 2.f, 0, IntArrayMat(0)) - || test_reduction(RandomMat(17, 12), 1.f, 0, IntArrayMat(1)) - || test_reduction(RandomMat(17, 12), 2.f, 0, IntArrayMat(0, 1)) - || test_reduction(RandomMat(19, 15), 1.f, 0, IntArrayMat(0)) - || test_reduction(RandomMat(19, 15), 2.f, 0, IntArrayMat(1)) - || test_reduction(RandomMat(19, 15), 1.f, 0, IntArrayMat(0, 1)) - - || test_reduction(RandomMat(15, 24), 2.f, 1, IntArrayMat(0)) - || test_reduction(RandomMat(15, 24), 1.f, 1, IntArrayMat(1)) - || test_reduction(RandomMat(15, 24), 2.f, 1, IntArrayMat(0, 1)) - || test_reduction(RandomMat(17, 12), 1.f, 1, IntArrayMat(0)) - || test_reduction(RandomMat(17, 12), 2.f, 1, IntArrayMat(1)) - || test_reduction(RandomMat(17, 12), 1.f, 1, IntArrayMat(0, 1)) - || test_reduction(RandomMat(19, 15), 2.f, 1, IntArrayMat(0)) - || test_reduction(RandomMat(19, 15), 1.f, 1, IntArrayMat(1)) - || test_reduction(RandomMat(19, 15), 2.f, 1, IntArrayMat(0, 1)); + || test_reduction_nd(a) + || test_reduction_nd(b) + || test_reduction_nd(c); } static int test_reduction_3() { + ncnn::Mat a = RandomMat(128); + ncnn::Mat b = RandomMat(124); + ncnn::Mat c = RandomMat(127); + return 0 - || test_reduction(RandomMat(128), 1.f, 0) - || test_reduction(RandomMat(128), 2.f, 0) - || test_reduction(RandomMat(124), 1.f, 0) - || test_reduction(RandomMat(124), 2.f, 0) - || test_reduction(RandomMat(127), 1.f, 0) - || test_reduction(RandomMat(127), 2.f, 0) - - || test_reduction(RandomMat(128), 1.f, 1) - || test_reduction(RandomMat(128), 2.f, 1) - || test_reduction(RandomMat(124), 1.f, 1) - || test_reduction(RandomMat(124), 2.f, 1) - || test_reduction(RandomMat(127), 1.f, 1) - || test_reduction(RandomMat(127), 2.f, 1) - - || test_reduction(RandomMat(128), 1.f, 0, IntArrayMat(0)) - || test_reduction(RandomMat(128), 2.f, 0, IntArrayMat(0)) - || test_reduction(RandomMat(124), 1.f, 0, IntArrayMat(0)) - || test_reduction(RandomMat(124), 2.f, 0, IntArrayMat(0)) - || test_reduction(RandomMat(127), 1.f, 0, IntArrayMat(0)) - || test_reduction(RandomMat(127), 2.f, 0, IntArrayMat(0)) - - || test_reduction(RandomMat(128), 1.f, 1, IntArrayMat(0)) - || test_reduction(RandomMat(128), 2.f, 1, IntArrayMat(0)) - || test_reduction(RandomMat(124), 1.f, 1, IntArrayMat(0)) - || test_reduction(RandomMat(124), 2.f, 1, IntArrayMat(0)) - || test_reduction(RandomMat(127), 1.f, 1, IntArrayMat(0)) - || test_reduction(RandomMat(127), 1.f, 1, IntArrayMat(0)); + || test_reduction_nd(a) + || test_reduction_nd(b) + || test_reduction_nd(c); } int main()