From 1e5a5816d112587df1090f16e9ccf79729e49c97 Mon Sep 17 00:00:00 2001 From: jianfeifeng Date: Sun, 22 Nov 2020 18:07:58 +0800 Subject: [PATCH] v1.0.0 is released --- CI_SCRIPTS/CPPLINT.cfg | 1 + CI_SCRIPTS/benchmark_verify.sh | 45 + CI_SCRIPTS/benchmark_verify_serial.sh | 25 + CI_SCRIPTS/dir_cpplint.sh | 11 + CI_SCRIPTS/format_code.sh | 55 + CI_SCRIPTS/genCommandLines.sh | 63 + CI_SCRIPTS/inference_big.txt | 66 + CI_SCRIPTS/inference_serial.txt | 19 + CI_SCRIPTS/inference_small.txt | 30 + CI_SCRIPTS/java_api_test.sh | 63 + CI_SCRIPTS/model_tools_test.sh | 183 + {scripts => CI_SCRIPTS}/operator_driver.sh | 45 +- {scripts => CI_SCRIPTS}/operator_test.sh | 60 +- {scripts => CI_SCRIPTS}/params/activation.csv | 0 CI_SCRIPTS/params/alexnet_convolution.csv | 6 + CI_SCRIPTS/params/argmax.csv | 5 + {scripts => CI_SCRIPTS}/params/attention.csv | 0 CI_SCRIPTS/params/bnn_convolution.csv | 53 + CI_SCRIPTS/params/check.csv | 3 + {scripts => CI_SCRIPTS}/params/clip.csv | 0 {scripts => CI_SCRIPTS}/params/concat.csv | 2 +- CI_SCRIPTS/params/convolution.csv | 17 + CI_SCRIPTS/params/deconvolution.csv | 12 + CI_SCRIPTS/params/detectionoutput.csv | 2 + CI_SCRIPTS/params/dilated_convolution.csv | 5 + {scripts => CI_SCRIPTS}/params/eltwise.csv | 0 CI_SCRIPTS/params/googlenet_convolution.csv | 58 + CI_SCRIPTS/params/l2normalization.csv | 2 + CI_SCRIPTS/params/lenet_convolution.csv | 3 + .../params/lenet_fully_connected.csv | 0 {scripts => CI_SCRIPTS}/params/mmm.csv | 0 .../mobilenetv1_depthwise_convolution.csv | 14 + .../mobilenetv2_depthwise_convolution.csv | 18 + CI_SCRIPTS/params/mobilenetv3_convolution.csv | 33 + .../mobilenetv3_depthwise_convolution.csv | 16 + {scripts => CI_SCRIPTS}/params/mvm.csv | 0 CI_SCRIPTS/params/non_max_suppression.csv | 2 + CI_SCRIPTS/params/normalization.csv | 3 + CI_SCRIPTS/params/padding.csv | 5 + {scripts => CI_SCRIPTS}/params/pipeline.csv | 0 {scripts => CI_SCRIPTS}/params/pooling.csv | 0 CI_SCRIPTS/params/pooling_bp.csv | 2 + CI_SCRIPTS/params/power.csv | 3 + CI_SCRIPTS/params/prelu.csv | 2 + CI_SCRIPTS/params/priorbox.csv | 5 + CI_SCRIPTS/params/reduction.csv | 6 + {scripts => CI_SCRIPTS}/params/reshape.csv | 0 CI_SCRIPTS/params/resnet50_convolution.csv | 54 + .../lstm.csv => CI_SCRIPTS/params/rnn.csv | 0 CI_SCRIPTS/params/roialign.csv | 2 + {scripts => CI_SCRIPTS}/params/scale.csv | 0 {scripts => CI_SCRIPTS}/params/slice.csv | 0 {scripts => CI_SCRIPTS}/params/softmax.csv | 0 {scripts => CI_SCRIPTS}/params/split.csv | 0 CI_SCRIPTS/params/tile.csv | 3 + {scripts => CI_SCRIPTS}/params/transpose.csv | 0 CI_SCRIPTS/parseAndExeCommands.sh | 430 +++ CI_SCRIPTS/transExecutors.sh | 39 + CMakeLists.txt | 166 +- README.md | 161 +- THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.md | 71 - blas-enhance/include/blas-enhance.h | 68 - blas-enhance/src/CMakeLists.txt | 32 - blas-enhance/src/cpu/arm/fp16/mmm_A55.cpp | 790 ---- blas-enhance/src/cpu/arm/fp16/mmm_A76.cpp | 634 --- blas-enhance/src/cpu/arm/fp16/mvm_A55.cpp | 144 - blas-enhance/src/cpu/arm/fp16/mvm_A76.cpp | 130 - blas-enhance/src/cpu/arm/fp16/mvm_common.h | 260 -- blas-enhance/src/cpu/arm/fp32/mvm_row_V8.cpp | 152 - blas-enhance/src/cpu/arm/int8/mmm_A55.cpp | 733 ---- blas-enhance/src/cpu/arm/int8/mmm_A76.cpp | 685 ---- blas-enhance/src/cpu/arm/int8/mmm_common.h | 489 --- blas-enhance/src/cpu/arm/int8/mvm.h | 146 - blas-enhance/src/mmm.cpp | 102 - bolt.cmake | 207 - cmakes/FindBlasEnhance.cmake | 26 - cmakes/FindGcl.cmake | 55 - cmakes/FindImage.cmake | 33 - cmakes/FindInference.cmake | 34 - cmakes/FindModelTools.cmake | 37 - 
cmakes/FindModelToolsCaffe.cmake | 33 - cmakes/FindModelToolsOnnx.cmake | 33 - cmakes/FindModelToolsTFLite.cmake | 33 - cmakes/FindTFLite.cmake | 15 - cmakes/FindTensorComputing.cmake | 29 - cmakes/FindUni.cmake | 18 - common/CMakeLists.txt | 18 + common/cmakes/FindFFTW.cmake | 23 + .../cmakes}/FindFlatBuffers.cmake | 4 +- common/cmakes/FindGcl.cmake | 56 + common/cmakes/FindJNI.cmake | 14 + {cmakes => common/cmakes}/FindOpenCL.cmake | 7 +- {cmakes => common/cmakes}/FindProtobuf.cmake | 66 +- common/cmakes/FindTFLite.cmake | 16 + {cmakes => common/cmakes}/Findjpeg.cmake | 11 +- common/cmakes/Findjsoncpp.cmake | 25 + common/cmakes/bolt.cmake | 412 ++ common/gcl/CMakeLists.txt | 20 + common/gcl/include/context.h | 212 + common/gcl/include/event.h | 160 + {gcl => common/gcl}/include/gcl.h | 14 +- common/gcl/include/gcl_common.h | 275 ++ common/gcl/include/gcl_engine.h | 66 + common/gcl/include/gcl_func.h | 1351 +++++++ common/gcl/include/gcl_kernel_binmap.h | 148 + common/gcl/include/gcl_kernel_source.h | 85 + common/gcl/include/gcl_kernel_type.h | 20 + common/gcl/include/gclmem_desc_infer.h | 713 ++++ common/gcl/include/kernel.h | 167 + common/gcl/include/memory.h | 661 ++++ common/gcl/include/ocl_context.h | 36 + common/gcl/include/ocl_data_alloc.h | 35 + common/gcl/include/ocl_data_trans.h | 33 + common/gcl/include/ocl_desc_trans.h | 31 + common/gcl/include/platform.h | 500 +++ common/gcl/include/program.h | 303 ++ common/gcl/src/CMakeLists.txt | 14 + common/gcl/src/ocl_context.cpp | 187 + common/gcl/src/ocl_data_trans.cpp | 279 ++ .../gcl}/tools/device_info/CMakeLists.txt | 6 +- .../gcl}/tools/device_info/clinfo.cpp | 19 +- .../gcl}/tools/gcl_sample/CMakeLists.txt | 11 +- common/gcl/tools/gcl_sample/build.sh | 20 + .../gcl}/tools/gcl_sample/cl/sample.cl | 31 +- common/gcl/tools/gcl_sample/sample.cpp | 168 + .../tools/kernel_lib_compile/CMakeLists.txt | 14 +- .../kernel_lib_compile/buildKernelLib.sh | 0 .../device_name/CMakeLists.txt | 6 +- .../device_name/device_name.cpp | 26 +- .../kernel_bin/CMakeLists.txt | 6 +- .../kernel_bin/clbinary.cpp | 154 +- .../kernel_bin2char/bin2char.cpp | 50 +- .../kernel_lib_compile/sh/adbDeviceNum.sh | 12 + .../kernel_lib_compile/sh/buildKernelBin.sh | 37 +- .../sh/buildKernelLibConfig.sh | 11 +- .../sh/compile/activation.sh | 0 .../kernel_lib_compile/sh/compile/argmax_x.sh | 12 + .../sh/compile/bilateral_slice_apply_c12.sh | 0 .../sh/compile/channel_resize.sh | 14 + .../kernel_lib_compile/sh/compile/common.sh | 16 +- .../kernel_lib_compile/sh/compile/concat.sh | 26 + .../sh/compile/conv_depthwise_s1.sh | 69 + .../sh/compile/conv_depthwise_s2.sh | 36 + .../sh/compile/conv_depthwise_trans_fltbuf.sh | 0 .../conv_direct_3d_sw1_nchw_to_ncwhc4.sh | 11 + .../conv_direct_3d_sw2_nchw_to_ncwhc4.sh | 11 + .../sh/compile/conv_direct_s1.sh | 311 ++ .../sh/compile/conv_direct_s1_fn_spe.sh | 44 + .../compile/conv_direct_s1_nchw_to_ncwhc4.sh | 6 + .../sh/compile/conv_direct_s1_spe_f1c3k1.sh | 0 .../sh/compile/conv_direct_s2.sh | 213 + .../compile/conv_direct_s2_nchw_to_ncwhc4.sh | 6 + .../sh/compile/conv_direct_spe_fwhs1.sh | 25 + .../sh/compile/conv_direct_trans_fltbuf.sh | 2 + .../sh/compile/conv_direct_wh_s1.sh | 143 + .../sh/compile/conv_direct_wh_s2.sh | 54 + .../sh/compile/conv_wino_gemm36_tn.sh | 0 .../sh/compile/conv_wino_rotate_fltbuf.sh | 0 .../sh/compile/conv_wino_trans_outbuf.sh | 0 .../sh/compile/conv_wino_trans_picbuf_left.sh | 0 .../compile/conv_wino_trans_picbuf_right.sh | 0 .../kernel_lib_compile/sh/compile/copy.sh | 16 + 
.../sh/compile/deconv_gemm_f2s2.sh | 30 + .../sh/compile/deconv_gemm_trans_fltbuf.sh | 13 + .../kernel_lib_compile/sh/compile/eltwise.sh | 69 + .../sh/compile/eltwise_broadcast.sh | 22 + .../sh/compile/eltwise_spe_nchw_c.sh | 13 + .../sh/compile/fc_trans_fltbuf.sh | 0 .../sh/compile/fill_memory_zero.sh | 12 + .../sh/compile/fill_memory_zero_vec4.sh | 12 + .../kernel_lib_compile/sh/compile/gemm_nt.sh | 0 .../kernel_lib_compile/sh/compile/gemm_tn.sh | 375 ++ .../sh/compile/mem_trans_nchw_to_ncwhc4.sh | 13 + .../sh/compile/mem_trans_ncwhc4_to_nchw.sh | 12 + .../sh/compile/mem_trans_ncwhc4_to_ncwhc4.sh | 12 + .../sh/compile/normalization.sh | 12 + .../kernel_lib_compile/sh/compile/power.sh | 12 + .../kernel_lib_compile/sh/compile/prelu.sh | 12 + .../kernel_lib_compile/sh/compile/sample.sh | 0 .../kernel_lib_compile/sh/compile/scale.sh | 2 + .../kernel_lib_compile/sh/compile/slice_h.sh | 0 .../sh/compile/transpose_nchw.sh | 14 + .../kernel_lib_compile/sh/packKernelBin.sh | 2 +- .../tools/kernel_lib_compile/sh/sh.config | 0 .../kernel_source_compile/CMakeLists.txt | 35 + .../buildKernelSourceLib.sh | 22 + .../kernel_cl2char/cl2char.cpp | 540 +++ common/memory/include/memory.hpp | 52 + common/memory/include/memory_cpu.hpp | 172 + common/memory/include/memory_ocl.hpp | 294 ++ common/memory/include/tensor.hpp | 168 + common/uni/CMakeLists.txt | 19 + common/uni/include/algorithm_map.h | 400 ++ {uni => common/uni}/include/arm_neon_expand.h | 182 +- common/uni/include/error.h | 183 + common/uni/include/graph.h | 294 ++ .../uni}/include/model_print.h | 16 +- .../include/model_serialize_deserialize.hpp | 70 + common/uni/include/op_type.h | 127 + common/uni/include/parse_command.h | 312 ++ common/uni/include/profiling.h | 49 + common/uni/include/schedule.h | 245 ++ common/uni/include/sys.h | 54 + common/uni/include/task.h | 130 + common/uni/include/tensor_desc.h | 516 +++ common/uni/include/thread_affinity.h | 535 +++ common/uni/include/types.h | 618 +++ {uni => common/uni}/include/ut_util.h | 247 +- common/uni/include/x86_avx2_expand.h | 140 + common/uni/src/CMakeLists.txt | 14 + .../uni}/src/model_deserialize.cpp | 367 +- common/uni/src/model_print.cpp | 127 + .../uni}/src/model_serialize.cpp | 213 +- common/uni/src/profiling.cpp | 91 + common/uni/src/tensor_desc.cpp | 614 +++ common/uni/src/types.cpp | 126 + common/uni/src/uni.cpp | 42 + compute/CMakeLists.txt | 17 + compute/blas_enhance/CMakeLists.txt | 20 + compute/blas_enhance/include/blas_enhance.h | 105 + compute/blas_enhance/src/CMakeLists.txt | 44 + compute/blas_enhance/src/axpby.cpp | 53 + compute/blas_enhance/src/cpu/arm/axpby.cpp | 46 + .../blas_enhance}/src/cpu/arm/blas_arm.h | 52 +- .../blas_enhance/src/cpu/arm/fp16/axpby.cpp | 34 + .../src/cpu/arm/fp16/blas_fp16.h | 30 +- .../blas_enhance}/src/cpu/arm/fp16/mmm.cpp | 49 +- .../blas_enhance/src/cpu/arm/fp16}/mmm.h | 20 +- .../blas_enhance/src/cpu/arm/fp16/mmm_A55.cpp | 783 ++++ .../blas_enhance/src/cpu/arm/fp16/mmm_A76.cpp | 592 +++ .../src/cpu/arm/fp16/mmm_common.h | 88 +- compute/blas_enhance/src/cpu/arm/fp16/mvm.cpp | 119 + .../blas_enhance}/src/cpu/arm/fp16/mvm.h | 17 +- .../blas_enhance/src/cpu/arm/fp16/mvm_A55.cpp | 138 + .../blas_enhance/src/cpu/arm/fp16/mvm_A76.cpp | 124 + .../src/cpu/arm/fp16/mvm_common.h | 253 ++ .../blas_enhance/src/cpu/arm/fp32/axpby.cpp | 34 + .../blas_enhance/src/cpu/arm/fp32/blas_fp32.h | 97 + .../blas_enhance}/src/cpu/arm/fp32/mmm_V7.cpp | 366 +- .../blas_enhance}/src/cpu/arm/fp32/mmm_V8.cpp | 219 +- compute/blas_enhance/src/cpu/arm/fp32/mvm.cpp | 118 + 
.../blas_enhance/src/cpu/arm/fp32/mvm_col.cpp | 61 +- .../blas_enhance/src/cpu/arm/fp32/mvm_row.cpp | 182 + .../src/cpu/arm/int8/blas_int8.h | 34 +- .../blas_enhance}/src/cpu/arm/int8/mmm.cpp | 47 +- compute/blas_enhance/src/cpu/arm/int8/mmm.h | 24 + .../blas_enhance/src/cpu/arm/int8/mmm_A55.cpp | 741 ++++ .../blas_enhance/src/cpu/arm/int8/mmm_A76.cpp | 685 ++++ .../src/cpu/arm/int8/mmm_common.h | 455 +++ compute/blas_enhance/src/cpu/arm/int8/mvm.cpp | 168 + compute/blas_enhance/src/cpu/arm/int8/mvm.h | 128 + .../blas_enhance}/src/cpu/arm/mmm.cpp | 79 +- .../blas_enhance}/src/cpu/arm/mvm.cpp | 75 +- .../blas_enhance/src/cpu/general/axpby.cpp | 45 + .../src/cpu/general/blas_general.h | 42 +- .../blas_enhance}/src/cpu/general/mmm.cpp | 57 +- .../blas_enhance}/src/cpu/general/mvm.cpp | 30 +- compute/blas_enhance/src/cpu/x86/blas_x86.h | 47 + .../src/cpu/x86}/fp32/blas_fp32.h | 40 +- .../src/cpu/x86/fp32/mmm_avx2.cpp | 1445 +++++++ .../src/cpu/x86/fp32/mvm_avx2_col.cpp | 565 +++ .../src/cpu/x86/fp32/mvm_avx2_row.cpp | 540 +++ compute/blas_enhance/src/cpu/x86/mmm.cpp | 130 + compute/blas_enhance/src/cpu/x86/mvm.cpp | 55 + compute/blas_enhance/src/mmm.cpp | 167 + .../blas_enhance}/src/mvm.cpp | 97 +- .../blas_enhance/tests/.CMakeLists.txt.swp | Bin 0 -> 12288 bytes compute/blas_enhance/tests/CMakeLists.txt | 13 + .../blas_enhance/tests}/test_mmm.cpp | 48 +- .../blas_enhance/tests}/test_mmm_int8.cpp | 50 +- .../blas_enhance/tests}/test_mvm.cpp | 52 +- .../blas_enhance/tests}/test_mvm_int8.cpp | 57 +- {image => compute/image}/CMakeLists.txt | 13 +- compute/image/include/image.h | 38 + .../image}/include/image_processing.hpp | 15 +- compute/image/src/CMakeLists.txt | 31 + .../image}/src/cpu/arm/image_arm.h | 14 +- .../image}/src/cpu/arm/resize_bilinear.cpp | 136 +- .../image}/src/cpu/general/image_general.h | 46 +- .../src/cpu/general/resize_bilinear.cpp | 47 +- .../image/src/gpu/mali/cl/resize_bilinear.cl | 72 + .../src/gpu/mali/cl/resize_bilinear_nchw.cl | 72 + .../mali/fp16/resize_bilinear_mali_fp16.cpp | 113 + .../gpu/mali/fp16/resize_bilinear_mali_fp16.h | 20 + compute/image/src/gpu/mali/image_mali.h | 30 + .../image/src/gpu/mali/resize_bilinear.cpp | 93 + .../image}/src/image_processing.cpp | 100 +- compute/image/src/resize.cpp | 157 + compute/image/tests/CMakeLists.txt | 12 + .../image/tests}/test_image_processing.cpp | 17 +- .../image/tests}/test_image_resize.cpp | 49 +- compute/image/tests/test_image_resize_ocl.cpp | 164 + compute/tensor/CMakeLists.txt | 20 + compute/tensor/include/tensor_computing.h | 702 ++++ ...ensor_computing_library_algorithm_search.h | 16 +- .../tensor/include/tensor_computing_type.h | 70 + .../tensor}/src/CMakeLists.txt | 28 +- compute/tensor/src/activation.cpp | 80 + compute/tensor/src/argmax.cpp | 99 + .../tensor}/src/attention.cpp | 51 +- .../tensor}/src/attention_mask.cpp | 56 +- compute/tensor/src/bilateral_slice_apply.cpp | 118 + compute/tensor/src/channel_resize.cpp | 66 + compute/tensor/src/check.cpp | 96 + compute/tensor/src/clip.cpp | 78 + compute/tensor/src/concat.cpp | 184 + compute/tensor/src/convolution.cpp | 334 ++ compute/tensor/src/copy.cpp | 66 + compute/tensor/src/cpu/activation.cpp | 32 + .../tensor/src/cpu}/argmax.cpp | 61 +- compute/tensor/src/cpu/arm/arm_functions.h | 249 ++ .../tensor}/src/cpu/arm/attention.cpp | 20 +- .../tensor/src/cpu/arm/attention_mask.cpp | 24 +- .../tensor/src/cpu/arm/bnn/convolution.cpp | 74 + .../src/cpu/arm/bnn/convolution_dorefa.h | 86 + .../cpu/arm/bnn/convolution_dorefa_A55.cpp | 779 ++++ 
.../cpu/arm/bnn/convolution_dorefa_A76.cpp | 759 ++++ .../cpu/arm/bnn/convolution_transform_bnn.h | 47 +- .../tensor/src/cpu/arm/bnn/convolution_xnor.h | 86 + .../src/cpu/arm/bnn/convolution_xnor_A55.cpp | 786 ++++ .../src/cpu/arm/bnn/convolution_xnor_A76.cpp | 774 ++++ .../src/cpu/arm/bnn/tensor_computing_bnn.h | 36 +- .../tensor}/src/cpu/arm/check.cpp | 75 +- .../tensor}/src/cpu/arm/clip.cpp | 22 +- compute/tensor/src/cpu/arm/convolution.cpp | 492 +++ compute/tensor/src/cpu/arm/deconvolution.cpp | 49 + .../src/cpu/arm/depthwise_convolution.cpp | 125 + .../arm/depthwise_pointwise_convolution.cpp | 211 + .../tensor/src/cpu/arm/eltwise.cpp | 93 +- .../src/cpu/arm/fp16/arm_functions_fp16.h | 206 +- .../tensor}/src/cpu/arm/fp16/attention.cpp | 48 +- .../src/cpu/arm/fp16/attention_mask.cpp | 36 +- .../tensor}/src/cpu/arm/fp16/check.cpp | 61 +- .../tensor}/src/cpu/arm/fp16/clip.cpp | 22 +- .../tensor/src/cpu/arm/fp16/convolution.cpp | 87 + .../src/cpu/arm/fp16/convolution_direct.cpp | 233 +- .../src/cpu/arm/fp16/convolution_direct.h | 34 +- .../src/cpu/arm/fp16/convolution_gemm.h | 76 + .../src/cpu/arm/fp16/convolution_gemm_A55.cpp | 975 +++++ .../src/cpu/arm/fp16/convolution_gemm_A76.cpp | 893 +++++ .../cpu/arm/fp16/convolution_gemm_icnchw.h | 79 + .../arm/fp16/convolution_gemm_icnchw_A55.cpp | 1003 +++++ .../arm/fp16/convolution_gemm_icnchw_A76.cpp | 920 +++++ .../cpu/arm/fp16/convolution_transform.cpp | 106 +- .../src/cpu/arm/fp16/convolution_winograd.h | 78 + .../cpu/arm/fp16/convolution_winograd_A55.cpp | 859 +++++ .../cpu/arm/fp16/convolution_winograd_A76.cpp | 725 ++++ .../arm/fp16/convolution_winograd_transform.h | 165 +- .../cpu/arm/fp16/deconvolution_transform.cpp | 97 + .../fp16/depthwise_pointwise_convolution.cpp | 93 + .../depthwise_pointwise_convolution_3x3s1p1.h | 95 + ...wise_pointwise_convolution_3x3s1p1_A55.cpp | 679 ++-- ...wise_pointwise_convolution_3x3s1p1_A76.cpp | 679 ++-- .../depthwise_pointwise_convolution_direct.h | 96 + ...hwise_pointwise_convolution_direct_A55.cpp | 1417 +++++++ ...hwise_pointwise_convolution_direct_A76.cpp | 1334 +++++++ ..._pointwise_convolution_direct_no_padding.h | 134 +- ...wise_convolution_direct_no_padding_A55.cpp | 605 ++- ...wise_convolution_direct_no_padding_A76.cpp | 605 ++- .../tensor}/src/cpu/arm/fp16/eltwise.cpp | 86 +- compute/tensor/src/cpu/arm/fp16/lstm.cpp | 263 ++ .../src/cpu/arm/fp16/normalization.cpp | 39 +- compute/tensor/src/cpu/arm/fp16/pooling.cpp | 91 + compute/tensor/src/cpu/arm/fp16/prelu.cpp | 61 + .../tensor}/src/cpu/arm/fp16/quantize.cpp | 93 +- .../tensor}/src/cpu/arm/fp16/scale.cpp | 63 +- .../tensor}/src/cpu/arm/fp16/softmax.cpp | 61 +- .../src/cpu/arm/fp16/tensor_computing_fp16.h | 178 + .../src/cpu/arm/fp32/arm_functions_fp32.h | 165 +- .../tensor}/src/cpu/arm/fp32/attention.cpp | 48 +- .../src/cpu/arm/fp32/attention_mask.cpp | 36 +- .../tensor}/src/cpu/arm/fp32/check.cpp | 61 +- .../tensor}/src/cpu/arm/fp32/clip.cpp | 19 +- .../tensor/src/cpu/arm/fp32/convolution.cpp | 93 + .../src/cpu/arm/fp32/convolution_gemm_V7.cpp | 677 ++++ .../src/cpu/arm/fp32/convolution_gemm_V8.cpp | 1010 +++++ .../arm/fp32/convolution_gemm_icnchw_V7.cpp | 246 +- .../arm/fp32/convolution_gemm_icnchw_V8.cpp | 845 ++++ .../cpu/arm/fp32/convolution_transform.cpp | 77 +- .../cpu/arm/fp32/convolution_winograd_V8.cpp | 393 +- .../arm/fp32/convolution_winograd_transform.h | 121 +- .../cpu/arm/fp32/deconvolution_transform.cpp | 89 + .../fp32/depthwise_pointwise_convolution.cpp | 75 + .../fp32/depthwise_pointwise_convolution.h | 59 + 
...thwise_pointwise_convolution_direct_V7.cpp | 699 ++++ ...thwise_pointwise_convolution_direct_V8.cpp | 1264 ++++++ .../tensor}/src/cpu/arm/fp32/eltwise.cpp | 85 +- compute/tensor/src/cpu/arm/fp32/lstm.cpp | 467 +++ .../src/cpu/arm/fp32/normalization.cpp | 39 +- compute/tensor/src/cpu/arm/fp32/pooling.cpp | 86 + compute/tensor/src/cpu/arm/fp32/prelu.cpp | 68 + .../tensor}/src/cpu/arm/fp32/scale.cpp | 63 +- .../tensor}/src/cpu/arm/fp32/softmax.cpp | 61 +- .../src/cpu/arm/fp32/tensor_computing_fp32.h | 242 ++ .../src/cpu/arm/int8/arm_functions_int8.h | 20 +- .../tensor}/src/cpu/arm/int8/concat.cpp | 76 +- .../tensor/src/cpu/arm/int8/convolution.cpp | 94 + .../src/cpu/arm/int8/convolution_gemm.h | 502 +++ .../src/cpu/arm/int8/convolution_gemm_A55.cpp | 599 ++- .../src/cpu/arm/int8/convolution_gemm_A76.cpp | 603 ++- .../cpu/arm/int8/convolution_transform.cpp | 110 +- .../src/cpu/arm/int8/convolution_winograd.h | 181 + .../cpu/arm/int8/convolution_winograd_A55.cpp | 1487 +++++++ .../cpu/arm/int8/convolution_winograd_A76.cpp | 1440 +++++++ .../arm/int8/convolution_winograd_transform.h | 103 +- .../int8/depthwise_pointwise_convolution.cpp | 72 +- .../int8/depthwise_pointwise_convolution.h | 39 + ...depthwise_pointwise_convolution_direct.cpp | 1865 +++++++++ compute/tensor/src/cpu/arm/int8/pooling.cpp | 81 + .../tensor}/src/cpu/arm/int8/quantize.cpp | 32 +- .../src/cpu/arm/int8/tensor_computing_int8.h | 97 + .../tensor/src/cpu/arm/normalization.cpp | 25 +- compute/tensor/src/cpu/arm/padding.cpp | 126 + compute/tensor/src/cpu/arm/pooling.cpp | 177 + compute/tensor/src/cpu/arm/prelu.cpp | 50 + .../tensor}/src/cpu/arm/quantize.cpp | 19 +- compute/tensor/src/cpu/arm/rnn.cpp | 59 + .../tensor}/src/cpu/arm/scale.cpp | 36 +- .../tensor}/src/cpu/arm/softmax.cpp | 21 +- .../tensor/src/cpu/arm/tensor_computing_arm.h | 227 ++ .../tensor/src/cpu/arm/transform_functions.h | 148 + .../tensor/src/cpu/clip.cpp | 44 +- compute/tensor/src/cpu/concat.cpp | 108 + compute/tensor/src/cpu/convolution.cpp | 62 + compute/tensor/src/cpu/cpu_functions.h | 231 ++ .../tensor/src/cpu/cpu_functions_template.h | 215 ++ compute/tensor/src/cpu/deconvolution.cpp | 681 ++++ .../tensor/src/cpu/depthwise_convolution.cpp | 35 +- .../cpu/depthwise_pointwise_convolution.cpp | 68 + .../tensor/src/cpu}/detectionoutput.cpp | 183 +- compute/tensor/src/cpu/eltwise.cpp | 169 + compute/tensor/src/cpu/embedding.cpp | 66 + .../tensor}/src/cpu/general/attention.cpp | 38 +- .../src/cpu/general/attention_mask.cpp | 48 +- .../tensor}/src/cpu/general/check.cpp | 93 +- .../tensor}/src/cpu/general/clip.cpp | 32 +- .../tensor/src/cpu/general/convolution.cpp | 209 + .../tensor/src/cpu/general/deconvolution.cpp | 150 + .../src/cpu/general/depthwise_convolution.cpp | 34 + .../depthwise_pointwise_convolution.cpp | 191 + compute/tensor/src/cpu/general/eltwise.cpp | 88 + .../src/cpu/general/general_functions.h | 274 ++ .../tensor}/src/cpu/general/normalization.cpp | 55 +- compute/tensor/src/cpu/general/padding.cpp | 126 + .../tensor}/src/cpu/general/pooling.cpp | 108 +- compute/tensor/src/cpu/general/pooling_bp.cpp | 111 + compute/tensor/src/cpu/general/prelu.cpp | 85 + compute/tensor/src/cpu/general/rnn.cpp | 202 + .../tensor}/src/cpu/general/scale.cpp | 77 +- .../tensor}/src/cpu/general/softmax.cpp | 71 +- .../cpu/general/tensor_computing_general.h | 165 + .../tensor}/src/cpu/general/transpose.cpp | 25 +- compute/tensor/src/cpu/l2normalization.cpp | 57 + .../tensor/src/cpu/non_max_suppression.cpp | 222 ++ compute/tensor/src/cpu/padding.cpp | 163 + 
.../tensor/src/cpu/power.cpp | 36 +- compute/tensor/src/cpu/priorbox.cpp | 206 + compute/tensor/src/cpu/reduction.cpp | 198 + compute/tensor/src/cpu/reshape.cpp | 120 + compute/tensor/src/cpu/rnn.cpp | 273 ++ compute/tensor/src/cpu/roialign.cpp | 170 + .../arm => compute/tensor/src/cpu}/slice.cpp | 36 +- .../tensor/src/cpu}/split.cpp | 31 +- compute/tensor/src/cpu/tensor_computing_cpu.h | 286 ++ compute/tensor/src/cpu/tfslice.cpp | 131 + compute/tensor/src/cpu/transpose.cpp | 24 + compute/tensor/src/cpu/x86/attention_mask.cpp | 40 + compute/tensor/src/cpu/x86/check.cpp | 105 + .../tensor/src/cpu/x86/clip.cpp | 35 +- compute/tensor/src/cpu/x86/convolution.cpp | 228 ++ compute/tensor/src/cpu/x86/deconvolution.cpp | 42 + .../src/cpu/x86/depthwise_convolution.cpp | 102 + .../x86/depthwise_pointwise_convolution.cpp | 83 + compute/tensor/src/cpu/x86/eltwise.cpp | 41 + .../src/cpu/x86/fp32/attention_mask.cpp | 82 + compute/tensor/src/cpu/x86/fp32/check.cpp | 103 + compute/tensor/src/cpu/x86/fp32/clip.cpp | 38 + .../tensor/src/cpu/x86/fp32/convolution.cpp | 134 + .../cpu/x86/fp32/convolution_1x1_direct.cpp | 1749 +++++++++ .../cpu/x86/fp32/convolution_2x2_direct.cpp | 1769 +++++++++ .../src/cpu/x86/fp32/convolution_direct.cpp | 720 ++++ .../cpu/x86/fp32/convolution_direct_nchw.cpp | 1861 +++++++++ .../cpu/x86/fp32/convolution_transform.cpp | 153 + .../cpu/x86/fp32/deconvolution_transform.cpp | 185 + .../x86/fp32/depthwise_convolution_direct.cpp | 834 ++++ .../fp32/depthwise_convolution_transform.cpp | 71 + .../fp32/depthwise_pointwise_convolution.cpp | 69 + ...thwise_pointwise_convolution_transform.cpp | 38 + compute/tensor/src/cpu/x86/fp32/eltwise.cpp | 82 + .../src/cpu/x86/fp32/l2normalization.cpp | 64 + compute/tensor/src/cpu/x86/fp32/lstm.cpp | 318 ++ .../tensor/src/cpu/x86/fp32/normalization.cpp | 62 + compute/tensor/src/cpu/x86/fp32/pooling.cpp | 394 ++ compute/tensor/src/cpu/x86/fp32/scale.cpp | 119 + compute/tensor/src/cpu/x86/fp32/softmax.cpp | 139 + .../src/cpu/x86/fp32/tensor_computing_fp32.h | 259 ++ .../cpu/x86/fp32/transform_functions_fp32.h | 149 + .../src/cpu/x86/fp32/x86_functions_fp32.h | 361 ++ compute/tensor/src/cpu/x86/normalization.cpp | 37 + compute/tensor/src/cpu/x86/pooling.cpp | 41 + compute/tensor/src/cpu/x86/rnn.cpp | 49 + compute/tensor/src/cpu/x86/scale.cpp | 51 + compute/tensor/src/cpu/x86/softmax.cpp | 37 + .../tensor/src/cpu/x86/tensor_computing_x86.h | 212 + compute/tensor/src/cpu/x86/x86_functions.h | 158 + .../tensor/src/cpu/yolov3detectionoutput.cpp | 274 ++ compute/tensor/src/deconvolution.cpp | 278 ++ compute/tensor/src/depth2space.cpp | 79 + compute/tensor/src/depthwise_convolution.cpp | 306 ++ .../src/depthwise_pointwise_convolution.cpp | 355 ++ compute/tensor/src/detectionoutput.cpp | 85 + compute/tensor/src/eltwise.cpp | 176 + compute/tensor/src/embedding.cpp | 97 + compute/tensor/src/fully_connected.cpp | 451 +++ compute/tensor/src/gpu/mali/activation.cpp | 96 + compute/tensor/src/gpu/mali/argmax.cpp | 117 + .../src/gpu/mali/bilateral_slice_apply.cpp | 214 ++ .../tensor/src/gpu/mali/channel_resize.cpp | 84 + compute/tensor/src/gpu/mali/check.cpp | 153 + compute/tensor/src/gpu/mali/cl/activation.cl | 58 + compute/tensor/src/gpu/mali/cl/argmax_x.cl | 136 + .../gpu/mali/cl/bilateral_slice_apply_c12.cl | 129 +- .../gpu/mali/cl/bilateral_slice_apply_pre.cl | 38 +- .../tensor/src/gpu/mali/cl/channel_resize.cl | 230 ++ .../tensor/src/gpu/mali/cl/check_int_spe.cl | 33 + .../tensor}/src/gpu/mali/cl/clip.cl | 26 +- compute/tensor/src/gpu/mali/cl/col2im.cl | 78 + 
compute/tensor/src/gpu/mali/cl/concat.cl | 186 + .../src/gpu/mali/cl/conv_depthwise_s1.cl | 52 +- .../src/gpu/mali/cl/conv_depthwise_s2.cl | 67 +- .../mali/cl/conv_depthwise_trans_fltbuf.cl | 88 + .../cl/conv_direct_3d_sw1_nchw_to_ncwhc4.cl | 170 + .../cl/conv_direct_3d_sw2_nchw_to_ncwhc4.cl | 176 + .../tensor/src/gpu/mali/cl/conv_direct_s1.cl | 345 ++ .../src/gpu/mali/cl/conv_direct_s1_fn_spe.cl | 503 +++ .../mali/cl/conv_direct_s1_nchw_to_ncwhc4.cl | 156 + .../gpu/mali/cl/conv_direct_s1_spe_f1c3k1.cl | 34 +- .../tensor}/src/gpu/mali/cl/conv_direct_s2.cl | 90 +- .../mali/cl/conv_direct_s2_nchw_to_ncwhc4.cl | 168 + .../src/gpu/mali/cl/conv_direct_spe_fwhs1.cl | 134 + .../gpu/mali/cl/conv_direct_trans_fltbuf.cl | 207 + .../src/gpu/mali/cl/conv_direct_wh_s1.cl | 86 +- .../src/gpu/mali/cl/conv_direct_wh_s2.cl | 136 + .../src/gpu/mali/cl/conv_wino_gemm36_tn.cl | 38 +- .../gpu/mali/cl/conv_wino_rotate_fltbuf.cl | 14 +- .../gpu/mali/cl/conv_wino_trans_fltbuf_3x3.cl | 124 + .../src/gpu/mali/cl/conv_wino_trans_outbuf.cl | 262 ++ .../mali/cl/conv_wino_trans_outbuf_right.cl | 62 +- .../src/gpu/mali/cl/conv_wino_trans_picbuf.cl | 137 + .../mali/cl/conv_wino_trans_picbuf_left.cl | 49 +- .../mali/cl/conv_wino_trans_picbuf_right.cl | 51 +- compute/tensor/src/gpu/mali/cl/copy.cl | 87 + .../tensor/src/gpu/mali/cl/deconv_direct.cl | 100 + .../gpu/mali/cl/deconv_direct_trans_fltbuf.cl | 48 + .../src/gpu/mali/cl/deconv_gemm_f2s2.cl | 339 ++ .../gpu/mali/cl/deconv_gemm_trans_fltbuf.cl | 92 + compute/tensor/src/gpu/mali/cl/depth2space.cl | 33 + .../src/gpu/mali/cl/depth2space_nchw.cl | 46 + .../src/gpu/mali/cl/depth2space_ncwhc4_2x2.cl | 62 + compute/tensor/src/gpu/mali/cl/eltwise.cl | 175 + .../src/gpu/mali/cl/eltwise_broadcast.cl | 89 + .../src/gpu/mali/cl/eltwise_spe_nchw_c.cl | 74 + .../tensor}/src/gpu/mali/cl/embedding.cl | 32 +- .../tensor}/src/gpu/mali/cl/fc_p1.cl | 57 +- .../tensor}/src/gpu/mali/cl/fc_p2.cl | 43 +- .../tensor/src/gpu/mali/cl/fc_trans_fltbuf.cl | 100 + .../src/gpu/mali/cl/fill_memory_zero.cl | 40 +- .../src/gpu/mali/cl/fill_memory_zero_vec4.cl | 38 + .../tensor}/src/gpu/mali/cl/gemm_nt.cl | 88 +- compute/tensor/src/gpu/mali/cl/gemm_tn.cl | 269 ++ compute/tensor/src/gpu/mali/cl/kernel_def.h | 3424 +++++++++++++++++ .../mali/cl/mem_trans_3d_ncwhc4_to_nchw.cl | 123 + .../src/gpu/mali/cl/mem_trans_nchw_to_nchw.cl | 79 + .../gpu/mali/cl/mem_trans_nchw_to_ncwhc4.cl | 153 + .../mem_trans_nchw_to_ncwhc4_iw_equal_oh.cl | 48 +- .../gpu/mali/cl/mem_trans_ncwhc4_to_mtk.cl | 36 +- .../gpu/mali/cl/mem_trans_ncwhc4_to_nchw.cl | 137 + .../mem_trans_ncwhc4_to_nchw_ih_equal_ow.cl | 47 +- .../gpu/mali/cl/mem_trans_ncwhc4_to_ncwhc4.cl | 61 + .../tensor}/src/gpu/mali/cl/normalization.cl | 52 +- .../src/gpu/mali/cl/padding_constant.cl | 51 + .../tensor/src/gpu/mali/cl/padding_edge.cl | 72 + .../src/gpu/mali/cl/padding_input_gclmem.cl | 53 +- .../tensor/src/gpu/mali/cl/padding_reflect.cl | 75 + .../src/gpu/mali/cl/padding_symmetric.cl | 75 + .../src/gpu/mali/cl/pooling_global_mean_h.cl | 46 + .../src/gpu/mali/cl/pooling_global_mean_w.cl | 50 + .../tensor}/src/gpu/mali/cl/pooling_max.cl | 60 +- .../tensor}/src/gpu/mali/cl/pooling_mean.cl | 62 +- compute/tensor/src/gpu/mali/cl/power.cl | 93 + compute/tensor/src/gpu/mali/cl/prelu.cl | 60 + .../tensor}/src/gpu/mali/cl/reshape.cl | 27 +- .../src/gpu/mali/cl/rnncell_build_xh.cl | 33 + .../gpu/mali/cl/rnncell_update_project_res.cl | 118 + .../src/gpu/mali/cl/rnncell_update_res.cl | 166 + .../tensor}/src/gpu/mali/cl/scale.cl | 72 +- 
.../tensor}/src/gpu/mali/cl/slice_h.cl | 37 +- compute/tensor/src/gpu/mali/cl/softmax.cl | 115 + .../src/gpu/mali/cl/softmax_h1w1_max_all.cl | 48 +- .../src/gpu/mali/cl/softmax_h1w1_max_part.cl | 60 + .../src/gpu/mali/cl/softmax_h1w1_output.cl | 39 +- .../src/gpu/mali/cl/softmax_h1w1_sum_all.cl | 33 + .../src/gpu/mali/cl/softmax_h1w1_sum_part.cl | 49 + .../tensor}/src/gpu/mali/cl/softmax_nchw_c.cl | 64 +- .../tensor/src/gpu/mali/cl/softmax_nchw_w.cl | 116 + .../tensor}/src/gpu/mali/cl/space2depth.cl | 24 +- .../tensor}/src/gpu/mali/cl/squeeze.cl | 25 +- .../src/gpu/mali/cl/transpose_3d_nchw.cl | 119 + .../tensor/src/gpu/mali/cl/transpose_nchw.cl | 92 + compute/tensor/src/gpu/mali/clip.cpp | 85 + compute/tensor/src/gpu/mali/concat.cpp | 167 + compute/tensor/src/gpu/mali/convolution.cpp | 606 +++ compute/tensor/src/gpu/mali/copy.cpp | 177 + compute/tensor/src/gpu/mali/deconvolution.cpp | 433 +++ compute/tensor/src/gpu/mali/depth2space.cpp | 110 + .../src/gpu/mali/depthwise_convolution.cpp | 359 ++ .../mali/depthwise_pointwise_convolution.cpp | 522 +++ compute/tensor/src/gpu/mali/eltwise.cpp | 202 + compute/tensor/src/gpu/mali/embedding.cpp | 86 + .../gpu/mali/fp16/activation_mali_fp16.cpp | 106 + .../src/gpu/mali/fp16/activation_mali_fp16.h | 15 +- .../src/gpu/mali/fp16/argmax_mali_fp16.cpp | 204 + .../src/gpu/mali/fp16/argmax_mali_fp16.h | 24 +- .../fp16/bilateral_slice_apply_mali_fp16.cpp | 132 + .../fp16/bilateral_slice_apply_mali_fp16.h | 44 +- .../mali/fp16/channel_resize_mali_fp16.cpp | 92 + .../gpu/mali/fp16/channel_resize_mali_fp16.h | 26 + .../src/gpu/mali/fp16/clip_mali_fp16.cpp | 65 +- .../src/gpu/mali/fp16/clip_mali_fp16.h | 15 +- .../src/gpu/mali/fp16/concat_mali_fp16.cpp | 248 ++ .../src/gpu/mali/fp16/concat_mali_fp16.h | 19 +- .../fp16/convolution_direct_mali_fp16.cpp | 453 +++ .../mali/fp16/convolution_direct_mali_fp16.h | 55 + .../convolution_direct_spe_ck_mali_fp16.cpp | 188 + .../convolution_direct_spe_ck_mali_fp16.h | 55 + .../gpu/mali/fp16/convolution_mali_fp16.cpp | 201 + .../src/gpu/mali/fp16/convolution_mali_fp16.h | 55 +- .../mali/fp16/convolution_wino_mali_fp16.cpp | 420 ++ .../mali/fp16/convolution_wino_mali_fp16.h | 55 + .../fp16/deconvolution_direct_mali_fp16.cpp | 246 ++ .../fp16/deconvolution_direct_mali_fp16.h | 54 + .../fp16/deconvolution_gemm_mali_fp16.cpp | 261 ++ .../mali/fp16/deconvolution_gemm_mali_fp16.h | 54 + .../gpu/mali/fp16/deconvolution_mali_fp16.cpp | 166 + .../gpu/mali/fp16/deconvolution_mali_fp16.h | 55 + .../gpu/mali/fp16/depth2space_mali_fp16.cpp | 128 + .../src/gpu/mali/fp16/depth2space_mali_fp16.h | 31 + ...depthwise_convolution_direct_mali_fp16.cpp | 184 + .../depthwise_convolution_direct_mali_fp16.h | 54 + .../fp16/depthwise_convolution_mali_fp16.cpp | 157 + .../fp16/depthwise_convolution_mali_fp16.h | 54 + ...pointwise_convolution_direct_mali_fp16.cpp | 277 ++ ...e_pointwise_convolution_direct_mali_fp16.h | 66 + ...e_pointwise_convolution_gemm_mali_fp16.cpp | 278 ++ ...ise_pointwise_convolution_gemm_mali_fp16.h | 66 + ...thwise_pointwise_convolution_mali_fp16.cpp | 190 + ...epthwise_pointwise_convolution_mali_fp16.h | 66 + .../src/gpu/mali/fp16/eltwise_mali_fp16.cpp | 290 ++ .../src/gpu/mali/fp16/eltwise_mali_fp16.h | 19 +- .../src/gpu/mali/fp16/embedding_mali_fp16.cpp | 77 +- .../src/gpu/mali/fp16/embedding_mali_fp16.h | 22 +- .../mali/fp16/fully_connected_mali_fp16.cpp | 247 +- .../gpu/mali/fp16/fully_connected_mali_fp16.h | 48 +- .../src/gpu/mali/fp16/matmul_mali_fp16.cpp | 197 + .../src/gpu/mali/fp16/matmul_mali_fp16.h | 39 + 
.../fp16/multihead_attention_mali_fp16.cpp | 907 +++++ .../mali/fp16/multihead_attention_mali_fp16.h | 61 + .../gpu/mali/fp16/normalization_mali_fp16.cpp | 65 +- .../gpu/mali/fp16/normalization_mali_fp16.h | 18 +- .../src/gpu/mali/fp16/padding_mali_fp16.cpp | 168 + .../src/gpu/mali/fp16/padding_mali_fp16.h | 27 + .../src/gpu/mali/fp16/pooling_mali_fp16.cpp | 188 + .../src/gpu/mali/fp16/pooling_mali_fp16.h | 20 +- .../src/gpu/mali/fp16/power_mali_fp16.cpp | 98 + .../src/gpu/mali/fp16/power_mali_fp16.h | 25 + .../src/gpu/mali/fp16/prelu_mali_fp16.cpp | 93 + .../src/gpu/mali/fp16/prelu_mali_fp16.h | 28 + .../src/gpu/mali/fp16/reshape_mali_fp16.cpp | 310 ++ .../src/gpu/mali/fp16/reshape_mali_fp16.h | 19 +- .../src/gpu/mali/fp16/rnn_mali_fp16.cpp | 192 + .../tensor/src/gpu/mali/fp16/rnn_mali_fp16.h | 58 + .../src/gpu/mali/fp16/rnncell_mali_fp16.cpp | 233 ++ .../src/gpu/mali/fp16/rnncell_mali_fp16.h | 44 + .../src/gpu/mali/fp16/scale_mali_fp16.cpp | 93 +- .../src/gpu/mali/fp16/scale_mali_fp16.h | 15 +- .../src/gpu/mali/fp16/slice_mali_fp16.cpp | 92 +- .../src/gpu/mali/fp16/slice_mali_fp16.h | 17 +- .../src/gpu/mali/fp16/softmax_mali_fp16.cpp | 207 + .../src/gpu/mali/fp16/softmax_mali_fp16.h | 22 +- .../src/gpu/mali/fp16/squeeze_mali_fp16.cpp | 64 +- .../src/gpu/mali/fp16/squeeze_mali_fp16.h | 17 +- .../src/gpu/mali/fp16/transpose_mali_fp16.cpp | 229 ++ .../src/gpu/mali/fp16/transpose_mali_fp16.h | 20 +- .../src/gpu/mali/fp16/unsqueeze_mali_fp16.cpp | 81 + .../src/gpu/mali/fp16/unsqueeze_mali_fp16.h | 23 + .../tensor/src/gpu/mali/fully_connected.cpp | 507 +++ compute/tensor/src/gpu/mali/matmul.cpp | 476 +++ .../src/gpu/mali/multihead_attention.cpp | 722 ++++ .../tensor}/src/gpu/mali/normalization.cpp | 79 +- compute/tensor/src/gpu/mali/padding.cpp | 72 + compute/tensor/src/gpu/mali/pooling.cpp | 124 + compute/tensor/src/gpu/mali/power.cpp | 113 + .../src/gpu/mali/preallocated_memory.cpp | 94 + compute/tensor/src/gpu/mali/prelu.cpp | 84 + compute/tensor/src/gpu/mali/reshape.cpp | 157 + compute/tensor/src/gpu/mali/rnncell.cpp | 439 +++ .../tensor}/src/gpu/mali/scale.cpp | 79 +- .../tensor}/src/gpu/mali/slice.cpp | 117 +- compute/tensor/src/gpu/mali/softmax.cpp | 131 + compute/tensor/src/gpu/mali/space2depth.cpp | 142 + compute/tensor/src/gpu/mali/squeeze.cpp | 96 + .../src/gpu/mali/tensor_computing_mali.h | 785 ++++ compute/tensor/src/gpu/mali/transpose.cpp | 114 + .../bilateral_slice_apply_mali_uchar.cpp | 130 + .../uchar/bilateral_slice_apply_mali_uchar.h | 34 + compute/tensor/src/gpu/mali/unsqueeze.cpp | 102 + compute/tensor/src/l2normalization.cpp | 65 + compute/tensor/src/matmul.cpp | 384 ++ compute/tensor/src/multihead_attention.cpp | 227 ++ compute/tensor/src/non_max_suppression.cpp | 84 + compute/tensor/src/normalization.cpp | 88 + compute/tensor/src/padding.cpp | 72 + compute/tensor/src/pooling.cpp | 185 + .../tensor/src/pooling_bp.cpp | 98 +- compute/tensor/src/power.cpp | 78 + .../tensor/src/preallocated_memory.cpp | 42 +- compute/tensor/src/prelu.cpp | 98 + compute/tensor/src/priorbox.cpp | 104 + .../tensor}/src/quantize.cpp | 174 +- compute/tensor/src/reduction.cpp | 144 + compute/tensor/src/reshape.cpp | 157 + compute/tensor/src/rnn.cpp | 298 ++ compute/tensor/src/roialign.cpp | 80 + compute/tensor/src/scale.cpp | 98 + compute/tensor/src/slice.cpp | 128 + compute/tensor/src/softmax.cpp | 115 + compute/tensor/src/space2depth.cpp | 60 + compute/tensor/src/split.cpp | 49 + compute/tensor/src/squeeze.cpp | 101 + compute/tensor/src/tensor_computing_type.cpp | 174 + 
compute/tensor/src/tfslice.cpp | 55 + compute/tensor/src/tile.cpp | 80 + compute/tensor/src/transpose.cpp | 166 + compute/tensor/src/unsqueeze.cpp | 102 + compute/tensor/src/yolov3detectionoutput.cpp | 77 + compute/tensor/tests/CMakeLists.txt | 67 + compute/tensor/tests/test_activation.cpp | 124 + .../tensor/tests}/test_argmax.cpp | 64 +- .../tensor/tests}/test_attention.cpp | 79 +- compute/tensor/tests/test_axpby.cpp | 75 + .../tensor/tests/test_channel_resize_ocl.cpp | 166 + .../tensor/tests}/test_check.cpp | 67 +- {tests => compute/tensor/tests}/test_clip.cpp | 68 +- compute/tensor/tests/test_concat.cpp | 135 + .../tensor/tests}/test_concat_int8.cpp | 107 +- compute/tensor/tests/test_concat_ocl.cpp | 187 + compute/tensor/tests/test_convolution.cpp | 172 + compute/tensor/tests/test_convolution_bnn.cpp | 179 + .../tensor/tests/test_convolution_int8.cpp | 249 ++ compute/tensor/tests/test_convolution_ocl.cpp | 296 ++ compute/tensor/tests/test_deconvolution.cpp | 168 + .../tensor/tests/test_deconvolution_ocl.cpp | 268 ++ .../tests/test_depthwise_convolution.cpp | 257 ++ .../tests/test_depthwise_convolution_int8.cpp | 190 + .../tests/test_depthwise_convolution_ocl.cpp | 270 ++ ...st_depthwise_pointwise_convolution_ocl.cpp | 361 ++ compute/tensor/tests/test_detectionoutput.cpp | 145 + .../tensor/tests/test_dilated_convolution.cpp | 169 + .../tensor/tests}/test_eltwise.cpp | 76 +- compute/tensor/tests/test_fully_connected.cpp | 119 + .../tensor/tests/test_fully_connected_ocl.cpp | 270 ++ compute/tensor/tests/test_l2normalization.cpp | 101 + .../tests/test_multihead_attention_ocl.cpp | 393 ++ .../tensor/tests/test_non_max_suppression.cpp | 139 + compute/tensor/tests/test_normalization.cpp | 111 + compute/tensor/tests/test_padding.cpp | 115 + compute/tensor/tests/test_padding_ocl.cpp | 191 + compute/tensor/tests/test_pooling.cpp | 117 + compute/tensor/tests/test_pooling_bp.cpp | 111 + compute/tensor/tests/test_pooling_int8.cpp | 150 + compute/tensor/tests/test_pooling_ocl.cpp | 210 + compute/tensor/tests/test_power.cpp | 85 + compute/tensor/tests/test_power_ocl.cpp | 157 + compute/tensor/tests/test_prelu.cpp | 92 + compute/tensor/tests/test_prelu_ocl.cpp | 173 + .../tensor/tests}/test_priorbox.cpp | 138 +- compute/tensor/tests/test_reduction.cpp | 107 + .../tensor/tests}/test_reshape.cpp | 80 +- compute/tensor/tests/test_reshape_ocl.cpp | 162 + compute/tensor/tests/test_rnn.cpp | 151 + compute/tensor/tests/test_roialign.cpp | 131 + .../tensor/tests}/test_scale.cpp | 64 +- .../tensor/tests}/test_slice.cpp | 72 +- .../tensor/tests}/test_softmax.cpp | 64 +- .../tensor/tests/test_softmax_h1w1_ocl.cpp | 150 + .../tensor/tests}/test_split.cpp | 61 +- compute/tensor/tests/test_tile.cpp | 61 + .../tensor/tests}/test_transpose.cpp | 82 +- compute/tensor/tests/test_transpose_ocl.cpp | 158 + docs/ARCHITECTURE.md | 21 + docs/BENCHMARK.md | 497 ++- docs/CHANGELOG.md | 12 + docs/DEVELOPER.md | 973 ++--- docs/FAQ.md | 25 + docs/INSTALL.md | 337 +- docs/IOS_USAGE.md | 78 + docs/KIT.md | 50 + docs/QUANTIZATION.md | 48 + docs/REDUCE_GPU_PREPARE_TIME.md | 62 + ...THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.md | 208 + docs/USER_HANDBOOK.md | 416 +- docs/images/ADB.PNG | Bin 0 -> 3875 bytes docs/images/Framework.PNG | Bin 0 -> 47027 bytes docs/images/GNU.PNG | Bin 0 -> 5877 bytes docs/images/JDK.PNG | Bin 0 -> 6036 bytes docs/images/ModelConversion.PNG | Bin 0 -> 47962 bytes docs/images/NDK.PNG | Bin 0 -> 14189 bytes docs/images/PerformanceProfiling.PNG | Bin 0 -> 9671 bytes docs/images/QuickStart.PNG | Bin 0 -> 11037 
bytes docs/images/cmake.PNG | Bin 0 -> 3959 bytes docs/images/dx.PNG | Bin 0 -> 1637 bytes docs/images/make.PNG | Bin 0 -> 12177 bytes gcl/include/context.h | 167 - gcl/include/event.h | 120 - gcl/include/gcl_common.h | 257 -- gcl/include/gcl_func.h | 1232 ------ gcl/include/gcl_kernel_binmap.h | 121 - gcl/include/kernel.h | 130 - gcl/include/memory.h | 487 --- gcl/include/platform.h | 397 -- gcl/include/program.h | 238 -- gcl/tools/gcl_sample/sample.cpp | 152 - .../kernel_lib_compile/sh/adbDeviceNum.sh | 11 - .../kernel_lib_compile/sh/compile/concat.sh | 18 - .../sh/compile/conv_direct_s1.sh | 102 - .../sh/compile/conv_direct_s2.sh | 102 - .../sh/compile/conv_direct_spe_fwhs1.sh | 19 - .../kernel_lib_compile/sh/compile/eltwise.sh | 36 - .../kernel_lib_compile/sh/compile/gemm_tn.sh | 114 - image/src/CMakeLists.txt | 22 - image/src/resize.cpp | 93 - inference/CMakeLists.txt | 28 +- .../engine}/CMakeLists.txt | 25 +- inference/engine/api/c/bolt.h | 341 ++ inference/engine/api/dllite/Bolt.h | 101 + inference/engine/api/java/BoltModel.java | 440 +++ inference/engine/api/java/BoltResult.java | 127 + inference/engine/include/BoltModel.h | 145 + inference/{ => engine}/include/activation.hpp | 38 +- inference/engine/include/argmax.hpp | 36 + inference/engine/include/attention.hpp | 65 + inference/engine/include/attention_mask.hpp | 57 + .../engine/include/bilateral_slice_apply.hpp | 37 + inference/engine/include/channel_resize.hpp | 42 + inference/engine/include/check.hpp | 36 + inference/{ => engine}/include/clip.hpp | 31 +- inference/engine/include/cnn.h | 114 + inference/{ => engine}/include/concat.hpp | 25 +- inference/{ => engine}/include/constant.hpp | 36 +- inference/engine/include/convolution.hpp | 69 + inference/engine/include/copy.hpp | 36 + .../include/cpu/activation_cpu.hpp | 48 +- inference/engine/include/cpu/argmax_cpu.hpp | 48 + .../engine/include/cpu/channel_resize_cpu.hpp | 121 + inference/engine/include/cpu/check_cpu.hpp | 46 + inference/engine/include/cpu/clip_cpu.hpp | 46 + inference/engine/include/cpu/concat_cpu.hpp | 52 + .../engine/include/cpu/convolution_cpu.hpp | 539 +++ inference/engine/include/cpu/copy_cpu.hpp | 83 + .../engine/include/cpu/deconvolution_cpu.hpp | 150 + inference/engine/include/cpu/eltwise_cpu.hpp | 81 + .../engine/include/cpu/embedding_cpu.hpp | 87 + inference/engine/include/cpu/factory_cpu.hpp | 366 ++ .../include/cpu/fully_connected_cpu.hpp | 266 ++ .../include/cpu/l2normalization_cpu.hpp | 47 + .../engine/include/cpu/layer_norm_cpu.hpp | 67 + inference/engine/include/cpu/matmul_cpu.hpp | 72 + inference/engine/include/cpu/padding_cpu.hpp | 48 + .../include/cpu/pooling_cpu.hpp} | 76 +- .../include/cpu/power_cpu.hpp} | 61 +- .../include/cpu/preallocated_memory_cpu.hpp | 48 + inference/engine/include/cpu/prelu_cpu.hpp | 65 + .../include/cpu/repeat_cpu.hpp} | 87 +- inference/engine/include/cpu/reshape_cpu.hpp | 89 + inference/engine/include/cpu/resize_cpu.hpp | 78 + inference/engine/include/cpu/rnn_cpu.hpp | 62 + inference/engine/include/cpu/rnncell_cpu.hpp | 140 + inference/engine/include/cpu/scale_cpu.hpp | 109 + .../include/cpu/shape_cpu.hpp} | 58 +- .../engine/include/cpu/shared_weight_cpu.hpp | 73 + inference/engine/include/cpu/slice_cpu.hpp | 50 + inference/engine/include/cpu/softmax_cpu.hpp | 46 + inference/engine/include/cpu/splice_cpu.hpp | 69 + inference/engine/include/cpu/squeeze_cpu.hpp | 48 + inference/engine/include/cpu/tfslice_cpu.hpp | 48 + inference/engine/include/cpu/tile_cpu.hpp | 44 + .../engine/include/cpu/transpose_cpu.hpp | 49 + 
.../engine/include/cpu/unsqueeze_cpu.hpp | 49 + .../{ => engine}/include/data_loader.hpp | 81 +- inference/engine/include/deconvolution.hpp | 45 + .../{ => engine}/include/depth2space.hpp | 26 +- inference/engine/include/detection_output.hpp | 57 + inference/{ => engine}/include/eltwise.hpp | 34 +- inference/engine/include/embedding.hpp | 36 + inference/engine/include/factory.hpp | 464 +++ inference/engine/include/fully_connected.hpp | 40 + inference/engine/include/inference.hpp | 84 + inference/{ => engine}/include/jump.hpp | 49 +- inference/engine/include/l2normalization.hpp | 32 + inference/{ => engine}/include/layer_norm.hpp | 24 +- inference/engine/include/matmul.hpp | 36 + inference/engine/include/memory_tracker.hpp | 115 + inference/engine/include/model.hpp | 196 + .../engine/include/ocl/activation_ocl.hpp | 55 + inference/engine/include/ocl/argmax_ocl.hpp | 66 + .../include/ocl/bilateral_slice_apply_ocl.hpp | 87 + .../engine/include/ocl/channel_resize_ocl.hpp | 72 + inference/engine/include/ocl/check_ocl.hpp | 54 + inference/engine/include/ocl/clip_ocl.hpp | 54 + inference/engine/include/ocl/concat_ocl.hpp | 60 + .../engine/include/ocl/convolution_ocl.hpp | 534 +++ inference/engine/include/ocl/copy_ocl.hpp | 72 + .../engine/include/ocl/deconvolution_ocl.hpp | 195 + .../engine/include/ocl/depth2space_ocl.hpp | 67 + inference/engine/include/ocl/eltwise_ocl.hpp | 54 + .../engine/include/ocl/embedding_ocl.hpp | 104 + inference/engine/include/ocl/factory_ocl.hpp | 357 ++ .../include/ocl/fully_connected_ocl.hpp | 207 + .../engine/include/ocl/layer_norm_ocl.hpp | 87 + inference/engine/include/ocl/matmul_ocl.hpp | 97 + inference/engine/include/ocl/padding_ocl.hpp | 57 + inference/engine/include/ocl/pooling_ocl.hpp | 66 + inference/engine/include/ocl/power_ocl.hpp | 56 + .../include/ocl/preallocated_memory_ocl.hpp | 58 + inference/engine/include/ocl/prelu_ocl.hpp | 86 + inference/engine/include/ocl/repeat_ocl.hpp | 116 + inference/engine/include/ocl/reshape_ocl.hpp | 67 + inference/engine/include/ocl/resize_ocl.hpp | 75 + inference/engine/include/ocl/rnn_ocl.hpp | 71 + inference/engine/include/ocl/rnncell_ocl.hpp | 228 ++ inference/engine/include/ocl/scale_ocl.hpp | 118 + .../engine/include/ocl/shared_weight_ocl.hpp | 134 + inference/engine/include/ocl/slice_ocl.hpp | 54 + inference/engine/include/ocl/softmax_ocl.hpp | 63 + .../engine/include/ocl/space2depth_ocl.hpp | 54 + inference/engine/include/ocl/squeeze_ocl.hpp | 55 + .../engine/include/ocl/transpose_ocl.hpp | 64 + .../engine/include/ocl/unsqueeze_ocl.hpp | 54 + inference/{ => engine}/include/operator.hpp | 526 ++- inference/engine/include/padding.hpp | 36 + inference/engine/include/pooling.hpp | 47 + .../multiply.hpp => engine/include/power.hpp} | 34 +- .../engine/include/preallocated_memory.hpp | 36 + inference/engine/include/prelu.hpp | 35 + inference/engine/include/prior_box.hpp | 54 + inference/engine/include/reduction.hpp | 83 + .../include/relative_position_embedding.hpp | 96 + .../{ => engine}/include/relative_shift.hpp | 78 +- inference/engine/include/repeat.hpp | 42 + inference/engine/include/reshape.hpp | 36 + inference/engine/include/resize.hpp | 41 + .../{ => engine}/include/result_format.hpp | 16 +- inference/engine/include/rnncell.hpp | 39 + inference/{ => engine}/include/scale.hpp | 34 +- inference/engine/include/sequential.hpp | 228 ++ inference/engine/include/sequential_ocl.hpp | 230 ++ inference/engine/include/shape.hpp | 30 + inference/engine/include/shared_weight.hpp | 43 + inference/{ => engine}/include/slice.hpp 
| 27 +- inference/{ => engine}/include/softmax.hpp | 24 +- .../{ => engine}/include/space2depth.hpp | 22 +- inference/engine/include/splice.hpp | 36 + inference/{ => engine}/include/squeeze.hpp | 30 +- inference/engine/include/tfslice.hpp | 36 + inference/engine/include/tile.hpp | 36 + inference/{ => engine}/include/transpose.hpp | 28 +- inference/engine/include/unsqueeze.hpp | 36 + inference/engine/include/weight_operator.hpp | 195 + .../include/yolov3_detection_output.hpp | 57 + inference/engine/src/BoltModel_Jni.cpp | 558 +++ inference/engine/src/CMakeLists.txt | 20 + inference/engine/src/bolt.cpp | 689 ++++ inference/engine/src/bolt_dllite.cpp | 489 +++ inference/engine/src/cnn.cpp | 610 +++ inference/engine/src/data_loader.cpp | 377 ++ inference/engine/src/result_format.cpp | 49 + inference/engine/tools/CMakeLists.txt | 25 + .../common_algo_search/common_algo_search.cpp | 128 + .../tools/preprocess_ocl/CMakeLists.txt | 10 + .../preprocess_ocl/build_preprocess_ocl.sh | 114 + .../tools/preprocess_ocl/preprocess_ocl.cpp | 252 ++ .../tools/ptq_calibration/ptq_calibration.cpp | 443 +++ inference/examples/CMakeLists.txt | 63 + .../asr_convolution_transformer.cpp | 133 +- .../asr_labels.txt | 2 + .../automatic_speech_recognition/asr_rnnt.cpp | 110 + .../audio_feature.cpp | 2352 +++++++++++ .../audio_feature.h | 77 + .../encoder_flow.prototxt | 350 ++ .../automatic_speech_recognition/example.wav | Bin 0 -> 93646 bytes .../automatic_speech_recognition/flow_asr.cpp | 884 +++++ .../joint_flow.prototxt | 33 + .../pinyin2hanzi_flow.prototxt | 24 + .../pinyin_lm_embedding.bin | Bin 0 -> 3272704 bytes .../prediction_flow.prototxt | 139 + .../automatic_speech_recognition/run.sh | 95 + .../automatic_speech_recognition/vad.cpp | 78 +- inference/examples/benchmark/benchmark.cpp | 188 + {kits => inference/examples}/bert/bert.cpp | 97 +- inference/examples/bert/flow_tinybert.cpp | 91 + .../examples/bert/flow_tinybert.prototxt | 44 + inference/examples/bert/graph_tinybert.cpp | 82 + inference/examples/bert/tinybert.cpp | 28 + inference/examples/bert/tinybert_onnx.cpp | 28 + inference/examples/bert/tinybert_test.h | 202 + inference/examples/c_api/test_api_c.c | 276 ++ inference/examples/dlaWOdcn/flow_dlaWOdcn.cpp | 101 + .../examples/dlaWOdcn/flow_dlaWOdcn.prototxt | 36 + inference/examples/dlaWOdcn/run.sh | 84 + inference/examples/facesr/flow_facesr.cpp | 82 + .../examples/facesr/flow_facesr.prototxt | 22 + inference/examples/facesr/run.sh | 84 + inference/examples/high_dynamic_range/hdr.cpp | 455 +++ .../image_classification/classification.cpp | 240 ++ .../examples/java_api/test_api_java.java | 707 ++++ .../examples}/machine_translation/nmt.cpp | 108 +- .../examples}/machine_translation/nmt_tsc.cpp | 128 +- .../examples/object_detection/detection.cpp | 140 + .../examples/sequential/test_pipeline_ocl.cpp | 434 +++ .../examples}/text_to_speech/tts.cpp | 150 +- inference/exports/c/bolt.h | 227 -- inference/exports/java/BoltModel.java | 302 -- inference/exports/java/BoltResult.java | 121 - .../flow}/CMakeLists.txt | 12 +- inference/flow/include/flow.h | 85 + .../flow/include/flow_function_factory.h | 35 + inference/flow/include/node.h | 72 + inference/flow/src/CMakeLists.txt | 32 + inference/flow/src/flow.cpp | 77 + inference/flow/src/flow.proto | 96 + inference/flow/src/flow_function_factory.cpp | 43 + inference/flow/src/node.cpp | 242 ++ inference/include/BoltModel.h | 98 - inference/include/argmax.hpp | 63 - inference/include/attention.hpp | 67 - inference/include/attention_mask.hpp | 65 - 
inference/include/bilateral_slice_apply.hpp | 70 - inference/include/check.hpp | 67 - inference/include/cnn.hpp | 864 ----- inference/include/convolution.hpp | 98 - inference/include/copy.hpp | 88 - inference/include/cpu/clip_cpu.hpp | 53 - inference/include/cpu/concat_cpu.hpp | 59 - inference/include/cpu/convolution_cpu.hpp | 527 --- inference/include/cpu/eltwise_cpu.hpp | 92 - inference/include/cpu/embedding_cpu.hpp | 109 - inference/include/cpu/factory_cpu.hpp | 296 -- inference/include/cpu/fully_connected_cpu.hpp | 481 --- inference/include/cpu/layer_norm_cpu.hpp | 97 - inference/include/cpu/matmul_cpu.hpp | 121 - inference/include/cpu/pooling_cpu.hpp | 91 - inference/include/cpu/reshape_cpu.hpp | 54 - inference/include/cpu/scale_cpu.hpp | 168 - inference/include/cpu/slice_cpu.hpp | 54 - inference/include/cpu/softmax_cpu.hpp | 70 - inference/include/cpu/squeeze_cpu.hpp | 84 - inference/include/cpu/transpose_cpu.hpp | 81 - inference/include/deconvolution.hpp | 334 -- inference/include/detection_output.hpp | 60 - inference/include/embedding.hpp | 45 - inference/include/factory.hpp | 536 --- inference/include/fully_connected.hpp | 56 - inference/include/inference.hpp | 128 - inference/include/lstm.hpp | 84 - inference/include/lstmcell.hpp | 199 - inference/include/matmul.hpp | 48 - inference/include/memory.hpp | 43 - inference/include/model.hpp | 384 -- inference/include/ocl/activation_ocl.hpp | 65 - .../include/ocl/bilateral_slice_apply_ocl.hpp | 108 - inference/include/ocl/clip_ocl.hpp | 63 - inference/include/ocl/concat_ocl.hpp | 75 - inference/include/ocl/convolution_ocl.hpp | 675 ---- inference/include/ocl/depth2space_ocl.hpp | 61 - inference/include/ocl/eltwise_ocl.hpp | 80 - inference/include/ocl/embedding_ocl.hpp | 119 - inference/include/ocl/factory_ocl.hpp | 310 -- inference/include/ocl/fully_connected_ocl.hpp | 339 -- inference/include/ocl/layer_norm_ocl.hpp | 164 - inference/include/ocl/matmul_ocl.hpp | 105 - inference/include/ocl/memory_ocl.hpp | 104 - inference/include/ocl/multiply_ocl.hpp | 60 - inference/include/ocl/pooling_ocl.hpp | 105 - inference/include/ocl/reshape_ocl.hpp | 68 - inference/include/ocl/scale_ocl.hpp | 179 - inference/include/ocl/slice_ocl.hpp | 72 - inference/include/ocl/softmax_ocl.hpp | 58 - inference/include/ocl/space2depth_ocl.hpp | 61 - inference/include/ocl/squeeze_ocl.hpp | 65 - inference/include/ocl/transpose_ocl.hpp | 62 - inference/include/point_cast.hpp | 58 - inference/include/pooling.hpp | 82 - inference/include/preallocated_memory.hpp | 56 - inference/include/prior_box.hpp | 57 - inference/include/reduction.hpp | 86 - .../include/relative_position_embedding.hpp | 93 - inference/include/reshape.hpp | 47 - inference/include/resize.hpp | 90 - inference/include/sequential.hpp | 201 - inference/include/sequential_ocl.hpp | 295 -- inference/include/shared_weight.hpp | 71 - inference/include/tensor.hpp | 196 - inference/include/unsqueeze.hpp | 97 - inference/include/utils.hpp | 37 - inference/include/weight_operator.hpp | 119 - inference/src/BoltModel_Jni.cpp | 516 --- inference/src/CMakeLists.txt | 10 - inference/src/bolt.cpp | 462 --- inference/src/data_loader.cpp | 422 -- inference/src/result_format.cpp | 74 - inference/src/utils.cpp | 96 - install.sh | 121 +- .../project.pbxproj | 878 +++++ .../contents.xcworkspacedata | 7 + .../xcshareddata/IDEWorkspaceChecks.plist | 8 + .../UserInterfaceState.xcuserstate | Bin 0 -> 140692 bytes .../xcschemes/xcschememanagement.plist | 14 + .../ImageClassificationDemo/AppDelegate.h | 18 + 
.../ImageClassificationDemo/AppDelegate.m | 46 + .../AppIcon.appiconset/Contents.json | 98 + .../Assets.xcassets/Contents.json | 6 + .../Base.lproj/LaunchScreen.storyboard | 25 + .../Base.lproj/Main.storyboard | 24 + .../ImageClassificationDemo/Info.plist | 66 + .../ImageClassificationDemo/SceneDelegate.h | 20 + .../ImageClassificationDemo/SceneDelegate.m | 63 + .../ImageClassificationDemo/ViewController.h | 18 + .../ImageClassificationDemo/ViewController.mm | 299 ++ .../libbolt/headers/flow/flow.pb.h | 1374 +++++++ .../libbolt/headers/kit_flags.h | 26 +- .../libbolt/headers/protobuf/arena.h | 930 +++++ .../libbolt/headers/protobuf/arenastring.h | 314 ++ .../libbolt/headers/protobuf/descriptor.h | 1924 +++++++++ .../libbolt/headers/protobuf/extension_set.h | 1318 +++++++ .../headers/protobuf/generated_message_util.h | 169 + .../libbolt/headers/protobuf/has_bits.h | 72 + .../headers/protobuf/io/coded_stream.h | 1367 +++++++ .../libbolt/headers/protobuf/io/strtod.h | 55 + .../headers/protobuf/io/zero_copy_stream.h | 248 ++ .../protobuf/io/zero_copy_stream_impl.h | 358 ++ .../protobuf/io/zero_copy_stream_impl_lite.h | 410 ++ .../libbolt/headers/protobuf/message.h | 1150 ++++++ .../libbolt/headers/protobuf/message_lite.h | 297 ++ .../libbolt/headers/protobuf/metadata.h | 159 + .../libbolt/headers/protobuf/repeated_field.h | 2504 ++++++++++++ .../protobuf/stubs/atomic_sequence_num.h | 54 + .../headers/protobuf/stubs/atomicops.h | 246 ++ .../stubs/atomicops_internals_arm64_gcc.h | 325 ++ .../stubs/atomicops_internals_arm_gcc.h | 151 + .../stubs/atomicops_internals_arm_qnx.h | 146 + .../atomicops_internals_atomicword_compat.h | 122 + .../stubs/atomicops_internals_generic_gcc.h | 155 + .../stubs/atomicops_internals_macosx.h | 225 ++ .../stubs/atomicops_internals_mips_gcc.h | 313 ++ .../stubs/atomicops_internals_pnacl.h | 231 ++ .../stubs/atomicops_internals_power.h | 440 +++ .../stubs/atomicops_internals_ppc_gcc.h | 155 + .../stubs/atomicops_internals_solaris.h | 188 + .../protobuf/stubs/atomicops_internals_tsan.h | 219 ++ .../stubs/atomicops_internals_x86_gcc.h | 293 ++ .../stubs/atomicops_internals_x86_msvc.h | 150 + .../libbolt/headers/protobuf/stubs/callback.h | 546 +++ .../libbolt/headers/protobuf/stubs/casts.h | 133 + .../libbolt/headers/protobuf/stubs/common.h | 225 ++ .../libbolt/headers/protobuf/stubs/fastmem.h | 152 + .../libbolt/headers/protobuf/stubs/logging.h | 237 ++ .../libbolt/headers/protobuf/stubs/macros.h | 168 + .../libbolt/headers/protobuf/stubs/mutex.h | 148 + .../libbolt/headers/protobuf/stubs/once.h | 167 + .../headers/protobuf/stubs/platform_macros.h | 125 + .../libbolt/headers/protobuf/stubs/port.h | 448 +++ .../headers/protobuf/stubs/scoped_ptr.h | 236 ++ .../headers/protobuf/stubs/shared_ptr.h | 470 +++ .../libbolt/headers/protobuf/stubs/stl_util.h | 121 + .../headers/protobuf/stubs/template_util.h | 138 + .../headers/protobuf/stubs/type_traits.h | 372 ++ .../libbolt/headers/protobuf/text_format.h | 521 +++ .../headers/protobuf/unknown_field_set.h | 354 ++ .../libbolt/image_classification.prototxt | 24 + .../libbolt/imagenet_classes.txt | 1000 +++++ .../ImageClassificationDemo/main.m | 31 +- .../ImageClassificationDemoTests.m | 42 + .../ImageClassificationDemoTests/Info.plist | 22 + .../ImageClassificationDemoUITests.m | 53 + .../ImageClassificationDemoUITests/Info.plist | 22 + kit/iOS/setup_lib_iOS.sh | 25 + kits/CMakeLists.txt | 49 - .../automatic_speech_recognition/asr_rnnt.cpp | 108 - kits/bert/tinybert.cpp | 158 - kits/high_dynamic_range/hdr.cpp | 352 -- 
 kits/image_classification/classification.cpp | 192 -
 .../classification_bin.cpp | 119 -
 kits/super_resolution/super_resolution.cpp | 149 -
 model-tools/CMakeLists.txt | 34 -
 .../OPOptimizers/ChannelPaddingOptimizer.hpp | 103 -
 .../OPOptimizers/ConstUpsampleOptimizer.hpp | 46 -
 .../OPOptimizers/ConvActivationOptimizer.hpp | 85 -
 .../OPOptimizers/DeprecatedOPOptimizer.hpp | 71 -
 .../OPOptimizers/NoQuantLabelOptimizer.hpp | 209 -
 .../include/OPOptimizers/OPOptimizer.hpp | 138 -
 .../include/OPOptimizers/PadConvOptimizer.hpp | 58 -
 .../TransposeMulToScaleOptimizer.hpp | 61 -
 model-tools/include/model_optimizer.hpp | 131 -
 .../include/model_serialize_deserialize.hpp | 51 -
 model-tools/include/model_tools.h | 427 --
 model-tools/src/CMakeLists.txt | 15 -
 model-tools/src/caffe/CMakeLists.txt | 24 -
 model-tools/src/caffe/caffe.proto | 1778 ---------
 model-tools/src/data_type_converter.cpp | 329 --
 model-tools/src/model_adaptee.h | 410 --
 model-tools/src/model_print.cpp | 178 -
 model-tools/src/onnx/CMakeLists.txt | 18 -
 model-tools/src/onnx/onnx.proto | 505 ---
 model-tools/src/onnx/onnx_adaptee.h | 1235 ------
 model-tools/src/tflite/CMakeLists.txt | 15 -
 model-tools/src/tflite/tflite_adaptee.h | 792 ----
 model-tools/tools/ms2bolt/CMakeLists.txt | 12 -
 model-tools/tools/ms2bolt/fixedMs2bolt.cpp | 286 --
 model_tools/CMakeLists.txt | 22 +
 .../OPOptimizers/ActivationOptimizer.hpp | 74 +
 .../include/OPOptimizers/BNScaleOptimizer.hpp | 85 +-
 .../include/OPOptimizers/CastOptimizer.hpp | 50 +
 .../OPOptimizers/ChannelPaddingOptimizer.hpp | 357 ++
 .../OPOptimizers/ClipClipOptimizer.hpp | 46 +-
 .../OPOptimizers/DeprecatedOPOptimizer.hpp | 66 +
 .../DepthwisePointwiseOptimizer.hpp | 104 +-
 .../include/OPOptimizers/FCFCOptimizer.hpp | 83 +-
 .../include/OPOptimizers/GeluOptimizer.hpp | 74 +
 .../include/OPOptimizers/InPlaceOptimizer.hpp | 78 +-
 .../OPOptimizers/InnerProductOptimizer.hpp | 72 +
 .../OPOptimizers/InvariantSliceOptimizer.hpp | 45 +-
 .../OPOptimizers/LayerNormOptimizer.hpp | 148 +
 .../OPOptimizers/MemoryReuseOptimizer.hpp | 76 +-
 .../MultiHeadAttentionOptimizer.hpp | 478 +++
 .../OPOptimizers/NoQuantLabelOptimizer.hpp | 264 ++
 .../include/OPOptimizers/OPOptimizer.hpp | 236 ++
 .../include/OPOptimizers/PadOptimizer.hpp | 72 +
 .../include/OPOptimizers/PowerOptimizer.hpp | 133 +
 .../include/OPOptimizers/RNNOptimizer.hpp | 74 +
 .../OPOptimizers/ShGaUnCoReOptimizer.hpp | 59 +
 .../OPOptimizers/SqueezeReshapeOptimizer.hpp | 28 +-
 .../OPOptimizers/StdDeviationOptimizer.hpp | 45 +
 .../TransposeMatMulToFCOptimizer.hpp | 34 +-
 .../TransposeMulToScaleOptimizer.hpp | 61 +
 .../OPOptimizers/WeightBNOptimizer.hpp | 100 +-
 .../OPOptimizers/WeightScaleOptimizer.hpp | 114 +-
 .../include/converter.h | 29 +-
 model_tools/include/model_optimizer.hpp | 140 +
 model_tools/include/model_quantization.h | 26 +
 .../include/model_tools.h | 35 +-
 model_tools/include/online_conversion.h | 31 +
 model_tools/src/CMakeLists.txt | 36 +
 model_tools/src/caffe/CMakeLists.txt | 34 +
 .../src/caffe/caffe_adaptee.h | 856 +++--
 .../src/caffe/caffe_wrapper.cpp | 18 +-
 model_tools/src/data_type_converter.cpp | 424 ++
 model_tools/src/model_adaptee.h | 219 ++
 model_tools/src/model_quantization.cpp | 96 +
 .../src/model_tools.cpp | 120 +-
 model_tools/src/online_conversion.cpp | 91 +
 model_tools/src/onnx/CMakeLists.txt | 28 +
 model_tools/src/onnx/onnx_adaptee.h | 1768 +++++++++
 .../src/onnx/onnx_wrapper.cpp | 16 +-
 model_tools/src/tensorflow/CMakeLists.txt | 24 +
 .../src/tensorflow/tensorflow_adaptee.h | 977 +++++
 .../src/tensorflow/tensorflow_wrapper.cpp | 25 +
 model_tools/src/tflite/CMakeLists.txt | 22 +
 model_tools/src/tflite/tflite_adaptee.h | 1455 +++++++
 .../src/tflite/tflite_wrapper.cpp | 18 +-
 model_tools/tools/CMakeLists.txt | 10 +
 model_tools/tools/X2bolt/X2bolt.cpp | 138 +
 .../tools/pytorch2caffe/README.md | 0
 .../tools/pytorch2caffe/lenet.py | 0
 .../post_training_quantization.cpp | 177 +
 .../tools/tensorflow2caffe/Caffe/__init__.py | 0
 .../tools/tensorflow2caffe/Caffe/caffe_net.py | 3 +
 .../tensorflow2caffe/Caffe/layer_parameter.py | 82 +-
 .../tools/tensorflow2caffe/README.md | 2 +-
 .../asr/convolution_transformer_params.py | 245 ++
 .../asr/convolution_transformer_params_v2.py | 245 ++
 ...ensorflow2caffe_convolution_transformer.py | 1297 +++++++
 ...low2caffe_convolution_transformer_keras.py | 420 ++
 .../asr/tensorflow2caffe_rnnt.py | 319 ++
 .../asr/transform_convolution_transformer.py | 55 +
 ...transform_convolution_transformer_keras.py | 42 +
 .../tensorflow2caffe/asr/transform_rnnt.py | 35 +
 .../bert/albert/tensorflow2caffe_albert.py | 0
 .../bert/albert/transform_albert.py | 0
 .../bert/tensorflow2caffe_bert.py | 2 +-
 .../tensorflow2caffe/bert/tinybert/adb_run.sh | 2 +-
 .../tensorflow2caffe/bert/tinybert/result.txt | 0
 .../bert/tinybert/sequence.seq | 0
 .../tinybert/tensorflow2caffe_tinybert.py | 51 +-
 .../bert/tinybert/tinybert-infer.py | 0
 .../bert/tinybert/tokenization.py | 0
 .../bert/tinybert/transform_bert.py | 0
 .../transform_tinybert_disambiguate.py | 4 +-
 .../transform_tinybert_intent_slot.py | 0
 .../bert/tinybert/transform_tinybert_mrpc.py | 0
 .../transform_tinybert_tts_preprocess.py | 31 +
 .../tensorflow2caffe/bert/transform_bert.py | 0
 .../nmt/tensorflow2caffe_transformer_lstm.py | 661 ++++
 .../nmt/tensorflow2caffe_transformer_tsc.py | 709 ++++
 .../nmt/transform_transformer_lstm.py | 41 +
 .../nmt/transform_transformer_tsc.py | 41 +
 .../tools/tensorflow2caffe/operators.py | 116 +-
 .../tensorflow2caffe_punctuation.py | 51 +
 .../punctuation/transform_punctuation.py | 18 +
 .../tools/tensorflow2caffe/requirements.txt | 0
 .../rotation/tensorflow2caffe_rotation.py | 45 +
 .../rotation/transform_rotation.py | 18 +
 .../tensorflow2caffe/tensorflow2caffe.py | 229 +-
 .../tts/tensorflow2caffe_tactron2.py | 520 +++
 .../tts/transform_tactron2.py | 73 +
 model_tools/tools/tensorflow2json/tf2json.py | 55 +
 scripts/build_light_bolt.sh | 120 +-
 scripts/params/alexnet_convolution.csv | 6 -
 scripts/params/argmax.csv | 5 -
 scripts/params/bnn_convolution.csv | 53 -
 scripts/params/check.csv | 3 -
 scripts/params/convolution.csv | 9 -
 scripts/params/deconvolution.csv | 2 -
 scripts/params/dilated_convolution.csv | 5 -
 scripts/params/googlenet_convolution.csv | 58 -
 scripts/params/lenet_convolution.csv | 3 -
 .../mobilenetv1_depthwise_convolution.csv | 14 -
 .../mobilenetv2_depthwise_convolution.csv | 18 -
 scripts/params/mobilenetv3_convolution.csv | 33 -
 .../mobilenetv3_depthwise_convolution.csv | 16 -
 scripts/params/multiply.csv | 3 -
 scripts/params/reduction.csv | 5 -
 scripts/params/resnet50_convolution.csv | 54 -
 scripts/push_third_party.sh | 31 +-
 .../quick_benchmark.sh | 67 +-
 tensor_computing/include/tensor_computing.h | 360 --
 .../include/tensor_computing_type.h | 160 -
 tensor_computing/src/activation.cpp | 65 -
 tensor_computing/src/argmax.cpp | 55 -
 .../src/bilateral_slice_apply.cpp | 69 -
 tensor_computing/src/clip.cpp | 65 -
 tensor_computing/src/concat.cpp | 105 -
 tensor_computing/src/convolution.cpp | 211 -
 tensor_computing/src/cpu/arm/activation.cpp | 60 -
 tensor_computing/src/cpu/arm/argmax.cpp | 85 -
 tensor_computing/src/cpu/arm/arm_functions.h | 193 -
 .../src/cpu/arm/bnn/convolution.cpp | 109 -
 .../src/cpu/arm/bnn/convolution_dorefa.h | 83 -
 .../cpu/arm/bnn/convolution_dorefa_A55.cpp | 777 ----
 .../cpu/arm/bnn/convolution_dorefa_A76.cpp | 757 ----
 .../src/cpu/arm/bnn/convolution_xnor.h | 83 -
 .../src/cpu/arm/bnn/convolution_xnor_A55.cpp | 797 ----
 .../src/cpu/arm/bnn/convolution_xnor_A76.cpp | 776 ----
 tensor_computing/src/cpu/arm/concat.cpp | 117 -
 tensor_computing/src/cpu/arm/convolution.cpp | 399 --
 .../src/cpu/arm/deconvolution.cpp | 173 -
 .../src/cpu/arm/depthwise_convolution.cpp | 273 --
 .../src/cpu/arm/detectionoutput.cpp | 119 -
 tensor_computing/src/cpu/arm/eltwise.cpp | 133 -
 .../src/cpu/arm/fp16/convolution.cpp | 133 -
 .../src/cpu/arm/fp16/convolution_gemm.h | 75 -
 .../src/cpu/arm/fp16/convolution_gemm_A55.cpp | 1024 -----
 .../src/cpu/arm/fp16/convolution_gemm_A76.cpp | 943 -----
 .../cpu/arm/fp16/convolution_gemm_icnchw.h | 76 -
 .../arm/fp16/convolution_gemm_icnchw_A55.cpp | 1045 -----
 .../arm/fp16/convolution_gemm_icnchw_A76.cpp | 963 -----
 .../src/cpu/arm/fp16/convolution_winograd.h | 74 -
 .../cpu/arm/fp16/convolution_winograd_A55.cpp | 865 -----
 .../cpu/arm/fp16/convolution_winograd_A76.cpp | 733 ----
 .../src/cpu/arm/fp16/deconvolution.cpp | 218 --
 .../cpu/arm/fp16/deconvolution_transform.cpp | 157 -
 .../cpu/arm/fp16/depthwise_convolution.cpp | 99 -
 .../arm/fp16/depthwise_convolution_direct.h | 76 -
 .../fp16/depthwise_convolution_direct_A55.cpp | 500 ---
 .../fp16/depthwise_convolution_direct_A76.cpp | 500 ---
 .../fp16/depthwise_convolution_transform.cpp | 126 -
 .../depthwise_pointwise_convolution_3x3s1p1.h | 80 -
 .../depthwise_pointwise_convolution_direct.h | 82 -
 ...hwise_pointwise_convolution_direct_A55.cpp | 1442 -------
 ...hwise_pointwise_convolution_direct_A76.cpp | 1360 -------
 .../src/cpu/arm/fp16/detectionoutput.cpp | 139 -
 tensor_computing/src/cpu/arm/fp16/lstm.cpp | 265 --
 tensor_computing/src/cpu/arm/fp16/pooling.cpp | 88 -
 .../src/cpu/arm/fp16/priorbox.cpp | 134 -
 .../src/cpu/arm/fp16/tensor_computing_fp16.h | 138 -
 .../src/cpu/arm/fp32/convolution.cpp | 151 -
 .../src/cpu/arm/fp32/convolution_gemm_V7.cpp | 631 ---
 .../src/cpu/arm/fp32/convolution_gemm_V8.cpp | 1047 -----
 .../arm/fp32/convolution_gemm_icnchw_V8.cpp | 874 -----
 .../src/cpu/arm/fp32/deconvolution.cpp | 223 --
 .../cpu/arm/fp32/deconvolution_transform.cpp | 124 -
 .../cpu/arm/fp32/depthwise_convolution.cpp | 97 -
 .../src/cpu/arm/fp32/depthwise_convolution.h | 63 -
 .../fp32/depthwise_convolution_direct_V7.cpp | 634 ---
 .../fp32/depthwise_convolution_direct_V8.cpp | 666 ----
 .../fp32/depthwise_convolution_transform.cpp | 119 -
 ...thwise_pointwise_convolution_direct_V7.cpp | 717 ----
 ...thwise_pointwise_convolution_direct_V8.cpp | 1268 ------
 .../src/cpu/arm/fp32/detectionoutput.cpp | 140 -
 tensor_computing/src/cpu/arm/fp32/lstm.cpp | 337 --
 tensor_computing/src/cpu/arm/fp32/pooling.cpp | 96 -
 .../src/cpu/arm/fp32/priorbox.cpp | 133 -
 .../src/cpu/arm/fp32/tensor_computing_fp32.h | 179 -
 .../src/cpu/arm/int8/convolution.cpp | 127 -
 .../src/cpu/arm/int8/convolution_gemm.h | 486 ---
 .../src/cpu/arm/int8/convolution_winograd.h | 178 -
 .../cpu/arm/int8/convolution_winograd_A55.cpp | 1472 -------
 .../cpu/arm/int8/convolution_winograd_A76.cpp | 1427 -------
 .../int8/depthwise_convolution_transform.cpp | 119 -
 ...depthwise_pointwise_convolution_direct.cpp | 1910 ---------
 tensor_computing/src/cpu/arm/int8/pooling.cpp | 126 -
 .../src/cpu/arm/int8/tensor_computing_int8.h | 71 -
 tensor_computing/src/cpu/arm/lstm.cpp | 272 --
 tensor_computing/src/cpu/arm/multiply.cpp | 30 -
 tensor_computing/src/cpu/arm/padding.cpp | 94 -
 tensor_computing/src/cpu/arm/pooling.cpp | 60 -
 tensor_computing/src/cpu/arm/reduction.cpp | 124 -
 tensor_computing/src/cpu/arm/reshape.cpp | 57 -
 .../src/cpu/arm/tensor_computing_arm.h | 188 -
 tensor_computing/src/cpu/arm/transpose.cpp | 93 -
 .../src/cpu/general/activation.cpp | 51 -
 tensor_computing/src/cpu/general/concat.cpp | 57 -
 .../src/cpu/general/convolution.cpp | 243 --
 .../src/cpu/general/deconvolution.cpp | 153 -
 .../src/cpu/general/depthwise_convolution.cpp | 172 -
 tensor_computing/src/cpu/general/eltwise.cpp | 161 -
 .../src/cpu/general/general_functions.h | 197 -
 tensor_computing/src/cpu/general/lstm.cpp | 284 --
 tensor_computing/src/cpu/general/multiply.cpp | 54 -
 tensor_computing/src/cpu/general/padding.cpp | 94 -
 tensor_computing/src/cpu/general/priorbox.cpp | 171 -
 .../src/cpu/general/reduction.cpp | 109 -
 tensor_computing/src/cpu/general/slice.cpp | 59 -
 .../cpu/general/tensor_computing_general.h | 128 -
 tensor_computing/src/deconvolution.cpp | 160 -
 tensor_computing/src/depth2space.cpp | 40 -
 .../src/depthwise_convolution.cpp | 223 --
 tensor_computing/src/detectionoutput.cpp | 75 -
 tensor_computing/src/eltwise.cpp | 96 -
 tensor_computing/src/embedding.cpp | 65 -
 tensor_computing/src/fully_connected.cpp | 318 --
 tensor_computing/src/get_output.cpp | 40 -
 tensor_computing/src/gpu/mali/activation.cpp | 97 -
 .../src/gpu/mali/bilateral_slice_apply.cpp | 186 -
 .../src/gpu/mali/cl/activation.cl | 262 --
 tensor_computing/src/gpu/mali/cl/concat.cl | 109 -
 .../mali/cl/conv_depthwise_trans_fltbuf.cl | 78 -
 .../mali/cl/conv_direct_s1_nchw_to_ncwhc4.cl | 108 -
 .../mali/cl/conv_direct_s2_nchw_to_ncwhc4.cl | 111 -
 .../src/gpu/mali/cl/conv_direct_spe_fwhs1.cl | 100 -
 .../gpu/mali/cl/conv_direct_trans_fltbuf.cl | 176 -
 .../gpu/mali/cl/conv_wino_trans_fltbuf_3x3.cl | 118 -
 .../src/gpu/mali/cl/conv_wino_trans_outbuf.cl | 204 -
 .../src/gpu/mali/cl/conv_wino_trans_picbuf.cl | 116 -
 .../src/gpu/mali/cl/depth2space.cl | 20 -
 tensor_computing/src/gpu/mali/cl/eltwise.cl | 120 -
 .../src/gpu/mali/cl/fc_trans_fltbuf.cl | 86 -
 tensor_computing/src/gpu/mali/cl/gemm_tn.cl | 118 -
 tensor_computing/src/gpu/mali/cl/kernel_def.h | 2163 -----------
 .../gpu/mali/cl/mem_trans_nchw_to_ncwhc4.cl | 68 -
 .../gpu/mali/cl/mem_trans_ncwhc4_to_nchw.cl | 70 -
 tensor_computing/src/gpu/mali/cl/softmax.cl | 65 -
 .../src/gpu/mali/cl/softmax_nchw_w.cl | 72 -
 tensor_computing/src/gpu/mali/clip.cpp | 82 -
 tensor_computing/src/gpu/mali/concat.cpp | 138 -
 tensor_computing/src/gpu/mali/convolution.cpp | 450 ---
 tensor_computing/src/gpu/mali/depth2space.cpp | 121 -
 .../src/gpu/mali/depthwise_convolution.cpp | 509 ---
 tensor_computing/src/gpu/mali/eltwise.cpp | 134 -
 tensor_computing/src/gpu/mali/embedding.cpp | 92 -
 .../gpu/mali/fp16/activation_mali_fp16.cpp | 103 -
 .../fp16/bilateral_slice_apply_mali_fp16.cpp | 124 -
 .../src/gpu/mali/fp16/concat_mali_fp16.cpp | 142 -
 .../fp16/convolution_direct_mali_fp16.cpp | 333 --
 .../mali/fp16/convolution_direct_mali_fp16.h | 56 -
 .../convolution_direct_spe_ck_mali_fp16.cpp | 185 -
 .../convolution_direct_spe_ck_mali_fp16.h | 56 -
 .../gpu/mali/fp16/convolution_mali_fp16.cpp | 178 -
 .../src/gpu/mali/fp16/convolution_mali_fp16.h | 56 -
 .../mali/fp16/convolution_wino_mali_fp16.cpp | 391 --
 .../mali/fp16/convolution_wino_mali_fp16.h | 58 -
 ...depthwise_convolution_direct_mali_fp16.cpp | 198 -
 .../depthwise_convolution_direct_mali_fp16.h | 56 -
 .../fp16/depthwise_convolution_mali_fp16.cpp | 162 -
 .../fp16/depthwise_convolution_mali_fp16.h | 56 -
 ...pointwise_convolution_direct_mali_fp16.cpp | 278 --
 ...e_pointwise_convolution_direct_mali_fp16.h | 57 -
 ...e_pointwise_convolution_gemm_mali_fp16.cpp | 279 --
 ...ise_pointwise_convolution_gemm_mali_fp16.h | 57 -
 .../src/gpu/mali/fp16/eltwise_mali_fp16.cpp | 138 -
 .../gpu/mali/fp16/fully_connected_mali_fp16.h | 51 -
 .../src/gpu/mali/fp16/matmul_mali_fp16.cpp | 148 -
 .../src/gpu/mali/fp16/matmul_mali_fp16.h | 41 -
 .../src/gpu/mali/fp16/multiply_mali_fp16.cpp | 80 -
 .../src/gpu/mali/fp16/pooling_mali_fp16.cpp | 128 -
 .../src/gpu/mali/fp16/reshape_mali_fp16.cpp | 118 -
 .../src/gpu/mali/fp16/softmax_mali_fp16.cpp | 117 -
 .../src/gpu/mali/fp16/transpose_mali_fp16.cpp | 84 -
 .../src/gpu/mali/fully_connected.cpp | 389 --
 .../src/gpu/mali/infer_gclmem_desc_mali.h | 451 ---
 tensor_computing/src/gpu/mali/matmul.cpp | 293 --
 tensor_computing/src/gpu/mali/multiply.cpp | 89 -
 tensor_computing/src/gpu/mali/pooling.cpp | 92 -
 tensor_computing/src/gpu/mali/reshape.cpp | 134 -
 tensor_computing/src/gpu/mali/softmax.cpp | 103 -
 tensor_computing/src/gpu/mali/space2depth.cpp | 122 -
 tensor_computing/src/gpu/mali/squeeze.cpp | 85 -
 .../gpu/mali/tensor_computing_get_output.cpp | 102 -
 .../src/gpu/mali/tensor_computing_mali.h | 482 ---
 .../gpu/mali/tensor_computing_set_input.cpp | 135 -
 tensor_computing/src/gpu/mali/transpose.cpp | 85 -
 .../bilateral_slice_apply_mali_uchar.cpp | 123 -
 .../src/library_algorithm_search.cpp | 118 -
 tensor_computing/src/lstm.cpp | 176 -
 tensor_computing/src/matmul.cpp | 298 --
 tensor_computing/src/multiply.cpp | 67 -
 tensor_computing/src/normalization.cpp | 64 -
 tensor_computing/src/padding.cpp | 56 -
 tensor_computing/src/pooling.cpp | 107 -
 tensor_computing/src/priorbox.cpp | 73 -
 tensor_computing/src/reduction.cpp | 67 -
 tensor_computing/src/reshape.cpp | 98 -
 tensor_computing/src/scale.cpp | 69 -
 tensor_computing/src/slice.cpp | 97 -
 tensor_computing/src/softmax.cpp | 66 -
 tensor_computing/src/space2depth.cpp | 40 -
 tensor_computing/src/squeeze.cpp | 40 -
 tensor_computing/src/transpose.cpp | 83 -
 tests/CMakeLists.txt | 84 -
 tests/test_activation.cpp | 80 -
 tests/test_api_c.c | 163 -
 tests/test_api_java.java | 640 ---
 tests/test_concat.cpp | 101 -
 tests/test_convolution.cpp | 171 -
 tests/test_convolution_bnn.cpp | 188 -
 tests/test_convolution_int8.cpp | 231 --
 tests/test_convolution_ocl.cpp | 240 --
 tests/test_deconvolution.cpp | 167 -
 tests/test_depthwise_convolution.cpp | 198 -
 tests/test_depthwise_convolution_int8.cpp | 166 -
 tests/test_depthwise_convolution_ocl.cpp | 341 --
 tests/test_detectionoutput.cpp | 122 -
 tests/test_dilated_convolution.cpp | 170 -
 tests/test_fully_connected.cpp | 95 -
 tests/test_fully_connected_ocl.cpp | 217 --
 tests/test_lstm.cpp | 107 -
 tests/test_multiply.cpp | 75 -
 tests/test_padding.cpp | 94 -
 tests/test_pipeline_ocl.cpp | 360 --
 tests/test_pooling.cpp | 119 -
 tests/test_pooling_int8.cpp | 144 -
 tests/test_reduction.cpp | 85 -
 third_party/install.sh | 174 +-
 tools/CMakeLists.txt | 87 -
 tools/caffe2bolt.cpp | 112 -
 tools/onnx2bolt.cpp | 106 -
 tools/ptq_calibration.cpp | 417 --
 tools/tensor_computing_library_search.cpp | 81 -
 tools/tflite2bolt.cpp | 95 -
 uni/include/error.h | 117 -
 uni/include/op_type.h | 168 -
 uni/include/tensor_desc.h | 315 --
 uni/include/thread_affinity.h | 431 ---
 uni/include/type.h | 154 -
 1573 files changed, 179340 insertions(+), 96155 deletions(-)

 create mode 100644 CI_SCRIPTS/CPPLINT.cfg
 create mode 100644 CI_SCRIPTS/benchmark_verify.sh
 create mode 100644 CI_SCRIPTS/benchmark_verify_serial.sh
 create mode 100644 CI_SCRIPTS/dir_cpplint.sh
 create mode 100644 CI_SCRIPTS/format_code.sh
 create mode 100644 CI_SCRIPTS/genCommandLines.sh
 create mode 100644 CI_SCRIPTS/inference_big.txt
 create mode 100644 CI_SCRIPTS/inference_serial.txt
 create mode 100644 CI_SCRIPTS/inference_small.txt
 create mode 100644 CI_SCRIPTS/java_api_test.sh
 create mode 100644 CI_SCRIPTS/model_tools_test.sh
 rename {scripts => CI_SCRIPTS}/operator_driver.sh (73%)
 rename {scripts => CI_SCRIPTS}/operator_test.sh (73%)
 rename {scripts => CI_SCRIPTS}/params/activation.csv (100%)
 create mode 100644 CI_SCRIPTS/params/alexnet_convolution.csv
 create mode 100644 CI_SCRIPTS/params/argmax.csv
 rename {scripts => CI_SCRIPTS}/params/attention.csv (100%)
 create mode 100644 CI_SCRIPTS/params/bnn_convolution.csv
 create mode 100644 CI_SCRIPTS/params/check.csv
 rename {scripts => CI_SCRIPTS}/params/clip.csv (100%)
 rename {scripts => CI_SCRIPTS}/params/concat.csv (70%)
 create mode 100644 CI_SCRIPTS/params/convolution.csv
 create mode 100644 CI_SCRIPTS/params/deconvolution.csv
 create mode 100644 CI_SCRIPTS/params/detectionoutput.csv
 create mode 100644 CI_SCRIPTS/params/dilated_convolution.csv
 rename {scripts => CI_SCRIPTS}/params/eltwise.csv (100%)
 create mode 100644 CI_SCRIPTS/params/googlenet_convolution.csv
 create mode 100644 CI_SCRIPTS/params/l2normalization.csv
 create mode 100644 CI_SCRIPTS/params/lenet_convolution.csv
 rename {scripts => CI_SCRIPTS}/params/lenet_fully_connected.csv (100%)
 rename {scripts => CI_SCRIPTS}/params/mmm.csv (100%)
 create mode 100644 CI_SCRIPTS/params/mobilenetv1_depthwise_convolution.csv
 create mode 100644 CI_SCRIPTS/params/mobilenetv2_depthwise_convolution.csv
 create mode 100644 CI_SCRIPTS/params/mobilenetv3_convolution.csv
 create mode 100644 CI_SCRIPTS/params/mobilenetv3_depthwise_convolution.csv
 rename {scripts => CI_SCRIPTS}/params/mvm.csv (100%)
 create mode 100644 CI_SCRIPTS/params/non_max_suppression.csv
 create mode 100644 CI_SCRIPTS/params/normalization.csv
 create mode 100644 CI_SCRIPTS/params/padding.csv
 rename {scripts => CI_SCRIPTS}/params/pipeline.csv (100%)
 rename {scripts => CI_SCRIPTS}/params/pooling.csv (100%)
 create mode 100644 CI_SCRIPTS/params/pooling_bp.csv
 create mode 100644 CI_SCRIPTS/params/power.csv
 create mode 100644 CI_SCRIPTS/params/prelu.csv
 create mode 100644 CI_SCRIPTS/params/priorbox.csv
 create mode 100644 CI_SCRIPTS/params/reduction.csv
 rename {scripts => CI_SCRIPTS}/params/reshape.csv (100%)
 create mode 100644 CI_SCRIPTS/params/resnet50_convolution.csv
 rename scripts/params/lstm.csv => CI_SCRIPTS/params/rnn.csv (100%)
 create mode 100644 CI_SCRIPTS/params/roialign.csv
 rename {scripts => CI_SCRIPTS}/params/scale.csv (100%)
 rename {scripts => CI_SCRIPTS}/params/slice.csv (100%)
 rename {scripts => CI_SCRIPTS}/params/softmax.csv (100%)
 rename {scripts => CI_SCRIPTS}/params/split.csv (100%)
 create mode 100644 CI_SCRIPTS/params/tile.csv
 rename {scripts => CI_SCRIPTS}/params/transpose.csv (100%)
 create mode 100644 CI_SCRIPTS/parseAndExeCommands.sh
 create mode 100644 CI_SCRIPTS/transExecutors.sh
 delete mode 100644 THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.md
 delete mode 100644 blas-enhance/include/blas-enhance.h
 delete mode 100644 blas-enhance/src/CMakeLists.txt
 delete mode 100644 blas-enhance/src/cpu/arm/fp16/mmm_A55.cpp
 delete mode 100644 blas-enhance/src/cpu/arm/fp16/mmm_A76.cpp
 delete mode 100644 blas-enhance/src/cpu/arm/fp16/mvm_A55.cpp
 delete mode 100644 blas-enhance/src/cpu/arm/fp16/mvm_A76.cpp
 delete mode 100644 blas-enhance/src/cpu/arm/fp16/mvm_common.h
 delete mode 100644 blas-enhance/src/cpu/arm/fp32/mvm_row_V8.cpp
 delete mode 100644 blas-enhance/src/cpu/arm/int8/mmm_A55.cpp
 delete mode 100644 blas-enhance/src/cpu/arm/int8/mmm_A76.cpp
 delete mode 100644 blas-enhance/src/cpu/arm/int8/mmm_common.h
 delete mode 100644 blas-enhance/src/cpu/arm/int8/mvm.h
 delete mode 100644 blas-enhance/src/mmm.cpp
 delete mode 100644 bolt.cmake
 delete mode 100644 cmakes/FindBlasEnhance.cmake
 delete mode 100644 cmakes/FindGcl.cmake
 delete mode 100644 cmakes/FindImage.cmake
 delete mode 100644 cmakes/FindInference.cmake
 delete mode 100644 cmakes/FindModelTools.cmake
 delete mode 100644 cmakes/FindModelToolsCaffe.cmake
 delete mode 100644 cmakes/FindModelToolsOnnx.cmake
 delete mode 100644 cmakes/FindModelToolsTFLite.cmake
 delete mode 100644 cmakes/FindTFLite.cmake
 delete mode 100644 cmakes/FindTensorComputing.cmake
 delete mode 100644 cmakes/FindUni.cmake
 create mode 100644 common/CMakeLists.txt
 create mode 100644 common/cmakes/FindFFTW.cmake
 rename {cmakes => common/cmakes}/FindFlatBuffers.cmake (88%)
 create mode 100644 common/cmakes/FindGcl.cmake
 create mode 100644 common/cmakes/FindJNI.cmake
 rename {cmakes => common/cmakes}/FindOpenCL.cmake (77%)
 rename {cmakes => common/cmakes}/FindProtobuf.cmake (84%)
 create mode 100644 common/cmakes/FindTFLite.cmake
 rename {cmakes => common/cmakes}/Findjpeg.cmake (53%)
 create mode 100644 common/cmakes/Findjsoncpp.cmake
 create mode 100644 common/cmakes/bolt.cmake
 create mode 100644 common/gcl/CMakeLists.txt
 create mode 100644 common/gcl/include/context.h
 create mode 100644 common/gcl/include/event.h
 rename {gcl => common/gcl}/include/gcl.h (83%)
 create mode 100644 common/gcl/include/gcl_common.h
 create mode 100644 common/gcl/include/gcl_engine.h
 create mode 100644 common/gcl/include/gcl_func.h
 create mode 100644 common/gcl/include/gcl_kernel_binmap.h
 create mode 100644 common/gcl/include/gcl_kernel_source.h
 create mode 100644 common/gcl/include/gcl_kernel_type.h
 create mode 100644 common/gcl/include/gclmem_desc_infer.h
 create mode 100644 common/gcl/include/kernel.h
 create mode 100644 common/gcl/include/memory.h
 create mode 100644 common/gcl/include/ocl_context.h
 create mode 100644 common/gcl/include/ocl_data_alloc.h
 create mode 100644 common/gcl/include/ocl_data_trans.h
 create mode 100644 common/gcl/include/ocl_desc_trans.h
 create mode 100644 common/gcl/include/platform.h
 create mode 100644 common/gcl/include/program.h
 create mode 100644 common/gcl/src/CMakeLists.txt
 create mode 100644 common/gcl/src/ocl_context.cpp
 create mode 100644 common/gcl/src/ocl_data_trans.cpp
 rename {gcl => common/gcl}/tools/device_info/CMakeLists.txt (71%)
 rename {gcl => common/gcl}/tools/device_info/clinfo.cpp (81%)
 rename {gcl => common/gcl}/tools/gcl_sample/CMakeLists.txt (55%)
 create mode 100644 common/gcl/tools/gcl_sample/build.sh
 rename {gcl => common/gcl}/tools/gcl_sample/cl/sample.cl (75%)
 create mode 100644 common/gcl/tools/gcl_sample/sample.cpp
 rename {gcl => common/gcl}/tools/kernel_lib_compile/CMakeLists.txt (58%)
 rename {gcl => common/gcl}/tools/kernel_lib_compile/buildKernelLib.sh (100%)
 rename {gcl => common/gcl}/tools/kernel_lib_compile/device_name/CMakeLists.txt (68%)
 rename {gcl => common/gcl}/tools/kernel_lib_compile/device_name/device_name.cpp (74%)
 rename {gcl => common/gcl}/tools/kernel_lib_compile/kernel_bin/CMakeLists.txt (67%)
 rename {gcl => common/gcl}/tools/kernel_lib_compile/kernel_bin/clbinary.cpp (59%)
 rename {gcl => common/gcl}/tools/kernel_lib_compile/kernel_bin2char/bin2char.cpp (76%)
 create mode 100644 common/gcl/tools/kernel_lib_compile/sh/adbDeviceNum.sh
 rename {gcl => common/gcl}/tools/kernel_lib_compile/sh/buildKernelBin.sh (54%)
 rename {gcl => common/gcl}/tools/kernel_lib_compile/sh/buildKernelLibConfig.sh (84%)
 rename {gcl => common/gcl}/tools/kernel_lib_compile/sh/compile/activation.sh (100%)
 create mode 100644 common/gcl/tools/kernel_lib_compile/sh/compile/argmax_x.sh
 rename {gcl => common/gcl}/tools/kernel_lib_compile/sh/compile/bilateral_slice_apply_c12.sh (100%)
 create mode 100644 common/gcl/tools/kernel_lib_compile/sh/compile/channel_resize.sh
 rename {gcl => common/gcl}/tools/kernel_lib_compile/sh/compile/common.sh (50%)
 create mode 100644 common/gcl/tools/kernel_lib_compile/sh/compile/concat.sh
 rename {gcl => common/gcl}/tools/kernel_lib_compile/sh/compile/conv_depthwise_s1.sh (52%)
 rename {gcl => common/gcl}/tools/kernel_lib_compile/sh/compile/conv_depthwise_s2.sh (65%)
 rename {gcl => common/gcl}/tools/kernel_lib_compile/sh/compile/conv_depthwise_trans_fltbuf.sh (100%)
 create mode 100644 common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_3d_sw1_nchw_to_ncwhc4.sh
 create mode 100644 common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_3d_sw2_nchw_to_ncwhc4.sh
 create mode 100644 common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1.sh
 create mode 100644 common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1_fn_spe.sh
 rename {gcl => common/gcl}/tools/kernel_lib_compile/sh/compile/conv_direct_s1_nchw_to_ncwhc4.sh (53%)
 rename {gcl => common/gcl}/tools/kernel_lib_compile/sh/compile/conv_direct_s1_spe_f1c3k1.sh (100%)
 create mode 100644 common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s2.sh
 rename {gcl => common/gcl}/tools/kernel_lib_compile/sh/compile/conv_direct_s2_nchw_to_ncwhc4.sh (53%)
 create mode 100644 common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_spe_fwhs1.sh
 rename {gcl => common/gcl}/tools/kernel_lib_compile/sh/compile/conv_direct_trans_fltbuf.sh (83%)
 create mode 100644 common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_wh_s1.sh
 create mode 100644 common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_wh_s2.sh
 rename {gcl => common/gcl}/tools/kernel_lib_compile/sh/compile/conv_wino_gemm36_tn.sh (100%)
 rename {gcl => common/gcl}/tools/kernel_lib_compile/sh/compile/conv_wino_rotate_fltbuf.sh (100%)
 rename {gcl => common/gcl}/tools/kernel_lib_compile/sh/compile/conv_wino_trans_outbuf.sh (100%)
 rename {gcl => common/gcl}/tools/kernel_lib_compile/sh/compile/conv_wino_trans_picbuf_left.sh (100%)
 rename {gcl => common/gcl}/tools/kernel_lib_compile/sh/compile/conv_wino_trans_picbuf_right.sh (100%)
 create mode 100644 common/gcl/tools/kernel_lib_compile/sh/compile/copy.sh
 create mode 100644 common/gcl/tools/kernel_lib_compile/sh/compile/deconv_gemm_f2s2.sh
 create mode 100644 common/gcl/tools/kernel_lib_compile/sh/compile/deconv_gemm_trans_fltbuf.sh
 create mode 100644 common/gcl/tools/kernel_lib_compile/sh/compile/eltwise.sh
 create mode 100644 common/gcl/tools/kernel_lib_compile/sh/compile/eltwise_broadcast.sh
 create mode 100644 common/gcl/tools/kernel_lib_compile/sh/compile/eltwise_spe_nchw_c.sh
 rename {gcl => common/gcl}/tools/kernel_lib_compile/sh/compile/fc_trans_fltbuf.sh (100%)
 create mode 100644 common/gcl/tools/kernel_lib_compile/sh/compile/fill_memory_zero.sh
 create mode 100644 common/gcl/tools/kernel_lib_compile/sh/compile/fill_memory_zero_vec4.sh
 rename {gcl => common/gcl}/tools/kernel_lib_compile/sh/compile/gemm_nt.sh (100%)
 create mode 100644 common/gcl/tools/kernel_lib_compile/sh/compile/gemm_tn.sh
 create mode 100644 common/gcl/tools/kernel_lib_compile/sh/compile/mem_trans_nchw_to_ncwhc4.sh
 create mode 100644 common/gcl/tools/kernel_lib_compile/sh/compile/mem_trans_ncwhc4_to_nchw.sh
 create mode 100644 common/gcl/tools/kernel_lib_compile/sh/compile/mem_trans_ncwhc4_to_ncwhc4.sh
 create mode 100644 common/gcl/tools/kernel_lib_compile/sh/compile/normalization.sh
 create mode 100644 common/gcl/tools/kernel_lib_compile/sh/compile/power.sh
 create mode 100644 common/gcl/tools/kernel_lib_compile/sh/compile/prelu.sh
 rename {gcl => common/gcl}/tools/kernel_lib_compile/sh/compile/sample.sh (100%)
 rename {gcl => common/gcl}/tools/kernel_lib_compile/sh/compile/scale.sh (59%)
 rename {gcl => common/gcl}/tools/kernel_lib_compile/sh/compile/slice_h.sh (100%)
 create mode 100644 common/gcl/tools/kernel_lib_compile/sh/compile/transpose_nchw.sh
 rename {gcl => common/gcl}/tools/kernel_lib_compile/sh/packKernelBin.sh (97%)
 rename {gcl => common/gcl}/tools/kernel_lib_compile/sh/sh.config (100%)
 create mode 100644 common/gcl/tools/kernel_source_compile/CMakeLists.txt
 create mode 100644 common/gcl/tools/kernel_source_compile/buildKernelSourceLib.sh
 create mode 100644 common/gcl/tools/kernel_source_compile/kernel_cl2char/cl2char.cpp
 create mode 100644 common/memory/include/memory.hpp
 create mode 100644 common/memory/include/memory_cpu.hpp
 create mode 100644 common/memory/include/memory_ocl.hpp
 create mode 100644 common/memory/include/tensor.hpp
 create mode 100644 common/uni/CMakeLists.txt
 create mode 100644 common/uni/include/algorithm_map.h
 rename {uni => common/uni}/include/arm_neon_expand.h (59%)
 create mode 100644 common/uni/include/error.h
 create mode 100644 common/uni/include/graph.h
 rename {model-tools => common/uni}/include/model_print.h (84%)
 create mode 100644 common/uni/include/model_serialize_deserialize.hpp
 create mode 100644 common/uni/include/op_type.h
 create mode 100644 common/uni/include/parse_command.h
 create mode 100644 common/uni/include/profiling.h
 create mode 100644 common/uni/include/schedule.h
 create mode 100644 common/uni/include/sys.h
 create mode 100644 common/uni/include/task.h
 create mode 100644 common/uni/include/tensor_desc.h
 create mode 100644 common/uni/include/thread_affinity.h
 create mode 100644 common/uni/include/types.h
 rename {uni => common/uni}/include/ut_util.h (58%)
 create mode 100644 common/uni/include/x86_avx2_expand.h
 create mode 100644 common/uni/src/CMakeLists.txt
 rename {model-tools => common/uni}/src/model_deserialize.cpp (56%)
 create mode 100644 common/uni/src/model_print.cpp
 rename {model-tools => common/uni}/src/model_serialize.cpp (56%)
 create mode 100644 common/uni/src/profiling.cpp
 create mode 100644 common/uni/src/tensor_desc.cpp
 create mode 100644 common/uni/src/types.cpp
 create mode 100644 common/uni/src/uni.cpp
 create mode 100644 compute/CMakeLists.txt
 create mode 100644 compute/blas_enhance/CMakeLists.txt
 create mode 100644 compute/blas_enhance/include/blas_enhance.h
 create mode 100644 compute/blas_enhance/src/CMakeLists.txt
 create mode 100644 compute/blas_enhance/src/axpby.cpp
 create mode 100644 compute/blas_enhance/src/cpu/arm/axpby.cpp
 rename {blas-enhance => compute/blas_enhance}/src/cpu/arm/blas_arm.h (53%)
 create mode 100644 compute/blas_enhance/src/cpu/arm/fp16/axpby.cpp
 rename {blas-enhance => compute/blas_enhance}/src/cpu/arm/fp16/blas_fp16.h (62%)
 rename {blas-enhance => compute/blas_enhance}/src/cpu/arm/fp16/mmm.cpp (71%)
 rename {blas-enhance/src/cpu/arm/int8 => compute/blas_enhance/src/cpu/arm/fp16}/mmm.h (71%)
 create mode 100644 compute/blas_enhance/src/cpu/arm/fp16/mmm_A55.cpp
 create mode 100644 compute/blas_enhance/src/cpu/arm/fp16/mmm_A76.cpp
 rename {blas-enhance => compute/blas_enhance}/src/cpu/arm/fp16/mmm_common.h (63%)
 create mode 100644 compute/blas_enhance/src/cpu/arm/fp16/mvm.cpp
 rename {blas-enhance => compute/blas_enhance}/src/cpu/arm/fp16/mvm.h (72%)
 create mode 100644 compute/blas_enhance/src/cpu/arm/fp16/mvm_A55.cpp
 create mode 100644 compute/blas_enhance/src/cpu/arm/fp16/mvm_A76.cpp
 create mode 100644 compute/blas_enhance/src/cpu/arm/fp16/mvm_common.h
 create mode 100644 compute/blas_enhance/src/cpu/arm/fp32/axpby.cpp
 create mode 100644 compute/blas_enhance/src/cpu/arm/fp32/blas_fp32.h
 rename {blas-enhance => compute/blas_enhance}/src/cpu/arm/fp32/mmm_V7.cpp (57%)
 rename {blas-enhance => compute/blas_enhance}/src/cpu/arm/fp32/mmm_V8.cpp (87%)
 create mode 100644 compute/blas_enhance/src/cpu/arm/fp32/mvm.cpp
 rename blas-enhance/src/cpu/arm/fp32/mvm_col_V8.cpp => compute/blas_enhance/src/cpu/arm/fp32/mvm_col.cpp (70%)
 create mode 100644 compute/blas_enhance/src/cpu/arm/fp32/mvm_row.cpp
 rename {blas-enhance => compute/blas_enhance}/src/cpu/arm/int8/blas_int8.h (63%)
 rename {blas-enhance => compute/blas_enhance}/src/cpu/arm/int8/mmm.cpp (73%)
 create mode 100644 compute/blas_enhance/src/cpu/arm/int8/mmm.h
 create mode 100644 compute/blas_enhance/src/cpu/arm/int8/mmm_A55.cpp
 create mode 100644 compute/blas_enhance/src/cpu/arm/int8/mmm_A76.cpp
 create mode 100644 compute/blas_enhance/src/cpu/arm/int8/mmm_common.h
 create mode 100644 compute/blas_enhance/src/cpu/arm/int8/mvm.cpp
 create mode 100644 compute/blas_enhance/src/cpu/arm/int8/mvm.h
 rename {blas-enhance => compute/blas_enhance}/src/cpu/arm/mmm.cpp (63%)
 rename {blas-enhance => compute/blas_enhance}/src/cpu/arm/mvm.cpp (52%)
 create mode 100644 compute/blas_enhance/src/cpu/general/axpby.cpp
 rename {blas-enhance => compute/blas_enhance}/src/cpu/general/blas_general.h (62%)
 rename {blas-enhance => compute/blas_enhance}/src/cpu/general/mmm.cpp (65%)
 rename {blas-enhance => compute/blas_enhance}/src/cpu/general/mvm.cpp (70%)
 create mode 100644 compute/blas_enhance/src/cpu/x86/blas_x86.h
 rename {blas-enhance/src/cpu/arm => compute/blas_enhance/src/cpu/x86}/fp32/blas_fp32.h (57%)
 create mode 100644 compute/blas_enhance/src/cpu/x86/fp32/mmm_avx2.cpp
 create mode 100644 compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_col.cpp
 create mode 100644 compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_row.cpp
 create mode 100644 compute/blas_enhance/src/cpu/x86/mmm.cpp
 create mode 100644 compute/blas_enhance/src/cpu/x86/mvm.cpp
 create mode 100644 compute/blas_enhance/src/mmm.cpp
 rename {blas-enhance => compute/blas_enhance}/src/mvm.cpp (51%)
 create mode 100644 compute/blas_enhance/tests/.CMakeLists.txt.swp
 create mode 100644 compute/blas_enhance/tests/CMakeLists.txt
 rename {tests => compute/blas_enhance/tests}/test_mmm.cpp (67%)
 rename {tests => compute/blas_enhance/tests}/test_mmm_int8.cpp (66%)
 rename {tests => compute/blas_enhance/tests}/test_mvm.cpp (67%)
 rename {tests => compute/blas_enhance/tests}/test_mvm_int8.cpp (60%)
 rename {image => compute/image}/CMakeLists.txt (61%)
 create mode 100644 compute/image/include/image.h
 rename {image => compute/image}/include/image_processing.hpp (76%)
 create mode 100644 compute/image/src/CMakeLists.txt
 rename {image => compute/image}/src/cpu/arm/image_arm.h (79%)
 rename {image => compute/image}/src/cpu/arm/resize_bilinear.cpp (67%)
 rename {image => compute/image}/src/cpu/general/image_general.h (68%)
 rename {image => compute/image}/src/cpu/general/resize_bilinear.cpp (78%)
 create mode 100644 compute/image/src/gpu/mali/cl/resize_bilinear.cl
 create mode 100644 compute/image/src/gpu/mali/cl/resize_bilinear_nchw.cl
 create mode 100644 compute/image/src/gpu/mali/fp16/resize_bilinear_mali_fp16.cpp
 create mode 100644 compute/image/src/gpu/mali/fp16/resize_bilinear_mali_fp16.h
 create mode 100644 compute/image/src/gpu/mali/image_mali.h
 create mode 100644 compute/image/src/gpu/mali/resize_bilinear.cpp
 rename {image => compute/image}/src/image_processing.cpp (69%)
 create mode 100644 compute/image/src/resize.cpp
 create mode 100644 compute/image/tests/CMakeLists.txt
 rename {tests => compute/image/tests}/test_image_processing.cpp (81%)
 rename {tests => compute/image/tests}/test_image_resize.cpp (68%)
 create mode 100644 compute/image/tests/test_image_resize_ocl.cpp
 create mode 100644 compute/tensor/CMakeLists.txt
 create mode 100644 compute/tensor/include/tensor_computing.h
 rename {tensor_computing => compute/tensor}/include/tensor_computing_library_algorithm_search.h (84%)
 create mode 100644 compute/tensor/include/tensor_computing_type.h
 rename {tensor_computing => compute/tensor}/src/CMakeLists.txt (54%)
 create mode 100644 compute/tensor/src/activation.cpp
 create mode 100644 compute/tensor/src/argmax.cpp
 rename {tensor_computing => compute/tensor}/src/attention.cpp (52%)
 rename {tensor_computing => compute/tensor}/src/attention_mask.cpp (50%)
 create mode 100644 compute/tensor/src/bilateral_slice_apply.cpp
 create mode 100644 compute/tensor/src/channel_resize.cpp
 create mode 100644 compute/tensor/src/check.cpp
 create mode 100644 compute/tensor/src/clip.cpp
 create mode 100644 compute/tensor/src/concat.cpp
 create mode 100644 compute/tensor/src/convolution.cpp
 create mode 100644 compute/tensor/src/copy.cpp
 create mode 100644 compute/tensor/src/cpu/activation.cpp
 rename {tensor_computing/src/cpu/general => compute/tensor/src/cpu}/argmax.cpp (62%)
 create mode 100644 compute/tensor/src/cpu/arm/arm_functions.h
 rename {tensor_computing => compute/tensor}/src/cpu/arm/attention.cpp (80%)
 rename tensor_computing/src/cpu/arm/normalization.cpp => compute/tensor/src/cpu/arm/attention_mask.cpp (72%)
 create mode 100644 compute/tensor/src/cpu/arm/bnn/convolution.cpp
 create mode 100644 compute/tensor/src/cpu/arm/bnn/convolution_dorefa.h
 create mode 100644 compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A55.cpp
 create mode 100644 compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A76.cpp
 rename {tensor_computing => compute/tensor}/src/cpu/arm/bnn/convolution_transform_bnn.h (69%)
 create mode 100644 compute/tensor/src/cpu/arm/bnn/convolution_xnor.h
 create mode 100644 compute/tensor/src/cpu/arm/bnn/convolution_xnor_A55.cpp
 create mode 100644 compute/tensor/src/cpu/arm/bnn/convolution_xnor_A76.cpp
 rename {tensor_computing => compute/tensor}/src/cpu/arm/bnn/tensor_computing_bnn.h (63%)
 rename {tensor_computing => compute/tensor}/src/cpu/arm/check.cpp (62%)
 rename {tensor_computing => compute/tensor}/src/cpu/arm/clip.cpp (75%)
 create mode 100644 compute/tensor/src/cpu/arm/convolution.cpp
 create mode 100644 compute/tensor/src/cpu/arm/deconvolution.cpp
 create mode 100644 compute/tensor/src/cpu/arm/depthwise_convolution.cpp
 create mode 100644 compute/tensor/src/cpu/arm/depthwise_pointwise_convolution.cpp
 rename tensor_computing/src/cpu/arm/priorbox.cpp => compute/tensor/src/cpu/arm/eltwise.cpp (72%)
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp16/arm_functions_fp16.h (62%)
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp16/attention.cpp (66%)
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp16/attention_mask.cpp (78%)
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp16/check.cpp (72%)
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp16/clip.cpp (82%)
 create mode 100644 compute/tensor/src/cpu/arm/fp16/convolution.cpp
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp16/convolution_direct.cpp (77%)
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp16/convolution_direct.h (66%)
 create mode 100644 compute/tensor/src/cpu/arm/fp16/convolution_gemm.h
 create mode 100644 compute/tensor/src/cpu/arm/fp16/convolution_gemm_A55.cpp
 create mode 100644 compute/tensor/src/cpu/arm/fp16/convolution_gemm_A76.cpp
 create mode 100644 compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw.h
 create mode 100644 compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw_A55.cpp
 create mode 100644 compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw_A76.cpp
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp16/convolution_transform.cpp (61%)
 create mode 100644 compute/tensor/src/cpu/arm/fp16/convolution_winograd.h
 create mode 100644 compute/tensor/src/cpu/arm/fp16/convolution_winograd_A55.cpp
 create mode 100644 compute/tensor/src/cpu/arm/fp16/convolution_winograd_A76.cpp
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp16/convolution_winograd_transform.h (78%)
 create mode 100644 compute/tensor/src/cpu/arm/fp16/deconvolution_transform.cpp
 create mode 100644 compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution.cpp
 create mode 100644 compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1.h
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1_A55.cpp (74%)
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1_A76.cpp (72%)
 create mode 100644 compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct.h
 create mode 100644 compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A55.cpp
 create mode 100644 compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A76.cpp
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h (51%)
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A55.cpp (64%)
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A76.cpp (61%)
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp16/eltwise.cpp (56%)
 create mode 100644 compute/tensor/src/cpu/arm/fp16/lstm.cpp
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp16/normalization.cpp (72%)
 create mode 100644 compute/tensor/src/cpu/arm/fp16/pooling.cpp
 create mode 100644 compute/tensor/src/cpu/arm/fp16/prelu.cpp
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp16/quantize.cpp (73%)
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp16/scale.cpp (64%)
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp16/softmax.cpp (76%)
 create mode 100644 compute/tensor/src/cpu/arm/fp16/tensor_computing_fp16.h
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp32/arm_functions_fp32.h (67%)
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp32/attention.cpp (65%)
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp32/attention_mask.cpp (78%)
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp32/check.cpp (72%)
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp32/clip.cpp (83%)
 create mode 100644 compute/tensor/src/cpu/arm/fp32/convolution.cpp
 create mode 100644 compute/tensor/src/cpu/arm/fp32/convolution_gemm_V7.cpp
 create mode 100644 compute/tensor/src/cpu/arm/fp32/convolution_gemm_V8.cpp
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp32/convolution_gemm_icnchw_V7.cpp (61%)
 create mode 100644 compute/tensor/src/cpu/arm/fp32/convolution_gemm_icnchw_V8.cpp
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp32/convolution_transform.cpp (63%)
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp32/convolution_winograd_V8.cpp (71%)
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp32/convolution_winograd_transform.h (70%)
 create mode 100644 compute/tensor/src/cpu/arm/fp32/deconvolution_transform.cpp
 create mode 100644 compute/tensor/src/cpu/arm/fp32/depthwise_pointwise_convolution.cpp
 create mode 100644 compute/tensor/src/cpu/arm/fp32/depthwise_pointwise_convolution.h
 create mode 100644 compute/tensor/src/cpu/arm/fp32/depthwise_pointwise_convolution_direct_V7.cpp
 create mode 100644 compute/tensor/src/cpu/arm/fp32/depthwise_pointwise_convolution_direct_V8.cpp
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp32/eltwise.cpp (56%)
 create mode 100644 compute/tensor/src/cpu/arm/fp32/lstm.cpp
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp32/normalization.cpp (72%)
 create mode 100644 compute/tensor/src/cpu/arm/fp32/pooling.cpp
 create mode 100644 compute/tensor/src/cpu/arm/fp32/prelu.cpp
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp32/scale.cpp (67%)
 rename {tensor_computing => compute/tensor}/src/cpu/arm/fp32/softmax.cpp (76%)
 create mode 100644 compute/tensor/src/cpu/arm/fp32/tensor_computing_fp32.h
 rename {tensor_computing => compute/tensor}/src/cpu/arm/int8/arm_functions_int8.h (81%)
 rename {tensor_computing => compute/tensor}/src/cpu/arm/int8/concat.cpp (69%)
 create mode 100644 compute/tensor/src/cpu/arm/int8/convolution.cpp
 create mode 100644 compute/tensor/src/cpu/arm/int8/convolution_gemm.h
 rename {tensor_computing => compute/tensor}/src/cpu/arm/int8/convolution_gemm_A55.cpp (76%)
 rename {tensor_computing => compute/tensor}/src/cpu/arm/int8/convolution_gemm_A76.cpp (75%)
 rename {tensor_computing => compute/tensor}/src/cpu/arm/int8/convolution_transform.cpp (56%)
 create mode 100644 compute/tensor/src/cpu/arm/int8/convolution_winograd.h
 create mode 100644 compute/tensor/src/cpu/arm/int8/convolution_winograd_A55.cpp
 create mode 100644 compute/tensor/src/cpu/arm/int8/convolution_winograd_A76.cpp
 rename {tensor_computing => compute/tensor}/src/cpu/arm/int8/convolution_winograd_transform.h (80%)
 rename tensor_computing/src/cpu/arm/int8/depthwise_convolution.cpp => compute/tensor/src/cpu/arm/int8/depthwise_pointwise_convolution.cpp (51%)
 create mode 100644 compute/tensor/src/cpu/arm/int8/depthwise_pointwise_convolution.h
 create mode 100644 compute/tensor/src/cpu/arm/int8/depthwise_pointwise_convolution_direct.cpp
 create mode 100644 compute/tensor/src/cpu/arm/int8/pooling.cpp
 rename {tensor_computing => compute/tensor}/src/cpu/arm/int8/quantize.cpp (82%)
 create mode 100644 compute/tensor/src/cpu/arm/int8/tensor_computing_int8.h
 rename tensor_computing/src/cpu/arm/attention_mask.cpp => compute/tensor/src/cpu/arm/normalization.cpp (69%)
 create mode 100644 compute/tensor/src/cpu/arm/padding.cpp
 create mode 100644 compute/tensor/src/cpu/arm/pooling.cpp
 create mode 100644 compute/tensor/src/cpu/arm/prelu.cpp
 rename {tensor_computing => compute/tensor}/src/cpu/arm/quantize.cpp (83%)
 create mode 100644 compute/tensor/src/cpu/arm/rnn.cpp
 rename {tensor_computing => compute/tensor}/src/cpu/arm/scale.cpp (69%)
 rename {tensor_computing => compute/tensor}/src/cpu/arm/softmax.cpp (75%)
 create mode 100644 compute/tensor/src/cpu/arm/tensor_computing_arm.h
 create mode 100644 compute/tensor/src/cpu/arm/transform_functions.h
 rename tensor_computing/src/split.cpp => compute/tensor/src/cpu/clip.cpp (58%)
 create mode 100644 compute/tensor/src/cpu/concat.cpp
 create mode 100644 compute/tensor/src/cpu/convolution.cpp
 create mode 100644 compute/tensor/src/cpu/cpu_functions.h
 create mode 100644 compute/tensor/src/cpu/cpu_functions_template.h
 create mode 100644 compute/tensor/src/cpu/deconvolution.cpp
 rename tensor_computing/src/cpu/general/reshape.cpp => compute/tensor/src/cpu/depthwise_convolution.cpp (62%)
 create mode 100644 compute/tensor/src/cpu/depthwise_pointwise_convolution.cpp
 rename {tensor_computing/src/cpu/general => compute/tensor/src/cpu}/detectionoutput.cpp (57%)
 create mode 100644 compute/tensor/src/cpu/eltwise.cpp
 create mode 100644 compute/tensor/src/cpu/embedding.cpp
 rename {tensor_computing => compute/tensor}/src/cpu/general/attention.cpp (70%)
 rename {tensor_computing => compute/tensor}/src/cpu/general/attention_mask.cpp (73%)
 rename {tensor_computing => compute/tensor}/src/cpu/general/check.cpp (51%)
 rename {tensor_computing => compute/tensor}/src/cpu/general/clip.cpp (72%)
 create mode 100644 compute/tensor/src/cpu/general/convolution.cpp
 create mode 100644 compute/tensor/src/cpu/general/deconvolution.cpp
 create mode 100644 compute/tensor/src/cpu/general/depthwise_convolution.cpp
 create mode 100644 compute/tensor/src/cpu/general/depthwise_pointwise_convolution.cpp
 create mode 100644 compute/tensor/src/cpu/general/eltwise.cpp
 create mode 100644 compute/tensor/src/cpu/general/general_functions.h
 rename {tensor_computing => compute/tensor}/src/cpu/general/normalization.cpp (57%)
 create mode 100644 compute/tensor/src/cpu/general/padding.cpp
 rename {tensor_computing => compute/tensor}/src/cpu/general/pooling.cpp (64%)
 create mode 100644 compute/tensor/src/cpu/general/pooling_bp.cpp
 create mode 100644 compute/tensor/src/cpu/general/prelu.cpp
 create mode 100644 compute/tensor/src/cpu/general/rnn.cpp
 rename {tensor_computing => compute/tensor}/src/cpu/general/scale.cpp (60%)
 rename {tensor_computing => compute/tensor}/src/cpu/general/softmax.cpp (57%)
 create mode 100644 compute/tensor/src/cpu/general/tensor_computing_general.h
 rename {tensor_computing => compute/tensor}/src/cpu/general/transpose.cpp (76%)
 create mode 100644 compute/tensor/src/cpu/l2normalization.cpp
 create mode 100644 compute/tensor/src/cpu/non_max_suppression.cpp
 create mode 100644 compute/tensor/src/cpu/padding.cpp
 rename tensor_computing/src/cpu/arm/split.cpp => compute/tensor/src/cpu/power.cpp (60%)
 create mode 100644 compute/tensor/src/cpu/priorbox.cpp
 create mode 100644 compute/tensor/src/cpu/reduction.cpp
 create mode 100644 compute/tensor/src/cpu/reshape.cpp
 create mode 100644 compute/tensor/src/cpu/rnn.cpp
 create mode 100644 compute/tensor/src/cpu/roialign.cpp
 rename {tensor_computing/src/cpu/arm => compute/tensor/src/cpu}/slice.cpp (73%)
 rename {tensor_computing/src/cpu/general => compute/tensor/src/cpu}/split.cpp (68%)
 create mode 100644 compute/tensor/src/cpu/tensor_computing_cpu.h
 create mode 100644 compute/tensor/src/cpu/tfslice.cpp
 create mode 100644 compute/tensor/src/cpu/transpose.cpp
 create mode 100644 compute/tensor/src/cpu/x86/attention_mask.cpp
 create mode 100644 compute/tensor/src/cpu/x86/check.cpp
 rename blas-enhance/src/cpu/arm/fp16/mvm.cpp => compute/tensor/src/cpu/x86/clip.cpp (64%)
 create mode 100644 compute/tensor/src/cpu/x86/convolution.cpp
 create mode 100644 compute/tensor/src/cpu/x86/deconvolution.cpp
 create mode 100644 compute/tensor/src/cpu/x86/depthwise_convolution.cpp
 create mode 100644 compute/tensor/src/cpu/x86/depthwise_pointwise_convolution.cpp
 create mode 100644 compute/tensor/src/cpu/x86/eltwise.cpp
 create mode 100644 compute/tensor/src/cpu/x86/fp32/attention_mask.cpp
 create mode 100644 compute/tensor/src/cpu/x86/fp32/check.cpp
 create mode 100644 compute/tensor/src/cpu/x86/fp32/clip.cpp
 create mode 100644 compute/tensor/src/cpu/x86/fp32/convolution.cpp
 create mode 100644 compute/tensor/src/cpu/x86/fp32/convolution_1x1_direct.cpp
 create mode 100644 compute/tensor/src/cpu/x86/fp32/convolution_2x2_direct.cpp
 create mode 100644 compute/tensor/src/cpu/x86/fp32/convolution_direct.cpp
 create mode 100644 compute/tensor/src/cpu/x86/fp32/convolution_direct_nchw.cpp
 create mode 100644 compute/tensor/src/cpu/x86/fp32/convolution_transform.cpp
 create mode 100644 compute/tensor/src/cpu/x86/fp32/deconvolution_transform.cpp
 create mode 100644 compute/tensor/src/cpu/x86/fp32/depthwise_convolution_direct.cpp
 create mode 100644 compute/tensor/src/cpu/x86/fp32/depthwise_convolution_transform.cpp
 create mode 100644 compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution.cpp
 create mode 100644 compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution_transform.cpp
 create mode 100644 compute/tensor/src/cpu/x86/fp32/eltwise.cpp
 create mode 100644 compute/tensor/src/cpu/x86/fp32/l2normalization.cpp
 create mode 100644 compute/tensor/src/cpu/x86/fp32/lstm.cpp
 create mode 100644 compute/tensor/src/cpu/x86/fp32/normalization.cpp
 create mode 100644 compute/tensor/src/cpu/x86/fp32/pooling.cpp
 create mode 100644 compute/tensor/src/cpu/x86/fp32/scale.cpp
 create mode 100644 compute/tensor/src/cpu/x86/fp32/softmax.cpp
 create mode 100644 compute/tensor/src/cpu/x86/fp32/tensor_computing_fp32.h
 create mode 100644 compute/tensor/src/cpu/x86/fp32/transform_functions_fp32.h
 create mode 100644 compute/tensor/src/cpu/x86/fp32/x86_functions_fp32.h
 create mode 100644 compute/tensor/src/cpu/x86/normalization.cpp
 create mode 100644 compute/tensor/src/cpu/x86/pooling.cpp
 create mode 100644 compute/tensor/src/cpu/x86/rnn.cpp
 create mode 100644 compute/tensor/src/cpu/x86/scale.cpp
 create mode 100644 compute/tensor/src/cpu/x86/softmax.cpp
 create mode 100644 compute/tensor/src/cpu/x86/tensor_computing_x86.h
 create mode 100644 compute/tensor/src/cpu/x86/x86_functions.h
 create mode 100644 compute/tensor/src/cpu/yolov3detectionoutput.cpp
 create mode 100644 compute/tensor/src/deconvolution.cpp
 create mode 100644 compute/tensor/src/depth2space.cpp
 create mode 100644 compute/tensor/src/depthwise_convolution.cpp
 create mode 100644 compute/tensor/src/depthwise_pointwise_convolution.cpp
 create mode 100644 compute/tensor/src/detectionoutput.cpp
 create mode 100644 compute/tensor/src/eltwise.cpp
 create mode 100644 compute/tensor/src/embedding.cpp
 create mode 100644 compute/tensor/src/fully_connected.cpp
 create mode 100644 compute/tensor/src/gpu/mali/activation.cpp
 create mode 100644 compute/tensor/src/gpu/mali/argmax.cpp
 create mode 100644 compute/tensor/src/gpu/mali/bilateral_slice_apply.cpp
 create mode 100644 compute/tensor/src/gpu/mali/channel_resize.cpp
 create mode 100644 compute/tensor/src/gpu/mali/check.cpp
 create mode 100644 compute/tensor/src/gpu/mali/cl/activation.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/argmax_x.cl
 rename {tensor_computing => compute/tensor}/src/gpu/mali/cl/bilateral_slice_apply_c12.cl (61%)
 rename {tensor_computing => compute/tensor}/src/gpu/mali/cl/bilateral_slice_apply_pre.cl (72%)
 create mode 100644 compute/tensor/src/gpu/mali/cl/channel_resize.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/check_int_spe.cl
 rename {tensor_computing => compute/tensor}/src/gpu/mali/cl/clip.cl (83%)
 create mode 100644 compute/tensor/src/gpu/mali/cl/col2im.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/concat.cl
 rename {tensor_computing => compute/tensor}/src/gpu/mali/cl/conv_depthwise_s1.cl (71%)
 rename {tensor_computing => compute/tensor}/src/gpu/mali/cl/conv_depthwise_s2.cl (67%)
 create mode 100644 compute/tensor/src/gpu/mali/cl/conv_depthwise_trans_fltbuf.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/conv_direct_3d_sw1_nchw_to_ncwhc4.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/conv_direct_3d_sw2_nchw_to_ncwhc4.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/conv_direct_s1.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/conv_direct_s1_fn_spe.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/conv_direct_s1_nchw_to_ncwhc4.cl
 rename {tensor_computing => compute/tensor}/src/gpu/mali/cl/conv_direct_s1_spe_f1c3k1.cl (83%)
 rename {tensor_computing => compute/tensor}/src/gpu/mali/cl/conv_direct_s2.cl (80%)
 create mode 100644 compute/tensor/src/gpu/mali/cl/conv_direct_s2_nchw_to_ncwhc4.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/conv_direct_spe_fwhs1.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/conv_direct_trans_fltbuf.cl
 rename tensor_computing/src/gpu/mali/cl/conv_direct_s1.cl => compute/tensor/src/gpu/mali/cl/conv_direct_wh_s1.cl (65%)
 create mode 100644 compute/tensor/src/gpu/mali/cl/conv_direct_wh_s2.cl
 rename {tensor_computing => compute/tensor}/src/gpu/mali/cl/conv_wino_gemm36_tn.cl (74%)
 rename {tensor_computing => compute/tensor}/src/gpu/mali/cl/conv_wino_rotate_fltbuf.cl (87%)
 create mode 100644 compute/tensor/src/gpu/mali/cl/conv_wino_trans_fltbuf_3x3.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/conv_wino_trans_outbuf.cl
 rename {tensor_computing => compute/tensor}/src/gpu/mali/cl/conv_wino_trans_outbuf_right.cl (64%)
 create mode 100644 compute/tensor/src/gpu/mali/cl/conv_wino_trans_picbuf.cl
 rename {tensor_computing => compute/tensor}/src/gpu/mali/cl/conv_wino_trans_picbuf_left.cl (78%)
 rename {tensor_computing => compute/tensor}/src/gpu/mali/cl/conv_wino_trans_picbuf_right.cl (77%)
 create mode 100644 compute/tensor/src/gpu/mali/cl/copy.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/deconv_direct.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/deconv_direct_trans_fltbuf.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/deconv_gemm_f2s2.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/deconv_gemm_trans_fltbuf.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/depth2space.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/depth2space_nchw.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/depth2space_ncwhc4_2x2.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/eltwise.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/eltwise_broadcast.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/eltwise_spe_nchw_c.cl
 rename {tensor_computing => compute/tensor}/src/gpu/mali/cl/embedding.cl (77%)
 rename {tensor_computing => compute/tensor}/src/gpu/mali/cl/fc_p1.cl (55%)
 rename {tensor_computing => compute/tensor}/src/gpu/mali/cl/fc_p2.cl (59%)
 create mode 100644 compute/tensor/src/gpu/mali/cl/fc_trans_fltbuf.cl
 rename uni/include/sys.h => compute/tensor/src/gpu/mali/cl/fill_memory_zero.cl (67%)
 create mode 100644 compute/tensor/src/gpu/mali/cl/fill_memory_zero_vec4.cl
 rename {tensor_computing => compute/tensor}/src/gpu/mali/cl/gemm_nt.cl (56%)
 create mode 100644 compute/tensor/src/gpu/mali/cl/gemm_tn.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/kernel_def.h
 create mode 100644 compute/tensor/src/gpu/mali/cl/mem_trans_3d_ncwhc4_to_nchw.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/mem_trans_nchw_to_nchw.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/mem_trans_nchw_to_ncwhc4.cl
 rename tensor_computing/src/gpu/mali/cl/reshape_nchw_to_mkt.cl => compute/tensor/src/gpu/mali/cl/mem_trans_nchw_to_ncwhc4_iw_equal_oh.cl (60%)
 rename {tensor_computing => compute/tensor}/src/gpu/mali/cl/mem_trans_ncwhc4_to_mtk.cl (71%)
 create mode 100644 compute/tensor/src/gpu/mali/cl/mem_trans_ncwhc4_to_nchw.cl
 rename tensor_computing/src/gpu/mali/cl/reshape_mkt_to_nchw.cl => compute/tensor/src/gpu/mali/cl/mem_trans_ncwhc4_to_nchw_ih_equal_ow.cl (61%)
 create mode 100644 compute/tensor/src/gpu/mali/cl/mem_trans_ncwhc4_to_ncwhc4.cl
 rename {tensor_computing => compute/tensor}/src/gpu/mali/cl/normalization.cl (72%)
 create mode 100644 compute/tensor/src/gpu/mali/cl/padding_constant.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/padding_edge.cl
 rename {tensor_computing => compute/tensor}/src/gpu/mali/cl/padding_input_gclmem.cl (61%)
 create mode 100644 compute/tensor/src/gpu/mali/cl/padding_reflect.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/padding_symmetric.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/pooling_global_mean_h.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/pooling_global_mean_w.cl
 rename {tensor_computing => compute/tensor}/src/gpu/mali/cl/pooling_max.cl (65%)
 rename {tensor_computing => compute/tensor}/src/gpu/mali/cl/pooling_mean.cl (66%)
 create mode 100644 compute/tensor/src/gpu/mali/cl/power.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/prelu.cl
 rename {tensor_computing => compute/tensor}/src/gpu/mali/cl/reshape.cl (79%)
 create mode 100644 compute/tensor/src/gpu/mali/cl/rnncell_build_xh.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/rnncell_update_project_res.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/rnncell_update_res.cl
 rename {tensor_computing => compute/tensor}/src/gpu/mali/cl/scale.cl (58%)
 rename {tensor_computing => compute/tensor}/src/gpu/mali/cl/slice_h.cl (72%)
 create mode 100644 compute/tensor/src/gpu/mali/cl/softmax.cl
 rename tensor_computing/src/gpu/mali/cl/multiply_align_nchw.cl => compute/tensor/src/gpu/mali/cl/softmax_h1w1_max_all.cl (59%)
 create mode 100644 compute/tensor/src/gpu/mali/cl/softmax_h1w1_max_part.cl
 rename tensor_computing/src/gpu/mali/cl/transpose_nchw_0132.cl => compute/tensor/src/gpu/mali/cl/softmax_h1w1_output.cl (59%)
 create mode 100644 compute/tensor/src/gpu/mali/cl/softmax_h1w1_sum_all.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/softmax_h1w1_sum_part.cl
 rename {tensor_computing => compute/tensor}/src/gpu/mali/cl/softmax_nchw_c.cl (57%)
 create mode 100644 compute/tensor/src/gpu/mali/cl/softmax_nchw_w.cl
 rename {tensor_computing => compute/tensor}/src/gpu/mali/cl/space2depth.cl (67%)
 rename {tensor_computing => compute/tensor}/src/gpu/mali/cl/squeeze.cl (80%)
 create mode 100644 compute/tensor/src/gpu/mali/cl/transpose_3d_nchw.cl
 create mode 100644 compute/tensor/src/gpu/mali/cl/transpose_nchw.cl
 create mode 100644 compute/tensor/src/gpu/mali/clip.cpp
 create mode 100644 compute/tensor/src/gpu/mali/concat.cpp
 create mode 100644 compute/tensor/src/gpu/mali/convolution.cpp
 create mode 100644 compute/tensor/src/gpu/mali/copy.cpp
 create mode 100644 compute/tensor/src/gpu/mali/deconvolution.cpp
 create mode 100644 compute/tensor/src/gpu/mali/depth2space.cpp
 create mode 100644 compute/tensor/src/gpu/mali/depthwise_convolution.cpp
 create mode 100644 compute/tensor/src/gpu/mali/depthwise_pointwise_convolution.cpp
 create mode 100644 compute/tensor/src/gpu/mali/eltwise.cpp
 create mode 100644 compute/tensor/src/gpu/mali/embedding.cpp
 create mode 100644 compute/tensor/src/gpu/mali/fp16/activation_mali_fp16.cpp
 rename {tensor_computing => compute/tensor}/src/gpu/mali/fp16/activation_mali_fp16.h (78%)
 create mode 100644 compute/tensor/src/gpu/mali/fp16/argmax_mali_fp16.cpp
 rename tensor_computing/src/gpu/mali/fp16/multiply_mali_fp16.h => compute/tensor/src/gpu/mali/fp16/argmax_mali_fp16.h (76%)
 create mode 100644 compute/tensor/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.cpp
 rename tensor_computing/src/cpu/arm/int8/depthwise_convolution.h => compute/tensor/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.h (59%)
 create mode 100644 compute/tensor/src/gpu/mali/fp16/channel_resize_mali_fp16.cpp
 create mode 100644 compute/tensor/src/gpu/mali/fp16/channel_resize_mali_fp16.h
 rename {tensor_computing => compute/tensor}/src/gpu/mali/fp16/clip_mali_fp16.cpp (61%)
 rename {tensor_computing => compute/tensor}/src/gpu/mali/fp16/clip_mali_fp16.h (81%)
 create mode 100644 compute/tensor/src/gpu/mali/fp16/concat_mali_fp16.cpp
 rename {tensor_computing => compute/tensor}/src/gpu/mali/fp16/concat_mali_fp16.h (77%)
 create mode 100644 compute/tensor/src/gpu/mali/fp16/convolution_direct_mali_fp16.cpp
 create mode 100644 compute/tensor/src/gpu/mali/fp16/convolution_direct_mali_fp16.h
 create mode 100644 compute/tensor/src/gpu/mali/fp16/convolution_direct_spe_ck_mali_fp16.cpp
 create mode 100644 compute/tensor/src/gpu/mali/fp16/convolution_direct_spe_ck_mali_fp16.h
 create mode 100644 compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.cpp
 rename tensor_computing/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.h => compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.h (52%)
 create mode 100644 compute/tensor/src/gpu/mali/fp16/convolution_wino_mali_fp16.cpp
 create mode 100644 compute/tensor/src/gpu/mali/fp16/convolution_wino_mali_fp16.h
 create mode 100644 compute/tensor/src/gpu/mali/fp16/deconvolution_direct_mali_fp16.cpp
 create mode 100644 compute/tensor/src/gpu/mali/fp16/deconvolution_direct_mali_fp16.h
 create mode 100644 compute/tensor/src/gpu/mali/fp16/deconvolution_gemm_mali_fp16.cpp
 create mode 100644 compute/tensor/src/gpu/mali/fp16/deconvolution_gemm_mali_fp16.h
 create mode 100644 compute/tensor/src/gpu/mali/fp16/deconvolution_mali_fp16.cpp
 create mode 100644 compute/tensor/src/gpu/mali/fp16/deconvolution_mali_fp16.h
 create mode 100644 compute/tensor/src/gpu/mali/fp16/depth2space_mali_fp16.cpp
 create mode 100644 compute/tensor/src/gpu/mali/fp16/depth2space_mali_fp16.h
 create mode 100644 compute/tensor/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.cpp
 create mode 100644 compute/tensor/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.h
 create mode 100644 compute/tensor/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.cpp
 create mode 100644 compute/tensor/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.h
 create mode 100644 compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.cpp
 create mode 100644 compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.h
 create mode 100644 compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.cpp
 create mode 100644 compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.h
 create mode 100644 compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_mali_fp16.cpp
 create mode 100644 compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_mali_fp16.h
 create mode 100644 compute/tensor/src/gpu/mali/fp16/eltwise_mali_fp16.cpp
 rename {tensor_computing => compute/tensor}/src/gpu/mali/fp16/eltwise_mali_fp16.h (75%)
 rename {tensor_computing => compute/tensor}/src/gpu/mali/fp16/embedding_mali_fp16.cpp (53%)
 rename {tensor_computing => compute/tensor}/src/gpu/mali/fp16/embedding_mali_fp16.h (72%)
 rename {tensor_computing => compute/tensor}/src/gpu/mali/fp16/fully_connected_mali_fp16.cpp (58%)
 rename tensor_computing/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.h => compute/tensor/src/gpu/mali/fp16/fully_connected_mali_fp16.h (52%)
 create mode 100644 compute/tensor/src/gpu/mali/fp16/matmul_mali_fp16.cpp
 create mode 100644 compute/tensor/src/gpu/mali/fp16/matmul_mali_fp16.h
 create mode 100644 compute/tensor/src/gpu/mali/fp16/multihead_attention_mali_fp16.cpp
 create mode 100644 compute/tensor/src/gpu/mali/fp16/multihead_attention_mali_fp16.h
 rename {tensor_computing => compute/tensor}/src/gpu/mali/fp16/normalization_mali_fp16.cpp (61%)
 rename {tensor_computing => compute/tensor}/src/gpu/mali/fp16/normalization_mali_fp16.h (78%)
 create mode 100644 compute/tensor/src/gpu/mali/fp16/padding_mali_fp16.cpp
 create mode 100644 compute/tensor/src/gpu/mali/fp16/padding_mali_fp16.h
 create mode 100644 compute/tensor/src/gpu/mali/fp16/pooling_mali_fp16.cpp
 rename {tensor_computing => compute/tensor}/src/gpu/mali/fp16/pooling_mali_fp16.h (78%)
 create mode 100644 compute/tensor/src/gpu/mali/fp16/power_mali_fp16.cpp
 create mode 100644 compute/tensor/src/gpu/mali/fp16/power_mali_fp16.h
 create mode 100644 compute/tensor/src/gpu/mali/fp16/prelu_mali_fp16.cpp
 create mode 100644 compute/tensor/src/gpu/mali/fp16/prelu_mali_fp16.h
 create mode 100644 compute/tensor/src/gpu/mali/fp16/reshape_mali_fp16.cpp
 rename {tensor_computing => compute/tensor}/src/gpu/mali/fp16/reshape_mali_fp16.h (80%)
 create mode 100644 compute/tensor/src/gpu/mali/fp16/rnn_mali_fp16.cpp
 create mode 100644 compute/tensor/src/gpu/mali/fp16/rnn_mali_fp16.h
 create mode 100644 compute/tensor/src/gpu/mali/fp16/rnncell_mali_fp16.cpp
 create mode 100644 compute/tensor/src/gpu/mali/fp16/rnncell_mali_fp16.h
 rename {tensor_computing => compute/tensor}/src/gpu/mali/fp16/scale_mali_fp16.cpp (56%)
 rename {tensor_computing => compute/tensor}/src/gpu/mali/fp16/scale_mali_fp16.h (82%)
 rename {tensor_computing => compute/tensor}/src/gpu/mali/fp16/slice_mali_fp16.cpp (61%)
 rename {tensor_computing => compute/tensor}/src/gpu/mali/fp16/slice_mali_fp16.h (76%)
 create mode 100644 compute/tensor/src/gpu/mali/fp16/softmax_mali_fp16.cpp
 rename {tensor_computing => compute/tensor}/src/gpu/mali/fp16/softmax_mali_fp16.h (82%)
 rename {tensor_computing => compute/tensor}/src/gpu/mali/fp16/squeeze_mali_fp16.cpp (64%)
 rename {tensor_computing => compute/tensor}/src/gpu/mali/fp16/squeeze_mali_fp16.h (79%)
 create mode 100644 compute/tensor/src/gpu/mali/fp16/transpose_mali_fp16.cpp
 rename {tensor_computing => compute/tensor}/src/gpu/mali/fp16/transpose_mali_fp16.h (79%)
 create mode 100644 compute/tensor/src/gpu/mali/fp16/unsqueeze_mali_fp16.cpp
 create mode 100644 compute/tensor/src/gpu/mali/fp16/unsqueeze_mali_fp16.h
 create mode 100644 compute/tensor/src/gpu/mali/fully_connected.cpp
 create mode 100644 compute/tensor/src/gpu/mali/matmul.cpp
 create mode 100644 compute/tensor/src/gpu/mali/multihead_attention.cpp
 rename {tensor_computing => compute/tensor}/src/gpu/mali/normalization.cpp (50%)
 create mode 100644 compute/tensor/src/gpu/mali/padding.cpp
 create mode 100644 compute/tensor/src/gpu/mali/pooling.cpp
 create mode 100644 compute/tensor/src/gpu/mali/power.cpp
 create mode 100644 compute/tensor/src/gpu/mali/preallocated_memory.cpp
 create mode 100644 compute/tensor/src/gpu/mali/prelu.cpp
 create mode 100644 compute/tensor/src/gpu/mali/reshape.cpp
 create mode 100644 compute/tensor/src/gpu/mali/rnncell.cpp
 rename {tensor_computing => compute/tensor}/src/gpu/mali/scale.cpp (53%)
 rename {tensor_computing => compute/tensor}/src/gpu/mali/slice.cpp (51%)
 create mode 100644 compute/tensor/src/gpu/mali/softmax.cpp
 create mode 100644 compute/tensor/src/gpu/mali/space2depth.cpp
 create mode 100644 compute/tensor/src/gpu/mali/squeeze.cpp
 create mode 100644 compute/tensor/src/gpu/mali/tensor_computing_mali.h
 create mode 100644 compute/tensor/src/gpu/mali/transpose.cpp
 create mode 100644 compute/tensor/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.cpp
 create mode 100644 compute/tensor/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.h
 create mode 100644 compute/tensor/src/gpu/mali/unsqueeze.cpp
 create mode 100644 compute/tensor/src/l2normalization.cpp
 create mode 100644 compute/tensor/src/matmul.cpp
 create mode 100644 compute/tensor/src/multihead_attention.cpp
 create mode 100644 compute/tensor/src/non_max_suppression.cpp
 create mode 100644 compute/tensor/src/normalization.cpp
 create mode 100644 compute/tensor/src/padding.cpp
 create mode 100644 compute/tensor/src/pooling.cpp
 rename tensor_computing/src/check.cpp => compute/tensor/src/pooling_bp.cpp (51%)
 create mode 100644 compute/tensor/src/power.cpp
 rename tensor_computing/src/set_input.cpp => compute/tensor/src/preallocated_memory.cpp (50%)
 create mode 100644 compute/tensor/src/prelu.cpp
 create mode 100644 compute/tensor/src/priorbox.cpp
 rename {tensor_computing => compute/tensor}/src/quantize.cpp (59%)
 create mode 100644 compute/tensor/src/reduction.cpp
 create mode 100644 compute/tensor/src/reshape.cpp
 create mode 100644 compute/tensor/src/rnn.cpp
 create mode 100644 compute/tensor/src/roialign.cpp
 create mode 100644 compute/tensor/src/scale.cpp
 create mode 100644 compute/tensor/src/slice.cpp
 create mode 100644 compute/tensor/src/softmax.cpp
 create mode 100644 compute/tensor/src/space2depth.cpp
 create mode 100644 compute/tensor/src/split.cpp
 create mode 100644 compute/tensor/src/squeeze.cpp
 create mode 100644 compute/tensor/src/tensor_computing_type.cpp
 create mode 100644 compute/tensor/src/tfslice.cpp
 create mode 100644 compute/tensor/src/tile.cpp
 create mode 100644 compute/tensor/src/transpose.cpp
 create mode 100644 compute/tensor/src/unsqueeze.cpp
 create mode 100644 compute/tensor/src/yolov3detectionoutput.cpp
 create mode 100644 compute/tensor/tests/CMakeLists.txt
 create mode 100644 compute/tensor/tests/test_activation.cpp
 rename {tests => compute/tensor/tests}/test_argmax.cpp (54%)
rename {tests => compute/tensor/tests}/test_attention.cpp (50%) create mode 100644 compute/tensor/tests/test_axpby.cpp create mode 100644 compute/tensor/tests/test_channel_resize_ocl.cpp rename {tests => compute/tensor/tests}/test_check.cpp (52%) rename {tests => compute/tensor/tests}/test_clip.cpp (51%) create mode 100644 compute/tensor/tests/test_concat.cpp rename {tests => compute/tensor/tests}/test_concat_int8.cpp (53%) create mode 100644 compute/tensor/tests/test_concat_ocl.cpp create mode 100644 compute/tensor/tests/test_convolution.cpp create mode 100644 compute/tensor/tests/test_convolution_bnn.cpp create mode 100644 compute/tensor/tests/test_convolution_int8.cpp create mode 100644 compute/tensor/tests/test_convolution_ocl.cpp create mode 100644 compute/tensor/tests/test_deconvolution.cpp create mode 100644 compute/tensor/tests/test_deconvolution_ocl.cpp create mode 100644 compute/tensor/tests/test_depthwise_convolution.cpp create mode 100644 compute/tensor/tests/test_depthwise_convolution_int8.cpp create mode 100644 compute/tensor/tests/test_depthwise_convolution_ocl.cpp create mode 100644 compute/tensor/tests/test_depthwise_pointwise_convolution_ocl.cpp create mode 100644 compute/tensor/tests/test_detectionoutput.cpp create mode 100644 compute/tensor/tests/test_dilated_convolution.cpp rename {tests => compute/tensor/tests}/test_eltwise.cpp (50%) create mode 100644 compute/tensor/tests/test_fully_connected.cpp create mode 100644 compute/tensor/tests/test_fully_connected_ocl.cpp create mode 100644 compute/tensor/tests/test_l2normalization.cpp create mode 100644 compute/tensor/tests/test_multihead_attention_ocl.cpp create mode 100644 compute/tensor/tests/test_non_max_suppression.cpp create mode 100644 compute/tensor/tests/test_normalization.cpp create mode 100644 compute/tensor/tests/test_padding.cpp create mode 100644 compute/tensor/tests/test_padding_ocl.cpp create mode 100644 compute/tensor/tests/test_pooling.cpp create mode 100644 compute/tensor/tests/test_pooling_bp.cpp create mode 100644 compute/tensor/tests/test_pooling_int8.cpp create mode 100644 compute/tensor/tests/test_pooling_ocl.cpp create mode 100644 compute/tensor/tests/test_power.cpp create mode 100644 compute/tensor/tests/test_power_ocl.cpp create mode 100644 compute/tensor/tests/test_prelu.cpp create mode 100644 compute/tensor/tests/test_prelu_ocl.cpp rename {tests => compute/tensor/tests}/test_priorbox.cpp (50%) create mode 100644 compute/tensor/tests/test_reduction.cpp rename {tests => compute/tensor/tests}/test_reshape.cpp (52%) create mode 100644 compute/tensor/tests/test_reshape_ocl.cpp create mode 100644 compute/tensor/tests/test_rnn.cpp create mode 100644 compute/tensor/tests/test_roialign.cpp rename {tests => compute/tensor/tests}/test_scale.cpp (54%) rename {tests => compute/tensor/tests}/test_slice.cpp (54%) rename {tests => compute/tensor/tests}/test_softmax.cpp (52%) create mode 100644 compute/tensor/tests/test_softmax_h1w1_ocl.cpp rename {tests => compute/tensor/tests}/test_split.cpp (56%) create mode 100644 compute/tensor/tests/test_tile.cpp rename {tests => compute/tensor/tests}/test_transpose.cpp (50%) create mode 100644 compute/tensor/tests/test_transpose_ocl.cpp create mode 100644 docs/ARCHITECTURE.md create mode 100644 docs/FAQ.md create mode 100644 docs/IOS_USAGE.md create mode 100644 docs/KIT.md create mode 100644 docs/QUANTIZATION.md create mode 100644 docs/REDUCE_GPU_PREPARE_TIME.md create mode 100644 docs/THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.md create mode 100644 docs/images/ADB.PNG 
create mode 100644 docs/images/Framework.PNG create mode 100644 docs/images/GNU.PNG create mode 100644 docs/images/JDK.PNG create mode 100644 docs/images/ModelConversion.PNG create mode 100644 docs/images/NDK.PNG create mode 100644 docs/images/PerformanceProfiling.PNG create mode 100644 docs/images/QuickStart.PNG create mode 100644 docs/images/cmake.PNG create mode 100644 docs/images/dx.PNG create mode 100644 docs/images/make.PNG delete mode 100644 gcl/include/context.h delete mode 100644 gcl/include/event.h delete mode 100644 gcl/include/gcl_common.h delete mode 100644 gcl/include/gcl_func.h delete mode 100644 gcl/include/gcl_kernel_binmap.h delete mode 100644 gcl/include/kernel.h delete mode 100644 gcl/include/memory.h delete mode 100644 gcl/include/platform.h delete mode 100644 gcl/include/program.h delete mode 100644 gcl/tools/gcl_sample/sample.cpp delete mode 100644 gcl/tools/kernel_lib_compile/sh/adbDeviceNum.sh delete mode 100644 gcl/tools/kernel_lib_compile/sh/compile/concat.sh delete mode 100644 gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1.sh delete mode 100644 gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s2.sh delete mode 100644 gcl/tools/kernel_lib_compile/sh/compile/conv_direct_spe_fwhs1.sh delete mode 100644 gcl/tools/kernel_lib_compile/sh/compile/eltwise.sh delete mode 100644 gcl/tools/kernel_lib_compile/sh/compile/gemm_tn.sh delete mode 100644 image/src/CMakeLists.txt delete mode 100644 image/src/resize.cpp rename {tensor_computing => inference/engine}/CMakeLists.txt (50%) create mode 100644 inference/engine/api/c/bolt.h create mode 100644 inference/engine/api/dllite/Bolt.h create mode 100644 inference/engine/api/java/BoltModel.java create mode 100644 inference/engine/api/java/BoltResult.java create mode 100644 inference/engine/include/BoltModel.h rename inference/{ => engine}/include/activation.hpp (77%) create mode 100644 inference/engine/include/argmax.hpp create mode 100644 inference/engine/include/attention.hpp create mode 100644 inference/engine/include/attention_mask.hpp create mode 100644 inference/engine/include/bilateral_slice_apply.hpp create mode 100644 inference/engine/include/channel_resize.hpp create mode 100644 inference/engine/include/check.hpp rename inference/{ => engine}/include/clip.hpp (69%) create mode 100644 inference/engine/include/cnn.h rename inference/{ => engine}/include/concat.hpp (73%) rename inference/{ => engine}/include/constant.hpp (66%) create mode 100644 inference/engine/include/convolution.hpp create mode 100644 inference/engine/include/copy.hpp rename inference/{ => engine}/include/cpu/activation_cpu.hpp (54%) create mode 100644 inference/engine/include/cpu/argmax_cpu.hpp create mode 100644 inference/engine/include/cpu/channel_resize_cpu.hpp create mode 100644 inference/engine/include/cpu/check_cpu.hpp create mode 100644 inference/engine/include/cpu/clip_cpu.hpp create mode 100644 inference/engine/include/cpu/concat_cpu.hpp create mode 100644 inference/engine/include/cpu/convolution_cpu.hpp create mode 100644 inference/engine/include/cpu/copy_cpu.hpp create mode 100644 inference/engine/include/cpu/deconvolution_cpu.hpp create mode 100644 inference/engine/include/cpu/eltwise_cpu.hpp create mode 100644 inference/engine/include/cpu/embedding_cpu.hpp create mode 100644 inference/engine/include/cpu/factory_cpu.hpp create mode 100644 inference/engine/include/cpu/fully_connected_cpu.hpp create mode 100644 inference/engine/include/cpu/l2normalization_cpu.hpp create mode 100644 inference/engine/include/cpu/layer_norm_cpu.hpp 
 create mode 100644 inference/engine/include/cpu/matmul_cpu.hpp
 create mode 100644 inference/engine/include/cpu/padding_cpu.hpp
 rename inference/{include/cpu/memory_cpu.hpp => engine/include/cpu/pooling_cpu.hpp} (50%)
 rename inference/{include/cpu/multiply_cpu.hpp => engine/include/cpu/power_cpu.hpp} (50%)
 create mode 100644 inference/engine/include/cpu/preallocated_memory_cpu.hpp
 create mode 100644 inference/engine/include/cpu/prelu_cpu.hpp
 rename inference/{include/repeat.hpp => engine/include/cpu/repeat_cpu.hpp} (54%)
 create mode 100644 inference/engine/include/cpu/reshape_cpu.hpp
 create mode 100644 inference/engine/include/cpu/resize_cpu.hpp
 create mode 100644 inference/engine/include/cpu/rnn_cpu.hpp
 create mode 100644 inference/engine/include/cpu/rnncell_cpu.hpp
 create mode 100644 inference/engine/include/cpu/scale_cpu.hpp
 rename inference/{include/padding.hpp => engine/include/cpu/shape_cpu.hpp} (53%)
 create mode 100644 inference/engine/include/cpu/shared_weight_cpu.hpp
 create mode 100644 inference/engine/include/cpu/slice_cpu.hpp
 create mode 100644 inference/engine/include/cpu/softmax_cpu.hpp
 create mode 100644 inference/engine/include/cpu/splice_cpu.hpp
 create mode 100644 inference/engine/include/cpu/squeeze_cpu.hpp
 create mode 100644 inference/engine/include/cpu/tfslice_cpu.hpp
 create mode 100644 inference/engine/include/cpu/tile_cpu.hpp
 create mode 100644 inference/engine/include/cpu/transpose_cpu.hpp
 create mode 100644 inference/engine/include/cpu/unsqueeze_cpu.hpp
 rename inference/{ => engine}/include/data_loader.hpp (54%)
 create mode 100644 inference/engine/include/deconvolution.hpp
 rename inference/{ => engine}/include/depth2space.hpp (74%)
 create mode 100644 inference/engine/include/detection_output.hpp
 rename inference/{ => engine}/include/eltwise.hpp (65%)
 create mode 100644 inference/engine/include/embedding.hpp
 create mode 100644 inference/engine/include/factory.hpp
 create mode 100644 inference/engine/include/fully_connected.hpp
 create mode 100644 inference/engine/include/inference.hpp
 rename inference/{ => engine}/include/jump.hpp (65%)
 create mode 100644 inference/engine/include/l2normalization.hpp
 rename inference/{ => engine}/include/layer_norm.hpp (72%)
 create mode 100644 inference/engine/include/matmul.hpp
 create mode 100644 inference/engine/include/memory_tracker.hpp
 create mode 100644 inference/engine/include/model.hpp
 create mode 100644 inference/engine/include/ocl/activation_ocl.hpp
 create mode 100644 inference/engine/include/ocl/argmax_ocl.hpp
 create mode 100644 inference/engine/include/ocl/bilateral_slice_apply_ocl.hpp
 create mode 100644 inference/engine/include/ocl/channel_resize_ocl.hpp
 create mode 100644 inference/engine/include/ocl/check_ocl.hpp
 create mode 100644 inference/engine/include/ocl/clip_ocl.hpp
 create mode 100644 inference/engine/include/ocl/concat_ocl.hpp
 create mode 100644 inference/engine/include/ocl/convolution_ocl.hpp
 create mode 100644 inference/engine/include/ocl/copy_ocl.hpp
 create mode 100644 inference/engine/include/ocl/deconvolution_ocl.hpp
 create mode 100644 inference/engine/include/ocl/depth2space_ocl.hpp
 create mode 100644 inference/engine/include/ocl/eltwise_ocl.hpp
 create mode 100644 inference/engine/include/ocl/embedding_ocl.hpp
 create mode 100644 inference/engine/include/ocl/factory_ocl.hpp
 create mode 100644 inference/engine/include/ocl/fully_connected_ocl.hpp
 create mode 100644 inference/engine/include/ocl/layer_norm_ocl.hpp
 create mode 100644 inference/engine/include/ocl/matmul_ocl.hpp
 create mode 100644 inference/engine/include/ocl/padding_ocl.hpp
 create mode 100644 inference/engine/include/ocl/pooling_ocl.hpp
 create mode 100644 inference/engine/include/ocl/power_ocl.hpp
 create mode 100644 inference/engine/include/ocl/preallocated_memory_ocl.hpp
 create mode 100644 inference/engine/include/ocl/prelu_ocl.hpp
 create mode 100644 inference/engine/include/ocl/repeat_ocl.hpp
 create mode 100644 inference/engine/include/ocl/reshape_ocl.hpp
 create mode 100644 inference/engine/include/ocl/resize_ocl.hpp
 create mode 100644 inference/engine/include/ocl/rnn_ocl.hpp
 create mode 100644 inference/engine/include/ocl/rnncell_ocl.hpp
 create mode 100644 inference/engine/include/ocl/scale_ocl.hpp
 create mode 100644 inference/engine/include/ocl/shared_weight_ocl.hpp
 create mode 100644 inference/engine/include/ocl/slice_ocl.hpp
 create mode 100644 inference/engine/include/ocl/softmax_ocl.hpp
 create mode 100644 inference/engine/include/ocl/space2depth_ocl.hpp
 create mode 100644 inference/engine/include/ocl/squeeze_ocl.hpp
 create mode 100644 inference/engine/include/ocl/transpose_ocl.hpp
 create mode 100644 inference/engine/include/ocl/unsqueeze_ocl.hpp
 rename inference/{ => engine}/include/operator.hpp (53%)
 create mode 100644 inference/engine/include/padding.hpp
 create mode 100644 inference/engine/include/pooling.hpp
 rename inference/{include/multiply.hpp => engine/include/power.hpp} (66%)
 create mode 100644 inference/engine/include/preallocated_memory.hpp
 create mode 100644 inference/engine/include/prelu.hpp
 create mode 100644 inference/engine/include/prior_box.hpp
 create mode 100644 inference/engine/include/reduction.hpp
 create mode 100644 inference/engine/include/relative_position_embedding.hpp
 rename inference/{ => engine}/include/relative_shift.hpp (52%)
 create mode 100644 inference/engine/include/repeat.hpp
 create mode 100644 inference/engine/include/reshape.hpp
 create mode 100644 inference/engine/include/resize.hpp
 rename inference/{ => engine}/include/result_format.hpp (79%)
 create mode 100644 inference/engine/include/rnncell.hpp
 rename inference/{ => engine}/include/scale.hpp (66%)
 create mode 100644 inference/engine/include/sequential.hpp
 create mode 100644 inference/engine/include/sequential_ocl.hpp
 create mode 100644 inference/engine/include/shape.hpp
 create mode 100644 inference/engine/include/shared_weight.hpp
 rename inference/{ => engine}/include/slice.hpp (66%)
 rename inference/{ => engine}/include/softmax.hpp (75%)
 rename inference/{ => engine}/include/space2depth.hpp (78%)
 create mode 100644 inference/engine/include/splice.hpp
 rename inference/{ => engine}/include/squeeze.hpp (68%)
 create mode 100644 inference/engine/include/tfslice.hpp
 create mode 100644 inference/engine/include/tile.hpp
 rename inference/{ => engine}/include/transpose.hpp (67%)
 create mode 100644 inference/engine/include/unsqueeze.hpp
 create mode 100644 inference/engine/include/weight_operator.hpp
 create mode 100644 inference/engine/include/yolov3_detection_output.hpp
 create mode 100644 inference/engine/src/BoltModel_Jni.cpp
 create mode 100644 inference/engine/src/CMakeLists.txt
 create mode 100644 inference/engine/src/bolt.cpp
 create mode 100644 inference/engine/src/bolt_dllite.cpp
 create mode 100644 inference/engine/src/cnn.cpp
 create mode 100644 inference/engine/src/data_loader.cpp
 create mode 100644 inference/engine/src/result_format.cpp
 create mode 100644 inference/engine/tools/CMakeLists.txt
 create mode 100644 inference/engine/tools/common_algo_search/common_algo_search.cpp
 create mode 100644 inference/engine/tools/preprocess_ocl/CMakeLists.txt
 create mode 100644 inference/engine/tools/preprocess_ocl/build_preprocess_ocl.sh
 create mode 100644 inference/engine/tools/preprocess_ocl/preprocess_ocl.cpp
 create mode 100644 inference/engine/tools/ptq_calibration/ptq_calibration.cpp
 create mode 100644 inference/examples/CMakeLists.txt
 rename {kits => inference/examples}/automatic_speech_recognition/asr_convolution_transformer.cpp (62%)
 create mode 100644 inference/examples/automatic_speech_recognition/asr_labels.txt
 create mode 100644 inference/examples/automatic_speech_recognition/asr_rnnt.cpp
 create mode 100644 inference/examples/automatic_speech_recognition/audio_feature.cpp
 create mode 100644 inference/examples/automatic_speech_recognition/audio_feature.h
 create mode 100644 inference/examples/automatic_speech_recognition/encoder_flow.prototxt
 create mode 100644 inference/examples/automatic_speech_recognition/example.wav
 create mode 100644 inference/examples/automatic_speech_recognition/flow_asr.cpp
 create mode 100644 inference/examples/automatic_speech_recognition/joint_flow.prototxt
 create mode 100644 inference/examples/automatic_speech_recognition/pinyin2hanzi_flow.prototxt
 create mode 100644 inference/examples/automatic_speech_recognition/pinyin_lm_embedding.bin
 create mode 100644 inference/examples/automatic_speech_recognition/prediction_flow.prototxt
 create mode 100644 inference/examples/automatic_speech_recognition/run.sh
 rename {kits => inference/examples}/automatic_speech_recognition/vad.cpp (58%)
 create mode 100644 inference/examples/benchmark/benchmark.cpp
 rename {kits => inference/examples}/bert/bert.cpp (61%)
 create mode 100644 inference/examples/bert/flow_tinybert.cpp
 create mode 100644 inference/examples/bert/flow_tinybert.prototxt
 create mode 100644 inference/examples/bert/graph_tinybert.cpp
 create mode 100644 inference/examples/bert/tinybert.cpp
 create mode 100644 inference/examples/bert/tinybert_onnx.cpp
 create mode 100644 inference/examples/bert/tinybert_test.h
 create mode 100644 inference/examples/c_api/test_api_c.c
 create mode 100644 inference/examples/dlaWOdcn/flow_dlaWOdcn.cpp
 create mode 100644 inference/examples/dlaWOdcn/flow_dlaWOdcn.prototxt
 create mode 100644 inference/examples/dlaWOdcn/run.sh
 create mode 100644 inference/examples/facesr/flow_facesr.cpp
 create mode 100644 inference/examples/facesr/flow_facesr.prototxt
 create mode 100644 inference/examples/facesr/run.sh
 create mode 100644 inference/examples/high_dynamic_range/hdr.cpp
 create mode 100644 inference/examples/image_classification/classification.cpp
 create mode 100644 inference/examples/java_api/test_api_java.java
 rename {kits => inference/examples}/machine_translation/nmt.cpp (51%)
 rename {kits => inference/examples}/machine_translation/nmt_tsc.cpp (58%)
 create mode 100644 inference/examples/object_detection/detection.cpp
 create mode 100644 inference/examples/sequential/test_pipeline_ocl.cpp
 rename {kits => inference/examples}/text_to_speech/tts.cpp (55%)
 delete mode 100644 inference/exports/c/bolt.h
 delete mode 100644 inference/exports/java/BoltModel.java
 delete mode 100644 inference/exports/java/BoltResult.java
 rename {blas-enhance => inference/flow}/CMakeLists.txt (59%)
 create mode 100644 inference/flow/include/flow.h
 create mode 100644 inference/flow/include/flow_function_factory.h
 create mode 100644 inference/flow/include/node.h
 create mode 100644 inference/flow/src/CMakeLists.txt
 create mode 100644 inference/flow/src/flow.cpp
 create mode 100644 inference/flow/src/flow.proto
 create mode 100644 inference/flow/src/flow_function_factory.cpp
 create mode 100644 inference/flow/src/node.cpp
 delete mode 100644 inference/include/BoltModel.h
 delete mode 100644 inference/include/argmax.hpp
 delete mode 100644 inference/include/attention.hpp
 delete mode 100644 inference/include/attention_mask.hpp
 delete mode 100644 inference/include/bilateral_slice_apply.hpp
 delete mode 100644 inference/include/check.hpp
 delete mode 100644 inference/include/cnn.hpp
 delete mode 100644 inference/include/convolution.hpp
 delete mode 100644 inference/include/copy.hpp
 delete mode 100644 inference/include/cpu/clip_cpu.hpp
 delete mode 100644 inference/include/cpu/concat_cpu.hpp
 delete mode 100644 inference/include/cpu/convolution_cpu.hpp
 delete mode 100644 inference/include/cpu/eltwise_cpu.hpp
 delete mode 100644 inference/include/cpu/embedding_cpu.hpp
 delete mode 100644 inference/include/cpu/factory_cpu.hpp
 delete mode 100644 inference/include/cpu/fully_connected_cpu.hpp
 delete mode 100644 inference/include/cpu/layer_norm_cpu.hpp
 delete mode 100644 inference/include/cpu/matmul_cpu.hpp
 delete mode 100644 inference/include/cpu/pooling_cpu.hpp
 delete mode 100644 inference/include/cpu/reshape_cpu.hpp
 delete mode 100644 inference/include/cpu/scale_cpu.hpp
 delete mode 100644 inference/include/cpu/slice_cpu.hpp
 delete mode 100644 inference/include/cpu/softmax_cpu.hpp
 delete mode 100644 inference/include/cpu/squeeze_cpu.hpp
 delete mode 100644 inference/include/cpu/transpose_cpu.hpp
 delete mode 100644 inference/include/deconvolution.hpp
 delete mode 100644 inference/include/detection_output.hpp
 delete mode 100644 inference/include/embedding.hpp
 delete mode 100644 inference/include/factory.hpp
 delete mode 100644 inference/include/fully_connected.hpp
 delete mode 100644 inference/include/inference.hpp
 delete mode 100644 inference/include/lstm.hpp
 delete mode 100644 inference/include/lstmcell.hpp
 delete mode 100644 inference/include/matmul.hpp
 delete mode 100644 inference/include/memory.hpp
 delete mode 100644 inference/include/model.hpp
 delete mode 100644 inference/include/ocl/activation_ocl.hpp
 delete mode 100644 inference/include/ocl/bilateral_slice_apply_ocl.hpp
 delete mode 100644 inference/include/ocl/clip_ocl.hpp
 delete mode 100644 inference/include/ocl/concat_ocl.hpp
 delete mode 100644 inference/include/ocl/convolution_ocl.hpp
 delete mode 100644 inference/include/ocl/depth2space_ocl.hpp
 delete mode 100644 inference/include/ocl/eltwise_ocl.hpp
 delete mode 100644 inference/include/ocl/embedding_ocl.hpp
 delete mode 100644 inference/include/ocl/factory_ocl.hpp
 delete mode 100644 inference/include/ocl/fully_connected_ocl.hpp
 delete mode 100644 inference/include/ocl/layer_norm_ocl.hpp
 delete mode 100644 inference/include/ocl/matmul_ocl.hpp
 delete mode 100644 inference/include/ocl/memory_ocl.hpp
 delete mode 100644 inference/include/ocl/multiply_ocl.hpp
 delete mode 100644 inference/include/ocl/pooling_ocl.hpp
 delete mode 100644 inference/include/ocl/reshape_ocl.hpp
 delete mode 100644 inference/include/ocl/scale_ocl.hpp
 delete mode 100644 inference/include/ocl/slice_ocl.hpp
 delete mode 100644 inference/include/ocl/softmax_ocl.hpp
 delete mode 100644 inference/include/ocl/space2depth_ocl.hpp
 delete mode 100644 inference/include/ocl/squeeze_ocl.hpp
 delete mode 100644 inference/include/ocl/transpose_ocl.hpp
 delete mode 100644 inference/include/point_cast.hpp
 delete mode 100644 inference/include/pooling.hpp
 delete mode 100644 inference/include/preallocated_memory.hpp
 delete mode 100644 inference/include/prior_box.hpp
 delete mode 100644 inference/include/reduction.hpp
 delete mode 100644 inference/include/relative_position_embedding.hpp
 delete mode 100644 inference/include/reshape.hpp
 delete mode 100644 inference/include/resize.hpp
 delete mode 100644 inference/include/sequential.hpp
 delete mode 100644 inference/include/sequential_ocl.hpp
 delete mode 100644 inference/include/shared_weight.hpp
 delete mode 100644 inference/include/tensor.hpp
 delete mode 100644 inference/include/unsqueeze.hpp
 delete mode 100644 inference/include/utils.hpp
 delete mode 100644 inference/include/weight_operator.hpp
 delete mode 100644 inference/src/BoltModel_Jni.cpp
 delete mode 100644 inference/src/CMakeLists.txt
 delete mode 100644 inference/src/bolt.cpp
 delete mode 100644 inference/src/data_loader.cpp
 delete mode 100644 inference/src/result_format.cpp
 delete mode 100644 inference/src/utils.cpp
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo.xcodeproj/project.pbxproj
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo.xcodeproj/project.xcworkspace/contents.xcworkspacedata
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo.xcodeproj/project.xcworkspace/xcuserdata/aizhen.xcuserdatad/UserInterfaceState.xcuserstate
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo.xcodeproj/xcuserdata/aizhen.xcuserdatad/xcschemes/xcschememanagement.plist
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/AppDelegate.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/AppDelegate.m
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/Assets.xcassets/AppIcon.appiconset/Contents.json
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/Assets.xcassets/Contents.json
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/Base.lproj/LaunchScreen.storyboard
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/Base.lproj/Main.storyboard
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/Info.plist
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/SceneDelegate.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/SceneDelegate.m
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/ViewController.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/ViewController.mm
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/flow/flow.pb.h
 rename blas-enhance/src/cpu/arm/fp16/mmm.h => kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/kit_flags.h (70%)
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/arena.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/arenastring.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/descriptor.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/extension_set.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/generated_message_util.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/has_bits.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/io/coded_stream.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/io/strtod.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/io/zero_copy_stream.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/io/zero_copy_stream_impl.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/io/zero_copy_stream_impl_lite.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/message.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/message_lite.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/metadata.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/repeated_field.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomic_sequence_num.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_arm64_gcc.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_arm_gcc.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_arm_qnx.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_atomicword_compat.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_generic_gcc.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_macosx.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_mips_gcc.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_pnacl.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_power.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_ppc_gcc.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_solaris.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_tsan.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_x86_gcc.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/atomicops_internals_x86_msvc.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/callback.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/casts.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/common.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/fastmem.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/logging.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/macros.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/mutex.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/once.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/platform_macros.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/port.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/scoped_ptr.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/shared_ptr.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/stl_util.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/template_util.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/stubs/type_traits.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/text_format.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/protobuf/unknown_field_set.h
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/image_classification.prototxt
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemo/libbolt/imagenet_classes.txt
 rename blas-enhance/src/cpu/arm/int8/mvm.cpp => kit/iOS/image_classification/ImageClassificationDemo/main.m (64%)
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemoTests/ImageClassificationDemoTests.m
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemoTests/Info.plist
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemoUITests/ImageClassificationDemoUITests.m
 create mode 100644 kit/iOS/image_classification/ImageClassificationDemoUITests/Info.plist
 create mode 100644 kit/iOS/setup_lib_iOS.sh
 delete mode 100644 kits/CMakeLists.txt
 delete mode 100644 kits/automatic_speech_recognition/asr_rnnt.cpp
 delete mode 100644 kits/bert/tinybert.cpp
 delete mode 100644 kits/high_dynamic_range/hdr.cpp
 delete mode 100644 kits/image_classification/classification.cpp
 delete mode 100644 kits/image_classification/classification_bin.cpp
 delete mode 100644 kits/super_resolution/super_resolution.cpp
 delete mode 100644 model-tools/CMakeLists.txt
 delete mode 100644 model-tools/include/OPOptimizers/ChannelPaddingOptimizer.hpp
 delete mode 100644 model-tools/include/OPOptimizers/ConstUpsampleOptimizer.hpp
 delete mode 100644 model-tools/include/OPOptimizers/ConvActivationOptimizer.hpp
 delete mode 100644 model-tools/include/OPOptimizers/DeprecatedOPOptimizer.hpp
 delete mode 100644 model-tools/include/OPOptimizers/NoQuantLabelOptimizer.hpp
 delete mode 100644 model-tools/include/OPOptimizers/OPOptimizer.hpp
 delete mode 100644 model-tools/include/OPOptimizers/PadConvOptimizer.hpp
 delete mode 100644 model-tools/include/OPOptimizers/TransposeMulToScaleOptimizer.hpp
 delete mode 100644 model-tools/include/model_optimizer.hpp
 delete mode 100644 model-tools/include/model_serialize_deserialize.hpp
 delete mode 100644 model-tools/include/model_tools.h
 delete mode 100644 model-tools/src/CMakeLists.txt
 delete mode 100644 model-tools/src/caffe/CMakeLists.txt
 delete mode 100644 model-tools/src/caffe/caffe.proto
 delete mode 100644 model-tools/src/data_type_converter.cpp
 delete mode 100644 model-tools/src/model_adaptee.h
 delete mode 100644 model-tools/src/model_print.cpp
 delete mode 100644 model-tools/src/onnx/CMakeLists.txt
 delete mode 100644 model-tools/src/onnx/onnx.proto
 delete mode 100644 model-tools/src/onnx/onnx_adaptee.h
 delete mode 100644 model-tools/src/tflite/CMakeLists.txt
 delete mode 100644 model-tools/src/tflite/tflite_adaptee.h
 delete mode 100644 model-tools/tools/ms2bolt/CMakeLists.txt
 delete mode 100644 model-tools/tools/ms2bolt/fixedMs2bolt.cpp
 create mode 100644 model_tools/CMakeLists.txt
 create mode 100644 model_tools/include/OPOptimizers/ActivationOptimizer.hpp
 rename {model-tools => model_tools}/include/OPOptimizers/BNScaleOptimizer.hpp (61%)
 create mode 100644 model_tools/include/OPOptimizers/CastOptimizer.hpp
 create mode 100644 model_tools/include/OPOptimizers/ChannelPaddingOptimizer.hpp
 rename {model-tools => model_tools}/include/OPOptimizers/ClipClipOptimizer.hpp (51%)
 create mode 100644 model_tools/include/OPOptimizers/DeprecatedOPOptimizer.hpp
 rename {model-tools => model_tools}/include/OPOptimizers/DepthwisePointwiseOptimizer.hpp (54%)
 rename {model-tools => model_tools}/include/OPOptimizers/FCFCOptimizer.hpp (64%)
 create mode 100644 model_tools/include/OPOptimizers/GeluOptimizer.hpp
 rename {model-tools => model_tools}/include/OPOptimizers/InPlaceOptimizer.hpp (70%)
 create mode 100644 model_tools/include/OPOptimizers/InnerProductOptimizer.hpp
 rename model-tools/include/OPOptimizers/FlattenGemmOptimizer.hpp => model_tools/include/OPOptimizers/InvariantSliceOptimizer.hpp (53%)
 create mode 100644 model_tools/include/OPOptimizers/LayerNormOptimizer.hpp
 rename {model-tools => model_tools}/include/OPOptimizers/MemoryReuseOptimizer.hpp (76%)
 create mode 100644 model_tools/include/OPOptimizers/MultiHeadAttentionOptimizer.hpp
 create mode 100644 model_tools/include/OPOptimizers/NoQuantLabelOptimizer.hpp
 create mode 100644 model_tools/include/OPOptimizers/OPOptimizer.hpp
 create mode 100644 model_tools/include/OPOptimizers/PadOptimizer.hpp
 create mode 100644 model_tools/include/OPOptimizers/PowerOptimizer.hpp
 create mode 100644 model_tools/include/OPOptimizers/RNNOptimizer.hpp
 create mode 100644 model_tools/include/OPOptimizers/ShGaUnCoReOptimizer.hpp
 rename {model-tools => model_tools}/include/OPOptimizers/SqueezeReshapeOptimizer.hpp (71%)
 create mode 100644 model_tools/include/OPOptimizers/StdDeviationOptimizer.hpp
 rename {model-tools => model_tools}/include/OPOptimizers/TransposeMatMulToFCOptimizer.hpp (71%)
 create mode 100644 model_tools/include/OPOptimizers/TransposeMulToScaleOptimizer.hpp
 rename model-tools/include/OPOptimizers/ConvBNOptimizer.hpp => model_tools/include/OPOptimizers/WeightBNOptimizer.hpp (54%)
 rename model-tools/include/OPOptimizers/ConvScaleOptimizer.hpp => model_tools/include/OPOptimizers/WeightScaleOptimizer.hpp (51%)
 rename {model-tools => model_tools}/include/converter.h (64%)
 create mode 100644 model_tools/include/model_optimizer.hpp
 create mode 100644 model_tools/include/model_quantization.h
 rename image/include/image.h => model_tools/include/model_tools.h (63%)
 create mode 100644 model_tools/include/online_conversion.h
 create mode 100644 model_tools/src/CMakeLists.txt
 create mode 100644 model_tools/src/caffe/CMakeLists.txt
 rename {model-tools => model_tools}/src/caffe/caffe_adaptee.h (63%)
 rename {model-tools => model_tools}/src/caffe/caffe_wrapper.cpp (76%)
 create mode 100644 model_tools/src/data_type_converter.cpp
 create mode 100644 model_tools/src/model_adaptee.h
 create mode 100644 model_tools/src/model_quantization.cpp
 rename {model-tools => model_tools}/src/model_tools.cpp (63%)
 create mode 100644 model_tools/src/online_conversion.cpp
 create mode 100644 model_tools/src/onnx/CMakeLists.txt
 create mode 100644 model_tools/src/onnx/onnx_adaptee.h
 rename {model-tools => model_tools}/src/onnx/onnx_wrapper.cpp (81%)
 create mode 100644 model_tools/src/tensorflow/CMakeLists.txt
 create mode 100644 model_tools/src/tensorflow/tensorflow_adaptee.h
 create mode 100644 model_tools/src/tensorflow/tensorflow_wrapper.cpp
 create mode 100644 model_tools/src/tflite/CMakeLists.txt
 create mode 100644 model_tools/src/tflite/tflite_adaptee.h
 rename {model-tools => model_tools}/src/tflite/tflite_wrapper.cpp (76%)
 create mode 100644 model_tools/tools/CMakeLists.txt
 create mode 100644 model_tools/tools/X2bolt/X2bolt.cpp
 rename {model-tools => model_tools}/tools/pytorch2caffe/README.md (100%)
 rename {model-tools => model_tools}/tools/pytorch2caffe/lenet.py (100%)
 create mode 100644 model_tools/tools/quantization/post_training_quantization.cpp
 rename {model-tools => model_tools}/tools/tensorflow2caffe/Caffe/__init__.py (100%)
 rename {model-tools => model_tools}/tools/tensorflow2caffe/Caffe/caffe_net.py (97%)
 rename {model-tools => model_tools}/tools/tensorflow2caffe/Caffe/layer_parameter.py (91%)
 rename {model-tools => model_tools}/tools/tensorflow2caffe/README.md (97%)
 create mode 100644 model_tools/tools/tensorflow2caffe/asr/convolution_transformer_params.py
 create mode 100644 model_tools/tools/tensorflow2caffe/asr/convolution_transformer_params_v2.py
 create mode 100644 model_tools/tools/tensorflow2caffe/asr/tensorflow2caffe_convolution_transformer.py
 create mode 100644 model_tools/tools/tensorflow2caffe/asr/tensorflow2caffe_convolution_transformer_keras.py
 create mode 100644 model_tools/tools/tensorflow2caffe/asr/tensorflow2caffe_rnnt.py
 create mode 100644 model_tools/tools/tensorflow2caffe/asr/transform_convolution_transformer.py
 create mode 100644 model_tools/tools/tensorflow2caffe/asr/transform_convolution_transformer_keras.py
 create mode 100644 model_tools/tools/tensorflow2caffe/asr/transform_rnnt.py
 rename {model-tools => model_tools}/tools/tensorflow2caffe/bert/albert/tensorflow2caffe_albert.py (100%)
 rename {model-tools => model_tools}/tools/tensorflow2caffe/bert/albert/transform_albert.py (100%)
 rename {model-tools => model_tools}/tools/tensorflow2caffe/bert/tensorflow2caffe_bert.py (98%)
 rename {model-tools => model_tools}/tools/tensorflow2caffe/bert/tinybert/adb_run.sh (78%)
 rename {model-tools => model_tools}/tools/tensorflow2caffe/bert/tinybert/result.txt (100%)
 rename {model-tools => model_tools}/tools/tensorflow2caffe/bert/tinybert/sequence.seq (100%)
 rename {model-tools => model_tools}/tools/tensorflow2caffe/bert/tinybert/tensorflow2caffe_tinybert.py (76%)
 rename {model-tools => model_tools}/tools/tensorflow2caffe/bert/tinybert/tinybert-infer.py (100%)
 rename {model-tools => model_tools}/tools/tensorflow2caffe/bert/tinybert/tokenization.py (100%)
 rename {model-tools => model_tools}/tools/tensorflow2caffe/bert/tinybert/transform_bert.py (100%)
 rename {model-tools => model_tools}/tools/tensorflow2caffe/bert/tinybert/transform_tinybert_disambiguate.py (93%)
 rename {model-tools => model_tools}/tools/tensorflow2caffe/bert/tinybert/transform_tinybert_intent_slot.py (100%)
 rename {model-tools => model_tools}/tools/tensorflow2caffe/bert/tinybert/transform_tinybert_mrpc.py (100%)
 create mode 100644 model_tools/tools/tensorflow2caffe/bert/tinybert/transform_tinybert_tts_preprocess.py
 rename {model-tools => model_tools}/tools/tensorflow2caffe/bert/transform_bert.py (100%)
 create mode 100644 model_tools/tools/tensorflow2caffe/nmt/tensorflow2caffe_transformer_lstm.py
 create mode 100644 model_tools/tools/tensorflow2caffe/nmt/tensorflow2caffe_transformer_tsc.py
 create mode 100644 model_tools/tools/tensorflow2caffe/nmt/transform_transformer_lstm.py
 create mode 100644 model_tools/tools/tensorflow2caffe/nmt/transform_transformer_tsc.py
 rename {model-tools => model_tools}/tools/tensorflow2caffe/operators.py (87%)
 create mode 100644 model_tools/tools/tensorflow2caffe/punctuation/tensorflow2caffe_punctuation.py
 create mode 100644 model_tools/tools/tensorflow2caffe/punctuation/transform_punctuation.py
 rename {model-tools => model_tools}/tools/tensorflow2caffe/requirements.txt (100%)
 create mode 100644 model_tools/tools/tensorflow2caffe/rotation/tensorflow2caffe_rotation.py
 create mode 100644 model_tools/tools/tensorflow2caffe/rotation/transform_rotation.py
 rename {model-tools => model_tools}/tools/tensorflow2caffe/tensorflow2caffe.py (83%)
 create mode 100644 model_tools/tools/tensorflow2caffe/tts/tensorflow2caffe_tactron2.py
 create mode 100644 model_tools/tools/tensorflow2caffe/tts/transform_tactron2.py
 create mode 100644 model_tools/tools/tensorflow2json/tf2json.py
 delete mode 100644 scripts/params/alexnet_convolution.csv
 delete mode 100644 scripts/params/argmax.csv
 delete mode 100644 scripts/params/bnn_convolution.csv
 delete mode 100644 scripts/params/check.csv
 delete mode 100644 scripts/params/convolution.csv
 delete mode 100644 scripts/params/deconvolution.csv
 delete mode 100644 scripts/params/dilated_convolution.csv
 delete mode 100644 scripts/params/googlenet_convolution.csv
 delete mode 100644 scripts/params/lenet_convolution.csv
 delete mode 100644 scripts/params/mobilenetv1_depthwise_convolution.csv
 delete mode 100644 scripts/params/mobilenetv2_depthwise_convolution.csv
 delete mode 100644 scripts/params/mobilenetv3_convolution.csv
 delete mode 100644 scripts/params/mobilenetv3_depthwise_convolution.csv
 delete mode 100644 scripts/params/multiply.csv
 delete mode 100644 scripts/params/reduction.csv
 delete mode 100644 scripts/params/resnet50_convolution.csv
 rename quick_benchmark.sh => scripts/quick_benchmark.sh (65%)
 delete mode 100644 tensor_computing/include/tensor_computing.h
 delete mode 100644 tensor_computing/include/tensor_computing_type.h
 delete mode 100644 tensor_computing/src/activation.cpp
 delete mode 100644 tensor_computing/src/argmax.cpp
 delete mode 100644 tensor_computing/src/bilateral_slice_apply.cpp
 delete mode 100644 tensor_computing/src/clip.cpp
 delete mode 100644 tensor_computing/src/concat.cpp
 delete mode 100644 tensor_computing/src/convolution.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/activation.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/argmax.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/arm_functions.h
 delete mode 100644 tensor_computing/src/cpu/arm/bnn/convolution.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/bnn/convolution_dorefa.h
 delete mode 100644 tensor_computing/src/cpu/arm/bnn/convolution_dorefa_A55.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/bnn/convolution_dorefa_A76.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/bnn/convolution_xnor.h
 delete mode 100644 tensor_computing/src/cpu/arm/bnn/convolution_xnor_A55.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/bnn/convolution_xnor_A76.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/concat.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/convolution.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/deconvolution.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/depthwise_convolution.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/detectionoutput.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/eltwise.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp16/convolution.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp16/convolution_gemm.h
 delete mode 100644 tensor_computing/src/cpu/arm/fp16/convolution_gemm_A55.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp16/convolution_gemm_A76.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp16/convolution_gemm_icnchw.h
 delete mode 100644 tensor_computing/src/cpu/arm/fp16/convolution_gemm_icnchw_A55.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp16/convolution_gemm_icnchw_A76.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp16/convolution_winograd.h
 delete mode 100644 tensor_computing/src/cpu/arm/fp16/convolution_winograd_A55.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp16/convolution_winograd_A76.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp16/deconvolution.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp16/deconvolution_transform.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp16/depthwise_convolution.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp16/depthwise_convolution_direct.h
 delete mode 100644 tensor_computing/src/cpu/arm/fp16/depthwise_convolution_direct_A55.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp16/depthwise_convolution_direct_A76.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp16/depthwise_convolution_transform.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1.h
 delete mode 100644 tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct.h
 delete mode 100644 tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A55.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A76.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp16/detectionoutput.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp16/lstm.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp16/pooling.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp16/priorbox.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp16/tensor_computing_fp16.h
 delete mode 100644 tensor_computing/src/cpu/arm/fp32/convolution.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp32/convolution_gemm_V7.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp32/convolution_gemm_V8.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp32/convolution_gemm_icnchw_V8.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp32/deconvolution.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp32/deconvolution_transform.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp32/depthwise_convolution.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp32/depthwise_convolution.h
 delete mode 100644 tensor_computing/src/cpu/arm/fp32/depthwise_convolution_direct_V7.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp32/depthwise_convolution_direct_V8.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp32/depthwise_convolution_transform.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp32/depthwise_pointwise_convolution_direct_V7.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp32/depthwise_pointwise_convolution_direct_V8.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp32/detectionoutput.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp32/lstm.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp32/pooling.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp32/priorbox.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/fp32/tensor_computing_fp32.h
 delete mode 100644 tensor_computing/src/cpu/arm/int8/convolution.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/int8/convolution_gemm.h
 delete mode 100644 tensor_computing/src/cpu/arm/int8/convolution_winograd.h
 delete mode 100644 tensor_computing/src/cpu/arm/int8/convolution_winograd_A55.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/int8/convolution_winograd_A76.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/int8/depthwise_convolution_transform.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/int8/depthwise_pointwise_convolution_direct.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/int8/pooling.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/int8/tensor_computing_int8.h
 delete mode 100644 tensor_computing/src/cpu/arm/lstm.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/multiply.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/padding.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/pooling.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/reduction.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/reshape.cpp
 delete mode 100644 tensor_computing/src/cpu/arm/tensor_computing_arm.h
 delete mode 100644 tensor_computing/src/cpu/arm/transpose.cpp
 delete mode 100644 tensor_computing/src/cpu/general/activation.cpp
 delete mode 100644 tensor_computing/src/cpu/general/concat.cpp
 delete mode 100644 tensor_computing/src/cpu/general/convolution.cpp
 delete mode 100644 tensor_computing/src/cpu/general/deconvolution.cpp
 delete mode 100644 tensor_computing/src/cpu/general/depthwise_convolution.cpp
 delete mode 100644 tensor_computing/src/cpu/general/eltwise.cpp
 delete mode 100644 tensor_computing/src/cpu/general/general_functions.h
 delete mode 100644 tensor_computing/src/cpu/general/lstm.cpp
 delete mode 100644 tensor_computing/src/cpu/general/multiply.cpp
 delete mode 100644 tensor_computing/src/cpu/general/padding.cpp
 delete mode 100644 tensor_computing/src/cpu/general/priorbox.cpp
 delete mode 100644 tensor_computing/src/cpu/general/reduction.cpp
 delete mode 100644 tensor_computing/src/cpu/general/slice.cpp
 delete mode 100644 tensor_computing/src/cpu/general/tensor_computing_general.h
 delete mode 100644 tensor_computing/src/deconvolution.cpp
 delete mode 100644 tensor_computing/src/depth2space.cpp
 delete mode 100644 tensor_computing/src/depthwise_convolution.cpp
 delete mode 100644 tensor_computing/src/detectionoutput.cpp
 delete mode 100644 tensor_computing/src/eltwise.cpp
 delete mode 100644 tensor_computing/src/embedding.cpp
 delete mode 100644 tensor_computing/src/fully_connected.cpp
 delete mode 100644 tensor_computing/src/get_output.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/activation.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/bilateral_slice_apply.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/cl/activation.cl
 delete mode 100644 tensor_computing/src/gpu/mali/cl/concat.cl
 delete mode 100644 tensor_computing/src/gpu/mali/cl/conv_depthwise_trans_fltbuf.cl
 delete mode 100644 tensor_computing/src/gpu/mali/cl/conv_direct_s1_nchw_to_ncwhc4.cl
 delete mode 100644 tensor_computing/src/gpu/mali/cl/conv_direct_s2_nchw_to_ncwhc4.cl
 delete mode 100644 tensor_computing/src/gpu/mali/cl/conv_direct_spe_fwhs1.cl
 delete mode 100644 tensor_computing/src/gpu/mali/cl/conv_direct_trans_fltbuf.cl
 delete mode 100644 tensor_computing/src/gpu/mali/cl/conv_wino_trans_fltbuf_3x3.cl
 delete mode 100644 tensor_computing/src/gpu/mali/cl/conv_wino_trans_outbuf.cl
 delete mode 100644 tensor_computing/src/gpu/mali/cl/conv_wino_trans_picbuf.cl
 delete mode 100644 tensor_computing/src/gpu/mali/cl/depth2space.cl
 delete mode 100644 tensor_computing/src/gpu/mali/cl/eltwise.cl
 delete mode 100644 tensor_computing/src/gpu/mali/cl/fc_trans_fltbuf.cl
 delete mode 100644 tensor_computing/src/gpu/mali/cl/gemm_tn.cl
 delete mode 100644 tensor_computing/src/gpu/mali/cl/kernel_def.h
 delete mode 100644 tensor_computing/src/gpu/mali/cl/mem_trans_nchw_to_ncwhc4.cl
 delete mode 100644 tensor_computing/src/gpu/mali/cl/mem_trans_ncwhc4_to_nchw.cl
 delete mode 100644 tensor_computing/src/gpu/mali/cl/softmax.cl
 delete mode 100644 tensor_computing/src/gpu/mali/cl/softmax_nchw_w.cl
 delete mode 100644 tensor_computing/src/gpu/mali/clip.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/concat.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/convolution.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/depth2space.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/depthwise_convolution.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/eltwise.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/embedding.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/fp16/activation_mali_fp16.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/fp16/concat_mali_fp16.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/fp16/convolution_direct_mali_fp16.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/fp16/convolution_direct_mali_fp16.h
 delete mode 100644 tensor_computing/src/gpu/mali/fp16/convolution_direct_spe_ck_mali_fp16.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/fp16/convolution_direct_spe_ck_mali_fp16.h
 delete mode 100644 tensor_computing/src/gpu/mali/fp16/convolution_mali_fp16.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/fp16/convolution_mali_fp16.h
 delete mode 100644 tensor_computing/src/gpu/mali/fp16/convolution_wino_mali_fp16.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/fp16/convolution_wino_mali_fp16.h
 delete mode 100644 tensor_computing/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.h
 delete mode 100644 tensor_computing/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.h
 delete mode 100644 tensor_computing/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.h
 delete mode 100644 tensor_computing/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.h
 delete mode 100644 tensor_computing/src/gpu/mali/fp16/eltwise_mali_fp16.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/fp16/fully_connected_mali_fp16.h
 delete mode 100644 tensor_computing/src/gpu/mali/fp16/matmul_mali_fp16.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/fp16/matmul_mali_fp16.h
 delete mode 100644 tensor_computing/src/gpu/mali/fp16/multiply_mali_fp16.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/fp16/pooling_mali_fp16.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/fp16/reshape_mali_fp16.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/fp16/softmax_mali_fp16.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/fp16/transpose_mali_fp16.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/fully_connected.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/infer_gclmem_desc_mali.h
 delete mode 100644 tensor_computing/src/gpu/mali/matmul.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/multiply.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/pooling.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/reshape.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/softmax.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/space2depth.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/squeeze.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/tensor_computing_get_output.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/tensor_computing_mali.h
 delete mode 100644 tensor_computing/src/gpu/mali/tensor_computing_set_input.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/transpose.cpp
 delete mode 100644 tensor_computing/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.cpp
 delete mode 100644 tensor_computing/src/library_algorithm_search.cpp
 delete mode 100644 tensor_computing/src/lstm.cpp
 delete mode 100644 tensor_computing/src/matmul.cpp
 delete mode 100644 tensor_computing/src/multiply.cpp
 delete mode 100644 tensor_computing/src/normalization.cpp
 delete mode 100644 tensor_computing/src/padding.cpp
 delete mode 100644 tensor_computing/src/pooling.cpp
 delete mode 100644 tensor_computing/src/priorbox.cpp
 delete mode 100644 tensor_computing/src/reduction.cpp
 delete mode 100644 tensor_computing/src/reshape.cpp
 delete mode 100644 tensor_computing/src/scale.cpp
 delete mode 100644 tensor_computing/src/slice.cpp
 delete mode 100644 tensor_computing/src/softmax.cpp
 delete mode 100644 tensor_computing/src/space2depth.cpp
 delete mode 100644 tensor_computing/src/squeeze.cpp
 delete mode 100644 tensor_computing/src/transpose.cpp
 delete mode 100644 tests/CMakeLists.txt
 delete mode 100644 tests/test_activation.cpp
 delete mode 100644 tests/test_api_c.c
 delete mode 100644 tests/test_api_java.java
 delete mode 100644 tests/test_concat.cpp
 delete mode 100644 tests/test_convolution.cpp
 delete mode 100644 tests/test_convolution_bnn.cpp
 delete mode 100644 tests/test_convolution_int8.cpp
 delete mode 100644 tests/test_convolution_ocl.cpp
 delete mode 100644 tests/test_deconvolution.cpp
 delete mode 100644 tests/test_depthwise_convolution.cpp
 delete mode 100644 tests/test_depthwise_convolution_int8.cpp
 delete mode 100644 tests/test_depthwise_convolution_ocl.cpp
 delete mode 100644 tests/test_detectionoutput.cpp
 delete mode 100644 tests/test_dilated_convolution.cpp
 delete mode 100644 tests/test_fully_connected.cpp
 delete mode 100644 tests/test_fully_connected_ocl.cpp
 delete mode 100644 tests/test_lstm.cpp
 delete mode 100644 tests/test_multiply.cpp
 delete mode 100644 tests/test_padding.cpp
 delete mode 100644 tests/test_pipeline_ocl.cpp
 delete mode 100644 tests/test_pooling.cpp
 delete mode 100644 tests/test_pooling_int8.cpp
 delete mode 100644 tests/test_reduction.cpp
tests/test_reduction.cpp delete mode 100644 tools/CMakeLists.txt delete mode 100644 tools/caffe2bolt.cpp delete mode 100644 tools/onnx2bolt.cpp delete mode 100644 tools/ptq_calibration.cpp delete mode 100644 tools/tensor_computing_library_search.cpp delete mode 100644 tools/tflite2bolt.cpp delete mode 100644 uni/include/error.h delete mode 100644 uni/include/op_type.h delete mode 100644 uni/include/tensor_desc.h delete mode 100644 uni/include/thread_affinity.h delete mode 100644 uni/include/type.h diff --git a/CI_SCRIPTS/CPPLINT.cfg b/CI_SCRIPTS/CPPLINT.cfg new file mode 100644 index 00000000..db5b0f46 --- /dev/null +++ b/CI_SCRIPTS/CPPLINT.cfg @@ -0,0 +1 @@ +filter=-whitespace/line_length,-readability/casting,-whitespace/braces,-build/header_guard,-build/include_subdir,-runtime/explicit,-runtime/printf,-runtime/int,-whitespace/end_of_line,-readability/fn_size,-build/include_order,-build/include_what_you_use,-whitespace/indent diff --git a/CI_SCRIPTS/benchmark_verify.sh b/CI_SCRIPTS/benchmark_verify.sh new file mode 100644 index 00000000..37cfc79c --- /dev/null +++ b/CI_SCRIPTS/benchmark_verify.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +benchmark_verify() { + device=$1 + x2bolt_path=$2 + benchmark_path=$3 + model_zoo_directory=$4 + model_type=$5 + model_name=$6 + precision=$7 + affinity=$8 + loops=$9 + result=${10} + + model_directory=${model_zoo_directory}/${model_type}_models/${model_name} + if [[ "${precision}" == "FP32" ]]; then + precision_suffix="_f32" + fi + if [[ "${precision}" == "FP16" ]]; then + precision_suffix="_f16" + fi + if [[ "${precision}" == "INT8_Q" ]]; then + precision_suffix="_int8" + fi + model_convert_command="${x2bolt_path} -d ${model_directory} -m ${model_name} -i ${precision}" + benchmark_command="${benchmark_path} -m ${model_directory}/${model_name}${precision_suffix}.bolt -a ${affinity} -l ${loops}" + if [[ "${device}" == "host" ]]; then + ${model_convert_command} > /dev/null && ${benchmark_command} &> engine_result.txt + else + adb -s ${device} shell "${model_convert_command} && ${benchmark_command}" &> engine_result.txt + fi + + avg_time=$(grep -I "avg_time:" ./engine_result.txt) + verify_result=$(grep -I "${result}" ./engine_result.txt) + + rm -rf engine_result.txt + + if [[ ${#verify_result} -gt 0 ]] + then + echo "${model_name} on ${device} in ${precision} precision ${avg_time}" + else + echo "${model_name} on ${device} in ${precision} precision fail!" + exit 1 + fi +} diff --git a/CI_SCRIPTS/benchmark_verify_serial.sh b/CI_SCRIPTS/benchmark_verify_serial.sh new file mode 100644 index 00000000..386a6fec --- /dev/null +++ b/CI_SCRIPTS/benchmark_verify_serial.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +script_name=$0 +script_abs=$(readlink -f "$0") +script_dir=$(dirname $script_abs) + +source ${script_dir}/benchmark_verify.sh + +BOLT_ROOT=${script_dir}/..
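+# benchmark_verify positional arguments (see benchmark_verify.sh above): +# device x2bolt_path benchmark_path model_zoo_directory model_type model_name precision affinity loops expected_result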
+loops=6 +phone=$1 + +# arm gnu +arch=arm_gnu +x2bolt_path=/data/local/tmp/CI/${arch}/tools/X2bolt +benchmark_path=/data/local/tmp/CI/${arch}/bin/benchmark +model_zoo_directory=/data/local/tmp/CI/model_zoo +#benchmark_verify ${phone} ${x2bolt_path} ${benchmark_path} ${model_zoo_directory} tflite mbmelgan FP32 CPU_AFFINITY_HIGH_PERFORMANCE ${loops} '' + +# x86 gnu +arch=x86_gnu +x2bolt_path=${BOLT_ROOT}/install_${arch}/tools/X2bolt +benchmark_path=${BOLT_ROOT}/install_${arch}/examples/benchmark +model_zoo_directory=/data/bolt/model_zoo +benchmark_verify host ${x2bolt_path} ${benchmark_path} ${model_zoo_directory} tflite mbmelgan FP32 CPU_AFFINITY_HIGH_PERFORMANCE ${loops} '\-0.295808 0.563926 1.235842' diff --git a/CI_SCRIPTS/dir_cpplint.sh b/CI_SCRIPTS/dir_cpplint.sh new file mode 100644 index 00000000..b30b98e5 --- /dev/null +++ b/CI_SCRIPTS/dir_cpplint.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +script_name=$0 +script_abs=$(readlink -f "$0") +script_dir=$(dirname $script_abs) + +cp ${script_dir}/CPPLINT.cfg $1 +cd $1 +cpplint --recursive --extensions=cpp,h,hpp,cl . +rm CPPLINT.cfg +echo " " diff --git a/CI_SCRIPTS/format_code.sh b/CI_SCRIPTS/format_code.sh new file mode 100644 index 00000000..3961883f --- /dev/null +++ b/CI_SCRIPTS/format_code.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +script_name=$0 +script_abs=$(readlink -f "$0") +script_dir=$(dirname $script_abs) + +fileSuffix=(h hpp c cpp cl) + +cd ${script_dir}/../ +format() { + file=$1 + echo "format: $file" + #/data/opt/uncrustify-master/build/uncrustify -c /data/opt/uncrustify-master/forUncrustifySources.cfg -f $file > tmp.cpp + #sed -i "s/\/\/ /\/\//g" ./tmp.cpp + #sed -i "s/\/\//\/\/ /g" ./tmp.cpp + #clang-format -i tmp.cpp + #cp tmp.cpp $file + #rm tmp.cpp + clang-format -i $file +} + +format_all() { + dirs=(inference common model_tools compute kit) + for suffix in ${fileSuffix[*]} + do + for dir in ${dirs[*]} + do + for file in `find $dir -name "*.$suffix"` + do + format $file + done + done + done +} + +format_change() { + key=$1 + files=`git status | grep "${key}" | sed s/[[:space:]]//g | sed s/：/:/g | cut -d ":" -f 2` + for file in ${files[*]} + do + fresh=false + for suffix in ${fileSuffix[*]} + do + if [[ $file == *.${suffix} ]]; then + fresh=true + fi + done + if [[ $fresh == true ]]; then + format $file + fi + done +} + +format_change "modified:" +format_change "修改：" diff --git a/CI_SCRIPTS/genCommandLines.sh b/CI_SCRIPTS/genCommandLines.sh new file mode 100644 index 00000000..1a3dd1b8 --- /dev/null +++ b/CI_SCRIPTS/genCommandLines.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +fun_gen_in_two_arrs() { + rm -rf ./single_combinations.txt + #touch ./single_combinations.txt + local _firstArr=(`echo $1|cut -d " " --output-delimiter=" " -f 1-`) + local _firstArrLen=${#_firstArr[@]} + local _secondArr=(`echo $2|cut -d " " --output-delimiter=" " -f 1-`) + local _secondArrLen=${#_secondArr[@]} + index=0 + for ((i=0;i<_firstArrLen;i++)) + do + for ((j=0;j<_secondArrLen;j++)) + do + elem1=${_firstArr[$i]} + elem2=${_secondArr[$j]} + combine_str=$elem1"--"$elem2 + echo $combine_str >> ./single_combinations.txt + let index+=1 + done + done +} + +rm -rf ./final_combinations.txt +while read line +do + if [[ ${line} =~ ^#.* ]]; then + continue + fi + original_strs=() + original_index=0 + for i in $(echo $line| tr "&" "\n") + do + original_strs[$original_index]=$i + let original_index+=1 + done + + for i in "${!original_strs[@]}"; + do + sub_str=${original_strs[$i]} + if [ $i == 0 ] + then + rm -rf ./single_combinations.txt + for j in $(echo $sub_str| tr ";"
"\n") + do + echo $j >> ./single_combinations.txt + done + else + sub_firstArr=() + sub_firstIndex=0 + for line in `cat ./single_combinations.txt` + do + sub_firstArr[$sub_firstIndex]=$line + let sub_firstIndex+=1 + done + sub_secondArr=($(echo "$sub_str"| tr ";" "\n")) + fun_gen_in_two_arrs "$(echo ${sub_firstArr[@]})" "$(echo ${sub_secondArr[@]})" + fi + done + + cat ./single_combinations.txt >> ./final_combinations.txt +done < $1 +rm -rf ./single_combinations.txt diff --git a/CI_SCRIPTS/inference_big.txt b/CI_SCRIPTS/inference_big.txt new file mode 100644 index 00000000..81a7eecb --- /dev/null +++ b/CI_SCRIPTS/inference_big.txt @@ -0,0 +1,66 @@ +# ARMv8+ CPU GNU section +asr_convolution_transformer_joint_net&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/asr/asr_convolution_transformer/joint_net&@S+joint_net&@S+joint_net& +tinybert384&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16;int8&cpu&1&nlp/slot_intent&32+32+32& +tinybert&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16;int8&cpu&1&nlp/slot_intent&32+32+32& +tinybert_onnx&arm&onnx&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/slot_intent_onnx&32+32+32& +nmt&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/machine_translation&32+32+32& +asr_rnnt&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/asr/asr_rnnt&32+32+32& +asr_convolution_transformer_encoder&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/asr/asr_convolution_transformer/first_frame&@S+encoder&@S+encoder& +asr_convolution_transformer_encoder&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/asr/asr_convolution_transformer/second_frame&@S+encoder&@S+encoder& +asr_convolution_transformer_prediction_net&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/asr/asr_convolution_transformer/prediction_net&@S+prediction_net&@S+prediction_net& +tts_encoder_decoder&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/tts/encoder_decoder&@S+encoder_decoder&@S+encoder_decoder& +tts_postnet&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/tts/postnet&@S+postnet&@S+postnet& +tts_melgan_vocoder&arm&onnx&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/tts/melgan_vocoder&@S+melgan_vocoder&@S+melgan_vocoder&0 +ghostnet&arm&onnx&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR_SC_RAW+@s+1+@t+5+@c+151&2 +mobilenet_v1&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+0.017+@t+5+@c+151 +mobilenet_v2&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+0.017+@t+5+@c+151 +squeezenet&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16;int8&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+1+@t+5+@c+151 +mobilenet_v3&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+RGB+@s+0.017+@t+5+@c+151 +fingerprint_resnet18&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&cv/fingerprint&UNKNOWN& 
+resnet50&arm&caffe&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16;int8&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+1+@t+5+@c+151 +vad&arm&tflite&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&&&& +birealnet18&arm&onnx&ubuntu16_04&gnu&E5B0119506000260;GCL5T19822000030&A55;A76&fp16&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+RGB_SC+@s+1+@t+5+@c+151&0 +# ARMv7 CPU section +#asr_convolution_transformer_joint_net&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&nlp/asr/asr_convolution_transformer/joint_net&@S+joint_net&@S+joint_net& +tinybert384&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&nlp/slot_intent&32+32+32& +tinybert&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&nlp/slot_intent&32+32+32& +tinybert_onnx&arm&onnx&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&nlp/slot_intent_onnx&32+32+32& +nmt&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&nlp/machine_translation&32+32+32& +asr_rnnt&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&nlp/asr/asr_rnnt&32+32+32& +asr_convolution_transformer_encoder&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&nlp/asr/asr_convolution_transformer/first_frame&@S+encoder&@S+encoder& +asr_convolution_transformer_encoder&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&nlp/asr/asr_convolution_transformer/second_frame&@S+encoder&@S+encoder& +asr_convolution_transformer_prediction_net&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&nlp/asr/asr_convolution_transformer/prediction_net&@S+prediction_net&@S+prediction_net& +tts_encoder_decoder&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&nlp/tts/encoder_decoder&@S+encoder_decoder&@S+encoder_decoder& +tts_postnet&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&nlp/tts/postnet&@S+postnet&@S+postnet& +tts_melgan_vocoder&arm&onnx&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&nlp/tts/melgan_vocoder&@S+melgan_vocoder&@S+melgan_vocoder&0 +ghostnet&arm&onnx&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR_SC_RAW+@s+1+@t+5+@c+151&2 +mobilenet_v1&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+0.017+@t+5+@c+151 +mobilenet_v2&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+0.017+@t+5+@c+151 +squeezenet&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+1+@t+5+@c+151 +mobilenet_v3&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+RGB+@s+0.017+@t+5+@c+151 +fingerprint_resnet18&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&cv/fingerprint&UNKNOWN& +resnet50&arm&caffe&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+1+@t+5+@c+151 +vad&arm&tflite&ubuntu16_04&ndkv7&GCL5T19822000030&A76&fp32&cpu&&&& +# X86 CPU section +tinybert384&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/slot_intent&32+32+32& +tinybert&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/slot_intent&32+32+32& +tinybert_onnx&x86&onnx&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/slot_intent_onnx&32+32+32& +tinybert_disambiguate&arm&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/slot_intent&32+32+32& +nmt&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/machine_translation&32+32+32& 
+nmt_tsc_encoder&arm&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/machine_translation&32+32+32& +nmt_tsc_decoder&arm&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/machine_translation&32+32+32& +asr_rnnt&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/asr/asr_rnnt&32+32+32& +asr_convolution_transformer_encoder&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/asr/asr_convolution_transformer/first_frame&@S+encoder&@S+encoder& +asr_convolution_transformer_encoder&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/asr/asr_convolution_transformer/second_frame&@S+encoder&@S+encoder& +asr_convolution_transformer_prediction_net&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/asr/asr_convolution_transformer/prediction_net&@S+prediction_net&@S+prediction_net& +tts_encoder_decoder&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/tts/encoder_decoder&@S+encoder_decoder&@S+encoder_decoder& +tts_postnet&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/tts/postnet&@S+postnet&@S+postnet& +tts_melgan_vocoder&x86&onnx&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/tts/melgan_vocoder&@S+melgan_vocoder&@S+melgan_vocoder&0 +ghostnet&x86&onnx&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR_SC_RAW+@s+1+@t+5+@c+151&2 +mobilenet_v1&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+0.017+@t+5+@c+151 +mobilenet_v2&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+0.017+@t+5+@c+151 +squeezenet&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+1+@t+5+@c+151 +mobilenet_v3&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+RGB+@s+0.017+@t+5+@c+151 +fingerprint_resnet18&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&cv/fingerprint&UNKNOWN& +resnet50&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+1+@t+5+@c+151 +vad&x86&tflite&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&&&& diff --git a/CI_SCRIPTS/inference_serial.txt b/CI_SCRIPTS/inference_serial.txt new file mode 100644 index 00000000..a283f17e --- /dev/null +++ b/CI_SCRIPTS/inference_serial.txt @@ -0,0 +1,19 @@ +# X86 CPU section +tinybert384&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/slot_intent&32+32+32& +tinybert&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/slot_intent&32+32+32& +tinybert_onnx&x86&onnx&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/slot_intent_onnx&32+32+32& +nmt&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/machine_translation&32+32+32& +asr_rnnt&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/asr/asr_rnnt&32+32+32& +asr_convolution_transformer_encoder&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/asr/asr_convolution_transformer/first_frame&@S+encoder&@S+encoder& +asr_convolution_transformer_encoder&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/asr/asr_convolution_transformer/second_frame&@S+encoder&@S+encoder& +asr_convolution_transformer_prediction_net&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/asr/asr_convolution_transformer/prediction_net&@S+prediction_net&@S+prediction_net& +tts_encoder_decoder&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/tts/encoder_decoder&@S+encoder_decoder&@S+encoder_decoder& 
+tts_postnet&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/tts/postnet&@S+postnet&@S+postnet& +tts_melgan_vocoder&x86&onnx&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&nlp/tts/melgan_vocoder&@S+melgan_vocoder&@S+melgan_vocoder&0 +#mobilenet_v1&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+0.017+@t+5+@c+151 +#mobilenet_v2&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+0.017+@t+5+@c+151 +#squeezenet&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+1+@t+5+@c+151 +mobilenet_v3&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+RGB+@s+0.017+@t+5+@c+151 +fingerprint_resnet18&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&cv/fingerprint&UNKNOWN& +#resnet50&x86&caffe&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+1+@t+5+@c+151 +vad&x86&tflite&ubuntu16_04&gnu&x86_HOST&x86_HOST&fp32&cpu&&&& diff --git a/CI_SCRIPTS/inference_small.txt b/CI_SCRIPTS/inference_small.txt new file mode 100644 index 00000000..16e6abd6 --- /dev/null +++ b/CI_SCRIPTS/inference_small.txt @@ -0,0 +1,30 @@ +# ARMv8+ CPU LLVM section +tinybert384&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16;int8&cpu&1&nlp/slot_intent&32+32+32& +tinybert&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16;int8&cpu&1&nlp/slot_intent&32+32+32& +tinybert_onnx&arm&onnx&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/slot_intent_onnx&32+32+32& +tinybert_disambiguate&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A76&fp32;fp16&cpu&1&nlp/slot_intent&32+32+32& +nmt&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/machine_translation&32+32+32& +nmt_tsc_encoder&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A76&fp32;fp16&cpu&1&nlp/machine_translation&32+32+32& +nmt_tsc_decoder&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A76&fp32;fp16&cpu&1&nlp/machine_translation&32+32+32& +asr_rnnt&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/asr/asr_rnnt&32+32+32& +asr_convolution_transformer_encoder&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/asr/asr_convolution_transformer/first_frame&@S+encoder&@S+encoder& +asr_convolution_transformer_encoder&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/asr/asr_convolution_transformer/second_frame&@S+encoder&@S+encoder& +asr_convolution_transformer_prediction_net&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/asr/asr_convolution_transformer/prediction_net&@S+prediction_net&@S+prediction_net& +tts_encoder_decoder&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/tts/encoder_decoder&@S+encoder_decoder&@S+encoder_decoder& +tts_postnet&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/tts/postnet&@S+postnet&@S+postnet& +tts_melgan_vocoder&arm&onnx&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&nlp/tts/melgan_vocoder&@S+melgan_vocoder&@S+melgan_vocoder&0 +ghostnet&arm&onnx&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A76&fp32;fp16&cpu&1&cv/ILSVRC/n02085620&1*224*224*3&@f+BGR_SC_RAW+@s+1+@t+5+@c+151&2 
+mobilenet_v1&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+0.017+@t+5+@c+151 +mobilenet_v2&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+0.017+@t+5+@c+151 +squeezenet&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16;int8&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+1+@t+5+@c+151 +mobilenet_v3&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+RGB+@s+0.017+@t+5+@c+151 +fingerprint_resnet18&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&1&cv/fingerprint&UNKNOWN& +resnet50&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16;int8&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+1+@t+5+@c+151 +vad&arm&tflite&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp32;fp16&cpu&&&& +birealnet18&arm&onnx&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55;A76&fp16&cpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+RGB_SC+@s+1+@t+5+@c+151&0 +# ARMv8+ GPU LLVM section +mobilenet_v1&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55&fp16&gpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+0.017+@t+5+@c+151 +mobilenet_v2&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55&fp16&gpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+0.017+@t+5+@c+151 +mobilenet_v3&arm&caffe&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55&fp16&gpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+RGB+@s+0.017+@t+5+@c+151 +squeezenet&arm&caffe&ubuntu16_04&llvm&GCL5T19822000030&A55&fp16&gpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR+@s+1+@t+5+@c+151 +#ghostnet&arm&onnx&ubuntu16_04&llvm&E5B0119506000260;GCL5T19822000030&A55&fp16&gpu&1&cv/ILSVRC/n02085620&1*3*224*224&@f+BGR_SC_RAW+@s+1+@t+5+@c+151&3 diff --git a/CI_SCRIPTS/java_api_test.sh b/CI_SCRIPTS/java_api_test.sh new file mode 100644 index 00000000..600fd566 --- /dev/null +++ b/CI_SCRIPTS/java_api_test.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +device=$1 +script_name=$0 +script_abs=$(readlink -f "$0") +script_dir=$(dirname $script_abs) +BOLT_ROOT=${script_dir}/.. +if [ ${device} == "x86_HOST" ]; then + ci_dir=/data/bolt + build_dir=${BOLT_ROOT}/build_x86_gnu + install_dir=${BOLT_ROOT}/install_x86_gnu +else + ci_dir=/data/local/tmp/CI + build_dir=${BOLT_ROOT}/build_arm_llvm + install_dir=${BOLT_ROOT}/install_arm_llvm + device_dir=${ci_dir}/java +fi + +current_dir=${PWD} + +cd ${build_dir} +cp ${install_dir}/include/java/* . +cp ${BOLT_ROOT}/inference/examples/java_api/test_api_java.java . 
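+# compile the Java API classes (BoltResult, BoltModel) together with the example test program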
+javac BoltResult.java || exit 1 +javac BoltModel.java || exit 1 +javac test_api_java.java || exit 1 + +if [ ${device} != "x86_HOST" ]; then + dx --dex --output=test_java_api.jar *.class || exit 1 + adb -s ${device} shell rm -rf ${device_dir} + adb -s ${device} shell mkdir ${device_dir} || exit 1 + adb -s ${device} push ${install_dir}/lib/libBoltModel.so ${device_dir} > /dev/null || exit 1 + if [ -f "${install_dir}/lib/libkernelsource.so" ]; then + adb -s ${device} push ${install_dir}/lib/libkernelsource.so ${device_dir} > /dev/null || exit 1 + fi + if [ -f "${BOLT_ROOT}/third_party/arm_llvm/opencl/lib64/libc++_shared.so" ]; then + adb -s ${device} push ${BOLT_ROOT}/third_party/arm_llvm/opencl/lib64/libc++_shared.so ${device_dir} > /dev/null || exit 1 + fi + if [ -f "${install_dir}/lib/libOpenCL.so" ]; then + adb -s ${device} push ${install_dir}/lib/libOpenCL.so ${device_dir} > /dev/null || exit 1 + fi + adb -s ${device} push ./test_java_api.jar ${device_dir} > /dev/null || exit 1 + + adb -s ${device} shell "cd ${device_dir} && export LD_LIBRARY_PATH=/apex/com.android.runtime/lib64/bionic:/system/lib64 && dalvikvm -cp ./test_java_api.jar test_api_java ${device} ${ci_dir}" 2> status.txt +else + java test_api_java ${device} ${ci_dir} 2> status.txt +fi + +if [ "$?" != 0 ]; then + cat status.txt + if cat ./status.txt | grep "couldn't find an OpenCL implementation" > /dev/null + then + echo "GPU environment error" + else + exit 1 + fi +fi + +if [ ${device} != "x86_HOST" ]; then + adb -s ${device} shell rm -rf ${device_dir} +fi + +cd ${current_dir} diff --git a/CI_SCRIPTS/model_tools_test.sh b/CI_SCRIPTS/model_tools_test.sh new file mode 100644 index 00000000..0b488d4b --- /dev/null +++ b/CI_SCRIPTS/model_tools_test.sh @@ -0,0 +1,183 @@ +#!/bin/bash + +script_name=$0 +script_abs=$(readlink -f "$0") +script_dir=$(dirname $script_abs) + +host_bin_dir="" +host_lib_dir="" +exe_on_device=false +use_static_library=true +memory_reuse=true +remove=true +device="" +cpu_mask="2" +device_dir="" +model_zoo_dir="" + +print_help() { + cat <<EOF +Usage: ${script_name} [OPTION]... + -h, --help display this help and exit. + -b, --bin <path> run specified program in <path>. + -l, --lib <path> use dynamic library in <path>. + -d, --device <device_id> run test on device. + -c, --cpu_mask <mask> taskset cpu mask(default: 2). + -p, --path <path> run test on device in specified <path>. + -m, --model_zoo <path> use prepared models in model_zoo(<path>/[caffe|onnx|tflite]_models) + -r, --remove <true|false> remove device tmp directory or not +EOF + exit 1; +} + +TEMP=`getopt -o b:c:hl:d:p:r:m: --long bin:,cpu_mask:,help,lib:,device:,path:,remove:,model_zoo: \ + -n ${script_name} -- "$@"` +if [ $? != 0 ] ; then echo "[ERROR] terminating..."
>&2 ; exit 1 ; fi +eval set -- "$TEMP" +while true ; do + case "$1" in + -b|--bin) + host_bin_dir=$2 + echo "[INFO] run test in ${host_bin_dir}" ; + shift 2 ;; + -c|--cpu_mask) + cpu_mask=$2 + echo "[INFO] CPU mask ${cpu_mask}" ; + shift 2 ;; + -l|--lib) + host_lib_dir=$2 + use_static_library=false + echo "[INFO] use library in ${host_lib_dir}" ; + shift 2 ;; + -d|--device) + device=$2 + exe_on_device=true + echo "[INFO] test on device ${device}" ; + shift 2 ;; + -m|--model_zoo) + model_zoo_dir=$2 + echo "[INFO] use model_zoo ${model_zoo_dir}" ; + shift 2 ;; + -p|--path) + device_dir=$2 + echo "[INFO] test on device directory ${device_dir}" ; + shift 2 ;; + -r|--remove) + remove=$2 + echo "[INFO] clear tmp directory ${remove}" ; + shift 2;; + -h|--help) + print_help ; + shift ;; + --) shift ; + break ;; + *) echo "[ERROR] $1" ; exit 1 ;; + esac +done + +run_command() { + params=$* + + prefix="cd ${device_dir}/tmp" + if [[ ${memory_reuse} == true ]] ; then + prefix="$prefix && export BOLT_MEMORY_REUSE_OPTIMIZATION=ON" + else + prefix="$prefix && export BOLT_MEMORY_REUSE_OPTIMIZATION=OFF" + fi + if [[ ${exe_on_device} == true ]] ; then + if [[ ${use_static_library} == true ]] ; then + adb -s ${device} shell "$prefix && taskset ${cpu_mask} ./${params} || echo '[FAILURE]'" &> status.txt + else + adb -s ${device} shell "$prefix && export LD_LIBRARY_PATH=. && taskset ${cpu_mask} ./${params} || echo '[FAILURE]'" &> status.txt + fi + else + if [[ ${use_static_library} == true ]] ; then + $prefix && taskset ${cpu_mask} ${host_bin_dir}/${params} || echo '[FAILURE]' &> status.txt + else + export LD_LIBRARY_PATH=${host_lib_dir}:${LD_LIBRARY_PATH} && $prefix && taskset ${cpu_mask} ${host_bin_dir}/${params} || echo '[FAILURE]' &> status.txt + fi + fi + cat status.txt || exit 1 + if [ `grep -c "\[FAILURE\]" status.txt` -ne '0' ] ; then + exit 1 + fi + rm status.txt +} + +if [[ ${exe_on_device} == true ]] ; then + adb -s ${device} shell "mkdir ${device_dir}" + adb -s ${device} shell "rm -rf ${device_dir}/tmp" + adb -s ${device} shell "mkdir ${device_dir}/tmp" + adb -s ${device} shell "cp -r ${model_zoo_dir}/* ${device_dir}/tmp/" + adb -s ${device} shell "find ${device_dir}/tmp -name \"*\.bolt\" | xargs rm -rf" + if [[ ${use_static_library} != true ]] ; then + adb -s ${device} push ${host_lib_dir}/libuni.so ${device_dir}/tmp > /dev/null || exit 1 + adb -s ${device} push ${host_lib_dir}/libmodel_tools.so ${device_dir}/tmp > /dev/null || exit 1 + adb -s ${device} push ${host_lib_dir}/libmodel_tools_caffe.so ${device_dir}/tmp > /dev/null || exit 1 + adb -s ${device} push ${host_lib_dir}/libmodel_tools_onnx.so ${device_dir}/tmp > /dev/null || exit 1 + adb -s ${device} push ${host_lib_dir}/libmodel_tools_tflite.so ${device_dir}/tmp > /dev/null || exit 1 + bash ${script_dir}/../scripts/push_third_party.sh -l ${script_dir}/../third_party/arm_llvm -d ${device} -p ${device_dir}/tmp -c arm_llvm + fi + adb -s ${device} push ${host_bin_dir}/X2bolt ${device_dir}/tmp > /dev/null || exit 1 +else + mkdir ${host_bin_dir}/tmp + cp -r ${model_zoo_dir}/* ${host_bin_dir}/tmp/ +fi + +# caffe model +# INT8 +run_command X2bolt -d caffe_models/squeezenet -m squeezenet -i INT8_Q +run_command X2bolt -d caffe_models/tinybert384 -m tinybert384 -i INT8_Q +run_command X2bolt -d caffe_models/tinybert -m tinybert -i INT8_Q +# FP16 +run_command X2bolt -d caffe_models/mobilenet_v1 -m mobilenet_v1 -i FP16 +run_command X2bolt -d caffe_models/mobilenet_v2 -m mobilenet_v2 -i FP16 +run_command X2bolt -d caffe_models/mobilenet_v3 -m mobilenet_v3 
-i FP16 +run_command X2bolt -d caffe_models/resnet50 -m resnet50 -i FP16 +run_command X2bolt -d caffe_models/squeezenet -m squeezenet -i FP16 +run_command X2bolt -d caffe_models/fingerprint_resnet18 -m fingerprint_resnet18 -i FP16 +run_command X2bolt -d caffe_models/tinybert384 -m tinybert384 -i FP16 +run_command X2bolt -d caffe_models/tinybert -m tinybert -i FP16 +run_command X2bolt -d caffe_models/tinybert_disambiguate -m tinybert_disambiguate -i FP16 +run_command X2bolt -d caffe_models/nmt -m nmt -i FP16 +run_command X2bolt -d caffe_models/nmt_tsc_encoder -m nmt_tsc_encoder -i FP16 +run_command X2bolt -d caffe_models/nmt_tsc_decoder -m nmt_tsc_decoder -i FP16 +run_command X2bolt -d caffe_models/tts_encoder_decoder -m tts_encoder_decoder -i FP16 +run_command X2bolt -d caffe_models/asr_rnnt -m asr_rnnt -i FP16 +run_command X2bolt -d caffe_models/tts_postnet -m tts_postnet -i FP16 +# FP32 +run_command X2bolt -d caffe_models/mobilenet_v1 -m mobilenet_v1 -i FP32 +run_command X2bolt -d caffe_models/mobilenet_v2 -m mobilenet_v2 -i FP32 +run_command X2bolt -d caffe_models/mobilenet_v3 -m mobilenet_v3 -i FP32 +run_command X2bolt -d caffe_models/resnet50 -m resnet50 -i FP32 +run_command X2bolt -d caffe_models/squeezenet -m squeezenet -i FP32 +run_command X2bolt -d caffe_models/fingerprint_resnet18 -m fingerprint_resnet18 -i FP32 +run_command X2bolt -d caffe_models/tinybert384 -m tinybert384 -i FP32 +run_command X2bolt -d caffe_models/tinybert -m tinybert -i FP32 +run_command X2bolt -d caffe_models/tinybert_disambiguate -m tinybert_disambiguate -i FP32 +run_command X2bolt -d caffe_models/nmt -m nmt -i FP32 +run_command X2bolt -d caffe_models/nmt_tsc_encoder -m nmt_tsc_encoder -i FP32 +run_command X2bolt -d caffe_models/nmt_tsc_decoder -m nmt_tsc_decoder -i FP32 +run_command X2bolt -d caffe_models/tts_encoder_decoder -m tts_encoder_decoder -i FP32 +run_command X2bolt -d caffe_models/asr_rnnt -m asr_rnnt -i FP32 +run_command X2bolt -d caffe_models/tts_postnet -m tts_postnet -i FP32 + +# onnx model +# BNN +run_command X2bolt -d onnx_models/birealnet18 -m birealnet18 -i FP16 +run_command X2bolt -d onnx_models/birealnet18 -m birealnet18 -i FP32 +# FP16 +run_command X2bolt -d onnx_models/tts_melgan_vocoder -m tts_melgan_vocoder -i FP16 +# FP32 +run_command X2bolt -d onnx_models/tts_melgan_vocoder -m tts_melgan_vocoder -i FP32 + +if [[ ${remove} == true ]] ; then + if [[ ${exe_on_device} == true ]] ; then + adb -s ${device} shell rm -rf ${device_dir}/tmp + else + rm -rf ${host_bin_dir}/tmp + fi +fi diff --git a/scripts/operator_driver.sh b/CI_SCRIPTS/operator_driver.sh similarity index 73% rename from scripts/operator_driver.sh rename to CI_SCRIPTS/operator_driver.sh index 5cb0e137..9da4abd7 100644 --- a/scripts/operator_driver.sh +++ b/CI_SCRIPTS/operator_driver.sh @@ -81,41 +81,46 @@ if [ "${parameter_file_path}" == "" ] || [ ! -f ${parameter_file_path} ] ; then exit 1 fi -if [ ${exe_on_device} == true ] ; then +if [[ ${exe_on_device} == true ]] ; then exe_name=${exe_host_path##*/} exe_device_path="${device_dir}/${exe_name}" - adb -s ${device} push ${exe_host_path} ${exe_device_path} || exit 1 + adb -s ${device} push ${exe_host_path} ${exe_device_path} > /dev/null || exit 1 fi -while read params -do +commands=() +while read line; do + commands[${#commands[*]}]=`echo ${line}` +done < ${parameter_file_path} + +for((k=0;k<${#commands[@]};k++)){ + params=${commands[k]} # filter out the params that starts with '#' if [[ !
"$params" =~ ^#.* ]]; then params_len=${#params} if [[ $params_len -gt 0 ]]; then #echo " parameter: ${params}" - if [ ${exe_on_device} == true ] ; then - if [ ${use_static_library} == true ] ; then - adb -s ${device} shell "taskset ${cpu_mask} ${exe_device_path} ${params} || echo '[FAILURE]'" &> status.txt - else - adb -s ${device} shell "export LD_LIBRARY_PATH=${device_dir} && taskset ${cpu_mask} ${exe_device_path} ${params} || echo '[FAILURE]'" &> status.txt + if [[ ${exe_on_device} == true ]] ; then + library_reference="" + if [[ ${use_static_library} != true ]] ; then + library_reference="export LD_LIBRARY_PATH=${device_dir} &&" + fi + adb -s ${device} shell "${library_reference} taskset ${cpu_mask} ${exe_device_path} ${params} || echo '[FAILURE]'" &> status.txt + + cat status.txt || exit 1 + if [ `grep -c "\[FAILURE\]" status.txt` -ne '0' ] ; then + exit 1 fi + rm status.txt else - if [ ${use_static_library} == true ] ; then - ${exe_host_path} ${params} || echo '[FAILURE]' &> status.txt - else - export LD_LIBRARY_PATH=${exe_host_path}/../lib:${LD_LIBRARY_PATH} && ${exe_host_path} ${params} || echo '[FAILURE]' &> status.txt + if [[ ${use_static_library} != true ]] ; then + export LD_LIBRARY_PATH=${exe_host_path}/../lib:${LD_LIBRARY_PATH} fi + taskset ${cpu_mask} ${exe_host_path} ${params} || exit 1 fi - cat status.txt || exit 1 - if [ `grep -c "\[FAILURE\]" status.txt` -ne '0' ] ; then - exit 1 - fi - rm status.txt fi fi -done < ${parameter_file_path} +} -if [ ${exe_on_device} == true ] ; then +if [[ ${exe_on_device} == true ]] ; then adb -s ${device} shell "rm -rf ${exe_device_path}" fi diff --git a/scripts/operator_test.sh b/CI_SCRIPTS/operator_test.sh similarity index 73% rename from scripts/operator_test.sh rename to CI_SCRIPTS/operator_test.sh index 499529a1..685303ac 100644 --- a/scripts/operator_test.sh +++ b/CI_SCRIPTS/operator_test.sh @@ -69,37 +69,41 @@ done run_command() { params=" -c ${cpu_mask} -e $1 -i $2" - if [ ${exe_on_device} == true ] ; then + if [[ ${exe_on_device} == true ]] ; then params="${params} -p ${device_dir} -d ${device}" fi - if [ ${use_static_library} == true ] ; then + if [[ ${use_static_library} == true ]] ; then params="${params} -s ${use_static_library}" fi ${driver_script_path} ${params} || exit 1 } -if [ ${exe_on_device} == true ] ; then - status=`adb -s ${device} shell "ls ${device_dir} && echo 'success'" | tail -n 1` - if [ "${status}" == "success" ] ; then - if [ ${use_static_library} != true ] ; then - adb -s ${device} push ${host_lib_dir}/libblas-enhance.so ${device_dir} - adb -s ${device} push ${host_lib_dir}/libtensor_computing.so ${device_dir} +if [[ ${exe_on_device} == true ]] ; then + adb -s ${device} shell "mkdir ${device_dir}" + if [[ ${use_static_library} != true ]] ; then + adb -s ${device} push ${host_lib_dir}/libuni.so ${device_dir} > /dev/null || exit 1 + adb -s ${device} push ${host_lib_dir}/libblas_enhance.so ${device_dir} > /dev/null || exit 1 + adb -s ${device} push ${host_lib_dir}/libtensor.so ${device_dir} > /dev/null || exit 1 + if [[ -f ${host_lib_dir}/libgcl.so ]] ; then + adb -s ${device} push ${host_lib_dir}/libgcl.so ${device_dir} > /dev/null || exit 1 + fi + if [[ -f ${host_lib_dir}/libkernelsource.so ]] ; then + adb -s ${device} push ${host_lib_dir}/libkernelsource.so ${device_dir} > /dev/null || exit 1 fi - else - adb -s ${device} shell "mkdir ${device_dir}" fi fi # FP32 & FP16 operator test -# blas-enhance +# blas_enhance run_command ${host_bin_dir}/test_mmm ${script_dir}/params/mmm.csv run_command 
${host_bin_dir}/test_mvm ${script_dir}/params/mvm.csv # tensor_computing run_command ${host_bin_dir}/test_activation ${script_dir}/params/activation.csv +run_command ${host_bin_dir}/test_argmax ${script_dir}/params/argmax.csv run_command ${host_bin_dir}/test_attention ${script_dir}/params/attention.csv -run_command ${host_bin_dir}/test_reduction ${script_dir}/params/reduction.csv +run_command ${host_bin_dir}/test_check ${script_dir}/params/check.csv run_command ${host_bin_dir}/test_clip ${script_dir}/params/clip.csv run_command ${host_bin_dir}/test_concat ${script_dir}/params/concat.csv run_command ${host_bin_dir}/test_convolution ${script_dir}/params/convolution.csv @@ -111,40 +115,46 @@ run_command ${host_bin_dir}/test_depthwise_convolution ${script_dir}/params/mobi run_command ${host_bin_dir}/test_depthwise_convolution ${script_dir}/params/mobilenetv2_depthwise_convolution.csv run_command ${host_bin_dir}/test_depthwise_convolution ${script_dir}/params/mobilenetv3_depthwise_convolution.csv run_command ${host_bin_dir}/test_dilated_convolution ${script_dir}/params/dilated_convolution.csv +run_command ${host_bin_dir}/test_detectionoutput ${script_dir}/params/detectionoutput.csv run_command ${host_bin_dir}/test_eltwise ${script_dir}/params/eltwise.csv run_command ${host_bin_dir}/test_fully_connected ${script_dir}/params/lenet_fully_connected.csv -run_command ${host_bin_dir}/test_lstm ${script_dir}/params/lstm.csv -run_command ${host_bin_dir}/test_multiply ${script_dir}/params/multiply.csv +run_command ${host_bin_dir}/test_l2normalization ${script_dir}/params/l2normalization.csv +run_command ${host_bin_dir}/test_non_max_suppression ${script_dir}/params/non_max_suppression.csv +run_command ${host_bin_dir}/test_padding ${script_dir}/params/padding.csv +run_command ${host_bin_dir}/test_prelu ${script_dir}/params/prelu.csv +run_command ${host_bin_dir}/test_power ${script_dir}/params/power.csv run_command ${host_bin_dir}/test_pooling ${script_dir}/params/pooling.csv +run_command ${host_bin_dir}/test_pooling_bp ${script_dir}/params/pooling_bp.csv +run_command ${host_bin_dir}/test_priorbox ${script_dir}/params/priorbox.csv run_command ${host_bin_dir}/test_reshape ${script_dir}/params/reshape.csv -run_command ${host_bin_dir}/test_softmax ${script_dir}/params/softmax.csv -run_command ${host_bin_dir}/test_split ${script_dir}/params/split.csv -run_command ${host_bin_dir}/test_slice ${script_dir}/params/slice.csv +run_command ${host_bin_dir}/test_reduction ${script_dir}/params/reduction.csv +run_command ${host_bin_dir}/test_roialign ${script_dir}/params/roialign.csv +run_command ${host_bin_dir}/test_rnn ${script_dir}/params/rnn.csv run_command ${host_bin_dir}/test_scale ${script_dir}/params/scale.csv +run_command ${host_bin_dir}/test_slice ${script_dir}/params/slice.csv +run_command ${host_bin_dir}/test_split ${script_dir}/params/split.csv +run_command ${host_bin_dir}/test_softmax ${script_dir}/params/softmax.csv run_command ${host_bin_dir}/test_transpose ${script_dir}/params/transpose.csv +run_command ${host_bin_dir}/test_tile ${script_dir}/params/tile.csv # INT8 operator test -# blas-enhance +# blas_enhance run_command ${host_bin_dir}/test_mmm_int8 ${script_dir}/params/mmm.csv run_command ${host_bin_dir}/test_mvm_int8 ${script_dir}/params/mvm.csv # tensor_computing run_command ${host_bin_dir}/test_concat_int8 ${script_dir}/params/concat.csv -run_command ${host_bin_dir}/test_pooling_int8 ${script_dir}/params/pooling.csv run_command ${host_bin_dir}/test_convolution_int8
${script_dir}/params/alexnet_convolution.csv run_command ${host_bin_dir}/test_convolution_int8 ${script_dir}/params/googlenet_convolution.csv run_command ${host_bin_dir}/test_convolution_int8 ${script_dir}/params/resnet50_convolution.csv run_command ${host_bin_dir}/test_depthwise_convolution_int8 ${script_dir}/params/mobilenetv1_depthwise_convolution.csv run_command ${host_bin_dir}/test_depthwise_convolution_int8 ${script_dir}/params/mobilenetv2_depthwise_convolution.csv run_command ${host_bin_dir}/test_depthwise_convolution_int8 ${script_dir}/params/mobilenetv3_depthwise_convolution.csv +run_command ${host_bin_dir}/test_pooling_int8 ${script_dir}/params/pooling.csv # BNN operator test run_command ${host_bin_dir}/test_convolution_bnn ${script_dir}/params/bnn_convolution.csv - -if [ ${exe_on_device} == true ] ; then - if [ ${use_static_library} != true ] ; then - adb -s ${device} shell "rm -rf ${device_dir}/libblas-enhance.so" - adb -s ${device} shell "rm -rf ${device_dir}/libtensor_computing.so" - fi +if [[ ${exe_on_device} == true ]] ; then + adb -s ${device} shell "rm -rf ${device_dir}" fi diff --git a/scripts/params/activation.csv b/CI_SCRIPTS/params/activation.csv similarity index 100% rename from scripts/params/activation.csv rename to CI_SCRIPTS/params/activation.csv diff --git a/CI_SCRIPTS/params/alexnet_convolution.csv b/CI_SCRIPTS/params/alexnet_convolution.csv new file mode 100644 index 00000000..c20a9132 --- /dev/null +++ b/CI_SCRIPTS/params/alexnet_convolution.csv @@ -0,0 +1,6 @@ +#in_n in_c in_h in_w f_n f_c f_h f_w group stride padding out_n out_c out_h out_w +1 3 227 227 96 3 11 11 1 4 0 1 96 55 55 +1 96 27 27 256 96 5 5 1 2 0 1 256 13 13 +1 256 13 13 384 256 3 3 1 1 1 1 384 13 13 +1 384 13 13 384 384 3 3 1 1 1 1 384 13 13 +1 384 13 13 256 384 3 3 1 1 1 1 256 13 13 diff --git a/CI_SCRIPTS/params/argmax.csv b/CI_SCRIPTS/params/argmax.csv new file mode 100644 index 00000000..9b68c143 --- /dev/null +++ b/CI_SCRIPTS/params/argmax.csv @@ -0,0 +1,5 @@ +#in ic ih iw axis +1 64 24 24 -1 +1 8 100 100 1 +1 8 100 100 2 +1 8 100 100 3 diff --git a/scripts/params/attention.csv b/CI_SCRIPTS/params/attention.csv similarity index 100% rename from scripts/params/attention.csv rename to CI_SCRIPTS/params/attention.csv diff --git a/CI_SCRIPTS/params/bnn_convolution.csv b/CI_SCRIPTS/params/bnn_convolution.csv new file mode 100644 index 00000000..fb8445f6 --- /dev/null +++ b/CI_SCRIPTS/params/bnn_convolution.csv @@ -0,0 +1,53 @@ +#in_n in_c in_h in_w f_n f_c f_h f_w group stride padding out_n out_c out_h out_w +1 64 56 56 256 64 1 1 1 1 0 1 256 56 56 +1 64 56 56 64 64 1 1 1 1 0 1 64 56 56 +1 64 56 56 64 64 3 3 1 1 1 1 64 56 56 +1 64 56 56 256 64 1 1 1 1 0 1 256 56 56 +1 256 56 56 64 256 1 1 1 1 0 1 64 56 56 +1 64 56 56 64 64 3 3 1 1 1 1 64 56 56 +1 64 56 56 256 64 1 1 1 1 0 1 256 56 56 +1 256 56 56 64 256 1 1 1 1 0 1 64 56 56 +1 64 56 56 64 64 3 3 1 1 1 1 64 56 56 +1 64 56 56 256 64 1 1 1 1 0 1 256 56 56 +1 256 56 56 512 256 1 1 1 2 0 1 512 28 28 +1 256 56 56 128 256 1 1 1 2 0 1 128 28 28 +1 128 28 28 128 128 3 3 1 1 1 1 128 28 28 +1 128 28 28 512 128 1 1 1 1 0 1 512 28 28 +1 512 28 28 128 512 1 1 1 1 0 1 128 28 28 +1 128 28 28 128 128 3 3 1 1 1 1 128 28 28 +1 128 28 28 512 128 1 1 1 1 0 1 512 28 28 +1 512 28 28 128 512 1 1 1 1 0 1 128 28 28 +1 128 28 28 128 128 3 3 1 1 1 1 128 28 28 +1 128 28 28 512 128 1 1 1 1 0 1 512 28 28 +1 512 28 28 128 512 1 1 1 1 0 1 128 28 28 +1 128 28 28 128 128 3 3 1 1 1 1 128 28 28 +1 128 28 28 512 128 1 1 1 1 0 1 512 28 28 +1 512 28 28 1024 512 1 1 1 2 0 1 1024 14 
14 +1 512 28 28 256 512 1 1 1 2 0 1 256 14 14 +1 256 14 14 256 256 3 3 1 1 1 1 256 14 14 +1 256 14 14 1024 256 1 1 1 1 0 1 1024 14 14 +1 1024 14 14 256 1024 1 1 1 1 0 1 256 14 14 +1 256 14 14 256 256 3 3 1 1 1 1 256 14 14 +1 256 14 14 1024 256 1 1 1 1 0 1 1024 14 14 +1 1024 14 14 256 1024 1 1 1 1 0 1 256 14 14 +1 256 14 14 256 256 3 3 1 1 1 1 256 14 14 +1 256 14 14 1024 256 1 1 1 1 0 1 1024 14 14 +1 1024 14 14 256 1024 1 1 1 1 0 1 256 14 14 +1 256 14 14 256 256 3 3 1 1 1 1 256 14 14 +1 256 14 14 1024 256 1 1 1 1 0 1 1024 14 14 +1 1024 14 14 256 1024 1 1 1 1 0 1 256 14 14 +1 256 14 14 256 256 3 3 1 1 1 1 256 14 14 +1 256 14 14 1024 256 1 1 1 1 0 1 1024 14 14 +1 1024 14 14 256 1024 1 1 1 1 0 1 256 14 14 +1 256 14 14 256 256 3 3 1 1 1 1 256 14 14 +1 256 14 14 1024 256 1 1 1 1 0 1 1024 14 14 +1 1024 14 14 2048 1024 1 1 1 2 0 1 2048 7 7 +1 1024 14 14 512 1024 1 1 1 2 0 1 512 7 7 +1 512 7 7 512 512 3 3 1 1 1 1 512 7 7 +1 512 7 7 2048 512 1 1 1 1 0 1 2048 7 7 +1 2048 7 7 512 2048 1 1 1 1 0 1 512 7 7 +1 512 7 7 512 512 3 3 1 1 1 1 512 7 7 +1 512 7 7 2048 512 1 1 1 1 0 1 2048 7 7 +1 2048 7 7 512 2048 1 1 1 1 0 1 512 7 7 +1 512 7 7 512 512 3 3 1 1 1 1 512 7 7 +1 512 7 7 2048 512 1 1 1 1 0 1 2048 7 7 diff --git a/CI_SCRIPTS/params/check.csv b/CI_SCRIPTS/params/check.csv new file mode 100644 index 00000000..b0c30bf5 --- /dev/null +++ b/CI_SCRIPTS/params/check.csv @@ -0,0 +1,3 @@ +#in ic ih iw +1 64 24 24 +1 8 100 100 diff --git a/scripts/params/clip.csv b/CI_SCRIPTS/params/clip.csv similarity index 100% rename from scripts/params/clip.csv rename to CI_SCRIPTS/params/clip.csv diff --git a/scripts/params/concat.csv b/CI_SCRIPTS/params/concat.csv similarity index 70% rename from scripts/params/concat.csv rename to CI_SCRIPTS/params/concat.csv index d2167561..b6260c91 100644 --- a/scripts/params/concat.csv +++ b/CI_SCRIPTS/params/concat.csv @@ -1,3 +1,3 @@ #num axis [in ic ih iw]* on oc oh ow 2 1 1 8 16 16 1 16 16 16 1 24 16 16 -2 1 1 3 7 7 1 16 7 7 1 19 7 7 +2 1 1 16 7 7 1 16 7 7 1 32 7 7 diff --git a/CI_SCRIPTS/params/convolution.csv b/CI_SCRIPTS/params/convolution.csv new file mode 100644 index 00000000..37b60801 --- /dev/null +++ b/CI_SCRIPTS/params/convolution.csv @@ -0,0 +1,17 @@ +#in_n in_c in_h in_w f_n f_c f_h f_w group stride padding out_n out_c out_h out_w +1 1 227 227 96 1 11 11 1 4 0 1 96 55 55 +1 2 227 227 96 2 11 11 1 4 0 1 96 55 55 +1 3 227 227 96 3 11 11 1 4 0 1 96 55 55 +1 4 227 227 96 4 11 11 1 4 0 1 96 55 55 +1 5 227 227 96 5 11 11 1 4 0 1 96 55 55 +1 6 227 227 96 6 11 11 1 4 0 1 96 55 55 +1 7 227 227 96 7 11 11 1 4 0 1 96 55 55 +1 8 227 227 96 8 11 11 1 4 0 1 96 55 55 +1 8 11 11 96 4 11 11 2 1 0 1 96 1 1 +1 8 227 227 96 4 11 11 2 4 0 1 96 55 55 +1 9 227 227 96 3 11 11 3 4 0 1 96 55 55 +1 16 227 227 96 8 11 11 2 4 0 1 96 55 55 +1 4 227 227 96 2 3 3 2 1 1 1 96 227 227 +1 8 227 227 96 4 3 3 2 1 1 1 96 227 227 +1 16 227 227 96 8 3 3 2 1 1 1 96 227 227 +1 32 227 227 96 16 3 3 2 1 1 1 96 227 227 diff --git a/CI_SCRIPTS/params/deconvolution.csv b/CI_SCRIPTS/params/deconvolution.csv new file mode 100644 index 00000000..f380ab68 --- /dev/null +++ b/CI_SCRIPTS/params/deconvolution.csv @@ -0,0 +1,12 @@ +#in_n in_c in_h in_w f_n f_c f_h f_w group stride padding out_n out_c out_h out_w +1 8 132 132 8 8 16 16 1 8 4 1 8 1056 1056 +1 8 4 4 8 8 2 2 1 2 0 1 8 8 8 +1 8 4 4 8 8 4 4 1 2 1 1 8 8 8 +1 8 8 8 8 8 16 16 1 8 4 1 8 64 64 +1 8 32 32 8 8 16 16 1 8 4 1 8 256 256 +1 8 4 4 8 8 2 2 1 2 0 1 8 8 8 +1 128 32 32 128 128 2 2 1 2 0 1 128 64 64 +1 128 3 3 128 128 3 3 1 2 1 1 128 5 5 +1 128 6 6 128 128 3 3 1 
3 0 1 128 18 18 +1 64 8 8 1 64 8 8 64 4 2 1 64 32 32 +1 64 16 16 1 64 4 4 64 2 1 1 64 32 32 diff --git a/CI_SCRIPTS/params/detectionoutput.csv b/CI_SCRIPTS/params/detectionoutput.csv new file mode 100644 index 00000000..bba7e1b2 --- /dev/null +++ b/CI_SCRIPTS/params/detectionoutput.csv @@ -0,0 +1,2 @@ +#ih0 iw0 ih1 iw1 in2 ic2 ilens2 oh ow num_class +1 144 1 756 1 2 144 201 6 2 diff --git a/CI_SCRIPTS/params/dilated_convolution.csv b/CI_SCRIPTS/params/dilated_convolution.csv new file mode 100644 index 00000000..b7efe509 --- /dev/null +++ b/CI_SCRIPTS/params/dilated_convolution.csv @@ -0,0 +1,5 @@ +#in_n in_c in_h in_w f_n f_c f_h f_w group stride padding rate out_n out_c out_h out_w +1 96 27 27 256 96 5 5 1 2 0 2 1 256 10 10 +1 256 13 13 384 256 3 3 1 1 1 2 1 384 11 11 +1 384 13 13 384 384 3 3 1 1 1 3 1 384 9 9 +1 384 13 13 256 384 3 3 1 1 1 4 1 256 7 7 diff --git a/scripts/params/eltwise.csv b/CI_SCRIPTS/params/eltwise.csv similarity index 100% rename from scripts/params/eltwise.csv rename to CI_SCRIPTS/params/eltwise.csv diff --git a/CI_SCRIPTS/params/googlenet_convolution.csv b/CI_SCRIPTS/params/googlenet_convolution.csv new file mode 100644 index 00000000..d3e0a317 --- /dev/null +++ b/CI_SCRIPTS/params/googlenet_convolution.csv @@ -0,0 +1,58 @@ +#in_n in_c in_h in_w f_n f_c f_h f_w group stride padding out_n out_c out_h out_w +1 3 224 224 64 3 7 7 1 2 3 1 64 112 112 +1 64 56 56 64 64 1 1 1 1 0 1 64 56 56 +1 64 56 56 192 64 3 3 1 1 1 1 192 56 56 +1 192 28 28 64 192 1 1 1 1 0 1 64 28 28 +1 192 28 28 96 192 1 1 1 1 0 1 96 28 28 +1 96 28 28 128 96 3 3 1 1 1 1 128 28 28 +1 192 28 28 16 192 1 1 1 1 0 1 16 28 28 +1 16 28 28 32 16 5 5 1 1 2 1 32 28 28 +1 192 28 28 32 192 1 1 1 1 0 1 32 28 28 +1 256 28 28 128 256 1 1 1 1 0 1 128 28 28 +1 256 28 28 128 256 1 1 1 1 0 1 128 28 28 +1 128 28 28 192 128 3 3 1 1 1 1 192 28 28 +1 256 28 28 32 256 1 1 1 1 0 1 32 28 28 +1 32 28 28 96 32 5 5 1 1 2 1 96 28 28 +1 256 28 28 64 256 1 1 1 1 0 1 64 28 28 +1 480 14 14 192 480 1 1 1 1 0 1 192 14 14 +1 480 14 14 96 480 1 1 1 1 0 1 96 14 14 +1 96 14 14 208 96 3 3 1 1 1 1 208 14 14 +1 480 14 14 16 480 1 1 1 1 0 1 16 14 14 +1 16 14 14 48 16 5 5 1 1 2 1 48 14 14 +1 480 14 14 64 480 1 1 1 1 0 1 64 14 14 +1 512 14 14 160 512 1 1 1 1 0 1 160 14 14 +1 512 14 14 112 512 1 1 1 1 0 1 112 14 14 +1 112 14 14 224 112 3 3 1 1 1 1 224 14 14 +1 512 14 14 24 512 1 1 1 1 0 1 24 14 14 +1 24 14 14 64 24 5 5 1 1 2 1 64 14 14 +1 512 14 14 64 512 1 1 1 1 0 1 64 14 14 +1 512 14 14 128 512 1 1 1 1 0 1 128 14 14 +1 512 14 14 128 512 1 1 1 1 0 1 128 14 14 +1 128 14 14 256 128 3 3 1 1 1 1 256 14 14 +1 512 14 14 24 512 1 1 1 1 0 1 24 14 14 +1 24 14 14 64 24 5 5 1 1 2 1 64 14 14 +1 512 14 14 64 512 1 1 1 1 0 1 64 14 14 +1 512 14 14 112 512 1 1 1 1 0 1 112 14 14 +1 512 14 14 144 512 1 1 1 1 0 1 144 14 14 +1 144 14 14 288 144 3 3 1 1 1 1 288 14 14 +1 512 14 14 32 512 1 1 1 1 0 1 32 14 14 +1 32 14 14 64 32 5 5 1 1 2 1 64 14 14 +1 512 14 14 64 512 1 1 1 1 0 1 64 14 14 +1 528 14 14 256 528 1 1 1 1 0 1 256 14 14 +1 528 14 14 160 528 1 1 1 1 0 1 160 14 14 +1 160 14 14 320 160 3 3 1 1 1 1 320 14 14 +1 528 14 14 32 528 1 1 1 1 0 1 32 14 14 +1 32 14 14 128 32 5 5 1 1 2 1 128 14 14 +1 528 14 14 128 528 1 1 1 1 0 1 128 14 14 +1 832 7 7 256 832 1 1 1 1 0 1 256 7 7 +1 832 7 7 160 832 1 1 1 1 0 1 160 7 7 +1 160 7 7 320 160 3 3 1 1 1 1 320 7 7 +1 832 7 7 32 832 1 1 1 1 0 1 32 7 7 +1 32 7 7 128 32 5 5 1 1 2 1 128 7 7 +1 832 7 7 128 832 1 1 1 1 0 1 128 7 7 +1 832 7 7 384 832 1 1 1 1 0 1 384 7 7 +1 832 7 7 192 832 1 1 1 1 0 1 192 7 7 +1 192 7 7 384 192 3 3 1 1 1 
1 384 7 7 +1 832 7 7 48 832 1 1 1 1 0 1 48 7 7 +1 48 7 7 128 48 5 5 1 1 2 1 128 7 7 +1 832 7 7 128 832 1 1 1 1 0 1 128 7 7 diff --git a/CI_SCRIPTS/params/l2normalization.csv b/CI_SCRIPTS/params/l2normalization.csv new file mode 100644 index 00000000..7e6d2e58 --- /dev/null +++ b/CI_SCRIPTS/params/l2normalization.csv @@ -0,0 +1,2 @@ +#ic ih iw +1 1 128 diff --git a/CI_SCRIPTS/params/lenet_convolution.csv b/CI_SCRIPTS/params/lenet_convolution.csv new file mode 100644 index 00000000..3559a7d1 --- /dev/null +++ b/CI_SCRIPTS/params/lenet_convolution.csv @@ -0,0 +1,3 @@ +#in_n in_c in_h in_w f_n f_c f_h f_w group stride padding out_n out_c out_h out_w +1 1 32 32 6 1 5 5 1 1 0 1 6 28 28 +1 6 14 14 16 6 5 5 1 1 0 1 16 10 10 diff --git a/scripts/params/lenet_fully_connected.csv b/CI_SCRIPTS/params/lenet_fully_connected.csv similarity index 100% rename from scripts/params/lenet_fully_connected.csv rename to CI_SCRIPTS/params/lenet_fully_connected.csv diff --git a/scripts/params/mmm.csv b/CI_SCRIPTS/params/mmm.csv similarity index 100% rename from scripts/params/mmm.csv rename to CI_SCRIPTS/params/mmm.csv diff --git a/CI_SCRIPTS/params/mobilenetv1_depthwise_convolution.csv b/CI_SCRIPTS/params/mobilenetv1_depthwise_convolution.csv new file mode 100644 index 00000000..319d4cd6 --- /dev/null +++ b/CI_SCRIPTS/params/mobilenetv1_depthwise_convolution.csv @@ -0,0 +1,14 @@ +#in_n in_c in_h in_w f_n f_c f_h f_w group stride padding out_n out_c out_h out_w +1 32 112 112 64 32 3 3 1 1 1 1 64 112 112 +1 64 112 112 128 64 3 3 1 2 1 1 128 56 56 +1 128 56 56 128 128 3 3 1 1 1 1 128 56 56 +1 128 56 56 256 128 3 3 1 2 1 1 256 28 28 +1 256 28 28 256 256 3 3 1 1 1 1 256 28 28 +1 256 28 28 512 256 3 3 1 2 1 1 512 14 14 +1 512 14 14 512 512 3 3 1 1 1 1 512 14 14 +1 512 14 14 512 512 3 3 1 1 1 1 512 14 14 +1 512 14 14 512 512 3 3 1 1 1 1 512 14 14 +1 512 14 14 512 512 3 3 1 1 1 1 512 14 14 +1 512 14 14 512 512 3 3 1 1 1 1 512 14 14 +1 512 14 14 1024 512 3 3 1 2 1 1 1024 7 7 +1 1024 7 7 1024 1024 3 3 1 1 1 1 1024 7 7 diff --git a/CI_SCRIPTS/params/mobilenetv2_depthwise_convolution.csv b/CI_SCRIPTS/params/mobilenetv2_depthwise_convolution.csv new file mode 100644 index 00000000..0879ed77 --- /dev/null +++ b/CI_SCRIPTS/params/mobilenetv2_depthwise_convolution.csv @@ -0,0 +1,18 @@ +#in_n in_c in_h in_w f_n f_c f_h f_w group stride padding out_n out_c out_h out_w +1 32 112 112 32 32 3 3 1 1 1 1 16 112 112 +1 96 112 112 24 96 3 3 1 2 1 1 24 56 56 +1 144 56 56 24 144 3 3 1 1 1 1 24 56 56 +1 144 56 56 32 144 3 3 1 2 1 1 32 28 28 +1 192 28 28 32 192 3 3 1 1 1 1 32 28 28 +1 192 28 28 32 192 3 3 1 1 1 1 32 28 28 +1 192 28 28 64 192 3 3 1 1 1 1 64 28 28 +1 384 28 28 64 384 3 3 1 1 1 1 64 28 28 +1 384 28 28 64 384 3 3 1 1 1 1 64 28 28 +1 384 28 28 64 384 3 3 1 1 1 1 64 28 28 +1 384 28 28 96 384 3 3 1 2 1 1 96 14 14 +1 576 14 14 96 576 3 3 1 1 1 1 96 14 14 +1 576 14 14 96 576 3 3 1 1 1 1 96 14 14 +1 576 14 14 160 576 3 3 1 2 1 1 160 7 7 +1 960 7 7 160 960 3 3 1 1 1 1 160 7 7 +1 960 7 7 160 960 3 3 1 1 1 1 160 7 7 +1 960 7 7 320 960 3 3 1 1 1 1 320 7 7 diff --git a/CI_SCRIPTS/params/mobilenetv3_convolution.csv b/CI_SCRIPTS/params/mobilenetv3_convolution.csv new file mode 100644 index 00000000..7de16faa --- /dev/null +++ b/CI_SCRIPTS/params/mobilenetv3_convolution.csv @@ -0,0 +1,33 @@ +#in_n in_c in_h in_w f_n f_c f_h f_w group stride padding out_n out_c out_h out_w +1 3 224 224 16 3 3 3 1 2 0 1 16 112 112 +1 16 112 112 16 16 1 1 1 1 0 1 16 112 112 +1 16 112 112 16 16 1 1 1 1 0 1 16 112 112 +1 16 112 112 64 16 1 1 1 1 0 1 64 112 112 +1 
64 56 56 24 64 1 1 1 1 0 1 24 56 56 +1 24 56 56 72 24 1 1 1 1 0 1 72 56 56 +1 72 56 56 24 72 1 1 1 1 0 1 24 56 56 +1 24 56 56 72 24 1 1 1 1 0 1 72 56 56 +1 72 28 28 40 72 1 1 1 1 0 1 40 28 28 +1 40 28 28 120 40 1 1 1 1 0 1 120 28 28 +1 120 28 28 40 120 1 1 1 1 0 1 40 28 28 +1 40 28 28 120 40 1 1 1 1 0 1 120 28 28 +1 120 28 28 40 120 1 1 1 1 0 1 40 28 28 +1 40 28 28 240 40 1 1 1 1 0 1 240 28 28 +1 240 14 14 80 240 1 1 1 1 0 1 80 14 14 +1 80 14 14 200 80 1 1 1 1 0 1 200 14 14 +1 200 14 14 80 200 1 1 1 1 0 1 80 14 14 +1 80 14 14 184 80 1 1 1 1 0 1 184 14 14 +1 184 14 14 80 184 1 1 1 1 0 1 80 14 14 +1 80 14 14 184 80 1 1 1 1 0 1 184 14 14 +1 184 14 14 80 184 1 1 1 1 0 1 80 14 14 +1 80 14 14 480 80 1 1 1 1 0 1 480 14 14 +1 480 14 14 112 480 1 1 1 1 0 1 112 14 14 +1 112 14 14 672 112 1 1 1 1 0 1 672 14 14 +1 672 14 14 112 672 1 1 1 1 0 1 112 14 14 +1 112 14 14 672 112 1 1 1 1 0 1 672 14 14 +1 672 14 14 112 672 1 1 1 1 0 1 112 14 14 +1 112 14 14 672 112 1 1 1 1 0 1 672 14 14 +1 672 7 7 160 672 1 1 1 1 0 1 160 7 7 +1 160 7 7 960 160 1 1 1 1 0 1 960 7 7 +1 960 7 7 160 960 1 1 1 1 0 1 160 7 7 +1 160 7 7 960 160 1 1 1 1 0 1 960 7 7 diff --git a/CI_SCRIPTS/params/mobilenetv3_depthwise_convolution.csv b/CI_SCRIPTS/params/mobilenetv3_depthwise_convolution.csv new file mode 100644 index 00000000..3aa3ca67 --- /dev/null +++ b/CI_SCRIPTS/params/mobilenetv3_depthwise_convolution.csv @@ -0,0 +1,16 @@ +#in_n in_c in_h in_w f_n f_c f_h f_w group stride padding out_n out_c out_h out_w +1 16 112 112 16 16 3 3 1 1 1 1 16 112 112 +1 64 112 112 24 64 3 3 1 2 0 1 24 56 56 +1 72 56 56 24 72 3 3 1 1 1 1 24 56 56 +1 72 56 56 40 72 5 5 1 2 1 1 40 28 28 +1 120 28 28 40 120 5 5 1 1 2 1 40 28 28 +1 120 28 28 40 120 5 5 1 1 2 1 40 28 28 +1 240 28 28 80 240 3 3 1 2 0 1 80 14 14 +1 200 14 14 80 200 3 3 1 1 1 1 80 14 14 +1 184 14 14 80 184 3 3 1 1 1 1 80 14 14 +1 184 14 14 112 184 3 3 1 1 1 1 80 14 14 +1 480 14 14 112 480 3 3 1 1 1 1 112 14 14 +1 672 14 14 160 672 3 3 1 1 1 1 112 14 14 +1 672 14 14 160 672 5 5 1 1 2 1 160 14 14 +1 672 14 14 160 672 5 5 1 2 1 1 160 7 7 +1 960 7 7 160 960 5 5 1 1 2 1 160 7 7 diff --git a/scripts/params/mvm.csv b/CI_SCRIPTS/params/mvm.csv similarity index 100% rename from scripts/params/mvm.csv rename to CI_SCRIPTS/params/mvm.csv diff --git a/CI_SCRIPTS/params/non_max_suppression.csv b/CI_SCRIPTS/params/non_max_suppression.csv new file mode 100644 index 00000000..278a944d --- /dev/null +++ b/CI_SCRIPTS/params/non_max_suppression.csv @@ -0,0 +1,2 @@ +#in0 ic0 ilens0 in1 ic1 ilens1 oh ow max_output_boxes_per_class iou_threshold score_threshold +1 6 4 1 2 6 7 3 3 0.5 0 diff --git a/CI_SCRIPTS/params/normalization.csv b/CI_SCRIPTS/params/normalization.csv new file mode 100644 index 00000000..4ac9d079 --- /dev/null +++ b/CI_SCRIPTS/params/normalization.csv @@ -0,0 +1,3 @@ +#alpha beta in ic ih iw +2 2 1 64 24 24 +4 4 1 8 100 100 diff --git a/CI_SCRIPTS/params/padding.csv b/CI_SCRIPTS/params/padding.csv new file mode 100644 index 00000000..7c8b6240 --- /dev/null +++ b/CI_SCRIPTS/params/padding.csv @@ -0,0 +1,5 @@ +#in ic ih iw bn bc bh bw an ac ah aw mode +1 4 32 32 0 0 1 1 0 0 1 1 0 +1 4 32 32 0 0 2 2 0 0 2 2 0 +1 4 32 32 0 0 3 3 0 0 3 3 0 +1 4 32 32 0 0 4 4 0 0 4 4 0 diff --git a/scripts/params/pipeline.csv b/CI_SCRIPTS/params/pipeline.csv similarity index 100% rename from scripts/params/pipeline.csv rename to CI_SCRIPTS/params/pipeline.csv diff --git a/scripts/params/pooling.csv b/CI_SCRIPTS/params/pooling.csv similarity index 100% rename from scripts/params/pooling.csv rename to 
CI_SCRIPTS/params/pooling.csv diff --git a/CI_SCRIPTS/params/pooling_bp.csv b/CI_SCRIPTS/params/pooling_bp.csv new file mode 100644 index 00000000..6388af49 --- /dev/null +++ b/CI_SCRIPTS/params/pooling_bp.csv @@ -0,0 +1,2 @@ +#in ic ih iw fn fc fh fw stride padding on oc oh ow +1 16 3 3 1 1 2 2 2 1 1 16 4 4 diff --git a/CI_SCRIPTS/params/power.csv b/CI_SCRIPTS/params/power.csv new file mode 100644 index 00000000..f8ba774c --- /dev/null +++ b/CI_SCRIPTS/params/power.csv @@ -0,0 +1,3 @@ +#len alpha beta power +1000 1.1 0.2 1 +999 -0.2 -0.1 1 diff --git a/CI_SCRIPTS/params/prelu.csv b/CI_SCRIPTS/params/prelu.csv new file mode 100644 index 00000000..7886e1d9 --- /dev/null +++ b/CI_SCRIPTS/params/prelu.csv @@ -0,0 +1,2 @@ +#in ic ih iw +1 16 8 8 diff --git a/CI_SCRIPTS/params/priorbox.csv b/CI_SCRIPTS/params/priorbox.csv new file mode 100644 index 00000000..811ae33d --- /dev/null +++ b/CI_SCRIPTS/params/priorbox.csv @@ -0,0 +1,5 @@ +#in0 ic0 ih0 iw0 in1 ic1 ih1 iw1 min_size max_size flip clip step on oc olens ar1 [ar2] min_size1 max_size1 +1 1 38 38 1 1 300 300 64.0 90.0 1 0 8.0 1 2 23104 2.0 +1 1 38 38 1 1 300 300 64.0 90.0 1 0 8.0 1 2 34656 2.0 3.0 +1 1 38 38 1 1 300 300 64.0 90.0 1 0 8.0 1 2 46208 2.0 32.0 58.0 +1 1 38 38 1 1 300 300 64.0 90.0 1 0 8.0 1 2 69312 2.0 3.0 32.0 58.0 diff --git a/CI_SCRIPTS/params/reduction.csv b/CI_SCRIPTS/params/reduction.csv new file mode 100644 index 00000000..391ab931 --- /dev/null +++ b/CI_SCRIPTS/params/reduction.csv @@ -0,0 +1,6 @@ +#in ic ih iw axesNum axeses +1 64 24 24 1 -1 +1 8 100 100 1 1 +1 8 100 100 1 2 +1 8 100 100 1 3 +1 8 100 100 2 2 3 diff --git a/scripts/params/reshape.csv b/CI_SCRIPTS/params/reshape.csv similarity index 100% rename from scripts/params/reshape.csv rename to CI_SCRIPTS/params/reshape.csv diff --git a/CI_SCRIPTS/params/resnet50_convolution.csv b/CI_SCRIPTS/params/resnet50_convolution.csv new file mode 100644 index 00000000..710ce719 --- /dev/null +++ b/CI_SCRIPTS/params/resnet50_convolution.csv @@ -0,0 +1,54 @@ +#in_n in_c in_h in_w f_n f_c f_h f_w group stride padding out_n out_c out_h out_w +1 3 224 224 64 3 7 7 1 2 3 1 64 112 112 +1 64 56 56 256 64 1 1 1 1 0 1 256 56 56 +1 64 56 56 64 64 1 1 1 1 0 1 64 56 56 +1 64 56 56 64 64 3 3 1 1 1 1 64 56 56 +1 64 56 56 256 64 1 1 1 1 0 1 256 56 56 +1 256 56 56 64 256 1 1 1 1 0 1 64 56 56 +1 64 56 56 64 64 3 3 1 1 1 1 64 56 56 +1 64 56 56 256 64 1 1 1 1 0 1 256 56 56 +1 256 56 56 64 256 1 1 1 1 0 1 64 56 56 +1 64 56 56 64 64 3 3 1 1 1 1 64 56 56 +1 64 56 56 256 64 1 1 1 1 0 1 256 56 56 +1 256 56 56 512 256 1 1 1 2 0 1 512 28 28 +1 256 56 56 128 256 1 1 1 2 0 1 128 28 28 +1 128 28 28 128 128 3 3 1 1 1 1 128 28 28 +1 128 28 28 512 128 1 1 1 1 0 1 512 28 28 +1 512 28 28 128 512 1 1 1 1 0 1 128 28 28 +1 128 28 28 128 128 3 3 1 1 1 1 128 28 28 +1 128 28 28 512 128 1 1 1 1 0 1 512 28 28 +1 512 28 28 128 512 1 1 1 1 0 1 128 28 28 +1 128 28 28 128 128 3 3 1 1 1 1 128 28 28 +1 128 28 28 512 128 1 1 1 1 0 1 512 28 28 +1 512 28 28 128 512 1 1 1 1 0 1 128 28 28 +1 128 28 28 128 128 3 3 1 1 1 1 128 28 28 +1 128 28 28 512 128 1 1 1 1 0 1 512 28 28 +1 512 28 28 1024 512 1 1 1 2 0 1 1024 14 14 +1 512 28 28 256 512 1 1 1 2 0 1 256 14 14 +1 256 14 14 256 256 3 3 1 1 1 1 256 14 14 +1 256 14 14 1024 256 1 1 1 1 0 1 1024 14 14 +1 1024 14 14 256 1024 1 1 1 1 0 1 256 14 14 +1 256 14 14 256 256 3 3 1 1 1 1 256 14 14 +1 256 14 14 1024 256 1 1 1 1 0 1 1024 14 14 +1 1024 14 14 256 1024 1 1 1 1 0 1 256 14 14 +1 256 14 14 256 256 3 3 1 1 1 1 256 14 14 +1 256 14 14 1024 256 1 1 1 1 0 1 1024 14 14 +1 1024 14 14 256 
1024 1 1 1 1 0 1 256 14 14 +1 256 14 14 256 256 3 3 1 1 1 1 256 14 14 +1 256 14 14 1024 256 1 1 1 1 0 1 1024 14 14 +1 1024 14 14 256 1024 1 1 1 1 0 1 256 14 14 +1 256 14 14 256 256 3 3 1 1 1 1 256 14 14 +1 256 14 14 1024 256 1 1 1 1 0 1 1024 14 14 +1 1024 14 14 256 1024 1 1 1 1 0 1 256 14 14 +1 256 14 14 256 256 3 3 1 1 1 1 256 14 14 +1 256 14 14 1024 256 1 1 1 1 0 1 1024 14 14 +1 1024 14 14 2048 1024 1 1 1 2 0 1 2048 7 7 +1 1024 14 14 512 1024 1 1 1 2 0 1 512 7 7 +1 512 7 7 512 512 3 3 1 1 1 1 512 7 7 +1 512 7 7 2048 512 1 1 1 1 0 1 2048 7 7 +1 2048 7 7 512 2048 1 1 1 1 0 1 512 7 7 +1 512 7 7 512 512 3 3 1 1 1 1 512 7 7 +1 512 7 7 2048 512 1 1 1 1 0 1 2048 7 7 +1 2048 7 7 512 2048 1 1 1 1 0 1 512 7 7 +1 512 7 7 512 512 3 3 1 1 1 1 512 7 7 +1 512 7 7 2048 512 1 1 1 1 0 1 2048 7 7 diff --git a/scripts/params/lstm.csv b/CI_SCRIPTS/params/rnn.csv similarity index 100% rename from scripts/params/lstm.csv rename to CI_SCRIPTS/params/rnn.csv diff --git a/CI_SCRIPTS/params/roialign.csv b/CI_SCRIPTS/params/roialign.csv new file mode 100644 index 00000000..a22b8664 --- /dev/null +++ b/CI_SCRIPTS/params/roialign.csv @@ -0,0 +1,2 @@ +#in0 ic0 ih0 iw0 ih1 iw1 ilens2 on0 oc0 oh0 ow0 output_h output_w sampling_ratio spatial_scale +1 1 10 10 1 4 1 1 1 5 5 5 5 2 1 diff --git a/scripts/params/scale.csv b/CI_SCRIPTS/params/scale.csv similarity index 100% rename from scripts/params/scale.csv rename to CI_SCRIPTS/params/scale.csv diff --git a/scripts/params/slice.csv b/CI_SCRIPTS/params/slice.csv similarity index 100% rename from scripts/params/slice.csv rename to CI_SCRIPTS/params/slice.csv diff --git a/scripts/params/softmax.csv b/CI_SCRIPTS/params/softmax.csv similarity index 100% rename from scripts/params/softmax.csv rename to CI_SCRIPTS/params/softmax.csv diff --git a/scripts/params/split.csv b/CI_SCRIPTS/params/split.csv similarity index 100% rename from scripts/params/split.csv rename to CI_SCRIPTS/params/split.csv diff --git a/CI_SCRIPTS/params/tile.csv b/CI_SCRIPTS/params/tile.csv new file mode 100644 index 00000000..13f2c4e6 --- /dev/null +++ b/CI_SCRIPTS/params/tile.csv @@ -0,0 +1,3 @@ +#in ic ih iw axis tile +1 16 16 16 0 4 +1 64 16 16 3 4 diff --git a/scripts/params/transpose.csv b/CI_SCRIPTS/params/transpose.csv similarity index 100% rename from scripts/params/transpose.csv rename to CI_SCRIPTS/params/transpose.csv diff --git a/CI_SCRIPTS/parseAndExeCommands.sh b/CI_SCRIPTS/parseAndExeCommands.sh new file mode 100644 index 00000000..80c2c97e --- /dev/null +++ b/CI_SCRIPTS/parseAndExeCommands.sh @@ -0,0 +1,430 @@ +#!/bin/bash + +declare CONVERTER + +declare BOLT_SUFFIX + +declare TASKSET_STR + +declare EXECUTOR="classification" + +declare CI_PATH="/data/local/tmp/CI" + +declare ARCH="arm" + +declare MODEL_TOOLS_EXE_PATH=${CI_PATH} + +declare ENGINE_EXE_PATH=${CI_PATH} + +declare BOLT_LIB_PATH=${CI_PATH} + +declare CAFFE_MODEL_ZOO_PATH="${CI_PATH}/model_zoo/caffe_models/" + +declare ONNX_MODEL_ZOO_PATH="${CI_PATH}/model_zoo/onnx_models/" + +declare TFLITE_MODEL_ZOO_PATH="${CI_PATH}/model_zoo/tflite_models/" + +declare DYNAMIC_MODEL_PATH_PREFIX + +declare PHONE_SPECIFICATION + +declare TESTING_DATA_PREFIX="${CI_PATH}/testing_data/" + +BOLT_DIR=$(dirname $(readlink -f "$0"))/.. 
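+
+# A sketch of the combination-line format consumed below. Each line of
+# ./final_combinations.txt (presumably produced by genCommandLines.sh) is split
+# on "-"; the field layout here is only inferred from how strs_arr is indexed
+# later in this script, and unreferenced fields are marked "?":
+#   0:model_name 1:arch 2:framework 3:? 4:compiler 5:device_id 6:core
+#   7:precision 8:device 9:? 10:testing_data_dir 11:? 12:executor_params
+#   13:remove_op_num (onnx only)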
+
+function converter_selection()
+{
+    CONVERTER="X2bolt"
+    if [ "$1" == "caffe" ]
+    then
+        DYNAMIC_MODEL_PATH_PREFIX=$CAFFE_MODEL_ZOO_PATH
+        return
+    fi
+
+    if [ "$1" == "onnx" ]
+    then
+        DYNAMIC_MODEL_PATH_PREFIX=$ONNX_MODEL_ZOO_PATH
+        return
+    fi
+
+    if [ "$1" == "tflite" ]
+    then
+        DYNAMIC_MODEL_PATH_PREFIX=$TFLITE_MODEL_ZOO_PATH
+        return
+    fi
+    echo "[ERROR] unsupported model framework $1"
+    exit 1
+}
+
+function acc_selection()
+{
+    if [ "$1" == "fp32" ]
+    then
+        BOLT_SUFFIX="_f32.bolt"
+        return
+    fi
+
+    if [ "$1" == "fp16" ]
+    then
+        BOLT_SUFFIX="_f16.bolt"
+        return
+    fi
+
+    if [ "$1" == "int8" ]
+    then
+        BOLT_SUFFIX="_int8_q.bolt"
+        return
+    fi
+
+    echo "[ERROR] unsupported model precision $1"
+    exit 1
+}
+
+function core_selection()
+{
+    if [ "$1" == "A55" ]
+    then
+        TASKSET_STR="CPU_AFFINITY_LOW_POWER"
+        return
+    fi
+
+    if [ "$1" == "A76" ]
+    then
+        TASKSET_STR="CPU_AFFINITY_HIGH_PERFORMANCE"
+        return
+    fi
+
+    if [ "$1" == "x86_HOST" ]
+    then
+        TASKSET_STR="CPU_AFFINITY_HIGH_PERFORMANCE"
+        return
+    fi
+
+    echo "[ERROR] unsupported CPU core setting $1"
+    exit 1
+}
+
+function arch_selection()
+{
+    if [ "$1" == "arm" ]
+    then
+        return
+    fi
+
+    if [ "$1" == "x86" ]
+    then
+        ARCH="x86"
+        MODEL_TOOLS_EXE_PATH=${BOLT_DIR}
+        ENGINE_EXE_PATH=${BOLT_DIR}
+        CAFFE_MODEL_ZOO_PATH="/data/bolt/model_zoo/caffe_models/"
+        ONNX_MODEL_ZOO_PATH="/data/bolt/model_zoo/onnx_models/"
+        TFLITE_MODEL_ZOO_PATH="/data/bolt/model_zoo/tflite_models/"
+        TESTING_DATA_PREFIX="/data/bolt/testing_data/"
+        return
+    fi
+
+    echo "[ERROR] unsupported architecture $1"
+    exit 1
+}
+
+function device_selection()
+{
+    if [ "$1" == "cpu" ]
+    then
+        return
+    fi
+
+    if [ "$1" == "gpu" ]
+    then
+        TASKSET_STR="GPU"
+        return
+    fi
+
+    echo "[ERROR] unsupported device $1"
+    exit 1
+}
+
+# device id to phone specification
+function deviceId_to_phoneSpecification()
+{
+    if [ "$1" == "E5B0119506000260" ]
+    then
+        PHONE_SPECIFICATION="810"
+        return
+    fi
+
+    if [ "$1" == "GCL5T19822000030" ]
+    then
+        PHONE_SPECIFICATION="990"
+        return
+    fi
+
+    if [ "$1" == "x86_HOST" ]
+    then
+        return
+    fi
+
+    echo "[ERROR] unknown mobile phone device id $1"
+    exit 1
+}
+
+combinations=()
+commands=()
+while read line; do
+    combinations[${#combinations[*]}]=`echo ${line}`
+done < ./final_combinations.txt
+
+for((k=0;k<${#combinations[@]};k++)){
+    line=${combinations[k]}
+    strs_arr=()
+    index=0
+    for i in $(echo $line| tr "-" "\n")
+    do
+        strs_arr[$index]=$i;
+        let index+=1
+    done
+
+    arch_selection ${strs_arr[1]}
+
+    DL_FRAMEWORK=${strs_arr[2]}
+    converter_selection $DL_FRAMEWORK
+
+    core_selection ${strs_arr[6]}
+
+    acc_selection ${strs_arr[7]}
+
+    device_selection ${strs_arr[8]}
+
+    # define model converter parameters
+    MODEL_NAME=${strs_arr[0]}
+
+    EXECUTOR="classification"
+    if [[ "$MODEL_NAME" == "tinybert" || "$MODEL_NAME" == "tinybert384" ]]
+    then
+        EXECUTOR="tinybert"
+    fi
+    if [ "$MODEL_NAME" == "tinybert_onnx" ]
+    then
+        EXECUTOR="tinybert_onnx"
+    fi
+    if [ "$MODEL_NAME" == "nmt" ]
+    then
+        EXECUTOR="nmt"
+    fi
+    if [ "$MODEL_NAME" == "asr_rnnt" ]
+    then
+        EXECUTOR="asr_rnnt"
+    fi
+    if [[ "$MODEL_NAME" == "asr_convolution_transformer_encoder" || "$MODEL_NAME" == "asr_convolution_transformer_prediction_net"
+        || "$MODEL_NAME" == "asr_convolution_transformer_joint_net" ]]
+    then
+        EXECUTOR="asr_convolution_transformer"
+    fi
+    if [[ "$MODEL_NAME" == "tts_encoder_decoder" || "$MODEL_NAME" == "tts_postnet"
+        || "$MODEL_NAME" == "tts_melgan_vocoder" ]]
+    then
+        EXECUTOR="tts"
+    fi
+    if [ "$MODEL_NAME" == "vad" ]
+    then
+        EXECUTOR="vad"
+    fi
+
+    REMOVE_OP_NUM=0
+    if [ "$DL_FRAMEWORK" == "onnx" ]
+    then
+        REMOVE_OP_NUM=${strs_arr[13]}
+    fi
+
+    COMPILER=${strs_arr[4]}
+    TESTING_DATA_PATH=$TESTING_DATA_PREFIX${strs_arr[10]}
+    ORIGINAL_PARAM=${strs_arr[12]}
+    MODEL_PATH=$DYNAMIC_MODEL_PATH_PREFIX$MODEL_NAME"/"
+    EXECUTE_PARAM=
+    BOLT_MODEL_PATH=$MODEL_PATH$MODEL_NAME$BOLT_SUFFIX
+    for i in $(echo $ORIGINAL_PARAM| tr "+" "\n")
+    do
+        j=${i/@/-}
+        EXECUTE_PARAM=$EXECUTE_PARAM" ""$j"
+    done
+
+    if [ "$ARCH" == "arm" ]
+    then
+        mt_command_line=${MODEL_TOOLS_EXE_PATH}/${ARCH}_${COMPILER}"/bin/"$CONVERTER" -d "$MODEL_PATH" -m "$MODEL_NAME
+        engine_command_line=${ENGINE_EXE_PATH}/${ARCH}_${COMPILER}"/bin/"$EXECUTOR" ""-m "$BOLT_MODEL_PATH" ""-i "$TESTING_DATA_PATH" "$EXECUTE_PARAM" ""-a "$TASKSET_STR
+        if [ "$MODEL_NAME" == "vad" ]
+        then
+            engine_command_line=${ENGINE_EXE_PATH}/${ARCH}_${COMPILER}"/bin/"$EXECUTOR" ""-m "$BOLT_MODEL_PATH" "$EXECUTE_PARAM" ""-a "$TASKSET_STR
+        fi
+    fi
+    if [ "$ARCH" == "x86" ]
+    then
+        mt_command_line=${MODEL_TOOLS_EXE_PATH}/"install_"${ARCH}_${COMPILER}"/tools/"$CONVERTER" -d "$MODEL_PATH" -m "$MODEL_NAME
+        engine_command_line=${ENGINE_EXE_PATH}/"install_"${ARCH}_${COMPILER}"/examples/"$EXECUTOR" ""-m "$BOLT_MODEL_PATH" ""-i "$TESTING_DATA_PATH" "$EXECUTE_PARAM" "
+    fi
+
+    if [ ${strs_arr[7]} == "fp32" ]
+    then
+        mt_command_line=$mt_command_line" -i FP32"
+    fi
+    if [ ${strs_arr[7]} == "fp16" ]
+    then
+        mt_command_line=$mt_command_line" -i FP16"
+    fi
+    if [ ${strs_arr[7]} == "int8" ]
+    then
+        mt_command_line=$mt_command_line" -i PTQ && export LD_LIBRARY_PATH=${BOLT_LIB_PATH}/${ARCH}_${COMPILER}/lib && "${MODEL_TOOLS_EXE_PATH}/${ARCH}_${COMPILER}"/bin/post_training_quantization -p "$MODEL_PATH$MODEL_NAME"_ptq_input.bolt"
+    fi
+
+    if [[ "$DL_FRAMEWORK" == "onnx" && $REMOVE_OP_NUM -gt 0 ]]
+    then
+        mt_command_line=$mt_command_line" -r "$REMOVE_OP_NUM
+    fi
+    # skip engine run section
+    if [[ "$MODEL_NAME" == "tinybert_disambiguate" || "$MODEL_NAME" == "nmt_tsc_encoder" || "$MODEL_NAME" == "nmt_tsc_decoder" || "$MODEL_NAME" == "ghostnet" ]]
+    then
+        engine_command_line="echo 'avg_time:0ms/sequence'"
+    fi
+    if [ "$ARCH" == "arm" ]
+    then
+        mt_command_line="export LD_LIBRARY_PATH=${BOLT_LIB_PATH}/${ARCH}_${COMPILER}/lib && "$mt_command_line
+        engine_command_line="export LD_LIBRARY_PATH=${BOLT_LIB_PATH}/${ARCH}_${COMPILER}/lib && "$engine_command_line
+
+        ADB_COMMAND_PREFIX="adb -s ${strs_arr[5]} shell"
+        adb_command_line="${ADB_COMMAND_PREFIX} \"${mt_command_line} > ${CI_PATH}/mt_result.txt && ${engine_command_line} > ${CI_PATH}/engine_result.txt\""
+        adb_pull_result_line="adb -s ${strs_arr[5]} pull ${CI_PATH}/mt_result.txt . && adb -s ${strs_arr[5]} pull ${CI_PATH}/engine_result.txt ."
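+        # For reference, a sketch of what one composed command pair can look like
+        # at this point, assuming compiler=llvm, core=A55 and a caffe
+        # classification model on the Kirin 810 device (model and data names are
+        # hypothetical placeholders; everything else follows the variables set above):
+        #   adb -s E5B0119506000260 shell "export LD_LIBRARY_PATH=/data/local/tmp/CI/arm_llvm/lib && \
+        #       /data/local/tmp/CI/arm_llvm/bin/X2bolt -d <model_dir> -m <model_name> -i FP16 > /data/local/tmp/CI/mt_result.txt && \
+        #       /data/local/tmp/CI/arm_llvm/bin/classification -m <model_dir>/<model_name>_f16.bolt -i /data/local/tmp/CI/testing_data/<data_dir> -a CPU_AFFINITY_LOW_POWER > /data/local/tmp/CI/engine_result.txt"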
+        commands[${#commands[*]}]=`echo "${adb_command_line} && ${adb_pull_result_line}"`
+    fi
+    if [ "$ARCH" == "x86" ]
+    then
+        commands[${#commands[*]}]=`echo "${mt_command_line} > ./mt_result.txt && ${engine_command_line} > ./engine_result.txt"`
+    fi
+}
+
+rm -rf ./report.csv
+
+for((k=0;k<${#commands[@]};k++)){
+    line=${commands[k]}
+    echo "Running_Beginning =====> $line"
+    eval $line || exit 1
+
+    MT_RUN_RESULT="MT_RUN_UNKNOWN"
+
+    ENGINE_RUN_RESULT="ENGINE_RUN_UNKNOWN"
+
+    TOP_ONE_ACC=
+    TOP_FIVE_ACC=
+    MAX_TIME_RESULT=
+    MIN_TIME_RESULT=
+    AVG_TIME_RESULT=
+    MESSAGE="ERROR"
+
+    if cat ./mt_result.txt | grep "$MESSAGE" > /dev/null
+    then
+        MT_RUN_RESULT="MT_RUN_FAIL"
+        echo "Model conversion failed"
+        exit 1
+    else
+        MT_RUN_RESULT="MT_RUN_PASS"
+    fi
+
+    if cat ./engine_result.txt | grep "$MESSAGE" > /dev/null
+    then
+        ENGINE_RUN_RESULT="ENGINE_RUN_FAIL"
+        TOP_ONE_ACC="ERROR"
+        TOP_FIVE_ACC="ERROR"
+        MAX_TIME_RESULT="ERROR"
+        MIN_TIME_RESULT="ERROR"
+        AVG_TIME_RESULT="ERROR"
+        echo "Error during inference"
+        exit 1
+    else
+        ENGINE_RUN_RESULT="ENGINE_RUN_PASS"
+        TOP_ONE_ACC=$(grep -I "top1" ./engine_result.txt)
+        TOP_FIVE_ACC=$(grep -I "top5" ./engine_result.txt)
+        MAX_TIME_RESULT=$(grep -I "max_time" ./engine_result.txt)
+        MIN_TIME_RESULT=$(grep -I "min_time" ./engine_result.txt)
+        AVG_TIME_RESULT=$(grep -I "avg_time:" ./engine_result.txt)
+    fi
+
+    if [[ ${#AVG_TIME_RESULT} -lt 1 ]]
+    then
+        echo "Undetected error during inference"
+        exit 1
+    fi
+
+    line=${combinations[k]}
+    final_arr=()
+    index=0
+    for i in $(echo $line| tr "-" "\n")
+    do
+        final_arr[$index]=$i;
+        let index+=1
+    done
+
+    result_line=""
+
+    report_index=0
+    deviceId_to_phoneSpecification ${final_arr[5]}
+    final_arr[5]=$PHONE_SPECIFICATION
+    final_arr[12]=""
+    CUR_MODEL_NAME=${final_arr[0]}
+    for value in "${final_arr[@]}";
+    do
+        if [ $report_index == 11 ]
+        then
+            break
+        fi
+
+        if [ $report_index == 0 ]
+        then
+            result_line=$value
+        else
+            result_line=$result_line","$value
+        fi
+        let report_index+=1
+    done
+
+    # add segmentation fault check
+    SEGMENTATION_FAULT_CHECK=$(grep -I "Segmentation fault" ./mt_result.txt)
+    if [[ ${#SEGMENTATION_FAULT_CHECK} -gt 0 ]]
+    then
+        MT_RUN_RESULT="MT_SEGMENTATION_FAULT"
+        echo "Segmentation fault during model conversion"
+        exit 1
+    fi
+
+    SEGMENTATION_FAULT_CHECK=$(grep -I "Segmentation fault" ./engine_result.txt)
+    if [[ ${#SEGMENTATION_FAULT_CHECK} -gt 0 ]]
+    then
+        ENGINE_RUN_RESULT="ENGINE_SEGMENTATION_FAULT"
+        echo "Segmentation fault during inference"
+        exit 1
+    fi
+
+    COMPREHENSIVE_RESULT=$MAX_TIME_RESULT"+"$MIN_TIME_RESULT"+"$AVG_TIME_RESULT"+"$TOP_FIVE_ACC"+"$TOP_ONE_ACC
+
+    if [[ "$CUR_MODEL_NAME" == "tinybert" || "$CUR_MODEL_NAME" == "fingerprint_resnet18" || "$CUR_MODEL_NAME" == "nmt"
+        || "$CUR_MODEL_NAME" == "asr_convolution_transformer_encoder" || "$CUR_MODEL_NAME" == "asr_convolution_transformer_prediction_net"
+        || "$CUR_MODEL_NAME" == "asr_convolution_transformer_joint_net" || "$CUR_MODEL_NAME" == "asr_rnnt" || "$CUR_MODEL_NAME" == "vad"
+        || "$CUR_MODEL_NAME" == "tts_encoder_decoder" || "$CUR_MODEL_NAME" == "tts_postnet"
+        || "$CUR_MODEL_NAME" == "tts_melgan_vocoder" ]]
+    then
+        result_line=$result_line","$MT_RUN_RESULT","$ENGINE_RUN_RESULT","$AVG_TIME_RESULT","
+    else
+        result_line=$result_line","$MT_RUN_RESULT","$ENGINE_RUN_RESULT","$MAX_TIME_RESULT","$MIN_TIME_RESULT","$AVG_TIME_RESULT","$TOP_FIVE_ACC","$TOP_ONE_ACC","
+    fi
+    rm -rf ./mt_result.txt
+    rm -rf ./engine_result.txt
+
+    echo "Running_Result =====> $result_line"
+
+    echo $result_line >> ./report.csv
+    echo " "
>> ./report.csv + echo " " + echo " " +} + +cat ./report.csv diff --git a/CI_SCRIPTS/transExecutors.sh b/CI_SCRIPTS/transExecutors.sh new file mode 100644 index 00000000..86d56d7f --- /dev/null +++ b/CI_SCRIPTS/transExecutors.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +script_name=$0 +script_abs=$(readlink -f "$0") +script_dir=$(dirname $script_abs) +bolt_dir=${script_dir}/.. +compiler=$1 +device_dir=/data/local/tmp/CI/${compiler} + +echo "[INFO] compiler ${compiler}" + +upload_program() { + host_dir=$1 + device=$2 + device_dir=$3 + + adb -s ${device} shell "rm -rf ${device_dir}" + adb -s ${device} shell "mkdir ${device_dir}" + adb -s ${device} shell "mkdir ${device_dir}/bin ${device_dir}/lib" + for file in `ls ${host_dir}/examples/*` + do + adb -s ${device} push ${file} ${device_dir}/bin > /dev/null || exit 1 + done + for file in `ls ${host_dir}/lib/*.so` + do + adb -s ${device} push ${file} ${device_dir}/lib > /dev/null || exit 1 + done + adb -s ${device} push ${host_dir}/tools/X2bolt ${device_dir}/bin > /dev/null || exit 1 + adb -s ${device} push ${host_dir}/tools/post_training_quantization ${device_dir}/bin > /dev/null || exit 1 + if [[ "${compiler}" == "arm_llvm" ]] || [[ "${compiler}" == "arm_ndkv7" ]]; then + bash ${script_dir}/../scripts/push_third_party.sh -l ${script_dir}/../third_party/${compiler} -d ${device} -p ${device_dir}/lib -c ${compiler} || exit 1 + fi +} + +# Kirin 810 +upload_program ${bolt_dir}/install_${compiler} E5B0119506000260 ${device_dir} + +# Kirin 990 +upload_program ${bolt_dir}/install_${compiler} GCL5T19822000030 ${device_dir} diff --git a/CMakeLists.txt b/CMakeLists.txt index f7aae059..5e01dd06 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,134 +1,84 @@ cmake_minimum_required(VERSION 3.2) -file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/bolt.cmake ${BOLT_ROOT}/bolt.cmake) +file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake) if (BOLT_CONFIGURE_FILE) include(${BOLT_CONFIGURE_FILE}) else (BOLT_CONFIGURE_FILE) message(FATAL_ERROR " -FATAL: can not find bolt.cmake in directory, +FATAL: can not find bolt.cmake in /common/cmakes directory, please set shell or cmake environment variable BOLT_ROOT. 
") endif (BOLT_CONFIGURE_FILE) -project(bolt C CXX) - +if (USE_IOS_CLANG) + set(CMAKE_SYSTEM_NAME Darwin) + set(CMAKE_SYSTEM_VERSION 1) + set(UNIX True) + set(APPLE True) + set(IOS True) +endif (USE_IOS_CLANG) + +project(cheetah C CXX) + +set_policy() +SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${BOLT_ROOT}/common/cmakes") +if (USE_CAFFE OR USE_ONNX OR USE_FLOW) + find_package(Protobuf) +endif() +if (USE_TFLITE) + find_package(TFLite) +endif (USE_TFLITE) +if (USE_TENSORFLOW) + find_package(jsoncpp) +endif (USE_TENSORFLOW) if (USE_MALI) - add_subdirectory(gcl/tools/kernel_lib_compile) -endif (USE_MALI) -add_subdirectory(blas-enhance) -add_subdirectory(model-tools) -add_subdirectory(tensor_computing) -add_subdirectory(image) + find_package(Gcl) +endif(USE_MALI) +if (NOT USE_IOS_CLANG) + if (USE_LLVM_CLANG) + set(USE_JNI ON) + else() + find_package(JNI) + if (JNI_FOUND) + set(USE_JNI ON) + endif() + endif() +endif() + +add_subdirectory(common) +add_subdirectory(model_tools) +add_subdirectory(compute) add_subdirectory(inference) -add_subdirectory(tools) -add_subdirectory(kits) -add_subdirectory(tests) add_custom_target(bolt_library ALL - COMMAND ./scripts/build_light_bolt.sh ${CMAKE_BINARY_DIR} ${USE_MALI} ${USE_DEBUG} ${USE_LLVM_CLANG} ${CMAKE_CXX_COMPILER} ${CMAKE_AR} ${CMAKE_STRIP} + COMMAND ./scripts/build_light_bolt.sh ${CMAKE_CXX_COMPILER} ${CMAKE_AR} ${CMAKE_STRIP} ${CMAKE_BINARY_DIR} ${USE_MALI} ${USE_DEBUG} ${USE_LLVM_CLANG} ${USE_ANDROID_LOG} ${USE_IOS_CLANG} ${USE_OPENMP} WORKING_DIRECTORY $ENV{BOLT_ROOT}) +add_dependencies(bolt_library engine model_tools tensor image blas_enhance uni) +add_dependencies(bolt_library engine_static model_tools_static tensor_static image_static blas_enhance_static uni_static) -if (USE_MALI) - add_dependencies(inference kernelbin) - add_dependencies(inference_static kernelbin_static) - install(TARGETS kernelbin kernelbin_static - LIBRARY DESTINATION lib - ARCHIVE DESTINATION lib) - install(FILES ${CMAKE_BINARY_DIR}/libOpenCL.so - DESTINATION lib) -endif (USE_MALI) -add_dependencies(tensor_computing blas-enhance) -add_dependencies(tensor_computing_static blas-enhance_static) -add_dependencies(inference tensor_computing model-tools image) -add_dependencies(inference_static tensor_computing_static model-tools_static image_static) -add_dependencies(bolt_library inference) -add_dependencies(bolt_library inference_static) - -install(TARGETS blas-enhance blas-enhance_static - tensor_computing tensor_computing_static - model-tools model-tools_static - image image_static - inference inference_static - LIBRARY DESTINATION lib - ARCHIVE DESTINATION lib) - -if (USE_CAFFE) - add_dependencies(model-tools model-tools_caffe) - add_dependencies(model-tools_static model-tools_caffe_static) - install(TARGETS caffe2bolt - model-tools_caffe model-tools_caffe_static - RUNTIME DESTINATION tools - LIBRARY DESTINATION lib - ARCHIVE DESTINATION lib) -endif(USE_CAFFE) - -if (USE_ONNX) - add_dependencies(model-tools model-tools_onnx) - add_dependencies(model-tools_static model-tools_onnx_static) - install(TARGETS onnx2bolt - model-tools_onnx model-tools_onnx_static - RUNTIME DESTINATION tools - LIBRARY DESTINATION lib - ARCHIVE DESTINATION lib) -endif(USE_ONNX) - -if (USE_TFLITE) - add_dependencies(model-tools model-tools_tflite) - add_dependencies(model-tools_static model-tools_tflite_static) - install(TARGETS tflite2bolt - model-tools_tflite model-tools_tflite_static - RUNTIME DESTINATION tools - LIBRARY DESTINATION lib - ARCHIVE DESTINATION lib) -endif(USE_TFLITE) - 
-install(DIRECTORY model-tools/tools/tensorflow2caffe
-    model-tools/tools/pytorch2caffe
-    DESTINATION tools)
-
-if (USE_LIBRARY_TUNING)
-    install(TARGETS tensor_computing_library_search
-        RUNTIME DESTINATION tools)
-endif (USE_LIBRARY_TUNING)
-
-if (BUILD_TEST)
-    if (USE_INT8)
-        install(TARGETS ptq_calibration
-            RUNTIME DESTINATION tools)
-    endif(USE_INT8)
-    install(TARGETS classification
-        bert
-        tinybert
-        nmt
-        asr_rnnt
-        asr_convolution_transformer
-        tts
-        vad
-        RUNTIME DESTINATION kits)
-endif(BUILD_TEST)
-
-install(DIRECTORY inference/exports/java
-    inference/exports/c
-    DESTINATION include)
-
-install(FILES ${CMAKE_BINARY_DIR}/libBoltModel.so
-    ${CMAKE_BINARY_DIR}/libbolt.a
-    ${CMAKE_BINARY_DIR}/libbolt.so
+# install section
+install(FILES ${CMAKE_BINARY_DIR}/libbolt.a
+    DESTINATION lib)
+if (USE_IOS_CLANG)
+    install(FILES ${CMAKE_BINARY_DIR}/libbolt.dylib DESTINATION lib)
+else (USE_IOS_CLANG)
+    install(FILES ${CMAKE_BINARY_DIR}/libbolt.so
+        DESTINATION lib)
+    install(FILES ${CMAKE_BINARY_DIR}/libBoltModel.so
+        DESTINATION lib)
+endif (USE_IOS_CLANG)
execute_process(COMMAND doxygen .Doxyfile WORKING_DIRECTORY $ENV{BOLT_ROOT})
enable_testing()
-
find_program (BASH_PROGRAM bash)
-
-if (BASH_PROGRAM)
-    set(parameters -t $ENV{BOLT_ROOT}/tests/bin -k $ENV{BOLT_ROOT}/kits/bin -p /data/local/tmp/uldra)
+if (BASH_PROGRAM AND USE_GENERAL)
+    set(parameters -t ${CMAKE_INSTALL_PREFIX} -p /data/local/tmp/uldra)
    if (USE_MALI)
        set(parameters ${parameters} -g)
    endif(USE_MALI)
-    if (USE_DYNAMIC_LIBRARY)
-        set(parameters ${parameters} -l $ENV{BOLT_ROOT}/install_llvm/lib)
+        set(parameters ${parameters} -l ${CMAKE_INSTALL_PREFIX}/lib)
    endif(USE_DYNAMIC_LIBRARY)
-    add_test (NAME quick_benchmark COMMAND $ENV{BOLT_ROOT}/quick_benchmark.sh ${parameters})
-endif (BASH_PROGRAM)
+    add_test (NAME quick_benchmark COMMAND $ENV{BOLT_ROOT}/scripts/quick_benchmark.sh ${parameters})
+endif (BASH_PROGRAM AND USE_GENERAL)
diff --git a/README.md b/README.md
index db758b2a..c2ef47a5 100644
--- a/README.md
+++ b/README.md
@@ -2,151 +2,66 @@
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
-Bolt is a light-weight library for mobile devices. Bolt, as a universal deployment tool for all kinds of neural networks, aims to minimize the inference runtime as much as possible. Higher speed, better security and more efficient memory management are the advantages that Bolt strives to provide. Feel free to make good use of issue submission, or join our QQ chatroom (Chinese): 833345709.
+Bolt is a light-weight library for deep learning. Bolt, as a universal deployment tool for all kinds of neural networks, aims to minimize the inference runtime as much as possible. Higher speed, better security and more efficient memory management are the advantages that Bolt strives to provide. Bolt has been widely deployed and used in many departments of HUAWEI, such as 2012 Laboratory, CBG and HUAWEI Product Lines. Feel free to make good use of issue submission, or **join our QQ chatroom (Chinese): 833345709**.
-# Features
-
-- ### Overview
-
-  Bolt has almost supported all the ARM-A devices incude ARMv7/ARMv8/ARMv8.2/Mali-GPU. FP16/BNN for CPU and FP16 for GPU are highly optimized. Bolt also support FP32 on ARMv7/ARMv8/ARMv8.2 devices.
-
-  Bolt has its own format of model storage, which helps reduce the memory footprint by storing in FP16, INT8 and 1-bit representations when possible. We provide model converters for the following formats:
-
-  - caffe
-  - onnx
-  - tflite
-
-  For PyTorch and TensorFlow models, please try to convert them to the onnx or tflite format first. We also had some success in converting these models into customized caffe models.
-
-- ### Verified Networks
-
-  Bolt has shown its high performance in the inference of common CV and NLP neural networks. Some of the representative networks that we have verified are listed below. You can find detailed benchmark information in [docs/BENCHMARK.md](docs/BENCHMARK.md).
-
-  - Squeezenet
-  - Mobilenet v1, v2, v3
-  - Resnet50, [Ghostnet](https://github.com/huawei-noah/ghostnet) (plus FPN detection)
-  - Birealnet18 (BNN)
-  - SSD(Resnet)
-  - Bert, TinyBert, Albert
-  - Neural Machine Translation
-  - Automatic Speech Recognition
-  - Text To Speech
-  - For MALI GPU FP16 Support
-    - Squeezenet v1.1
-    - Mobilenet v1, v2, v3
-    - Ghostnet
-
-
-- ### Inference Graph Optimizers
-
-  Apart from the refined acceleration of convolutions and GeMM for the supported data precisions, Bolt has a easy use and powerful inference graph optimizer. As shown in [model-tools/include](model-tools/include), classic operator fusion is supported. Bolt is also equipped with a Memory Reuse Optmizer, which reassigns the space occupied by a feature map as soon as it is no longer needed as input or output. Most networks that we tested benefit from a two-third reduction in feature map storage.
-
-- ### Thread Affinity Setting
-
-  Users can specify the preferred policy (high-performance or low-power). Bolt will select the most suitable core and set the thread affinity.
-
-- ### Algorithm Tuning
-
-  Bolt can tailor-make the algorithm configuration for your specific target device.
-
-# Documentation
-
-- ### Installation
-
-Bolt provides [install.sh](install.sh) for fast installation. The major third-party dependency is protobuf, and some other may come from the original model format that you want to use. You may also need libjpeg for building [tests/classification](tests).
-
-After configuring [bolt.cmake](bolt.cmake), the compilation can be as simple as:
-
-```shell
-./install.sh -t 48 -c llvm
-```
+# Quick Start
+
+![](docs/images/QuickStart.PNG)
+
-For more details, please refer to [docs/INSTALL.md](docs/INSTALL.md)
+Generally, there are two steps to get started with bolt. It's quite easy for users to quickly run bolt.
+
-- ### User Guide
+1. Conversion: use **[X2bolt](../model_tools/tools/X2bolt/X2bolt.cpp)** to convert your model from caffe, onnx, tflite or tensorflow to .bolt;
+
-As a user, what you are normally concerned about include the following 4 parts:
+2. Inference: run **[benchmark](../inference/examples/benchmark/benchmark.cpp)** with .bolt and data to get the inference result.
-- API (We guarantee that the C API will not be changed in the future)
-- Model Preparation
-- Model Conversion
-- Model Inference
+
+   For more details about the usage of [**X2bolt**](model_tools/tools/X2bolt/X2bolt.cpp) and [**benchmark**](inference/examples/benchmark/benchmark.cpp) tools, see [docs/USER_HANDBOOK.md](docs/USER_HANDBOOK.md).
-For the details, please refer to [docs/USER_HANDBOOK.md](docs/USER_HANDBOOK.md)
+
-- ### Developer Guide
-
-  We welcome all kinds of contribution. Before that, let's get familiar with the project structure.
-
-- ##### Project Structure
-
-  - [uni](uni) hosts the common headers that are used in the project.
-  - [gcl](gcl) hosts the setup of MALI GPU environment.
-  - [image](image) hosts common preprocessing routines for image inputs (e.g. bilinear interpolation).
-  - [blas-enhance](blas-enhance) hosts the fast implementation of matrix-matrix multiplication and matrix-vector multiplication of FP32, FP16 and INT8. It is referenced by some of the operators in [tensor_computing](tensor_computing).
-  - [tensor_computing](tensor_computing) hosts the implementation for individual operators.
-  - [model-tools](model-tools) hosts everything related to model conversion and optimization.
-  - [inference](inference) hosts the inference engine of neural networks.
-  - Lastly, [tests](tests) include all the unit tests for the above functionalities.
-
-  To support your own network, you can first try to convert it with the provided tools. If an operator is missing, you can first add the conversion to [model-tools](model-tools). You may then implement the missing computation routine in [tensor_computing](tensor_computing). Please also define a class for your new operator in [inference](inference).
-
-- ##### Contribution
-
-All contributions are welcomed. For the details, please refer to [docs/DEVELOPER.md](docs/DEVELOPER.md)
-
-- ### Benchmark
-
-We provide a detailed benchmark report for your reference. For more testing information please refer to [docs/BENCHMARK.md](docs/BENCHMARK.md) .
-
-# Road Map
-
-#### v0.4.0
-
-Future Release 2020-09-01
-
-- Yolo support
-
-- TensorFlow model converter
-
-# Who are using Bolt
-
-- HUAWEI CBG
-
-- HUAWEI PORTFOLIO
-
-# FAQ
-
-1. Why configuring bolt.cmake does not take effect?
-
-   The [install.sh](install.sh) serves as an example of compilation setup, and it overwrites some settings in [bolt.cmake](bolt.cmake). Please check install.sh first.
+# Features
-2. More details about dependency libraries for cross-compilation?
+
+- ## Supported Frameworks
-   The major dependency is Protobuf. Protoc should be the x86 version but protbuf should be the ARM version.
+
+  caffe, onnx, tflite, tensorflow
+
+- ## Inference Precision
-3. Requirements on tensor dimensions?
+
+  Float32, Float16, Int8, 1-bit
+
+- ## Hardware
-   For optimal performance, Bolt requires the number of output channels to be divisible by 8. For compatibility, Bolt will try to pad the output channels of convolution layers to the nearest multiple of 8. You can turn on USE_DEBUG in [bolt.cmake](bolt.cmake) to check the actual dimensions.
+
+  ARM CPU(v7, v8, v8.2), Mali GPU, X86(AVX2)
+
+- ## Verified Networks
-4. Restrictions for BNN?
+
+  Bolt has shown its high performance in the inference of common CV and NLP neural networks. Some of the representative networks that we have verified are listed below. You can find detailed benchmark information in [docs/BENCHMARK.md](docs/BENCHMARK.md).
-   For BNN convolution layers, the number of output channels must be divisible by 32.
+
+  | Application   | Models                                                       |
+  | ------------- | ------------------------------------------------------------ |
+  | CV            | Squeezenet/Mobilenet_v1/Mobilenet_v2/Mobilenet_v3/Resnet50/[Ghostnet]()/SSD/Yolov3/Pointnet/...etc. |
+  | NLP           | Bert/[TinyBert]()/Albert/Neural Machine Translation/Text To Speech/Automatic Speech Recognition/...etc. |
+  | More DL Tasks | ... |
-5. Restrictions on quantization (int8)?
+
+  More models than those mentioned above are supported; users are encouraged to explore further.
-
-   For the time being, Bolt only supports post-training int8 quantization. The quantization method is symmetrical for both activation and weight. We have added a calibration tool for image CNN pipelines. Please feel free to report cases of usage failures.
+
+- ## More Advanced Features
-6. Requirements for fp16 and int8?
+
+  - Graph Optimization
+  - Thread Affinity
+  - Algorithm Tuning
+  - [Time-Series Data Acceleration](docs/USER_HANDBOOK.md#time-series-data-acceleration)
-
-   Only arm-v8.2 supports fp16 and int8 dotprod instructions.
+
+# Documentation
-7. Restrictions for MALI?
+
+Everything you want to know about bolt is recorded in the detailed documentation stored in [docs](docs).
-
-   Only llvm compilation supports MALI computing.
+
+- [How to install bolt with different compilers](docs/INSTALL.md).
+- [How to use bolt to run inference on your ML models.](docs/USER_HANDBOOK.md)
+- [How to develop bolt to support more customized models.](docs/DEVELOPER.md)
+- [Benchmark results on some common models.](docs/BENCHMARK.md)
+- [Frequently Asked Questions (FAQ)](docs/FAQ.md)
# Acknowledgement
-Bolt refers to the following projects: [caffe](https://github.com/BVLC/caffe), [onnx](https://github.com/onnx/onnx), [protobuf](https://github.com/protocolbuffers/protobuf), [flatbuffers](https://github.com/google/flatbuffers), [ncnn](https://github.com/Tencent/ncnn), [mnn](https://github.com/alibaba/MNN), [dabnn](https://github.com/JDAI-CV/dabnn).
+Bolt refers to the following projects: [caffe](https://github.com/BVLC/caffe), [onnx](https://github.com/onnx/onnx), [tensorflow](https://github.com/tensorflow/tensorflow), [ncnn](https://github.com/Tencent/ncnn), [mnn](https://github.com/alibaba/MNN), [dabnn](https://github.com/JDAI-CV/dabnn).
# License
diff --git a/THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.md b/THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.md
deleted file mode 100644
index 8a6531cf..00000000
--- a/THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.md
+++ /dev/null
@@ -1,71 +0,0 @@
-Please note we provide an open source software notice for the third party open source software along with this software and/or this software component contributed by Huawei (in the following just “this SOFTWARE”). The open source software licenses are granted by the respective right holders.
-
-
-
-Warranty Disclaimer
-
-THE OPEN SOURCE SOFTWARE IN THIS SOFTWARE IS DISTRIBUTED IN THE HOPE THAT IT WILL BE USEFUL, BUT WITHOUT ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. SEE THE APPLICABLE LICENSES FOR MORE DETAILS.
-
-
-
-Copyright Notice and License Texts
-
-Software: model-tools/src/caffe/caffe.proto ()
-
-Copyright notice:
-
-Copyright (c) 2014-2017 The Regents of the University of California(Regents)
-
-All right reserved.
-
-License: BSD 2-Clause License
-
-
-
-Copyright
-
-Software:model-tools/src/onnx/onnx.proto ()
-
-Copyright notice:
-
-Copyright (c) 2017 ONNX Project Contributors
-
-All rights reserved.
-
-License: MIT License
-
-
-
-Copyright
-
-Software:model-tools/cmakes/FindProtobuf.cmake ()
-
-Copyright (c) 2008 Google Inc.
-
-All rights reserved.
-
-License: BSD License
-
-
-
-Copyright
-
-Software:model-tools/src/tflite/schema_generated.h ()
-
-Copyright (c) 2019 The TensorFlow Authors.
-
-All rights reserved.
- -License: Apache 2.0 - - - -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files(the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - - - -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - - - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/blas-enhance/include/blas-enhance.h b/blas-enhance/include/blas-enhance.h deleted file mode 100644 index 5b8b5389..00000000 --- a/blas-enhance/include/blas-enhance.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#ifndef _H_BLAS_ENHANCE -#define _H_BLAS_ENHANCE - -#include "sys.h" -#include "tensor_desc.h" - -#ifdef __cplusplus -extern "C" { -#endif - - EE matrix_matrix_multiply_tmp_bytes(TensorDesc matrixADesc, TensorDesc matrixBDesc, U32* bytes, Arch arch); - - EE matrix_matrix_multiply(TensorDesc matrixADesc, const void* matrixA, - TensorDesc matrixBDesc, const void* matrixB, - U32 bytes, void* tmp, - TensorDesc matrixCDesc, void* matrixC, Arch arch); - - EE matrix_vector_multiply_tmp_bytes(TensorDesc matrixDesc, TensorDesc vectorDesc, U32* bytes, Arch); - - EE matrix_vector_multiply(TensorDesc matrixDesc, const void* matrix, - TensorDesc vectorDesc, const void* vector, - U32 bytes, void* tmp, - TensorDesc resultDesc, void* result, Arch arch); - - inline DataFormat targetFormat4MatrixB(DataType dt) - { - switch (dt) { - case DT_F16: { - return DF_NKN24; - } - case DT_F32: { -#ifdef __aarch64__ - return DF_NKN12; -#else - return DF_NKN8; -#endif - } - case DT_I8: { - return DF_NKN12K4; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - exit(1); - } - } - } - - EE matrix_matrix_multiply_transform_rhs(TensorDesc desc, const void* src, TensorDesc* descTran,void* dst); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/blas-enhance/src/CMakeLists.txt b/blas-enhance/src/CMakeLists.txt deleted file mode 100644 index af3c35c3..00000000 --- a/blas-enhance/src/CMakeLists.txt +++ /dev/null @@ -1,32 +0,0 @@ -if (USE_GENERAL) - file(GLOB general_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/general/*.cpp) -endif (USE_GENERAL) - -if (USE_NEON) - if (USE_FP16) - file(GLOB arm_fp16_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/fp16/*.cpp) - endif (USE_FP16) - if (USE_FP32) - file(GLOB arm_fp32_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/fp32/*.cpp) - endif (USE_FP32) - if (USE_INT8) - file(GLOB arm_int8_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/int8/*.cpp) - endif (USE_INT8) - file(GLOB arm_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/*.cpp) - set(arm_srcs "${arm_srcs};${arm_fp16_srcs};${arm_fp32_srcs};${arm_int8_srcs}") -endif (USE_NEON) - -file(GLOB srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) -set(srcs "${srcs};${general_srcs};${arm_srcs}") - -include_directories(${CMAKE_CURRENT_SOURCE_DIR}) - -# shared library -ADD_LIBRARY(${PROJECT_NAME} SHARED ${srcs}) - -# static library -ADD_LIBRARY(${PROJECT_NAME}_static STATIC ${srcs}) - -SET_TARGET_PROPERTIES(${PROJECT_NAME}_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}") -SET_TARGET_PROPERTIES(${PROJECT_NAME} PROPERTIES CLEAN_DIRECT_OUTPUT 1) -SET_TARGET_PROPERTIES(${PROJECT_NAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) diff --git a/blas-enhance/src/cpu/arm/fp16/mmm_A55.cpp b/blas-enhance/src/cpu/arm/fp16/mmm_A55.cpp deleted file mode 100644 index fe473339..00000000 --- a/blas-enhance/src/cpu/arm/fp16/mmm_A55.cpp +++ /dev/null @@ -1,790 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
- -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include -#include -#include - -#include "type.h" -#include "error.h" -#include "cpu/arm/fp16/mmm_common.h" -#include "cpu/arm/fp16/mmm.h" - - -inline void mmm_4x24_A55(U32 M, U32 K, F16* w, F16* in, F16* out) { - U32 KTail = K % 2; - U32 KInner = K - KTail; - asm volatile( - //init in0- > v1, w- > v0 - "ld1 {v1.4h}, [%1], #8\n" - "ldr x22, [%1], #8\n" - "ins v1.d[1], x22\n" - "ld1 {v0.4h}, [%2], #8\n" - "mov x26, %0\n" - "ld1 {v5.8h, v6.8h, v7.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v8.8h, v9.8h, v10.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.8h, v12.8h, v13.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v14.8h, v15.8h, v16.8h}, [x26]\n" - - "mov x20, %3\n" - - "0:\n" - //w- > v4, in0- > v2/v3/v1, out0=v5~v28 - "fmla v5.8h, v1.8h, v0.h[0]\n" - "ld1 {v2.4h}, [%1], #8\n" - "fmla v8.8h, v1.8h, v0.h[1]\n" - "ldr x23, [%1], #8\n" - "fmla v11.8h, v1.8h, v0.h[2]\n" - "ins v2.d[1], x23\n" - "fmla v14.8h, v1.8h, v0.h[3]\n" - "ld1 {v4.4h}, [%2], #8\n" - "fmla v6.8h, v2.8h, v0.h[0]\n" - "ld1 {v3.4h}, [%1], #8\n" - "fmla v9.8h, v2.8h, v0.h[1]\n" - "ldr x24, [%1], #8\n" - "fmla v12.8h, v2.8h, v0.h[2]\n" - "ins v3.d[1], x24\n" - "fmla v15.8h, v2.8h, v0.h[3]\n" - "fmla v7.8h, v3.8h, v0.h[0]\n" - "ld1 {v1.4h}, [%1], #8\n" - "fmla v10.8h, v3.8h, v0.h[1]\n" - "ldr x22, [%1], #8\n" - "fmla v13.8h, v3.8h, v0.h[2]\n" - "ins v1.d[1], x22\n" - "fmla v16.8h, v3.8h, v0.h[3]\n" - - //w- > v0, in0- > v2/v3/v1, out0- > v5~v28 - "fmla v5.8h, v1.8h, v4.h[0]\n" - "ld1 {v2.4h}, [%1], #8\n" - "fmla v8.8h, v1.8h, v4.h[1]\n" - "ldr x23, [%1], #8\n" - "fmla v11.8h, v1.8h, v4.h[2]\n" - "ins v2.d[1], x23\n" - "fmla v14.8h, v1.8h, v4.h[3]\n" - "ld1 {v0.4h}, [%2], #8\n" - "fmla v6.8h, v2.8h, v4.h[0]\n" - "ld1 {v3.4h}, [%1], #8\n" - "fmla v9.8h, v2.8h, v4.h[1]\n" - "ldr x24, [%1], #8\n" - "fmla v12.8h, v2.8h, v4.h[2]\n" - "ins v3.d[1], x24\n" - "fmla v15.8h, v2.8h, v4.h[3]\n" - "fmla v7.8h, v3.8h, v4.h[0]\n" - "ld1 {v1.4h}, [%1], #8\n" - "fmla v10.8h, v3.8h, v4.h[1]\n" - "ldr x22, [%1], #8\n" - "fmla v13.8h, v3.8h, v4.h[2]\n" - "ins v1.d[1], x22\n" - "fmla v16.8h, v3.8h, v4.h[3]\n" - - "subs x20, x20, #0x2\n" - "bne 0b\n" - - "cbz %5, 1f\n" - "fmla v5.8h, v1.8h, v0.h[0]\n" - "ld1 {v2.4h}, [%1], #8\n" - "fmla v8.8h, v1.8h, v0.h[1]\n" - "ldr x23, [%1], #8\n" - "fmla v11.8h, v1.8h, v0.h[2]\n" - "ins v2.d[1], x23\n" - "fmla v14.8h, v1.8h, v0.h[3]\n" - "fmla v6.8h, v2.8h, v0.h[0]\n" - "ld1 {v3.4h}, [%1], #8\n" - "fmla v9.8h, v2.8h, v0.h[1]\n" - "ldr x24, [%1], #8\n" - "fmla v12.8h, v2.8h, v0.h[2]\n" - "ins v3.d[1], x24\n" - "fmla v15.8h, v2.8h, v0.h[3]\n" - "fmla v7.8h, v3.8h, v0.h[0]\n" - "fmla v10.8h, v3.8h, v0.h[1]\n" - "fmla v13.8h, v3.8h, v0.h[2]\n" - "fmla v16.8h, v3.8h, v0.h[3]\n" - - "1:\n" - "mov x26, %0\n" - "st1 {v5.8h, v6.8h, v7.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v8.8h, v9.8h, v10.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.8h, v12.8h, v13.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v14.8h, v15.8h, v16.8h}, [x26]\n" - :"+r" (out), - "+r" (in), - "+r" (w) - :"r" ((I64)KInner), - "r" ((I64)M), - "r" ((I64)KTail) - :"memory", 
"cc", "x0", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", - "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16" - ); -} - -inline void mmm_8x4_A55(U32 M, U32 K, F16* w, F16* in, F16* out) { - U32 KTail = K % 2; - U32 KInner = K - KTail; - asm volatile( - "mov x26, %0\n" - "ld1 {v5.h}[0], [x26], #2\n" - "ld1 {v6.h}[0], [x26], #2\n" - "ld1 {v7.h}[0], [x26], #2\n" - "ld1 {v8.h}[0], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "ld1 {v5.h}[1], [x26], #2\n" - "ld1 {v6.h}[1], [x26], #2\n" - "ld1 {v7.h}[1], [x26], #2\n" - "ld1 {v8.h}[1], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "ld1 {v5.h}[2], [x26], #2\n" - "ld1 {v6.h}[2], [x26], #2\n" - "ld1 {v7.h}[2], [x26], #2\n" - "ld1 {v8.h}[2], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "ld1 {v5.h}[3], [x26], #2\n" - "ld1 {v6.h}[3], [x26], #2\n" - "ld1 {v7.h}[3], [x26], #2\n" - "ld1 {v8.h}[3], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "ld1 {v5.h}[4], [x26], #2\n" - "ld1 {v6.h}[4], [x26], #2\n" - "ld1 {v7.h}[4], [x26], #2\n" - "ld1 {v8.h}[4], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "ld1 {v5.h}[5], [x26], #2\n" - "ld1 {v6.h}[5], [x26], #2\n" - "ld1 {v7.h}[5], [x26], #2\n" - "ld1 {v8.h}[5], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "ld1 {v5.h}[6], [x26], #2\n" - "ld1 {v6.h}[6], [x26], #2\n" - "ld1 {v7.h}[6], [x26], #2\n" - "ld1 {v8.h}[6], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "ld1 {v5.h}[7], [x26], #2\n" - "ld1 {v6.h}[7], [x26], #2\n" - "ld1 {v7.h}[7], [x26], #2\n" - "ld1 {v8.h}[7], [x26], #2\n" - "add x26, x26, %4\n" - - "mov x20, %3\n" - - "ld1 {v1.4h}, [%2], #8\n" - "ldr x24, [%2], #8\n" - "ins v1.d[1], x24\n" - "ld1 {v2.4h}, [%1], #8\n" - - "0:\n" - "fmla v5.8h, v1.8h, v2.h[0]\n" - "ld1 {v3.4h}, [%2], #8\n" - "fmla v6.8h, v1.8h, v2.h[1]\n" - "ldr x25, [%2], #8\n" - "ld1 {v4.4h}, [%1], #8\n" - "fmla v7.8h, v1.8h, v2.h[2]\n" - "ins v3.d[1], x25\n" - "fmla v8.8h, v1.8h, v2.h[3]\n" - - "fmla v5.8h, v3.8h, v4.h[0]\n" - "ld1 {v1.4h}, [%2], #8\n" - "fmla v6.8h, v3.8h, v4.h[1]\n" - "ldr x24, [%2], #8\n" - "ld1 {v2.4h}, [%1], #8\n" - "fmla v7.8h, v3.8h, v4.h[2]\n" - "ins v1.d[1], x24\n" - "fmla v8.8h, v3.8h, v4.h[3]\n" - - "subs x20, x20, 0x2\n" - "bne 0b\n" - - "cbz %5, 1f\n" - "fmla v5.8h, v1.8h, v2.h[0]\n" - "fmla v6.8h, v1.8h, v2.h[1]\n" - "fmla v7.8h, v1.8h, v2.h[2]\n" - "fmla v8.8h, v1.8h, v2.h[3]\n" - - "1:\n" - "mov x26, %0\n" - "st1 {v5.h}[0], [x26], #2\n" - "st1 {v6.h}[0], [x26], #2\n" - "st1 {v7.h}[0], [x26], #2\n" - "st1 {v8.h}[0], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "st1 {v5.h}[1], [x26], #2\n" - "st1 {v6.h}[1], [x26], #2\n" - "st1 {v7.h}[1], [x26], #2\n" - "st1 {v8.h}[1], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "st1 {v5.h}[2], [x26], #2\n" - "st1 {v6.h}[2], [x26], #2\n" - "st1 {v7.h}[2], [x26], #2\n" - "st1 {v8.h}[2], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "st1 {v5.h}[3], [x26], #2\n" - "st1 {v6.h}[3], [x26], #2\n" - "st1 {v7.h}[3], [x26], #2\n" - "st1 {v8.h}[3], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "st1 {v5.h}[4], [x26], #2\n" - "st1 {v6.h}[4], [x26], #2\n" - "st1 {v7.h}[4], [x26], #2\n" - "st1 {v8.h}[4], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "st1 {v5.h}[5], [x26], #2\n" - "st1 {v6.h}[5], [x26], #2\n" - "st1 {v7.h}[5], [x26], #2\n" - "st1 {v8.h}[5], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "st1 {v5.h}[6], [x26], #2\n" - 
"st1 {v6.h}[6], [x26], #2\n" - "st1 {v7.h}[6], [x26], #2\n" - "st1 {v8.h}[6], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "st1 {v5.h}[7], [x26], #2\n" - "st1 {v6.h}[7], [x26], #2\n" - "st1 {v7.h}[7], [x26], #2\n" - "st1 {v8.h}[7], [x26], #2\n" - "add x26, x26, %4\n" - :"+r" (out), - "+r" (in), - "+r" (w) - :"r" ((I64)KInner), - "r" ((I64)M), - "r" ((I64)KTail) - :"memory", "cc", "x20","x24","x25","x26","x27", "v1", "v2", "v3","v4", "v5","v6", "v7", "v8" - ); -} - -inline void mmm_4x8_A55(U32 M, U32 K, F16* w, F16* in, F16* out) { - U32 KTail = K % 2; - U32 KInner = K - KTail; - asm volatile( - "mov x26, %0\n" - "ld1 {v5.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v6.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v7.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v8.8h}, [x26]\n" - - "mov x20, %3\n" - - "ld1 {v1.4h}, [%1], #8\n" - "ldr x24, [%1], #8\n" - "ins v1.d[1], x24\n" - "ld1 {v2.4h}, [%2], #8\n" - - "0:\n" - "fmla v5.8h, v1.8h, v2.h[0]\n" - "ld1 {v3.4h}, [%1], #8\n" - "fmla v6.8h, v1.8h, v2.h[1]\n" - "ldr x25, [%1], #8\n" - "ld1 {v4.4h}, [%2], #8\n" - "fmla v7.8h, v1.8h, v2.h[2]\n" - "ins v3.d[1], x25\n" - "fmla v8.8h, v1.8h, v2.h[3]\n" - "fmla v5.8h, v3.8h, v4.h[0]\n" - "ld1 {v1.4h}, [%1], #8\n" - "fmla v6.8h, v3.8h, v4.h[1]\n" - "ldr x24, [%1], #8\n" - "ld1 {v2.4h}, [%2], #8\n" - "fmla v7.8h, v3.8h, v4.h[2]\n" - "ins v1.d[1], x24\n" - "fmla v8.8h, v3.8h, v4.h[3]\n" - - "subs x20, x20, 0x2\n" - "bne 0b\n" - - "cbz %5, 1f\n" - "fmla v5.8h, v1.8h, v2.h[0]\n" - "fmla v6.8h, v1.8h, v2.h[1]\n" - "fmla v7.8h, v1.8h, v2.h[2]\n" - "fmla v8.8h, v1.8h, v2.h[3]\n" - - "1:\n" - "mov x26, %0\n" - "st1 {v5.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v6.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v7.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v8.8h}, [x26]\n" - :"+r" (out), - "+r" (in), - "+r" (w) - :"r" ((I64)KInner), - "r" ((I64)M), - "r" ((I64)KTail) - :"memory", "cc", "x20","x24","x25","x26","x27", "v1", "v2", "v3","v4", "v5","v6", "v7", "v8" - ); -} - -inline void mmm_4x4_A55(U32 M, U32 K, F16* w, F16* in, F16* out) { - U32 KTail = K % 2; - U32 KInner = K - KTail; - asm volatile( - "mov x26, %0\n" - "ld1 {v5.4h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v6.4h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v7.4h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v8.4h}, [x26]\n" - - "mov x20, %3\n" - - "ld1 {v1.4h}, [%1], #8\n" - "ld1 {v2.4h}, [%2], #8\n" - - "0:\n" - "fmla v5.4h, v1.4h, v2.h[0]\n" - "ld1 {v3.4h}, [%1], #8\n" - "fmla v6.4h, v1.4h, v2.h[1]\n" - "ld1 {v4.4h}, [%2], #8\n" - "fmla v7.4h, v1.4h, v2.h[2]\n" - "fmla v8.4h, v1.4h, v2.h[3]\n" - "fmla v5.4h, v3.4h, v4.h[0]\n" - "ld1 {v1.4h}, [%1], #8\n" - "fmla v6.4h, v3.4h, v4.h[1]\n" - "ld1 {v2.4h}, [%2], #8\n" - "fmla v7.4h, v3.4h, v4.h[2]\n" - "fmla v8.4h, v3.4h, v4.h[3]\n" - - "subs x20, x20, 0x2\n" - "bne 0b\n" - - "cbz %5, 1f\n" - "fmla v5.4h, v1.4h, v2.h[0]\n" - "fmla v6.4h, v1.4h, v2.h[1]\n" - "fmla v7.4h, v1.4h, v2.h[2]\n" - "fmla v8.4h, v1.4h, v2.h[3]\n" - - "1:\n" - "mov x26, %0\n" - "st1 {v5.4h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v6.4h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v7.4h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v8.4h}, [x26]\n" - :"+r" (out), - "+r" (in), - "+r" (w) - :"r" ((I64)KInner), - "r" ((I64)M), - "r" ((I64)KTail) - :"memory", "cc", "x20","x26","v1", "v2", "v3","v4", "v5","v6", "v7", "v8" - ); -} - -inline void mmm_8x8_A55(U32 M, U32 K, F16* w, F16* in, F16* out) { - U32 KTail = K % 2; - U32 KInner = K - KTail; - asm volatile( - "mov x26, %0\n" - "ld1 {v5.8h}, [x26]\n" - "add x26, x26, %4\n" - 
"ld1 {v6.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v7.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v8.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v9.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v10.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v12.8h}, [x26]\n" - - "mov x20, %3\n" - - "ld1 {v1.4h}, [%1], #8\n" - "ldr x24, [%1], #8\n" - "ins v1.d[1], x24\n" - "ld1 {v2.4h}, [%2], #8\n" - "ldr x22, [%2], #8\n" - "ins v2.d[1], x22\n" - - "0:\n" - "fmla v5.8h, v1.8h, v2.h[0]\n" - "ld1 {v3.4h}, [%1], #8\n" - "fmla v6.8h, v1.8h, v2.h[1]\n" - "ldr x25, [%1], #8\n" - "fmla v7.8h, v1.8h, v2.h[2]\n" - "ins v3.d[1], x25\n" - "fmla v8.8h, v1.8h, v2.h[3]\n" - "ld1 {v4.4h}, [%2], #8\n" - "fmla v9.8h, v1.8h, v2.h[4]\n" - "ldr x23, [%2], #8\n" - "fmla v10.8h, v1.8h, v2.h[5]\n" - "ins v4.d[1], x23\n" - "fmla v11.8h, v1.8h, v2.h[6]\n" - "fmla v12.8h, v1.8h, v2.h[7]\n" - - "fmla v5.8h, v3.8h, v4.h[0]\n" - "ld1 {v1.4h}, [%1], #8\n" - "fmla v6.8h, v3.8h, v4.h[1]\n" - "ldr x24, [%1], #8\n" - "fmla v7.8h, v3.8h, v4.h[2]\n" - "ins v1.d[1], x24\n" - "fmla v8.8h, v3.8h, v4.h[3]\n" - "ld1 {v2.4h}, [%2], #8\n" - "fmla v9.8h, v3.8h, v4.h[4]\n" - "ldr x22, [%2], #8\n" - "fmla v10.8h, v3.8h, v4.h[5]\n" - "ins v2.d[1], x22\n" - "fmla v11.8h, v3.8h, v4.h[6]\n" - "fmla v12.8h, v3.8h, v4.h[7]\n" - - "subs x20, x20, 0x2\n" - "bne 0b\n" - - "cbz %5, 1f\n" - "fmla v5.8h, v1.8h, v2.h[0]\n" - "fmla v6.8h, v1.8h, v2.h[1]\n" - "fmla v7.8h, v1.8h, v2.h[2]\n" - "fmla v8.8h, v1.8h, v2.h[3]\n" - "fmla v9.8h, v1.8h, v2.h[4]\n" - "fmla v10.8h, v1.8h, v2.h[5]\n" - "fmla v11.8h, v1.8h, v2.h[6]\n" - "fmla v12.8h, v1.8h, v2.h[7]\n" - - "1:\n" - "mov x26, %0\n" - "st1 {v5.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v6.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v7.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v8.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v9.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v10.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v12.8h}, [x26]\n" - :"+r" (out), - "+r" (in), - "+r" (w) - :"r" ((I64)KInner), - "r" ((I64)M), - "r" ((I64)KTail) - :"memory", "cc", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", - "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12" - ); -} - -inline void mmm_8x24_A55(U32 M, U32 K, F16* w, F16* in, F16* out) { - U32 KTail = K % 2; - U32 KInner = K - KTail; - asm volatile( - //init in0- > v1, w- > v0 - "ld1 {v1.4h}, [%1], #8\n" - "ldr x22, [%1], #8\n" - "ins v1.d[1], x22\n" - "ld1 {v0.4h}, [%2], #8\n" - "ldr x21, [%2], #8\n" - "ins v0.d[1], x21\n" - - "mov x26, %0\n" - "ld1 {v5.8h, v6.8h, v7.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v8.8h, v9.8h, v10.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.8h, v12.8h, v13.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v14.8h, v15.8h, v16.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v17.8h, v18.8h, v19.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v20.8h, v21.8h, v22.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v23.8h, v24.8h, v25.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v26.8h, v27.8h, v28.8h}, [x26]\n" - - "mov x20, %3\n" - - "0:\n" - //w- > v4, in0- > v2/v3/v1, out0=v5~v28 - "ld1 {v2.4h}, [%1], #8\n" - "fmla v5.8h, v1.8h, v0.h[0]\n" - "fmla v8.8h, v1.8h, v0.h[1]\n" - "ldr x23, [%1], #8\n" - "fmla v11.8h, v1.8h, v0.h[2]\n" - "fmla v14.8h, v1.8h, v0.h[3]\n" - "ins v2.d[1], x23\n" - "fmla v17.8h, v1.8h, v0.h[4]\n" - "fmla v20.8h, v1.8h, v0.h[5]\n" - "ld1 {v4.4h}, [%2], #8\n" - "fmla v23.8h, v1.8h, v0.h[6]\n" - "fmla 
v26.8h, v1.8h, v0.h[7]\n" - - "ld1 {v3.4h}, [%1], #8\n" - "fmla v6.8h, v2.8h, v0.h[0]\n" - "fmla v9.8h, v2.8h, v0.h[1]\n" - "ldr x24, [%1], #8\n" - "fmla v12.8h, v2.8h, v0.h[2]\n" - "fmla v15.8h, v2.8h, v0.h[3]\n" - "ins v3.d[1], x24\n" - "fmla v18.8h, v2.8h, v0.h[4]\n" - "fmla v21.8h, v2.8h, v0.h[5]\n" - "ldr x25, [%2], #8\n" - "fmla v24.8h, v2.8h, v0.h[6]\n" - "fmla v27.8h, v2.8h, v0.h[7]\n" - - "ld1 {v1.4h}, [%1], #8\n" - "fmla v7.8h, v3.8h, v0.h[0]\n" - "fmla v10.8h, v3.8h, v0.h[1]\n" - "ldr x22, [%1], #8\n" - "fmla v13.8h, v3.8h, v0.h[2]\n" - "fmla v16.8h, v3.8h, v0.h[3]\n" - "ins v1.d[1], x22\n" - "fmla v19.8h, v3.8h, v0.h[4]\n" - "fmla v22.8h, v3.8h, v0.h[5]\n" - "ins v4.d[1], x25\n" - "fmla v25.8h, v3.8h, v0.h[6]\n" - "fmla v28.8h, v3.8h, v0.h[7]\n" - - //w- > v0, in0- > v2/v3/v1, out0- > v5~v28 - "ld1 {v2.4h}, [%1], #8\n" - "fmla v5.8h, v1.8h, v4.h[0]\n" - "fmla v8.8h, v1.8h, v4.h[1]\n" - "ldr x23, [%1], #8\n" - "fmla v11.8h, v1.8h, v4.h[2]\n" - "fmla v14.8h, v1.8h, v4.h[3]\n" - "ins v2.d[1], x23\n" - "fmla v17.8h, v1.8h, v4.h[4]\n" - "fmla v20.8h, v1.8h, v4.h[5]\n" - "ld1 {v0.4h}, [%2], #8\n" - "fmla v23.8h, v1.8h, v4.h[6]\n" - "fmla v26.8h, v1.8h, v4.h[7]\n" - - "ld1 {v3.4h}, [%1], #8\n" - "fmla v6.8h, v2.8h, v4.h[0]\n" - "fmla v9.8h, v2.8h, v4.h[1]\n" - "ldr x24, [%1], #8\n" - "fmla v12.8h, v2.8h, v4.h[2]\n" - "fmla v15.8h, v2.8h, v4.h[3]\n" - "ins v3.d[1], x24\n" - "fmla v18.8h, v2.8h, v4.h[4]\n" - "fmla v21.8h, v2.8h, v4.h[5]\n" - "ldr x21, [%2], #8\n" - "fmla v24.8h, v2.8h, v4.h[6]\n" - "fmla v27.8h, v2.8h, v4.h[7]\n" - - "ld1 {v1.4h}, [%1], #8\n" - "fmla v7.8h, v3.8h, v4.h[0]\n" - "fmla v10.8h, v3.8h, v4.h[1]\n" - "ldr x22, [%1], #8\n" - "fmla v13.8h, v3.8h, v4.h[2]\n" - "fmla v16.8h, v3.8h, v4.h[3]\n" - "ins v1.d[1], x22\n" - "fmla v19.8h, v3.8h, v4.h[4]\n" - "fmla v22.8h, v3.8h, v4.h[5]\n" - "ins v0.d[1], x21\n" - "fmla v25.8h, v3.8h, v4.h[6]\n" - "subs x20, x20, #0x2\n" - "fmla v28.8h, v3.8h, v4.h[7]\n" - - "bne 0b\n" - - "cbz %5, 1f\n" - "ld1 {v2.4h}, [%1], #8\n" - "fmla v5.8h, v1.8h, v0.h[0]\n" - "fmla v8.8h, v1.8h, v0.h[1]\n" - "ldr x23, [%1], #8\n" - "fmla v11.8h, v1.8h, v0.h[2]\n" - "fmla v14.8h, v1.8h, v0.h[3]\n" - "ins v2.d[1], x23\n" - "fmla v17.8h, v1.8h, v0.h[4]\n" - "fmla v20.8h, v1.8h, v0.h[5]\n" - "fmla v23.8h, v1.8h, v0.h[6]\n" - "fmla v26.8h, v1.8h, v0.h[7]\n" - - "ld1 {v3.4h}, [%1], #8\n" - "fmla v6.8h, v2.8h, v0.h[0]\n" - "fmla v9.8h, v2.8h, v0.h[1]\n" - "ldr x24, [%1], #8\n" - "fmla v12.8h, v2.8h, v0.h[2]\n" - "fmla v15.8h, v2.8h, v0.h[3]\n" - "ins v3.d[1], x24\n" - "fmla v18.8h, v2.8h, v0.h[4]\n" - "fmla v21.8h, v2.8h, v0.h[5]\n" - "fmla v24.8h, v2.8h, v0.h[6]\n" - "fmla v27.8h, v2.8h, v0.h[7]\n" - - "fmla v7.8h, v3.8h, v0.h[0]\n" - "fmla v10.8h, v3.8h, v0.h[1]\n" - "fmla v13.8h, v3.8h, v0.h[2]\n" - "fmla v16.8h, v3.8h, v0.h[3]\n" - "fmla v19.8h, v3.8h, v0.h[4]\n" - "fmla v22.8h, v3.8h, v0.h[5]\n" - "fmla v25.8h, v3.8h, v0.h[6]\n" - "fmla v28.8h, v3.8h, v0.h[7]\n" - - "1:\n" - "mov x26, %0\n" - "st1 {v5.8h, v6.8h, v7.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v8.8h, v9.8h, v10.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.8h, v12.8h, v13.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v14.8h, v15.8h, v16.8h}, [x26]\n" - "add x26, x26, %4\n" - - "st1 {v17.8h, v18.8h, v19.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v20.8h, v21.8h, v22.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v23.8h, v24.8h, v25.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v26.8h, v27.8h, v28.8h}, [x26]\n" - :"+r" (out), - "+r" (in), - "+r" (w) - :"r" ((I64)KInner), - "r" ((I64)M), 
- "r" ((I64)KTail) - :"memory", "cc", "x0", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", - "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28" - ); -} - -void mmm_A55(int M, int N, int K, F16* matrix1, F16* matrix2, F16* tmp, F16* result) { - int blockK = K; - int blockM = 192; - F16* matrix1Trans = tmp; - F16* resultCurrent = result; - int KInner, MInner, m, n; - for(int k = 0; k < K; k += blockK) { - KInner = UNI_MIN(blockK, K - k); - for(int i = 0; i < M; i+=blockM) { - - MInner = UNI_MIN(blockM, M - i); - - for(n = 0; n <= N - 8; n+=8) { - if (i == 0) { - matrix1_trans(8, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); - } - for(m = 0; m <= (MInner-24); m+=24) { - resultCurrent = result + n * M + m + i; - mmm_8x24_A55(M*2, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - for(; m <=(MInner - 8); m+=8) { - resultCurrent = result + n * M + m + i; - mmm_8x8_A55(M*2, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - - if ((MInner - m) >= 4) { - resultCurrent = result + n * M + m + i; - mmm_8x4_A55(M*2, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - m += 4; - } - - if (MInner - m) { - resultCurrent = result + n * M + m + i; - mmm_N8_MTail(MInner - m, M, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - } - - if ((N - n) >= 4) { - - if (i == 0) { - matrix1_trans(4, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); - } - - for(m = 0; m <= (MInner - 24); m+=24) { - resultCurrent = result + n * M + m + i; - mmm_4x24_A55(M*2, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - - for(; m <= (MInner - 8); m+=8) { - resultCurrent = result + n * M + m + i; - mmm_4x8_A55(M*2, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - - if ((MInner - m) >= 4) { - resultCurrent = result + n * M + m + i; - mmm_4x4_A55(M*2, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - m += 4; - } - - if (MInner - m) { - resultCurrent = result + n * M + m + i; - mmm_N4_MTail(MInner - m, M, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - - n += 4; - - } - - if (N - n) { - if (i == 0) { - matrix1_trans(N-n, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); - } - - for(m = 0; m <= (MInner - 24); m+=24) { - resultCurrent = result + n * M + m + i; - mmm_NTail_M24(M, N - n, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - - for(; m <= (MInner - 8); m+=8) { - resultCurrent = result + n * M + m + i; - mmm_NTail_M8(M, N - n, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - - if ((MInner - m) >= 4) { - resultCurrent = result + n * M + m + i; - mmm_NTail_M4(M, N - n, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - m += 4; - } - - if (MInner - m) { - resultCurrent = result + n * M + m + i; - mmm_NTail_M(MInner - m, M, N - n, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - - } - - } - } -} diff --git a/blas-enhance/src/cpu/arm/fp16/mmm_A76.cpp b/blas-enhance/src/cpu/arm/fp16/mmm_A76.cpp deleted file mode 100644 index 83deca1e..00000000 --- a/blas-enhance/src/cpu/arm/fp16/mmm_A76.cpp +++ /dev/null @@ -1,634 +0,0 @@ -// Copyright (C) 2019. 
Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#include <string.h>
-#include <arm_neon.h>
-
-#include "type.h"
-#include "error.h"
-#include "cpu/arm/fp16/mmm_common.h"
-#include "cpu/arm/fp16/mmm.h"
-
-#define MMM_FMA_4x8_V5V14s3_V1xV0 "fmla v5.8h, v1.8h, v0.h[0]\n"\
-    "fmla v8.8h, v1.8h, v0.h[1]\n"\
-    "fmla v11.8h, v1.8h, v0.h[2]\n"\
-    "fmla v14.8h, v1.8h, v0.h[3]\n"
-#define MMM_FMA_4x8_V17V26s3_V1xV0 "fmla v17.8h, v1.8h, v0.h[4]\n"\
-    "fmla v20.8h, v1.8h, v0.h[5]\n"\
-    "fmla v23.8h, v1.8h, v0.h[6]\n"\
-    "fmla v26.8h, v1.8h, v0.h[7]\n"
-#define MMM_FMA_4x8_V6V15s3_V2xV0 "fmla v6.8h, v2.8h, v0.h[0]\n"\
-    "fmla v9.8h, v2.8h, v0.h[1]\n"\
-    "fmla v12.8h, v2.8h, v0.h[2]\n"\
-    "fmla v15.8h, v2.8h, v0.h[3]\n"
-#define MMM_FMA_4x8_V18V27s3_V2xV0 "fmla v18.8h, v2.8h, v0.h[4]\n"\
-    "fmla v21.8h, v2.8h, v0.h[5]\n"\
-    "fmla v24.8h, v2.8h, v0.h[6]\n"\
-    "fmla v27.8h, v2.8h, v0.h[7]\n"
-#define MMM_FMA_4x8_V7V16s3_V3xV0 "fmla v7.8h, v3.8h, v0.h[0]\n"\
-    "fmla v10.8h, v3.8h, v0.h[1]\n"\
-    "fmla v13.8h, v3.8h, v0.h[2]\n"\
-    "fmla v16.8h, v3.8h, v0.h[3]\n"
-#define MMM_FMA_4x8_V19V28s3_V3xV0 "fmla v19.8h, v3.8h, v0.h[4]\n"\
-    "fmla v22.8h, v3.8h, v0.h[5]\n"\
-    "fmla v25.8h, v3.8h, v0.h[6]\n"\
-    "fmla v28.8h, v3.8h, v0.h[7]\n"
-#define MMM_FMA_4x8_V5V14s3_V29xV4 "fmla v5.8h, v29.8h, v4.h[0]\n"\
-    "fmla v8.8h, v29.8h, v4.h[1]\n"\
-    "fmla v11.8h, v29.8h, v4.h[2]\n"\
-    "fmla v14.8h, v29.8h, v4.h[3]\n"
-#define MMM_FMA_4x8_V17V26s3_V29xV4 "fmla v17.8h, v29.8h, v4.h[4]\n"\
-    "fmla v20.8h, v29.8h, v4.h[5]\n"\
-    "fmla v23.8h, v29.8h, v4.h[6]\n"\
-    "fmla v26.8h, v29.8h, v4.h[7]\n"
-#define MMM_FMA_4x8_V6V15s3_V30xV4 "fmla v6.8h, v30.8h, v4.h[0]\n"\
-    "fmla v9.8h, v30.8h, v4.h[1]\n"\
-    "fmla v12.8h, v30.8h, v4.h[2]\n"\
-    "fmla v15.8h, v30.8h, v4.h[3]\n"
-#define MMM_FMA_4x8_V18V27s3_V30xV4 "fmla v18.8h, v30.8h, v4.h[4]\n"\
-    "fmla v21.8h, v30.8h, v4.h[5]\n"\
-    "fmla v24.8h, v30.8h, v4.h[6]\n"\
-    "fmla v27.8h, v30.8h, v4.h[7]\n"
-#define MMM_FMA_4x8_V7V16s3_V31xV4 "fmla v7.8h, v31.8h, v4.h[0]\n"\
-    "fmla v10.8h, v31.8h, v4.h[1]\n"\
-    "fmla v13.8h, v31.8h, v4.h[2]\n"\
-    "fmla v16.8h, v31.8h, v4.h[3]\n"
-#define MMM_FMA_4x8_V19V28s3_V31xV4 "fmla v19.8h, v31.8h, v4.h[4]\n"\
-    "fmla v22.8h, v31.8h, v4.h[5]\n"\
-    "fmla v25.8h, v31.8h, v4.h[6]\n"\
-    "fmla v28.8h, v31.8h, v4.h[7]\n"
-
-inline void mmm_4x24_A76(U32 M, U32 K, F16* w, F16* in, F16* out) {
-    U32 KTail = K % 2;
-    U32 KInner = K - KTail;
-    asm volatile(
-        //init in0->v1, w->v0
-        "ld1 {v1.8h}, [%1], #16\n"
-        "ld1 {v0.4h}, [%2], #8\n"
- "mov x26, %0\n" - "ld1 {v5.8h, v6.8h, v7.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v8.8h, v9.8h, v10.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.8h, v12.8h, v13.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v14.8h, v15.8h, v16.8h}, [x26]\n" - - "mov x20, %3\n" - - "0:\n" - //w- > v4, in0- > v2/v3/v1, out0=v5~v28 - "ld1 {v2.8h}, [%1], #16\n" - MMM_FMA_4x8_V5V14s3_V1xV0 - "ld1 {v3.8h}, [%1], #16\n" - "ld1 {v29.8h}, [%1], #16\n" - MMM_FMA_4x8_V6V15s3_V2xV0 - "ld1 {v4.4h}, [%2], #8\n" - MMM_FMA_4x8_V7V16s3_V3xV0 - - //w- > v0, in0- > v2/v3/v1, out0- > v5~v28 - "ld1 {v30.8h}, [%1], #16\n" - MMM_FMA_4x8_V5V14s3_V29xV4 - "ld1 {v31.8h}, [%1], #16\n" - MMM_FMA_4x8_V6V15s3_V30xV4 - "ld1 {v1.8h}, [%1], #16\n" - "ld1 {v0.4h}, [%2], #8\n" - MMM_FMA_4x8_V7V16s3_V31xV4 - - "subs x20, x20, #0x2\n" - "bne 0b\n" - - "cbz %5, 1f\n" - "ld1 {v2.8h}, [%1], #16\n" - MMM_FMA_4x8_V5V14s3_V1xV0 - "ld1 {v3.8h}, [%1], #16\n" - MMM_FMA_4x8_V6V15s3_V2xV0 - MMM_FMA_4x8_V7V16s3_V3xV0 - - "1:\n" - "mov x26, %0\n" - "st1 {v5.8h, v6.8h, v7.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v8.8h, v9.8h, v10.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.8h, v12.8h, v13.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v14.8h, v15.8h, v16.8h}, [x26]\n" - :"+r" (out), - "+r" (in), - "+r" (w) - :"r" ((I64)KInner), - "r" ((I64)M), - "r" ((I64)KTail) - :"memory", "cc", "x20", "x26", - "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v29", "v30", "v31" - ); -} -inline void mmm_8x4_A76(U32 M, U32 K, F16* w, F16* in, F16* out) { - U32 KTail = K % 2; - U32 KInner = K - KTail; - asm volatile( - "ld1 {v1.8h}, [%2], #16\n" - "ld1 {v0.4h}, [%1], #8\n" - - "mov x26, %0\n" - "ld1 {v5.h}[0], [x26], #2\n" - "ld1 {v8.h}[0], [x26], #2\n" - "ld1 {v11.h}[0], [x26], #2\n" - "ld1 {v14.h}[0], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "ld1 {v5.h}[1], [x26], #2\n" - "ld1 {v8.h}[1], [x26], #2\n" - "ld1 {v11.h}[1], [x26], #2\n" - "ld1 {v14.h}[1], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "ld1 {v5.h}[2], [x26], #2\n" - "ld1 {v8.h}[2], [x26], #2\n" - "ld1 {v11.h}[2], [x26], #2\n" - "ld1 {v14.h}[2], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "ld1 {v5.h}[3], [x26], #2\n" - "ld1 {v8.h}[3], [x26], #2\n" - "ld1 {v11.h}[3], [x26], #2\n" - "ld1 {v14.h}[3], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "ld1 {v5.h}[4], [x26], #2\n" - "ld1 {v8.h}[4], [x26], #2\n" - "ld1 {v11.h}[4], [x26], #2\n" - "ld1 {v14.h}[4], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "ld1 {v5.h}[5], [x26], #2\n" - "ld1 {v8.h}[5], [x26], #2\n" - "ld1 {v11.h}[5], [x26], #2\n" - "ld1 {v14.h}[5], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "ld1 {v5.h}[6], [x26], #2\n" - "ld1 {v8.h}[6], [x26], #2\n" - "ld1 {v11.h}[6], [x26], #2\n" - "ld1 {v14.h}[6], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "ld1 {v5.h}[7], [x26], #2\n" - "ld1 {v8.h}[7], [x26], #2\n" - "ld1 {v11.h}[7], [x26], #2\n" - "ld1 {v14.h}[7], [x26], #2\n" - - "mov x20, %3\n" - - "0:\n" - "ld1 {v4.4h}, [%1], #8\n" - "ld1 {v29.8h}, [%2], #16\n" - MMM_FMA_4x8_V5V14s3_V1xV0 - "ld1 {v1.8h}, [%2], #16\n" - "ld1 {v0.4h}, [%1], #8\n" - MMM_FMA_4x8_V5V14s3_V29xV4 - - "subs x20, x20, 0x2\n" - "bne 0b\n" - - "cbz %5, 1f\n" - MMM_FMA_4x8_V5V14s3_V1xV0 - - "1:\n" - "mov x26, %0\n" - "st1 {v5.h}[0], [x26], #2\n" - "st1 {v8.h}[0], [x26], #2\n" - "st1 {v11.h}[0], [x26], #2\n" - "st1 {v14.h}[0], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "st1 {v5.h}[1], 
[x26], #2\n" - "st1 {v8.h}[1], [x26], #2\n" - "st1 {v11.h}[1], [x26], #2\n" - "st1 {v14.h}[1], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "st1 {v5.h}[2], [x26], #2\n" - "st1 {v8.h}[2], [x26], #2\n" - "st1 {v11.h}[2], [x26], #2\n" - "st1 {v14.h}[2], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "st1 {v5.h}[3], [x26], #2\n" - "st1 {v8.h}[3], [x26], #2\n" - "st1 {v11.h}[3], [x26], #2\n" - "st1 {v14.h}[3], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "st1 {v5.h}[4], [x26], #2\n" - "st1 {v8.h}[4], [x26], #2\n" - "st1 {v11.h}[4], [x26], #2\n" - "st1 {v14.h}[4], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "st1 {v5.h}[5], [x26], #2\n" - "st1 {v8.h}[5], [x26], #2\n" - "st1 {v11.h}[5], [x26], #2\n" - "st1 {v14.h}[5], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "st1 {v5.h}[6], [x26], #2\n" - "st1 {v8.h}[6], [x26], #2\n" - "st1 {v11.h}[6], [x26], #2\n" - "st1 {v14.h}[6], [x26], #2\n" - "sub x26, x26, #8\n" - "add x26, x26, %4\n" - "st1 {v5.h}[7], [x26], #2\n" - "st1 {v8.h}[7], [x26], #2\n" - "st1 {v11.h}[7], [x26], #2\n" - "st1 {v14.h}[7], [x26], #2\n" - :"+r" (out), - "+r" (in), - "+r" (w) - :"r" ((I64)KInner), - "r" ((I64)M), - "r" ((I64)KTail) - :"memory", "cc", "x20", "x26", "v0", "v1", "v4", "v29", "v5", "v8", "v11", "v14" - ); -} - -inline void mmm_4x8_A76(U32 M, U32 K, F16* w, F16* in, F16* out) { - U32 KTail = K % 2; - U32 KInner = K - KTail; - asm volatile( - "ld1 {v1.8h}, [%1], #16\n" - "ld1 {v0.4h}, [%2], #8\n" - "mov x26, %0\n" - "ld1 {v5.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v8.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v14.8h}, [x26]\n" - - "mov x20, %3\n" - - "0:\n" - "ld1 {v29.8h}, [%1], #16\n" - "ld1 {v4.4h}, [%2], #8\n" - MMM_FMA_4x8_V5V14s3_V1xV0 - "ld1 {v1.8h}, [%1], #16\n" - "ld1 {v0.4h}, [%2], #8\n" - MMM_FMA_4x8_V5V14s3_V29xV4 - - "subs x20, x20, 0x2\n" - "bne 0b\n" - - "cbz %5, 1f\n" - MMM_FMA_4x8_V5V14s3_V1xV0 - - "1:\n" - "mov x26, %0\n" - "st1 {v5.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v8.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v14.8h}, [x26]\n" - :"+r" (out), - "+r" (in), - "+r" (w) - :"r" ((I64)KInner), - "r" ((I64)M), - "r" ((I64)KTail) - :"memory", "cc", "x20", "x26", "v0", "v1", "v4", "v5","v8", "v11","v14", "v29" - ); -} - -inline void mmm_4x4_A76(U32 M, U32 K, F16* w, F16* in, F16* out) { - U32 KTail = K % 2; - U32 KInner = K - KTail; - asm volatile( - "ld1 {v1.4h}, [%1], #8\n" - "ld1 {v0.4h}, [%2], #8\n" - "mov x26, %0\n" - "ld1 {v5.4h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v8.4h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.4h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v14.4h}, [x26]\n" - - "mov x20, %3\n" - - "0:\n" - "ld1 {v29.4h}, [%1], #8\n" - "ld1 {v4.4h}, [%2], #8\n" - MMM_FMA_4x8_V5V14s3_V1xV0 - "ld1 {v1.4h}, [%1], #8\n" - "ld1 {v0.4h}, [%2], #8\n" - MMM_FMA_4x8_V5V14s3_V29xV4 - - "subs x20, x20, 0x2\n" - "bne 0b\n" - - "cbz %5, 1f\n" - MMM_FMA_4x8_V5V14s3_V1xV0 - - "1:\n" - "mov x26, %0\n" - "st1 {v5.4h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v8.4h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.4h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v14.4h}, [x26]\n" - :"+r" (out), - "+r" (in), - "+r" (w) - :"r" ((I64)KInner), - "r" ((I64)M), - "r" ((I64)KTail) - :"memory", "cc", "x20", "x26", "v0", "v1", "v4", "v29", "v5", "v8", "v11", "v14" - ); -} - -inline void mmm_8x8_A76(U32 M, U32 K, F16* w, F16* in, F16* out) { - U32 KTail = K % 2; - U32 KInner = K - KTail; - 
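    // Note (added annotation): every fp16 kernel in this file splits K the
    // same way -- KInner = K - (K % 2) steps are consumed two at a time by the
    // "subs x20, x20, 0x2" loop, and the "cbz %5, 1f" on KTail performs the
    // single leftover multiply-accumulate when K is odd.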
asm volatile( - "mov x26, %0\n" - "ld1 {v5.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v8.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v14.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v17.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v20.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v23.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v26.8h}, [x26]\n" - - "mov x20, %3\n" - - "ld1 {v1.8h}, [%1], #16\n" - "ld1 {v0.8h}, [%2], #16\n" - - "0:\n" - "ld1 {v29.8h}, [%1], #16\n" - "ld1 {v4.8h}, [%2], #16\n" - MMM_FMA_4x8_V5V14s3_V1xV0 - MMM_FMA_4x8_V17V26s3_V1xV0 - - "ld1 {v1.8h}, [%1], #16\n" - "ld1 {v0.8h}, [%2], #16\n" - MMM_FMA_4x8_V5V14s3_V29xV4 - MMM_FMA_4x8_V17V26s3_V29xV4 - - "subs x20, x20, 0x2\n" - "bne 0b\n" - - "cbz %5, 1f\n" - MMM_FMA_4x8_V5V14s3_V1xV0 - MMM_FMA_4x8_V17V26s3_V1xV0 - - "1:\n" - "mov x26, %0\n" - "st1 {v5.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v8.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v14.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v17.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v20.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v23.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v26.8h}, [x26]\n" - :"+r" (out), - "+r" (in), - "+r" (w) - :"r" ((I64)KInner), - "r" ((I64)M), - "r" ((I64)KTail) - :"memory", "cc", "x20", "x26", - "v1", "v0", "v29", "v4", "v5", "v8", "v11", "v14", "v17", "v20", "v23", "v26" - ); -} - -inline void mmm_8x24_A76(U32 M, U32 K, F16* w, F16* in, F16* out) { - U32 KTail = K % 2; - U32 KInner = K - KTail; - asm volatile( - //init in0- > v1, w- > v0 - "ld1 {v1.8h}, [%1], #16\n" - "ld1 {v0.8h}, [%2], #16\n" - "mov x26, %0\n" - "ld1 {v5.8h, v6.8h, v7.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v8.8h, v9.8h, v10.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.8h, v12.8h, v13.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v14.8h, v15.8h, v16.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v17.8h, v18.8h, v19.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v20.8h, v21.8h, v22.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v23.8h, v24.8h, v25.8h}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v26.8h, v27.8h, v28.8h}, [x26]\n" - - "mov x20, %3\n" - - "0:\n" - //w- > v4, in0- > v2/v3/v1, out0=v5~v28 - "ld1 {v2.8h}, [%1], #16\n" - "ld1 {v3.8h}, [%1], #16\n" - MMM_FMA_4x8_V5V14s3_V1xV0 - MMM_FMA_4x8_V17V26s3_V1xV0 - - "ld1 {v4.8h}, [%2], #16\n" - MMM_FMA_4x8_V6V15s3_V2xV0 - MMM_FMA_4x8_V18V27s3_V2xV0 - - "ld1 {v29.8h}, [%1], #16\n" - MMM_FMA_4x8_V7V16s3_V3xV0 - MMM_FMA_4x8_V19V28s3_V3xV0 - - //w- > v0, in0- > v2/v3/v1, out0- > v5~v28 - "ld1 {v30.8h}, [%1], #16\n" - "ld1 {v0.8h}, [%2], #16\n" - MMM_FMA_4x8_V5V14s3_V29xV4 - MMM_FMA_4x8_V17V26s3_V29xV4 - - "ld1 {v31.8h}, [%1], #16\n" - MMM_FMA_4x8_V6V15s3_V30xV4 - MMM_FMA_4x8_V18V27s3_V30xV4 - - "ld1 {v1.8h}, [%1], #16\n" - MMM_FMA_4x8_V7V16s3_V31xV4 - "subs x20, x20, #0x2\n" - MMM_FMA_4x8_V19V28s3_V31xV4 - - "bne 0b\n" - - "cbz %5, 1f\n" - "ld1 {v2.8h}, [%1], #16\n" - "ld1 {v3.8h}, [%1], #16\n" - MMM_FMA_4x8_V5V14s3_V1xV0 - MMM_FMA_4x8_V17V26s3_V1xV0 - MMM_FMA_4x8_V6V15s3_V2xV0 - MMM_FMA_4x8_V18V27s3_V2xV0 - MMM_FMA_4x8_V7V16s3_V3xV0 - MMM_FMA_4x8_V19V28s3_V3xV0 - - "1:\n" - "mov x26, %0\n" - "st1 {v5.8h, v6.8h, v7.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v8.8h, v9.8h, v10.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.8h, v12.8h, v13.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v14.8h, v15.8h, v16.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v17.8h, v18.8h, v19.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v20.8h, 
v21.8h, v22.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v23.8h, v24.8h, v25.8h}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v26.8h, v27.8h, v28.8h}, [x26]\n" - :"+r" (out), - "+r" (in), - "+r" (w) - :"r" ((I64)KInner), - "r" ((I64)M), - "r" ((I64)KTail) - :"memory", "cc", "x0", "x20", "x26", - "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); -} - -void mmm_A76(int M, int N, int K, F16* matrix1, F16* matrix2, F16* tmp, F16* result) -{ - int blockK = K; - int blockM = 192; - F16* matrix1Trans = tmp; - F16* resultCurrent = result; - int KInner, MInner, m, n; - for(int k = 0; k < K; k += blockK) { - KInner = UNI_MIN(blockK, K - k); - for(int i = 0; i < M; i+=blockM) { - MInner = UNI_MIN(blockM, M - i); - for(n = 0; n <= N - 8; n+=8) { - if (i == 0) { - matrix1_trans(8, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); - } - for(m = 0; m <= (MInner-24); m+=24) { - resultCurrent = result + n * M + m + i; - mmm_8x24_A76(M*2, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - for(; m <=(MInner - 8); m+=8) { - resultCurrent = result + n * M + m + i; - mmm_8x8_A76(M*2, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - - if ((MInner - m) >= 4) { - resultCurrent = result + n * M + m + i; - mmm_8x4_A76(M*2, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - m += 4; - } - - if (MInner - m) { - resultCurrent = result + n * M + m + i; - mmm_N8_MTail(MInner - m, M, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - } - - if ((N - n) >= 4) { - if (i == 0) { - matrix1_trans(4, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); - } - - for(m = 0; m <= (MInner - 24); m+=24) { - resultCurrent = result + n * M + m + i; - mmm_4x24_A76(M*2, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - - for(; m <= (MInner - 8); m+=8) { - resultCurrent = result + n * M + m + i; - mmm_4x8_A76(M*2, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - - if ((MInner - m) >= 4) { - resultCurrent = result + n * M + m + i; - mmm_4x4_A76(M*2, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - m += 4; - } - - if (MInner - m) { - resultCurrent = result + n * M + m + i; - mmm_N4_MTail(MInner - m, M, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - - n += 4; - - } - - if (N - n) { - if (i == 0) { - matrix1_trans(N-n, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); - } - - for(m = 0; m <= (MInner - 24); m+=24) { - resultCurrent = result + n * M + m + i; - mmm_NTail_M24(M, N - n, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - - for(; m <= (MInner - 8); m+=8) { - resultCurrent = result + n * M + m + i; - mmm_NTail_M8(M, N - n, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - - if ((MInner - m) >= 4) { - resultCurrent = result + n * M + m + i; - mmm_NTail_M4(M, N - n, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - m += 4; - } - - if (MInner - m) { - resultCurrent = result + n * M + m + i; - mmm_NTail_M(MInner - m, M, N - n, KInner, matrix1Trans + n * KInner, matrix2 + (i + m) * KInner, resultCurrent); - } - - } - - } - } -} diff --git a/blas-enhance/src/cpu/arm/fp16/mvm_A55.cpp 
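The mmm_A76 driver above repeats the strip-mining order of mmm_A55: N is walked in 8-row strips (then one 4-row strip, then a ragged tail), and within each strip M is covered by the widest kernel first. A minimal sketch of that dispatch order; the kernel names printed here are placeholders for the asm microkernels, not real symbols:

```cpp
#include <cstdio>

// Sketch of the tile-dispatch order used by mmm_A55/mmm_A76 (fp16).
static void dispatch_tiles(int N, int MInner)
{
    int n = 0;
    for (; n <= N - 8; n += 8) {              // 8-row strips of transposed matrix1
        int m = 0;
        for (; m <= MInner - 24; m += 24)     // widest tile first
            std::printf("mmm_8x24 at (n=%d, m=%d)\n", n, m);
        for (; m <= MInner - 8; m += 8)
            std::printf("mmm_8x8  at (n=%d, m=%d)\n", n, m);
        if (MInner - m >= 4) {
            std::printf("mmm_8x4  at (n=%d, m=%d)\n", n, m);
            m += 4;
        }
        if (MInner - m)
            std::printf("mmm_N8_MTail at (n=%d, m=%d)\n", n, m);
    }
    if (N - n >= 4) {                         // one 4-row strip, same M sequence
        std::printf("4-row strip at n=%d\n", n);
        n += 4;
    }
    if (N - n)                                // ragged rows: mmm_NTail_M* kernels
        std::printf("NTail strip at n=%d\n", n);
}

int main() { dispatch_tiles(20, 50); }
```

Note also that matrix1 is transposed into the packed buffer only on the first M-panel (the `i == 0` test in the drivers), so each N-strip is packed once per K-block and reused across all M panels.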
b/blas-enhance/src/cpu/arm/fp16/mvm_A55.cpp deleted file mode 100644 index e468f4e3..00000000 --- a/blas-enhance/src/cpu/arm/fp16/mvm_A55.cpp +++ /dev/null @@ -1,144 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include - -#include "mvm_common.h" -#include "mvm.h" - - -inline void mvm_row_kernel_A55(U32 N, U32 K, F16* matrix, F16* vector, F16* result) { - U32 KTail = K % 8; - U32 KInner = K - KTail; - F16* w0 = matrix; - F16* w1 = matrix + K * N/2; - F16* w2 = matrix + K * 2 * N/2; - F16* w3 = matrix + K * 3 * N/2; - - asm volatile( - "mov x19, %5\n" - "ld1 {v18.h}[0], [x19]\n" - "add x19, x19, %8\n" - "ld1 {v18.h}[1], [x19]\n" - "add x19, x19, %8\n" - "ld1 {v18.h}[2], [x19]\n" - "add x19, x19, %8\n" - "ld1 {v18.h}[3], [x19]\n" - - "movi v17.8h, #0x0\n" - "movi v16.8h, #0x0\n" - "movi v9.8h, #0x0\n" - "movi v10.8h, #0x0\n" - "movi v11.8h, #0x0\n" - "movi v12.8h, #0x0\n" - "mov x20, %6\n" - "cmp x20, #0x0\n" - "beq 3f\n" - "0:\n" - - "ld1 {v0.4h}, [%0], #8\n" - "ldr x15, [%0], #8\n" - "ins v0.d[1], x15\n" - - "ld1 {v1.4h}, [%1], #8\n" - "ld1 {v2.4h}, [%2], #8\n" - "ldr x21, [%1], #8\n" - "ldr x22, [%2], #8\n" - "ins v1.d[1], x21\n" - "ins v2.d[1], x22\n" - - "ld1 {v3.4h}, [%3], #8\n" - "fmla v9.8h, v1.8h, v0.8h\n" - "ld1 {v4.4h}, [%4], #8\n" - "fmla v10.8h, v2.8h, v0.8h\n" - "ldr x23, [%3], #8\n" - "ldr x24, [%4], #8\n" - "ins v3.d[1], x23\n" - "ins v4.d[1], x24\n" - "fmla v11.8h, v3.8h, v0.8h\n" - "fmla v12.8h, v4.8h, v0.8h\n" - - "subs x20, x20, 0x8\n" - "bne 0b\n" - - "faddp v13.8h, v9.8h, v10.8h\n" - "faddp v14.8h, v11.8h, v12.8h\n" - "faddp v15.8h, v13.8h, v14.8h\n" - "faddp v17.8h, v15.8h, v15.8h\n" - "3:\n" - "mov x16, %7\n" - "cmp x16, #0x0\n" - "beq 2f\n" - - "1:\n" - "ld1 {v8.h}[0], [%0], #2\n" - - "ld1 {v1.h}[0], [%1], #2\n" - "ld1 {v1.h}[1], [%2], #2\n" - "ld1 {v1.h}[2], [%3], #2\n" - "ld1 {v1.h}[3], [%4], #2\n" - "fmla v16.8h, v1.8h, v8.h[0]\n" - - "subs x16, x16, 0x1\n" - "bne 1b\n" - - "fadd v17.8h, v17.8h, v16.8h\n" - - "2:\n" - - "fadd v17.8h, v17.8h, v18.8h\n" - - "mov x19, %5\n" - "st1 {v17.h}[0], [x19]\n" - "add x19, x19, %8\n" - "st1 {v17.h}[1], [x19]\n" - "add x19, x19, %8\n" - "st1 {v17.h}[2], [x19]\n" - "add x19, x19, %8\n" - "st1 {v17.h}[3], [x19]\n" - - :"+r" (vector), - "+r" (w0), - "+r" (w1), - "+r" (w2), - "+r" (w3), - "+r" (result) - :"r" ((I64)KInner), - "r" ((I64)KTail), - "r" ((I64)N) - :"memory", "cc", "x19", "x20", "x21", "x22", "x23", "x24", "x15", "x16", - "v0", "v1", "v2", 
"v3", "v4", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18" - ); -} - -inline void mvm_row_A55(U32 numRows, U32 numColumns, F16* matrix, F16* vector, F16* result) { - //Actual layout is NK, and vector is K - U32 N = numRows; - U32 K = numColumns; - U32 NTail = N % 4; - U32 NInner = N / 4; - for(U32 i = 0; i < NInner; i++) { - mvm_row_kernel_A55(NInner * 2, K, matrix + i * K, vector, result + i); - } - if (NTail != 0) { - mvm_row_tail(NTail, K, matrix + (N - NTail) * K, vector, result + (N - NTail)); - } -} - -void mvm_A55(U32 row, U32 col, bool transpose, F16* matrix, F16* vector, F16* result) { - if (transpose) - mvm_col(row, col, matrix, vector, result); - else - mvm_row_A55(row, col, matrix, vector, result); -} diff --git a/blas-enhance/src/cpu/arm/fp16/mvm_A76.cpp b/blas-enhance/src/cpu/arm/fp16/mvm_A76.cpp deleted file mode 100644 index 135bfa51..00000000 --- a/blas-enhance/src/cpu/arm/fp16/mvm_A76.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#include - -#include "mvm_common.h" -#include "mvm.h" - - -inline void mvm_row_kernel_A76(U32 N, U32 K, F16* matrix, F16* vector, F16* result) { - U32 KTail = K % 8; - U32 KInner = K - KTail; - F16* w0 = matrix; - F16* w1 = matrix + K * N/2; - F16* w2 = matrix + K * 2 * N/2; - F16* w3 = matrix + K * 3 * N/2; - asm volatile( - "mov x19, %5\n" - "ld1 {v18.h}[0], [x19]\n" - "add x19, x19, %8\n" - "ld1 {v18.h}[1], [x19]\n" - "add x19, x19, %8\n" - "ld1 {v18.h}[2], [x19]\n" - "add x19, x19, %8\n" - "ld1 {v18.h}[3], [x19]\n" - - "movi v17.8h, #0x0\n" - "movi v16.8h, #0x0\n" - "movi v9.8h, #0x0\n" - "movi v10.8h, #0x0\n" - "movi v11.8h, #0x0\n" - "movi v12.8h, #0x0\n" - "mov x20, %6\n" - "cmp x20, #0x0\n" - "beq 3f\n" - "0:\n" - - "ld1 {v0.8h}, [%0], #16\n" - "ld1 {v1.8h}, [%1], #16\n" - "ld1 {v2.8h}, [%2], #16\n" - "ld1 {v3.8h}, [%3], #16\n" - "ld1 {v4.8h}, [%4], #16\n" - - "fmla v9.8h, v1.8h, v0.8h\n" - "fmla v10.8h, v2.8h, v0.8h\n" - "fmla v11.8h, v3.8h, v0.8h\n" - "fmla v12.8h, v4.8h, v0.8h\n" - - "subs x20, x20, 0x8\n" - "bne 0b\n" - - "faddp v13.8h, v9.8h, v10.8h\n" - "faddp v14.8h, v11.8h, v12.8h\n" - "faddp v15.8h, v13.8h, v14.8h\n" - "faddp v17.8h, v15.8h, v15.8h\n" - "3:\n" - "mov x16, %7\n" - "cmp x16, #0x0\n" - "beq 2f\n" - - "1:\n" - "ld1 {v8.h}[0], [%0], #2\n" - - "ld1 {v1.h}[0], [%1], #2\n" - "ld1 {v1.h}[1], [%2], #2\n" - "ld1 {v1.h}[2], [%3], #2\n" - "ld1 {v1.h}[3], [%4], #2\n" - "fmla v16.8h, v1.8h, v8.h[0]\n" - - "subs x16, x16, 0x1\n" - "bne 1b\n" - - "fadd v17.8h, v17.8h, v16.8h\n" - - "2:\n" - - "fadd v17.8h, v17.8h, v18.8h\n" - "mov x19, %5\n" - "st1 {v17.h}[0], [x19]\n" - "add x19, x19, %8\n" - "st1 {v17.h}[1], [x19]\n" - "add x19, x19, %8\n" - "st1 {v17.h}[2], [x19]\n" - "add x19, x19, %8\n" - "st1 {v17.h}[3], [x19]\n" - :"+r" (vector), - "+r" (w0), - "+r" (w1), - "+r" (w2), - "+r" (w3), - "+r" (result) - :"r" ((I64)KInner), - "r" ((I64)KTail), - "r" ((I64)N) - :"memory", "cc", "x19", "x20", "x21", "x22", "x23", "x24", "x15", "x16", - "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18" - ); -} - -inline void mvm_row_A76(U32 numRows, U32 numColumns, F16* matrix, F16* vector, F16* result) { - //Actual layout is NK, and vector is K - U32 N = numRows; - U32 K = numColumns; - U32 NTail = N % 4; - U32 NInner = N / 4; - for (U32 i = 0; i < NInner; i++) { - mvm_row_kernel_A76(NInner * 2, K, matrix + i * K, vector, result + i); - } - if (NTail != 0) { - mvm_row_tail(NTail, K, matrix + (N - NTail) * K, vector, result + (N - NTail)); - } -} - -void mvm_A76(U32 row, U32 col, bool transpose, F16* matrix, F16* vector, F16* result) { - if (transpose) - mvm_col(row, col, matrix, vector, result); - else - mvm_row_A76(row, col, matrix, vector, result); -} diff --git a/blas-enhance/src/cpu/arm/fp16/mvm_common.h b/blas-enhance/src/cpu/arm/fp16/mvm_common.h deleted file mode 100644 index dfad769d..00000000 --- a/blas-enhance/src/cpu/arm/fp16/mvm_common.h +++ /dev/null @@ -1,260 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
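The faddp ladder at the end of both row kernels folds the four per-row accumulators into one vector holding the four row sums. The same reduction written with NEON intrinsics, as a sketch (fp32 lanes shown; the fp16 kernels apply one extra faddp level for their eight halfword lanes):

```cpp
#include <arm_neon.h>

// Two rounds of pairwise addition: lane i of the result is the horizontal
// sum of accumulator ri, mirroring "faddp v13/v14 -> v17" in the kernels.
static inline float32x4_t fold_four_rows(float32x4_t r0, float32x4_t r1,
                                         float32x4_t r2, float32x4_t r3)
{
    float32x4_t s01 = vpaddq_f32(r0, r1);  // {r0 pair sums, r1 pair sums}
    float32x4_t s23 = vpaddq_f32(r2, r3);  // {r2 pair sums, r3 pair sums}
    return vpaddq_f32(s01, s23);           // {sum r0, sum r1, sum r2, sum r3}
}
```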
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#ifndef _H_MVM_COMMON -#define _H_MVM_COMMON - -#include -#include "type.h" -#include "arm_neon_expand.h" - -inline void mvm_row_tail(U32 N, U32 K, F16* matrix, F16* vector, F16* result) { - float16x8_t vec, res, mat; - U32 KTail = K % 8; - U32 KInner = K - KTail; - - for (U32 i = 0; i < N; i+=1) { - res = vdupq_n_f16(0); - - for (U32 j = 0; j < KInner; j+=8) { - vec = vld1q_f16(&vector[j]); - mat = vld1q_f16(&matrix[j + K * i]); - res = vfmaq_f16(res, vec, mat); - } - result[i] += vaddvq_f16(res); - - if (KTail != 0) { - for (U32 p = 0; p < KTail; p+=1) { - result[i] += vector[p + KInner] * matrix[KInner + p + K * i]; - } - } - - } -} - -inline void mvm_col_tail(U32 N, U32 K, F16* matrix, F16* vector, F16* result) { - float16x8_t tmp, res, mat; - U32 NTail = N % 8; - U32 NInner = N - NTail; - - for (U32 i = 0; i < K; i+=1) { - for (U32 j = 0; j < NInner; j+=8) { - tmp = vld1q_f16(result + j); - mat = vld1q_f16(&matrix[j + N * i]); - res = vfmaq_n_f16(tmp, mat, vector[i]); - vst1q_f16(result + j, res); - } - if (NTail != 0) { - for (U32 p = 0; p < NTail; p+=1) { - result[NInner + p] += vector[i] * matrix[NInner + N * i + p]; - } - } - } -} - -inline void mvm_col_kernel(U32 N, U32 K, F16* matrix, F16* vector, F16* result) { - float16x8_t mat[4] = {0}; - - F16* w0 = matrix; - F16* w1 = matrix + K * N; - F16* w2 = matrix + 2 * K * N; - F16* w3 = matrix + 3 * K * N; - - U32 N_tail = N % 8; - U32 N_inner = N - N_tail; - - for(U32 i = 0; i < K; i+=1) { - for(U32 j = 0; j < N_inner; j+=8) { - - float16x8_t res[4] = {0}; - - res[3] = vld1q_f16(result + j); - mat[0] = vld1q_f16(w0); - mat[1] = vld1q_f16(w1); - mat[2] = vld1q_f16(w2); - mat[3] = vld1q_f16(w3); - - res[0] = vfmaq_n_f16(res[3], mat[0], vector[i]); - res[1] = vfmaq_n_f16(res[0], mat[1], vector[K + i]); - res[2] = vfmaq_n_f16(res[1], mat[2], vector[2 * K + i]); - res[3] = vfmaq_n_f16(res[2], mat[3], vector[3 * K + i]); - - w0 += 8; - w1 += 8; - w2 += 8; - w3 += 8; - vst1q_f16(result + j, res[3]); - - } - if (N_tail != 0) { - for(U32 p = 0; p < N_tail; p+=1) { - result[N_inner + p] += vector[i] * *w0++; - result[N_inner + p] += vector[i + K] * *w1++; - result[N_inner + p] += vector[i + 2 * K] * *w2++; - result[N_inner + p] += vector[i + 3 * K] * *w3++; - } - } - } -} - -inline void mvm_col_kernel_4x8(U32 N, U32 K, F16* matrix, F16* vector, F16* result) { - F16* result_end8 = result + N / 8 * 8; - F16* result_end = result + N; - asm volatile( - "mov x20, %0\n" - "add x21, x20, %5\n" - "add x22, 
x21, %5\n" - "add x23, x22, %5\n" - "mov x24, %1\n" - "add x25, x24, %6\n" - "add x26, x25, %6\n" - "add x27, x26, %6\n" - "mov x29, x21\n" - - "00:\n" - "cmp x20, x29\n" - "bge 01f\n" - "ldr h0, [x20], 2\n" - "dup v0.8h, v0.h[0]\n" - "ldr h1, [x21], 2\n" - "dup v1.8h, v1.h[0]\n" - "ldr h2, [x22], 2\n" - "dup v2.8h, v2.h[0]\n" - "ldr h3, [x23], 2\n" - "dup v3.8h, v3.h[0]\n" - - "mov x28, %2\n" - - "10:\n" - "cmp x28, %3\n" - "bge 11f\n" - "ldr q4, [x28]\n" - "ldr q8, [x24], 16\n" - "ldr q9, [x25], 16\n" - "ldr q10, [x26], 16\n" - "fmla v4.8h, v8.8h, v0.8h\n" - "ldr q11, [x27], 16\n" - "fmla v4.8h, v9.8h, v1.8h\n" - "fmla v4.8h, v10.8h, v2.8h\n" - "fmla v4.8h, v11.8h, v3.8h\n" - "str q4, [x28], 16\n" - "b 10b\n" - - "11:\n" - "cmp x28, %4\n" - "bge 12f\n" - "ldr h4, [x28]\n" - "ldr h8, [x24], 2\n" - "ldr h9, [x25], 2\n" - "ldr h10, [x26], 2\n" - "fmla h4, h8, v0.h[0]\n" - "ldr h11, [x27], 2\n" - "fmla h4, h9, v1.h[0]\n" - "fmla h4, h10, v2.h[0]\n" - "fmla h4, h11, v3.h[0]\n" - "str h4, [x28], 2\n" - "b 11b\n" - - "12:\n" - "b 00b\n" - "01:\n" - :"+r" (vector), - "+r" (matrix), - "+r" (result), - "+r" (result_end8), - "+r" (result_end) - :"r" ((I64)K*2), - "r" ((I64)K*N*2) - :"memory", "cc", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x29", - "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11" - ); -} - -inline void mvm_row_kernel(U32 N, U32 K, F16* matrix, F16* vector, F16* result) { - float16x8_t res[4] = {0}, mat[4] = {0} , vec; - float16x8_t tmp[6] = {0}; - - F16* w0 = matrix; - F16* w1 = matrix + K * N; - F16* w2 = matrix + 2 * K * N; - F16* w3 = matrix + 3 * K * N; - - U32 K_tail = K % 8; - U32 K_inner = K - K_tail; - - for (U32 i = 0; i < N; i+=1) { - for (U32 j = 0; j < K_inner; j+=8) { - - vec = vld1q_f16(&vector[j]); - - mat[0] = vld1q_f16(w0); - mat[1] = vld1q_f16(w1); - mat[2] = vld1q_f16(w2); - mat[3] = vld1q_f16(w3); - for(U32 k = 0; k < 4; k++) { - res[k] = vfmaq_f16(res[k], vec , mat[k]); - } - w0 += 8; - w1 += 8; - w2 += 8; - w3 += 8; - - } - - for(U32 m = 0; m < 2; m++) { - tmp[m] = vpaddq_f16(res[m * 2], res[m * 2 + 1]); - } - tmp[4] = vpaddq_f16(tmp[0], tmp[1]); - tmp[5] = vpaddq_f16(tmp[4], tmp[3]); - F16 addbias; - for(U32 n = 0; n < 4; n++) { - vst1q_lane_f16_builtin(&addbias, tmp[5], n); - result[i + N * n] += addbias; - res[n] = vdupq_n_f16(0); - } - - if (K_tail != 0) { - for (U32 p = 0; p < K_tail; p += 1) { - *(result + i) += vector[p + K_inner] * *w0++; - *(result + N + i) += vector[p + K_inner] * *w1++; - *(result + 2*N + i) += vector[p + K_inner] * *w2++; - *(result + 3*N + i) += vector[p + K_inner] * *w3++; - } - } - - } -} - -inline void mvm_col(U32 numRows, U32 numColumns, F16* matrix, F16* vector, F16* result) { - //Actual layout is KN, and vector is K - U32 N = numRows; - U32 K = numColumns; - U32 KInner = K / 4; - U32 KTail = K % 4; - mvm_col_kernel_4x8(N, KInner, matrix, vector, result); - if (KTail != 0) { - mvm_col_tail(N, KTail, matrix + (K - KTail) * N, vector + (K - KTail), result); - } -} - -//N is number of rows, K for columns -inline void mvm_row(U32 N, U32 K, F16* matrix, F16* vector, F16* result) { - U32 NInner = (N / 4); - U32 NTail = N % 4 ; - mvm_row_kernel(NInner, K, matrix, vector, result); - if (NTail != 0) { - mvm_row_tail(NTail, K, matrix + (N - NTail) * K, vector, result + N - NTail); - } -} -#endif diff --git a/blas-enhance/src/cpu/arm/fp32/mvm_row_V8.cpp b/blas-enhance/src/cpu/arm/fp32/mvm_row_V8.cpp deleted file mode 100644 index 0ffb68c8..00000000 --- a/blas-enhance/src/cpu/arm/fp32/mvm_row_V8.cpp +++ 
/dev/null @@ -1,152 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "blas_fp32.h" - - -void mvm_row_tail(U32 N, U32 K, F32* matrix, F32* vector, F32* result) -{ - float32x4_t vec, res, mat; - U32 KTail = K % 4; - U32 KInner = K - KTail; - - for (U32 i = 0; i < N; i++) { - res = vdupq_n_f32(0); - - for (U32 j = 0; j < KInner; j += 4) { - vec = vld1q_f32(&vector[j]); - mat = vld1q_f32(&matrix[j + K * i]); - res = vfmaq_f32(res, vec, mat); - } - result[i] += vaddvq_f32(res); - - if (KTail != 0) { - for (U32 p = 0; p < KTail; p++) { - result[i] += vector[p + KInner] * matrix[KInner + p + K * i]; - } - } - - } -} - -void mvm_row_kernel(U32 N, U32 K, F32* matrix, F32* vector, F32* result) -{ -#ifdef __aarch64__ - I32 KTail = K % 4; - I32 KInner = K - KTail; - F32* w0 = matrix; - F32* w1 = matrix + K * N; - F32* w2 = matrix + K * 2 * N; - F32* w3 = matrix + K * 3 * N; - asm volatile( - "mov x19, %5\n" - "ld1 {v18.s}[0], [x19]\n" - "add x19, x19, %8\n" - "ld1 {v18.s}[1], [x19]\n" - "add x19, x19, %8\n" - "ld1 {v18.s}[2], [x19]\n" - "add x19, x19, %8\n" - "ld1 {v18.s}[3], [x19]\n" - - "movi v17.4s, #0x0\n" - "movi v16.4s, #0x0\n" - "movi v9.4s, #0x0\n" - "movi v10.4s, #0x0\n" - "movi v11.4s, #0x0\n" - "movi v12.4s, #0x0\n" - "mov x20, %6\n" - "cmp x20, #0x0\n" - "beq 3f\n" - "0:\n" - - "ld1 {v0.4s}, [%0], #16\n" - "ld1 {v1.4s}, [%1], #16\n" - "ld1 {v2.4s}, [%2], #16\n" - "ld1 {v3.4s}, [%3], #16\n" - "ld1 {v4.4s}, [%4], #16\n" - - "fmla v9.4s, v1.4s, v0.4s\n" - "fmla v10.4s, v2.4s, v0.4s\n" - "fmla v11.4s, v3.4s, v0.4s\n" - "fmla v12.4s, v4.4s, v0.4s\n" - - "subs x20, x20, #4\n" - "bne 0b\n" - - "faddp v13.4s, v9.4s, v10.4s\n" - "faddp v14.4s, v11.4s, v12.4s\n" - "faddp v17.4s, v13.4s, v14.4s\n" - "3:\n" - "mov x16, %7\n" - "cmp x16, #0x0\n" - "beq 2f\n" - - "1:\n" - "ld1 {v8.s}[0], [%0], #4\n" - - "ld1 {v1.s}[0], [%1], #4\n" - "ld1 {v1.s}[1], [%2], #4\n" - "ld1 {v1.s}[2], [%3], #4\n" - "ld1 {v1.s}[3], [%4], #4\n" - "fmla v16.4s, v1.4s, v8.s[0]\n" - - "subs x16, x16, 0x1\n" - "bne 1b\n" - - "fadd v17.4s, v17.4s, v16.4s\n" - - "2:\n" - - "fadd v17.4s, v17.4s, v18.4s\n" - "mov x19, %5\n" - "st1 {v17.s}[0], [x19]\n" - "add x19, x19, %8\n" - "st1 {v17.s}[1], [x19]\n" - "add x19, x19, %8\n" - "st1 {v17.s}[2], [x19]\n" - "add x19, x19, %8\n" - "st1 {v17.s}[3], [x19]\n" - :"+r" (vector), - "+r" (w0), - "+r" (w1), - "+r" (w2), - "+r" (w3), - "+r" (result) - :"r" ((I64)KInner), - "r" ((I64)KTail), - "r" ((I64)N*4) - 
:"memory", "cc", "x19", "x20", "x21", "x22", "x23", "x24", "x15", "x16", - "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18" - ); -#else - // TODO - std::cerr << "[ERROR] currently not support ARMv7 row MVM" < -#include -#include "cpu/arm/blas_arm.h" -#include "cpu/arm/int8/mmm_common.h" -#include "cpu/arm/int8/mmm.h" - -inline void mmm_4x4_A55(U32 offset, U32 K, INT8* in, INT8* w, I32* out) { - asm volatile( - //init in- > v1, w- > v0 - "ldr d1, [%0]\n" - "ldr x16, [%0, 8]\n" - "ins v1.d[1], x16\n" - - "ldr d0, [%1]\n" - "ldr x17, [%1, 8]\n" - "ins v0.d[1], x17\n" - - //give in address to x3 - "mov x3, %0\n" - - //give w address to x0 - "mov x0, %1\n" - - //K- > x2 - "mov x2, %3\n" - - //give out address to x26 - "mov x26, %2\n" - - //load in bias - "ldr q5, [x26]\n" - "add x26, x26, %4\n" - - "ldr q7, [x26]\n" - "add x26, x26, %4\n" - - "ldr q9, [x26]\n" - "add x26, x26, %4\n" - - "ldr q11, [x26]\n" - - //Computation loop - "0:\n" - - "ldr d3, [x3, 16] !\n" - "ldr x16, [x3, 24]\n" - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "ldr d29, [x0, 16] !\n" - "ldr x17, [x0, 24]\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - "ins v3.d[1], x16\n" - "subs x2, x2, #4\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "ins v29.d[1], x17\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - "mov v1.16b, v3.16b\n" - "mov v0.16b, v29.16b\n" - "bne 0b\n" - - "1:\n" - - //give out address to x26 - "mov x26, %2\n" - - "str q5, [x26]\n" - "add x26, x26, %4\n" - - "str q7, [x26]\n" - "add x26, x26, %4\n" - - "str q9, [x26]\n" - "add x26, x26, %4\n" - - "str q11, [x26]\n" - - :"+r" (in), - "+r" (w), - "+r" (out) - :"r" ((I64)K), - "r" ((I64)offset) - :"memory","cc","v30","v29","v11","v9","v7","v5","v3","v1","v0","x26","x16","x17","x3","x2","x0" - ); -} - -inline void mmm_8x4_A55(U32 offset, U32 K, INT8* in, INT8* w, I32* out) { - asm volatile( - // init in-> v1, w-> v0 - "ldr q1, [%0]\n" - "ldr q0, [%1]\n" - - // give in address to x3 - "mov x3, %0\n" - - // give w address to x0 - "mov x0, %1\n" - - // K-> x2 - "mov x2, %3\n" - - // give out address to x26 - "mov x26, %2\n" - - // load in bias - "ldr q5, [x26]\n" - "add x26, x26, %4\n" - - "ldr q7, [x26]\n" - "add x26, x26, %4\n" - - "ldr q9, [x26]\n" - "add x26, x26, %4\n" - - "ldr q11, [x26]\n" - "add x26, x26, %4\n" - - "ldr q13, [x26]\n" - "add x26, x26, %4\n" - - "ldr q15, [x26]\n" - "add x26, x26, %4\n" - - "ldr q17, [x26]\n" - "add x26, x26, %4\n" - - "ldr q19, [x26]\n" - - //Computation loop - "0:\n" - - "ldr d3, [x3, 16]\n" - "ldr x16, [x3, 24]\n" - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "ldr d29, [x0, 16]!\n" - "ldr x17, [x0, 8]\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - "ins v3.d[1], x16\n" - "ldr d30, [x3, 32]!\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "ins v29.d[1], x17\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - - "ldr x16, [x3, 8]\n" - "subs x2, x2, #4\n" - "sdot v13.4s, v0.16b, v3.4b[0]\n" - "ins v30.d[1], x16\n" - "sdot v15.4s, v0.16b, v3.4b[1]\n" - "sdot v17.4s, v0.16b, v3.4b[2]\n" - "mov v1.16b, v30.16b\n" - "sdot v19.4s, v0.16b, v3.4b[3]\n" - "mov v0.16b, v29.16b\n" - "bne 0b\n" - - //give out address to x26 - "mov x26, %2\n" - - "str q5, [x26]\n" - "add x26, x26, %4\n" - - "str q7, [x26]\n" - "add x26, x26, %4\n" - - "str q9, [x26]\n" - "add x26, x26, %4\n" - - "str q11, [x26]\n" - "add x26, x26, %4\n" - - "str q13, [x26]\n" - "add x26, x26, %4\n" - - "str q15, [x26]\n" - "add x26, x26, %4\n" - - "str q17, [x26]\n" - "add x26, x26, %4\n" - - "str q19, [x26]\n" - - :"+r" (in), - "+r" (w), - "+r" (out) - :"r" ((I64)K), - "r" ((I64)offset) - 
:"memory","cc","v30","v29","v19","v17","v15","v13","v11", - "v9","v7","v5","v3","v1","v0","x26","x16","x17","x3","x2","x0" - ); -} - -inline void mmm_4x8_A55(U32 offset, U32 K, INT8* in, INT8* w, I32* out) { - asm volatile( - //init in- > v1, w- > v0 - "ldr d1, [%0]\n" - "ldr x16, [%0, 8]\n" - "ins v1.d[1], x16\n" - - "ldr d0, [%1]\n" - "ldr x17, [%1, 8]\n" - "ins v0.d[1], x17\n" - - //give in address to x3 - "mov x3, %0\n" - - //give w address to x0 - "mov x0, %1\n" - - //K- > x2 - "mov x2, %3\n" - - //give out address to x26 - "mov x26, %2\n" - - "ld1 {v5.4s, v6.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v7.4s, v8.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.4s, v12.4s}, [x26]\n" - - /* Layout - 5 6 - 7 8 - 9 10 - 11 12 - */ - - //Computation loop - "0:\n" - - "ldr d29, [x0, 16]\n" - "ldr x17, [x0, 24]\n" - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "ldr d3, [x3, 16] !\n" - "ldr x16, [x3, 8]\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - "ins v29.d[1], x17\n" - "subs x2, x2, #4\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "ins v3.d[1], x16\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "ldr d0, [x0, 32] !\n" - "ldr x17, [x0, 8]\n" - "sdot v8.4s, v29.16b, v1.4b[1]\n" - "sdot v10.4s, v29.16b, v1.4b[2]\n" - "ins v0.d[1], x17\n" - "sdot v12.4s, v29.16b, v1.4b[3]\n" - "mov v1.16b, v3.16b\n" - "bne 0b\n" - - //give out address to x26 - "mov x26, %2\n" - - "st1 {v5.4s, v6.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v7.4s, v8.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.4s, v12.4s}, [x26]\n" - - :"+r" (in), - "+r" (w), - "+r" (out) - :"r" ((I64)K), - "r" ((I64)offset) - :"memory","cc","v29","v12","v11","v10","v9","v8","v7","v6","v5","v3","v1","v0", - "x26","x16","x17","x3","x2","x0" - ); -} - -inline void mmm_8x8_A55(U32 offset, U32 K, INT8* in, INT8* w, I32* out) { - asm volatile( - //init in- > v1, w- > v0 - "ldr d1, [%0]\n" - "ldr x16, [%0, 8]\n" - "ins v1.d[1], x16\n" - - "ldr d0, [%1]\n" - "ldr x17, [%1, 8]\n" - "ins v0.d[1], x17\n" - - //give in address to x3 - "mov x3, %0\n" - - //give w address to x0 - "mov x0, %1\n" - - //K- > x2 - "mov x2, %3\n" - - //give out address to x26 - "mov x26, %2\n" - - //load in bias - "ld1 {v5.4s, v6.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v7.4s, v8.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.4s, v12.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v13.4s, v14.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v15.4s, v16.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v17.4s, v18.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v19.4s, v20.4s}, [x26]\n" - - /* Layout - 5 6 - 7 8 - 9 10 - 11 12 - - 13 14 - 15 16 - 17 18 - 19 20 - */ - - //Computation loop - "0:\n" - - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "ldr d3, [x3, 16] !\n" - "ldr x16, [x3, 8]\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - "ldr d29, [x0, 16]\n" - "ldr x17, [x0, 24]\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "ins v3.d[1], x16\n" - "ldr d30, [x3, 16] !\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - "ins v29.d[1], x17\n" - - "sdot v13.4s, v0.16b, v3.4b[0]\n" - "ldr x16, [x3, 8]\n" - "subs x2, x2, #4\n" - "sdot v15.4s, v0.16b, v3.4b[1]\n" - "sdot v17.4s, v0.16b, v3.4b[2]\n" - "ins v30.d[1], x16\n" - "sdot v19.4s, v0.16b, v3.4b[3]\n" - - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "sdot v8.4s, v29.16b, v1.4b[1]\n" - "ldr d0, [x0, 32] !\n" - "ldr x17, [x0, 8]\n" - "sdot v10.4s, v29.16b, v1.4b[2]\n" - "sdot v12.4s, v29.16b, v1.4b[3]\n" - - 
"sdot v14.4s, v29.16b, v3.4b[0]\n" - "ins v0.d[1], x17\n" - "mov v1.16b, v30.16b\n" - "sdot v16.4s, v29.16b, v3.4b[1]\n" - "sdot v18.4s, v29.16b, v3.4b[2]\n" - "sdot v20.4s, v29.16b, v3.4b[3]\n" - - "bne 0b\n" - - //give out address to x26 - "mov x26, %2\n" - - "st1 {v5.4s, v6.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v7.4s, v8.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.4s, v12.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v13.4s, v14.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v15.4s, v16.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v17.4s, v18.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v19.4s, v20.4s}, [x26]\n" - - :"+r" (in), - "+r" (w), - "+r" (out) - :"r" ((I64)K), - "r" ((I64)offset) - :"memory", "cc", "v30", "v29", "v20", "v19", "v18", "v17", "v16", "v15", "v14", "v13", "v12", "v11", "v10", - "v9", "v8", "v7", "v6", "v5", "v3", "v1", "v0", - "x26", "x16", "x17", "x3", "x2", "x0" - ); -} - -inline void mmm_4x12_A55(U32 offset, U32 K, INT8* in, INT8* w, I32* out) -{ - asm volatile( - //init in->v1, w->v0 - "ldr q1, [%0]\n" - - "ldr q0, [%1]\n" - - "ldr q29, [%1, 16]\n" // prefetch one more w - - //give in address to x3 - "mov x3, %0\n" - - //give w address to x0 - "mov x0, %1\n" - - //K->x2 - "mov x2, %3\n" - - //give out address to x26 - "mov x26, %2\n" - - //load in bias - "ld1 {v5.4s, v6.4s, v7.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v8.4s, v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.4s, v12.4s, v13.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v14.4s, v15.4s, v16.4s}, [x26]\n" - - /* Layout - 5 6 7 - 8 9 10 - 11 12 13 - 14 15 16 - */ - - //Computation loop - "0:\n" - // in(x3): v1 - // w(x0): v0 v29 v30 - - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "ldr d30, [x0, 32]\n" - "sdot v8.4s, v0.16b, v1.4b[1]\n" - "ldr x16, [x0, 40]\n" - "sdot v11.4s, v0.16b, v1.4b[2]\n" - "ldr d2, [x3, 16]!\n" // input of next round - "sdot v14.4s, v0.16b, v1.4b[3]\n" - "ldr x17, [x3, 8]\n" - - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "ins v30.d[1], x16\n" - "sdot v9.4s, v29.16b, v1.4b[1]\n" - "ldr d0, [x0, 48]!\n" // first w of next round - "sdot v12.4s, v29.16b, v1.4b[2]\n" - "ins v2.d[1], x17\n" - "sdot v15.4s, v29.16b, v1.4b[3]\n" - "ldr x16, [x0, 8]\n" - - "sdot v7.4s, v30.16b, v1.4b[0]\n" - "ldr d29, [x0, 16]\n" - "sdot v10.4s, v30.16b, v1.4b[1]\n" - "ldr x19, [x0, 24]\n" - "ins v0.d[1], x16\n" - "sdot v13.4s, v30.16b, v1.4b[2]\n" - "subs x2, x2, #4\n" - "sdot v16.4s, v30.16b, v1.4b[3]\n" - - "mov v1.16b, v2.16b\n" - "ins v29.d[1], x19\n" - "bne 0b\n" - - //give out address to x26 - "mov x26, %2\n" - - "st1 {v5.4s, v6.4s, v7.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v8.4s, v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.4s, v12.4s, v13.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v14.4s, v15.4s, v16.4s}, [x26]\n" - - :"+r"(in), - "+r"(w), - "+r"(out) - :"r"((I64)K), - "r"((I64)offset) - :"memory","cc","v30","v29","v16","v15","v14","v13","v12","v11","v10","v9","v8","v7","v6","v5","v3","v2","v1","v0","x26","x19","x16","x17","x3","x2","x0" - ); -} - -inline void mmm_8x12_A55(U32 offset, U32 K, INT8* in, INT8* w, I32* out) -{ - asm volatile( - //init in->v1, w->v0 - "ldr q1, [%0]\n" - - "ldr q0, [%1]\n" - - "ldr q29, [%1, 16]\n" // prefetch one more w - - //give in address to x3 - "mov x3, %0\n" - - //give w address to x0 - "mov x0, %1\n" - - //K->x2 - "mov x2, %3\n" - - //give out address to x26 - "mov x26, %2\n" - - //load in bias - "ld1 {v5.4s, v6.4s, v7.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v8.4s, 
v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.4s, v12.4s, v13.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v14.4s, v15.4s, v16.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v17.4s, v18.4s, v19.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v20.4s, v21.4s, v22.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v23.4s, v24.4s, v25.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v26.4s, v27.4s, v28.4s}, [x26]\n" - - /* Layout - 5 6 7 - 8 9 10 - 11 12 13 - 14 15 16 - - 17 18 19 - 20 21 22 - 23 24 25 - 26 27 28 - */ - - //Computation loop - "0:\n" - // in(x3): v1 v2 - // w(x0): v0 v29 v30 - - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "ldr d30, [x0, 32]\n" - "sdot v8.4s, v0.16b, v1.4b[1]\n" - "ldr x16, [x0, 40]\n" - "sdot v11.4s, v0.16b, v1.4b[2]\n" - "ldr d2, [x3, 16]\n" - "sdot v14.4s, v0.16b, v1.4b[3]\n" - "ldr x17, [x3, 24]\n" - - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "ins v30.d[1], x16\n" - "sdot v9.4s, v29.16b, v1.4b[1]\n" - "ldr d3, [x0, 48]!\n" // first w of next round - "sdot v12.4s, v29.16b, v1.4b[2]\n" - "ins v2.d[1], x17\n" - "sdot v15.4s, v29.16b, v1.4b[3]\n" - "ldr x16, [x0, 8]\n" - - "sdot v7.4s, v30.16b, v1.4b[0]\n" - "subs x2, x2, #4\n" - "sdot v10.4s, v30.16b, v1.4b[1]\n" - "ins v3.d[1], x16\n" - "sdot v13.4s, v30.16b, v1.4b[2]\n" - "sdot v16.4s, v30.16b, v1.4b[3]\n" - - "sdot v17.4s, v0.16b, v2.4b[0]\n" - "ldr d1, [x3, 32]!\n" - "sdot v20.4s, v0.16b, v2.4b[1]\n" - "ldr x17, [x3, 8]\n" - "sdot v23.4s, v0.16b, v2.4b[2]\n" - "sdot v26.4s, v0.16b, v2.4b[3]\n" - - "sdot v18.4s, v29.16b, v2.4b[0]\n" - "mov v0.16b, v3.16b\n" - "sdot v21.4s, v29.16b, v2.4b[1]\n" - "ins v1.d[1], x17\n" - "sdot v24.4s, v29.16b, v2.4b[2]\n" - "sdot v27.4s, v29.16b, v2.4b[3]\n" - - "sdot v19.4s, v30.16b, v2.4b[0]\n" - "ldr d29, [x0, 16]\n" - "sdot v22.4s, v30.16b, v2.4b[1]\n" - "ldr x16, [x0, 24]\n" - "sdot v25.4s, v30.16b, v2.4b[2]\n" - "sdot v28.4s, v30.16b, v2.4b[3]\n" - "ins v29.d[1], x16\n" - - "bne 0b\n" - - //give out address to x26 - "mov x26, %2\n" - - "st1 {v5.4s, v6.4s, v7.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v8.4s, v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.4s, v12.4s, v13.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v14.4s, v15.4s, v16.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v17.4s, v18.4s, v19.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v20.4s, v21.4s, v22.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v23.4s, v24.4s, v25.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v26.4s, v27.4s, v28.4s}, [x26]\n" - - :"+r"(in), - "+r"(w), - "+r"(out) - :"r"((I64)K), - "r"((I64)offset) - :"memory","cc","v30","v29","v28","v27","v26","v25","v24","v23","v22","v21","v20","v19","v18","v17","v16","v15","v14","v13","v12","v11","v10","v9","v8","v7","v6","v5","v3","v2","v1","v0","x26","x16","x17","x3","x2","x0" - ); -} - -void mmm_A55(int M, int N, int K, INT8* matrix1, INT8* matrix2, INT8* tmp, I32* result) -{ - int blockK = K; - int K4 = pad_to_4_multiple(K); - int blockM = 96; - INT8* matrix1Trans = tmp; - I32* resultCurrent = result; - - int KInner, MInner, m, n; - for (int k = 0; k < K; k += blockK) { - KInner = UNI_MIN(blockK, K - k); //K for this inner iteration - for (int i = 0; i < M; i+=blockM) { - MInner = UNI_MIN(blockM, M - i); //M for this inner iteration - for(n = 0; n <= N - 8; n+=8){ - if(i == 0){ - matrix1_trans_n8(KInner, K, matrix1 + n * K + k, matrix1Trans + n * K4); - } - - for(m = 0; m <= (MInner-12); m+=12){ - resultCurrent = result + n * M + m + i; - mmm_8x12_A55(M*4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - for(; m <=(MInner - 8); 
m+=8){ - resultCurrent = result + n * M + m + i; - mmm_8x8_A55(M*4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - - if((MInner - m) >= 4){ - resultCurrent = result + n * M + m + i; - mmm_8x4_A55(M*4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - m += 4; - } - - if(MInner - m){ - resultCurrent = result + n * M + m + i; - mmm_N8_MTail(MInner - m, M, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - } - - if((N - n) >= 4){ - - if(i == 0){ - matrix1_trans_int8(4, KInner, K, matrix1 + n * K + k, matrix1Trans + n * K4); - } - - for(m = 0; m <= (MInner - 12); m+=12){ - resultCurrent = result + n * M + m + i; - mmm_4x12_A55(M*4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - - for(; m <= (MInner - 8); m+=8){ - resultCurrent = result + n * M + m + i; - mmm_4x8_A55(M*4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - - if((MInner - m) >= 4){ - resultCurrent = result + n * M + m + i; - mmm_4x4_A55(M*4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - m += 4; - } - - if(MInner - m){ - resultCurrent = result + n * M + m + i; - mmm_N4_MTail(MInner - m, M, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - n += 4; - } - - if (N - n) { - if(i == 0){ - matrix1_trans_int8(N-n, KInner, K, matrix1 + n * K + k, matrix1Trans + n * K4); - } - - for(m = 0; m <= (MInner - 12); m+=12){ - resultCurrent = result + n * M + m + i; - mmm_NTail_M12(M, N - n, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - - for(; m <= (MInner - 8); m+=8){ - resultCurrent = result + n * M + m + i; - mmm_NTail_M8(M, N - n, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - - if((MInner - m) >= 4){ - resultCurrent = result + n * M + m + i; - mmm_NTail_M4(M, N - n, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - m += 4; - } - - if(MInner - m){ - resultCurrent = result + n * M + m + i; - mmm_NTail_M(MInner - m, M, N - n, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - } - } - } -} -#endif diff --git a/blas-enhance/src/cpu/arm/int8/mmm_A76.cpp b/blas-enhance/src/cpu/arm/int8/mmm_A76.cpp deleted file mode 100644 index a3cca0c9..00000000 --- a/blas-enhance/src/cpu/arm/int8/mmm_A76.cpp +++ /dev/null @@ -1,685 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
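Note: every micro-kernel in the deleted mmm_A55.cpp above, and in the A76 variant that follows, is built on the ARMv8 dot-product extension: a single "sdot vd.4s, vn.16b, vm.4b[lane]" multiplies four int8 pairs per 32-bit lane and accumulates the sums into int32 lanes. A scalar reference of these semantics (an illustrative sketch; the function name is ours, not from the bolt sources):

    #include <stdint.h>

    /* Reference for "sdot vd.4s, vn.16b, vm.4b[lane]": each of the four
     * int32 lanes of d accumulates a 4-way int8 dot product between the
     * corresponding 4-byte group of n and the single 4-byte group that
     * 'lane' selects from m. */
    static void sdot_lane_ref(int32_t d[4], const int8_t n[16],
                              const int8_t m[16], int lane)
    {
        for (int i = 0; i < 4; i++) {
            for (int j = 0; j < 4; j++) {
                d[i] += (int32_t)n[4 * i + j] * (int32_t)m[4 * lane + j];
            }
        }
    }

This is why the computation loops above consume K four bytes at a time ("subs x2, x2, #4") and why both operands must be packed so that four consecutive K-values sit contiguously in memory. It also explains the stylistic split between the two files: the A55 kernels break each 128-bit load into "ldr d"/"ldr x"/"ins" pairs to dual-issue with the sdot stream on that in-order core, while the A76 versions below can simply use "ldr q".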
- - -#ifdef _USE_INT8 -#include -#include -#include "cpu/arm/blas_arm.h" -#include "cpu/arm/int8/mmm_common.h" -#include "cpu/arm/int8/mmm.h" - -inline void mmm_4x4_A76(U32 offset, U32 K, INT8* in, INT8* w, I32* out) -{ - asm volatile( - //init in- > v1, w- > v0 - "ldr q1, [%0]\n" - - "ldr q0, [%1]\n" - - //give in address to x3 - "mov x3, %0\n" - - //give w address to x0 - "mov x0, %1\n" - - //K- > x2 - "mov x2, %3\n" - - //give out address to x26 - "mov x26, %2\n" - - //load in bias - "ldr q5, [x26]\n" - "add x26, x26, %4\n" - - "ldr q7, [x26]\n" - "add x26, x26, %4\n" - - "ldr q9, [x26]\n" - "add x26, x26, %4\n" - - "ldr q11, [x26]\n" - - //Computation loop - "0:\n" - - "ldr q3, [x3, 16]!\n" - "ldr q29, [x0, 16]!\n" - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - "subs x2, x2, #4\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - "mov v1.16b, v3.16b\n" - "mov v0.16b, v29.16b\n" - "bne 0b\n" - - "1:\n" - - //give out address to x26 - "mov x26, %2\n" - - "str q5, [x26]\n" - "add x26, x26, %4\n" - - "str q7, [x26]\n" - "add x26, x26, %4\n" - - "str q9, [x26]\n" - "add x26, x26, %4\n" - - "str q11, [x26]\n" - - :"+r" (in), - "+r" (w), - "+r" (out) - :"r" ((I64)K), - "r" ((I64)offset) - :"memory","cc","v30","v29","v11","v9","v7","v5","v3","v1","v0","x26","x3","x2","x0" - ); -} - -inline void mmm_8x4_A76(U32 offset, U32 K, INT8* in, INT8* w, I32* out) -{ - asm volatile( - //init in- > v1, w- > v0 - "ldr q1, [%0]\n" - - "ldr q0, [%1]\n" - - //give in address to x3 - "mov x3, %0\n" - - //give w address to x0 - "mov x0, %1\n" - - //K- > x2 - "mov x2, %3\n" - - //give out address to x26 - "mov x26, %2\n" - - //load in bias - "ldr q5, [x26]\n" - "add x26, x26, %4\n" - - "ldr q7, [x26]\n" - "add x26, x26, %4\n" - - "ldr q9, [x26]\n" - "add x26, x26, %4\n" - - "ldr q11, [x26]\n" - "add x26, x26, %4\n" - - "ldr q13, [x26]\n" - "add x26, x26, %4\n" - - "ldr q15, [x26]\n" - "add x26, x26, %4\n" - - "ldr q17, [x26]\n" - "add x26, x26, %4\n" - - "ldr q19, [x26]\n" - - //Computation loop - "0:\n" - - "ldr q3, [x3, 16]\n" - "ldr q29, [x0, 16]!\n" - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - - "subs x2, x2, #4\n" - "sdot v13.4s, v0.16b, v3.4b[0]\n" - "sdot v15.4s, v0.16b, v3.4b[1]\n" - "ldr q1, [x3, 32]!\n" - "sdot v17.4s, v0.16b, v3.4b[2]\n" - "sdot v19.4s, v0.16b, v3.4b[3]\n" - "mov v0.16b, v29.16b\n" - "bne 0b\n" - - //give out address to x26 - "mov x26, %2\n" - - "str q5, [x26]\n" - "add x26, x26, %4\n" - - "str q7, [x26]\n" - "add x26, x26, %4\n" - - "str q9, [x26]\n" - "add x26, x26, %4\n" - - "str q11, [x26]\n" - "add x26, x26, %4\n" - - "str q13, [x26]\n" - "add x26, x26, %4\n" - - "str q15, [x26]\n" - "add x26, x26, %4\n" - - "str q17, [x26]\n" - "add x26, x26, %4\n" - - "str q19, [x26]\n" - - :"+r" (in), - "+r" (w), - "+r" (out) - :"r" ((I64)K), - "r" ((I64)offset) - :"memory","cc","v30","v29","v19","v17","v15","v13","v11", - "v9","v7","v5","v3","v1","v0","x26","x3","x2","x0" - ); -} - -inline void mmm_4x8_A76(U32 offset, U32 K, INT8* in, INT8* w, I32* out) -{ - asm volatile( - //init in- > v1, w- > v0 - "ldr q1, [%0]\n" - - "ldr q0, [%1]\n" - - //give in address to x3 - "mov x3, %0\n" - - //give w address to x0 - "mov x0, %1\n" - - //K- > x2 - "mov x2, %3\n" - - //give out address to x26 - "mov x26, %2\n" - - "ld1 {v5.4s, v6.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v7.4s, v8.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v9.4s, v10.4s}, [x26]\n" - 
"add x26, x26, %4\n" - "ld1 {v11.4s, v12.4s}, [x26]\n" - - /* Layout - 5 6 - 7 8 - 9 10 - 11 12 - */ - - //Computation loop - "0:\n" - - "ldr q29, [x0, 16]\n" - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "ldr q3, [x3, 16]!\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - "subs x2, x2, #4\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "ldr q0, [x0, 32]!\n" - "sdot v8.4s, v29.16b, v1.4b[1]\n" - "sdot v10.4s, v29.16b, v1.4b[2]\n" - "sdot v12.4s, v29.16b, v1.4b[3]\n" - "mov v1.16b, v3.16b\n" - "bne 0b\n" - - //give out address to x26 - "mov x26, %2\n" - - "st1 {v5.4s, v6.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v7.4s, v8.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.4s, v12.4s}, [x26]\n" - - :"+r" (in), - "+r" (w), - "+r" (out) - :"r" ((I64)K), - "r" ((I64)offset) - :"memory","cc","v29","v12","v11","v10","v9","v8","v7","v6","v5","v3","v1","v0", - "x26","x3","x2","x0" - ); -} - -inline void mmm_8x8_A76(U32 offset, U32 K, INT8* in, INT8* w, I32* out) -{ - asm volatile( - //init in- > v1, w- > v0 - "ldr q1, [%0]\n" - - "ldr q0, [%1]\n" - - //give in address to x3 - "mov x3, %0\n" - - //give w address to x0 - "mov x0, %1\n" - - //K- > x2 - "mov x2, %3\n" - - //give out address to x26 - "mov x26, %2\n" - - //load in bias - "ld1 {v5.4s, v6.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v7.4s, v8.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.4s, v12.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v13.4s, v14.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v15.4s, v16.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v17.4s, v18.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v19.4s, v20.4s}, [x26]\n" - - /* Layout - 5 6 - 7 8 - 9 10 - 11 12 - - 13 14 - 15 16 - 17 18 - 19 20 - */ - - //Computation loop - "0:\n" - - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "ldr q3, [x3, 16]!\n" - "sdot v7.4s, v0.16b, v1.4b[1]\n" - "ldr q29, [x0, 16]\n" - "sdot v9.4s, v0.16b, v1.4b[2]\n" - "sdot v11.4s, v0.16b, v1.4b[3]\n" - - "sdot v13.4s, v0.16b, v3.4b[0]\n" - "subs x2, x2, #4\n" - "sdot v15.4s, v0.16b, v3.4b[1]\n" - "sdot v17.4s, v0.16b, v3.4b[2]\n" - "sdot v19.4s, v0.16b, v3.4b[3]\n" - - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "sdot v8.4s, v29.16b, v1.4b[1]\n" - "ldr q0, [x0, 32]!\n" - "sdot v10.4s, v29.16b, v1.4b[2]\n" - "sdot v12.4s, v29.16b, v1.4b[3]\n" - - "sdot v14.4s, v29.16b, v3.4b[0]\n" - "sdot v16.4s, v29.16b, v3.4b[1]\n" - "ldr q1, [x3, 16]!\n" - "sdot v18.4s, v29.16b, v3.4b[2]\n" - "sdot v20.4s, v29.16b, v3.4b[3]\n" - - "bne 0b\n" - - //give out address to x26 - "mov x26, %2\n" - - "st1 {v5.4s, v6.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v7.4s, v8.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.4s, v12.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v13.4s, v14.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v15.4s, v16.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v17.4s, v18.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v19.4s, v20.4s}, [x26]\n" - - :"+r" (in), - "+r" (w), - "+r" (out) - :"r" ((I64)K), - "r" ((I64)offset) - :"memory", "cc", "v29", "v20", "v19", "v18", "v17", "v16", "v15", "v14", "v13", "v12", "v11", "v10", - "v9", "v8", "v7", "v6", "v5", "v3", "v1", "v0", - "x26", "x3", "x2", "x0" - ); -} - -inline void mmm_4x12_A76(U32 offset, U32 K, INT8* in, INT8* w, I32* out) -{ - asm volatile( - //init in->v1, w->v0 - "ldr q1, [%0]\n" - - "ldr q0, [%1]\n" - - "ldr q29, [%1, 16]\n" // prefetch one more w 
- - //give in address to x3 - "mov x3, %0\n" - - //give w address to x0 - "mov x0, %1\n" - - //K->x2 - "mov x2, %3\n" - - //give out address to x26 - "mov x26, %2\n" - - //load in bias - "ld1 {v5.4s, v6.4s, v7.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v8.4s, v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.4s, v12.4s, v13.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v14.4s, v15.4s, v16.4s}, [x26]\n" - - /* Layout - 5 6 7 - 8 9 10 - 11 12 13 - 14 15 16 - */ - - //Computation loop - "0:\n" - // in(x3): v1 - // w(x0): v0 v29 v30 - - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "ldr q30, [x0, 32]\n" - "sdot v8.4s, v0.16b, v1.4b[1]\n" - "sdot v11.4s, v0.16b, v1.4b[2]\n" - "ldr q2, [x3, 16]!\n" // input of next round - "sdot v14.4s, v0.16b, v1.4b[3]\n" - - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "sdot v9.4s, v29.16b, v1.4b[1]\n" - "ldr q0, [x0, 48]!\n" // first w of next round - "sdot v12.4s, v29.16b, v1.4b[2]\n" - "sdot v15.4s, v29.16b, v1.4b[3]\n" - - "sdot v7.4s, v30.16b, v1.4b[0]\n" - "ldr q29, [x0, 16]\n" - "sdot v10.4s, v30.16b, v1.4b[1]\n" - "sdot v13.4s, v30.16b, v1.4b[2]\n" - "subs x2, x2, #4\n" - "sdot v16.4s, v30.16b, v1.4b[3]\n" - - "mov v1.16b, v2.16b\n" - "bne 0b\n" - - //give out address to x26 - "mov x26, %2\n" - - "st1 {v5.4s, v6.4s, v7.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v8.4s, v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.4s, v12.4s, v13.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v14.4s, v15.4s, v16.4s}, [x26]\n" - - :"+r"(in), - "+r"(w), - "+r"(out) - :"r"((I64)K), - "r"((I64)offset) - :"memory","cc","v30","v29","v16","v15","v14","v13","v12","v11","v10", - "v9","v8","v7","v6","v5","v3","v2","v1","v0","x26","x19","x3","x2","x0" - ); -} - -inline void mmm_8x12_A76(U32 offset, U32 K, INT8* in, INT8* w, I32* out) -{ - asm volatile( - //init in->v1, w->v0 - "ldr q1, [%0]\n" - - "ldr q0, [%1]\n" - - "ldr q29, [%1, 16]\n" // prefetch one more w - - //give in address to x3 - "mov x3, %0\n" - - //give w address to x0 - "mov x0, %1\n" - - //K->x2 - "mov x2, %3\n" - - //give out address to x26 - "mov x26, %2\n" - - //load in bias - "ld1 {v5.4s, v6.4s, v7.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v8.4s, v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v11.4s, v12.4s, v13.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v14.4s, v15.4s, v16.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v17.4s, v18.4s, v19.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v20.4s, v21.4s, v22.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v23.4s, v24.4s, v25.4s}, [x26]\n" - "add x26, x26, %4\n" - "ld1 {v26.4s, v27.4s, v28.4s}, [x26]\n" - - /* Layout - 5 6 7 - 8 9 10 - 11 12 13 - 14 15 16 - - 17 18 19 - 20 21 22 - 23 24 25 - 26 27 28 - */ - - //Computation loop - "0:\n" - // in(x3): v1 v2 - // w(x0): v0 v29 v30 - - "sdot v5.4s, v0.16b, v1.4b[0]\n" - "ldr q30, [x0, 32]\n" - "sdot v8.4s, v0.16b, v1.4b[1]\n" - "sdot v11.4s, v0.16b, v1.4b[2]\n" - "ldr q2, [x3, 16]\n" - "sdot v14.4s, v0.16b, v1.4b[3]\n" - - "sdot v6.4s, v29.16b, v1.4b[0]\n" - "sdot v9.4s, v29.16b, v1.4b[1]\n" - "ldr q3, [x0, 48]!\n" // first w of next round - "sdot v12.4s, v29.16b, v1.4b[2]\n" - "sdot v15.4s, v29.16b, v1.4b[3]\n" - - "sdot v7.4s, v30.16b, v1.4b[0]\n" - "subs x2, x2, #4\n" - "sdot v10.4s, v30.16b, v1.4b[1]\n" - "sdot v13.4s, v30.16b, v1.4b[2]\n" - "sdot v16.4s, v30.16b, v1.4b[3]\n" - - "sdot v17.4s, v0.16b, v2.4b[0]\n" - "ldr q1, [x3, 32]!\n" - "sdot v20.4s, v0.16b, v2.4b[1]\n" - "sdot v23.4s, v0.16b, v2.4b[2]\n" - "sdot v26.4s, v0.16b, v2.4b[3]\n" - - "sdot v18.4s, v29.16b, v2.4b[0]\n" - "mov v0.16b, 
v3.16b\n" - "sdot v21.4s, v29.16b, v2.4b[1]\n" - "sdot v24.4s, v29.16b, v2.4b[2]\n" - "sdot v27.4s, v29.16b, v2.4b[3]\n" - - "sdot v19.4s, v30.16b, v2.4b[0]\n" - "ldr q29, [x0, 16]\n" - "sdot v22.4s, v30.16b, v2.4b[1]\n" - "sdot v25.4s, v30.16b, v2.4b[2]\n" - "sdot v28.4s, v30.16b, v2.4b[3]\n" - - "bne 0b\n" - - //give out address to x26 - "mov x26, %2\n" - - "st1 {v5.4s, v6.4s, v7.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v8.4s, v9.4s, v10.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v11.4s, v12.4s, v13.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v14.4s, v15.4s, v16.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v17.4s, v18.4s, v19.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v20.4s, v21.4s, v22.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v23.4s, v24.4s, v25.4s}, [x26]\n" - "add x26, x26, %4\n" - "st1 {v26.4s, v27.4s, v28.4s}, [x26]\n" - - :"+r"(in), - "+r"(w), - "+r"(out) - :"r"((I64)K), - "r"((I64)offset) - :"memory","cc","v30","v29","v28","v27","v26","v25","v24","v23","v22","v21","v20", - "v19","v18","v17","v16","v15","v14","v13","v12","v11","v10","v9","v8","v7","v6", - "v5","v3","v2","v1","v0","x26","x3","x2","x0" - ); -} - -void mmm_A76(int M, int N, int K, INT8* matrix1, INT8* matrix2, INT8* tmp, I32* result) -{ - int blockK = K; - U32 K4 = pad_to_4_multiple(K); - int blockM = 96; - INT8* matrix1Trans = tmp; - I32* resultCurrent = result; - - int KInner, MInner, m, n; - for (int k = 0; k < K; k += blockK) { - KInner = UNI_MIN(blockK, K - k);//K for this inner iteration - for (int i = 0; i < M; i+=blockM) { - MInner = UNI_MIN(blockM, M - i);//M for this inner iteration - for(n = 0; n <= N - 8; n+=8){ - if (i == 0) { - matrix1_trans_n8(KInner, K, matrix1 + n * K + k, matrix1Trans + n * K4); - } - - for(m = 0; m <= (MInner-12); m+=12){ - resultCurrent = result + n * M + m + i; - mmm_8x12_A76(M*4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - for(; m <=(MInner - 8); m+=8){ - resultCurrent = result + n * M + m + i; - mmm_8x8_A76(M*4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - - if((MInner - m) >= 4){ - resultCurrent = result + n * M + m + i; - mmm_8x4_A76(M*4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - m += 4; - } - - if(MInner - m){ - resultCurrent = result + n * M + m + i; - mmm_N8_MTail(MInner - m, M, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - } - - if((N - n) >= 4){ - - if(i == 0){ - matrix1_trans_int8(4, KInner, K, matrix1 + n * K + k, matrix1Trans + n * K4); - } - - for(m = 0; m <= (MInner - 12); m+=12){ - resultCurrent = result + n * M + m + i; - mmm_4x12_A76(M*4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - - for(; m <= (MInner - 8); m+=8){ - resultCurrent = result + n * M + m + i; - mmm_4x8_A76(M*4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - - if((MInner - m) >= 4){ - resultCurrent = result + n * M + m + i; - mmm_4x4_A76(M*4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - m += 4; - } - - if(MInner - m){ - resultCurrent = result + n * M + m + i; - mmm_N4_MTail(MInner - m, M, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - n += 4; - } - - if (N - n) { - if(i == 0){ - matrix1_trans_int8(N-n, KInner, K, matrix1 + n * K + k, matrix1Trans + n * K4); - } - - for(m = 0; m <= (MInner - 12); m+=12){ - resultCurrent = result + n * M + m + i; - mmm_NTail_M12(M, N - n, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - - for(; m <= (MInner - 8); m+=8){ 
- resultCurrent = result + n * M + m + i; - mmm_NTail_M8(M, N - n, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - - if((MInner - m) >= 4){ - resultCurrent = result + n * M + m + i; - mmm_NTail_M4(M, N - n, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - m += 4; - } - - if(MInner - m){ - resultCurrent = result + n * M + m + i; - mmm_NTail_M(MInner - m, M, N - n, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); - } - } - } - } -} -#endif diff --git a/blas-enhance/src/cpu/arm/int8/mmm_common.h b/blas-enhance/src/cpu/arm/int8/mmm_common.h deleted file mode 100644 index 9c2ec833..00000000 --- a/blas-enhance/src/cpu/arm/int8/mmm_common.h +++ /dev/null @@ -1,489 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
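Note: the packing helpers in the deleted mmm_common.h below exist to feed those sdot kernels: both operands are rearranged so that every group of four consecutive K-values is contiguous, and K is first rounded up to a multiple of four (pad_to_4_multiple) with the tail zero-filled, which leaves the int32 accumulators unchanged. A minimal sketch of the layout for the NK operand (illustrative "_ref" names, not the bolt API):

    #include <stdint.h>
    #include <string.h>

    static inline uint32_t pad_to_4_multiple_ref(uint32_t k)
    {
        return (k + 3) / 4 * 4; /* sdot always consumes 4 bytes of K */
    }

    /* Pack 'size' rows of an N x K int8 matrix into NKn(size)k4 order,
     * i.e. dst[(k/4)*size*4 + n*4 + (k%4)], zero-padding the K tail.
     * Plain-C equivalent of the layout that matrix1_trans_n8 and
     * matrix1_trans_int8 below produce with NEON zips and prefetches. */
    static void pack_nk_k4_ref(uint32_t size, uint32_t K,
                               const int8_t *src, int8_t *dst)
    {
        uint32_t K4 = pad_to_4_multiple_ref(K);
        memset(dst, 0, (size_t)size * K4);
        for (uint32_t n = 0; n < size; n++) {
            for (uint32_t k = 0; k < K; k++) {
                dst[(k / 4) * size * 4 + n * 4 + (k % 4)] = src[n * K + k];
            }
        }
    }

matrix2_trans_m12 applies the same idea to twelve columns of the KM-layout operand, so a 12-wide kernel can read its weights as three contiguous 16-byte vectors per K-group.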
- - -#ifndef _H_MMM_COMMON -#define _H_MMM_COMMON - -#ifdef _USE_INT8 -#include -#include - -#include "type.h" -#include "error.h" -#include "arm_neon_expand.h" - -inline void matrix1_trans_n8(U32 blockK, U32 K, INT8* src, INT8* dst) -{ - // Move k4 as one I32 - I32* dst1 = (I32*)dst; - - I32 *in[8]; - for (U32 i=0; i<8; i++) { - in[i] = (I32*)(src + i * K); - } - U32 k = 0; - for (; k < blockK - 7; k += 8) { - if(k % 64 == 0){ - asm volatile( - "prfm pldl2keep, [%[in0], 64]\n" - "prfm pldl2keep, [%[in1], 64]\n" - "prfm pldl2keep, [%[in2], 64]\n" - "prfm pldl2keep, [%[in3], 64]\n" - "prfm pldl2keep, [%[in4], 64]\n" - "prfm pldl2keep, [%[in5], 64]\n" - "prfm pldl2keep, [%[in6], 64]\n" - "prfm pldl2keep, [%[in7], 64]\n" - :[in0]"+r"(in[0]), - [in1]"+r"(in[1]), - [in2]"+r"(in[2]), - [in3]"+r"(in[3]), - [in4]"+r"(in[4]), - [in5]"+r"(in[5]), - [in6]"+r"(in[6]), - [in7]"+r"(in[7]) - : - :"memory","cc" - ); - } - asm volatile( - "ldr d0, [%[in0]], 8\n" - "ldr d1, [%[in1]], 8\n" - "ldr d2, [%[in2]], 8\n" - "ldr d3, [%[in3]], 8\n" - "ldr d4, [%[in4]], 8\n" - "ldr d5, [%[in5]], 8\n" - "ldr d6, [%[in6]], 8\n" - "ldr d7, [%[in7]], 8\n" - - "zip1 v8.2s, v0.2s, v1.2s\n" - "zip2 v12.2s, v0.2s, v1.2s\n" - "zip1 v9.2s, v2.2s, v3.2s\n" - "zip2 v13.2s, v2.2s, v3.2s\n" - "zip1 v10.2s, v4.2s, v5.2s\n" - "zip2 v14.2s, v4.2s, v5.2s\n" - "zip1 v11.2s, v6.2s, v7.2s\n" - "zip2 v15.2s, v6.2s, v7.2s\n" - - "str d8, [%[out]]\n" - "str d9, [%[out], 8]\n" - "str d10, [%[out], 16]\n" - "str d11, [%[out], 24]\n" - "str d12, [%[out], 32]\n" - "str d13, [%[out], 40]\n" - "str d14, [%[out], 48]\n" - "str d15, [%[out], 56]\n" - :[in0]"+r"(in[0]), - [in1]"+r"(in[1]), - [in2]"+r"(in[2]), - [in3]"+r"(in[3]), - [in4]"+r"(in[4]), - [in5]"+r"(in[5]), - [in6]"+r"(in[6]), - [in7]"+r"(in[7]) - :[out]"r"(dst1) - :"memory","cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - ); - dst1 += 16; - } - - if (k < blockK - 3) { - for (U32 i = 0; i < 8; i++) { - dst1[0] = in[i][0]; - dst1++; - in[i]++; - } - k += 4; - } - - if (k < blockK) { - U32 kTail = blockK - k; - INT8 *dstI8 = (INT8*)dst1; - INT8 *inI[8]; - for (U32 i = 0; i < 8; i++) { - inI[i] = (INT8*)in[i]; - } - for (U32 i = 0; i < 8; i++) { - for (U32 j = 0; j < 4; j++) { - if (j < kTail) { - dstI8[i * 4 + j] = inI[i][j]; - } else { - dstI8[i * 4 + j] = 0; - } - } - } - } -} - -//Trans from NK to NKn(size)k4 -inline void matrix1_trans_int8(U32 size, U32 blockK, U32 K, INT8* src, INT8* dst) -{ - // Move k4 as one I32 - I32* src1; - I32* dst1 = (I32*)dst; - U32 offset = 64; - - U32 i = 0; - for (; i < blockK/4; i++) { - for (U32 j = 0; j < size; j++) { - src1 = (I32*)(src + j * K); - - if(i % 16 == 0){ - asm volatile( - "prfm pldl2keep, [%0, %1]\n" - :"+r"(src1) - :"r"((I64)offset) - :"memory","cc" - ); - } - *dst1++ = *(src1 + i); - } - } - U32 kTail = blockK % 4; - if (kTail > 0) { - INT8 *srcI8; - INT8 *dstI8 = (INT8*)dst1; - for (U32 j = 0; j < size; j++) { - srcI8 = src + j * K + i * 4; - for (U32 k = 0; k < 4; k++) { - if (k < kTail) { - dstI8[j * 4 + k] = srcI8[k]; - } else { - dstI8[j * 4 + k] = 0; - } - } - } - } -} - -inline void matrix2_trans_m12(U32 blockK, U32 M, INT8* src, INT8* dst) -{ - INT8* src1 = src; - INT8* dst1 = dst; - U32 offset = 4 * M; - - U32 i = 0; - for (; i < blockK - 3; i += 4) { - // Prefetch for the next iteration - asm volatile( - "prfm pldl2keep, [%0, %1]\n" - :"+r"(src1) - :"r"((I64)offset) - :"memory","cc" - ); - - INT8 *in12[4]; - for (U32 j=0; j<4; j++) { - in12[j] = src1 + j * M; - } - src1 
+= offset; - - asm volatile( - "ldr d0, [%[in0]]\n" - "ldr d1, [%[in1]]\n" - "ldr d2, [%[in2]]\n" - "ldr d3, [%[in3]]\n" - "zip1 v4.8b, v0.8b, v1.8b\n" - "zip2 v5.8b, v0.8b, v1.8b\n" - "zip1 v6.8b, v2.8b, v3.8b\n" - "zip2 v7.8b, v2.8b, v3.8b\n" - - "zip1 v0.4h, v4.4h, v6.4h\n" - "zip2 v1.4h, v4.4h, v6.4h\n" - "zip1 v2.4h, v5.4h, v7.4h\n" - "zip2 v3.4h, v5.4h, v7.4h\n" - "str d0, [%[out]]\n" - "str d1, [%[out], 8]\n" - "str d2, [%[out], 16]\n" - "str d3, [%[out], 24]\n" - : - :[in0]"r"(in12[0]), - [in1]"r"(in12[1]), - [in2]"r"(in12[2]), - [in3]"r"(in12[3]), - [out]"r"(dst1) - :"memory","cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - ); - - for (U32 j=0; j<4; j++) { - for (U32 k=0; k<4; k++) { - dst1[32 + j*4 + k] = in12[k][8+j]; - } - } - - dst1 += 48; - } - if (i < blockK) { - U32 kTail = blockK - i; - - INT8 *in12[4]; - INT8 zero[12] = {0}; - for (U32 j = 0; j < 4; j++) { - if (j < kTail) { - in12[j] = src1 + j * M; - } else { - in12[j] = zero; - } - } - - asm volatile( - "ldr d0, [%[in0]]\n" - "ldr d1, [%[in1]]\n" - "ldr d2, [%[in2]]\n" - "ldr d3, [%[in3]]\n" - "zip1 v4.8b, v0.8b, v1.8b\n" - "zip2 v5.8b, v0.8b, v1.8b\n" - "zip1 v6.8b, v2.8b, v3.8b\n" - "zip2 v7.8b, v2.8b, v3.8b\n" - - "zip1 v0.4h, v4.4h, v6.4h\n" - "zip2 v1.4h, v4.4h, v6.4h\n" - "zip1 v2.4h, v5.4h, v7.4h\n" - "zip2 v3.4h, v5.4h, v7.4h\n" - "str d0, [%[out]]\n" - "str d1, [%[out], 8]\n" - "str d2, [%[out], 16]\n" - "str d3, [%[out], 24]\n" - : - :[in0]"r"(in12[0]), - [in1]"r"(in12[1]), - [in2]"r"(in12[2]), - [in3]"r"(in12[3]), - [out]"r"(dst1) - :"memory","cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - ); - - for (U32 j = 0; j < 4; j++) { - for (U32 k = 0; k < 4; k++) { - dst1[32 + j * 4 + k] = in12[k][8 + j]; - } - } - } -} - -//Trans from KM to MKm(size)k4 -inline void matrix2_trans_int8(U32 size, U32 blockK, U32 M, INT8* src, INT8* dst) -{ - INT8* src1 = src; - INT8* dst1 = dst; - U32 offset = 4 * M; - - U32 i = 0; - for(; i < blockK - 3; i += 4){ - src1 = src + i * M; - asm volatile( - "prfm pldl2keep, [%0, %1]\n" - :"+r"(src1) - :"r"((I64)offset) - :"memory","cc" - ); - for(U32 j = 0; j < size; j++){ - src1 = src + i * M + j; - for (U32 k = 0; k < 4; k++){ - *dst1 = *src1; - dst1++; - src1 += M; - } - } - } - if (i < blockK) { - U32 kTail = blockK - i; - for (U32 j = 0; j < size; j++) { - src1 = src + i * M + j; - for (U32 k = 0; k < 4; k++) { - if (k < kTail) { - *dst1 = *src1; - dst1++; - src1 += M; - } else { - *dst1 = 0; - dst1++; - } - } - } - } -} - -inline void mmm_N8_MTail(U32 MInner, U32 M, U32 K, INT8* matrix1, INT8* matrix2, I32* result) -{ - int8x16_t mat1[2]; - int8x16_t mat2; - int32x4_t res[4][2] = {{0}}; - I32 tmp[8] = {0}; - - CHECK_REQUIREMENT(MInner < 4); - - for(U32 i = 0; i < K; i+=4){ - mat1[0] = vld1q_s8(matrix1 + i * 8); - mat1[1] = vld1q_s8(matrix1 + i * 8 + 16); - - mat2 = vld1q_s8(matrix2 + i * MInner); - - for(U32 j = 0; j < MInner; j++){ - res[j][0] = vdotq_laneq_s32_builtin(res[j][0], mat1[0], mat2, j); - res[j][1] = vdotq_laneq_s32_builtin(res[j][1], mat1[1], mat2, j); - } - } - for(U32 p = 0; p < MInner; p++){ - vst1q_s32(tmp, res[p][0]); - vst1q_s32(tmp+4, res[p][1]); - for(U32 q = 0; q < 8; q++){ - result[q * M + p] += tmp[q]; - } - res[p][0] = vdupq_n_s32(0); - res[p][1] = vdupq_n_s32(0); - } -} - -inline void mmm_N4_MTail(U32 MInner, U32 M, U32 K, INT8* matrix1, INT8* matrix2, I32* result) -{ - int8x16_t mat1 = {0}; - int8x16_t mat2 = {0}; - int32x4_t res[4] = {0}; - I32 tmp[8] = {0}; - - CHECK_REQUIREMENT(MInner < 4); - - for(U32 i = 0; i < K; i+=4){ - mat1 = 
vld1q_s8(matrix1 + i * 8); - - mat2 = vld1q_s8(matrix2 + i * MInner); - - for(U32 j = 0; j < MInner; j++){ - res[j] = vdotq_laneq_s32_builtin(res[j], mat1, mat2, j); - } - } - for(U32 p = 0; p < MInner; p++){ - vst1q_s32(tmp, res[p]); - for(U32 q = 0; q < 8; q++){ - result[q * M + p] += tmp[q]; - } - res[p] = vdupq_n_s32(0); - } -} - -inline void mmm_NTail_M12(U32 M, U32 N, U32 K, INT8* matrix1, INT8* matrix2, I32* result) { - int8x16_t mat1 = {0}; - int8x16_t mat2[3] = {0}; - int32x4_t res[4][3] = {{0}}; - - for (U32 i = 0; i < N; i++) { - res[i][0] = vld1q_s32(result + i*M); - res[i][1] = vld1q_s32(result + i*M + 4); - res[i][2] = vld1q_s32(result + i*M + 8); - } - - for (U32 q=0; q -#include - - -inline void mvm_col_tail(U32 N, U32 K, INT8* matrix, INT8* vector, I32* result) { - for (U32 n = 0; n < N; n++) { - I32 tmp = 0; - for (U32 k = 0; k < K; k++) { - tmp += vector[k] * matrix[k*N + n]; - } - result[n] += tmp; - } -} - -inline void mvm_row_tail(U32 N, U32 K, INT8* matrix, INT8* vector, I32* result) { - INT8* cur_row = matrix; - for (U32 n = 0; n < N; n++) { - I32 tmp = 0; - for(U32 k = 0; k < K; k++) { - tmp += vector[k] * cur_row[k]; - } - result[n] += tmp; - cur_row += K; - } -} - -inline void mvm_row_kernel(U32 Nbatch, U32 K, INT8* matrix, INT8* vector, I32* result) { - U32 N = Nbatch * 4; - int8x16_t mat[4], v; - U32 K_tail = K % 16; - U32 K_inner = K - K_tail; - for (U32 n = 0; n < N; n+=4) { - int32x4_t res[4] = {0}; - int32x4_t bias; - - INT8* w0 = matrix + n * K; - INT8* w1 = w0 + K; - INT8* w2 = w1 + K; - INT8* w3 = w2 + K; - - for (U32 k = 0; k < K_inner; k+=16) { - v = vld1q_s8(vector + k); - mat[0] = vld1q_s8(w0); - mat[1] = vld1q_s8(w1); - mat[2] = vld1q_s8(w2); - mat[3] = vld1q_s8(w3); - - res[0] = vdotq_s32(res[0], mat[0], v); - res[1] = vdotq_s32(res[1], mat[1], v); - res[2] = vdotq_s32(res[2], mat[2], v); - res[3] = vdotq_s32(res[3], mat[3], v); - - w0 += 16; - w1 += 16; - w2 += 16; - w3 += 16; - } - bias = vld1q_s32(result + n); - - res[0] = vpaddq_s32(res[0], res[1]); - res[2] = vpaddq_s32(res[2], res[3]); - res[0] = vpaddq_s32(res[0], res[2]); - res[0] = vaddq_s32(res[0], bias); - - vst1q_s32(result + n, res[0]); - - if (K_tail != 0) { - I32 tmp[4] = {0}; - for(U32 p = K_inner; p < K; p++) { - tmp[0] += vector[p] * *w0++; - tmp[1] += vector[p] * *w1++; - tmp[2] += vector[p] * *w2++; - tmp[3] += vector[p] * *w3++; - } - result[n] += tmp[0]; - result[n+1] += tmp[1]; - result[n+2] += tmp[2]; - result[n+3] += tmp[3]; - } - } -} - -inline void mvm_col(U32 numRows, U32 numColumns, INT8* matrix, INT8* vector, I32*tmp, I32* result) { - //Actual layout is KN, and vector is K - U32 N = numRows; - U32 K = numColumns; - U32 NTail = N % 64; - U32 NInner = N - NTail; - - for (U32 n = 0; n < NInner; n+=64) { - memset(tmp, 0, sizeof(I32)*64); - for (U32 k = 0; k < K; k++) { - for(U32 i = 0; i < 64; i++) { - tmp[i] += vector[k] * matrix[k * N + n + i]; - } - } - - for (U32 i = 0; i < 64; i++) { - result[n + i] += tmp[i]; - } - } - - memset(tmp, 0, sizeof(I32)*64); - for (U32 k = 0; k < K; k++) { - for(U32 i = 0; i < NTail; i++) { - tmp[i] += vector[k] * matrix[k * N + NInner + i]; - } - for(U32 i=0; i < NTail; i++) { - result[NInner + i] += tmp[i]; - } - } -} - -inline void mvm_row(U32 numRows, U32 numColumns, INT8* matrix, INT8* vector, I32* result) { - //Actual layout is NK, and vector is K - U32 N = numRows; - U32 K = numColumns; - U32 Nbatch = N / 4; - U32 NTail = N % 4; - - mvm_row_kernel(Nbatch, K, matrix, vector, result); - - if (NTail != 0) { - mvm_row_tail(NTail, K, 
matrix + (N - NTail) * K, vector, result + N - NTail); - } -} -#endif -#endif diff --git a/blas-enhance/src/mmm.cpp b/blas-enhance/src/mmm.cpp deleted file mode 100644 index 2e9e238d..00000000 --- a/blas-enhance/src/mmm.cpp +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "blas-enhance.h" -#ifdef _USE_GENERAL -#include "cpu/general/blas_general.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/blas_arm.h" -#endif - -EE matrix_matrix_multiply_tmp_bytes(TensorDesc matrixADesc, TensorDesc matrixBDesc, U32* bytes, Arch arch) -{ - DataType matrixADataType, matrixBDataType; - U32 matrixA_M, matrixA_K, matrixB_K, matrixB_N; - CHECK_STATUS(tensor2dGet(matrixADesc, &matrixADataType, &matrixA_M, &matrixA_K)); - CHECK_STATUS(tensor2dGet(matrixBDesc, &matrixBDataType, &matrixB_K, &matrixB_N)); - - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = SUCCESS; -#endif -#ifdef _USE_NEON - } else { - ret = matrix_matrix_multiply_tmp_bytes_arm(matrixA_M, matrixA_K, matrixB_K, matrixB_N, matrixADataType, bytes); -#endif - } - return ret; -} - -EE matrix_matrix_multiply(TensorDesc matrixADesc, const void* matrixAData, - TensorDesc matrixBDesc, const void* matrixBData, - U32 bytes, void* tmp, - TensorDesc matrixCDesc, void* matrixCData, - Arch arch) -{ - if (bytes != 0 && tmp == nullptr) { - CHECK_STATUS(NULL_POINTER); - } - if (nullptr == matrixAData || nullptr == matrixBData || nullptr == matrixCData) { - CHECK_STATUS(NULL_POINTER); - } - - DataType matrixADataType, matrixBDataType, matrixCDataType; - DataFormat matrixADataFormat, matrixBDataFormat; - U32 matrixA_M, matrixA_K, matrixB_K, matrixB_N, matrixC_M, matrixC_N; - CHECK_STATUS(tensor2dfGet(matrixADesc, &matrixADataType, &matrixADataFormat, &matrixA_M, &matrixA_K)); - CHECK_STATUS(tensor2dfGet(matrixBDesc, &matrixBDataType, &matrixBDataFormat, &matrixB_K, &matrixB_N)); - CHECK_STATUS(tensor2dGet(matrixCDesc, &matrixCDataType, &matrixC_M, &matrixC_N)); - - if (matrixADataType != matrixBDataType) - CHECK_STATUS(NOT_MATCH); - if (matrixADataType != matrixCDataType) - if (matrixADataType != DT_I8 || matrixCDataType != DT_I32) - CHECK_STATUS(NOT_MATCH); - - bool transposeA = false, transposeB = false; - if (matrixADataFormat == DF_TRANSPOSE) { - std::swap(matrixA_M, matrixA_K); - transposeA = true; - } - if (matrixBDataFormat == DF_TRANSPOSE) { - std::swap(matrixB_K, matrixB_N); - transposeB = true; - } - if (matrixA_M 
!= matrixC_M || matrixB_N != matrixC_N || matrixA_K != matrixB_K) - CHECK_STATUS(NOT_MATCH); - - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = mmm_general(matrixC_N, matrixC_M, matrixA_K, transposeA, transposeB, matrixADataType, matrixAData, matrixBData, matrixCData); -#endif -#ifdef _USE_NEON - } else { - TensorDesc tranDescB; - U8 *dataB = (U8*)matrixBData; - if (matrixBDataFormat != targetFormat4MatrixB(matrixBDataType)) { - U32 K = matrixA_K; - if (DT_I8 == matrixADataType) { - K = pad_to_4_multiple(K); - } - dataB = ((U8*)tmp) + matrixA_M * K * bytesOf(matrixADataType); - ret = matrix_matrix_multiply_transform_rhs(matrixBDesc, matrixBData, &tranDescB, dataB); - } - ret = mmm_arm(matrixC_N, matrixC_M, matrixA_K, matrixADataType, matrixAData, dataB, tmp, matrixCData, arch); -#endif - } - return ret; -} diff --git a/bolt.cmake b/bolt.cmake deleted file mode 100644 index adbcd27e..00000000 --- a/bolt.cmake +++ /dev/null @@ -1,207 +0,0 @@ -option(USE_CROSS_COMPILE "set use cross compile or not" ON) -option(USE_GNU_GCC "set use GNU gcc compiler or not" OFF) -option(USE_LLVM_CLANG "set use LLVM clang compiler or not" OFF) -option(USE_DEBUG "set use debug information or not" OFF) -option(USE_DYNAMIC_LIBRARY "set use dynamic library or not" OFF) -option(USE_MINSIZEREL ".so lib will be 300KB smaller but performance will be affected" OFF) - -# model-tools variable -option(USE_CAFFE "set use caffe model as input or not" ON) -option(USE_ONNX "set use onnx model as input or not" ON) -option(USE_TFLITE "set use tflite model as input or not" ON) - -# blas-enhance tensor_computing -option(USE_GENERAL "set use CPU serial code or not" ON) -option(USE_NEON "set use ARM NEON instruction or not" ON) -option(USE_ARMV7 "set use ARMv7 NEON instruction or not" OFF) -option(USE_ARMV8 "set use ARMv8 NEON instruction or not" ON) -option(USE_FP32 "set use ARM NEON FP32 instruction or not" ON) -option(USE_FP16 "set use ARM NEON FP16 instruction or not" ON) -option(USE_F16_MIX_PRECISION "set use ARM NEON mix precision f16/f32 instruction or not" ON) -option(USE_INT8 "set use ARM NEON INT8 instruction or not" ON) -option(BUILD_TEST "set to build unit test or not" OFF) -option(USE_OPENMP "set use OpenMP for parallel or not" ON) -option(USE_MALI "set use mali for parallel or not" ON) -option(USE_LIBRARY_TUNING "set use algorithm tuning or not" ON) - -set(BOLT_ROOT $ENV{BOLT_ROOT}) - -function (set_policy) - cmake_policy(SET CMP0074 NEW) -endfunction(set_policy) - -macro (set_c_cxx_flags) - set(COMMON_FLAGS "-W -Wall -Wextra -Wno-unused-command-line-argument -Wno-unused-parameter -O3") - - if (USE_LIBRARY_TUNING) - set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_LIBRARY_TUNING") - endif(USE_LIBRARY_TUNING) - - if (BUILD_TEST) - set(COMMON_FLAGS "${COMMON_FLAGS} -D_BUILD_TEST") - endif(BUILD_TEST) - - if (USE_DEBUG) - set(COMMON_FLAGS "${COMMON_FLAGS} -D_DEBUG") - if (USE_LLVM_CLANG) - set(COMMON_FLAGS "${COMMON_FLAGS} -llog") - endif(USE_LLVM_CLANG) - endif(USE_DEBUG) - - if (USE_GENERAL) - set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_GENERAL") - endif(USE_GENERAL) - - if (USE_MALI) - set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_MALI") - endif(USE_MALI) - - if (USE_NEON) - set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_NEON") - - if (USE_ARMV8) - set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_ARMV8") - endif (USE_ARMV8) - - if (USE_ARMV7) - set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_ARMV7 -march=armv7-a -mfloat-abi=softfp -mfpu=neon-vfpv4") - if (USE_LLVM_CLANG) - set(COMMON_FLAGS "${COMMON_FLAGS} 
-Wl,--allow-multiple-definition") - endif (USE_LLVM_CLANG) - endif (USE_ARMV7) - - if (USE_FP32) - set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_FP32") - endif (USE_FP32) - - if (USE_FP16) - set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_FP16") - if (USE_F16_MIX_PRECISION) - set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_F16_MIX_PRECISION") - endif (USE_F16_MIX_PRECISION) - if (USE_INT8) - if (USE_LLVM_CLANG) - set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_INT8 -march=armv8-a+fp16+dotprod") - else (USE_LLVM_CLANG) - set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_INT8 -march=armv8.2-a+fp16+dotprod") - endif (USE_LLVM_CLANG) - else (USE_INT8) - if (USE_LLVM_CLANG) - set(COMMON_FLAGS "${COMMON_FLAGS} -march=armv8-a+fp16") - else (USE_LLVM_CLANG) - set(COMMON_FLAGS "${COMMON_FLAGS} -march=armv8.2-a+fp16") - endif (USE_LLVM_CLANG) - endif (USE_INT8) - endif (USE_FP16) - endif(USE_NEON) - - if (USE_CAFFE) - set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_CAFFE_MODEL") - endif() - if (USE_ONNX) - set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_ONNX_MODEL") - endif() - if (USE_TFLITE) - set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_TFLITE_MODEL") - endif() - - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS}") - link_libraries("-static-libstdc++") - - if (USE_DEBUG) - set(CMAKE_BUILD_TYPE "Debug") - elseif (USE_MINSIZEREL) - set(CMAKE_BUILD_TYPE "MinSizeRel") - endif (USE_DEBUG) -endmacro(set_c_cxx_flags) - -macro (set_test_c_cxx_flags) - if (${USE_DYNAMIC_LIBRARY} STREQUAL "OFF") - if (USE_CROSS_COMPILE) - if (USE_GNU_GCC) - set(COMMON_FLAGS "${COMMON_FLAGS} -static") - endif(USE_GNU_GCC) - endif(USE_CROSS_COMPILE) - endif(${USE_DYNAMIC_LIBRARY} STREQUAL "OFF") - - if (USE_LLVM_CLANG) - if (${USE_DYNAMIC_LIBRARY} STREQUAL "OFF") - set(COMMON_FLAGS "${COMMON_FLAGS} -Wl,-allow-shlib-undefined, -static-libstdc++") - else (${USE_DYNAMIC_LIBRARY} STREQUAL "OFF") - set(COMMON_FLAGS "${COMMON_FLAGS} -Wl,-allow-shlib-undefined") - endif(${USE_DYNAMIC_LIBRARY} STREQUAL "OFF") - endif(USE_LLVM_CLANG) - - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS}") -endmacro (set_test_c_cxx_flags) - -macro (set_project_install_directory) - SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/bin) - SET(LIBRARY_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/lib) -endmacro (set_project_install_directory) - -function(blas_enhance name) - add_executable(${name} ${name}.cpp) - add_dependencies(${name} blas-enhance) - add_dependencies(${name} blas-enhance_static) - target_link_libraries(${name} ${BLAS_ENHANCE_LIBRARY}) -endfunction() - -function(tensor_computing name) - add_executable(${name} ${name}.cpp) - add_dependencies(${name} tensor_computing) - add_dependencies(${name} tensor_computing_static) - target_link_libraries(${name} ${TENSOR_COMPUTING_LIBRARIES}) - if(USE_MALI) - target_link_libraries(${name} ${OPENCL_LIBRARIES}) - endif(USE_MALI) -endfunction() - -function(image name) - add_executable(${name} ${name}.cpp) - add_dependencies(${name} image) - add_dependencies(${name} image_static) - target_link_libraries(${name} ${IMAGE_LIBRARIES}) -endfunction() - -function(model_tools name) - add_executable(${name} ${name}.cpp) - if (USE_CAFFE) - add_dependencies(${name} model-tools) - add_dependencies(${name} model-tools_static) - add_dependencies(${name} model-tools_caffe) - add_dependencies(${name} model-tools_caffe_static) - TARGET_LINK_LIBRARIES(${name} ${MODEL_TOOLS_LIBRARIES}) - endif (USE_CAFFE) - - if (USE_ONNX) - add_dependencies(${name} model-tools) - 
add_dependencies(${name} model-tools_static) - add_dependencies(${name} model-tools_onnx) - add_dependencies(${name} model-tools_onnx_static) - TARGET_LINK_LIBRARIES(${name} ${MODEL_TOOLS_LIBRARIES}) - endif (USE_ONNX) - - if (USE_TFLITE) - add_dependencies(${name} model-tools) - add_dependencies(${name} model-tools_static) - add_dependencies(${name} model-tools_tflite) - add_dependencies(${name} model-tools_tflite_static) - TARGET_LINK_LIBRARIES(${name} ${MODEL_TOOLS_LIBRARIES}) - endif (USE_TFLITE) -endfunction() - -function(inference name src_name) - add_executable(${name} ${src_name}) - if (USE_DYNAMIC_LIBRARY) - TARGET_LINK_LIBRARIES(${name} inference) - else (USE_DYNAMIC_LIBRARY) - TARGET_LINK_LIBRARIES(${name} inference_static) - endif (USE_DYNAMIC_LIBRARY) - TARGET_LINK_LIBRARIES(${name} ${INFERENCE_LIBRARIES} ${JPEG_LIBRARY}) - if (USE_MALI) - TARGET_LINK_LIBRARIES(${name} ${KERNELBIN_LIBRARIES} ${OPENCL_LIBRARIES}) - endif (USE_MALI) -endfunction() diff --git a/cmakes/FindBlasEnhance.cmake b/cmakes/FindBlasEnhance.cmake deleted file mode 100644 index 2b3f4af8..00000000 --- a/cmakes/FindBlasEnhance.cmake +++ /dev/null @@ -1,26 +0,0 @@ -set(BLAS_ENHANCE_PROJECT_NAME "blas-enhance") -unset(BLAS_ENHANCE_ROOT) -find_path(BLAS_ENHANCE_ROOT NAMES ${BLAS_ENHANCE_PROJECT_NAME} HINTS ${BOLT_ROOT} $ENV{BOLT_ROOT}) -set(BLAS_ENHANCE_ROOT "${BLAS_ENHANCE_ROOT}/${BLAS_ENHANCE_PROJECT_NAME}") - -set(BLAS_ENHANCE_INCLUDE_DIR "${BLAS_ENHANCE_ROOT}/include") -if (USE_DYNAMIC_LIBRARY) - set(BLAS_ENHANCE_LIBRARY "${BLAS_ENHANCE_ROOT}/lib/lib${BLAS_ENHANCE_PROJECT_NAME}.so") -else (USE_DYNAMIC_LIBRARY) - set(BLAS_ENHANCE_LIBRARY "${BLAS_ENHANCE_ROOT}/lib/lib${BLAS_ENHANCE_PROJECT_NAME}.a") -endif (USE_DYNAMIC_LIBRARY) - -if (BLAS_ENHANCE_INCLUDE_DIR AND BLAS_ENHANCE_LIBRARY) - set(BLAS_ENHANCE_FOUND true) -endif (BLAS_ENHANCE_INCLUDE_DIR AND BLAS_ENHANCE_LIBRARY) - -if (BLAS_ENHANCE_FOUND) - include_directories(include ${BLAS_ENHANCE_INCLUDE_DIR}) - message(STATUS "Found ${BLAS_ENHANCE_PROJECT_NAME}.h: ${BLAS_ENHANCE_INCLUDE_DIR}") - message(STATUS "Found ${BLAS_ENHANCE_PROJECT_NAME}: ${BLAS_ENHANCE_LIBRARY}") -else (BLAS_ENHANCE_FOUND) - message(FATAL_ERROR " -FATAL: can not find ${BLAS_ENHANCE_PROJECT_NAME} library in /${BLAS_ENHANCE_PROJECT_NAME}/[include/lib] directory, - please set shell or cmake environment variable BOLT_ROOT. - ") -endif (BLAS_ENHANCE_FOUND) diff --git a/cmakes/FindGcl.cmake b/cmakes/FindGcl.cmake deleted file mode 100644 index 5f4973c6..00000000 --- a/cmakes/FindGcl.cmake +++ /dev/null @@ -1,55 +0,0 @@ -find_path(GCL_ROOT NAMES gcl HINTS ${BOLT_ROOT} $ENV{BOLT_ROOT}) -set(GCL_ROOT "${GCL_ROOT}/gcl") - -set(GCL_INCLUDE_DIR "${GCL_ROOT}/include") - -if (GCL_INCLUDE_DIR) - set(GCL_FOUND true) -endif (GCL_INCLUDE_DIR) - -if (GCL_FOUND) - include_directories(include ${GCL_INCLUDE_DIR}) - message(STATUS "Found gcl.h: ${GCL_INCLUDE_DIR}") -else (GCL_FOUND) - message(FATAL_ERROR " -FATAL: can not find gcl.h in /gcl/include directory, - please set shell or cmake environment variable BOLT_ROOT. 
- ") -endif (GCL_FOUND) - -find_package(OpenCL) - -set(GCL_KERNELBIN_INCLUDE_DIR "${GCL_ROOT}/kernelBin/include") -if (${USE_DYNAMIC_LIBRARY} STREQUAL "OFF") - set(GCL_KERNELBIN_LIBRARY "${GCL_ROOT}/tools/kernel_lib_compile/lib/libkernelbin.a") -else (${USE_DYNAMIC_LIBRARY} STREQUAL "OFF") - set(GCL_KERNELBIN_LIBRARY "${GCL_ROOT}/tools/kernel_lib_compile/lib/libkernelbin.so") -endif (${USE_DYNAMIC_LIBRARY} STREQUAL "OFF") - -if(GCL_KERNELBIN_INCLUDE_DIR) - set(KERNELBIN_HEAD_FOUND true) -endif(GCL_KERNELBIN_INCLUDE_DIR) - -if(GCL_KERNELBIN_LIBRARY) - set(KERNELBIN_LIB_FOUND true) -endif(GCL_KERNELBIN_LIBRARY) - - -if (KERNELBIN_HEAD_FOUND) - include_directories(include ${GCL_KERNELBIN_INCLUDE_DIR}) - message(STATUS "Found kernel bin head file: ${GCL_KERNELBIN_INCLUDE_DIR}") -else (KERNELBIN_HEAD_FOUND) - message(FATAL_ERROR " -FATAL: can not find kernelbin header files in /gcl/kernelBin/include directory, - please set shell or cmake environment variable BOLT_ROOT. - ") -endif (KERNELBIN_HEAD_FOUND) - -if (KERNELBIN_LIB_FOUND) - set(KERNELBIN_LIBRARIES "${GCL_KERNELBIN_LIBRARY}") -else (KERNELBIN_LIB_FOUND) - message(FATAL_ERROR " -FATAL: can not find libkernelbin.a in /gcl/tools/kernel_lib_compile directory, - please set shell or cmake environment variable BOLT_ROOT. - ") -endif (KERNELBIN_LIB_FOUND) diff --git a/cmakes/FindImage.cmake b/cmakes/FindImage.cmake deleted file mode 100644 index e6c0f197..00000000 --- a/cmakes/FindImage.cmake +++ /dev/null @@ -1,33 +0,0 @@ -unset(IMAGE_ROOT) -find_path(IMAGE_ROOT NAMES image HINTS ${BOLT_ROOT} $ENV{BOLT_ROOT}) -set(IMAGE_ROOT "${IMAGE_ROOT}/image") - -set(IMAGE_INCLUDE_DIR "${IMAGE_ROOT}/include") -if (USE_DYNAMIC_LIBRARY) - set(IMAGE_LIBRARY "${IMAGE_ROOT}/lib/libimage.so") -else (USE_DYNAMIC_LIBRARY) - set(IMAGE_LIBRARY "${IMAGE_ROOT}/lib/libimage.a") -endif (USE_DYNAMIC_LIBRARY) - -if (IMAGE_INCLUDE_DIR AND IMAGE_LIBRARY) - set(IMAGE_FOUND true) -endif (IMAGE_INCLUDE_DIR AND IMAGE_LIBRARY) - -if (IMAGE_FOUND) - if (USE_GNU_GCC) - set(IMAGE_LIBRARIES "${IMAGE_LIBRARY};-lpthread;-ldl") - endif(USE_GNU_GCC) - if (USE_LLVM_CLANG) - set(IMAGE_LIBRARIES "${IMAGE_LIBRARY}") - endif(USE_LLVM_CLANG) - - include_directories(include ${IMAGE_INCLUDE_DIR}) - - message(STATUS "Found image.h: ${IMAGE_INCLUDE_DIR}") - message(STATUS "Found image: ${IMAGE_LIBRARY}") -else (IMAGE_FOUND) - message(FATAL_ERROR " -FATAL: can not find image library in /image/lib directory, - please set shell or cmake environment variable BOLT_ROOT. 
- ") -endif (IMAGE_FOUND) diff --git a/cmakes/FindInference.cmake b/cmakes/FindInference.cmake deleted file mode 100644 index 951cbcaf..00000000 --- a/cmakes/FindInference.cmake +++ /dev/null @@ -1,34 +0,0 @@ -set(INFERENCE_PROJECT_NAME "inference") -unset(INFERENCE_ROOT) -find_path(INFERENCE_ROOT NAMES ${INFERENCE_PROJECT_NAME} HINTS ${BOLT_ROOT} $ENV{BOLT_ROOT}) -set(INFERENCE_ROOT "${INFERENCE_ROOT}/${INFERENCE_PROJECT_NAME}") - -set(INFERENCE_INCLUDE_DIR "${INFERENCE_ROOT}/include") -if (USE_DYNAMIC_LIBRARY) - set(INFERENCE_LIBRARY "${INFERENCE_ROOT}/lib/lib${INFERENCE_PROJECT_NAME}.so") -else (USE_DYNAMIC_LIBRARY) - set(INFERENCE_LIBRARY "${INFERENCE_ROOT}/lib/lib${INFERENCE_PROJECT_NAME}.a") -endif (USE_DYNAMIC_LIBRARY) - -if (INFERENCE_INCLUDE_DIR AND INFERENCE_LIBRARY) - set(INFERENCE_FOUND true) -endif (INFERENCE_INCLUDE_DIR AND INFERENCE_LIBRARY) - -find_package(BlasEnhance) -find_package(TensorComputing) -find_package(ModelTools) -find_package(Image) - -if (INFERENCE_FOUND) - set(INFERENCE_LIBRARIES "${INFERENCE_LIBRARY};${IMAGE_LIBRARY};${MODEL_TOOLS_LIBRARY};${TENSOR_COMPUTING_LIBRARY};${BLAS_ENHANCE_LIBRARY}") - include_directories(include ${INFERENCE_INCLUDE_DIR}) - message(STATUS "Found ${INFERENCE_PROJECT_NAME}.hpp: ${INFERENCE_INCLUDE_DIR}") - message(STATUS "Found ${INFERENCE_PROJECT_NAME}: ${INFERENCE_LIBRARIES}") -else (INFERENCE_FOUND) - message(FATAL_ERROR " -FATAL: can not find ${INFERENCE_PROJECT_NAME} library in /${INFERENCE_PROJECT_NAME}/[include/lib] directory, - please set shell or cmake environment variable BOLT_ROOT. - ") -endif (INFERENCE_FOUND) - -message(STATUS ${INFERENCE_LIBRARIES}) diff --git a/cmakes/FindModelTools.cmake b/cmakes/FindModelTools.cmake deleted file mode 100644 index 3b283eea..00000000 --- a/cmakes/FindModelTools.cmake +++ /dev/null @@ -1,37 +0,0 @@ -set(MODEL_TOOLS_PROJECT_NAME "model-tools") -unset(MODEL_TOOLS_ROOT) -find_path(MODEL_TOOLS_ROOT NAMES ${MODEL_TOOLS_PROJECT_NAME} HINTS ${BOLT_ROOT} $ENV{BOLT_ROOT}) -set(MODEL_TOOLS_ROOT "${MODEL_TOOLS_ROOT}/${MODEL_TOOLS_PROJECT_NAME}") - -set(MODEL_TOOLS_INCLUDE_DIR "${MODEL_TOOLS_ROOT}/include") -if (USE_DYNAMIC_LIBRARY) - set(MODEL_TOOLS_LIBRARY "${MODEL_TOOLS_ROOT}/lib/lib${MODEL_TOOLS_PROJECT_NAME}.so") -else (USE_DYNAMIC_LIBRARY) - set(MODEL_TOOLS_LIBRARY "${MODEL_TOOLS_ROOT}/lib/lib${MODEL_TOOLS_PROJECT_NAME}.a") -endif (USE_DYNAMIC_LIBRARY) - -if (MODEL_TOOLS_INCLUDE_DIR AND MODEL_TOOLS_LIBRARY) - set(MODEL_TOOLS_FOUND true) -endif (MODEL_TOOLS_INCLUDE_DIR AND MODEL_TOOLS_LIBRARY) - -if (USE_CAFFE) - find_package(ModelToolsCaffe) -endif (USE_CAFFE) -if (USE_ONNX) - find_package(ModelToolsOnnx) -endif(USE_ONNX) -if (USE_TFLITE) - find_package(ModelToolsTFLite) -endif (USE_TFLITE) - -if (MODEL_TOOLS_FOUND) - set(MODEL_TOOLS_LIBRARIES "${MODEL_TOOLS_LIBRARY};${MODEL_TOOLS_CAFFE_LIBRARIES};${MODEL_TOOLS_ONNX_LIBRARIES};${MODEL_TOOLS_TFLITE_LIBRARIES}") - include_directories(include ${MODEL_TOOLS_INCLUDE_DIR}) - message(STATUS "Found ${MODEL_TOOLS_PROJECT_NAME}.h: ${MODEL_TOOLS_INCLUDE_DIR}") - message(STATUS "Found ${MODEL_TOOLS_PROJECT_NAME}: ${MODEL_TOOLS_LIBRARIES}") -else (MODEL_TOOLS_FOUND) - message(FATAL_ERROR " -FATAL: can not find lib${MODEL_TOOLS_PROJECT_NAME}.* library in /model-tools/lib directory, - please set shell or cmake environment variable BOLT_ROOT. 
- ") -endif (MODEL_TOOLS_FOUND) diff --git a/cmakes/FindModelToolsCaffe.cmake b/cmakes/FindModelToolsCaffe.cmake deleted file mode 100644 index dace7a38..00000000 --- a/cmakes/FindModelToolsCaffe.cmake +++ /dev/null @@ -1,33 +0,0 @@ -set(MODEL_TOOLS_PROJECT_NAME "model-tools") -unset(MODEL_TOOLS_ROOT) -find_path(MODEL_TOOLS_ROOT NAMES ${MODEL_TOOLS_PROJECT_NAME} HINTS ${BOLT_ROOT} $ENV{BOLT_ROOT}) -set(MODEL_TOOLS_ROOT "${MODEL_TOOLS_ROOT}/${MODEL_TOOLS_PROJECT_NAME}") - -if (USE_DYNAMIC_LIBRARY) - set(MODEL_TOOLS_CAFFE_LIBRARY "${MODEL_TOOLS_ROOT}/lib/lib${MODEL_TOOLS_PROJECT_NAME}_caffe.so") -else (USE_DYNAMIC_LIBRARY) - set(MODEL_TOOLS_CAFFE_LIBRARY "${MODEL_TOOLS_ROOT}/lib/lib${MODEL_TOOLS_PROJECT_NAME}_caffe.a") -endif (USE_DYNAMIC_LIBRARY) - -if (MODEL_TOOLS_CAFFE_LIBRARY) - set(MODEL_TOOLS_CAFFE_FOUND true) -endif (MODEL_TOOLS_CAFFE_LIBRARY) - -find_package(Protobuf) - -if (MODEL_TOOLS_CAFFE_FOUND) - if (USE_GNU_GCC) - set(MODEL_TOOLS_CAFFE_LIBRARIES "${MODEL_TOOLS_CAFFE_LIBRARY};${Protobuf_LIBRARY};-lpthread") - endif(USE_GNU_GCC) - if (USE_LLVM_CLANG) - set(MODEL_TOOLS_CAFFE_LIBRARIES "${MODEL_TOOLS_CAFFE_LIBRARY};${Protobuf_LIBRARY};-lz") - endif(USE_LLVM_CLANG) - - include_directories(include ${Protobuf_INCLUDE_DIR}) - message(STATUS "Found ${MODEL_TOOLS_PROJECT_NAME}_caffe: ${MODEL_TOOLS_CAFFE_LIBRARY}") -else (MODEL_TOOLS_CAFFE_FOUND) - message(FATAL_ERROR " -FATAL: can not find lib${MODEL_TOOLS_PROJECT_NAME}_caffe.* library in /model-tools/lib directory, - please set shell or cmake environment variable BOLT_ROOT. - ") -endif (MODEL_TOOLS_CAFFE_FOUND) diff --git a/cmakes/FindModelToolsOnnx.cmake b/cmakes/FindModelToolsOnnx.cmake deleted file mode 100644 index c3c597c9..00000000 --- a/cmakes/FindModelToolsOnnx.cmake +++ /dev/null @@ -1,33 +0,0 @@ -set(MODEL_TOOLS_PROJECT_NAME "model-tools") -unset(MODEL_TOOLS_ROOT) -find_path(MODEL_TOOLS_ROOT NAMES ${MODEL_TOOLS_PROJECT_NAME} HINTS ${BOLT_ROOT} $ENV{BOLT_ROOT}) -set(MODEL_TOOLS_ROOT "${MODEL_TOOLS_ROOT}/${MODEL_TOOLS_PROJECT_NAME}") - -if (USE_DYNAMIC_LIBRARY) - set(MODEL_TOOLS_ONNX_LIBRARY "${MODEL_TOOLS_ROOT}/lib/lib${MODEL_TOOLS_PROJECT_NAME}_onnx.so") -else (USE_DYNAMIC_LIBRARY) - set(MODEL_TOOLS_ONNX_LIBRARY "${MODEL_TOOLS_ROOT}/lib/lib${MODEL_TOOLS_PROJECT_NAME}_onnx.a") -endif (USE_DYNAMIC_LIBRARY) - -if (MODEL_TOOLS_ONNX_LIBRARY) - set(MODEL_TOOLS_ONNX_FOUND true) -endif (MODEL_TOOLS_ONNX_LIBRARY) - -find_package(Protobuf) - -if (MODEL_TOOLS_ONNX_FOUND) - if (USE_GNU_GCC) - set(MODEL_TOOLS_ONNX_LIBRARIES "${MODEL_TOOLS_ONNX_LIBRARY};${Protobuf_LIBRARY};-lpthread") - endif(USE_GNU_GCC) - if (USE_LLVM_CLANG) - set(MODEL_TOOLS_ONNX_LIBRARIES "${MODEL_TOOLS_ONNX_LIBRARY};${Protobuf_LIBRARY}") - endif(USE_LLVM_CLANG) - - include_directories(include ${Protobuf_INCLUDE_DIR}) - message(STATUS "Found ${MODEL_TOOLS_PROJECT_NAME}_onnx: ${MODEL_TOOLS_ONNX_LIBRARY}") -else (MODEL_TOOLS_ONNX_FOUND) - message(FATAL_ERROR " -FATAL: can not find lib${MODEL_TOOLS_PROJECT_NAME}_onnx.* library in /model-tools/lib directory, - please set shell or cmake environment variable BOLT_ROOT. 
- ") -endif (MODEL_TOOLS_ONNX_FOUND) diff --git a/cmakes/FindModelToolsTFLite.cmake b/cmakes/FindModelToolsTFLite.cmake deleted file mode 100644 index 66b91d0d..00000000 --- a/cmakes/FindModelToolsTFLite.cmake +++ /dev/null @@ -1,33 +0,0 @@ -set(MODEL_TOOLS_PROJECT_NAME "model-tools") -unset(MODEL_TOOLS_ROOT) -find_path(MODEL_TOOLS_ROOT NAMES ${MODEL_TOOLS_PROJECT_NAME} HINTS ${BOLT_ROOT} $ENV{BOLT_ROOT}) -set(MODEL_TOOLS_ROOT "${MODEL_TOOLS_ROOT}/${MODEL_TOOLS_PROJECT_NAME}") - -if (USE_DYNAMIC_LIBRARY) - set(MODEL_TOOLS_TFLITE_LIBRARY "${MODEL_TOOLS_ROOT}/lib/lib${MODEL_TOOLS_PROJECT_NAME}_tflite.so") -else (USE_DYNAMIC_LIBRARY) - set(MODEL_TOOLS_TFLITE_LIBRARY "${MODEL_TOOLS_ROOT}/lib/lib${MODEL_TOOLS_PROJECT_NAME}_tflite.a") -endif (USE_DYNAMIC_LIBRARY) - -if (MODEL_TOOLS_TFLITE_LIBRARY) - set(MODEL_TOOLS_TFLITE_FOUND true) -endif (MODEL_TOOLS_TFLITE_LIBRARY) - -find_package(FlatBuffers) -find_package(TFLite) - -if (MODEL_TOOLS_TFLITE_FOUND) - include_directories(include ${FlatBuffers_INCLUDE_DIR}) - if (USE_GNU_GCC) - set(MODEL_TOOLS_TFLITE_LIBRARIES "${MODEL_TOOLS_TFLITE_LIBRARY};-lpthread") - endif(USE_GNU_GCC) - if (USE_LLVM_CLANG) - set(MODEL_TOOLS_TFLITE_LIBRARIES "${MODEL_TOOLS_TFLITE_LIBRARY}") - endif(USE_LLVM_CLANG) - message(STATUS "Found ${MODEL_TOOLS_PROJECT_NAME}_tflite: ${MODEL_TOOLS_TFLITE_LIBRARY}") -else (MODEL_TOOLS_TFLITE_FOUND) - message(FATAL_ERROR " -FATAL: can not find lib${MODEL_TOOLS_PROJECT_NAME}_tflite.* library in /model-tools/lib directory, - please set shell or cmake environment variable BOLT_ROOT. - ") -endif (MODEL_TOOLS_TFLITE_FOUND) diff --git a/cmakes/FindTFLite.cmake b/cmakes/FindTFLite.cmake deleted file mode 100644 index 437b1329..00000000 --- a/cmakes/FindTFLite.cmake +++ /dev/null @@ -1,15 +0,0 @@ -find_path(TFLITE_INCLUDE_DIR NAMES schema_generated.h HINTS ${TFLite_ROOT}/include $ENV{TFLite_ROOT}/include) - -if (TFLITE_INCLUDE_DIR) - set(TFLITE_FOUND true) -endif (TFLITE_INCLUDE_DIR) - -if (TFLITE_FOUND) - include_directories(include ${TFLITE_INCLUDE_DIR}) - message(STATUS "Found schema_generated.h: ${TFLITE_INCLUDE_DIR}") -else (TFLITE_FOUND) - message(FATAL_ERROR " -FATAL: can not find schema_generated.h in /include directory, - please set shell environment variable TFLite_ROOT. 
- ") -endif (TFLITE_FOUND) diff --git a/cmakes/FindTensorComputing.cmake b/cmakes/FindTensorComputing.cmake deleted file mode 100644 index e8970f50..00000000 --- a/cmakes/FindTensorComputing.cmake +++ /dev/null @@ -1,29 +0,0 @@ -set(TENSOR_COMPUTING_PROJECT_NAME "tensor_computing") -unset(TENSOR_COMPUTING_ROOT) -find_path(TENSOR_COMPUTING_ROOT NAMES ${TENSOR_COMPUTING_PROJECT_NAME} HINTS ${BOLT_ROOT} $ENV{BOLT_ROOT}) -set(TENSOR_COMPUTING_ROOT "${TENSOR_COMPUTING_ROOT}/${TENSOR_COMPUTING_PROJECT_NAME}") - -set(TENSOR_COMPUTING_INCLUDE_DIR "${TENSOR_COMPUTING_ROOT}/include") -if (USE_DYNAMIC_LIBRARY) - set(TENSOR_COMPUTING_LIBRARY "${TENSOR_COMPUTING_ROOT}/lib/lib${TENSOR_COMPUTING_PROJECT_NAME}.so") -else (USE_DYNAMIC_LIBRARY) - set(TENSOR_COMPUTING_LIBRARY "${TENSOR_COMPUTING_ROOT}/lib/lib${TENSOR_COMPUTING_PROJECT_NAME}.a") -endif (USE_DYNAMIC_LIBRARY) - -if (TENSOR_COMPUTING_INCLUDE_DIR AND TENSOR_COMPUTING_LIBRARY) - set(TENSOR_COMPUTING_FOUND true) -endif (TENSOR_COMPUTING_INCLUDE_DIR AND TENSOR_COMPUTING_LIBRARY) - -find_package(BlasEnhance) - -if (TENSOR_COMPUTING_FOUND) - set(TENSOR_COMPUTING_LIBRARIES "${TENSOR_COMPUTING_LIBRARY};${BLAS_ENHANCE_LIBRARY}") - include_directories(include ${TENSOR_COMPUTING_INCLUDE_DIR}) - message(STATUS "Found ${TENSOR_COMPUTING_PROJECT_NAME}.h: ${TENSOR_COMPUTING_INCLUDE_DIR}") - message(STATUS "Found ${TENSOR_COMPUTING_PROJECT_NAME}: ${TENSOR_COMPUTING_LIBRARIES}") -else (TENSOR_COMPUTING_FOUND) - message(FATAL_ERROR " -FATAL: can not find ${TENSOR_COMPUTING_PROJECT_NAME} library in /${TENSOR_COMPUTING_PROJECT_NAME}/[include/lib] directory, - please set shell or cmake environment variable BOLT_ROOT. - ") -endif (TENSOR_COMPUTING_FOUND) diff --git a/cmakes/FindUni.cmake b/cmakes/FindUni.cmake deleted file mode 100644 index a2084b1b..00000000 --- a/cmakes/FindUni.cmake +++ /dev/null @@ -1,18 +0,0 @@ -find_path(UNI_ROOT NAMES uni HINTS ${BOLT_ROOT} $ENV{BOLT_ROOT}) -set(UNI_ROOT "${UNI_ROOT}/uni") - -set(UNI_INCLUDE_DIR "${UNI_ROOT}/include") - -if (UNI_INCLUDE_DIR) - set(UNI_FOUND true) -endif (UNI_INCLUDE_DIR) - -if (UNI_FOUND) - include_directories(include ${UNI_INCLUDE_DIR}) - message(STATUS "Found type.h: ${UNI_INCLUDE_DIR}") -else (UNI_FOUND) - message(FATAL_ERROR " -FATAL: can not find uni library in /uni/[include/lib] directory, - please set shell or cmake environment variable BOLT_ROOT. - ") -endif (UNI_FOUND) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt new file mode 100644 index 00000000..21a7eb10 --- /dev/null +++ b/common/CMakeLists.txt @@ -0,0 +1,18 @@ +cmake_minimum_required(VERSION 3.2) + +file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake) +if (BOLT_CONFIGURE_FILE) + include(${BOLT_CONFIGURE_FILE}) +else (BOLT_CONFIGURE_FILE) + message(FATAL_ERROR " +FATAL: can not find bolt.cmake in /common/cmakes directory, + please set shell or cmake environment variable BOLT_ROOT. 
+ ") +endif (BOLT_CONFIGURE_FILE) + +project(common) + +add_subdirectory(uni) +if (USE_MALI) + add_subdirectory(gcl) +endif (USE_MALI) diff --git a/common/cmakes/FindFFTW.cmake b/common/cmakes/FindFFTW.cmake new file mode 100644 index 00000000..e2f59c4a --- /dev/null +++ b/common/cmakes/FindFFTW.cmake @@ -0,0 +1,23 @@ +find_path(FFTW_INCLUDE_DIR NAMES fftw3.h HINTS $ENV{FFTW_ROOT}/include ${FFTW_ROOT}/include) + +if (USE_DYNAMIC_LIBRARY) + find_library(FFTW_LIBRARY NAMES libfftw3f.so HINTS $ENV{FFTW_ROOT}/lib ${FFTW_ROOT}/lib) +else (USE_DYNAMIC_LIBRARY) + find_library(FFTW_LIBRARY NAMES libfftw3f.a HINTS $ENV{FFTW_ROOT}/lib ${FFTW_ROOT}/lib) +endif (USE_DYNAMIC_LIBRARY) + +if (FFTW_INCLUDE_DIR AND FFTW_LIBRARY) + set(FFTW_FOUND true) +endif (FFTW_INCLUDE_DIR AND FFTW_LIBRARY) + +if (FFTW_FOUND) + include_directories(${FFTW_INCLUDE_DIR}) + set(FFTW_LIBRARIES "${FFTW_LIBRARY}") + message(STATUS "Found fftw3f.h: ${FFTW_INCLUDE_DIR}") + message(STATUS "Found fftw3: ${FFTW_LIBRARIES}") +else (FFTW_FOUND) + message(FATAL_ERROR " +FATAL: can not find fftw library in /[include|lib] directory, + please set shell environment variable FFTW_ROOT. + ") +endif (FFTW_FOUND) diff --git a/cmakes/FindFlatBuffers.cmake b/common/cmakes/FindFlatBuffers.cmake similarity index 88% rename from cmakes/FindFlatBuffers.cmake rename to common/cmakes/FindFlatBuffers.cmake index fc61fc8c..9fa15283 100644 --- a/cmakes/FindFlatBuffers.cmake +++ b/common/cmakes/FindFlatBuffers.cmake @@ -1,5 +1,5 @@ -find_path(FlatBuffers_INCLUDE_DIR NAMES flatbuffers/flatbuffers.h HINTS ${FlatBuffers_ROOT}/include - $ENV{FlatBuffers_ROOT}/include +find_path(FlatBuffers_INCLUDE_DIR NAMES flatbuffers/flatbuffers.h HINTS $ENV{FlatBuffers_ROOT}/include + ${FlatBuffers_ROOT}/include /usr/local/include) if (FlatBuffers_INCLUDE_DIR) diff --git a/common/cmakes/FindGcl.cmake b/common/cmakes/FindGcl.cmake new file mode 100644 index 00000000..8fab6e42 --- /dev/null +++ b/common/cmakes/FindGcl.cmake @@ -0,0 +1,56 @@ +find_path(GCL_ROOT NAMES gcl HINTS ${BOLT_ROOT}/common $ENV{BOLT_ROOT}/common) +set(GCL_ROOT "${GCL_ROOT}/gcl") + +set(GCL_INCLUDE_DIR "${GCL_ROOT}/include") + +if (GCL_INCLUDE_DIR) + set(GCL_FOUND true) +endif (GCL_INCLUDE_DIR) + +if (GCL_FOUND) + include_directories(${GCL_INCLUDE_DIR}) + message(STATUS "Found gcl.h: ${GCL_INCLUDE_DIR}") +else (GCL_FOUND) + message(FATAL_ERROR " +FATAL: can not find gcl.h in /gcl/include directory, + please set shell or cmake environment variable BOLT_ROOT. + ") +endif (GCL_FOUND) + +find_package(OpenCL) + +set(GCL_KERNELSOURCE_INCLUDE_DIR "${GCL_ROOT}/tools/kernel_source_compile/include") + +if (${USE_DYNAMIC_LIBRARY} STREQUAL "OFF") + set(GCL_KERNELSOURCE_LIBRARY "${GCL_ROOT}/tools/kernel_source_compile/lib/libkernelsource.a") +else (${USE_DYNAMIC_LIBRARY} STREQUAL "OFF") + set(GCL_KERNELSOURCE_LIBRARY "${GCL_ROOT}/tools/kernel_source_compile/lib/libkernelsource.so") +endif (${USE_DYNAMIC_LIBRARY} STREQUAL "OFF") + +if(GCL_KERNELSOURCE_INCLUDE_DIR) + set(KERNELSOURCE_INCLUDE_FOUND true) +endif(GCL_KERNELSOURCE_INCLUDE_DIR) + +if(GCL_KERNELSOURCE_LIBRARY) + set(KERNELSOURCE_LIB_FOUND true) +endif(GCL_KERNELSOURCE_LIBRARY) + +if (KERNELSOURCE_INCLUDE_FOUND) + include_directories(${GCL_KERNELSOURCE_INCLUDE_DIR}) + message(STATUS "Found libkernelsource.h: ${GCL_KERNELSOURCE_INCLUDE_DIR}") +else (KERNELSOURCE_INCLUDE_FOUND) + message(FATAL_ERROR " +FATAL: can not find libkernelsource.h in /gcl/tools/kernel_source_compile/include/ directory, + please set shell or cmake environment variable BOLT_ROOT. 
+ ") +endif (KERNELSOURCE_INCLUDE_FOUND) + +if (KERNELSOURCE_LIB_FOUND) + set(KERNELSOURCE_LIBRARIES "${GCL_KERNELSOURCE_LIBRARY}") + message(STATUS "Found kernelsource: ${KERNELSOURCE_LIBRARIES}") +else (KERNELSOURCE_LIB_FOUND) + message(FATAL_ERROR " +FATAL: can not find libkernelsource.a in /gcl/tools/kernel_source_compile/lib directory, + please set shell or cmake environment variable BOLT_ROOT. + ") +endif (KERNELSOURCE_LIB_FOUND) diff --git a/common/cmakes/FindJNI.cmake b/common/cmakes/FindJNI.cmake new file mode 100644 index 00000000..ae7e662d --- /dev/null +++ b/common/cmakes/FindJNI.cmake @@ -0,0 +1,14 @@ +find_path(JNI_INCLUDE_DIR NAMES jni.h HINTS $ENV{JNI_ROOT}/include ${JNI_ROOT}/include $ENV{JNI_ROOT}/include/linux ${JNI_ROOT}/include/linux) +find_path(JNI_MD_INCLUDE_DIR NAMES jni_md.h HINTS $ENV{JNI_ROOT}/include ${JNI_ROOT}/include $ENV{JNI_ROOT}/include/linux ${JNI_ROOT}/include/linux) + +if (JNI_INCLUDE_DIR AND JNI_MD_INCLUDE_DIR) + set(JNI_FOUND true) +else (JNI_INCLUDE_DIR AND JNI_MD_INCLUDE_DIR) + set(JNI_FOUND false) +endif (JNI_INCLUDE_DIR AND JNI_MD_INCLUDE_DIR) + +if (JNI_FOUND) + message(STATUS "Found jni.h: ${JNI_INCLUDE_DIR}") +else (JNI_FOUND) + message(WARNING "WARNING: can not find jni.h/jni_md.h in /include or /include/linux directory, so can not use Java API. If you want to use Java API, please set shell or cmake environment variable JNI_ROOT.") +endif (JNI_FOUND) diff --git a/cmakes/FindOpenCL.cmake b/common/cmakes/FindOpenCL.cmake similarity index 77% rename from cmakes/FindOpenCL.cmake rename to common/cmakes/FindOpenCL.cmake index f77b9f0d..8b8fdf2a 100644 --- a/cmakes/FindOpenCL.cmake +++ b/common/cmakes/FindOpenCL.cmake @@ -1,6 +1,6 @@ -find_path(OPENCL_INCLUDE_DIR NAMES CL/cl.h HINTS ${OpenCL_ROOT}/include $ENV{OpenCL_ROOT}/include /usr/local/include) -find_path(OPENCL_LIB_DIR NAMES libOpenCL.so HINTS ${OpenCL_ROOT}/lib64 $ENV{OpenCL_ROOT}/lib64 /usr/local/lib) -find_path(GLES_MALI_LIB_DIR NAMES libGLES_mali.so HINT ${OpenCL_ROOT}/lib64 $ENV{OpenCL_ROOT}/lib64 /usr/local/lib) +find_path(OPENCL_INCLUDE_DIR NAMES CL/cl.h HINTS $ENV{OpenCL_ROOT}/include ${OpenCL_ROOT}/include /usr/local/include) +find_path(OPENCL_LIB_DIR NAMES libOpenCL.so HINTS $ENV{OpenCL_ROOT}/lib64 ${OpenCL_ROOT}/lib64 /usr/local/lib) +find_path(GLES_MALI_LIB_DIR NAMES libGLES_mali.so HINT $ENV{OpenCL_ROOT}/lib64 ${OpenCL_ROOT}/lib64 /usr/local/lib) if (OPENCL_INCLUDE_DIR) set(OPENCL_HEAD_FOUND true) @@ -19,7 +19,6 @@ if(GLES_MALI_LIB_DIR) endif(GLES_MALI_LIB_DIR) if (OPENCL_HEAD_FOUND) - include_directories(include ${OPENCL_INCLUDE_DIR}) message(STATUS "Found CL/cl.h: ${OPENCL_INCLUDE_DIR}") else (OPENCL_HEAD_FOUND) message(FATAL_ERROR " diff --git a/cmakes/FindProtobuf.cmake b/common/cmakes/FindProtobuf.cmake similarity index 84% rename from cmakes/FindProtobuf.cmake rename to common/cmakes/FindProtobuf.cmake index 3ca86118..d6e4bafa 100644 --- a/cmakes/FindProtobuf.cmake +++ b/common/cmakes/FindProtobuf.cmake @@ -1,12 +1,20 @@ -find_path(Protobuf_INCLUDE_DIR NAMES google/protobuf/service.h HINTS ${Protobuf_ROOT}/include $ENV{Protobuf_ROOT}/include) +if (Protobuf_FOUND) + return() +endif (Protobuf_FOUND) + +find_path(Protobuf_INCLUDE_DIR NAMES google/protobuf/service.h HINTS $ENV{Protobuf_ROOT}/include ${Protobuf_ROOT}/include) if (USE_DYNAMIC_LIBRARY) - find_library(Protobuf_LIBRARY NAMES libprotobuf.so HINTS ${Protobuf_ROOT}/lib $ENV{Protobuf_ROOT}/lib) + if (USE_IOS_CLANG) + find_library(Protobuf_LIBRARY NAMES libprotobuf.dylib HINTS $ENV{Protobuf_ROOT}/lib 
${Protobuf_ROOT}/lib) + else (USE_IOS_CLANG) + find_library(Protobuf_LIBRARY NAMES libprotobuf.so HINTS $ENV{Protobuf_ROOT}/lib ${Protobuf_ROOT}/lib) + endif (USE_IOS_CLANG) else (USE_DYNAMIC_LIBRARY) - find_library(Protobuf_LIBRARY NAMES libprotobuf.a HINTS ${Protobuf_ROOT}/lib $ENV{Protobuf_ROOT}/lib) + find_library(Protobuf_LIBRARY NAMES libprotobuf.a HINTS $ENV{Protobuf_ROOT}/lib ${Protobuf_ROOT}/lib) endif (USE_DYNAMIC_LIBRARY) -find_program(Protobuf_PROTOC_EXECUTABLE NAMES protoc HINTS ${Protobuf_ROOT}/bin $ENV{Protobuf_ROOT}/bin) +find_program(Protobuf_PROTOC_EXECUTABLE NAMES protoc HINTS $ENV{Protobuf_ROOT}/bin ${Protobuf_ROOT}/bin) #set(Protobuf_DEBUG ON) if (Protobuf_INCLUDE_DIR AND Protobuf_LIBRARY AND Protobuf_PROTOC_EXECUTABLE) @@ -50,12 +58,11 @@ if (Protobuf_FOUND) message(STATUS "Found Protobuf: ${Protobuf_LIBRARY}") else (Protobuf_FOUND) message(FATAL_ERROR " -FATAL: can not find protobuf library in /uni/[include/lib] directory, +FATAL: can not find protobuf library in /[include/lib] directory, please set shell environment variable Protobuf_ROOT. ") endif (Protobuf_FOUND) - function(protobuf_generate) set(_options APPEND_PATH DESCRIPTORS) set(_singleargs LANGUAGE OUT_VAR EXPORT_MACRO PROTOC_OUT_DIR) @@ -150,7 +157,8 @@ function(protobuf_generate) set(_generated_srcs) foreach(_ext ${protobuf_generate_GENERATE_EXTENSIONS}) - list(APPEND _generated_srcs "${protobuf_generate_PROTOC_OUT_DIR}/${_possible_rel_dir}${_basename}${_ext}") + #list(APPEND _generated_srcs "${protobuf_generate_PROTOC_OUT_DIR}/${_possible_rel_dir}${_basename}${_ext}") + list(APPEND _generated_srcs "${protobuf_generate_PROTOC_OUT_DIR}/${_basename}${_ext}") endforeach() if(protobuf_generate_DESCRIPTORS AND protobuf_generate_LANGUAGE STREQUAL cpp) @@ -230,38 +238,38 @@ endfunction() function(PROTOBUF_GENERATE_PYTHON SRCS) - if(NOT ARGN) - message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files") - return() - endif() + if(NOT ARGN) + message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files") + return() + endif() - if(PROTOBUF_GENERATE_CPP_APPEND_PATH) - set(_append_arg APPEND_PATH) - endif() + if(PROTOBUF_GENERATE_CPP_APPEND_PATH) + set(_append_arg APPEND_PATH) + endif() - if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS) - set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}") - endif() + if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS) + set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}") + endif() - if(DEFINED Protobuf_IMPORT_DIRS) - set(_import_arg IMPORT_DIRS ${Protobuf_IMPORT_DIRS}) - endif() + if(DEFINED Protobuf_IMPORT_DIRS) + set(_import_arg IMPORT_DIRS ${Protobuf_IMPORT_DIRS}) + endif() - set(_outvar) - protobuf_generate(${_append_arg} LANGUAGE python OUT_VAR _outvar ${_import_arg} PROTOS ${ARGN}) - set(${SRCS} ${_outvar} PARENT_SCOPE) + set(_outvar) + protobuf_generate(${_append_arg} LANGUAGE python OUT_VAR _outvar ${_import_arg} PROTOS ${ARGN}) + set(${SRCS} ${_outvar} PARENT_SCOPE) endfunction() if(Protobuf_INCLUDE_DIR) if(Protobuf_PROTOC_EXECUTABLE) - if(NOT TARGET protobuf::protoc) - add_executable(protobuf::protoc IMPORTED) - if(EXISTS "${Protobuf_PROTOC_EXECUTABLE}") - set_target_properties(protobuf::protoc PROPERTIES - IMPORTED_LOCATION "${Protobuf_PROTOC_EXECUTABLE}") - endif() + if(NOT TARGET protobuf::protoc) + add_executable(protobuf::protoc IMPORTED) + if(EXISTS "${Protobuf_PROTOC_EXECUTABLE}") + set_target_properties(protobuf::protoc PROPERTIES + IMPORTED_LOCATION 
"${Protobuf_PROTOC_EXECUTABLE}") endif() + endif() endif() endif() diff --git a/common/cmakes/FindTFLite.cmake b/common/cmakes/FindTFLite.cmake new file mode 100644 index 00000000..5dcaf1cc --- /dev/null +++ b/common/cmakes/FindTFLite.cmake @@ -0,0 +1,16 @@ +find_path(TFLITE_INCLUDE_DIR NAMES tensorflow/lite/schema/schema_generated.h HINTS $ENV{TFLite_ROOT}/include ${TFLite_ROOT}/include) + +if (TFLITE_INCLUDE_DIR) + set(TFLITE_FOUND true) +endif (TFLITE_INCLUDE_DIR) +find_package(FlatBuffers) + +if (TFLITE_FOUND) + message(STATUS "Found tensorflow/lite/schema/schema_generated.h: ${TFLITE_INCLUDE_DIR}") + set(TFLITE_INCLUDE_DIR "${TFLITE_INCLUDE_DIR};${FlatBuffers_INCLUDE_DIR}") +else (TFLITE_FOUND) + message(FATAL_ERROR " +FATAL: can not find tensorflow/lite/schema/schema_generated.h in /include directory, + please set shell environment variable TFLite_ROOT. + ") +endif (TFLITE_FOUND) diff --git a/cmakes/Findjpeg.cmake b/common/cmakes/Findjpeg.cmake similarity index 53% rename from cmakes/Findjpeg.cmake rename to common/cmakes/Findjpeg.cmake index 10c2b766..fa5e8ae7 100644 --- a/cmakes/Findjpeg.cmake +++ b/common/cmakes/Findjpeg.cmake @@ -1,9 +1,13 @@ -find_path(JPEG_INCLUDE_DIR NAMES jpeglib.h HINTS ${JPEG_ROOT}/include $ENV{JPEG_ROOT}/include) +find_path(JPEG_INCLUDE_DIR NAMES jpeglib.h HINTS $ENV{JPEG_ROOT}/include ${JPEG_ROOT}/include) if (USE_DYNAMIC_LIBRARY) - find_library(JPEG_LIBRARY NAMES libjpeg.so HINTS ${JPEG_ROOT}/lib $ENV{JPEG_ROOT}/lib) + if (USE_IOS_CLANG) + find_library(JPEG_LIBRARY NAMES libjpeg.dylib HINTS $ENV{JPEG_ROOT}/lib ${JPEG_ROOT}/lib) + else (USE_IOS_CLANG) + find_library(JPEG_LIBRARY NAMES libjpeg.so HINTS $ENV{JPEG_ROOT}/lib ${JPEG_ROOT}/lib) + endif (USE_IOS_CLANG) else (USE_DYNAMIC_LIBRARY) - find_library(JPEG_LIBRARY NAMES libjpeg.a HINTS ${JPEG_ROOT}/lib $ENV{JPEG_ROOT}/lib) + find_library(JPEG_LIBRARY NAMES libjpeg.a HINTS $ENV{JPEG_ROOT}/lib ${JPEG_ROOT}/lib) endif (USE_DYNAMIC_LIBRARY) if (JPEG_INCLUDE_DIR AND JPEG_LIBRARY) @@ -11,7 +15,6 @@ if (JPEG_INCLUDE_DIR AND JPEG_LIBRARY) endif (JPEG_INCLUDE_DIR AND JPEG_LIBRARY) if (JPEG_FOUND) - include_directories(include ${JPEG_INCLUDE_DIR}) message(STATUS "Found jpeglib.h: ${JPEG_INCLUDE_DIR}") message(STATUS "Found jpeg: ${JPEG_LIBRARY}") else (JPEG_FOUND) diff --git a/common/cmakes/Findjsoncpp.cmake b/common/cmakes/Findjsoncpp.cmake new file mode 100644 index 00000000..ba4ae1d5 --- /dev/null +++ b/common/cmakes/Findjsoncpp.cmake @@ -0,0 +1,25 @@ +find_path(JSONCPP_INCLUDE_DIR NAMES json/json.h HINTS $ENV{JSONCPP_ROOT}/include ${JSONCPP_ROOT}/include) + +if (USE_DYNAMIC_LIBRARY) + if (USE_IOS_CLANG) + find_library(JSONCPP_LIBRARY NAMES libjsoncpp.dylib HINTS $ENV{JSONCPP_ROOT}/lib ${JSONCPP_ROOT}/lib) + else (USE_IOS_CLANG) + find_library(JSONCPP_LIBRARY NAMES libjsoncpp.so HINTS $ENV{JSONCPP_ROOT}/lib ${JSONCPP_ROOT}/lib) + endif (USE_IOS_CLANG) +else (USE_DYNAMIC_LIBRARY) + find_library(JSONCPP_LIBRARY NAMES libjsoncpp.a HINTS $ENV{JSONCPP_ROOT}/lib ${JSONCPP_ROOT}/lib) +endif (USE_DYNAMIC_LIBRARY) + +if (JSONCPP_INCLUDE_DIR AND JSONCPP_LIBRARY) + set(JSONCPP_FOUND true) +endif (JSONCPP_INCLUDE_DIR AND JSONCPP_LIBRARY) + +if (JSONCPP_FOUND) + message(STATUS "Found jsoncpplib.h: ${JSONCPP_INCLUDE_DIR}") + message(STATUS "Found jsoncpp: ${JSONCPP_LIBRARY}") +else (JSONCPP_FOUND) + message(FATAL_ERROR " +FATAL: can not find jsoncpp library in /[include|lib] directory, + please set shell environment variable JSONCPP_ROOT. 
+ ") +endif (JSONCPP_FOUND) diff --git a/common/cmakes/bolt.cmake b/common/cmakes/bolt.cmake new file mode 100644 index 00000000..5d4bae6c --- /dev/null +++ b/common/cmakes/bolt.cmake @@ -0,0 +1,412 @@ +option(USE_CROSS_COMPILE "set use cross compile or not" ON) +option(USE_GNU_GCC "set use GNU gcc compiler or not" OFF) +option(USE_LLVM_CLANG "set use LLVM clang compiler or not" OFF) +option(USE_IOS_CLANG "set use ios compiler or not" OFF) +option(USE_DYNAMIC_LIBRARY "set use dynamic library or not" OFF) +option(USE_MINSIZEREL ".so lib will be 300KB smaller but performance will be affected" OFF) + +option(USE_ANDROID_LOG "set use Android log or not" OFF) +option(USE_DEBUG "set use debug information or not" OFF) +option(USE_PROFILE "set use profile information or not" OFF) +option(USE_PROFILE_STATISTICS "set use profile statistics information or not" OFF) +option(USE_THREAD_SAFE "set use thread safe or not" OFF) + +# model_tools variable +option(USE_CAFFE "set use caffe model as input or not" ON) +option(USE_ONNX "set use onnx model as input or not" ON) +option(USE_TFLITE "set use tflite model as input or not" ON) +option(USE_TENSORFLOW "set use tensorflow model as input or not" ON) + +# blas_enhance tensor +option(USE_GENERAL "set use CPU serial code or not" ON) +option(USE_X86 "set use X86 instruction or not" OFF) +option(USE_NEON "set use ARM NEON instruction or not" OFF) +option(USE_ARMV7 "set use ARMv7 NEON instruction or not" OFF) +option(USE_ARMV8 "set use ARMv8 NEON instruction or not" ON) +option(USE_MALI "set use mali for parallel or not" OFF) +option(USE_FP32 "set use ARM NEON FP32 instruction or not" ON) +option(USE_FP16 "set use ARM NEON FP16 instruction or not" ON) +option(USE_F16_MIX_PRECISION "set use ARM NEON mix precision f16/f32 instruction or not" ON) +option(USE_INT8 "set use ARM NEON INT8 instruction or not" ON) + +option(USE_OPENMP "set use openmp to run test(tinybert) or not" OFF) +option(USE_LIBRARY_TUNING "set use algorithm tuning or not" OFF) +option(USE_FLOW "set whether to use flow or not" ON) + +option(BUILD_TEST "set to build unit test or not" OFF) + +set(BOLT_ROOT $ENV{BOLT_ROOT}) + +function (set_policy) + if (POLICY CMP0074) + cmake_policy(SET CMP0074 NEW) + endif() +endfunction(set_policy) + +macro (set_c_cxx_flags) + set(COMMON_FLAGS "-W -Wall -Wextra -Wno-unused-command-line-argument -Wno-unused-parameter -O3 -fPIC -fstack-protector") + + if (USE_OPENMP) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_OPENMP -fopenmp") + endif(USE_OPENMP) + + if (USE_LIBRARY_TUNING) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_LIBRARY_TUNING") + endif(USE_LIBRARY_TUNING) + + if (BUILD_TEST) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_BUILD_TEST") + endif(BUILD_TEST) + + if (USE_DEBUG) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_DEBUG") + endif(USE_DEBUG) + + if (USE_JNI) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_JNI") + endif(USE_JNI) + + if (USE_LLVM_CLANG AND USE_ANDROID_LOG) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_ANDROID_LOG -llog") + endif(USE_LLVM_CLANG AND USE_ANDROID_LOG) + + if (USE_PROFILE) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_PROFILE") + endif(USE_PROFILE) + + if (USE_PROFILE_STATISTICS) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_PROFILE_STATISTICS") + endif(USE_PROFILE_STATISTICS) + + if (USE_THREAD_SAFE) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_THREAD_SAFE") + endif(USE_THREAD_SAFE) + + if (USE_GENERAL) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_GENERAL") + endif(USE_GENERAL) + + if (USE_MALI) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_MALI") + endif(USE_MALI) + + if 
(USE_X86) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_X86 -mavx2 -mfma") + endif(USE_X86) + + if (USE_IOS_CLANG) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_IOS") + endif(USE_IOS_CLANG) + + if (USE_FP32) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_FP32") + endif (USE_FP32) + + if (USE_NEON) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_NEON") + + if (USE_ARMV8) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_ARMV8") + endif (USE_ARMV8) + + if (USE_ARMV7) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_ARMV7 -march=armv7-a -mfloat-abi=softfp -mfpu=neon-vfpv4") + if (USE_LLVM_CLANG) + set(COMMON_FLAGS "${COMMON_FLAGS} -Wl,--allow-multiple-definition") + endif (USE_LLVM_CLANG) + endif (USE_ARMV7) + + if (USE_FP16) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_FP16") + if (USE_F16_MIX_PRECISION) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_F16_MIX_PRECISION") + endif (USE_F16_MIX_PRECISION) + if (USE_INT8) + if (USE_LLVM_CLANG) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_INT8 -march=armv8-a+fp16+dotprod") + else (USE_LLVM_CLANG) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_INT8 -march=armv8.2-a+fp16+dotprod") + endif (USE_LLVM_CLANG) + else (USE_INT8) + if (USE_LLVM_CLANG) + set(COMMON_FLAGS "${COMMON_FLAGS} -march=armv8-a+fp16") + else (USE_LLVM_CLANG) + set(COMMON_FLAGS "${COMMON_FLAGS} -march=armv8.2-a+fp16") + endif (USE_LLVM_CLANG) + endif (USE_INT8) + endif (USE_FP16) + endif(USE_NEON) + + if (USE_CAFFE) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_CAFFE") + endif() + if (USE_ONNX) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_ONNX") + endif() + if (USE_TFLITE) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_TFLITE") + endif() + if (USE_TENSORFLOW) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_TENSORFLOW") + endif() + + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS}") + if (USE_X86) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + endif (USE_X86) + + if (USE_DEBUG) + set(CMAKE_BUILD_TYPE "Debug") + elseif (USE_MINSIZEREL) + set(CMAKE_BUILD_TYPE "MinSizeRel") + endif (USE_DEBUG) +endmacro(set_c_cxx_flags) + +macro (set_test_c_cxx_flags) + if (${USE_DYNAMIC_LIBRARY} STREQUAL "OFF") + if (USE_CROSS_COMPILE) + if (USE_GNU_GCC) + set(COMMON_FLAGS "${COMMON_FLAGS} -static") + endif(USE_GNU_GCC) + endif(USE_CROSS_COMPILE) + endif(${USE_DYNAMIC_LIBRARY} STREQUAL "OFF") + + if (USE_IOS_CLANG) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_IOS") + endif(USE_IOS_CLANG) + + if (USE_LLVM_CLANG) + if (${USE_DYNAMIC_LIBRARY} STREQUAL "OFF") + set(COMMON_FLAGS "${COMMON_FLAGS} -Wl,-allow-shlib-undefined, -static-libstdc++") + else (${USE_DYNAMIC_LIBRARY} STREQUAL "OFF") + set(COMMON_FLAGS "${COMMON_FLAGS} -Wl,-allow-shlib-undefined") + endif(${USE_DYNAMIC_LIBRARY} STREQUAL "OFF") + endif(USE_LLVM_CLANG) + + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS}") +endmacro (set_test_c_cxx_flags) + +macro (set_project_install_directory) + set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/bin) + set(LIBRARY_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/lib) +endmacro (set_project_install_directory) + +if(USE_DYNAMIC_LIBRARY) + set(uni_library uni) + set(gcl_library gcl) + set(kernelsource_library kernelsource) + set(blas_enhance_library blas_enhance) + set(tensor_library tensor) + set(image_library image) + set(model_tools_caffe_library model_tools_caffe) + set(model_tools_onnx_library model_tools_onnx) + set(model_tools_tflite_library model_tools_tflite) + set(model_tools_tensorflow_library model_tools_tensorflow) + 
set(model_tools_library model_tools) + set(engine_library engine) + set(flow_library flow) +else() + set(uni_library uni_static) + set(gcl_library gcl_static) + set(kernelsource_library kernelsource_static) + set(blas_enhance_library blas_enhance_static) + set(tensor_library tensor_static) + set(image_library image_static) + set(model_tools_caffe_library model_tools_caffe_static) + set(model_tools_onnx_library model_tools_onnx_static) + set(model_tools_tflite_library model_tools_tflite_static) + set(model_tools_tensorflow_library model_tools_tensorflow_static) + set(model_tools_library model_tools_static) + set(engine_library engine_static) + set(flow_library flow_static) +endif() + +macro(include_uni) + include_directories(${BOLT_ROOT}/common/uni/include) +endmacro() + +macro(link_uni name) + target_link_libraries(${name} ${uni_library}) + if (USE_THREAD_SAFE AND USE_GNU_GCC) + target_link_libraries(${name} -lpthread) + endif (USE_THREAD_SAFE AND USE_GNU_GCC) +endmacro() + +macro(include_gcl) + include_directories(${BOLT_ROOT}/common/gcl/include) + include_directories(${BOLT_ROOT}/common/gcl/tools/kernel_source_compile/include) + include_directories(${OPENCL_INCLUDE_DIR}) + include_uni() +endmacro() + +macro(link_opencl name) + if (USE_MALI) + target_link_libraries(${name} ${OPENCL_LIBRARIES}) + endif(USE_MALI) +endmacro() + +macro(link_gcl name) + if (USE_MALI) + target_link_libraries(${name} ${gcl_library} ${kernelsource_library}) + link_opencl(${name}) + endif (USE_MALI) +endmacro() + +macro(include_memory) + include_directories(${BOLT_ROOT}/common/memory/include) + include_uni() + include_gcl() +endmacro() + +macro(include_blas_enhance) + include_directories(${BOLT_ROOT}/compute/blas_enhance/include) + include_uni() +endmacro() + +macro(link_blas_enhance name) + target_link_libraries(${name} ${blas_enhance_library}) + link_uni(${name}) +endmacro() + +macro(include_tensor) + include_directories(${BOLT_ROOT}/compute/tensor/include) + include_blas_enhance() + include_gcl() + include_memory() +endmacro() + +macro(link_tensor name) + target_link_libraries(${name} ${tensor_library} ${blas_enhance_library}) + link_blas_enhance(${name}) + link_gcl(${name}) +endmacro() + +macro(include_image) + include_directories(${BOLT_ROOT}/compute/image/include) + include_tensor() +endmacro() + +macro(link_image name) + target_link_libraries(${name} ${image_library}) + link_tensor(${name}) +endmacro() + +macro(include_protobuf) + include_directories(${Protobuf_INCLUDE_DIR}) +endmacro() + +macro(link_protobuf name) + target_link_libraries(${name} ${Protobuf_LIBRARY}) + if (USE_GNU_GCC) + target_link_libraries(${name} ${Protobuf_LIBRARY} -lpthread) + endif(USE_GNU_GCC) + if (USE_LLVM_CLANG) + target_link_libraries(${name} ${Protobuf_LIBRARY} -lz) + endif(USE_LLVM_CLANG) +endmacro() + +macro(include_model_tools) + include_directories(${BOLT_ROOT}/model_tools/include) + include_uni() +endmacro() + +macro(link_model_tools name) + target_link_libraries(${name} ${model_tools_library}) + if(USE_CAFFE) + target_link_libraries(${name} ${model_tools_caffe_library}) + endif() + if(USE_ONNX) + target_link_libraries(${name} ${model_tools_onnx_library}) + endif() + if(USE_ONNX) + target_link_libraries(${name} ${model_tools_tflite_library}) + endif() + if(USE_TENSORFLOW) + target_link_libraries(${name} ${model_tools_tensorflow_library}) + target_link_libraries(${name} ${JSONCPP_LIBRARY}) + endif() + if(USE_CAFFE OR USE_ONNX) + link_protobuf(${name}) + endif() + link_uni(${name}) +endmacro() + +macro(model_tools_test name 
src_name)
+    include_directories(${BOLT_ROOT}/model_tools/include)
+    add_executable(${name} ${src_name})
+    link_model_tools(${name})
+endmacro()
+
+macro(include_engine)
+    if (BUILD_TEST)
+        include_directories(${JPEG_INCLUDE_DIR})
+    endif (BUILD_TEST)
+    include_directories(${BOLT_ROOT}/inference/engine/include)
+    if (USE_JNI)
+        include_directories(${JNI_INCLUDE_DIR})
+        include_directories(${JNI_MD_INCLUDE_DIR})
+    endif (USE_JNI)
+    include_model_tools()
+    include_tensor()
+    include_image()
+endmacro()
+
+macro(link_engine name)
+    target_link_libraries(${name} ${engine_library})
+    if (BUILD_TEST)
+        target_link_libraries(${name} ${JPEG_LIBRARY})
+    endif (BUILD_TEST)
+    link_model_tools(${name})
+    target_link_libraries(${name} ${image_library} ${tensor_library} ${blas_enhance_library})
+    link_gcl(${name})
+    link_uni(${name})
+endmacro()
+
+macro(engine_test name src_name)
+    add_executable(${name} ${src_name})
+    link_engine(${name})
+endmacro()
+
+macro(include_flow)
+    include_directories(${BOLT_ROOT}/inference/flow/include)
+    include_engine()
+endmacro()
+
+macro(flow_test name src_name)
+    include_protobuf()
+    include_directories(${BOLT_ROOT}/flow/include)
+    if ("${name}" STREQUAL "flow_asr")
+        set_policy()
+        find_package(FFTW)
+        add_executable(${name} ${src_name})
+        target_link_libraries(${name} ${FFTW_LIBRARIES})
+    else ()
+        add_executable(${name} ${src_name})
+    endif()
+    target_link_libraries(${name} ${flow_library})
+    link_engine(${name})
+    link_protobuf(${name})
+    add_dependencies(${name} flow.pb.h)
+endmacro()
+
+macro(include_train)
+    include_model_tools()
+    include_tensor()
+    include_image()
+endmacro()
+
+macro(link_train name)
+    target_link_libraries(${name} RaulLib)
+    link_model_tools(${name})
+    target_link_libraries(${name} ${image_library} ${tensor_library} ${blas_enhance_library})
+    link_gcl(${name})
+    link_uni(${name})
+endmacro()
+
+macro(train_test name src_name)
+    include_directories(${BOLT_ROOT}/training/include)
+    add_executable(${name} ${src_name})
+    link_train(${name})
+endmacro()
diff --git a/common/gcl/CMakeLists.txt b/common/gcl/CMakeLists.txt
new file mode 100644
index 00000000..8b44ba49
--- /dev/null
+++ b/common/gcl/CMakeLists.txt
@@ -0,0 +1,20 @@
+cmake_minimum_required(VERSION 3.2)
+
+file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake)
+if (BOLT_CONFIGURE_FILE)
+    include(${BOLT_CONFIGURE_FILE})
+else (BOLT_CONFIGURE_FILE)
+    message(FATAL_ERROR "
+FATAL: can not find bolt.cmake in /common/cmakes directory,
+       please set shell or cmake environment variable BOLT_ROOT.
+    ")
+endif (BOLT_CONFIGURE_FILE)
+
+project(gcl)
+
+set_c_cxx_flags()
+
+include_gcl()
+
+add_subdirectory(src)
+add_subdirectory(tools/kernel_source_compile)
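The macros above are the whole public surface of bolt.cmake: an include_* macro to pick up headers and an xxx_test macro to create and link a test target. A minimal sketch of the intended call pattern from a subproject CMakeLists.txt; the target and file names here are illustrative, not taken from the patch:

    include_engine()                                  # engine + model_tools + tensor + image include paths
    engine_test(classification classification.cpp)   # add_executable() plus link_engine()
    flow_test(flow_asr flow_asr.cpp)                  # flow targets also link protobuf; flow_asr additionally links FFTW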
diff --git a/common/gcl/include/context.h b/common/gcl/include/context.h
new file mode 100644
index 00000000..f6e1a793
--- /dev/null
+++ b/common/gcl/include/context.h
@@ -0,0 +1,212 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _H_CONTEXT
+#define _H_CONTEXT
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief create an OpenCL Context on the given platform
+ *
+ * @param platform input, the context will be created on this platform
+ * @param num_devices input, number of devices in the devices array
+ * @param devices input, the devices that the created context will contain
+ * @param context output, returns the created context
+ *
+ * @return
+ *
+ */
+inline EE create_context(Platform platform, U32 num_devices, Device *devices, Context *context)
+{
+    if (NULL == context) {
+        return NULL_POINTER;
+    }
+
+    I32 ret;
+    cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
+    *context = clCreateContext(properties, num_devices, devices, NULL, NULL, &ret);
+    map_cl_error_2_ee(ret);
+}
+
+/**
+ * @brief get context information
+ *
+ * @warning please free the memory allocated by this function
+ **/
+inline EE get_context_info(Context context, cl_context_info info, void **value, U32 *len)
+{
+    if (NULL == value) {
+        return NULL_POINTER;
+    }
+
+    size_t size;
+    I32 ret = clGetContextInfo(context, info, 0, NULL, &size);
+    if (CL_SUCCESS == ret) {
+        if (NULL != len) {
+            *len = size;
+        }
+        void *data = malloc(size);
+        if (NULL == data) {
+            return ALLOC_FAILED;
+        }
+        ret = clGetContextInfo(context, info, size, data, NULL);
+        if (CL_SUCCESS == ret) {
+            *value = data;
+        } else {
+            free(data);
+        }
+    }
+
+    map_cl_error_2_ee(ret);
+}
+
+inline EE retain_context(Context context)
+{
+    I32 ret = clRetainContext(context);
+    map_cl_error_2_ee(ret);
+}
+
+inline EE release_context(Context context)
+{
+    I32 ret = clReleaseContext(context);
+    map_cl_error_2_ee(ret);
+}
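As the @warning above says, get_context_info returns a malloc'ed buffer whose ownership passes to the caller. A minimal calling sketch (illustrative only; ctx is assumed to be a context obtained from create_context):

    void *devices = NULL;
    U32 bytes = 0;
    if (SUCCESS == get_context_info(ctx, CL_CONTEXT_DEVICES, &devices, &bytes)) {
        U32 num = bytes / sizeof(cl_device_id);  // the buffer holds the context's device list
        // ... use the num devices ...
        free(devices);  // the caller frees
    }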
+
+inline EE create_command_queue_properties(
+    Context context, Device device, cl_queue_properties *properties, CommandQueue *queue)
+{
+    if (NULL == queue) {
+        return NULL_POINTER;
+    }
+    I32 ret;
+    *queue = clCreateCommandQueueWithProperties(context, device, properties, &ret);
+    map_cl_error_2_ee(ret);
+}
+/*
+    inline EE create_command_queue(Context context, Device device,
+        cl_command_queue_properties properties, CommandQueue* queue) {
+        if(NULL == queue) return NULL_POINTER;
+        I32 ret;
+        *queue = clCreateCommandQueue(context, device, properties, &ret);
+        map_cl_error_2_ee(ret);
+    }
+*/
+/**
+ * @brief get information of command queue
+ *
+ * @warning please free memory associated with value
+ *
+ **/
+inline EE get_command_queue_info(
+    CommandQueue queue, cl_command_queue_info info, void **value, size_t *len)
+{
+    if (NULL == value) {
+        return NULL_POINTER;
+    }
+
+    size_t size;
+    I32 ret = clGetCommandQueueInfo(queue, info, 0, NULL, &size);
+    if (CL_SUCCESS == ret) {
+        if (NULL != len) {
+            *len = size;
+        }
+        void *data = malloc(size);
+        if (NULL == data) {
+            return ALLOC_FAILED;
+        }
+        ret = clGetCommandQueueInfo(queue, info, size, data, NULL);
+        if (CL_SUCCESS == ret) {
+            *value = data;
+        } else {
+            free(data);
+        }
+    }
+
+    map_cl_error_2_ee(ret);
+}
+
+/**
+ * @brief get context of command queue
+ *
+ **/
+inline EE command_queue_get_context(CommandQueue queue, Context *context)
+{
+    if (NULL == context) {
+        return NULL_POINTER;
+    }
+    I32 ret = clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT, sizeof(Context), context, NULL);
+    map_cl_error_2_ee(ret);
+}
+
+/**
+ * @brief get device of command queue
+ *
+ **/
+inline EE command_queue_get_device(CommandQueue queue, Device *device)
+{
+    if (NULL == device) {
+        return NULL_POINTER;
+    }
+    I32 ret = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(Device), device, NULL);
+    map_cl_error_2_ee(ret);
+}
+
+inline EE retain_command_queue(CommandQueue queue)
+{
+    I32 ret = clRetainCommandQueue(queue);
+    map_cl_error_2_ee(ret);
+}
+
+inline EE release_command_queue(CommandQueue queue)
+{
+    I32 ret = clReleaseCommandQueue(queue);
+    map_cl_error_2_ee(ret);
+}
+
+/**
+ * @brief flush the command queue, issuing all queued commands for execution
+ **/
+inline EE flush(CommandQueue queue)
+{
+    I32 ret = clFlush(queue);
+    map_cl_error_2_ee(ret);
+}
+
+/**
+ * @brief wait for all queued commands to finish
+ **/
+inline EE finish(CommandQueue queue)
+{
+    I32 ret = clFinish(queue);
+    map_cl_error_2_ee(ret);
+}
+
+inline EE check_queue_profiling(CommandQueue queue, bool *enable)
+{
+    cl_bitfield prop;
+    I32 ret = clGetCommandQueueInfo(queue, CL_QUEUE_PROPERTIES, sizeof(prop), &prop, NULL);
+    if ((prop | CL_QUEUE_PROFILING_ENABLE) == prop) {
+        *enable = true;
+    } else {
+        *enable = false;
+    }
+    map_cl_error_2_ee(ret);
+}
+
+#ifdef __cplusplus
+}
+#endif
+#endif
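Taken together, the context.h wrappers compose like this (an illustrative sketch; platform and device are assumed to come from the platform.h helpers added elsewhere in this patch):

    Context ctx;
    CommandQueue queue;
    cl_queue_properties props[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
    CHECK_STATUS(create_context(platform, 1, &device, &ctx));
    CHECK_STATUS(create_command_queue_properties(ctx, device, props, &queue));
    bool profiling = false;
    CHECK_STATUS(check_queue_profiling(queue, &profiling));  // true with these props
    CHECK_STATUS(finish(queue));                             // drain the queue before teardown
    CHECK_STATUS(release_command_queue(queue));
    CHECK_STATUS(release_context(ctx));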
diff --git a/common/gcl/include/event.h b/common/gcl/include/event.h
new file mode 100644
index 00000000..9db350f4
--- /dev/null
+++ b/common/gcl/include/event.h
@@ -0,0 +1,160 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef EVENT_H_
+#define EVENT_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief wait for the given events to complete
+ **/
+inline EE wait_events(U32 num_events, const Event *event_list)
+{
+    I32 ret = clWaitForEvents(num_events, event_list);
+    map_cl_error_2_ee(ret);
+}
+
+/**
+ * @brief get information about an event
+ *
+ * @warning please free memory associated with value
+ **/
+inline EE get_event_info(cl_event event, cl_event_info info, void **value, size_t *size)
+{
+    size_t len;
+    I32 ret = clGetEventInfo(event, info, 0, NULL, &len);
+    if (CL_SUCCESS == ret) {
+        if (NULL != size) {
+            *size = len;
+        }
+        void *data = malloc(len);
+        if (NULL == data) {
+            return ALLOC_FAILED;
+        }
+        ret = clGetEventInfo(event, info, len, data, NULL);
+        if (CL_SUCCESS == ret) {
+            *value = data;
+        } else {
+            free(data);
+        }
+    }
+
+    map_cl_error_2_ee(ret);
+}
+
+/**
+ * @brief increase reference count of event
+ **/
+inline EE retain_event(Event event)
+{
+    I32 ret = clRetainEvent(event);
+    map_cl_error_2_ee(ret);
+}
+
+inline EE release_event(Event event)
+{
+    I32 ret = clReleaseEvent(event);
+    map_cl_error_2_ee(ret);
+}
+
+inline EE enqueue_barrier_wait_lists(
+    CommandQueue queue, U32 num_wait_events, const Event *wait_events, Event *event)
+{
+    I32 ret = clEnqueueBarrierWithWaitList(queue, num_wait_events, wait_events, event);
+    map_cl_error_2_ee(ret);
+}
+
+inline EE event_counting_time(
+    Event *event, double *t_queue, double *t_submit, double *t_start, double *t_end, double *t_execute)
+{
+    cl_ulong queued, submit, start, end;
+    CHECK_STATUS(wait_events(1, event));
+    I32 ret;
+    ret = clGetEventProfilingInfo(
+        *event, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &queued, NULL);
+    if (ret) {
+        map_cl_error_2_ee(ret);
+    }
+    ret = clGetEventProfilingInfo(
+        *event, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &submit, NULL);
+    if (ret) {
+        map_cl_error_2_ee(ret);
+    }
+    ret =
+        clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
+    if (ret) {
+        map_cl_error_2_ee(ret);
+    }
+    ret = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
+    if (ret) {
+        map_cl_error_2_ee(ret);
+    }
+
+    double t0, t1, t2, t3, t4;
+    t0 = (double)(queued)*1e-03;
+    t1 = (double)(submit)*1e-03;
+    t2 = (double)(start)*1e-03;
+    t3 = (double)(end)*1e-03;
+    t4 = ((double)(end) - (double)(start)) * 1e-03;
+
+    if (t_queue) {
+        *t_queue = t0;
+    }
+    if (t_submit) {
+        *t_submit = t1;
+    }
+    if (t_start) {
+        *t_start = t2;
+    }
+    if (t_end) {
+        *t_end = t3;
+    }
+    if (t_execute) {
+        *t_execute = t4;
+    }
+    return SUCCESS;
+}
+/**
+ * @brief get profiling information
+ **/
+inline EE event_get_profiling_info(Event event, cl_profiling_info info, void **value, size_t *size)
+{
+    size_t len;
+    I32 ret = clGetEventProfilingInfo(event, info, 0, NULL, &len);
+    if (CL_SUCCESS == ret) {
+        if (NULL != size) {
+            *size = len;
+        }
+        void *data = malloc(len);
+        if (NULL == data) {
+            return ALLOC_FAILED;
+        }
+        ret = clGetEventProfilingInfo(event, info, len, data, NULL);
+        if (CL_SUCCESS == ret) {
+            *value = data;
+        } else {
+            free(data);
+        }
+    }
+
+    map_cl_error_2_ee(ret);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
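event_counting_time above converts the four OpenCL profiling timestamps from nanoseconds to microseconds, and every output pointer may be NULL when only some of them are wanted. A timing sketch (the kernel enqueue itself is elided; the queue must have profiling enabled):

    Event evt;
    // ... enqueue work, passing &evt as the event out-parameter ...
    double t_execute = 0;
    CHECK_STATUS(event_counting_time(&evt, NULL, NULL, NULL, NULL, &t_execute));
    UNI_DEBUG_LOG("executeTime = %f us\n", t_execute);
    CHECK_STATUS(release_event(evt));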
diff --git a/gcl/include/gcl.h b/common/gcl/include/gcl.h
similarity index 83%
rename from gcl/include/gcl.h
rename to common/gcl/include/gcl.h
index fcab35d5..2a0ab241 100644
--- a/gcl/include/gcl.h
+++ b/common/gcl/include/gcl.h
@@ -1,20 +1,18 @@
 // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), 
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
 // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
 #ifndef _H_GCL
 #define _H_GCL
 
 #include "gcl_func.h"
+#include "gclmem_desc_infer.h"
 #endif
-
diff --git a/common/gcl/include/gcl_common.h b/common/gcl/include/gcl_common.h
new file mode 100644
index 00000000..46cbb05d
--- /dev/null
+++ b/common/gcl/include/gcl_common.h
@@ -0,0 +1,275 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef H_GCL_COMMON
+#define H_GCL_COMMON
+#define CL_TARGET_OPENCL_VERSION 200
+
+#include "tensor_desc.h"
+#include "gcl_kernel_type.h"
+#include "CL/cl.h"
+#include <string>
+#include <map>
+#include <vector>
+#include <stdlib.h>
+/**
+ * @file
+ */
+#define ERROR_CASE(x) \
+    case x:           \
+        return (#x)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef cl_platform_id Platform;
+typedef cl_device_id Device;
+typedef cl_context Context;
+typedef cl_command_queue CommandQueue;
+typedef cl_program Program;
+typedef cl_mem Mem;
+typedef cl_sampler Sampler;
+typedef cl_kernel Kernel;
+typedef cl_event Event;
+typedef cl_mem_flags MemFlags;
+typedef cl_image_format ImgFormat;
+
+inline CI8 *map_cl_error_2_string(cl_int err)
+{
+    switch (err) {
+        ERROR_CASE(CL_SUCCESS);
+        ERROR_CASE(CL_DEVICE_NOT_FOUND);
+        ERROR_CASE(CL_DEVICE_NOT_AVAILABLE);
+        ERROR_CASE(CL_COMPILER_NOT_AVAILABLE);
+        ERROR_CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE);
+        ERROR_CASE(CL_OUT_OF_RESOURCES);
+        ERROR_CASE(CL_OUT_OF_HOST_MEMORY);
+        ERROR_CASE(CL_PROFILING_INFO_NOT_AVAILABLE);
+        ERROR_CASE(CL_MEM_COPY_OVERLAP);
+        ERROR_CASE(CL_IMAGE_FORMAT_MISMATCH);
+        ERROR_CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED);
+        ERROR_CASE(CL_BUILD_PROGRAM_FAILURE);
+        ERROR_CASE(CL_MAP_FAILURE);
+#ifdef CL_VERSION_1_1
+        ERROR_CASE(CL_MISALIGNED_SUB_BUFFER_OFFSET);
+        ERROR_CASE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
+#endif
+#ifdef CL_VERSION_1_2
+        ERROR_CASE(CL_COMPILE_PROGRAM_FAILURE);
+        ERROR_CASE(CL_LINKER_NOT_AVAILABLE);
+        ERROR_CASE(CL_LINK_PROGRAM_FAILURE);
+        ERROR_CASE(CL_DEVICE_PARTITION_FAILED);
+        ERROR_CASE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE);
+#endif
+        ERROR_CASE(CL_INVALID_VALUE);
+        ERROR_CASE(CL_INVALID_DEVICE_TYPE);
+        ERROR_CASE(CL_INVALID_PLATFORM);
+        ERROR_CASE(CL_INVALID_DEVICE);
+        ERROR_CASE(CL_INVALID_CONTEXT);
+        ERROR_CASE(CL_INVALID_QUEUE_PROPERTIES);
+        ERROR_CASE(CL_INVALID_COMMAND_QUEUE);
+        ERROR_CASE(CL_INVALID_HOST_PTR);
+        ERROR_CASE(CL_INVALID_MEM_OBJECT);
+        ERROR_CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
+        ERROR_CASE(CL_INVALID_IMAGE_SIZE);
+        ERROR_CASE(CL_INVALID_SAMPLER);
+        ERROR_CASE(CL_INVALID_BINARY);
+        ERROR_CASE(CL_INVALID_BUILD_OPTIONS);
+        ERROR_CASE(CL_INVALID_PROGRAM);
+        ERROR_CASE(CL_INVALID_PROGRAM_EXECUTABLE);
+        ERROR_CASE(CL_INVALID_KERNEL_NAME);
+        ERROR_CASE(CL_INVALID_KERNEL_DEFINITION);
+        ERROR_CASE(CL_INVALID_KERNEL);
+        ERROR_CASE(CL_INVALID_ARG_INDEX);
+        ERROR_CASE(CL_INVALID_ARG_VALUE);
+        ERROR_CASE(CL_INVALID_ARG_SIZE);
+        ERROR_CASE(CL_INVALID_KERNEL_ARGS);
+        ERROR_CASE(CL_INVALID_WORK_DIMENSION);
+        ERROR_CASE(CL_INVALID_WORK_GROUP_SIZE);
+        ERROR_CASE(CL_INVALID_WORK_ITEM_SIZE);
+        ERROR_CASE(CL_INVALID_GLOBAL_OFFSET);
+        ERROR_CASE(CL_INVALID_EVENT_WAIT_LIST);
+        ERROR_CASE(CL_INVALID_EVENT);
+        ERROR_CASE(CL_INVALID_OPERATION);
+        ERROR_CASE(CL_INVALID_GL_OBJECT);
+        ERROR_CASE(CL_INVALID_BUFFER_SIZE);
+        ERROR_CASE(CL_INVALID_MIP_LEVEL);
+        ERROR_CASE(CL_INVALID_GLOBAL_WORK_SIZE);
+#ifdef CL_VERSION_1_1
+        ERROR_CASE(CL_INVALID_PROPERTY);
+#endif
+#ifdef CL_VERSION_1_2
+        ERROR_CASE(CL_INVALID_IMAGE_DESCRIPTOR);
+        ERROR_CASE(CL_INVALID_COMPILER_OPTIONS);
+        ERROR_CASE(CL_INVALID_LINKER_OPTIONS);
+        ERROR_CASE(CL_INVALID_DEVICE_PARTITION_COUNT);
+#endif
+#ifdef CL_VERSION_2_0
+        ERROR_CASE(CL_INVALID_PIPE_SIZE);
+        ERROR_CASE(CL_INVALID_DEVICE_QUEUE);
+#endif
+#ifdef CL_VERSION_2_2
+        ERROR_CASE(CL_INVALID_SPEC_ID);
+        ERROR_CASE(CL_MAX_SIZE_RESTRICTION_EXCEEDED);
+#endif
+
+        default:
+            return "CL_UNKNOWN_ERROR";
+    }
+}
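Each wrapper in these gcl headers funnels its cl_int result through map_cl_error_2_ee below, which logs the failing call via map_cl_error_2_string and collapses everything into the EE status space. The canonical wrapper shape, shown on a hypothetical sampler helper that is not part of this patch:

    inline EE release_sampler(Sampler sampler)  // hypothetical; mirrors retain_context above
    {
        I32 ret = clReleaseSampler(sampler);
        map_cl_error_2_ee(ret);  // SUCCESS on CL_SUCCESS, otherwise logs and returns GCL_ERROR
    }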
+
+#define map_cl_error_2_ee(err) \
+    { \
+        if (err == 0) \
+            return SUCCESS; \
+        UNI_ERROR_LOG("GCLAPI error in: File: %s Line: %d Func name is: %s GCLERROR = %s\n", \
+            __FILE__, __LINE__, __func__, map_cl_error_2_string(err)); \
+        return GCL_ERROR; \
+    }
+
+inline EE has_dedicated_local(Device device, I32 *b)
+{
+    // read the enum by value; nothing here is heap-allocated
+    cl_device_local_mem_type type;
+    I32 ret = clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(type), &type, nullptr);
+    if (CL_SUCCESS == ret) {
+        *b = (type == CL_LOCAL);
+    }
+    map_cl_error_2_ee(ret);
+}
+
+/**
+ * @brief enum definitions
+ **/
+typedef enum {
+    GCL_MEM_BUF = 0,
+    GCL_MEM_IMG_1D = 1,
+    GCL_MEM_IMG_2D = 2,
+    GCL_MEM_IMG_3D = 3
+} GCLMemType;
+
+typedef enum {
+    HOST_TO_DEVICE_BUF = 0,
+    HOST_TO_DEVICE_IMG = 1,
+    DEVICE_BUF_TO_HOST = 2,
+    DEVICE_IMG_TO_HOST = 3,
+    DEVICE_BUF_TO_BUF = 4,
+    DEVICE_BUF_TO_IMG = 5,
+    DEVICE_IMG_TO_BUF = 6,
+    DEVICE_IMG_TO_IMG = 7
+} GCLMemTransType;
+
+/**
+ * @brief struct definitions
+ **/
+struct GCLKernelInfo {
+    Kernel kernel = NULL;
+    U32 dim = 0;
+    U32 gs[3] = {0};
+    U32 ls[3] = {0};
+    std::string name;
+};
+
+struct GCLHandle {
+    Platform *platforms;
+    U32 numPlatform;
+    U32 platformId;
+
+    Device *devices;
+    U32 numDevice;
+    U32 deviceId;
+    cl_device_type deviceType;
+
+    Context context;
+    CommandQueue queue;
+    CommandQueue queue_profiling;
+    bool existProfilingQueue;
+
+    Event eventObj;
+    Event *eventPtr;
+    U32 numWaitEvents;
+    Event *waitEvents;
+    double t_execute;
+    double t_total;
+
+    std::string deviceName;
+    std::map<std::string, Kernel> kernelMap;
+    std::map<std::string, Program> programMap;
+    std::vector<GCLKernelInfo> *kernelVec;
+    std::string curOpName;
+    void *kernel_source;
+    void *kernel_binmap_handle;
+    void *kernel_binmap;
+    bool useBinMap;
+    std::string common_source_opt;
+    std::string common_source_ext;
+    Program source_head[1];
+    CI8 *source_head_name[1];
+};
+
+typedef struct GCLHandle *GCLHandle_t;
+
+struct GCLHandleConfig {
+    CI8 *deviceBinmapName;
+};
+
+typedef GCLHandleConfig *GCLHandleConfig_t;
+
+struct GCLMemDesc {
+    U32 dims[6];
+    U32 nDims;
+    DataType dt;
+    DataFormat df;
+
+    U32 stride[3];
+    U32 offset[3];
+    GCLMemType memType;
+    DataFormat memFormat;
+    U32 byteSize;
+    U32 num;
+    MemFlags flags;
+    ImgFormat imgFormat;
+    void *host_ptr;
+    bool need_pad;
+};
+typedef struct GCLMemDesc *GCLMemDesc_t;
+
+struct GCLMem {
+    Mem mem;
+    GCLMemDesc desc;
+    std::vector<Mem> subMem;
+    std::vector<U8 *> mapPtrArray;
+};
+typedef struct GCLMem *GCLMem_t;
+
+typedef struct {
+    I32 algorithm;
+    U32 best_w[6];
+    U32 best_c[6];
+    U32 best_k[6];
+} ForwardRunInfoMali;
+typedef ForwardRunInfoMali *ForwardRunInfoMali_t;
+
+typedef struct {
+    GCLHandle_t handle;
+    GCLMemDesc_t gclmemInputDesc;
+    GCLMemDesc_t gclmemOutputDesc;
+    GCLMemDesc_t gclmemFilterDesc;
+    ForwardRunInfoMali_t forwardRunInfo;
+} MaliPara;
+typedef MaliPara *MaliPara_t;
+
+#ifdef __cplusplus
+}
+#endif
+#endif
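GCLMemDesc above is the host-side descriptor that the gcl_func.h helpers further down fill in and consume. A construction sketch (gcl_create_gclmem and gcl_mem_desc are added in gcl_func.h below; DT_F16 as the half-precision data type tag is an assumption from bolt's type naming):

    GCLMem_t buffer = gcl_create_gclmem();
    U32 stride[3] = {16, 8, 4};
    U32 offset[3] = {0, 0, 0};
    buffer->desc = gcl_mem_desc(stride, offset, DT_F16, DF_NCWHC4);  // byteSize = 16 * 8 * 4 * bytesOf(DT_F16)
    // ... allocate the device buffer and use it ...
    gcl_destroy_gclmem(buffer);  // releases sub-memory and the CL memory object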
diff --git a/common/gcl/include/gcl_engine.h b/common/gcl/include/gcl_engine.h
new file mode 100644
index 00000000..25ff32eb
--- /dev/null
+++ b/common/gcl/include/gcl_engine.h
@@ -0,0 +1,66 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef H_OCL_ENGINE
+#define H_OCL_ENGINE
+
+#include "sys.h"
+#include "ocl_context.h"
+
+#define REGISTER_OCL_OPERATOR_RUN \
+    virtual void run() override \
+    { \
+        GCLHandle_t handle = OCLContext::getInstance().handle.get(); \
+        handle->kernelVec = &this->opKernelVec; \
+        if (this->needSetKernelVec) { \
+            run_prepare(); \
+            this->needSetKernelVec = false; \
+            if (this->needSelectKernelLS) { \
+                CHECK_STATUS(gcl_infer_best_kernelVec_ls_with_map(handle, this->algorithmMap)); \
+                this->needSelectKernelLS = false; \
+            } \
+        } \
+        CHECK_STATUS(gcl_run_kernelVec(handle)); \
+    } \
+ \
+private: \
+    bool needSetKernelVec; \
+    bool needSelectKernelLS; \
+    std::vector<GCLKernelInfo> opKernelVec;
+
+#define DESTROY_OCL_KERNEL \
+    GCLHandle_t handle = OCLContext::getInstance().handle.get(); \
+    handle->kernelVec = &this->opKernelVec; \
+    CHECK_STATUS(gcl_clean_kernelVec(handle));
+
+inline void setMALIArchInfo(
+    ArchInfo *archInfo, ForwardRunInfoMali *runInfo, bool *needSetKernelVec, bool *needSelectKernelLS)
+{
+    if (runInfo != nullptr) {
+        runInfo->algorithm = 0;
+        runInfo->best_w[0] = 1;
+        runInfo->best_w[1] = 1;
+        runInfo->best_c[0] = 1;
+        runInfo->best_c[1] = 1;
+        runInfo->best_k[0] = 1;
+        runInfo->best_k[1] = 1;
+    }
+    MaliPara *maliPara = (MaliPara *)malloc(sizeof(MaliPara));
+    maliPara->handle = OCLContext::getInstance().handle.get();
+    maliPara->forwardRunInfo = runInfo;
+    archInfo->arch = MALI;
+    archInfo->archPara = (void *)maliPara;
+    *needSetKernelVec = true;
+    *needSelectKernelLS = true;
+}
+#endif  // H_OCL_ENGINE
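REGISTER_OCL_OPERATOR_RUN expects its host class to derive from a base with a virtual run(), to provide run_prepare() and an algorithmMap member, and it injects the kernel-recording state itself. A heavily simplified host-class sketch; every name below is illustrative, the real engine operator base is not part of this header, and the algorithmMap type is an assumption:

    class OperatorBase {  // stand-in for the engine's operator base class
    public:
        virtual ~OperatorBase() {}
        virtual void run() = 0;
    };

    class ExampleOCLOperator : public OperatorBase {
    public:
        ExampleOCLOperator()
        {
            // fills archInfo/runInfo and sets needSetKernelVec/needSelectKernelLS to true
            setMALIArchInfo(&archInfo, &runInfo, &needSetKernelVec, &needSelectKernelLS);
        }
        ~ExampleOCLOperator()
        {
            DESTROY_OCL_KERNEL  // releases every kernel recorded in opKernelVec
        }
        void run_prepare() { /* record this operator's kernels into handle->kernelVec */ }
        REGISTER_OCL_OPERATOR_RUN  // defines run(): prepare and tune once, then replay
    protected:
        ArchInfo archInfo;
        ForwardRunInfoMali runInfo;
        std::shared_ptr<AlgorithmMap> algorithmMap;  // assumed type, consumed by the macro's tuning call
    };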
diff --git a/common/gcl/include/gcl_func.h b/common/gcl/include/gcl_func.h
new file mode 100644
index 00000000..d8529df9
--- /dev/null
+++ b/common/gcl/include/gcl_func.h
@@ -0,0 +1,1351 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef H_GCL_FUNC
+#define H_GCL_FUNC
+
+#include <dlfcn.h>
+#include <string>
+#include <map>
+#include <stdlib.h>
+#include "gcl_common.h"
+#include "platform.h"
+#include "context.h"
+#include "program.h"
+#include "memory.h"
+#include "kernel.h"
+#include "event.h"
+#include "gcl_kernel_binmap.h"
+#include "gcl_kernel_source.h"
+#include "libkernelsource.h"
+#include "algorithm_map.h"
+#include "types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+inline EE gcl_regist_binMap(GCLHandle_t handle)
+{
+    std::string deviceName = handle->deviceName;
+    std::string libKernelBinName = "lib" + deviceName + "_map.so";
+    char *err;
+    void *dvm_handle = dlopen(libKernelBinName.c_str(), RTLD_LAZY);
+    if (dvm_handle) {
+        std::string func = "create_" + deviceName + "_kernelbin_map";
+        gcl_kernel_binmap *(*create_kernelbin_map)();
+        dlerror();
+        create_kernelbin_map = (gcl_kernel_binmap * (*)()) dlsym(dvm_handle, func.c_str());
+        if ((err = dlerror()) != NULL) {
+            UNI_ERROR_LOG(
+                "Get %s in %s failed, error %s\n", func.c_str(), libKernelBinName.c_str(), err);
+            dlclose(dvm_handle);
+            return NULL_POINTER;
+        }
+        gcl_kernel_binmap *kernel_binmap = create_kernelbin_map();
+        handle->kernel_binmap = (void *)kernel_binmap;
+        handle->useBinMap = true;
+        handle->kernel_binmap_handle = dvm_handle;
+    } else {
+        UNI_DEBUG_LOG("dlopen %s failed, %s; kernels will be created from source code\n",
+            libKernelBinName.c_str(), dlerror());
+    }
+    return SUCCESS;
+}
+
+inline EE gcl_regist_sourceMap(GCLHandle_t handle)
+{
+    gcl_kernel_source *kernel_source = (gcl_kernel_source *)new kernel_source_executor();
+    handle->kernel_source = kernel_source;
+    KernelOption *common_opt;
+    if (!kernel_source->get_option("common", &common_opt)) {
+        UNI_ERROR_LOG("the \"common\" option doesn't exist in optionMap\n");
+        CHECK_STATUS(NULL_POINTER);
+    }
+    handle->common_source_opt = common_opt->option;
+    handle->common_source_ext = "#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable\n";
+    handle->common_source_ext += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+    handle->source_head_name[0] = "kernel_def.h";
+    KernelSource *head_source;
+    if (!kernel_source->get_source("kernel_def", &head_source)) {
+        UNI_ERROR_LOG("the kernel_def source doesn't exist in sourceMap\n");
+        CHECK_STATUS(NULL_POINTER);
+    }
+    CHECK_STATUS(create_program_from_source(
+        handle->context, (U32 *)&head_source->len, head_source->data, handle->source_head));
+    return SUCCESS;
+}
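gcl_regist_binMap above resolves a per-device kernel-binary library purely by name at runtime, and gcl_create_handle (just below) falls back to gcl_regist_sourceMap when it is missing. For a hypothetical device reported as Mali_G76p, the lookup is:

    // shared object : libMali_G76p_map.so            (found via the normal dlopen search path)
    // factory symbol: create_Mali_G76p_kernelbin_map (must return a gcl_kernel_binmap *)
    GCLHandle_t handle = NULL;
    CHECK_STATUS(gcl_create_handle(&handle));  // registers the binmap, or the source map as fallback
    // ... create kernels, enqueue work, gcl_finish(handle) ...
    gcl_destroy_handle(handle);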
+
+inline EE gcl_get_device_name(GCLHandle_t handle)
+{
+    cl_device_id device = handle->devices[handle->deviceId];
+    U32 len;
+    I8 *data;
+    CHECK_STATUS(get_device_info(device, CL_DEVICE_NAME, (void **)&data, &len));
+    I8 devName[64];
+    for (U32 i = 0; i < len - 1; i++) {
+        if (data[i] == '-') {
+            data[i] = '_';
+        }
+        if (data[i] == ' ') {
+            data[i] = '_';
+        }
+        devName[i] = data[i];
+    }
+    U32 version_len;
+    free(data);
+    CHECK_STATUS(get_device_info(device, CL_DEVICE_VERSION, (void **)&data, &version_len));
+    std::string deviceV = std::string(data);
+    U32 be = deviceV.find("r");
+    U32 end = deviceV.find("p", be + 1);
+    std::string numV = deviceV.substr(be + 1, end - be - 1);
+    U32 i = atoi(numV.c_str());
+    if (i >= 14) {
+        devName[len - 1] = 'p';
+        devName[len] = '\0';
+    } else {
+        devName[len - 1] = '\0';
+    }
+    free(data);
+    handle->deviceName = devName;
+    return SUCCESS;
+}
+
+inline EE gcl_create_handle(GCLHandle_t *handlePtr)
+{
+    if (handlePtr == NULL) {
+        UNI_ERROR_LOG("the handlePtr passed to gcl_create_handle is NULL\n");
+        return NULL_POINTER;
+    }
+    GCLHandle_t handle = new GCLHandle();
+    handle->platformId = 0;
+    handle->deviceId = 0;
+    handle->deviceType = CL_DEVICE_TYPE_GPU;
+    handle->eventPtr = nullptr;
+    handle->numWaitEvents = 0;
+    handle->waitEvents = nullptr;
+    handle->t_execute = 0;
+    handle->t_total = 0;
+    handle->curOpName = "unknown";
+    handle->deviceName = "unknown";
+    handle->kernel_source = nullptr;
+    handle->kernel_binmap = nullptr;
+    handle->kernel_binmap_handle = nullptr;
+    handle->common_source_opt = "unknown";
+    handle->common_source_ext = "unknown";
+    handle->source_head_name[0] = "unknown";
+    handle->useBinMap = false;
+    handle->existProfilingQueue = false;
+    U32 platformId = handle->platformId;
+    U32 deviceId = handle->deviceId;
+    CHECK_STATUS(get_platforms(&handle->numPlatform, &handle->platforms));
+    CHECK_STATUS(platform_get_devices(
+        handle->platforms[platformId], handle->deviceType, &handle->numDevice, &handle->devices));
+    CHECK_STATUS(create_context(
+        handle->platforms[platformId], handle->numDevice, handle->devices, &handle->context));
+    cl_queue_properties props[] = {CL_QUEUE_PROPERTIES, 0, 0};
+#ifdef _DEBUG
+    handle->eventPtr = &handle->eventObj;
+    props[1] = props[1] | CL_QUEUE_PROFILING_ENABLE;
+#endif
+    CHECK_STATUS(create_command_queue_properties(
+        handle->context, handle->devices[deviceId], props, &handle->queue));
+    CHECK_STATUS(gcl_get_device_name(handle));
+    CHECK_STATUS(gcl_regist_binMap(handle));
+    if (!handle->useBinMap) {
+        CHECK_STATUS(gcl_regist_sourceMap(handle));
+    }
+    *handlePtr = handle;
+    return SUCCESS;
+}
+
+inline void gcl_destroy_handle(GCLHandle_t handle)
+{
+    U32 deviceId = handle->deviceId;
+    CHECK_STATUS(finish(handle->queue));
+    for (auto k : handle->programMap) {
+        CHECK_STATUS(release_program(k.second));
+    }
+    for (auto k : handle->kernelMap) {
+        CHECK_STATUS(release_kernel(k.second));
+    }
+    if (handle->useBinMap) {
+        delete (gcl_kernel_binmap *)handle->kernel_binmap;
+        dlclose(handle->kernel_binmap_handle);
+    } else {
+        CHECK_STATUS(release_program(handle->source_head[0]));
+        delete (gcl_kernel_source *)handle->kernel_source;
+    }
+    handle->kernelMap.clear();
+    if (handle->existProfilingQueue) {
+        CHECK_STATUS(finish(handle->queue_profiling));
+        CHECK_STATUS(release_command_queue(handle->queue_profiling));
+    }
+    CHECK_STATUS(release_command_queue(handle->queue));
+    CHECK_STATUS(release_context(handle->context));
+    CHECK_STATUS(release_device(handle->devices[deviceId]));
+    free(handle->devices);
+    free(handle->platforms);
+    delete handle;
+}
+
+inline EE gcl_enable_queue_profiling(GCLHandle_t handle)
+{
+#ifndef _DEBUG
+    handle->eventPtr = &handle->eventObj;
+    bool enableProfiling;
+    CHECK_STATUS(check_queue_profiling(handle->queue, &enableProfiling));
+    if (enableProfiling) {
+        return SUCCESS;
+    }
+    if (!handle->existProfilingQueue) {
+        cl_queue_properties props[] = {CL_QUEUE_PROPERTIES, 0, 0};
+        props[1] = props[1] | CL_QUEUE_PROFILING_ENABLE;
+        CHECK_STATUS(create_command_queue_properties(
+            handle->context, handle->devices[handle->deviceId], props, &handle->queue_profiling));
+        handle->existProfilingQueue = true;
+    }
+    CommandQueue tmpQueue = handle->queue;
+    handle->queue = handle->queue_profiling;
handle->queue_profiling; + handle->queue_profiling = tmpQueue; +#endif + return SUCCESS; +} + +inline EE gcl_off_queue_profiling(GCLHandle_t handle) +{ +#ifndef _DEBUG + handle->eventPtr = NULL; + bool enableProfiling; + CHECK_STATUS(check_queue_profiling(handle->queue, &enableProfiling)); + if (!enableProfiling) { + return SUCCESS; + } + CHECK_STATUS(check_queue_profiling(handle->queue_profiling, &enableProfiling)); + if (!enableProfiling) { + CHECK_STATUS(finish(handle->queue)); + CommandQueue tmpQueue = handle->queue; + handle->queue = handle->queue_profiling; + handle->queue_profiling = tmpQueue; + } else { + return NOT_SUPPORTED; + } +#endif + return SUCCESS; +} + +inline GCLMemDesc gcl_mem_desc(U32 stride[], U32 offset[], DataType dt, DataFormat memFormat) +{ + GCLMemDesc desc; + U32 s0, s1, s2; + s0 = stride[0]; + s1 = stride[1]; + s2 = stride[2]; + desc.stride[0] = s0; + desc.stride[1] = s1; + desc.stride[2] = s2; + desc.offset[0] = offset[0]; + desc.offset[1] = offset[1]; + desc.offset[2] = offset[2]; + desc.memFormat = memFormat; + desc.memType = GCL_MEM_BUF; + desc.num = s0 * s1 * s2; + desc.byteSize = s0 * s1 * s2 * bytesOf(dt); + desc.flags = CL_MEM_READ_WRITE; + desc.host_ptr = NULL; + desc.imgFormat.image_channel_order = CL_RGBA; + desc.imgFormat.image_channel_data_type = CL_HALF_FLOAT; + desc.need_pad = false; + return desc; +} + +inline GCLMem_t gcl_create_gclmem() +{ + GCLMem_t ret = new GCLMem; + ret->mem = NULL; + U32 str[3] = {0, 0, 0}; + U32 off[3] = {0, 0, 0}; + ret->desc = gcl_mem_desc(str, off, DT_U8, DF_NCWHC4); + return ret; +} + +inline EE gcl_release_subMem(GCLMem_t gclMem) +{ + if (gclMem == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (gclMem->subMem.size()) { + for (auto p : gclMem->subMem) { + CHECK_STATUS(release_memory(p)); + } + gclMem->subMem.clear(); + } + return SUCCESS; +} + +inline EE gcl_release_memory(GCLMem_t gclMem) +{ + if (gclMem == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (gclMem->mem) { + CHECK_STATUS(release_memory(gclMem->mem)); + gclMem->mem = NULL; + } + return SUCCESS; +} + +inline void gcl_destroy_gclmem(GCLMem_t mem) +{ + CHECK_STATUS(gcl_release_subMem(mem)); + CHECK_STATUS(gcl_release_memory(mem)); + delete mem; +} + +inline EE gcl_finish(GCLHandle_t handle) +{ + CHECK_STATUS(finish(handle->queue)); + return SUCCESS; +} + +inline EE gcl_unmap_memory(GCLHandle_t handle, GCLMem_t gclMem) +{ + for (auto p : gclMem->mapPtrArray) { + CHECK_STATUS(enqueue_unmap_memory(handle->queue, gclMem->mem, (void *)p, + handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); +#ifdef _DEBUG + double executeTime = 0; + CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &executeTime)); + CHECK_STATUS(release_event(handle->eventObj)); + UNI_DEBUG_LOG( + "DATAUNMAP>>> enqueue_unmap_memory runInfo: executeTime = %f us\n", executeTime); + CHECK_STATUS(gcl_finish(handle)); +#endif + } + if (gclMem->mapPtrArray.size()) { + gclMem->mapPtrArray.clear(); + } + return SUCCESS; +} + +inline EE gcl_produce_program_kernel_with_source(GCLHandle_t handle, + U32 *len, + CI8 *src, + CI8 *option, + Program *program, + U32 numKernel, + Kernel *kernels) +{ + U32 deviceId = handle->deviceId; + CHECK_STATUS(create_build_program_from_source( + handle->context, len, src, handle->devices[deviceId], option, program)); + CHECK_STATUS(create_kernels_in_program(*program, numKernel, kernels)); + return SUCCESS; +} + +inline EE gcl_get_program_info(Program program, U8 **binary, U32 *len) +{ + CHECK_STATUS(get_program_binary(program, 
binary, len)); + return SUCCESS; +} + +inline EE gcl_kernelmap_put(GCLHandle_t handle, std::string kernelName, Kernel kernel) +{ + handle->kernelMap.insert(std::pair(kernelName, kernel)); + return SUCCESS; +} + +inline Kernel gcl_kernelmap_get(GCLHandle_t handle, std::string kernelName) +{ + auto it = handle->kernelMap.find(std::string(kernelName)); + if (it == handle->kernelMap.end()) { + CHECK_STATUS(NOT_MATCH); + } + return it->second; +} + +inline EE gcl_create_kernel_binary(GCLHandle_t handle, CI8 *kernelName, Kernel *kernel) +{ + std::string binmapname = handle->deviceName; + std::string binmap_kernelname = binmapname + "_" + std::string(kernelName); + gcl_kernel_binmap *kernel_binmap = (gcl_kernel_binmap *)handle->kernel_binmap; + KernelBin *binmap; + if (!kernel_binmap->get(binmap_kernelname, &binmap)) { + UNI_ERROR_LOG( + "get kernel %s from %s kernel_binmap failed\n", kernelName, binmapname.c_str()); + return NULL_POINTER; + } + + U32 length = binmap->len; + CU8 *data = binmap->data; + I32 binsta; + Program program; + CI8 *options = ""; + Device device = handle->devices[handle->deviceId]; + CHECK_STATUS( + create_program_from_binary(handle->context, device, &length, &data, &binsta, &program)); + CHECK_STATUS(build_program(program, device, options)); + CHECK_STATUS(create_kernel(program, kernelName, kernel)); + CHECK_STATUS(release_program(program)); + return SUCCESS; +} + +inline EE gcl_create_kernel_with_source_map(GCLHandle_t handle, CI8 *kernelName, Kernel *kernel) +{ + Program program; + auto it = handle->programMap.find(kernelName); + if (it == handle->programMap.end()) { + gcl_kernel_source *kernel_source = (gcl_kernel_source *)handle->kernel_source; + KernelOption *option_ptr; + KernelSource *source_ptr; + CI8 *sourceName; + std::string option; + std::string optionName = kernelName; + bool use_common_opt; + if (!kernel_source->get_option(optionName, &option_ptr)) { + sourceName = kernelName; + option = ""; + use_common_opt = true; + } else { + use_common_opt = option_ptr->use_common_opt; + sourceName = option_ptr->sourceName; + option = option_ptr->option; + } + if (use_common_opt) { + option = handle->common_source_opt + option; + } + if (!kernel_source->get_source(sourceName, &source_ptr)) { + UNI_ERROR_LOG("the %s doesn't exist in sourceMap\n", sourceName); + CHECK_STATUS(NULL_POINTER); + } + + U32 len = source_ptr->len + handle->common_source_ext.size(); + std::string source = source_ptr->data; + source = handle->common_source_ext + source; + bool use_kernel_def_head = source_ptr->use_kernel_def_head; + create_program_from_source(handle->context, &len, source.c_str(), &program); + Device device = handle->devices[handle->deviceId]; + if (use_kernel_def_head) { + CHECK_STATUS(compile_program( + program, device, option.c_str(), 1, handle->source_head, handle->source_head_name)); + CHECK_STATUS(link_program(handle->context, device, NULL, 1, &program, &program)); + } else { + CHECK_STATUS(build_program(program, device, option.c_str())); + } + handle->programMap.insert(std::pair(kernelName, program)); + } else { + program = it->second; + } + CHECK_STATUS(create_kernel(program, kernelName, kernel)); + return SUCCESS; +} + +inline EE gcl_create_kernel(GCLHandle_t handle, CI8 *kernelName, Kernel *kernel) +{ + if (handle->useBinMap) { + CHECK_STATUS(gcl_create_kernel_binary(handle, kernelName, kernel)); + } else { + CHECK_STATUS(gcl_create_kernel_with_source_map(handle, kernelName, kernel)); + } + return SUCCESS; +} + +inline EE gcl_get_kernel_from_map(GCLHandle_t handle, CI8 
*kernelName, Kernel *kernel) +{ + std::string binmapname = handle->deviceName; + std::string binmap_kernelname = binmapname + "_" + std::string(kernelName); + if (handle->kernelMap.find(binmap_kernelname) == handle->kernelMap.end()) { + CHECK_STATUS(gcl_create_kernel(handle, kernelName, kernel)); + CHECK_STATUS(gcl_kernelmap_put(handle, binmap_kernelname, *kernel)); + } else { + *kernel = gcl_kernelmap_get(handle, binmap_kernelname); + } + return SUCCESS; +} + +inline EE gcl_set_kernelVec(GCLHandle_t handle, + Kernel kernel, + U32 work_dim, + U32 global_work_size[], + U32 local_work_size[], + CI8 *kernelName = NULL) +{ + GCLKernelInfo kernelInfo; + kernelInfo.kernel = kernel; + kernelInfo.dim = work_dim; + kernelInfo.name = handle->curOpName + "_" + std::string(kernelName); + switch (work_dim) { + case 1: { + kernelInfo.gs[0] = global_work_size[0]; + kernelInfo.gs[1] = 1; + kernelInfo.gs[2] = 1; + kernelInfo.ls[0] = local_work_size[0]; + kernelInfo.ls[1] = 0; + kernelInfo.ls[2] = 0; + break; + } + case 2: { + kernelInfo.gs[0] = global_work_size[0]; + kernelInfo.gs[1] = global_work_size[1]; + kernelInfo.gs[2] = 1; + kernelInfo.ls[0] = local_work_size[0]; + kernelInfo.ls[1] = local_work_size[1]; + kernelInfo.ls[2] = 0; + break; + } + case 3: { + kernelInfo.gs[0] = global_work_size[0]; + kernelInfo.gs[1] = global_work_size[1]; + kernelInfo.gs[2] = global_work_size[2]; + kernelInfo.ls[0] = local_work_size[0]; + kernelInfo.ls[1] = local_work_size[1]; + kernelInfo.ls[2] = local_work_size[2]; + break; + } + default: + return NOT_SUPPORTED; + } + handle->kernelVec->push_back(kernelInfo); + return SUCCESS; +} + +inline EE gcl_run_kernelVec(GCLHandle_t handle, U32 *index = NULL) +{ + CommandQueue queue = handle->queue; + U32 numWaitEvents = handle->numWaitEvents; + Event *waitEvents = handle->waitEvents; + Event *eventPtr = handle->eventPtr; + U32 runBe; + U32 runEnd; + if (index) { + runBe = index[0]; + runEnd = index[1]; + } else { + runBe = 0; + runEnd = handle->kernelVec->size(); + } + for (U32 i = runBe; i < runEnd; ++i) { + auto kernelInfo = (*handle->kernelVec)[i]; + CHECK_STATUS(enqueue_ndrange_kernel(queue, kernelInfo.kernel, kernelInfo.dim, NULL, + kernelInfo.gs, kernelInfo.ls, numWaitEvents, waitEvents, eventPtr)); +#ifdef _DEBUG + double executeTime = 0; + CHECK_STATUS(event_counting_time(eventPtr, NULL, NULL, NULL, NULL, &executeTime)); + CHECK_STATUS(release_event(*eventPtr)); + handle->t_execute = executeTime; + UNI_DEBUG_LOG( + "KERNEL>>> %s runInfo: executeTime = %f us\n", kernelInfo.name.c_str(), executeTime); + CHECK_STATUS(gcl_finish(handle)); +#endif + } + return SUCCESS; +} + +inline EE gcl_run_kernelVec_timing( + GCLHandle_t handle, U32 be, U32 end, std::vector *kernelArrayTime = NULL) +{ + bool enableProfiling; + CHECK_STATUS(check_queue_profiling(handle->queue, &enableProfiling)); + if (enableProfiling) { + double executeTime = 0; + double totalTime = 0; + CommandQueue queue = handle->queue; + U32 numWaitEvents = handle->numWaitEvents; + Event *waitEvents = handle->waitEvents; + Event *eventPtr = handle->eventPtr; + for (U32 i = be; i < end; ++i) { + auto kernelInfo = (*handle->kernelVec)[i]; + for (U32 j = 0; j < 3; j++) { + CHECK_STATUS(enqueue_ndrange_kernel(queue, kernelInfo.kernel, kernelInfo.dim, NULL, + kernelInfo.gs, kernelInfo.ls, numWaitEvents, waitEvents, eventPtr)); + CHECK_STATUS( + event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &executeTime)); + CHECK_STATUS(release_event(handle->eventObj)); + } + UNI_DEBUG_LOG("KERNEL>>> %s runInfo: executeTime 
= %f us\n", kernelInfo.name.c_str(), + executeTime); + CHECK_STATUS(gcl_finish(handle)); + totalTime += executeTime; + if (kernelArrayTime) { + (*kernelArrayTime).push_back(executeTime); + } + } + handle->t_execute = totalTime; + return SUCCESS; + } + return NOT_SUPPORTED; +} + +inline EE gcl_clean_kernelVec(GCLHandle_t handle) +{ + for (U32 i = 0; i < handle->kernelVec->size(); i++) { + auto k = (*handle->kernelVec)[i]; + CHECK_STATUS(release_kernel(k.kernel)); + } + handle->kernelVec->clear(); + return SUCCESS; +} + +inline EE gcl_run_kernel( + GCLHandle_t handle, Kernel kernel, U32 work_dim, U32 *gs, U32 *ls, CI8 *kernelName = NULL) +{ + CHECK_STATUS(enqueue_ndrange_kernel(handle->queue, kernel, work_dim, NULL, gs, ls, + handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); +#ifdef _DEBUG + std::string name = "unknown kernel"; + if (kernelName) { + name = handle->curOpName + "_" + std::string(kernelName); + } + double executeTime = 0; + CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &executeTime)); + CHECK_STATUS(release_event(handle->eventObj)); + handle->t_execute = executeTime; + UNI_DEBUG_LOG("KERNEL>>> %s runInfo: executeTime = %f us\n", name.c_str(), executeTime); + CHECK_STATUS(gcl_finish(handle)); +#else + UNUSED(kernelName); +#endif + return SUCCESS; +} + +inline U32 get_next_ls_size(U32 ls_size) +{ + return (ls_size << 1); +} +inline EE gcl_run_kernel_select_ls(GCLHandle_t handle, GCLKernelInfo *kernelInfo) +{ + auto kernel = kernelInfo->kernel; + auto work_dim = kernelInfo->dim; + auto gs = kernelInfo->gs; + double minTime = DBL_MAX; + double time; + U32 test_ls[3]; + U32 best_ls[3]; + U32 test_gs[3]; + U32 maxSize = 384; + U32 gs_x = 256; + U32 gs_y = (work_dim > 1) ? 256 : 1; + U32 gs_z = (work_dim > 2) ? 
gs[2] : 1; + for (U32 z = 1; z <= gs_z; z = get_next_ls_size(z)) { + if (0 != gs_z % z) { + continue; + } + for (U32 y = 1; y <= gs_y; y = get_next_ls_size(y)) { + if (0 != gs_y % y) { + continue; + } + for (U32 x = 1; x <= gs_x; x = get_next_ls_size(x)) { + if (0 != gs_x % x) { + continue; + } + U32 total = x * y * z; + if (total <= maxSize) { + test_gs[0] = (gs[0] + x - 1) / x * x; + test_gs[1] = (gs[1] + y - 1) / y * y; + test_gs[2] = (gs[2] + z - 1) / z * z; + test_ls[0] = x; + test_ls[1] = y; + test_ls[2] = z; + CHECK_STATUS( + enqueue_ndrange_kernel(handle->queue, kernel, work_dim, NULL, test_gs, + test_ls, handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); + CHECK_STATUS( + event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &time)); + if (minTime > time) { + minTime = time; + best_ls[0] = test_ls[0]; + best_ls[1] = test_ls[1]; + best_ls[2] = test_ls[2]; + } + CHECK_STATUS(release_event(handle->eventObj)); + } + } + } + } + test_ls[0] = 0; + test_ls[1] = 0; + test_ls[2] = 0; + CHECK_STATUS(enqueue_ndrange_kernel(handle->queue, kernel, work_dim, NULL, gs, test_ls, + handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); + CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &time)); + if (minTime > time) { + minTime = time; + best_ls[0] = test_ls[0]; + best_ls[1] = test_ls[1]; + best_ls[2] = test_ls[2]; + } + CHECK_STATUS(release_event(handle->eventObj)); + if (best_ls[0] != 0 && best_ls[1] != 0 && best_ls[2] != 0) { + kernelInfo->gs[0] = (gs[0] + best_ls[0] - 1) / best_ls[0] * best_ls[0]; + kernelInfo->gs[1] = (gs[1] + best_ls[1] - 1) / best_ls[1] * best_ls[1]; + kernelInfo->gs[2] = (gs[2] + best_ls[2] - 1) / best_ls[2] * best_ls[2]; + } + kernelInfo->ls[0] = best_ls[0]; + kernelInfo->ls[1] = best_ls[1]; + kernelInfo->ls[2] = best_ls[2]; + handle->t_execute = minTime; + UNI_DEBUG_LOG("SELECT LS KERNEL>>> %s runInfo: best ls = %u %u %u executeTime = %f us\n", + kernelInfo->name.c_str(), best_ls[0], best_ls[1], best_ls[2], minTime); + return SUCCESS; +} + +inline EE gcl_run_kernelVec_select_ls(GCLHandle_t handle, std::vector kernelIndex) +{ + if (kernelIndex.size() == 0) { + return SUCCESS; + } + CHECK_STATUS(gcl_enable_queue_profiling(handle)); + for (auto index : kernelIndex) { + auto kernelInfo = (*handle->kernelVec)[index]; + CHECK_STATUS(gcl_run_kernel_select_ls(handle, &kernelInfo)); + (*handle->kernelVec)[index].gs[0] = kernelInfo.gs[0]; + (*handle->kernelVec)[index].gs[1] = kernelInfo.gs[1]; + (*handle->kernelVec)[index].gs[2] = kernelInfo.gs[2]; + (*handle->kernelVec)[index].ls[0] = kernelInfo.ls[0]; + (*handle->kernelVec)[index].ls[1] = kernelInfo.ls[1]; + (*handle->kernelVec)[index].ls[2] = kernelInfo.ls[2]; + } + CHECK_STATUS(gcl_off_queue_profiling(handle)); + return SUCCESS; +} + +inline EE gcl_infer_best_kernelVec_ls_with_map( + GCLHandle_t handle, std::shared_ptr algoMap) +{ + std::vector kernelIndex; + U32 len = handle->kernelVec->size(); + for (U32 i = 0; i < len; i++) { + auto kernelInfo = (*handle->kernelVec)[i]; + U32 gs[3]; + U32 ls[3]; + bool findKernelThreadInfo = false; + findKernelThreadInfo = algoMap->getKernelThreadInfoFromMap(kernelInfo.name, gs, ls); + if (findKernelThreadInfo) { + (*handle->kernelVec)[i].gs[0] = gs[0]; + (*handle->kernelVec)[i].gs[1] = gs[1]; + (*handle->kernelVec)[i].gs[2] = gs[2]; + (*handle->kernelVec)[i].ls[0] = ls[0]; + (*handle->kernelVec)[i].ls[1] = ls[1]; + (*handle->kernelVec)[i].ls[2] = ls[2]; + } else { + kernelIndex.push_back(i); + } + } + 
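+    // Kernels that missed the algorithm-map lookup above are auto-tuned below; the chosen
+    // gs/ls of every kernel is then written back to the map so later runs skip the search.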
CHECK_STATUS(gcl_run_kernelVec_select_ls(handle, kernelIndex)); + for (U32 i = 0; i < len; i++) { + auto kernelInfo = (*handle->kernelVec)[i]; + algoMap->setKernelThreadInfoToMap(kernelInfo.name, kernelInfo.gs, kernelInfo.ls); + } + return SUCCESS; +} + +#ifdef _DEBUG +inline EE gcl_run_kernel_profiling( + GCLHandle_t handle, Kernel kernel, U32 work_dim, U32 *gs, U32 *ls, CI8 *kernelName = NULL) +{ + std::string name = "unknown kernel"; + if (kernelName) { + name = kernelName; + } + std::ostringstream debugLog; + debugLog << "KERNEL>>> " << name << " runInfo: "; + double totalTime = 0; + double executeTime = 0; + U32 loop = 10; + for (U32 i = 0; i < loop; i++) { + double t; + CHECK_STATUS(enqueue_ndrange_kernel(handle->queue, kernel, work_dim, NULL, gs, ls, + handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); + CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &t)); + CHECK_STATUS(release_event(handle->eventObj)); + debugLog << "loop " << i << " executeTime = " << t << " us " << std::endl; + totalTime += t; + } + executeTime = totalTime / loop; + debugLog << "executeTime = " << executeTime << " us for " << loop << " times average"; + UNI_DEBUG_LOG("%s\n", debugLog.str().c_str()); + CHECK_STATUS(gcl_finish(handle)); + return SUCCESS; +} +#endif + +inline EE gcl_create_memory(GCLHandle_t handle, GCLMem_t gclMem) +{ + GCLMemDesc_t desc = &gclMem->desc; + switch (desc->memType) { + case GCL_MEM_BUF: { + CHECK_STATUS(create_buffer( + handle->context, desc->flags, desc->byteSize, desc->host_ptr, &gclMem->mem)); + break; + } + case GCL_MEM_IMG_1D: { + CHECK_STATUS(create_image1D(handle->context, desc->flags, &desc->imgFormat, + desc->stride[0], 0, desc->host_ptr, &gclMem->mem)); + break; + } + case GCL_MEM_IMG_2D: { + CHECK_STATUS(create_image2D(handle->context, desc->flags, &desc->imgFormat, + desc->stride[0], desc->stride[1], 0, desc->host_ptr, &gclMem->mem)); + break; + } + case GCL_MEM_IMG_3D: { + CHECK_STATUS( + create_image3D(handle->context, desc->flags, &desc->imgFormat, desc->stride[0], + desc->stride[1], desc->stride[2], 0, 0, desc->host_ptr, &gclMem->mem)); + break; + } + default: + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE gcl_trans_memory(GCLHandle_t handle, + void *src, + void *dst, + U32 *size, + GCLMemTransType type, + cl_bool blocking, + U32 *offset = NULL) +{ +#ifdef _DEBUG + std::string debug_info = "DATATRANS>>> "; +#endif + switch (type) { + case HOST_TO_DEVICE_BUF: { + U8 *hostPtr = (U8 *)src; + GCLMem_t gclMem = (GCLMem_t)dst; + U32 dstOff = (offset) ? offset[0] : 0; + CHECK_STATUS(enqueue_write_buffer(handle->queue, gclMem->mem, blocking, dstOff, *size, + hostPtr, handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); +#ifdef _DEBUG + debug_info += "enqueue_write_buffer runInfo: "; +#endif + break; + } + case HOST_TO_DEVICE_IMG: { + U8 *hostPtr = (U8 *)src; + GCLMem_t gclMem = (GCLMem_t)dst; + U32 origin[3] = {0, 0, 0}; + if (offset) { + origin[0] = offset[0]; + origin[1] = offset[1]; + origin[2] = offset[2]; + } + CHECK_STATUS(enqueue_write_image(handle->queue, gclMem->mem, blocking, origin, size, 0, + 0, hostPtr, handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); +#ifdef _DEBUG + debug_info += "enqueue_write_image runInfo: "; +#endif + break; + } + case DEVICE_BUF_TO_HOST: { + U8 *hostPtr = (U8 *)dst; + GCLMem_t gclMem = (GCLMem_t)src; + U32 srcOff = (offset) ? 
offset[0] : 0; + CHECK_STATUS(enqueue_read_buffer(handle->queue, gclMem->mem, blocking, srcOff, *size, + hostPtr, handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); +#ifdef _DEBUG + debug_info += "enqueue_read_buffer runInfo: "; +#endif + break; + } + case DEVICE_IMG_TO_HOST: { + U8 *hostPtr = (U8 *)dst; + GCLMem_t gclMem = (GCLMem_t)src; + U32 origin[3] = {0, 0, 0}; + if (offset) { + origin[0] = offset[0]; + origin[1] = offset[1]; + origin[2] = offset[2]; + } + CHECK_STATUS(enqueue_read_image(handle->queue, gclMem->mem, blocking, origin, size, 0, + 0, hostPtr, handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); +#ifdef _DEBUG + debug_info += "enqueue_read_image runInfo: "; +#endif + break; + } + case DEVICE_BUF_TO_BUF: { + GCLMem_t srcBuf = (GCLMem_t)src; + GCLMem_t dstBuf = (GCLMem_t)dst; + U32 srcOff = 0; + U32 dstOff = 0; + if (offset) { + srcOff = offset[0]; + dstOff = offset[1]; + } + CHECK_STATUS(enqueue_copy_buffer(handle->queue, srcBuf->mem, dstBuf->mem, srcOff, + dstOff, *size, handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); +#ifdef _DEBUG + debug_info += "enqueue_copy_buffer runInfo: "; +#endif + break; + } + case DEVICE_BUF_TO_IMG: { + GCLMem_t srcBuf = (GCLMem_t)src; + GCLMem_t dstImg = (GCLMem_t)dst; + U32 origin[3] = {0, 0, 0}; + U32 srcOff = 0; + if (offset) { + srcOff = offset[0]; + origin[0] = offset[1]; + origin[1] = offset[2]; + origin[2] = offset[3]; + } + CHECK_STATUS(enqueue_copy_buffer_to_image(handle->queue, srcBuf->mem, dstImg->mem, + srcOff, origin, size, handle->numWaitEvents, handle->waitEvents, handle->eventPtr)) +#ifdef _DEBUG + debug_info += "enqueue_copy_buffer_to_image runInfo: "; +#endif + break; + } + case DEVICE_IMG_TO_BUF: { + GCLMem_t srcImg = (GCLMem_t)src; + GCLMem_t dstBuf = (GCLMem_t)dst; + U32 origin[3] = {0, 0, 0}; + U32 dstOff = 0; + if (offset) { + origin[0] = offset[0]; + origin[1] = offset[1]; + origin[2] = offset[2]; + dstOff = offset[3]; + } + CHECK_STATUS(enqueue_copy_image_to_buffer(handle->queue, srcImg->mem, dstBuf->mem, + origin, size, dstOff, handle->numWaitEvents, handle->waitEvents, handle->eventPtr)) +#ifdef _DEBUG + debug_info += "enqueue_copy_image_to_buffer runInfo: "; +#endif + break; + } + case DEVICE_IMG_TO_IMG: { + return NOT_SUPPORTED; + break; + } + default: + return NOT_SUPPORTED; + } +#ifdef _DEBUG + double executeTime = 0; + CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &executeTime)); + CHECK_STATUS(release_event(handle->eventObj)); + UNI_DEBUG_LOG("%sexecuteTime = %f us\n", debug_info.c_str(), executeTime); + CHECK_STATUS(gcl_finish(handle)); +#endif + return SUCCESS; +} + +inline EE gcl_trans_buffer_rect(GCLHandle_t handle, + void *src, + void *dst, + U32 *host_org, + U32 *buf_org, + U32 *region, + U32 host_row_pitch, + U32 host_slice_pitch, + U32 buf_row_pitch, + U32 buf_slice_pitch, + GCLMemTransType type, + cl_bool blocking) +{ +#ifdef _DEBUG + std::string debug_info = "DATATRANS>>> "; +#endif + switch (type) { + case HOST_TO_DEVICE_BUF: { + GCLMem_t dstBuf = (GCLMem_t)dst; + CHECK_STATUS(enqueue_write_buffer_rect(handle->queue, dstBuf->mem, blocking, buf_org, + host_org, region, buf_row_pitch, buf_slice_pitch, host_row_pitch, host_slice_pitch, + src, handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); +#ifdef _DEBUG + debug_info = "enqueue_write_buffer_rect runInfo: "; +#endif + break; + } + case DEVICE_BUF_TO_HOST: { + return NOT_SUPPORTED; + break; + } + default: + return NOT_SUPPORTED; + } +#ifdef _DEBUG + double executeTime = 0; + 
CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &executeTime));
+    CHECK_STATUS(release_event(handle->eventObj));
+    UNI_DEBUG_LOG("%sexecuteTime = %f us\n", debug_info.c_str(), executeTime);
+    CHECK_STATUS(gcl_finish(handle));
+#endif
+    return SUCCESS;
+}
+
+inline EE gcl_map_memory(
+    GCLHandle_t handle, GCLMem_t gclMem, U32 *offset, U32 *size, cl_map_flags flags, cl_bool blocking)
+{
+#ifdef _DEBUG
+    std::string debug_info = "DATAMAP>>> ";
+#endif
+    if (gclMem->desc.memType == GCL_MEM_BUF) {
+        U8 *map_ptr;
+        CHECK_STATUS(enqueue_map_buffer(handle->queue, gclMem->mem, blocking, flags, *offset, *size,
+            handle->numWaitEvents, handle->waitEvents, handle->eventPtr, (void **)&map_ptr));
+        gclMem->mapPtrArray.push_back(map_ptr);
+#ifdef _DEBUG
+        debug_info += "enqueue_map_buffer runInfo: ";
+#endif
+    } else {
+        return NOT_SUPPORTED;
+    }
+#ifdef _DEBUG
+    double executeTime = 0;
+    CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &executeTime));
+    CHECK_STATUS(release_event(handle->eventObj));
+    UNI_DEBUG_LOG("%sexecuteTime = %f us\n", debug_info.c_str(), executeTime);
+    CHECK_STATUS(gcl_finish(handle));
+#endif
+    return SUCCESS;
+}
+
+inline EE gcl_fill_memory_zero(GCLHandle_t handle, GCLMem_t gclMem)
+{
+#ifdef _DEBUG
+    std::string debug_info = "FILLMEM>>> ";
+#endif
+    if (gclMem->desc.memType == GCL_MEM_BUF) {
+#ifdef _DEBUG
+        debug_info += "enqueue_fill_buffer runInfo: ";
+#endif
+        U8 pat_val = 0;
+        CHECK_STATUS(enqueue_fill_buffer(handle->queue, gclMem->mem, &pat_val, sizeof(pat_val), 0,
+            gclMem->desc.byteSize, handle->numWaitEvents, handle->waitEvents, handle->eventPtr));
+    } else {
+#ifdef _DEBUG
+        debug_info += "enqueue_fill_image runInfo: ";
+#endif
+        F32 color[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+        U32 origin[3] = {0, 0, 0};
+        U32 region[3];
+        region[0] = gclMem->desc.stride[0];
+        region[1] = gclMem->desc.stride[1];
+        region[2] = gclMem->desc.stride[2];
+        CHECK_STATUS(enqueue_fill_image(handle->queue, gclMem->mem, color, origin, region,
+            handle->numWaitEvents, handle->waitEvents, handle->eventPtr));
+    }
+#ifdef _DEBUG
+    double executeTime = 0;
+    CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &executeTime));
+    CHECK_STATUS(release_event(handle->eventObj));
+    UNI_DEBUG_LOG("%sexecuteTime = %f us\n", debug_info.c_str(), executeTime);
+    CHECK_STATUS(gcl_finish(handle));
+#endif
+    return SUCCESS;
+}
+
+inline EE gcl_get_mem_size(GCLMem_t gclMem, U32 *size)
+{
+    CHECK_STATUS(get_memory_size(gclMem->mem, size));
+    return SUCCESS;
+}
+
+inline EE gcl_create_sub_buffer(U32 size, U32 *offset, GCLMem_t src, Mem *subbuf)
+{
+    CHECK_STATUS(create_sub_buffer(src->mem, CL_MEM_READ_WRITE, *offset, size, subbuf));
+    src->subMem.push_back(*subbuf);
+    *offset += (size + 1023) / 1024 * 1024;  // keep sub-buffer offsets 1 KB aligned
+    return SUCCESS;
+}
+#ifdef __cplusplus
+}
+#endif
+template <typename Tuple, U32 N>
+struct DummyWrapper {
+    static void set_kernel_arg_wrapper(Kernel kernel, const Tuple &t)
+    {
+        DummyWrapper<Tuple, N - 1>::set_kernel_arg_wrapper(kernel, t);
+        auto arg = std::get<N - 1>(t);
+        set_kernel_arg(kernel, N - 1, sizeof(arg), (void *)&arg);
+    }
+};
+
+template <typename Tuple>
+struct DummyWrapper<Tuple, 0> {
+    static void set_kernel_arg_wrapper(Kernel kernel, const Tuple &t)
+    {
+        UNUSED(kernel);
+        UNUSED(t);
+    }
+};
+
+template <typename... Args>
+inline EE gcl_set_kernelArgs(Kernel kernel, Args... args)
+{
+    std::tuple<Args...> t = std::make_tuple(args...);
+    DummyWrapper<decltype(t), sizeof...(Args)>::set_kernel_arg_wrapper(kernel, t);
+    return SUCCESS;
+}
+
+inline std::string gclMemDesc2Str(GCLMemDesc desc)
+{
+    char buff[128];
+    snprintf(buff, sizeof(buff), "memFormat: %d, ", desc.memFormat);
+    std::string descStr = buff;
+    descStr += "stride(";
+    for (U32 i = 0; i < 3; i++) {
+        descStr += std::to_string(desc.stride[i]);
+        if (i < 2) {
+            descStr += ",";
+        }
+    }
+    descStr += "), ";
+    descStr += "offset(";
+    for (U32 i = 0; i < 3; i++) {
+        descStr += std::to_string(desc.offset[i]);
+        if (i < 2) {
+            descStr += ",";
+        }
+    }
+    descStr += ")";
+    return descStr;
+}
+#ifdef _DEBUG
+template <typename T>
+inline EE gcl_print_memory(GCLHandle_t handle, GCLMem_t gclMem, CI8 *gclMemName = NULL)
+{
+    UNUSED(handle);
+    UNUSED(gclMem);
+    UNUSED(gclMemName);
+    return SUCCESS;
+}
+
+template <typename T>
+inline EE gcl_print_buffer(GCLHandle_t handle, Mem buf, U32 num, CI8 *bufferName = NULL)
+{
+    UNUSED(handle);
+    UNUSED(buf);
+    UNUSED(num);
+    UNUSED(bufferName);
+    return SUCCESS;
+}
+
+template <typename T>
+inline EE gcl_check_buf(GCLHandle_t handle, Mem buf, U32 size, bool write2bin, CI8 *dataName = NULL)
+{
+    U32 num = size / sizeof(T);
+    U8 *hostPtr = new U8[size];
+    F32 *hostPtrTran = new F32[num];
+    CHECK_STATUS(enqueue_read_buffer(handle->queue, buf, CL_TRUE, 0, size, hostPtr,
+        handle->numWaitEvents, handle->waitEvents, handle->eventPtr));
+    T *val = (T *)hostPtr;
+    for (U32 i = 0; i < num; i++) {
+        hostPtrTran[i] = (F32)val[i];
+    }
+
+    if (write2bin) {
+        FILE *outfile;
+        if (!dataName) {
+            dataName = "unknown";
+        }
+        std::string fileName = dataName;
+        replace(fileName.begin(), fileName.end(), '/', '_');
+        replace(fileName.begin(), fileName.end(), '.', '_');
+        replace(fileName.begin(), fileName.end(), ' ', '_');
+        fileName += "_gpu";
+        fileName += ".out";
+        outfile = fopen(fileName.c_str(), "wb");
+        if (outfile == NULL) {
+            UNI_DEBUG_LOG("warning: fopen outfile %s failed\n", fileName.c_str());
+            delete[] hostPtr;
+            delete[] hostPtrTran;
+            return SUCCESS;
+        }
+        fwrite(hostPtrTran, sizeof(float), num, outfile);
+        fclose(outfile);
+    } else {
+        //U32 len = (num > 64) ?
64 : num; + U32 len = num; + std::string line = "GPU result : "; + for (U32 i = 0; i < len; i++) { + if (i % 8 == 0) { + line = line + "\n\t"; + } + line = line + std::to_string(hostPtrTran[i]) + " "; + } + UNI_DEBUG_LOG("%s\n", line.c_str()); + } + delete[] hostPtr; + delete[] hostPtrTran; + return SUCCESS; +} +template +inline std::string gcl_check_data(GCLHandle_t handle, + GCLMemDesc memDesc, + void *ptr, + U32 len, + U32 ptrType, + bool write2bin, + CI8 *dataName = NULL) +{ + /*ptrType: + * GPU: 0 + * CPU: 1 + */ + DataFormat tdf; + DataType tdt; + U32 tn, tc, th, tw; + U32 dims; + tn = 1; + tc = 1; + th = 1; + tw = 1; + dims = memDesc.nDims; + tdt = memDesc.dt; + tdf = memDesc.df; + tw = memDesc.dims[0]; + if (dims > 1) { + th = memDesc.dims[1]; + } + if (dims > 2) { + tc = memDesc.dims[2]; + } + if (dims > 3) { + tn = memDesc.dims[3]; + } + if (dims > 4) { + CHECK_STATUS(NOT_SUPPORTED); + } + U32 num = tn * tc * th * tw; + F32 *hostPtrTran = new F32[num]; + if (!dataName) { + dataName = "unknow"; + } + + if (ptrType == 0) { + GCLMem_t mem = (GCLMem_t)ptr; + GCLMemDesc desc = memDesc; + GCLMemType type = desc.memType; + DataFormat df = desc.memFormat; + U8 *hostPtr = nullptr; + U32 s0 = desc.stride[0]; + U32 s1 = desc.stride[1]; + U32 off0 = desc.offset[0]; + U32 off1 = desc.offset[1]; + U32 byteSize = desc.byteSize; + hostPtr = new U8[(size_t)byteSize]; + + GCLMemTransType tranType = DEVICE_BUF_TO_HOST; + U32 size[3] = {byteSize, 1, 1}; + if (type == GCL_MEM_IMG_1D) { + tranType = DEVICE_IMG_TO_HOST; + size[0] = s0; + } + gcl_trans_memory(handle, (void *)mem, (void *)hostPtr, size, tranType, CL_TRUE); + + T *val = (T *)hostPtr; + if (df == DF_NCWHC4) { + if (tdf == DF_NCHW) { + for (U32 i = 0; i < num; i++) { + U32 iw = i % tw; + U32 ih = (i / tw) % th; + U32 ic = i / (tw * th); + hostPtrTran[i] = + (float)(val[((ic / 4) * s1 + iw + off1) * s0 * 4 + (ih + off0) * 4 + (ic & 3)]); + } + } + if (tdf == DF_MKT) { + for (U32 i = 0; i < num; i++) { + U32 ih = i % tw; + U32 ic = i / tw; + U32 in_off = ((ic / 4) * s1 + off1) * s0 * 4 + (ih + off0) * 4 + (ic & 3); + hostPtrTran[i] = (float)val[in_off]; + } + } + } else if (df == DF_NCHW || df == DF_NHWC) { + for (U32 i = 0; i < num; i++) { + U32 iw = i % tw; + U32 ih = (i / tw) % th; + U32 ic = i / (tw * th); + hostPtrTran[i] = (float)(val[(ic * s1 + ih + off1) * s0 + (iw + off0)]); + } + } else if (df == DF_NORMAL) { + for (U32 i = 0; i < num; i++) { + hostPtrTran[i] = (float)val[i]; + } + } else { + UNI_DEBUG_LOG( + "warning write GPU memory %s to bin, format not support: %d\n", dataName, (int)df); + delete[] hostPtrTran; + delete[] hostPtr; + } + delete[] hostPtr; + } + + if (ptrType == 1) { + T *val = (T *)ptr; + if (tdf == DF_NCHWC8) { + for (U32 i = 0; i < num; i++) { + U32 iw = i % tw; + U32 ih = (i / tw) % th; + U32 ic = i / (tw * th); + hostPtrTran[i] = (float)(val[((ic / 8) * th + ih) * tw * 8 + iw * 8 + (ic & 7)]); + } + } else if (tdf == DF_NORMAL || tdf == DF_NCHW) { + for (U32 i = 0; i < num; i++) { + hostPtrTran[i] = (float)(val[i]); + } + } else if (tdf == DF_MTK) { + for (U32 i = 0; i < num; i++) { + U32 it = i % th; + U32 ik = i / th; + U32 in_off = it * tw + ik; + hostPtrTran[i] = (float)(val[in_off]); //write as MKT, for compare with gpu + } + } else { + UNI_DEBUG_LOG( + "warning write GPU memory %s to bin, format not support: %d\n", dataName, (int)tdf); + delete[] hostPtrTran; + } + } + if (write2bin) { + FILE *outfile; + std::string fileName = dataName; + replace(fileName.begin(), fileName.end(), '/', '_'); + 
replace(fileName.begin(), fileName.end(), '.', '_');
+        replace(fileName.begin(), fileName.end(), ' ', '_');
+        if (ptrType == 0) {
+            fileName += "_gpu";
+        }
+        if (ptrType == 1) {
+            fileName += "_cpu";
+        }
+        fileName += ".out";
+
+        outfile = fopen(fileName.c_str(), "wb");
+        if (outfile == NULL) {
+            UNI_DEBUG_LOG("warning: fopen outfile %s failed\n", fileName.c_str());
+            delete[] hostPtrTran;
+            return "";  // bail out instead of writing through a NULL FILE pointer
+        }
+        fwrite(hostPtrTran, sizeof(float), num, outfile);
+        fclose(outfile);
+    }
+    std::string line = "GPU result nchw: ";
+    if (len > num) {
+        len = num;
+    }
+    for (U32 i = 0; i < len; i++) {
+        if (i % 8 == 0) {
+            line = line + "\n\t";
+        }
+        line = line + std::to_string(hostPtrTran[i]) + " ";
+    }
+    delete[] hostPtrTran;
+    return line;
+}
+#endif
+#endif
diff --git a/common/gcl/include/gcl_kernel_binmap.h b/common/gcl/include/gcl_kernel_binmap.h
new file mode 100644
index 00000000..6f95158c
--- /dev/null
+++ b/common/gcl/include/gcl_kernel_binmap.h
@@ -0,0 +1,148 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
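The registry declared below is what gcl_regist_binMap in gcl_func.h consumes: it dlopens lib<deviceName>_map.so and dlsyms create_<deviceName>_kernelbin_map. As a rough sketch only, a generated map source compiled into such a library might look like the following; the device name Mali_G76p14, the kernel key, and the placeholder binary bytes are hypothetical stand-ins for what the kernel-library build tooling emits.

// Hypothetical generated source for lib<deviceName>_map.so (sketch, not from the patch).
#include "gcl_kernel_binmap.h"

class Mali_G76p14 : public gcl_kernel_binmap {
public:
    Mali_G76p14()
    {
        // One entry per precompiled kernel, keyed "<deviceName>_<kernelName>".
        static const unsigned char fill_zero_bin[] = {0x00};  // placeholder bytes
        put("Mali_G76p14_fill_memory_zero_vec4_f16",
            {fill_zero_bin, (unsigned int)sizeof(fill_zero_bin)});
    }
};
REGISTER_GCLKERNELMAP(Mali_G76p14)

// The factory symbol that gcl_regist_binMap resolves via dlsym.
extern "C" gcl_kernel_binmap *create_Mali_G76p14_kernelbin_map()
{
    return new Mali_G76p14();
}

REGISTER_GCLKERNELMAP (defined at the end of this header) additionally registers the same class with the in-process factory, so both loading paths share one definition.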
+
+#ifndef GCL_KERNELBIN_MAP
+#define GCL_KERNELBIN_MAP
+
+#include "gcl_kernel_type.h"
+#include <unordered_map>
+#include <mutex>
+#include <memory>
+#include <string>
+typedef GCLKernelBin KernelBin;
+
+class gcl_kernel_binmap {
+public:
+    gcl_kernel_binmap()
+    {}
+    std::unordered_map<std::string, KernelBin> &binMap()
+    {
+        return binMap_;
+    }
+
+    void put(std::string kernelname, KernelBin kernelbin)
+    {
+        std::lock_guard<std::mutex> lock(mtx_);
+        auto it = binMap_.find(kernelname);
+        if (it == binMap_.end()) {
+            binMap_.insert({kernelname, kernelbin});
+        }
+    }
+
+    bool get(std::string kernelname, KernelBin **kernelbin_ptr)
+    {
+        std::lock_guard<std::mutex> lock(mtx_);
+        auto it = binMap_.find(kernelname);
+        if (it == binMap_.end()) {
+            return false;
+        }
+        *kernelbin_ptr = &it->second;
+        return true;
+    }
+
+private:
+    std::unordered_map<std::string, KernelBin> binMap_;
+    std::mutex mtx_;
+};
+
+class gcl_kernel_binmap_container {
+public:
+    static gcl_kernel_binmap_container *instance()
+    {
+        static gcl_kernel_binmap_container sInst;
+        return &sInst;
+    }
+    void put(std::string kernel_binmap_name, std::unique_ptr<gcl_kernel_binmap> kernel_binmap)
+    {
+        std::lock_guard<std::mutex> lock(mtx_);
+        auto it = kernel_binmap_container_.find(kernel_binmap_name);
+        if (it == kernel_binmap_container_.end()) {
+            kernel_binmap_container_.insert(
+                std::make_pair(kernel_binmap_name, std::move(kernel_binmap)));
+        }
+    }
+    bool get(std::string kernel_binmap_name, gcl_kernel_binmap **kernel_binmap_ptr)
+    {
+        std::lock_guard<std::mutex> lock(mtx_);
+        auto it = kernel_binmap_container_.find(kernel_binmap_name);
+        if (it == kernel_binmap_container_.end()) {
+            return false;
+        }
+        *kernel_binmap_ptr = it->second.get();
+        return true;
+    }
+
+private:
+    gcl_kernel_binmap_container()
+    {}
+    std::unordered_map<std::string, std::unique_ptr<gcl_kernel_binmap>> kernel_binmap_container_;
+    std::mutex mtx_;
+};
+
+class gcl_kernel_binmap_factory {
+public:
+    static gcl_kernel_binmap_factory *instance()
+    {
+        static gcl_kernel_binmap_factory sInst;
+        return &sInst;
+    }
+    typedef gcl_kernel_binmap *(*PFN_GCLKERNELMAP_CREATOR)();
+    void register_gcl_kernel_binmap(
+        const std::string &kernel_binmap_name, PFN_GCLKERNELMAP_CREATOR pfnCreator)
+    {
+        std::lock_guard<std::mutex> lock(mtx_);
+        auto it = creators_.find(kernel_binmap_name);
+        if (it == creators_.end()) {
+            creators_.insert({kernel_binmap_name, pfnCreator});
+        }
+    }
+    bool create_gcl_kernel_binmap(const std::string &kernel_binmap_name)
+    {
+        std::lock_guard<std::mutex> lock(mtx_);
+        auto it = creators_.find(kernel_binmap_name);
+        if (it == creators_.end()) {
+            printf("the kernel_binmap creator %s doesn't exist in kernel_binmap factory\n",
+                kernel_binmap_name.c_str());
+            return false;
+        }
+        PFN_GCLKERNELMAP_CREATOR pfn = it->second;
+        gcl_kernel_binmap_container::instance()->put(
+            kernel_binmap_name, std::unique_ptr<gcl_kernel_binmap>(pfn()));
+        return true;
+    }
+
+private:
+    gcl_kernel_binmap_factory()
+    {}
+    std::unordered_map<std::string, PFN_GCLKERNELMAP_CREATOR> creators_;
+    std::mutex mtx_;
+};
+
+#define REGISTER_GCLKERNELMAP_CREATOR_IMPL(kernel_binmap_name)                         \
+    namespace {                                                                        \
+    static gcl_kernel_binmap *kernel_binmap_name##_gcl_kernel_binmap_pfn()             \
+    {                                                                                  \
+        return new kernel_binmap_name();                                               \
+    }                                                                                  \
+    class kernel_binmap_name##_gcl_kernel_binmap_loader {                              \
+    public:                                                                            \
+        kernel_binmap_name##_gcl_kernel_binmap_loader()                                \
+        {                                                                              \
+            gcl_kernel_binmap_factory::instance()->register_gcl_kernel_binmap(         \
+                #kernel_binmap_name, kernel_binmap_name##_gcl_kernel_binmap_pfn);      \
+        }                                                                              \
+    };                                                                                 \
+    static kernel_binmap_name##_gcl_kernel_binmap_loader kernel_binmap_name##_sLoader; \
+    }
+
+#define REGISTER_GCLKERNELMAP(kernel_binmap_name) \
+    REGISTER_GCLKERNELMAP_CREATOR_IMPL(kernel_binmap_name)
+#endif
diff --git a/common/gcl/include/gcl_kernel_source.h b/common/gcl/include/gcl_kernel_source.h
new file mode 100644
index 00000000..4284d9f5
--- /dev/null
+++ b/common/gcl/include/gcl_kernel_source.h
@@ -0,0 +1,85 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef GCL_KERNEL_SOURCE
+#define GCL_KERNEL_SOURCE
+
+#include "gcl_kernel_type.h"
+#include "error.h"
+
+#include <unordered_map>
+#include <string>
+typedef GCLKernelSource KernelSource;
+typedef GCLKernelOption KernelOption;
+
+class gcl_kernel_source {
+public:
+    gcl_kernel_source()
+    {
+        UNI_DEBUG_LOG("gcl_kernel_source %p constructor\n", (char *)this);
+    }
+    ~gcl_kernel_source()
+    {
+        UNI_DEBUG_LOG("gcl_kernel_source %p destructor\n", (char *)this);
+    }
+
+    std::unordered_map<std::string, KernelSource> &kernelSourceMap()
+    {
+        return kernelSourceMap_;
+    }
+    std::unordered_map<std::string, KernelOption> &kernelOptionMap()
+    {
+        return kernelOptionMap_;
+    }
+
+    void put_source(std::string kernelname, KernelSource kernelSource)
+    {
+        auto it = kernelSourceMap_.find(kernelname);
+        if (it == kernelSourceMap_.end()) {
+            kernelSourceMap_.insert({kernelname, kernelSource});
+        }
+    }
+
+    bool get_source(std::string kernelname, KernelSource **kernelSource_ptr)
+    {
+        auto it = kernelSourceMap_.find(kernelname);
+        if (it == kernelSourceMap_.end()) {
+            return false;
+        }
+        *kernelSource_ptr = &it->second;
+        return true;
+    }
+
+    void put_option(std::string kernelname, KernelOption kernelOption)
+    {
+        auto it = kernelOptionMap_.find(kernelname);
+        if (it == kernelOptionMap_.end()) {
+            kernelOptionMap_.insert({kernelname, kernelOption});
+        }
+    }
+
+    bool get_option(std::string kernelname, KernelOption **kernelOption_ptr)
+    {
+        auto it = kernelOptionMap_.find(kernelname);
+        if (it == kernelOptionMap_.end()) {
+            return false;
+        }
+        *kernelOption_ptr = &it->second;
+        return true;
+    }
+
+private:
+    std::unordered_map<std::string, KernelSource> kernelSourceMap_;
+    std::unordered_map<std::string, KernelOption> kernelOptionMap_;
+};
+#endif
diff --git a/common/gcl/include/gcl_kernel_type.h b/common/gcl/include/gcl_kernel_type.h
new file mode 100644
index 00000000..6979e004
--- /dev/null
+++ b/common/gcl/include/gcl_kernel_type.h
@@ -0,0 +1,20 @@
+#ifndef H_GCL_KERNEL_TYPE_H
+#define H_GCL_KERNEL_TYPE_H
+
+struct GCLKernelBin {
+    const unsigned char *data;
+    const unsigned int len;
+};
+
+struct GCLKernelSource {
+    const char *data;
+    const unsigned int len;
+    bool use_kernel_def_head;
+};
+
+struct GCLKernelOption {
+    const char *option;
+    const char *sourceName;
+    bool use_common_opt;
+};
+#endif
diff --git a/common/gcl/include/gclmem_desc_infer.h
b/common/gcl/include/gclmem_desc_infer.h new file mode 100644 index 00000000..4c646e37 --- /dev/null +++ b/common/gcl/include/gclmem_desc_infer.h @@ -0,0 +1,713 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _GCLMEM_DESC_INFER +#define _GCLMEM_DESC_INFER +#include +#include +#include "gcl_func.h" + +inline EE infer_gclmem_desc_nchwc3_to_nchw(U32 iw, + U32 ih, + U32 ic, + U32 pw, + U32 ph, + U32 ow, + U32 oh, + U32 oc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc, + bool need_pad = false) +{ + /*Intend to deprecate this API*/ + if (gclmemInputDesc == nullptr || gclmemOutputDesc == nullptr) { + return NULL_POINTER; + } + U32 s0, s1, s2; + s0 = ow; + s1 = oh; + s2 = oc; + + U32 num, byteSize; + num = s0 * s1 * s2; + byteSize = num * bytesOf(DT_F16); + gclmemOutputDesc->stride[0] = s0; + gclmemOutputDesc->stride[1] = s1; + gclmemOutputDesc->stride[2] = s2; + gclmemOutputDesc->offset[0] = 0; + gclmemOutputDesc->offset[1] = 0; + gclmemOutputDesc->offset[2] = 0; + gclmemOutputDesc->num = num; + gclmemOutputDesc->byteSize = byteSize; + + U32 pw_org, ph_org; + U32 s0_org, s1_org, s2_org; + U32 byteSize_org; + + s0_org = gclmemInputDesc->stride[0]; + s1_org = gclmemInputDesc->stride[1]; + s2_org = gclmemInputDesc->stride[2]; + pw_org = gclmemInputDesc->offset[0]; + ph_org = gclmemInputDesc->offset[1]; + byteSize_org = gclmemInputDesc->byteSize; + bool need_pad_org = gclmemInputDesc->need_pad; + if (byteSize_org != 0 && gclmemInputDesc->memFormat != DF_NCHWC3) { + return NOT_SUPPORTED; + } + + pw = (pw > pw_org) ? pw : pw_org; + ph = (ph > ph_org) ? ph : ph_org; + + s0 = iw + (pw << 1); + s1 = ih + (ph << 1); + s2 = (ic + 2) / 3; + s0 = (s0 > s0_org) ? s0 : s0_org; + s1 = (s1 > s1_org) ? s1 : s1_org; + s2 = (s2 > s2_org) ? s2 : s2_org; + + num = s0 * s1 * s2 * 3; + byteSize = num * bytesOf(DT_F16); + byteSize = (byteSize > byteSize_org) ? 
byteSize : byteSize_org; + + gclmemInputDesc->stride[0] = s0; + gclmemInputDesc->stride[1] = s1; + gclmemInputDesc->stride[2] = s2; + gclmemInputDesc->offset[0] = pw; + gclmemInputDesc->offset[1] = ph; + gclmemInputDesc->offset[2] = 0; + gclmemInputDesc->num = num; + gclmemInputDesc->byteSize = byteSize; + + gclmemInputDesc->memType = GCL_MEM_BUF; + gclmemInputDesc->memFormat = DF_NCHWC3; + gclmemInputDesc->flags = CL_MEM_READ_WRITE; + gclmemInputDesc->host_ptr = NULL; + gclmemOutputDesc->memType = GCL_MEM_BUF; + gclmemOutputDesc->memFormat = DF_NCHW; + gclmemOutputDesc->flags = CL_MEM_READ_WRITE; + gclmemOutputDesc->host_ptr = NULL; + gclmemOutputDesc->need_pad = need_pad | need_pad_org; + return SUCCESS; +} + +inline EE trans_gclmem_desc_nchw_ncwhc4( + U32 iw, U32 ih, U32 ic, U32 pw, U32 ph, DataType dt, GCLMemDesc_t gclmemDesc, bool need_pad = false) +{ + U32 s0, s1, s2; + U32 num, byteSize; + U32 pw_org, ph_org; + U32 s0_org, s1_org, s2_org; + U32 byteSize_org; + + if (gclmemDesc) { + if (gclmemDesc->memFormat != DF_NCHW) { + return NOT_SUPPORTED; + } + s0_org = gclmemDesc->stride[1]; + s1_org = gclmemDesc->stride[0]; + s2_org = gclmemDesc->stride[2]; + ph_org = gclmemDesc->offset[1]; + pw_org = gclmemDesc->offset[0]; + if (pw_org == 0 && ph_org == 0) { + if (s2_org == 1 && (s0_org == 1 || s1_org == 1)) { + s2_org = (s0_org == 1) ? s1_org : s0_org; + s0_org = 1; + s1_org = 1; + } + } + s2_org = (s2_org + 3) / 4; + byteSize_org = gclmemDesc->byteSize; + bool need_pad_org = gclmemDesc->need_pad; + if (pw == 0 && ph == 0) { + if (ic == 1 && (iw == 1 || ih == 1)) { + ic = (iw == 1) ? ih : iw; + iw = 1; + ih = 1; + } + } + ph = (ph > ph_org) ? ph : ph_org; + pw = (pw > pw_org) ? pw : pw_org; + + s0 = ih + (ph << 1); + s1 = iw + (pw << 1); + s2 = (ic + 3) / 4; + s0 = (s0 > s0_org) ? s0 : s0_org; + s1 = (s1 > s1_org) ? s1 : s1_org; + s2 = (s2 > s2_org) ? s2 : s2_org; + num = s0 * s1 * s2 * 4; + byteSize = num * bytesOf(dt); + byteSize = (byteSize > byteSize_org) ? 
byteSize : byteSize_org; + + gclmemDesc->stride[0] = s0; + gclmemDesc->stride[1] = s1; + gclmemDesc->stride[2] = s2; + gclmemDesc->offset[0] = ph; + gclmemDesc->offset[1] = pw; + gclmemDesc->offset[2] = 0; + gclmemDesc->num = num; + gclmemDesc->byteSize = byteSize; + gclmemDesc->memType = GCL_MEM_BUF; + gclmemDesc->memFormat = DF_NCWHC4; + gclmemDesc->flags = CL_MEM_READ_WRITE; + gclmemDesc->host_ptr = NULL; + gclmemDesc->need_pad = need_pad | need_pad_org; + } + return SUCCESS; +} +inline EE infer_gclmem_desc_ncwhc4(U32 iw, + U32 ih, + U32 ic, + U32 pw, + U32 ph, + U32 ow, + U32 oh, + U32 oc, + DataType idt, + DataType odt, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc, + bool need_pad = false) +{ + U32 s0, s1, s2; + U32 num, byteSize; + U32 pw_org, ph_org; + U32 s0_org, s1_org, s2_org; + U32 byteSize_org; + if (gclmemOutputDesc) { + s0 = oh; + s1 = ow; + s2 = (oc + 3) / 4; + num = s0 * s1 * s2 * 4; + byteSize = num * bytesOf(odt); + gclmemOutputDesc->stride[0] = s0; + gclmemOutputDesc->stride[1] = s1; + gclmemOutputDesc->stride[2] = s2; + gclmemOutputDesc->offset[0] = 0; + gclmemOutputDesc->offset[1] = 0; + gclmemOutputDesc->offset[2] = 0; + gclmemOutputDesc->num = num; + gclmemOutputDesc->byteSize = byteSize; + gclmemOutputDesc->memType = GCL_MEM_BUF; + gclmemOutputDesc->memFormat = DF_NCWHC4; + gclmemOutputDesc->flags = CL_MEM_READ_WRITE; + gclmemOutputDesc->host_ptr = NULL; + } + + if (gclmemInputDesc) { + byteSize_org = gclmemInputDesc->byteSize; + if (byteSize_org != 0 && gclmemInputDesc->memFormat != DF_NCWHC4) { + return trans_gclmem_desc_nchw_ncwhc4(iw, ih, ic, pw, ph, idt, gclmemInputDesc, need_pad); + } + s0_org = gclmemInputDesc->stride[0]; + s1_org = gclmemInputDesc->stride[1]; + s2_org = gclmemInputDesc->stride[2]; + ph_org = gclmemInputDesc->offset[0]; + pw_org = gclmemInputDesc->offset[1]; + if (pw_org == 0 && ph_org == 0) { + if (s2_org == 1 && (s0_org == 1 || s1_org == 1)) { + s2_org = (s0_org == 1) ? s1_org : s0_org; + s0_org = 1; + s1_org = 1; + } + } + bool need_pad_org = gclmemInputDesc->need_pad; + if (pw == 0 && ph == 0) { + if (ic == 1 && (iw == 1 || ih == 1)) { + ic = (iw == 1) ? ih : iw; + iw = 1; + ih = 1; + } + } + + ph = (ph > ph_org) ? ph : ph_org; + pw = (pw > pw_org) ? pw : pw_org; + + s0 = ih + (ph << 1); + s1 = iw + (pw << 1); + s2 = (ic + 3) / 4; + s0 = (s0 > s0_org) ? s0 : s0_org; + s1 = (s1 > s1_org) ? s1 : s1_org; + s2 = (s2 > s2_org) ? s2 : s2_org; + num = s0 * s1 * s2 * 4; + byteSize = num * bytesOf(idt); + byteSize = (byteSize > byteSize_org) ? 
byteSize : byteSize_org; + + gclmemInputDesc->stride[0] = s0; + gclmemInputDesc->stride[1] = s1; + gclmemInputDesc->stride[2] = s2; + gclmemInputDesc->offset[0] = ph; + gclmemInputDesc->offset[1] = pw; + gclmemInputDesc->offset[2] = 0; + gclmemInputDesc->num = num; + gclmemInputDesc->byteSize = byteSize; + gclmemInputDesc->memType = GCL_MEM_BUF; + gclmemInputDesc->memFormat = DF_NCWHC4; + gclmemInputDesc->flags = CL_MEM_READ_WRITE; + gclmemInputDesc->host_ptr = NULL; + gclmemInputDesc->need_pad = need_pad | need_pad_org; + } + return SUCCESS; +} + +inline EE infer_gclmem_desc_nhwc(U32 iw, + U32 ih, + U32 ic, + U32 pc, + U32 pw, + U32 ow, + U32 oh, + U32 oc, + DataType idt, + DataType odt, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc, + bool need_pad = false) +{ + U32 s0, s1, s2; + U32 num, byteSize; + U32 pc_org, pw_org; + U32 s0_org, s1_org, s2_org; + U32 byteSize_org; + + if (gclmemOutputDesc) { + s0 = oc; + s1 = ow; + s2 = oh; + num = s0 * s1 * s2; + byteSize = num * bytesOf(odt); + gclmemOutputDesc->stride[0] = s0; + gclmemOutputDesc->stride[1] = s1; + gclmemOutputDesc->stride[2] = s2; + gclmemOutputDesc->offset[0] = 0; + gclmemOutputDesc->offset[1] = 0; + gclmemOutputDesc->offset[2] = 0; + gclmemOutputDesc->num = num; + gclmemOutputDesc->byteSize = byteSize; + gclmemOutputDesc->memType = GCL_MEM_BUF; + gclmemOutputDesc->memFormat = DF_NHWC; + gclmemOutputDesc->flags = CL_MEM_READ_WRITE; + gclmemOutputDesc->host_ptr = NULL; + } + + if (gclmemInputDesc) { + s0_org = gclmemInputDesc->stride[0]; + s1_org = gclmemInputDesc->stride[1]; + s2_org = gclmemInputDesc->stride[2]; + pc_org = gclmemInputDesc->offset[0]; + pw_org = gclmemInputDesc->offset[1]; + byteSize_org = gclmemInputDesc->byteSize; + bool need_pad_org = gclmemInputDesc->need_pad; + if (byteSize_org != 0 && gclmemInputDesc->memFormat != DF_NHWC) { + return NOT_SUPPORTED; + } + + pc = (pc > pc_org) ? pc : pc_org; + pw = (pw > pw_org) ? pw : pw_org; + s0 = ic + (pc << 1); + s1 = iw + (pw << 1); + s2 = ih; + s0 = (s0 > s0_org) ? s0 : s0_org; + s1 = (s1 > s1_org) ? s1 : s1_org; + s2 = (s2 > s2_org) ? s2 : s2_org; + + num = s0 * s1 * s2; + byteSize = num * bytesOf(idt); + byteSize = (byteSize > byteSize_org) ? 
byteSize : byteSize_org; + gclmemInputDesc->stride[0] = s0; + gclmemInputDesc->stride[1] = s1; + gclmemInputDesc->stride[2] = s2; + gclmemInputDesc->offset[0] = pc; + gclmemInputDesc->offset[1] = pw; + gclmemInputDesc->offset[2] = 0; + gclmemInputDesc->num = num; + gclmemInputDesc->byteSize = byteSize; + gclmemInputDesc->memType = GCL_MEM_BUF; + gclmemInputDesc->memFormat = DF_NHWC; + gclmemInputDesc->flags = CL_MEM_READ_WRITE; + gclmemInputDesc->host_ptr = NULL; + gclmemInputDesc->need_pad = need_pad | need_pad_org; + } + return SUCCESS; +} + +inline EE infer_gclmem_desc_nchw(U32 iw, + U32 ih, + U32 ic, + U32 pw, + U32 ph, + U32 ow, + U32 oh, + U32 oc, + DataType idt, + DataType odt, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc, + bool need_pad = false) +{ + U32 s0, s1, s2; + U32 num, byteSize; + U32 pw_org, ph_org; + U32 s0_org, s1_org, s2_org; + U32 byteSize_org; + + if (gclmemOutputDesc) { + s0 = ow; + s1 = oh; + s2 = oc; + num = s0 * s1 * s2; + byteSize = num * bytesOf(odt); + gclmemOutputDesc->stride[0] = s0; + gclmemOutputDesc->stride[1] = s1; + gclmemOutputDesc->stride[2] = s2; + gclmemOutputDesc->offset[0] = 0; + gclmemOutputDesc->offset[1] = 0; + gclmemOutputDesc->offset[2] = 0; + gclmemOutputDesc->num = num; + gclmemOutputDesc->byteSize = byteSize; + gclmemOutputDesc->memType = GCL_MEM_BUF; + gclmemOutputDesc->memFormat = DF_NCHW; + gclmemOutputDesc->flags = CL_MEM_READ_WRITE; + gclmemOutputDesc->host_ptr = NULL; + } + + if (gclmemInputDesc) { + s0_org = gclmemInputDesc->stride[0]; + s1_org = gclmemInputDesc->stride[1]; + s2_org = gclmemInputDesc->stride[2]; + pw_org = gclmemInputDesc->offset[0]; + ph_org = gclmemInputDesc->offset[1]; + byteSize_org = gclmemInputDesc->byteSize; + bool need_pad_org = gclmemInputDesc->need_pad; + if (byteSize_org != 0 && gclmemInputDesc->memFormat != DF_NCHW) { + return NOT_SUPPORTED; + } + + pw = (pw > pw_org) ? pw : pw_org; + ph = (ph > ph_org) ? ph : ph_org; + s0 = iw + (pw << 1); + s1 = ih + (ph << 1); + s2 = ic; + s0 = (s0 > s0_org) ? s0 : s0_org; + s1 = (s1 > s1_org) ? s1 : s1_org; + s2 = (s2 > s2_org) ? s2 : s2_org; + + num = s0 * s1 * s2; + byteSize = num * bytesOf(idt); + byteSize = (byteSize > byteSize_org) ? 
byteSize : byteSize_org; + gclmemInputDesc->stride[0] = s0; + gclmemInputDesc->stride[1] = s1; + gclmemInputDesc->stride[2] = s2; + gclmemInputDesc->offset[0] = pw; + gclmemInputDesc->offset[1] = ph; + gclmemInputDesc->offset[2] = 0; + gclmemInputDesc->num = num; + gclmemInputDesc->byteSize = byteSize; + gclmemInputDesc->memType = GCL_MEM_BUF; + gclmemInputDesc->memFormat = DF_NCHW; + gclmemInputDesc->flags = CL_MEM_READ_WRITE; + gclmemInputDesc->host_ptr = NULL; + gclmemInputDesc->need_pad = need_pad | need_pad_org; + } + return SUCCESS; +} + +inline void get_nlp_mkt_val(TensorDesc desc, DataType *dt, U32 *m, U32 *k, U32 *t) +{ + if (dt) { + *dt = desc.dt; + } + if (desc.df == DF_MTK) { + if (m) { + *m = desc.dims[2]; + } + if (t) { + *t = desc.dims[1]; + } + if (k) { + *k = desc.dims[0]; + } + } else if (desc.df == DF_MKT) { + if (m) { + *m = desc.dims[2]; + } + if (k) { + *k = desc.dims[1]; + } + if (t) { + *t = desc.dims[0]; + } + } else { + CHECK_STATUS(NOT_MATCH); + } +} + +inline void map_nlp_mkt_to_ncwhc4(U32 m, U32 k, U32 t, U32 *gw, U32 *gh, U32 *gc) +{ + if (gw) { + *gw = 1; + } + if (gh) { + *gh = t; + } + if (gc) { + *gc = (k + 3) / 4 * m; + } +} + +inline void get_gclmem_dim( + GCLMemDesc desc, U32 *w_str, U32 *h_str, U32 *c_str, U32 *w_off, U32 *h_off) +{ + if (desc.memFormat == DF_NCHW) { + if (w_str) { + *w_str = desc.stride[0]; + } + if (h_str) { + *h_str = desc.stride[1]; + } + if (c_str) { + *c_str = desc.stride[2]; + } + if (w_off) { + *w_off = desc.offset[0]; + } + if (h_off) { + *h_off = desc.offset[1]; + } + } else if (desc.memFormat == DF_NCWHC4) { + if (w_str) { + *w_str = desc.stride[1]; + } + if (h_str) { + *h_str = desc.stride[0]; + } + if (c_str) { + *c_str = desc.stride[2]; + } + if (w_off) { + *w_off = desc.offset[1]; + } + if (h_off) { + *h_off = desc.offset[0]; + } + } else if (desc.memFormat == DF_NHWC) { + if (w_str) { + *w_str = desc.stride[1]; + } + if (h_str) { + *h_str = desc.stride[2]; + } + if (c_str) { + *c_str = desc.stride[0]; + } + if (w_off) { + *w_off = desc.offset[1]; + } + if (h_off) { + *h_off = desc.offset[0]; + } + } else { + CHECK_STATUS(NOT_SUPPORTED); + } +} + +inline EE fill_output_zero(GCLHandle_t handle, GCLMem_t output, TensorDesc outputDesc) +{ + GCLMemDesc outGCLDesc = output->desc; + if (!outGCLDesc.need_pad) { + return SUCCESS; + } + DataType dt; + U32 ow_str, oh_str, oc_str; + get_gclmem_dim(outGCLDesc, &ow_str, &oh_str, &oc_str, NULL, NULL); + char kernelname[128]; + U32 gs = ow_str * oh_str * oc_str; + U32 ls = 0; + U32 dim = 1; + Kernel kernel; + U32 ow, oh; + if (outGCLDesc.memFormat == DF_NCWHC4) { + if (outputDesc.df == DF_NCHW || outputDesc.df == DF_MKT || outputDesc.df == DF_MTK) { + if (outputDesc.df == DF_NCHW) { + tensorSelectGet(outputDesc, &dt, NULL, NULL, NULL, &oh, &ow); + } + if (outputDesc.df == DF_MKT || outputDesc.df == DF_MTK) { + get_nlp_mkt_val(outputDesc, &dt, NULL, NULL, &oh); + ow = 1; + } + if (ow_str != ow || oh_str != oh) { + if (dt == DT_F16) { + sprintf(kernelname, "fill_memory_zero_vec4_f16"); + } else if (dt == DT_I32 || dt == DT_U32) { + sprintf(kernelname, "fill_memory_zero_vec4_i32"); + } else { + return NOT_SUPPORTED; + } + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, gs * 4, 0, gs, output->mem)); + gcl_set_kernelVec(handle, kernel, dim, &gs, &ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs, &ls, kernelname)); +#endif + } + return SUCCESS; + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + } else 
if (outGCLDesc.memFormat == DF_NCHW || outGCLDesc.memFormat == DF_NHWC) { + if (outputDesc.df == DF_NCHW || outputDesc.df == DF_NORMAL || outputDesc.df == DF_NHWC) { + tensorSelectGet(outputDesc, &dt, NULL, NULL, NULL, &oh, &ow); + if (ow_str != ow || oh_str != oh) { + if (dt == DT_F16) { + sprintf(kernelname, "fill_memory_zero_vec4_f16"); + } else if (dt == DT_I32 || dt == DT_U32) { + sprintf(kernelname, "fill_memory_zero_vec4_i32"); + } else { + return NOT_SUPPORTED; + } + U32 len = gs; + gs = (gs + 3) / 4; + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, len, 0, gs, output->mem)); + gcl_set_kernelVec(handle, kernel, dim, &gs, &ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs, &ls, kernelname)); +#endif + } + return SUCCESS; + + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + } + return NOT_SUPPORTED; +} + +inline GCLMemDesc gclmem_build_desc() +{ + GCLMemDesc desc; + for (U32 i = 0; i < 6; i++) { + desc.dims[i] = 0; + } + for (U32 i = 0; i < 3; i++) { + desc.stride[i] = 0; + desc.offset[i] = 0; + } + desc.nDims = 4; + desc.dt = DT_U8; + desc.df = DF_NCHW; + desc.memFormat = DF_NCWHC4; + desc.memType = GCL_MEM_BUF; + desc.byteSize = 0; + desc.num = 0; + desc.flags = CL_MEM_READ_WRITE; + desc.imgFormat.image_channel_order = CL_RGBA; + desc.imgFormat.image_channel_data_type = CL_HALF_FLOAT; + desc.host_ptr = NULL; + desc.need_pad = false; + return desc; +} + +inline EE gclmem_set_desc_padding(GCLMemDesc *desc, + U32 *stride, + U32 *offset, + DataType dt, + DataFormat mf, + GCLMemType mt, + MemFlags flags, + void *host_ptr = NULL) +{ + if (desc == NULL) { + return NULL_POINTER; + } + desc->stride[0] = stride[0]; + desc->stride[1] = stride[1]; + desc->stride[2] = stride[2]; + desc->offset[0] = offset[0]; + desc->offset[1] = offset[1]; + desc->offset[2] = offset[2]; + desc->memFormat = mf; + desc->memType = mt; + desc->flags = flags; + desc->host_ptr = host_ptr; + U32 num = 0; + U32 bytes = 0; + if (mf == DF_NHWC || mf == DF_NCHW || mt != GCL_MEM_BUF) { + num = stride[0] * stride[1] * stride[2]; + } else if (mf == DF_NCWHC4) { + num = stride[0] * stride[1] * stride[2] * 4; + } else { + return NOT_SUPPORTED; + } + bytes = num * bytesOf(dt); + if (mt != GCL_MEM_BUF) { + bytes = bytes * 4; + } + desc->num = num; + desc->byteSize = bytes; + return SUCCESS; +} + +inline EE gclmem_get_desc_non_padding( + GCLMemDesc desc, DataType *dt, DataFormat *df, U32 *num, U32 *numChannels, U32 *height, U32 *width) +{ + U32 ndims = desc.nDims; + if (dt) { + *dt = desc.dt; + } + if (df) { + *df = desc.df; + } + if (desc.df == DF_MKT) { + if (num) { + *num = desc.dims[2]; + } + if (numChannels) { + *numChannels = desc.dims[1]; + } + if (height) { + *height = desc.dims[0]; + } + if (width) { + *width = 1; + } + } else if (desc.df == DF_MTK) { + if (num) { + *num = desc.dims[2]; + } + if (numChannels) { + *numChannels = desc.dims[0]; + } + if (height) { + *height = desc.dims[1]; + } + if (width) { + *width = 1; + } + } else { + if (width) { + *width = desc.dims[0]; + } + if (height) { + *height = (ndims > 1) ? desc.dims[1] : 1; + } + if (numChannels) { + *numChannels = (ndims > 2) ? desc.dims[2] : 1; + } + if (num) { + *num = (ndims > 3) ? desc.dims[3] : 1; + } + } + return SUCCESS; +} + +#endif diff --git a/common/gcl/include/kernel.h b/common/gcl/include/kernel.h new file mode 100644 index 00000000..bcc5aa63 --- /dev/null +++ b/common/gcl/include/kernel.h @@ -0,0 +1,167 @@ +// Copyright (C) 2019. 
Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef KERNEL_H_ +#define KERNEL_H_ +#include "types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief get information of kernel + * @warning please free memory associated with value + **/ +inline EE get_kernel_info(Kernel kernel, cl_kernel_info info, void **value, size_t *size) +{ + if (NULL == value) { + return NULL_POINTER; + } + + size_t len; + cl_int ret = clGetKernelInfo(kernel, info, 0, NULL, &len); + if (CL_SUCCESS == ret) { + if (NULL != size) { + *size = len; + } + void *data = malloc(len); + if (NULL == data) { + return ALLOC_FAILED; + } + ret = clGetKernelInfo(kernel, info, len, data, NULL); + if (CL_SUCCESS == ret) { + *value = data; + } else { + free(data); + } + } + + map_cl_error_2_ee(ret); +} + +/** + * @brief get workgroup information of kernel + * @warning please free memory associated with value + **/ +inline EE get_kernel_workgroup_info( + Kernel kernel, Device device, cl_kernel_work_group_info info, void **value, size_t *size) +{ + size_t len; + cl_int ret = clGetKernelWorkGroupInfo(kernel, device, info, 0, NULL, &len); + if (CL_SUCCESS == ret) { + if (NULL != size) { + *size = len; + } + void *data = malloc(len); + if (NULL == data) { + return ALLOC_FAILED; + } + // the first call only queried the size; fill the buffer before returning it + ret = clGetKernelWorkGroupInfo(kernel, device, info, len, data, NULL); + if (CL_SUCCESS == ret) { + *value = data; + } else { + free(data); + } + } + + map_cl_error_2_ee(ret); +} + +inline EE create_kernels_in_program(Program program, U32 num_kernel, Kernel *kernels) +{ + if (kernels == nullptr) { + return NULL_POINTER; + } + I32 ret = clCreateKernelsInProgram(program, num_kernel, kernels, NULL); + map_cl_error_2_ee(ret); +} + +inline EE create_kernel(Program program, CI8 *name, Kernel *kernel) +{ + if (kernel == nullptr) { + return NULL_POINTER; + } + I32 ret; + *kernel = clCreateKernel(program, name, &ret); + map_cl_error_2_ee(ret); +} + +inline EE retain_kernel(Kernel kernel) +{ + cl_int ret = clRetainKernel(kernel); + map_cl_error_2_ee(ret); +} + +inline EE release_kernel(Kernel kernel) +{ + cl_int ret = clReleaseKernel(kernel); + map_cl_error_2_ee(ret); +} + +inline EE set_kernel_arg(Kernel kernel, U32 arg_index, U32 arg_size, const void *arg_value) +{ + cl_int ret = clSetKernelArg(kernel, arg_index, arg_size, arg_value); + map_cl_error_2_ee(ret); +} +/* + inline EE clone_kernel(Kernel src_kernel, Kernel* dst_kernel) { + // TODO + I32 ret; + dst_kernel = clCloneKernel(src_kernel, &ret); + map_cl_error_2_ee(ret); + } + */ +inline EE enqueue_ndrange_kernel(CommandQueue queue, + Kernel kernel, + U32 work_dim, + CU32 
*global_work_offset, + CU32 *global_work_size, + CU32 *local_work_size, + U32 num_events_in_wait_list, + const Event *event_in_wait_list, + Event *event) +{ + I32 ret; + UNUSED(global_work_offset); + UNUSED(local_work_size); + switch (work_dim) { + case 1: { + size_t gs = global_work_size[0]; + size_t ls = local_work_size[0]; + size_t *ls_ptr = (ls == 0) ? NULL : &ls; + ret = clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, &gs, ls_ptr, + num_events_in_wait_list, event_in_wait_list, event); + break; + } + case 2: { + size_t gs[2] = {global_work_size[0], global_work_size[1]}; + size_t ls[2] = {local_work_size[0], local_work_size[1]}; + size_t *ls_ptr = (ls[0] == 0 || ls[1] == 0) ? NULL : ls; + ret = clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, gs, ls_ptr, + num_events_in_wait_list, event_in_wait_list, event); + break; + } + case 3: { + size_t gs[3] = {global_work_size[0], global_work_size[1], global_work_size[2]}; + size_t ls[3] = {local_work_size[0], local_work_size[1], local_work_size[2]}; + size_t *ls_ptr = (ls[0] == 0 || ls[1] == 0 || ls[2] == 0) ? NULL : ls; + ret = clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, gs, ls_ptr, + num_events_in_wait_list, event_in_wait_list, event); + break; + } + default: + return NOT_SUPPORTED; + } + map_cl_error_2_ee(ret); +} + +#ifdef __cplusplus +} +#endif +#endif diff --git a/common/gcl/include/memory.h b/common/gcl/include/memory.h new file mode 100644 index 00000000..7f9b5e02 --- /dev/null +++ b/common/gcl/include/memory.h @@ -0,0 +1,661 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
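The enqueue_ndrange_kernel wrapper above treats a local work size of 0 as "let the runtime choose": it passes NULL to clEnqueueNDRangeKernel in that case. A minimal 1-D launch sketch under that convention; kernel, queue, and buf are assumed to be a valid Kernel, CommandQueue, and cl_mem created with the helpers in this patch.

// Hypothetical 1-D launch: 1024 work-items, driver-chosen local size.
U32 gs = 1024;  // global work size
U32 ls = 0;     // 0 => wrapper passes NULL and the runtime picks the local size
CHECK_STATUS(set_kernel_arg(kernel, 0, sizeof(cl_mem), &buf));
CHECK_STATUS(enqueue_ndrange_kernel(queue, kernel, 1, NULL, &gs, &ls, 0, NULL, NULL));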
+ +#ifndef _H_BUFFER +#define _H_BUFFER + +#include "event.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief get memory information + * + **/ +inline EE get_mememory_info(Mem mem, cl_mem_info info, void **value, U32 *len) +{ + if (NULL == value) { + return NULL_POINTER; + } + + size_t size; + I32 ret = clGetMemObjectInfo(mem, info, 0, NULL, &size); + if (CL_SUCCESS == ret) { + if (NULL != len) { + *len = size; + } + void *data = malloc(size); + if (NULL == data) { + return NULL_POINTER; + } + ret = clGetMemObjectInfo(mem, info, size, data, NULL); + if (CL_SUCCESS == ret) { + *value = data; + } + } + + map_cl_error_2_ee(ret); +} + +#if defined(CL_VERSION_1_2) + +inline EE create_image1D(Context context, + cl_mem_flags flags, + const cl_image_format *format, + U32 len, + U32 pitch, + void *host_ptr, + Mem *image) +{ + cl_image_desc image_desc; + image_desc.image_type = CL_MEM_OBJECT_IMAGE1D; + image_desc.image_width = len; + image_desc.image_height = 1; + image_desc.image_depth = 1; + image_desc.image_array_size = 1; + image_desc.image_row_pitch = pitch; + image_desc.image_slice_pitch = 0; + image_desc.num_mip_levels = 0; + image_desc.num_samples = 0; + image_desc.buffer = NULL; + + I32 ret; + Mem temp = clCreateImage(context, flags, format, &image_desc, host_ptr, &ret); + *image = temp; + map_cl_error_2_ee(ret); +} + +/** + * @brief create 1d image buffer + * + **/ +inline EE create_image1D_buffer(Context context, + cl_mem_flags flags, + const cl_image_format *format, + U32 len, + const cl_mem buffer, + Mem *image) +{ + cl_image_desc image_desc; + image_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + image_desc.image_width = len; + image_desc.image_height = 1; + image_desc.image_depth = 1; + image_desc.image_array_size = 1; + image_desc.image_row_pitch = len; + image_desc.image_slice_pitch = len; + image_desc.num_mip_levels = 0; + image_desc.num_samples = 0; + image_desc.buffer = buffer; + + I32 ret; + Mem temp = clCreateImage(context, flags, format, &image_desc, NULL, &ret); + if (CL_SUCCESS == ret) { + *image = temp; + } + map_cl_error_2_ee(ret); +} +#endif + +/** + * @brief create 2d image object + * + **/ +inline EE create_image2D(Context cont, + cl_mem_flags flags, + cl_image_format *format, + U32 width, + U32 height, + U32 pitch, + void *host_ptr, + Mem *mem) +{ + I32 ret; +#if defined(CL_VERSION_1_2) + cl_image_desc image_desc; + image_desc.image_type = CL_MEM_OBJECT_IMAGE2D; + image_desc.image_width = width; + image_desc.image_height = height; + image_desc.image_depth = 1; + image_desc.image_array_size = 1; + image_desc.image_row_pitch = pitch; + image_desc.image_slice_pitch = 0; + image_desc.num_mip_levels = 0; + image_desc.num_samples = 0; + image_desc.buffer = NULL; + + Mem temp = clCreateImage(cont, flags, format, &image_desc, host_ptr, &ret); +#else + Mem temp = clCreateImage2D(cont, flags, format, width, height, pitch, host_ptr, &ret); +#endif + if (CL_SUCCESS == ret) { + *mem = temp; + } + + map_cl_error_2_ee(ret); +} + +#if defined(CL_VERSION_1_2) +/** + * @brief create 2d image buffer object + * + **/ +inline EE create_image2D_array(Context cont, + cl_mem_flags flags, + cl_image_format *format, + U32 width, + U32 height, + U32 pitch, + U32 arraySize, + void *host_ptr, + Mem *mem) +{ + I32 ret; + cl_image_desc image_desc; + image_desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY; + image_desc.image_width = width; + image_desc.image_height = height; + image_desc.image_depth = 1; + image_desc.image_array_size = arraySize; + image_desc.image_row_pitch = pitch; + 
image_desc.image_slice_pitch = 0; + image_desc.num_mip_levels = 0; + image_desc.num_samples = 0; + image_desc.buffer = NULL; + + *mem = clCreateImage(cont, flags, format, &image_desc, host_ptr, &ret); + map_cl_error_2_ee(ret); +} +#endif + +/** + * @brief create 3d image object + * + **/ +inline EE create_image3D(Context cont, + cl_mem_flags flags, + cl_image_format *format, + U32 width, + U32 height, + U32 depth, + U32 rowPitch, + U32 slicePitch, + void *host_ptr, + Mem *mem) +{ + I32 ret; +#if defined(CL_VERSION_1_2) + cl_image_desc image_desc; + image_desc.image_type = CL_MEM_OBJECT_IMAGE3D; + image_desc.image_width = width; + image_desc.image_height = height; + image_desc.image_depth = depth; + image_desc.image_array_size = 1; + image_desc.image_row_pitch = rowPitch; + image_desc.image_slice_pitch = slicePitch; + image_desc.num_mip_levels = 0; + image_desc.num_samples = 0; + image_desc.buffer = NULL; + + Mem temp = clCreateImage(cont, flags, format, &image_desc, host_ptr, &ret); +#else + Mem temp = clCreateImage3D( + cont, flags, format, width, height, depth, rowPitch, slicePitch, host_ptr, &ret); +#endif + if (CL_SUCCESS == ret) { + *mem = temp; + } + + map_cl_error_2_ee(ret); +} + +/** + * @brief get image information + * + **/ +inline EE get_image_info(Mem mem, cl_mem_info info, void **value, U32 *len) +{ + size_t size; + I32 ret = clGetImageInfo(mem, info, 0, NULL, &size); + if (CL_SUCCESS == ret) { + if (NULL != len) { + *len = size; + } + + void *data = malloc(size); + if (NULL == data) { + return NULL_POINTER; + } + ret = clGetImageInfo(mem, info, size, data, NULL); + if (CL_SUCCESS == ret) { + *value = data; + } + } + + map_cl_error_2_ee(ret); +} + +/** + * @brief get supported image format + * + * @warning please free memory associated with format + **/ +inline EE get_supported_image_formats( + Context cont, cl_mem_flags flags, cl_mem_object_type type, cl_image_format **format, U32 *num) +{ + if (NULL == format) { + return NULL_POINTER; + } + + U32 len; + I32 ret = clGetSupportedImageFormats(cont, flags, type, 0, NULL, &len); + if (CL_SUCCESS == ret) { + if (NULL != num) { + *num = len; + } + // len counts formats, so allocate len * sizeof(cl_image_format) bytes + cl_image_format *data = (cl_image_format *)malloc(len * sizeof(cl_image_format)); + if (NULL == data) { + return NULL_POINTER; + } + ret = clGetSupportedImageFormats(cont, flags, type, len, data, NULL); + if (CL_SUCCESS == ret) { + *format = data; + } + } + + map_cl_error_2_ee(ret); +} + +inline EE retain_memory(Mem mem) +{ + I32 ret = clRetainMemObject(mem); + map_cl_error_2_ee(ret); +} + +inline EE release_memory(Mem mem) +{ + I32 ret = clReleaseMemObject(mem); + map_cl_error_2_ee(ret); +} + +inline EE enqueue_unmap_memory(CommandQueue queue, + Mem mem, + void *mapped_ptr, + I32 num_wait_events, + const Event *wait_events, + Event *event) +{ + I32 ret = clEnqueueUnmapMemObject(queue, mem, mapped_ptr, num_wait_events, wait_events, event); + + map_cl_error_2_ee(ret); +} + +inline EE create_buffer(Context context, cl_mem_flags flags, U32 size, void *host_ptr, Mem *buffer) +{ + I32 ret; + size_t len = size; + *buffer = clCreateBuffer(context, flags, len, host_ptr, &ret); + map_cl_error_2_ee(ret); +} + +inline EE create_sub_buffer(Mem buffer, cl_mem_flags flags, U32 offset, U32 size, Mem *sub) +{ + I32 ret; + cl_buffer_region region = {offset, size}; + *sub = clCreateSubBuffer(buffer, flags, CL_BUFFER_CREATE_TYPE_REGION, &region, &ret); + map_cl_error_2_ee(ret); +} + +inline EE enqueue_read_buffer(CommandQueue queue, + Mem buffer, + cl_bool blocking, + U32 offset, + U32 size, + void *ptr, + U32 num_wait_events, + const Event 
*wait_events, + Event *event) +{ + I32 ret = clEnqueueReadBuffer( + queue, buffer, blocking, offset, size, ptr, num_wait_events, wait_events, event); + map_cl_error_2_ee(ret); +} + +/* + inline EE enqueue_read_buffer_rect(CommandQueue queue, Mem buffer, cl_bool blocking, + const U32 *buffer_origin, const U32 *host_origin, const U32 *region, + U32 buffer_row_pitch, U32 buffer_slice_pitch, U32 host_row_pitch, + U32 host_slice_pitch, void *ptr, U32 num_wait_events, + const Event *wait_events, Event *event) { + + I32 ret = clEnqueueReadBufferRect(queue, buffer, blocking, + buffer_origin, host_origin, region, + buffer_row_pitch, buffer_slice_pitch, host_row_pitch, + host_slice_pitch, ptr, num_wait_events, wait_events, event); + map_cl_error_2_ee(ret); + } + */ +inline EE enqueue_write_buffer(CommandQueue queue, + Mem buffer, + cl_bool blocking, + U32 offset, + U32 size, + const void *ptr, + U32 num_wait_events, + const Event *wait_events, + Event *event) +{ + I32 ret = clEnqueueWriteBuffer( + queue, buffer, blocking, offset, size, ptr, num_wait_events, wait_events, event); + map_cl_error_2_ee(ret); +} + +inline EE enqueue_fill_buffer(CommandQueue queue, + Mem buffer, + const void *pattern, + U32 pattern_size, + U32 offset, + U32 size, + U32 num_wait_events, + const Event *wait_events, + Event *event) +{ + size_t pat_size = pattern_size; + size_t off = offset; + size_t si = size; + I32 ret = clEnqueueFillBuffer( + queue, buffer, pattern, pat_size, off, si, num_wait_events, wait_events, event); + map_cl_error_2_ee(ret); +} + +inline EE enqueue_write_buffer_rect(CommandQueue queue, + Mem buffer, + cl_bool blocking_write, + const U32 *buffer_origin, + const U32 *host_origin, + const U32 *region, + U32 buffer_row_pitch, + U32 buffer_slice_pitch, + U32 host_row_pitch, + U32 host_slice_pitch, + const void *ptr, + U32 num_wait_events, + const Event *wait_events, + Event *event) +{ + size_t b_ori[3]; + size_t h_ori[3]; + size_t reg[3]; + size_t b_rp = buffer_row_pitch; + size_t b_sp = buffer_slice_pitch; + size_t h_rp = host_row_pitch; + size_t h_sp = host_slice_pitch; + for (U32 i = 0; i < 3; i++) { + b_ori[i] = buffer_origin[i]; + h_ori[i] = host_origin[i]; + reg[i] = region[i]; + } + I32 ret = clEnqueueWriteBufferRect(queue, buffer, blocking_write, b_ori, h_ori, reg, b_rp, b_sp, + h_rp, h_sp, ptr, num_wait_events, wait_events, event); + map_cl_error_2_ee(ret); +} + +inline EE enqueue_copy_buffer(CommandQueue queue, + Mem src_buffer, + Mem dst_buffer, + U32 src_offset, + U32 dst_offset, + U32 size, + U32 num_wait_events, + const Event *wait_events, + Event *event) +{ + I32 ret = clEnqueueCopyBuffer(queue, src_buffer, dst_buffer, src_offset, dst_offset, size, + num_wait_events, wait_events, event); + map_cl_error_2_ee(ret); +} + +/* + EE enqueue_copy_buffer_rect(CommandQueue queue, Mem src_buffer, Mem dst_buffer, + const U32 *src_origin, const U32 *dst_origin, const U32 *region, + U32 src_row_pitch, U32 src_slice_pitch, U32 dst_row_pitch, + U32 dst_slice_pitch, U32 num_wait_events, + const Event *wait_events, Event *event) { + I32 ret = clEnqueueCopyBufferRect(queue, src_buffer, dst_buffer, + const size_t *src_origin, const size_t *dst_origin, const size_t *region, + src_row_pitch, src_slice_pitch, dst_row_pitch, + dst_slice_pitch, num_wait_events, wait_events, event); + map_cl_error_2_ee(ret); + } + */ + +inline EE enqueue_map_buffer(CommandQueue queue, + Mem buffer, + cl_bool blocking_map, + cl_map_flags map_flags, + U32 offset, + U32 size, + U32 num_wait_events, + const Event *wait_events, + Event 
*event, + void **ptr) +{ + I32 ret; + *ptr = clEnqueueMapBuffer(queue, buffer, blocking_map, map_flags, offset, size, num_wait_events, + wait_events, event, &ret); + map_cl_error_2_ee(ret); +} + +inline EE create_image(Context context, + cl_mem_flags flags, + const cl_image_format *image_format, + const cl_image_desc *image_desc, + void *host_ptr, + Mem *mem) +{ + I32 ret; + *mem = clCreateImage(context, flags, image_format, image_desc, host_ptr, &ret); + map_cl_error_2_ee(ret); +} + +inline EE enqueue_read_image(CommandQueue queue, + Mem image, + cl_bool blocking_read, + const U32 *origin, + const U32 *region, + U32 row_pitch, + U32 slice_pitch, + void *ptr, + U32 num_wait_events, + const Event *wait_events, + Event *event) +{ + size_t org[3]; + size_t reg[3]; + for (U32 i = 0; i < 3; ++i) { + org[i] = (size_t)origin[i]; + reg[i] = (size_t)region[i]; + } + I32 ret = clEnqueueReadImage(queue, image, blocking_read, org, reg, row_pitch, slice_pitch, ptr, + num_wait_events, wait_events, event); + map_cl_error_2_ee(ret); +} + +inline EE enqueue_write_image(CommandQueue queue, + Mem image, + cl_bool blocking_write, + const U32 *origin, + const U32 *region, + U32 input_row_pitch, + U32 input_slice_pitch, + const void *ptr, + U32 num_wait_events, + const Event *wait_events, + Event *event) +{ + size_t org[3]; + size_t reg[3]; + for (U32 i = 0; i < 3; ++i) { + org[i] = (size_t)origin[i]; + reg[i] = (size_t)region[i]; + } + I32 ret = clEnqueueWriteImage(queue, image, blocking_write, org, reg, input_row_pitch, + input_slice_pitch, ptr, num_wait_events, wait_events, event); + map_cl_error_2_ee(ret); +} + +inline EE enqueue_fill_image(CommandQueue queue, + Mem image, + const void *fill_color, + const U32 *origin, + const U32 *region, + U32 num_wait_events, + const Event *wait_events, + Event *event) +{ + size_t org[3]; + size_t reg[3]; + for (U32 i = 0; i < 3; ++i) { + org[i] = (size_t)origin[i]; + reg[i] = (size_t)region[i]; + } + I32 ret = + clEnqueueFillImage(queue, image, fill_color, org, reg, num_wait_events, wait_events, event); + map_cl_error_2_ee(ret); +} + +inline EE enqueue_copy_image_to_buffer(CommandQueue queue, + Mem src_image, + Mem dst_buffer, + const U32 *src_origin, + const U32 *region, + U32 dst_offset, + U32 num_wait_events, + const cl_event *wait_events, + cl_event *event) +{ + size_t org[3]; + size_t reg[3]; + for (U32 i = 0; i < 3; ++i) { + org[i] = (size_t)src_origin[i]; + reg[i] = (size_t)region[i]; + } + I32 ret = clEnqueueCopyImageToBuffer( + queue, src_image, dst_buffer, org, reg, dst_offset, num_wait_events, wait_events, event); + map_cl_error_2_ee(ret); +} + +inline EE enqueue_copy_buffer_to_image(CommandQueue queue, + Mem src_buffer, + Mem dst_image, + U32 src_offset, + const U32 *dst_origin, + const U32 *region, + U32 num_wait_events, + const cl_event *wait_events, + cl_event *event) +{ + size_t org[3]; + size_t reg[3]; + for (U32 i = 0; i < 3; ++i) { + org[i] = (size_t)dst_origin[i]; + reg[i] = (size_t)region[i]; + } + I32 ret = clEnqueueCopyBufferToImage( + queue, src_buffer, dst_image, src_offset, org, reg, num_wait_events, wait_events, event); + map_cl_error_2_ee(ret); +} +/* + + EE enqueue_copy_image(CommandQueue queue, Mem src_image, Mem dst_image, + const U32 *src_origin, const U32 *dst_origin, const U32 *region, + U32 num_wait_events, const cl_event *wait_events, cl_event *event) { + I32 ret = clEnqueueCopyImage(queue, src_image, dst_image, + const size_t *src_origin, const size_t *dst_origin, const size_t *region, + num_wait_events, wait_events, event); + 
map_cl_error_2_ee(ret); + } + + + + EE enqueue_map_image(CommandQueue queue, Mem image, cl_bool blocking_map, + cl_map_flags map_flags, const U32 *origin, const U32 *region, + U32 *image_row_pitch, U32 *image_slice_pitch, U32 num_wait_events, + const cl_event *wait_events, cl_event *event, void* *ptr) { + I32 ret; + * ptr = clEnqueueMapImage(queue, image, blocking_map, + map_flags, const size_t *origin, const size_t *region, + size_t *image_row_pitch, size_t *image_slice_pitch, + num_wait_events, wait_events, event, &ret); + map_cl_error_2_ee(ret); + } + */ + +inline EE create_sampler(Context context, const cl_sampler_properties *properties, Sampler *s) +{ + I32 ret; + *s = clCreateSamplerWithProperties(context, properties, &ret); + map_cl_error_2_ee(ret); +} + +inline EE retain_sampler(Sampler s) +{ + I32 ret = clRetainSampler(s); + map_cl_error_2_ee(ret); +} + +inline EE release_sampler(Sampler s) +{ + I32 ret = clReleaseSampler(s); + map_cl_error_2_ee(ret); +} + +inline EE get_sampler_info(Sampler s, cl_sampler_info info, void **value, size_t *len) +{ + if (NULL == value) { + return NULL_POINTER; + } + + size_t size; + I32 ret = clGetSamplerInfo(s, info, 0, NULL, &size); + if (CL_SUCCESS == ret) { + if (NULL != len) { + *len = size; + } + void *data = malloc(size); + if (NULL == data) { + return NULL_POINTER; + } + ret = clGetSamplerInfo(s, info, size, data, NULL); + if (CL_SUCCESS == ret) { + *value = data; + } + } + + map_cl_error_2_ee(ret); +} + +inline EE get_memory_size(Mem memory, U32 *size) +{ + size_t len; + int ret = clGetMemObjectInfo(memory, CL_MEM_SIZE, sizeof(len), &len, NULL); + *size = len; + map_cl_error_2_ee(ret); +} +#ifdef __cplusplus +} +#endif + +#endif diff --git a/common/gcl/include/ocl_context.h b/common/gcl/include/ocl_context.h new file mode 100644 index 00000000..8cdd38a8 --- /dev/null +++ b/common/gcl/include/ocl_context.h @@ -0,0 +1,36 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
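Every info getter in memory.h above follows the same two-call OpenCL pattern: the first call queries only the result size, the helper mallocs that many bytes, the second call fills them, and ownership passes to the caller. A short usage sketch; buf is assumed to be a valid Mem, and get_mememory_info keeps its upstream spelling.

U32 bytes = 0;
CHECK_STATUS(get_memory_size(buf, &bytes));  // CL_MEM_SIZE shortcut
void *value = NULL;
U32 len = 0;
if (SUCCESS == get_mememory_info(buf, CL_MEM_FLAGS, &value, &len)) {
    cl_mem_flags flags = *(cl_mem_flags *)value;
    printf("buffer: %u bytes, flags %llu\n", bytes, (unsigned long long)flags);
    free(value);  // allocated inside the getter; the caller must free it
}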
+ +#ifndef H_OCL_CONTEXT +#define H_OCL_CONTEXT + +#include "gcl_common.h" + +class OCLContext { +public: + static OCLContext &getInstance(); + +protected: + OCLContext(); + ~OCLContext(); + +private: + void setDeviceName(); + void registerBinaryKernelMap(); + void registerSourceKernelMap(); + void registerSourceKernelsExt(); + +public: + std::shared_ptr handle; +}; +#endif diff --git a/common/gcl/include/ocl_data_alloc.h b/common/gcl/include/ocl_data_alloc.h new file mode 100644 index 00000000..6052defb --- /dev/null +++ b/common/gcl/include/ocl_data_alloc.h @@ -0,0 +1,35 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +#ifndef _OCL_DATA_ALLOC +#define _OCL_DATA_ALLOC + +#include "gcl_common.h" +#include "gcl_func.h" +#include "ocl_context.h" + +inline GCLMem_t ocl_alloc_gclmem(GCLMemDesc desc) +{ + GCLMem_t gclmem = gcl_create_gclmem(); + gclmem->desc = desc; + CHECK_STATUS(gcl_create_memory(OCLContext::getInstance().handle.get(), gclmem)); + return gclmem; +} + +inline void ocl_release_gclmem(GCLMem_t mem) +{ + CHECK_STATUS(gcl_unmap_memory(OCLContext::getInstance().handle.get(), mem)); + CHECK_STATUS(gcl_release_subMem(mem)); + CHECK_STATUS(gcl_release_memory(mem)); + delete mem; +} +#endif diff --git a/common/gcl/include/ocl_data_trans.h b/common/gcl/include/ocl_data_trans.h new file mode 100644 index 00000000..b6dd08b2 --- /dev/null +++ b/common/gcl/include/ocl_data_trans.h @@ -0,0 +1,33 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +#ifndef _OCL_DATA_TRANS +#define _OCL_DATA_TRANS + +#include "types.h" +#include "tensor_desc.h" +#include "gcl_common.h" + +EE ocl_set_input(GCLHandle_t handle, + GCLMem_t input, + TensorDesc hostDesc, + const U8 *hostPtr, + GCLMem_t tmpBuf, + bool blocking); + +EE ocl_get_output(GCLHandle_t handle, const GCLMem_t input, TensorDesc hostDesc, bool blocking); + +EE ocl_trans_mem( + GCLHandle_t handle, GCLMem_t src, GCLMemDesc srcDesc, GCLMem_t dst, GCLMemDesc dstDesc); + +EE ocl_map_mem(GCLHandle_t handle, GCLMem_t gclMem, GCLMemDesc desc); +#endif diff --git a/common/gcl/include/ocl_desc_trans.h b/common/gcl/include/ocl_desc_trans.h new file mode 100644 index 00000000..2ff33ee2 --- /dev/null +++ b/common/gcl/include/ocl_desc_trans.h @@ -0,0 +1,31 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +#ifndef _OCL_DESC_TRANS +#define _OCL_DESC_TRANS + +#include "tensor.hpp" +#include "memory_ocl.hpp" +#include "gcl_common.h" + +inline void ocl_set_desc(Tensor *tensor, GCLMemDesc desc) +{ + OclMemory *mem = (OclMemory *)tensor->get_memory(); + mem->padding(desc); +}; + +inline GCLMemDesc ocl_get_desc(Tensor tensor) +{ + OclMemory *mem = (OclMemory *)tensor.get_memory(); + return mem->get_desc(); +} +#endif diff --git a/common/gcl/include/platform.h b/common/gcl/include/platform.h new file mode 100644 index 00000000..1d33cd35 --- /dev/null +++ b/common/gcl/include/platform.h @@ -0,0 +1,500 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_PLATFORM +#define _H_PLATFORM + +// for strlen / tolower / std::string used below +#include <ctype.h> +#include <string.h> +#include <string> + +#if defined(__cplusplus) +extern "C" { +#endif + +typedef enum { + VENDOR_ARM = 0, +} PlatformVendor; + +inline EE get_platforms(U32 *numPlatforms, Platform **platforms) +{ + if (NULL == platforms || NULL == numPlatforms) { + return NULL_POINTER; + } + U32 num; + I32 ret = clGetPlatformIDs(0, NULL, &num); + if (SUCCESS == ret) { + *numPlatforms = num; + Platform *p = (Platform *)malloc(num * sizeof(Platform)); + if (NULL == p) { + return ALLOC_FAILED; + } + + ret = clGetPlatformIDs(num, p, NULL); + if (SUCCESS != ret) { + free(p); + } else { + *platforms = p; + } + } + + map_cl_error_2_ee(ret); +} + +static cl_bool stringContains(char *big, const char *s) +{ + for (unsigned int i = 0; i < strlen(big); i++) { + big[i] = tolower(big[i]); + } + std::string str(big); + return std::string::npos != str.find(s); +} + +/** + * @brief get information from platform + * + * @param value value associated with info, memory is allocated by this + * function + * @param len the length of value, returned by this function + * + **/ + +inline EE get_platform_info(Platform platform, cl_platform_info info, void **value, U32 *len) +{ + if (NULL == len || NULL == value) { + return NULL_POINTER; + } + size_t sizeRet; + I32 ret = clGetPlatformInfo(platform, info, 0, NULL, &sizeRet); + if (CL_SUCCESS == ret) { + if (len) { + *len = (U32)sizeRet; + } + void *data = malloc(sizeRet + 1); + if (NULL == data) { + return ALLOC_FAILED; + } + + ret = clGetPlatformInfo(platform, info, sizeRet + 1, data, NULL); + if (CL_SUCCESS != ret) { + free(data); + } else { + *value = data; + } + } + + map_cl_error_2_ee(ret); +} + +/** + * @brief select platform by platform vendor + * + * @param vendor the vendor of the platform we want + * @param platform output, the selected platform + * + **/ +inline EE select_platform(PlatformVendor vendor, Platform *platform) +{ + if (NULL == platform) { + return NULL_POINTER; + } + + const static char *key[] = {"arm", "qualcomm"}; + U32 num_platforms; + Platform *platforms = NULL; + EE ret = get_platforms(&num_platforms, &platforms); + if (SUCCESS == ret) { + const char *platform_vendor = key[vendor]; + for (U32 i = 0; i < num_platforms; i++) { + Platform p = platforms[i]; + U32 nameLen; + char *name; + ret = get_platform_info(p, CL_PLATFORM_NAME, (void **)&name, &nameLen); + if (SUCCESS == ret) { + if (stringContains(name, platform_vendor)) { + *platform = p; + } + free(name); + } + } + } + free(platforms); + + map_cl_error_2_ee(ret); +} + +#define CHAR_PLATFORM_INFO(info, str) \ + { \ + EE ret = get_platform_info(p, info, &value, &len); \ + if (SUCCESS == ret) { \ + char *tmp = (char *)value; \ + tmp[len] = '\0'; \ + printf(str ": %s\n", tmp); \ + free(value); \ + } else { \ + map_cl_error_2_ee(ret); \ + } \ + } + +/** + * @brief list information about platform + * + */ +inline EE list_platform_info(Platform p) +{ + void *value; + U32 len; + + 
CHAR_PLATFORM_INFO(CL_PLATFORM_PROFILE, "\t Profile"); + CHAR_PLATFORM_INFO(CL_PLATFORM_VERSION, "\t Version "); + CHAR_PLATFORM_INFO(CL_PLATFORM_NAME, "\t Name "); + CHAR_PLATFORM_INFO(CL_PLATFORM_VENDOR, "\t Vendor "); + CHAR_PLATFORM_INFO(CL_PLATFORM_EXTENSIONS, "\t Extensions "); + + return SUCCESS; +} + +/** + * @brief get devices in platform, and allocate space for storing devices + * @warning please free space of devices allocated in this function + * + * @param p input, specify platform, device will be retrived from this platform + * @param type input, specify device type + * @param num_devices output, return device number with type in platform p + * @param devices output, return devices + * + * @return + * 0 means sucess + * -1 means fail + * + */ +inline EE platform_get_devices( + Platform platform, cl_device_type type, U32 *num_devices, Device **devices) +{ + if (NULL == devices || NULL == num_devices) { + return NULL_POINTER; + } + + U32 num; + I32 ret = clGetDeviceIDs(platform, type, 0, NULL, &num); + if (CL_SUCCESS == ret) { + *num_devices = num; + + Device *did = (Device *)malloc(num * sizeof(Device)); + if (NULL == did) { + return ALLOC_FAILED; + } + + ret = clGetDeviceIDs(platform, type, num, did, NULL); + if (CL_SUCCESS != ret) { + free(did); + } else { + *devices = did; + } + } + map_cl_error_2_ee(ret); +} + +inline EE create_sub_device( + Device device, const cl_device_partition_property *properties, U32 *num_devices, Device **devices) +{ + U32 len; + I32 ret = clCreateSubDevices(device, properties, 0, NULL, &len); + if (CL_SUCCESS == ret) { + if (NULL != num_devices) { + *num_devices = len; + } + Device *d = (Device *)malloc(sizeof(Device) * len); + if (NULL == d) { + return ALLOC_FAILED; + } + ret = clCreateSubDevices(device, properties, len, d, NULL); + if (CL_SUCCESS == ret) { + *devices = d; + } else { + free(d); + } + } + map_cl_error_2_ee(ret); +} + +inline EE retain_device(Device device) +{ + I32 ret = clRetainDevice(device); + map_cl_error_2_ee(ret); +} + +inline EE release_device(Device device) +{ + I32 ret = clReleaseDevice(device); + map_cl_error_2_ee(ret); +} + +/** + * + *@brief get device information + * + * @warning please free memory space allocated for value + * + **/ + +inline EE get_device_info(Device device, cl_device_info info, void **value, U32 *len) +{ + if (NULL == value) { + return NULL_POINTER; + } + + size_t size; + I32 ret = clGetDeviceInfo(device, info, 0, NULL, &size); + if (CL_SUCCESS == ret) { + if (NULL != len) { + *len = (U32)(size); + } + void *data = malloc(size); + if (NULL == data) { + return ALLOC_FAILED; + } + ret = clGetDeviceInfo(device, info, size, data, NULL); + if (CL_SUCCESS != ret) { + free(data); + } else { + *value = data; + } + } + + map_cl_error_2_ee(ret); +} + +#define V_Q_Info(device, info, type, str, modifier) \ + { \ + type v; \ + I32 ret = clGetDeviceInfo(device, info, sizeof(type), &v, NULL); \ + if (CL_SUCCESS != ret) { \ + map_cl_error_2_ee(ret); \ + } \ + \ + printf(str "%" modifier "\n", v); \ + } + +#define B_Q_Info(device, info, str) \ + { \ + cl_bool v; \ + I32 ret = clGetDeviceInfo(device, info, sizeof(cl_bool), &v, NULL); \ + if (CL_SUCCESS != ret) { \ + map_cl_error_2_ee(ret); \ + } \ + \ + printf(str "%s\n", v ? 
"Yes" : "NO"); \ + } + +#define STR_Q_Info(device, info, str) \ + { \ + size_t len; \ + I32 ret = clGetDeviceInfo(device, info, 0, NULL, &len); \ + if (SUCCESS != ret) { \ + map_cl_error_2_ee(ret); \ + } \ + \ + char *v = (char *)malloc(len + 1); \ + ret = clGetDeviceInfo(device, info, len, v, NULL); \ + if (SUCCESS != ret) { \ + map_cl_error_2_ee(ret); \ + } \ + \ + v[len] = '\0'; \ + printf(str "%s\n", v); \ + free(v); \ + } + +/** + * @brief list all attributes of device + * + * @param device input + * + * @return + * 0 : success + * -1: error + */ +inline EE list_device_info(Device device) +{ + printf("..........Device Info..............\n"); + STR_Q_Info(device, CL_DEVICE_NAME, "Device name : "); + V_Q_Info(device, CL_DEVICE_ADDRESS_BITS, U32, "Address Bits : ", "u"); + B_Q_Info(device, CL_DEVICE_AVAILABLE, "Device Available : "); + B_Q_Info(device, CL_DEVICE_COMPILER_AVAILABLE, "Device Compiler Available : "); + B_Q_Info(device, CL_DEVICE_ENDIAN_LITTLE, "Device is little Endian : "); + B_Q_Info(device, CL_DEVICE_ERROR_CORRECTION_SUPPORT, "ECC Supported : "); + STR_Q_Info(device, CL_DEVICE_EXTENSIONS, "Device Extensions : "); + STR_Q_Info(device, CL_DEVICE_OPENCL_C_VERSION, "OpenCL C Version : "); + STR_Q_Info(device, CL_DEVICE_PROFILE, "Device Profile : "); + V_Q_Info(device, CL_DEVICE_PROFILING_TIMER_RESOLUTION, size_t, "Timer Resolution : ", "ld"); + { + cl_device_fp_config v; + I32 ret = clGetDeviceInfo( + device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(cl_device_fp_config), &v, NULL); + if (CL_SUCCESS != ret) { + map_cl_error_2_ee(ret); + } + + if (v & CL_FP_DENORM) { + printf("Device Support Denorm Single Float \n"); + } + if (v & CL_FP_INF_NAN) { + printf("Device Support Single Float INF NAN\n"); + } + if (v & CL_FP_ROUND_TO_NEAREST) { + printf("Device Support Single Float Round to Nearest\n"); + } + if (v & CL_FP_ROUND_TO_ZERO) { + printf("Device Support Single Float Round to Zero \n"); + } + if (v & CL_FP_ROUND_TO_INF) { + printf("Device Support Single Float Round to Inf\n"); + } + if (v & CL_FP_FMA) { + printf("Device Support Single Float FMA\n"); + } + if (v & CL_FP_SOFT_FLOAT) { + printf("Device does not Support Hardware Single Float\n"); + } + } + + STR_Q_Info(device, CL_DEVICE_VENDOR, "Device Vendor : "); + V_Q_Info(device, CL_DEVICE_VENDOR_ID, U32, "Device Vendor ID : ", "u"); + STR_Q_Info(device, CL_DEVICE_VERSION, "Device Version : "); + STR_Q_Info(device, CL_DRIVER_VERSION, "Driver Version : "); + B_Q_Info(device, CL_DEVICE_HOST_UNIFIED_MEMORY, "Unified Memory Supported : "); + V_Q_Info(device, CL_DEVICE_MAX_PARAMETER_SIZE, size_t, "Max Parameter Size : ", "ld"); + + printf("..............Global Memory Configuration.............\n"); + V_Q_Info(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong, "Max Memory Allocate Size : ", "lu"); + V_Q_Info(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, U32, "Max Base Address Align Size : ", "u"); + V_Q_Info(device, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, U32, "Min Data Type align Size :", "u"); + + V_Q_Info(device, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong, "Global Memory Cache Size : ", "lu"); + { + cl_device_mem_cache_type v; + I32 ret = clGetDeviceInfo( + device, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof(cl_device_mem_cache_type), &v, NULL); + if (CL_SUCCESS != ret) { + map_cl_error_2_ee(ret); + } + switch (v) { + case CL_NONE: + printf("Global Memory does not have Cache \n"); + break; + case CL_READ_ONLY_CACHE: + printf("Global Memory has Readonly Cache \n"); + break; + case CL_READ_WRITE_CACHE: + printf("Global Memory has Read Write Cache \n"); + 
break; + default: + printf("Unknown Global Memory Cache type \n"); + break; + } + } + + V_Q_Info( + device, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, U32, "Global Memory, Cacheline Size : ", "u"); + V_Q_Info(device, CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong, "Global Memory Size : ", "lu"); + // CL_DEVICE_HALF_FP_CONFIG + + printf("..................Image Information...................\n"); + B_Q_Info(device, CL_DEVICE_IMAGE_SUPPORT, "Image Supported : "); + V_Q_Info(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, size_t, "2D Image Max Height : ", "ld"); + V_Q_Info(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, size_t, "2D Image Max Width : ", "ld"); + V_Q_Info(device, CL_DEVICE_IMAGE3D_MAX_DEPTH, size_t, "3D Image Max Depth : ", "ld"); + V_Q_Info(device, CL_DEVICE_IMAGE3D_MAX_HEIGHT, size_t, "3D Image Max Height : ", "ld"); + V_Q_Info(device, CL_DEVICE_IMAGE3D_MAX_WIDTH, size_t, "3D Image Max Width : ", "ld"); + V_Q_Info(device, CL_DEVICE_MAX_READ_IMAGE_ARGS, U32, "Max Read Image Args : ", "u"); + V_Q_Info(device, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, U32, "Max Write Image Args : ", "u"); + V_Q_Info(device, CL_DEVICE_MAX_SAMPLERS, U32, "Max Samples : ", "u"); + + printf(".................Local Memory...............................\n"); + V_Q_Info(device, CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong, "Local Memory Size : ", "lu"); + { + cl_device_local_mem_type v; + I32 ret = clGetDeviceInfo( + device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(cl_device_local_mem_type), &v, NULL); + if (CL_SUCCESS != ret) { + map_cl_error_2_ee(ret); + } + switch (v) { + case CL_LOCAL: + printf("Device has Dedicate Local Memory\n"); + break; + case CL_GLOBAL: + printf("Local Memory uses Global Memory\n"); + break; + default: + printf("%d\n", __LINE__); + } + } + + printf("...................CU Information...........................\n"); + V_Q_Info(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, U32, "Max Clock Frequency : ", "u"); + V_Q_Info(device, CL_DEVICE_MAX_COMPUTE_UNITS, U32, "Max Compute Units : ", "u"); + + printf(".................Constant Memory Information.............\n"); + V_Q_Info(device, CL_DEVICE_MAX_CONSTANT_ARGS, U32, "Max Constant Args : ", "u"); + V_Q_Info( + device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, cl_ulong, "Max Constant Buffer Size : ", "lu"); + + printf("...................ND Range Information........................\n"); + V_Q_Info(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, size_t, "Max Work Group Size : ", "ld"); + V_Q_Info(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, U32, "Work Item Dimensions : ", "u"); + + { + size_t v[3]; + I32 ret = + clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, &v, NULL); + if (CL_SUCCESS != ret) { + map_cl_error_2_ee(ret); + } + printf("Max Work Item size : %ld %ld %ld\n", v[0], v[1], v[2]); + } + + printf(".....................Vector Information..................\n"); + V_Q_Info(device, CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, U32, "Native Vector Width Char : ", "u"); + V_Q_Info(device, CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, U32, "Native Vector Width Short : ", "u"); + V_Q_Info(device, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, U32, "Native Vector Width Int : ", "u"); + V_Q_Info(device, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, U32, "Native Vector Width Long : ", "u"); + V_Q_Info(device, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, U32, "Native Vector Width Float : ", "u"); + V_Q_Info( + device, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, U32, "Native Vector Width Double : ", "u"); + V_Q_Info(device, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, U32, "Native Vector Width Half : ", "u"); + + V_Q_Info( + device, 
CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, U32, "Preferred Vector Width Char : ", "u"); + V_Q_Info(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, U32, + "Preferred Vector Width Short : ", "u"); + V_Q_Info( + device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, U32, "Preferred Vector Width Int : ", "u"); + V_Q_Info( + device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, U32, "Preferred Vector Width Long : ", "u"); + V_Q_Info(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, U32, + "Preferred Vector Width Float : ", "u"); + V_Q_Info(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, U32, + "Preferred Vector Width Double : ", "u"); + V_Q_Info( + device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, U32, "Preferred Vector Width Half : ", "u"); + + return SUCCESS; +} + +#if defined(__cplusplus) +} +#endif +#endif diff --git a/common/gcl/include/program.h b/common/gcl/include/program.h new file mode 100644 index 00000000..c8dc15cd --- /dev/null +++ b/common/gcl/include/program.h @@ -0,0 +1,303 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
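platform.h above supplies the whole discovery chain: select_platform matches a vendor string against CL_PLATFORM_NAME, platform_get_devices enumerates devices of one type and allocates the array for the caller, and list_device_info prints one device's capabilities. A minimal GPU bring-up sketch using only those helpers:

Platform platform;
CHECK_STATUS(select_platform(VENDOR_ARM, &platform));
U32 numDevices = 0;
Device *devices = NULL;
CHECK_STATUS(platform_get_devices(platform, CL_DEVICE_TYPE_GPU, &numDevices, &devices));
if (numDevices > 0) {
    CHECK_STATUS(list_device_info(devices[0]));  // dump capabilities of GPU 0
}
free(devices);  // platform_get_devices allocated this for the caller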
+ +#ifndef PROGRAM_H_ +#define PROGRAM_H_ + +#ifdef __cplusplus +extern "C" { +#endif +#define check_build_program_error(ret, program, device) \ + { \ + if (SUCCESS != ret) { \ + void *buildLog; \ + U32 buildLogSize; \ + ret = get_program_build_info( \ + program, device, CL_PROGRAM_BUILD_LOG, &buildLog, &buildLogSize); \ + if (SUCCESS == ret) { \ + printf("build log of device %s\n", (char *)buildLog); \ + free(buildLog); \ + } \ + } \ + } + +/** + * @brief get build information of program + * @warning please free memory associated with value + **/ + +inline EE get_program_build_info( + Program program, Device device, cl_program_build_info info, void **value, U32 *size) +{ + if (NULL == value) { + return NULL_POINTER; + } + + size_t len; + I32 ret = clGetProgramBuildInfo(program, device, info, 0, NULL, &len); + if (SUCCESS == ret) { + if (NULL != size) { + *size = len; + } + void *data = malloc(len); + if (NULL == data) { + return ALLOC_FAILED; + } + ret = clGetProgramBuildInfo(program, device, info, len, data, NULL); + if (SUCCESS == ret) { + *value = data; + } else { + free(data); + } + } + + map_cl_error_2_ee(ret); +} + +/** + * @brief create program from source code + * + * @param context input, specify the associated context + * @param source input, source code + * @param program output, created program + * + **/ + +inline EE create_program_from_source(Context context, U32 *len, CI8 *str, Program *program) +{ + I32 ret; + size_t length = (size_t)(*len); + *program = clCreateProgramWithSource(context, 1, &str, &length, &ret); + map_cl_error_2_ee(ret); +} + +/** + * @brief create program from binary code + * + * @param context input, specify the associated context + * @param device input, the device to compile the code for + * @param length input, length of the binary + * @param binary input, program binary + * @param binary_status output, compile status of the binary + * @param program output, created program + * + **/ + +inline EE create_program_from_binary(Context context, + const Device device, + U32 *length, + CU8 **binary, + I32 *binary_status, + Program *program) +{ + I32 ret; + size_t len = *length; + *program = clCreateProgramWithBinary(context, 1, &device, &len, binary, binary_status, &ret); + map_cl_error_2_ee(ret); +} + +/** + * @brief build program + * + **/ + +inline EE build_program(Program program, Device device, CI8 *options) +{ + I32 ret = clBuildProgram(program, 1, &device, options, NULL, NULL); + if (CL_SUCCESS != ret) { + check_build_program_error(ret, program, device); + } + map_cl_error_2_ee(ret); +} + +/** + * @brief create program from source then build it + * + * @param context input, specify the associated context + * @param length input, length of source + * @param source input, source code + * @param device input, source will be built on this device + * @param options input, options for compiling source + * @param program output, created and built program + * + */ + +inline EE create_build_program_from_source( + Context context, U32 *length, CI8 *source, Device device, CI8 *options, Program *program) +{ + if (NULL == program) { + return NULL_POINTER; + } + Program prog; + EE ret; + create_program_from_source(context, length, source, &prog); + ret = build_program(prog, device, options); + *program = prog; + map_cl_error_2_ee(ret); +} + +/** + * @brief create program from binary then build it + * + **/ + +inline EE create_build_program_from_binary(Context context, + Device device, + U32 *length, + CU8 **binary, + CI8 *options, + I32 *binary_status, + 
Program *program) +{ + if (NULL == program) { + return NULL_POINTER; + } + Program prog; + EE ret; + create_program_from_binary(context, device, length, binary, binary_status, &prog); + ret = build_program(prog, device, options); + *program = prog; + map_cl_error_2_ee(ret); +} + +/** + * @brief get information of program + * @warning please free memory associate with value + **/ + +inline EE get_program_info(Program program, cl_program_info info, void **value, U32 *size) +{ + if (NULL == value) { + return NULL_POINTER; + } + size_t len; + I32 ret = clGetProgramInfo(program, info, 0, NULL, &len); + if (CL_SUCCESS == ret) { + if (NULL != size) { + *size = len; + } + void *data = malloc(len); + if (NULL == data) { + return ALLOC_FAILED; + } + ret = clGetProgramInfo(program, info, len, data, NULL); + if (CL_SUCCESS == ret) { + *value = data; + } else { + free(data); + } + } + map_cl_error_2_ee(ret); +} + +/** + * @brief get information of program + * @warning please free memory associate with value + **/ +inline EE get_program_binary(Program program, U8 **binary, U32 *len) +{ + size_t size; + I32 ret = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL); + if (CL_SUCCESS == ret) { + *len = (U32)(size); + void *data = malloc(size); + if (NULL == data) { + return ALLOC_FAILED; + } + ret = clGetProgramInfo( + program, CL_PROGRAM_BINARIES, size, &data, NULL); //waring: need set &data + if (CL_SUCCESS == ret) { + *binary = (U8 *)(data); + } else { + free(data); + } + } + map_cl_error_2_ee(ret); +} + +/** + * @brief get binary of source code + * + * @warning please don't free binary, it is return by ocl + * + **/ + +inline EE get_program_binary_from_source( + Context context, U32 *length, CI8 *str, Device device, CI8 *options, U8 **binary, U32 *len) +{ + if (NULL == binary) { + return NULL_POINTER; + } + + Program program; + EE ret = create_build_program_from_source(context, length, str, device, options, &program); + if (SUCCESS == ret) { + ret = get_program_binary(program, binary, len); + } + return ret; +} + +/* + inline EE create_program_from_il(Context context, + const void *il, U32 length, Program *program) { + //TODO + I32 ret; + * program = clCreateProgramWithIL(context, il, length, &ret); + map_cl_error_2_ee(ret); + } + */ + +inline EE release_program(Program program) +{ + map_cl_error_2_ee(clReleaseProgram(program)); +} + +inline EE compile_program(Program program, + const Device device, + CI8 *options, + U32 num_input_headers, + const Program *input_headers, + CI8 **header_include_names) +{ + I32 ret = clCompileProgram(program, 1, &device, options, num_input_headers, input_headers, + header_include_names, NULL, NULL); + if (CL_SUCCESS != ret) { + check_build_program_error(ret, program, device); + } + map_cl_error_2_ee(ret); +} + +inline EE link_program(Context context, + const Device device, + CI8 *options, + U32 num_input_programs, + const Program *input_programs, + Program *program) +{ + I32 ret; + *program = clLinkProgram( + context, 1, &device, options, num_input_programs, input_programs, NULL, NULL, &ret); + map_cl_error_2_ee(ret); +} + +inline EE unload_platform_compiler(Platform p) +{ + I32 ret = clUnloadPlatformCompiler(p); + map_cl_error_2_ee(ret); +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/common/gcl/src/CMakeLists.txt b/common/gcl/src/CMakeLists.txt new file mode 100644 index 00000000..ef8301af --- /dev/null +++ b/common/gcl/src/CMakeLists.txt @@ -0,0 +1,14 @@ +file(GLOB srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) + +# shared library 
+add_library(${PROJECT_NAME} SHARED ${srcs}) + +# static library +add_library(${PROJECT_NAME}_static STATIC ${srcs}) + +set_target_properties(${PROJECT_NAME}_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}") +set_target_properties(${PROJECT_NAME} PROPERTIES CLEAN_DIRECT_OUTPUT 1) +set_target_properties(${PROJECT_NAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) +install(TARGETS ${PROJECT_NAME} ${PROJECT_NAME}_static + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) diff --git a/common/gcl/src/ocl_context.cpp b/common/gcl/src/ocl_context.cpp new file mode 100644 index 00000000..8e348277 --- /dev/null +++ b/common/gcl/src/ocl_context.cpp @@ -0,0 +1,187 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
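program.h and kernel.h combine into the usual source-to-kernel path: build a Program from an OpenCL C string, extract a Kernel by name, and release both when done. This is a sketch only; ctx and dev are assumed to be a valid Context and Device obtained via the platform helpers, and the kernel source is illustrative.

const char *src =
    "__kernel void add1(__global float *x) { x[get_global_id(0)] += 1.0f; }";
U32 len = (U32)strlen(src);
Program prog;
Kernel k;
CHECK_STATUS(create_build_program_from_source(ctx, &len, src, dev, "", &prog));
CHECK_STATUS(create_kernel(prog, "add1", &k));
// ... set_kernel_arg + enqueue_ndrange_kernel to run it ...
CHECK_STATUS(release_kernel(k));
CHECK_STATUS(release_program(prog));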
+ +#include "gcl_func.h" +#include "ocl_context.h" + +OCLContext::OCLContext() +{ + UNI_DEBUG_LOG("OCLContext %p constructor start\n", (char *)this); + this->handle = std::shared_ptr(new GCLHandle()); + this->handle->platformId = 0; + this->handle->deviceId = 0; + this->handle->deviceType = CL_DEVICE_TYPE_GPU; + this->handle->eventPtr = nullptr; + this->handle->numWaitEvents = 0; + this->handle->waitEvents = nullptr; + this->handle->t_execute = 0; + this->handle->t_total = 0; + this->handle->curOpName = "unknow"; + this->handle->deviceName = "unknow"; + this->handle->kernel_source = nullptr; + this->handle->kernel_binmap = nullptr; + this->handle->kernel_binmap_handle = nullptr; + this->handle->common_source_opt = "unknow"; + this->handle->common_source_ext = "unknow"; + this->handle->source_head_name[0] = "unknow"; + this->handle->useBinMap = false; + this->handle->existProfilingQueue = false; + CHECK_STATUS(get_platforms(&(this->handle->numPlatform), &(this->handle->platforms))); + CHECK_STATUS(platform_get_devices(this->handle->platforms[this->handle->platformId], + this->handle->deviceType, &this->handle->numDevice, &this->handle->devices)); + CHECK_STATUS(create_context(this->handle->platforms[this->handle->platformId], + this->handle->numDevice, this->handle->devices, &this->handle->context)); + cl_queue_properties props[] = {CL_QUEUE_PROPERTIES, 0, 0}; +#ifdef _DEBUG + this->handle->eventPtr = &this->handle->eventObj; + props[1] = props[1] | CL_QUEUE_PROFILING_ENABLE; +#endif + CHECK_STATUS(create_command_queue_properties(this->handle->context, + this->handle->devices[this->handle->deviceId], props, &this->handle->queue)); + this->setDeviceName(); + this->registerBinaryKernelMap(); + if (!this->handle->useBinMap) { + this->registerSourceKernelMap(); + this->registerSourceKernelsExt(); + } + UNI_DEBUG_LOG("OCLContext %p constructor end\n", (char *)this); +} + +OCLContext::~OCLContext() +{ + UNI_DEBUG_LOG("OCLContext %p deconstructor start\n", (char *)this); + if (this->handle->platforms == nullptr) { + return; + } + CHECK_STATUS(finish(this->handle->queue)); + for (auto k : this->handle->programMap) { + CHECK_STATUS(release_program(k.second)); + } + for (auto k : this->handle->kernelMap) { + CHECK_STATUS(release_kernel(k.second)); + } + if (this->handle->useBinMap) { + delete (gcl_kernel_binmap *)this->handle->kernel_binmap; + dlclose(this->handle->kernel_binmap_handle); + } else { + CHECK_STATUS(release_program(this->handle->source_head[0])); + delete (gcl_kernel_source *)this->handle->kernel_source; + } + this->handle->kernelMap.clear(); + if (this->handle->existProfilingQueue) { + CHECK_STATUS(finish(this->handle->queue_profiling)); + CHECK_STATUS(release_command_queue(this->handle->queue_profiling)); + } + CHECK_STATUS(release_command_queue(this->handle->queue)); + CHECK_STATUS(release_context(this->handle->context)); + CHECK_STATUS(release_device(this->handle->devices[this->handle->deviceId])); + free(this->handle->devices); + free(this->handle->platforms); + UNI_DEBUG_LOG("OCLContext %p deconstructor end\n", (char *)this); +} + +void OCLContext::setDeviceName() +{ + cl_device_id device = this->handle->devices[this->handle->deviceId]; + U32 len; + I8 *data; + CHECK_STATUS(get_device_info(device, CL_DEVICE_NAME, (void **)&data, &len)); + I8 devName[64]; + for (U32 i = 0; i < len - 1; i++) { + if (data[i] == '-') { + data[i] = '_'; + } + if (data[i] == ' ') { + data[i] = '_'; + } + devName[i] = data[i]; + } + U32 version_len; + free(data); + CHECK_STATUS(get_device_info(device, 
CL_DEVICE_VERSION, (void **)&data, &version_len)); + std::string deviceV = std::string(data); + U32 be = deviceV.find("r"); + U32 end = deviceV.find("p", be + 1); + std::string numV = deviceV.substr(be + 1, end - be - 1); + U32 i = atoi(numV.c_str()); + if (i >= 14) { + devName[len - 1] = 'p'; + devName[len] = '\0'; + } else { + devName[len - 1] = '\0'; + } + free(data); + this->handle->deviceName = devName; +} + +void OCLContext::registerBinaryKernelMap() +{ + std::string libKernelBinName = "lib" + this->handle->deviceName + "_map.so"; + char *err; + void *dvm_handle = dlopen(libKernelBinName.c_str(), RTLD_LAZY); + if (dvm_handle) { + std::string func = "create_" + this->handle->deviceName + "_kernelbin_map"; + gcl_kernel_binmap *(*create_kernelbin_map)(); + dlerror(); + create_kernelbin_map = (gcl_kernel_binmap * (*)()) dlsym(dvm_handle, func.c_str()); + if ((err = dlerror()) != NULL) { + UNI_ERROR_LOG( + "Get %s in %s failed, error %s\n", func.c_str(), libKernelBinName.c_str(), err); + dlclose(dvm_handle); + } + gcl_kernel_binmap *kernel_binmap = create_kernelbin_map(); + this->handle->kernel_binmap = (void *)kernel_binmap; + this->handle->useBinMap = true; + this->handle->kernel_binmap_handle = dvm_handle; + } else { + UNI_DEBUG_LOG("try to dlopen %s failed, %s, create kernel from source code\n", + libKernelBinName.c_str(), dlerror()); + } +} + +void OCLContext::registerSourceKernelMap() +{ + gcl_kernel_source *kernel_source = new kernel_source_executor(); + this->handle->kernel_source = kernel_source; + KernelOption *common_opt; + if (!kernel_source->get_option("common", &common_opt)) { + UNI_ERROR_LOG("the common doesn't exist in optionMap\n"); + CHECK_STATUS(NULL_POINTER); + } + this->handle->common_source_opt = common_opt->option; + this->handle->common_source_ext = "#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable\n"; + this->handle->common_source_ext += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"; + this->handle->source_head_name[0] = "kernel_def.h"; + KernelSource *head_source; + if (!kernel_source->get_source("kernel_def", &head_source)) { + UNI_ERROR_LOG("the kernel_def doesn't exist in sourceMap\n"); + CHECK_STATUS(NULL_POINTER); + } + CHECK_STATUS(create_program_from_source(this->handle->context, (U32 *)&head_source->len, + head_source->data, this->handle->source_head)); +} + +void OCLContext::registerSourceKernelsExt() +{ + Kernel tmpK; + CHECK_STATUS(gcl_get_kernel_from_map(this->handle.get(), "padding_input_gclmem", &tmpK)); + CHECK_STATUS(gcl_get_kernel_from_map(this->handle.get(), "mem_trans_nchw_to_ncwhc4", &tmpK)); + CHECK_STATUS(gcl_get_kernel_from_map(this->handle.get(), "mem_trans_ncwhc4_to_nchw", &tmpK)); + CHECK_STATUS(gcl_get_kernel_from_map(this->handle.get(), "mem_trans_ncwhc4_to_mtk", &tmpK)); +} + +OCLContext &OCLContext::getInstance() +{ + static OCLContext _instance; + return _instance; +} diff --git a/common/gcl/src/ocl_data_trans.cpp b/common/gcl/src/ocl_data_trans.cpp new file mode 100644 index 00000000..9057614d --- /dev/null +++ b/common/gcl/src/ocl_data_trans.cpp @@ -0,0 +1,279 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gcl_common.h" +#include "gcl_func.h" +#include "gclmem_desc_infer.h" +#include "ocl_data_trans.h" + +EE ocl_set_input(GCLHandle_t handle, + GCLMem_t input, + TensorDesc hostDesc, + const U8 *hostPtr, + GCLMem_t tmpBuf, + bool blocking) +{ + GCLMemDesc desc = input->desc; + if (desc.memType == GCL_MEM_BUF) { + U32 size = tensorNumBytes(hostDesc); + Kernel kernel; + U32 iw, ih, ic, in; + DataType hdt; + DataFormat hdf; + if (hostDesc.df == DF_NCHW || hostDesc.df == DF_NHWC) { + tensorSelectGet(hostDesc, &hdt, &hdf, &in, &ic, &ih, &iw); + } else if (hostDesc.df == DF_NORMAL) { + tensor2dGet(hostDesc, &hdt, &hdf, &ih, &iw); + ic = 1; + in = 1; + hdf = DF_NORMAL; + } else { + return NOT_SUPPORTED; + } + if (hdf == DF_NCHW) { + U32 ow, oh, pw, ph; + ow = input->desc.stride[0]; + oh = input->desc.stride[1]; + pw = input->desc.offset[0]; + ph = input->desc.offset[1]; + if (desc.memFormat == DF_NCHW || (ow == 1 && oh == 1 && pw == 0 && ph == 0)) { + GCLMem_t dst = (iw == ow && ih == oh) ? 
input : tmpBuf; + CHECK_STATUS(gcl_trans_memory( + handle, (void *)hostPtr, (void *)dst, &size, HOST_TO_DEVICE_BUF, CL_TRUE)); + if (iw != ow || ih != oh) { + CHECK_STATUS(gcl_get_kernel_from_map(handle, "padding_input_gclmem", &kernel)); + CHECK_STATUS( + gcl_set_kernelArgs(kernel, iw, ih, pw, ph, ow, oh, tmpBuf->mem, input->mem)); + U32 gs[3] = {(ow + 3) / 4 * 4, (oh + 3) / 4 * 4, ic}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + CHECK_STATUS( + gcl_run_kernel(handle, kernel, dim, gs, ls, "padding_input_gclmem")); + } + return SUCCESS; + } + + if (desc.memFormat == DF_NCWHC4) { + if (hdt != DT_F16) { + return NOT_SUPPORTED; + } + oh = input->desc.stride[0]; + ow = input->desc.stride[1]; + ph = input->desc.offset[0]; + pw = input->desc.offset[1]; + gcl_trans_memory( + handle, (void *)hostPtr, (void *)tmpBuf, &size, HOST_TO_DEVICE_BUF, blocking); + CHECK_STATUS(gcl_get_kernel_from_map(handle, "mem_trans_nchw_to_ncwhc4", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw, ih, 0, 0, ow, oh, pw, ph, iw, ih, ic, + iw, ih, ic, 0, 0, tmpBuf->mem, input->mem)); + U32 gs[3] = {(iw + 3) / 4, ih, (ic + 3) / 4 * in}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + CHECK_STATUS( + gcl_run_kernel(handle, kernel, dim, gs, ls, "mem_trans_nchw_to_ncwhc4")); + return SUCCESS; + } + return NOT_SUPPORTED; + } + + if (hdf == DF_NHWC) { + U32 oc, ow, pc, pw; + oc = input->desc.stride[0]; + ow = input->desc.stride[1]; + pc = input->desc.offset[0]; + pw = input->desc.offset[1]; + if (desc.memFormat == DF_NHWC) { + if (ic == oc && iw == ow && pc == 0 && pw == 0) { + gcl_trans_memory(handle, (void *)hostPtr, (void *)input, &size, + HOST_TO_DEVICE_BUF, blocking); + return SUCCESS; + } + } + return NOT_SUPPORTED; + } + + if (hdf == DF_NORMAL) { + U32 oh, ow, ph, pw; + ow = input->desc.stride[0]; + oh = input->desc.stride[1]; + pw = input->desc.offset[0]; + ph = input->desc.offset[1]; + if (desc.memFormat == DF_NCHW) { + if (iw == ow && ih == oh && pw == 0 && ph == 0) { + gcl_trans_memory(handle, (void *)hostPtr, (void *)input, &size, + HOST_TO_DEVICE_BUF, blocking); + return SUCCESS; + } + } + return NOT_SUPPORTED; + } + } + return NOT_SUPPORTED; +} + +EE ocl_get_output(GCLHandle_t handle, const GCLMem_t input, TensorDesc hostDesc, bool blocking) +{ + GCLMemDesc desc = input->desc; + Kernel kernel; + DataType host_dt; + DataFormat host_df, device_df; + U32 ow, oh, oc, on; + U32 iw, ih, ic, pw, ph; + tensorSelectGet(hostDesc, &host_dt, &host_df, &on, &oc, &oh, &ow); + U32 size = tensorNumBytes(hostDesc); + U32 offset = 0; + get_gclmem_dim(desc, &iw, &ih, &ic, &pw, &ph); + device_df = desc.memFormat; + if (desc.byteSize < size) { + CHECK_STATUS(NOT_MATCH); + } + + if (device_df == DF_NCWHC4 && (host_df == DF_NCHW || host_df == DF_NORMAL) && + host_dt == DT_F16 && (ih != 1 || iw != 1)) { + if (desc.byteSize < size * 2) { + CHECK_STATUS(NOT_MATCH); + } + offset = iw * ih * ic * 4; + CHECK_STATUS(gcl_get_kernel_from_map(handle, "mem_trans_ncwhc4_to_nchw", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw, ih, pw, ph, ow, oh, 0, 0, ow, oh, oc, ow, oh, + oc, 0, offset, input->mem, input->mem)); + U32 gs[3] = {oh, (ow + 3) >> 2, (oc + 3) / 4 * on}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "mem_trans_ncwhc4_to_nchw")); + offset = offset * bytesOf(host_dt); + } + + if (device_df == DF_NCWHC4 && host_df == DF_MKT) { + if (desc.byteSize < size * 2) { + CHECK_STATUS(NOT_MATCH); + } + offset = iw * ih * ic * 4; + U32 gs[2] = {oh, (oc + 3) / 4}; + U32 ls[2] = {0, 0}; + 
U32 dim = 2;
+        CHECK_STATUS(gcl_get_kernel_from_map(handle, "mem_trans_ncwhc4_to_mtk", &kernel));
+        CHECK_STATUS(gcl_set_kernelArgs(
+            kernel, ih, iw, ph, pw, oc, offset, gs[0], gs[1], input->mem, input->mem));
+        CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "mem_trans_ncwhc4_to_mtk"));
+        offset = offset * bytesOf(host_dt);
+    }
+    CHECK_STATUS(gcl_map_memory(handle, input, &offset, &size, CL_MAP_READ, blocking));
+    return SUCCESS;
+}
+
+EE ocl_trans_mem(
+    GCLHandle_t handle, GCLMem_t src, GCLMemDesc srcDesc, GCLMem_t dst, GCLMemDesc dstDesc)
+{
+    if (srcDesc.memType == dstDesc.memType && srcDesc.memType == GCL_MEM_BUF) {
+        U32 sw_str, sh_str, sc_str, sw_off, sh_off;
+        U32 dw_str, dh_str, dc_str, dw_off, dh_off;
+        DataFormat sf, df;
+        sf = srcDesc.memFormat;
+        df = dstDesc.memFormat;
+        get_gclmem_dim(srcDesc, &sw_str, &sh_str, &sc_str, &sw_off, &sh_off);
+        get_gclmem_dim(dstDesc, &dw_str, &dh_str, &dc_str, &dw_off, &dh_off);
+        U32 gs[3] = {0, 0, 0};
+        U32 ls[3] = {0, 0, 0};
+        U32 dim = 3;
+        Mem srcMem = src->mem;
+        Mem dstMem = dst->mem;
+        Kernel kernel;
+        if (sf == df) {
+            if (sw_str == dw_str && sh_str == dh_str && sc_str == dc_str && sw_off == dw_off &&
+                sh_off == dh_off) {
+                U32 len = srcDesc.num;
+                gs[0] = (len + 3) / 4;
+                ls[0] = 0;
+                dim = 1;
+                CHECK_STATUS(gcl_create_kernel(handle, "copy_f16", &kernel));
+                CHECK_STATUS(gcl_set_kernelArgs(kernel, len, len, 0, 0, gs[0], srcMem, dstMem));
+                gcl_set_kernelVec(handle, kernel, dim, gs, ls, "copy_f16");
+            } else if (sf == DF_NCHW && sw_off == 0 && sh_off == 0 && sc_str == dc_str) {
+                gs[0] = (dw_str + 3) / 4 * 4;
+                gs[1] = (dh_str + 3) / 4 * 4;
+                gs[2] = dc_str;
+                dim = 3;
+                CHECK_STATUS(gcl_create_kernel(handle, "padding_input_gclmem", &kernel));
+                CHECK_STATUS(gcl_set_kernelArgs(
+                    kernel, sw_str, sh_str, dw_off, dh_off, dw_str, dh_str, srcMem, dstMem));
+                gcl_set_kernelVec(handle, kernel, dim, gs, ls, "padding_input_gclmem");
+            } else {
+                CHECK_STATUS(NOT_SUPPORTED);
+            }
+        } else if (sf == DF_NCHW && df == DF_NCWHC4) {
+            U32 iw, ih, ic;
+            TensorDesc cpuDesc = tensor4df(srcDesc.dt, srcDesc.df, srcDesc.dims[3], srcDesc.dims[2],
+                srcDesc.dims[1], srcDesc.dims[0]);
+            tensorSelectGet(cpuDesc, NULL, NULL, NULL, &ic, &ih, &iw);
+            gs[0] = (iw + 3) / 4;
+            gs[1] = ih;
+            gs[2] = (ic + 3) / 4;
+            U32 ls[3] = {0, 0, 0};
+            U32 dim = 3;
+            CHECK_STATUS(gcl_create_kernel(handle, "mem_trans_nchw_to_ncwhc4", &kernel));
+            CHECK_STATUS(gcl_set_kernelArgs(kernel, sw_str, sh_str, sw_off, sh_off, dw_str, dh_str,
+                dw_off, dh_off, iw, ih, ic, iw, ih, ic, 0, 0, srcMem, dstMem));
+            gcl_set_kernelVec(handle, kernel, dim, gs, ls, "mem_trans_nchw_to_ncwhc4");
+        } else {
+            CHECK_STATUS(NOT_SUPPORTED);
+        }
+    } else {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    return SUCCESS;
+}
+
+EE ocl_map_mem(GCLHandle_t handle, GCLMem_t gclMem, GCLMemDesc desc)
+{
+    DataType dt;
+    DataFormat df;
+    U32 n, c, h, w;
+    CHECK_STATUS(gclmem_get_desc_non_padding(desc, &dt, &df, &n, &c, &h, &w));
+
+    DataFormat mf = desc.memFormat;
+    U32 w_str, h_str, c_str, w_off, h_off;
+    get_gclmem_dim(desc, &w_str, &h_str, &c_str, &w_off, &h_off);
+    bool needTrans = true;
+    U32 offset = 0;
+    U32 size = n * c * h * w * bytesOf(dt);
+    if (w_str == w && h_str == h && c_str == c && mf != DF_NCWHC4) {
+        needTrans = false;
+    }
+    if (w_str == 1 && h_str == 1 && mf == DF_NCWHC4) {
+        needTrans = false;
+    }
+    if (needTrans) {
+        if (mf == DF_NCWHC4) {
+            U32 gs[3] = {h, (w + 3) >> 2, (c + 3) / 4 * n};
+            U32 ls[3] = {0, 0, 0};
+            U32 dim = 3;
+            Kernel kernel;
+            Mem buf = gclMem->mem;
+            offset = desc.num;
+
CHECK_STATUS(gcl_get_kernel_from_map(handle, "mem_trans_ncwhc4_to_nchw", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, w_str, h_str, w_off, h_off, w, h, 0, 0, w, h, c, + w, h, c, 0, offset, buf, buf)); + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "mem_trans_ncwhc4_to_nchw")); + offset = desc.num * bytesOf(dt); + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + } + CHECK_STATUS(gcl_map_memory(handle, gclMem, &offset, &size, CL_MAP_READ, CL_TRUE)); + return SUCCESS; +} diff --git a/gcl/tools/device_info/CMakeLists.txt b/common/gcl/tools/device_info/CMakeLists.txt similarity index 71% rename from gcl/tools/device_info/CMakeLists.txt rename to common/gcl/tools/device_info/CMakeLists.txt index e96015bc..d9dbd182 100644 --- a/gcl/tools/device_info/CMakeLists.txt +++ b/common/gcl/tools/device_info/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.2) -file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/bolt.cmake ${BOLT_ROOT}/bolt.cmake) +file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake) if (BOLT_CONFIGURE_FILE) set(USE_LLVM_CLANG ON) set(USE_GNU_GCC OFF) @@ -18,13 +18,11 @@ project(gclinfo) set_policy() -SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${BOLT_ROOT}/cmakes") find_package(Gcl) -find_package(Uni) set_c_cxx_flags() set_test_c_cxx_flags() add_executable(gcl_info clinfo.cpp) -TARGET_LINK_LIBRARIES(gcl_info ${OPENCL_LIBRARIES}) +target_link_libraries(gcl_info ${OPENCL_LIBRARIES} log -Wl,-allow-shlib-undefined, -static-libstdc++) diff --git a/gcl/tools/device_info/clinfo.cpp b/common/gcl/tools/device_info/clinfo.cpp similarity index 81% rename from gcl/tools/device_info/clinfo.cpp rename to common/gcl/tools/device_info/clinfo.cpp index 97530a7b..a705d882 100644 --- a/gcl/tools/device_info/clinfo.cpp +++ b/common/gcl/tools/device_info/clinfo.cpp @@ -1,23 +1,22 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include"gcl.h" -int main(){ +#include "gcl.h" +int main() +{ GCLHandle_t handle; CHECK_STATUS(gcl_create_handle(&handle)); CHECK_STATUS(list_device_info(handle->devices[handle->deviceId])); - CHECK_STATUS(gcl_destroy_handle(handle)); + gcl_destroy_handle(handle); return 0; } diff --git a/gcl/tools/gcl_sample/CMakeLists.txt b/common/gcl/tools/gcl_sample/CMakeLists.txt similarity index 55% rename from gcl/tools/gcl_sample/CMakeLists.txt rename to common/gcl/tools/gcl_sample/CMakeLists.txt index 625209b8..0fdb0401 100644 --- a/gcl/tools/gcl_sample/CMakeLists.txt +++ b/common/gcl/tools/gcl_sample/CMakeLists.txt @@ -1,11 +1,7 @@ cmake_minimum_required(VERSION 3.4.1) -file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/bolt.cmake ${BOLT_ROOT}/bolt.cmake) +file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake) if (BOLT_CONFIGURE_FILE) - set(USE_LLVM_CLANG ON) - set(USE_GNU_GCC OFF) - set(USE_MALI ON) - set(USE_DYNAMIC_LIBRARY OFF) include(${BOLT_CONFIGURE_FILE}) else (BOLT_CONFIGURE_FILE) message(FATAL_ERROR " @@ -18,13 +14,10 @@ project(sample) set_policy() -SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${BOLT_ROOT}/cmakes") find_package(Gcl) -find_package(Uni) set_c_cxx_flags() - set_test_c_cxx_flags() add_executable(sample sample.cpp) -target_link_libraries(sample ${KERNELBIN_LIBRARIES} ${OPENCL_LIBRARIES}) +target_link_libraries(sample ${KERNELSOURCE_LIBRARIES} ${OPENCL_LIBRARIES} log -Wl,-allow-shlib-undefined, -static-libstdc++) diff --git a/common/gcl/tools/gcl_sample/build.sh b/common/gcl/tools/gcl_sample/build.sh new file mode 100644 index 00000000..8abf4138 --- /dev/null +++ b/common/gcl/tools/gcl_sample/build.sh @@ -0,0 +1,20 @@ +options="-DUSE_CROSS_COMPILE=ON \ + -DBUILD_TEST=ON \ + -DUSE_GNU_GCC=OFF \ + -DUSE_LLVM_CLANG=ON \ + -DUSE_MALI=ON \ + -DUSE_NEON=ON \ + -DUSE_DYNAMIC_LIBRARY=OFF \ + -DCMAKE_C_COMPILER=`which aarch64-linux-android21-clang` \ + -DCMAKE_CXX_COMPILER=`which aarch64-linux-android21-clang++` \ + -DCMAKE_STRIP=`which aarch64-linux-android-strip` " +rm -rf ./build_gcl_sample +mkdir ./build_gcl_sample +cd ./build_gcl_sample +cmake .. ${options} +make -j33 +cd .. + + + + diff --git a/gcl/tools/gcl_sample/cl/sample.cl b/common/gcl/tools/gcl_sample/cl/sample.cl similarity index 75% rename from gcl/tools/gcl_sample/cl/sample.cl rename to common/gcl/tools/gcl_sample/cl/sample.cl index 555f5271..c2ceaa8d 100644 --- a/gcl/tools/gcl_sample/cl/sample.cl +++ b/common/gcl/tools/gcl_sample/cl/sample.cl @@ -11,23 +11,36 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
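+// sample: a stride-1 3x3 convolution over fc input channels; each work-item
+// accumulates one output pixel from vload3 rows of in and flt plus bias. T and
+// T3 are assumed to be the scalar/vector types injected by the -D T=... compile
+// options used elsewhere in this patch.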
-__kernel void sample(const int iw_str, const int iwh_str, const int fc, const int flt_str, const int ow_str, const int oh_str, const int bx, const int by, - __global const T* in, __global const T* flt, __global const T* bias, __global T* out){ +__kernel void sample(const int iw_str, + const int iwh_str, + const int fc, + const int flt_str, + const int ow_str, + const int oh_str, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __global const T *bias, + __global T *out) +{ const int idx = get_global_id(0); const int idy = get_global_id(1); const int idz = get_global_id(2); - if(idx >= bx || idy >= by) return; + if (idx >= bx || idy >= by) { + return; + } T3 flt_val; T3 in_val; - T out_val; + T out_val; out_val = bias[idz]; int flt_off = idz * flt_str; - int in_off = idy * iw_str + idx; - for(int i = 0 ; i < fc; ++i) { - for(uchar j = 0; j < 3; ++j) { + int in_off = idy * iw_str + idx; + for (int i = 0; i < fc; ++i) { + for (uchar j = 0; j < 3; ++j) { flt_val = vload3(0, flt + flt_off + j * 3); - in_val = vload3(0, in + in_off + j * iw_str); + in_val = vload3(0, in + in_off + j * iw_str); out_val += flt_val.x * in_val.x; out_val += flt_val.y * in_val.y; out_val += flt_val.z * in_val.z; @@ -35,7 +48,7 @@ __kernel void sample(const int iw_str, const int iwh_str, const int fc, const in flt_off += 9; in_off += iwh_str; } - + int out_off = (idz * oh_str + idy) * ow_str + idx; out[out_off] = out_val; } diff --git a/common/gcl/tools/gcl_sample/sample.cpp b/common/gcl/tools/gcl_sample/sample.cpp new file mode 100644 index 00000000..da6f3154 --- /dev/null +++ b/common/gcl/tools/gcl_sample/sample.cpp @@ -0,0 +1,168 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
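+
+// A sketch of this sample's flow, matching the calls below: describe GCL buffers
+// by hand with setMemDesc, allocate them with gcl_create_memory, upload random
+// FP16 host data with gcl_trans_memory, then compile and profile the
+// conv_direct_s1_fn_spe_nchw_3<i> kernels for output tile widths i = 3..8 via
+// gcl_set_kernelVec / gcl_run_kernelVec_timing.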
+#ifdef _USE_FP16
+#include "gcl.h"
+#include "types.h"
+#include "libkernelsource.h"
+
+void setMemDesc(GCLMem_t mem,
+    DataType dt,
+    DataFormat ft,
+    GCLMemType mt,
+    U32 s0,
+    U32 s1,
+    U32 s2,
+    U32 off0,
+    U32 off1,
+    U32 off2)
+{
+    mem->desc.stride[0] = s0 + 2 * off0;
+    mem->desc.stride[1] = s1 + 2 * off1;
+    mem->desc.stride[2] = s2;
+    mem->desc.offset[0] = off0;
+    mem->desc.offset[1] = off1;
+    mem->desc.offset[2] = off2;
+    mem->desc.num = (s0 + 2 * off0) * (s1 + 2 * off1) * s2;
+    mem->desc.byteSize = s0 * s1 * s2 * bytesOf(dt);
+    mem->desc.memFormat = ft;
+    mem->desc.memType = mt;
+}
+
+int main()
+{
+    GCLHandle_t handle;
+    CHECK_STATUS(gcl_create_handle(&handle));
+    U32 iw, ih, ic, in;
+    U32 fw, fh, fc, fn;
+    U32 sv, pv;
+    U32 ow, oh, oc, on;
+
+    iw = 1440;
+    ih = 960;
+    ic = 4;
+    in = 1;
+
+    fw = 3;
+    fh = 3;
+    fc = 4;
+    fn = 1;
+
+    ow = iw;
+    oh = ih;
+    oc = fn;
+    on = in;
+
+    sv = 1;
+    pv = 1;
+
+    TensorDesc outDesc = tensor4d(DT_F16, on, oc, oh, ow);
+    GCLMem_t input = gcl_create_gclmem();
+    GCLMem_t flt = gcl_create_gclmem();
+    GCLMem_t bias = gcl_create_gclmem();
+    GCLMem_t output = gcl_create_gclmem();
+    setMemDesc(input, DT_F16, DF_NCWHC4, GCL_MEM_BUF, iw + 8, ih, ic, pv, pv, 0);
+    setMemDesc(flt, DT_F16, DF_NCWHC4, GCL_MEM_BUF, fw * fh, fc, fn, 0, 0, 0);
+    setMemDesc(bias, DT_F16, DF_NCHW, GCL_MEM_BUF, fn, 1, 1, 0, 0, 0);
+    setMemDesc(output, DT_F16, DF_NCHW, GCL_MEM_BUF, ow, oh, oc * 4, 0, 0, 0);
+    CHECK_STATUS(gcl_create_memory(handle, input));
+    CHECK_STATUS(gcl_create_memory(handle, flt));
+    CHECK_STATUS(gcl_create_memory(handle, bias));
+    CHECK_STATUS(gcl_create_memory(handle, output));
+
+    U8 *iptr = new U8[input->desc.byteSize];
+    U8 *fptr = new U8[flt->desc.byteSize];
+    U8 *bptr = new U8[bias->desc.byteSize];
+
+    F16 *ival = (F16 *)iptr;
+    F16 *fval = (F16 *)fptr;
+    F16 *bval = (F16 *)bptr;
+    for (U32 i = 0; i < input->desc.num; i++) {
+        ival[i] = (rand() & 1023) / 1024.0 - 0.5;
+        U32 s0 = input->desc.stride[0];
+        U32 s1 = input->desc.stride[1];
+        U32 j = i % (s0 * s1);
+        U32 h = j % s1;
+        U32 w = j / s1;
+        if (h < pv || w < pv) {
+            ival[i] = 0;
+        }
+        if (h >= ih + pv || w >= iw + pv) {
+            ival[i] = 0;
+        }
+    }
+
+    for (U32 i = 0; i < flt->desc.num; i++) {
+        fval[i] = (rand() & 1023) / 1024.0 - 0.5;
+    }
+
+    for (U32 i = 0; i < bias->desc.num; i++) {
+        bval[i] = (rand() & 1023) / 1024.0 - 0.5;
+    }
+
+    CHECK_STATUS(gcl_trans_memory(
+        handle, (void *)iptr, (void *)input, &input->desc.byteSize, HOST_TO_DEVICE_BUF, CL_TRUE));
+    CHECK_STATUS(gcl_trans_memory(
+        handle, (void *)fptr, (void *)flt, &flt->desc.byteSize, HOST_TO_DEVICE_BUF, CL_TRUE));
+    CHECK_STATUS(gcl_trans_memory(
+        handle, (void *)bptr, (void *)bias, &bias->desc.byteSize, HOST_TO_DEVICE_BUF, CL_TRUE));
+
+    Kernel kernel;
+    char kernelname[128];
+    U32 be = 0;
+    U32 end = 0;
+    for (int i = 3; i <= 8; i++) {
+        sprintf(kernelname, "conv_direct_s1_fn_spe_nchw_3%d", i);
+        CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel));
+        U32 iw_str = input->desc.stride[0];
+        U32 ih_str = input->desc.stride[1];
+        U32 ihw_str = iw_str * ih_str;
+        U32 ic_str = (input->desc.stride[2] + 3) / 4;
+        U32 ih_off = input->desc.offset[0] - pv;
+        U32 iw_off = input->desc.offset[1] - pv;
+        U32 sh = 1;
+        U32 ow_str = output->desc.stride[0];
+        U32 oh_str = output->desc.stride[1];
+        U32 ohw_str = ow_str * oh_str;
+        U32 ow_off = output->desc.offset[0];
+        U32 oh_off = output->desc.offset[1];
+
+        U32 gs[2];
+        gs[0] = oh;
+        gs[1] = (ow + i - 1) / i;
+        U32 dim = 2;
+        U32 ls[2] = {0, 0};
+        CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str,
ihw_str, ic_str, ih_off, iw_off, oh_str, + ow_str, ohw_str, oh_off, ow_off, ow, sh, gs[0], gs[1], input->mem, flt->mem, bias->mem, + output->mem)); + CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname)); + end = handle->kernelVec.size(); + CHECK_STATUS(gcl_run_kernelVec_timing(handle, be, end)); + CHECK_STATUS(gcl_run_kernelVec_timing(handle, be, end)); + CHECK_STATUS(gcl_run_kernelVec_timing(handle, be, end)); + CHECK_STATUS(gcl_run_kernelVec_timing(handle, be, end)); + be = end; +#ifdef _DEBUG + CHECK_STATUS(gcl_check_data(handle, outDesc, output, 0, false)); +#endif + } + + delete[] iptr; + delete[] fptr; + delete[] bptr; + gcl_destroy_gclmem(input); + gcl_destroy_gclmem(flt); + gcl_destroy_gclmem(bias); + gcl_destroy_gclmem(output); + gcl_destroy_handle(handle); +} +#endif diff --git a/gcl/tools/kernel_lib_compile/CMakeLists.txt b/common/gcl/tools/kernel_lib_compile/CMakeLists.txt similarity index 58% rename from gcl/tools/kernel_lib_compile/CMakeLists.txt rename to common/gcl/tools/kernel_lib_compile/CMakeLists.txt index 807c6128..8fc8b455 100644 --- a/gcl/tools/kernel_lib_compile/CMakeLists.txt +++ b/common/gcl/tools/kernel_lib_compile/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.2) -file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/bolt.cmake ${BOLT_ROOT}/bolt.cmake) +file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake) if (BOLT_CONFIGURE_FILE) include(${BOLT_CONFIGURE_FILE}) else (BOLT_CONFIGURE_FILE) @@ -15,9 +15,7 @@ project(KERNELBIN) set_policy() -SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${BOLT_ROOT}/cmakes") find_package(Gcl) -find_package(Uni) include_directories(${PROJECT_SOURCE_DIR}/include) set_project_install_directory() @@ -29,8 +27,8 @@ execute_process( ) file(GLOB_RECURSE kernel_src_list "src/*.cpp") -ADD_LIBRARY(kernelbin SHARED ${kernel_src_list}) -ADD_LIBRARY(kernelbin_static STATIC ${kernel_src_list}) -SET_TARGET_PROPERTIES(kernelbin_static PROPERTIES OUTPUT_NAME "kernelbin") -SET_TARGET_PROPERTIES(kernelbin PROPERTIES CLEAN_DIRECT_OUTPUT 1) -SET_TARGET_PROPERTIES(kernelbin_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) +add_library(kernelbin SHARED ${kernel_src_list}) +add_library(kernelbin_static STATIC ${kernel_src_list}) +set_target_properties(kernelbin_static PROPERTIES OUTPUT_NAME "kernelbin") +set_target_properties(kernelbin PROPERTIES CLEAN_DIRECT_OUTPUT 1) +set_target_properties(kernelbin_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) diff --git a/gcl/tools/kernel_lib_compile/buildKernelLib.sh b/common/gcl/tools/kernel_lib_compile/buildKernelLib.sh similarity index 100% rename from gcl/tools/kernel_lib_compile/buildKernelLib.sh rename to common/gcl/tools/kernel_lib_compile/buildKernelLib.sh diff --git a/gcl/tools/kernel_lib_compile/device_name/CMakeLists.txt b/common/gcl/tools/kernel_lib_compile/device_name/CMakeLists.txt similarity index 68% rename from gcl/tools/kernel_lib_compile/device_name/CMakeLists.txt rename to common/gcl/tools/kernel_lib_compile/device_name/CMakeLists.txt index b709b930..e311eeea 100644 --- a/gcl/tools/kernel_lib_compile/device_name/CMakeLists.txt +++ b/common/gcl/tools/kernel_lib_compile/device_name/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.2) -file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/bolt.cmake ${BOLT_ROOT}/bolt.cmake) +file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake) if (BOLT_CONFIGURE_FILE) include(${BOLT_CONFIGURE_FILE}) else (BOLT_CONFIGURE_FILE) @@ -18,9 
+18,7 @@ set_c_cxx_flags() set_test_c_cxx_flags() -SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${BOLT_ROOT}/cmakes") find_package(Gcl) -find_package(Uni) add_executable(gcl_device_name device_name.cpp) -TARGET_LINK_LIBRARIES(gcl_device_name ${OPENCL_LIBRARIES} -Wl,-allow-shlib-undefined, -static-libstdc++) +target_link_libraries(gcl_device_name ${OPENCL_LIBRARIES} -Wl,-allow-shlib-undefined, -static-libstdc++) diff --git a/gcl/tools/kernel_lib_compile/device_name/device_name.cpp b/common/gcl/tools/kernel_lib_compile/device_name/device_name.cpp similarity index 74% rename from gcl/tools/kernel_lib_compile/device_name/device_name.cpp rename to common/gcl/tools/kernel_lib_compile/device_name/device_name.cpp index ea7e17b5..3f3ccac7 100644 --- a/gcl/tools/kernel_lib_compile/device_name/device_name.cpp +++ b/common/gcl/tools/kernel_lib_compile/device_name/device_name.cpp @@ -1,30 +1,26 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
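+// Helper tool: dumps handle->deviceName into ./deviceNameFile so the kernel-lib
+// build scripts can tag per-device kernel binary packages.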
+#include "gcl.h" +#include - -#include"gcl.h" -#include - -int main(){ +int main() +{ GCLHandle_t handle; CHECK_STATUS(gcl_create_handle(&handle)); - FILE* fp = fopen("deviceBinmapNameFile", "w"); - fwrite(handle->deviceBinmapName.c_str(), handle->deviceBinmapName.length(), 1, fp); + FILE *fp = fopen("deviceNameFile", "w"); + fwrite(handle->deviceName.c_str(), handle->deviceName.length(), 1, fp); fclose(fp); gcl_destroy_handle(handle); return 0; } - - - diff --git a/gcl/tools/kernel_lib_compile/kernel_bin/CMakeLists.txt b/common/gcl/tools/kernel_lib_compile/kernel_bin/CMakeLists.txt similarity index 67% rename from gcl/tools/kernel_lib_compile/kernel_bin/CMakeLists.txt rename to common/gcl/tools/kernel_lib_compile/kernel_bin/CMakeLists.txt index d55ee6ca..0081c2f4 100644 --- a/gcl/tools/kernel_lib_compile/kernel_bin/CMakeLists.txt +++ b/common/gcl/tools/kernel_lib_compile/kernel_bin/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.2) -file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/bolt.cmake ${BOLT_ROOT}/bolt.cmake) +file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake) if (BOLT_CONFIGURE_FILE) include(${BOLT_CONFIGURE_FILE}) else (BOLT_CONFIGURE_FILE) @@ -14,13 +14,11 @@ project(gclBinary) set_policy() -SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${BOLT_ROOT}/cmakes") find_package(Gcl) -find_package(Uni) set_c_cxx_flags() set_test_c_cxx_flags() add_executable(gcl_binary clbinary.cpp) -TARGET_LINK_LIBRARIES(gcl_binary ${OPENCL_LIBRARIES} -Wl,-allow-shlib-undefined, -static-libstdc++) +target_link_libraries(gcl_binary ${OPENCL_LIBRARIES} -Wl,-allow-shlib-undefined, -static-libstdc++) diff --git a/gcl/tools/kernel_lib_compile/kernel_bin/clbinary.cpp b/common/gcl/tools/kernel_lib_compile/kernel_bin/clbinary.cpp similarity index 59% rename from gcl/tools/kernel_lib_compile/kernel_bin/clbinary.cpp rename to common/gcl/tools/kernel_lib_compile/kernel_bin/clbinary.cpp index 028c03a4..c6f2e89d 100644 --- a/gcl/tools/kernel_lib_compile/kernel_bin/clbinary.cpp +++ b/common/gcl/tools/kernel_lib_compile/kernel_bin/clbinary.cpp @@ -1,26 +1,25 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-#include"gcl.h"
+#include "gcl.h"
 #include <getopt.h>
 const char *imagesource = "#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable\n";
-const char *half16source = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+const char *half16source = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
-void printHelp() {
+void printHelp()
+{
     printf("please use the linux tradition, or you will face problem!!!!!!!!!!!!!!\n");
     printf("The program only support opencl kernel compile now !!!!!!!!!!!1!!!!!!!\n");
     printf("-i or --input to specify OpenCL input cl source file name\n");
@@ -28,23 +27,35 @@ void printHelp() {
     printf("-O or --options to specify OpenCL compiling options\n");
 }
-bool GetFileLength(CI8* filename, U32* len) {
-    if((NULL == filename) || (0 == strlen(filename)) ) return false;
+bool GetFileLength(CI8 *filename, U32 *len)
+{
+    if ((NULL == filename) || (0 == strlen(filename))) {
+        return false;
+    }
     FILE *fp = fopen(filename, "rb");
-    if(NULL == fp) return false;
+    if (NULL == fp) {
+        return false;
+    }
     rewind(fp);
-    if(0 != fseek(fp, 0, SEEK_END)) return false;
-    * len = ftell(fp);
+    if (0 != fseek(fp, 0, SEEK_END)) {
+        return false;
+    }
+    *len = ftell(fp);
     fclose(fp);
     return true;
 }
-bool LoadBinFile(CI8* filename, I8* str, U32 len) {
-    if((NULL == filename) || (0 == strlen(filename)) ) return false;
+bool LoadBinFile(CI8 *filename, I8 *str, U32 len)
+{
+    if ((NULL == filename) || (0 == strlen(filename))) {
+        return false;
+    }
     FILE *fp = fopen(filename, "rb");
-    if(NULL == fp) return false;
+    if (NULL == fp) {
+        return false;
+    }
     rewind(fp);
-    if(len != fread(str, sizeof(char), len, fp)) {
+    if (len != fread(str, sizeof(char), len, fp)) {
         fclose(fp);
         return false;
     }
@@ -52,11 +63,16 @@ bool LoadBinFile(CI8* filename, I8* str, U32 len) {
     return true;
 }
-bool StoreToBinFile(CI8* filename, U32 length, CU8* s) {
-    if((NULL == s) || (NULL == filename) || (0 == strlen(filename)) ) return false;
+bool StoreToBinFile(CI8 *filename, U32 length, CU8 *s)
+{
+    if ((NULL == s) || (NULL == filename) || (0 == strlen(filename))) {
+        return false;
+    }
     FILE *fp = fopen(filename, "wb");
-    if(NULL == fp) return false;
-    if(length != fwrite(s, sizeof(char), length, fp)) {
+    if (NULL == fp) {
+        return false;
+    }
+    if (length != fwrite(s, sizeof(char), length, fp)) {
         fclose(fp);
         return false;
     }
@@ -64,24 +80,21 @@ bool StoreToBinFile(CI8* filename, U32 length, CU8* s) {
     return true;
 }
-
-void parseCommandLine(I32 argc, I8* argv[], I8** inputFilename, I8** outputFilename, I8** options){
-    const struct option long_options[] = {
-        {"input", 1, nullptr, 'i'},
-        {"output", 1, nullptr, 'o'},
-        {"options", 1, nullptr, 'O'},
-        {nullptr, 1, nullptr, '0'}};
-    bool setInput = false;
-    bool setOutput = false;
+void parseCommandLine(I32 argc, I8 *argv[], I8 **inputFilename, I8 **outputFilename, I8 **options)
+{
+    const struct option long_options[] = {{"input", 1, nullptr, 'i'},
{"output", 1, nullptr, 'o'}, + {"options", 1, nullptr, 'O'}, {nullptr, 1, nullptr, '0'}}; + bool setInput = false; + bool setOutput = false; bool setOptions = false; - int optionIndex = 0; - int ch; - while((ch = getopt_long(argc, argv, "i:o:O", long_options, &optionIndex)) != -1){ - switch(ch) { + int optionIndex = 0; + int ch; + while ((ch = getopt_long(argc, argv, "i:o:O", long_options, &optionIndex)) != -1) { + switch (ch) { case 'i': printf("input file name is %s\n", optarg); *inputFilename = optarg; - if(setInput) { + if (setInput) { printf("you specify input file name twice, program will exit\n"); exit(0); } @@ -90,7 +103,7 @@ void parseCommandLine(I32 argc, I8* argv[], I8** inputFilename, I8** outputFilen case 'o': printf("output file name is %s\n", optarg); *outputFilename = optarg; - if(setOutput) { + if (setOutput) { printf("you specify output file name twice, program will exit\n"); exit(0); } @@ -99,7 +112,7 @@ void parseCommandLine(I32 argc, I8* argv[], I8** inputFilename, I8** outputFilen case 'O': printf("options is %s\n", optarg); *options = optarg; - if(setOptions) { + if (setOptions) { printf("you specify compiling options twice, program will exit\n"); exit(0); } @@ -109,30 +122,30 @@ void parseCommandLine(I32 argc, I8* argv[], I8** inputFilename, I8** outputFilen printf("not support option:%c\n", ch); } } - if(!setInput) { - printf("you don't specify the input cl file name, program will exit\n"); - exit(0); + if (!setInput) { + printf("you don't specify the input cl file name, program will exit\n"); + exit(0); } - if(!setOutput) { - printf("you don't specify the output file name, program will exit\n"); - exit(0); + if (!setOutput) { + printf("you don't specify the output file name, program will exit\n"); + exit(0); } - if(!setOptions) { - printf("you don't specify the options for compiling cl file, default is empty\n"); - *options=(char*)""; + if (!setOptions) { + printf("you don't specify the options for compiling cl file, default is empty\n"); + *options = (char *)""; } - } -int main(I32 argc , I8* argv[]){ - if(1 == argc){ +int main(I32 argc, I8 *argv[]) +{ + if (1 == argc) { printHelp(); return 0; } - I8* FLAGS_inputFilename; - I8* FLAGS_outputFilename; - I8* FLAGS_options; + I8 *FLAGS_inputFilename; + I8 *FLAGS_outputFilename; + I8 *FLAGS_options; parseCommandLine(argc, argv, &FLAGS_inputFilename, &FLAGS_outputFilename, &FLAGS_options); GCLHandle_t handle; @@ -144,32 +157,39 @@ int main(I32 argc , I8* argv[]){ U32 half16Len = strlen(half16source); U32 clcodeLen = 0; bool FileStatus = GetFileLength(FLAGS_inputFilename, &clcodeLen); - if(!FileStatus) {printf("get file length failed\n");return 0;} + if (!FileStatus) { + printf("get file length failed\n"); + return 0; + } U32 srcLen = imageLen + half16Len + clcodeLen; - I8* source = new I8[srcLen]; + I8 *source = new I8[srcLen]; #ifdef CL_VERSION_1_2 memcpy(source, imagesource, imageLen); #endif memcpy(source + imageLen, half16source, half16Len); FileStatus = LoadBinFile(FLAGS_inputFilename, source + imageLen + half16Len, clcodeLen); - if(!FileStatus) {printf("load bin file failed\n");delete[] source; return 0;} + if (!FileStatus) { + printf("load bin file failed\n"); + delete[] source; + return 0; + } Program program; - U32 numKernel = 1; - Kernel kernel; - U32 size = 0; - U8* binary; - - CHECK_STATUS(gcl_produce_program_kernel_with_source(handle, &srcLen, source, FLAGS_options, &program, numKernel, &kernel)); + U32 numKernel = 1; + Kernel kernel; + U32 size = 0; + U8 *binary; + + 
CHECK_STATUS(gcl_produce_program_kernel_with_source( + handle, &srcLen, source, FLAGS_options, &program, numKernel, &kernel)); CHECK_STATUS(gcl_get_program_info(program, &binary, &size)); FileStatus = StoreToBinFile(FLAGS_outputFilename, size, binary); - if(!FileStatus) {printf("store bin file failed\n");} + if (!FileStatus) { + printf("store bin file failed\n"); + } free(binary); delete[] source; CHECK_STATUS(release_program(program)); CHECK_STATUS(release_kernel(kernel)); gcl_destroy_handle(handle); } - - - diff --git a/gcl/tools/kernel_lib_compile/kernel_bin2char/bin2char.cpp b/common/gcl/tools/kernel_lib_compile/kernel_bin2char/bin2char.cpp similarity index 76% rename from gcl/tools/kernel_lib_compile/kernel_bin2char/bin2char.cpp rename to common/gcl/tools/kernel_lib_compile/kernel_bin2char/bin2char.cpp index d0db4ec5..3b73ffba 100644 --- a/gcl/tools/kernel_lib_compile/kernel_bin2char/bin2char.cpp +++ b/common/gcl/tools/kernel_lib_compile/kernel_bin2char/bin2char.cpp @@ -1,18 +1,16 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
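+// bin2char converts a compiled kernel .bin into a .cpp defining <name>_len and
+// a <name>[] hex byte array (included through the generated inline_<binmap>.h),
+// so precompiled kernel binaries can be linked into the kernelbin library.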
-
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -20,7 +18,8 @@
 #include <fstream>
 #include <sstream>
-int main(int argc, char *argv[]) {
+int main(int argc, char *argv[])
+{
     char *binName;
     char *cppName;
     char *charName;
@@ -28,29 +27,29 @@ int main(int argc, char *argv[]) {
     std::string binMapName;
     std::string binFile;
     std::string cppFile;
-    if(argc == 3) {
+    if (argc == 3) {
         binName = argv[1];
         binFile = binName;
         charName = strtok(binName, ".");
         cppFile = std::string(charName) + ".cpp";
         int len = strlen(charName);
-        for(int i = len - 1; i > -1 ; --i) {
-            if(charName[i] == '/') {
-                charName = &charName[i+1];
+        for (int i = len - 1; i > -1; --i) {
+            if (charName[i] == '/') {
+                charName = &charName[i + 1];
                 break;
             }
         }
         binMapName = argv[2];
-    } else if(argc == 4) {
+    } else if (argc == 4) {
         binName = argv[1];
         binFile = binName;
         cppName = argv[2];
         cppFile = cppName;
         charName = strtok(cppName, ".");
         int len = strlen(charName);
-        for(int i = len - 1; i > -1 ; --i) {
-            if(charName[i] == '/') {
-                charName = &charName[i+1];
+        for (int i = len - 1; i > -1; --i) {
+            if (charName[i] == '/') {
+                charName = &charName[i + 1];
                 break;
             }
         }
@@ -60,7 +59,7 @@
     }
     FILE *fpbin = fopen(binFile.c_str(), "rb");
-    if(fpbin == NULL) {
+    if (fpbin == NULL) {
         printf("file %s open error\n", binFile.c_str());
         return 1;
     }
@@ -76,17 +75,19 @@
     templen << filelen;
     std::string filelen_st = templen.str();
-    std::string str = "#include \"inline_" + std::string(binMapName) + ".h\"\n\nCU32 " + std::string(charName) + "_len = " + filelen_st + ";\nCU8 " + std::string(charName) + "[] = {";
+    std::string str = "#include \"inline_" + std::string(binMapName) + ".h\"\n\nCU32 " +
+        std::string(charName) + "_len = " + filelen_st + ";\nCU8 " + std::string(charName) +
+        "[] = {";
     unsigned char charRead;
     std::string appendBuf;
-    for(int i = 0 ;i < filelen ; i++ ) {
+    for (int i = 0; i < filelen; i++) {
         appendBuf.clear();
-        if(i % 20 == 0) {
+        if (i % 20 == 0) {
             appendBuf += "\n";
         }
-        if(1 != fread(&charRead, 1, 1, fpbin)) {
+        if (1 != fread(&charRead, 1, 1, fpbin)) {
             printf("file %s read error\n", binFile.c_str());
             fclose(fpbin);
             return 1;
@@ -95,9 +96,8 @@
         sprintf(tempstr, "0x%02x", charRead);
         appendBuf += std::string(tempstr);
-        if(i == filelen -1) {
-        }
-        else if(i % 20 == 19) {
+        if (i == filelen - 1) {
+        } else if (i % 20 == 19) {
             appendBuf += ",";
         } else {
             appendBuf += ", ";
@@ -108,7 +108,7 @@
     str += "};";
     std::ofstream file;
-    file.open (cppFile.c_str());
+    file.open(cppFile.c_str());
     file << str;
     file.close();
diff --git a/common/gcl/tools/kernel_lib_compile/sh/adbDeviceNum.sh b/common/gcl/tools/kernel_lib_compile/sh/adbDeviceNum.sh
new file mode 100644
index 00000000..da1d2da3
--- /dev/null
+++ b/common/gcl/tools/kernel_lib_compile/sh/adbDeviceNum.sh
@@ -0,0 +1,12 @@
+adbDeviceNum=($(adb devices | sed 's/\r//' | grep ".device$"))
+i=0
+length=${#adbDeviceNum[@]}
+while [ "$i" -lt "$length" ]; do
+    if
+        ((i%2!=0))
+    then
+        unset adbDeviceNum[i]
+    fi
+    ((i++))
+done
+adbDeviceNum=(E5B0119506000260 GCL5T19822000030)
diff --git a/gcl/tools/kernel_lib_compile/sh/buildKernelBin.sh b/common/gcl/tools/kernel_lib_compile/sh/buildKernelBin.sh
similarity index 54%
rename from gcl/tools/kernel_lib_compile/sh/buildKernelBin.sh
rename to common/gcl/tools/kernel_lib_compile/sh/buildKernelBin.sh
index 31349475..faaa49bc 100644
--- a/gcl/tools/kernel_lib_compile/sh/buildKernelBin.sh
+++ b/common/gcl/tools/kernel_lib_compile/sh/buildKernelBin.sh
@@ -1,5 +1,15 @@
#build kernel bin on device#
#if devices name are the same, the build will only execute once#
+
+function push_directory() {
+    # Push files one at a time to avoid differences in adb push directory handling across adb versions
+    for file in $(ls $1)
+    do
+        echo "Pushing ${file} to $2"
+        adb -s ${dNum} push $1/${file} $2
+    done
+}
+
 index=1
 for dNum in "${adbDeviceNum[@]}"; do
     adb -s ${dNum} shell "rm -rf ${kernelBuildPath}"
@@ -20,20 +30,25 @@ for dNum in "${adbDeviceNum[@]}"; do
     if [ $dnameS -eq 0 ]; then
         rm -rf ${binPath}/${dname}
         mkdir ${binPath}/${dname}
-        adb -s ${dNum} shell "cd ${kernelBuildPath} && mkdir sh"
-        adb -s ${dNum} push gcl_binary ${kernelBuildPath}
-        adb -s ${dNum} push ${clPath} ${kernelBuildPath}
-        adb -s ${dNum} push ${compileConfigPath} ${kernelBuildPath}
-        adb -s ${dNum} push ${shPath}/sh.config ${kernelBuildPath}
+        adb -s ${dNum} push gcl_binary ${kernelBuildPath}
+        push_directory ${clPath} ${kernelBuildPath}
+        push_directory ${compileConfigPath} ${kernelBuildPath}
+        adb -s ${dNum} push ${shPath}/sh.config ${kernelBuildPath}
         adb -s ${dNum} shell "cd ${kernelBuildPath} && chmod +x gcl_binary"
-        adb -s ${dNum} shell "cd ${kernelBuildPath} && cp *.sh ./sh"
-        for compileConfig in $compileConfigFiles
-        do
-            adb -s ${dNum} shell "cd ${kernelBuildPath} && source ./sh.config && chmod +x ${compileConfig} && ./${compileConfig} > tmp.sh && chmod +x tmp.sh && ./tmp.sh"
-        done
+        echo "Compiling on device ${dNum}"
+        for compileConfig in $compileConfigFiles
+        do
+            adb -s ${dNum} shell "cd ${kernelBuildPath} && source ./sh.config &&
+                chmod +x ${compileConfig} && ./${compileConfig} > tmp.sh && chmod +x tmp.sh && ./tmp.sh"
+        done
         adb -s ${dNum} shell "cd ${kernelBuildPath} && mkdir bin"
         adb -s ${dNum} shell "cd ${kernelBuildPath} && cp *.bin ${kernelBuildPath}/bin"
-        adb -s ${dNum} pull ${kernelBuildPath}/bin/ ${binPath}/${dname}
+        adb -s ${dNum} pull ${kernelBuildPath}/bin ${binPath}/${dname}
+        if [ -d ${binPath}/${dname}/bin ]; then
+            mv ${binPath}/${dname}/bin/* ${binPath}/${dname}
+            rm -r ${binPath}/${dname}/bin
+            echo ${binPath}/${dname}
+        fi
         adb -s ${dNum} shell "rm -rf ${kernelBuildPath}"
         echo ${dname} >> ${dNameFile}
     fi
diff --git a/gcl/tools/kernel_lib_compile/sh/buildKernelLibConfig.sh b/common/gcl/tools/kernel_lib_compile/sh/buildKernelLibConfig.sh
similarity index 84%
rename from gcl/tools/kernel_lib_compile/sh/buildKernelLibConfig.sh
rename to common/gcl/tools/kernel_lib_compile/sh/buildKernelLibConfig.sh
index 50cf821a..38fb3254 100644
--- a/gcl/tools/kernel_lib_compile/sh/buildKernelLibConfig.sh
+++ b/common/gcl/tools/kernel_lib_compile/sh/buildKernelLibConfig.sh
@@ -4,8 +4,9 @@ workPath=$(pwd)
 #set file.cl dir#
 tensorCLPath=${BOLT_ROOT}/tensor_computing/src/gpu/mali/cl
 sampleCLPath=${BOLT_ROOT}/gcl/tools/gcl_sample/cl
-CLPath=(${tensorCLPath} ${sampleCLPath})
-deviceNameFile=deviceBinmapNameFile
+imageCLPath=${BOLT_ROOT}/image/src/gpu/mali/cl
+CLPath=(${tensorCLPath} ${sampleCLPath} ${imageCLPath})
+deviceNameFile=deviceNameFile
 #get kernel compile option sh#
 shPath=${workPath}/sh
@@ -20,7 +21,8 @@ srcPath=${workPath}/src
 incPath=${workPath}/include
 clPath=${workPath}/cl
 namePath=${workPath}/name
-kernelBuildPath=/data/local/tmp/boltKernelBuild
+TimeFlag=`adb -s ${adbDeviceNum[0]} shell "date +%s_%N"`
+kernelBuildPath=/data/local/tmp/${TimeFlag}
 bin2charPath=${workPath}/kernel_bin2char
 kernelBinPath=${workPath}/kernel_bin
 deviceNamePath=${workPath}/device_name
@@ -46,7 +48,8 @@ cmake_options="-DUSE_CROSS_COMPILE=ON \
     -DUSE_LLVM_CLANG=ON \
     -DUSE_MALI=ON \
     -DUSE_DYNAMIC_LIBRARY=ON \
-    -DBUILD_TEST=ON "
+    -DBUILD_TEST=ON \
+
-DUSE_THREAD_SAFE=OFF" cmake .. ${cmake_options} make -j33 cp gcl_binary ${workPath} diff --git a/gcl/tools/kernel_lib_compile/sh/compile/activation.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/activation.sh similarity index 100% rename from gcl/tools/kernel_lib_compile/sh/compile/activation.sh rename to common/gcl/tools/kernel_lib_compile/sh/compile/activation.sh diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/argmax_x.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/argmax_x.sh new file mode 100644 index 00000000..dae036ea --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/argmax_x.sh @@ -0,0 +1,12 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "argmax_x.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}.bin --options=\"${copt}\" + echo ./gcl_binary --input=$file --output=${file%.*}_index.bin --options=\"${copt} -DUSE_INDEX\" + fi + fi + done + + + diff --git a/gcl/tools/kernel_lib_compile/sh/compile/bilateral_slice_apply_c12.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/bilateral_slice_apply_c12.sh similarity index 100% rename from gcl/tools/kernel_lib_compile/sh/compile/bilateral_slice_apply_c12.sh rename to common/gcl/tools/kernel_lib_compile/sh/compile/bilateral_slice_apply_c12.sh diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/channel_resize.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/channel_resize.sh new file mode 100644 index 00000000..b277c214 --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/channel_resize.sh @@ -0,0 +1,14 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "channel_resize.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}.bin --options=\"${copt}\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw.bin --options=\"${copt} -DINPUT_NCHW -DOUTPUT_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_ncwhc4.bin --options=\"${copt} -DINPUT_NCHW -DOUTPUT_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_nchw.bin --options=\"${copt} -DINPUT_NCWHC4 -DOUTPUT_NCHW\" + fi + fi + done + + + diff --git a/gcl/tools/kernel_lib_compile/sh/compile/common.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/common.sh similarity index 50% rename from gcl/tools/kernel_lib_compile/sh/compile/common.sh rename to common/gcl/tools/kernel_lib_compile/sh/compile/common.sh index 3fe7ab58..16be6935 100644 --- a/gcl/tools/kernel_lib_compile/sh/compile/common.sh +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/common.sh @@ -3,15 +3,15 @@ for file in * if [ "${file##*.}"x = "cl"x ];then clFileName=${file%.*} speConfig=0 - for filesh in `ls sh` - do - if [ "${filesh##*.}"x = "sh"x ];then - shFileName=${filesh%.*} - if [ "$clFileName" = "$shFileName" ];then - speConfig=1; + for filesh in *.sh + do + if [ "${filesh##*.}"x = "sh"x ];then + shFileName=${filesh%.*} + if [ "$clFileName" = "$shFileName" ];then + speConfig=1; + fi fi - fi - done + done if [ $speConfig -eq 0 ]; then echo ./gcl_binary --input=$file --output=${file%.*}.bin --options=\"${copt}\" fi diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/concat.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/concat.sh new file mode 100644 index 00000000..85f82916 --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/concat.sh @@ -0,0 +1,26 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "concat.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_w1.bin 
--options=\"${copt} -D N=1 -D AXIS_W\" + echo ./gcl_binary --input=$file --output=${file%.*}_w2.bin --options=\"${copt} -D N=2 -D AXIS_W\" + echo ./gcl_binary --input=$file --output=${file%.*}_w3.bin --options=\"${copt} -D N=3 -D AXIS_W\" + echo ./gcl_binary --input=$file --output=${file%.*}_w4.bin --options=\"${copt} -D N=4 -D AXIS_W\" + echo ./gcl_binary --input=$file --output=${file%.*}_h1.bin --options=\"${copt} -D N=1 -D AXIS_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h2.bin --options=\"${copt} -D N=2 -D AXIS_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h3.bin --options=\"${copt} -D N=3 -D AXIS_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h4.bin --options=\"${copt} -D N=4 -D AXIS_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_c1.bin --options=\"${copt} -D N=1 -D AXIS_C\" + echo ./gcl_binary --input=$file --output=${file%.*}_c2.bin --options=\"${copt} -D N=2 -D AXIS_C\" + echo ./gcl_binary --input=$file --output=${file%.*}_c3.bin --options=\"${copt} -D N=3 -D AXIS_C\" + echo ./gcl_binary --input=$file --output=${file%.*}_c4.bin --options=\"${copt} -D N=4 -D AXIS_C\" + echo ./gcl_binary --input=$file --output=${file%.*}_nonalign_c_p1_1.bin --options=\"${copt} -D N=1 -D AXIS_C -D NON_ALIGN_C\" + echo ./gcl_binary --input=$file --output=${file%.*}_nonalign_c_p1_2.bin --options=\"${copt} -D N=2 -D AXIS_C -D NON_ALIGN_C\" + echo ./gcl_binary --input=$file --output=${file%.*}_nonalign_c_p1_3.bin --options=\"${copt} -D N=3 -D AXIS_C -D NON_ALIGN_C\" + echo ./gcl_binary --input=$file --output=${file%.*}_nonalign_c_p1_4.bin --options=\"${copt} -D N=4 -D AXIS_C -D NON_ALIGN_C\" + fi + fi + done + + + diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s1.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s1.sh similarity index 52% rename from gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s1.sh rename to common/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s1.sh index ae04ae43..cc053ba3 100644 --- a/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s1.sh +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s1.sh @@ -20,6 +20,17 @@ for file in * echo ./gcl_binary --input=$file --output=${file%.*}_relu_37.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" echo ./gcl_binary --input=$file --output=${file%.*}_relu_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_31.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_32.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_33.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_34.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=6 -D LN=6 -D UN=5 -D Fsq=9 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_35.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_36.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file 
--output=${file%.*}_relu6_37.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_51.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -DUSE_HALF\" echo ./gcl_binary --input=$file --output=${file%.*}_52.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -DUSE_HALF\" echo ./gcl_binary --input=$file --output=${file%.*}_53.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -DUSE_HALF\" @@ -38,6 +49,43 @@ for file in * echo ./gcl_binary --input=$file --output=${file%.*}_relu_57.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" echo ./gcl_binary --input=$file --output=${file%.*}_relu_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_51.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_52.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_53.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_54.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_55.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_56.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_57.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_71.bin --options=\"${copt} -D F=7 -D ON=1 -D IN=7 -D LN=7 -D UN=4 -D Fsq=49 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_72.bin --options=\"${copt} -D F=7 -D ON=2 -D IN=8 -D LN=8 -D UN=5 -D Fsq=49 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_73.bin --options=\"${copt} -D F=7 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=49 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_74.bin --options=\"${copt} -D F=7 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=49 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_75.bin --options=\"${copt} -D F=7 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=49 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_76.bin --options=\"${copt} -D F=7 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=49 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_77.bin --options=\"${copt} -D F=7 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=49 -DUSE_HALF -D BASICE_REG\" + 
echo ./gcl_binary --input=$file --output=${file%.*}_78.bin --options=\"${copt} -D F=7 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=49 -DUSE_HALF -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_71.bin --options=\"${copt} -D F=7 -D ON=1 -D IN=7 -D LN=7 -D UN=6 -D Fsq=49 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_72.bin --options=\"${copt} -D F=7 -D ON=2 -D IN=8 -D LN=8 -D UN=7 -D Fsq=49 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_73.bin --options=\"${copt} -D F=7 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=49 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_74.bin --options=\"${copt} -D F=7 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=49 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_75.bin --options=\"${copt} -D F=7 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=49 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_76.bin --options=\"${copt} -D F=7 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=49 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_77.bin --options=\"${copt} -D F=7 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=49 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_78.bin --options=\"${copt} -D F=7 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=49 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_71.bin --options=\"${copt} -D F=7 -D ON=1 -D IN=7 -D LN=7 -D UN=6 -D Fsq=49 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_72.bin --options=\"${copt} -D F=7 -D ON=2 -D IN=8 -D LN=8 -D UN=7 -D Fsq=49 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_73.bin --options=\"${copt} -D F=7 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=49 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_74.bin --options=\"${copt} -D F=7 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=49 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_75.bin --options=\"${copt} -D F=7 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=49 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_76.bin --options=\"${copt} -D F=7 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=49 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_77.bin --options=\"${copt} -D F=7 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=49 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_78.bin --options=\"${copt} -D F=7 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=49 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_31.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -DUSE_NCWH -DUSE_HALF\" echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_32.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -DUSE_NCWH -DUSE_HALF\" echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_33.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -DUSE_NCWH -DUSE_HALF\" @@ -56,6 +104,17 @@ for file in * echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_37.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_NCWH -DUSE_HALF 
-DUSE_RELU -D BASICE_REG\" echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_31.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_32.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_33.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_34.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=6 -D LN=6 -D UN=5 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_35.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_36.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_37.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_51.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -DUSE_NCWH -DUSE_HALF\" echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_52.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -DUSE_NCWH -DUSE_HALF\" echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_53.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -DUSE_NCWH -DUSE_HALF\" @@ -73,6 +132,16 @@ for file in * echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_56.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -D BASICE_REG\" echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_57.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -D BASICE_REG\" echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_51.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_52.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_53.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_54.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_55.bin --options=\"${copt} 
-D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_56.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_57.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" fi fi done diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s2.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s2.sh similarity index 65% rename from gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s2.sh rename to common/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s2.sh index 410d92fe..eb79dea9 100644 --- a/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s2.sh +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s2.sh @@ -20,6 +20,15 @@ for file in * echo ./gcl_binary --input=$file --output=${file%.*}_relu_37.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" echo ./gcl_binary --input=$file --output=${file%.*}_relu_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_31.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_32.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_33.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_34.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_35.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_36.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_37.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_51.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=25 -DUSE_HALF -DBASIC_REG\" echo ./gcl_binary --input=$file --output=${file%.*}_52.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=25 -DUSE_HALF -DBASIC_REG\" echo ./gcl_binary --input=$file --output=${file%.*}_53.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -DUSE_HALF -DBASIC_REG\" @@ -38,6 +47,15 @@ for file in * echo ./gcl_binary --input=$file --output=${file%.*}_relu_57.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" echo 
diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s2.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s2.sh
similarity index 65%
rename from gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s2.sh
rename to common/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s2.sh
index 410d92fe..eb79dea9 100644
--- a/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s2.sh
+++ b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_s2.sh
@@ -20,6 +20,15 @@ for file in *
             echo ./gcl_binary --input=$file --output=${file%.*}_relu_37.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
             echo ./gcl_binary --input=$file --output=${file%.*}_relu_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
 
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_31.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -DUSE_HALF -DUSE_RELU6\"
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_32.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -DUSE_HALF -DUSE_RELU6\"
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_33.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -DUSE_HALF -DUSE_RELU6\"
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_34.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_35.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_36.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_37.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+
             echo ./gcl_binary --input=$file --output=${file%.*}_51.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=25 -DUSE_HALF -DBASIC_REG\"
             echo ./gcl_binary --input=$file --output=${file%.*}_52.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=25 -DUSE_HALF -DBASIC_REG\"
             echo ./gcl_binary --input=$file --output=${file%.*}_53.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -DUSE_HALF -DBASIC_REG\"
@@ -38,6 +47,15 @@ for file in *
             echo ./gcl_binary --input=$file --output=${file%.*}_relu_57.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
             echo ./gcl_binary --input=$file --output=${file%.*}_relu_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
 
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_51.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=25 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_52.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=25 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_53.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_54.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_55.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_56.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_57.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+
             echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_31.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -DUSE_NCWH -DUSE_HALF\"
             echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_32.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -DUSE_NCWH -DUSE_HALF\"
             echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_33.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -DUSE_NCWH -DUSE_HALF\"
@@ -56,6 +74,15 @@ for file in *
             echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_37.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
             echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
 
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_31.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6\"
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_32.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6\"
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_33.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6\"
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_34.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_35.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_36.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_37.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+
             echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_51.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DBASIC_REG\"
             echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_52.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DBASIC_REG\"
             echo ./gcl_binary --input=$file --output=${file%.*}_ncwh_53.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DBASIC_REG\"
@@ -73,6 +100,15 @@ for file in *
             echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_56.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
             echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_57.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
             echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwh_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_51.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_52.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_53.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_54.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_55.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_56.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_57.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+            echo ./gcl_binary --input=$file --output=${file%.*}_relu6_ncwh_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -DUSE_NCWH -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
             fi
         fi
     done
diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_trans_fltbuf.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_trans_fltbuf.sh
similarity index 100%
rename from gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_trans_fltbuf.sh
rename to common/gcl/tools/kernel_lib_compile/sh/compile/conv_depthwise_trans_fltbuf.sh
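Relative to the stride-1 generator, these stride-2 variants keep far fewer input columns per work-item (for F=5 every variant is -DBASIC_REG, with IN as low as 1 and LN=UN=0), presumably because a stride of 2 roughly doubles the input span each output row consumes. The apparent stride-1 tiling rule, stated as an assumption rather than something the patch spells out:

    # Assumed rule: keep IN = ON + F - 1 input columns in registers when
    # that fits the per-item budget (~8); otherwise fall back to the
    # BASIC_REG/BASICE_REG path and keep only IN = ON, reloading per step.
    F=5; ON=3
    IN=$((ON + F - 1))
    [ $IN -gt 8 ] && IN=$ON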
diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_3d_sw1_nchw_to_ncwhc4.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_3d_sw1_nchw_to_ncwhc4.sh
new file mode 100644
index 00000000..162e901c
--- /dev/null
+++ b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_3d_sw1_nchw_to_ncwhc4.sh
@@ -0,0 +1,11 @@
+for file in *
+    do
+        if [ "${file##*.}"x = "cl"x ];then
+            if [[ "${file}" == "conv_direct_3d_sw1_nchw_to_ncwhc4.cl" ]];then
+                echo ./gcl_binary --input=$file --output=${file%.*}_relu_752.bin --options=\"${copt} -D FWH=7 -D FT=5 -D FWHT=245 -D ON=2 -DUSE_RELU\"
+            fi
+        fi
+    done
+
+
+
diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_3d_sw2_nchw_to_ncwhc4.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_3d_sw2_nchw_to_ncwhc4.sh
new file mode 100644
index 00000000..7fc02394
--- /dev/null
+++ b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_3d_sw2_nchw_to_ncwhc4.sh
@@ -0,0 +1,11 @@
+for file in *
+    do
+        if [ "${file##*.}"x = "cl"x ];then
+            if [[ "${file}" == "conv_direct_3d_sw2_nchw_to_ncwhc4.cl" ]];then
+                echo ./gcl_binary --input=$file --output=${file%.*}_relu_755.bin --options=\"${copt} -D FWH=7 -D FT=5 -D FWHT=245 -D ON=5 -DUSE_RELU\"
+            fi
+        fi
+    done
+
+
+
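The 3-D direct-convolution scripts above each emit a single specialization (FWH=7, FT=5, i.e. a 7x7x5 kernel with fused ReLU), whereas conv_direct_s1.sh below fans out across a whole grid of variants named <act>_<F><ON><KN>, where KN appears to be the number of output-channel slices computed per work-item; e.g. relu_321 would correspond to F=3, ON=2, KN=1. Not every combination is generated (GELU and the eltwise fusion appear only for F=1), so this loop only illustrates the naming scheme, not the exact set:

    for act in "" relu relu6; do
        for f in 1 3 5 7; do
            echo "conv_direct_s1${act:+_$act}_${f}21.bin"   # ON=2, KN=1 samples
        done
    done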
Fsq=1 -D KN=4 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_134.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=4 -DUSE_HALF\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_111.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_131.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_151.bin --options=\"${copt} -D F=1 -D ON=5 -D IN=5 -D LN=5 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_161.bin --options=\"${copt} -D F=1 -D ON=6 -D IN=6 -D LN=6 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_171.bin --options=\"${copt} -D F=1 -D ON=7 -D IN=7 -D LN=7 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_181.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_112.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_132.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_114.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_124.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_134.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_111.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_131.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_151.bin --options=\"${copt} -D F=1 -D ON=5 -D IN=5 -D LN=5 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_161.bin --options=\"${copt} -D F=1 -D ON=6 -D IN=6 -D LN=6 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary 
--input=$file --output=${file%.*}_relu6_171.bin --options=\"${copt} -D F=1 -D ON=7 -D IN=7 -D LN=7 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_181.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_112.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_132.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_114.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_124.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_134.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU6\" + + echo ./gcl_binary --input=$file --output=${file%.*}_gelu_111.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_GELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_gelu_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_GELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_gelu_131.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_GELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_gelu_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_GELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_gelu_151.bin --options=\"${copt} -D F=1 -D ON=5 -D IN=5 -D LN=5 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_GELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_gelu_161.bin --options=\"${copt} -D F=1 -D ON=6 -D IN=6 -D LN=6 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_GELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_gelu_171.bin --options=\"${copt} -D F=1 -D ON=7 -D IN=7 -D LN=7 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_GELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_gelu_181.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_GELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_gelu_112.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_GELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_gelu_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_GELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_gelu_132.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_GELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_gelu_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_GELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_gelu_114.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_GELU\" + echo ./gcl_binary --input=$file 
--output=${file%.*}_gelu_124.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_GELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_gelu_134.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_GELU\" + + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_111.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_131.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_151.bin --options=\"${copt} -D F=1 -D ON=5 -D IN=5 -D LN=5 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_161.bin --options=\"${copt} -D F=1 -D ON=6 -D IN=6 -D LN=6 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_171.bin --options=\"${copt} -D F=1 -D ON=7 -D IN=7 -D LN=7 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_181.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_112.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_132.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_114.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_124.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_134.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_ELTWISE_NCWHC4\" + + echo ./gcl_binary --input=$file --output=${file%.*}_h_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_HALF -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_HALF -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF -DREUSE_H\" + echo ./gcl_binary --input=$file 
--output=${file%.*}_h_124.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=4 -DUSE_HALF -DREUSE_H\" + + echo ./gcl_binary --input=$file --output=${file%.*}_h_relu_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_relu_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_relu_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_relu_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_relu_124.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU -DREUSE_H\" + + echo ./gcl_binary --input=$file --output=${file%.*}_h_relu6_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_relu6_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_relu6_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU6 -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_relu6_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU6 -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_relu6_124.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU6 -DREUSE_H\" + + echo ./gcl_binary --input=$file --output=${file%.*}_h_gelu_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_GELU -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_gelu_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_GELU -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_gelu_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_GELU -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_gelu_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_GELU -DREUSE_H\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_gelu_124.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_GELU -DREUSE_H\" + + echo ./gcl_binary --input=$file --output=${file%.*}_311.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_321.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_331.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_341.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_351.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_361.bin 
--options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_371.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_381.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_311.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_321.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_331.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_341.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_351.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_361.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_371.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_381.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_312.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_322.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_332.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_342.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=6 -D LN=6 -D UN=5 -D Fsq=9 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_312.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_322.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_332.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_342.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=6 -D LN=6 -D UN=5 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_312.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_322.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_332.bin --options=\"${copt} -D F=3 -D 
ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_342.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=6 -D LN=6 -D UN=5 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU6\" +# echo ./gcl_binary --input=$file --output=${file%.*}_314.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=4 -DUSE_HALF\" +# echo ./gcl_binary --input=$file --output=${file%.*}_324.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -D KN=4 -DUSE_HALF\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_314.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=4 -DUSE_HALF -DUSE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_324.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -D KN=4 -DUSE_HALF -DUSE_RELU\" + + echo ./gcl_binary --input=$file --output=${file%.*}_211.bin --options=\"${copt} -D F=2 -D ON=1 -D IN=2 -D LN=2 -D UN=1 -D Fsq=4 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_221.bin --options=\"${copt} -D F=2 -D ON=2 -D IN=3 -D LN=3 -D UN=2 -D Fsq=4 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_231.bin --options=\"${copt} -D F=2 -D ON=3 -D IN=4 -D LN=4 -D UN=3 -D Fsq=4 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_241.bin --options=\"${copt} -D F=2 -D ON=4 -D IN=5 -D LN=5 -D UN=4 -D Fsq=4 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_251.bin --options=\"${copt} -D F=2 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=4 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_261.bin --options=\"${copt} -D F=2 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_271.bin --options=\"${copt} -D F=2 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=4 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_281.bin --options=\"${copt} -D F=2 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=4 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_211.bin --options=\"${copt} -D F=2 -D ON=1 -D IN=2 -D LN=2 -D UN=1 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_221.bin --options=\"${copt} -D F=2 -D ON=2 -D IN=3 -D LN=3 -D UN=2 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_231.bin --options=\"${copt} -D F=2 -D ON=3 -D IN=4 -D LN=4 -D UN=3 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_241.bin --options=\"${copt} -D F=2 -D ON=4 -D IN=5 -D LN=5 -D UN=4 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_251.bin --options=\"${copt} -D F=2 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_261.bin --options=\"${copt} -D F=2 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_271.bin --options=\"${copt} -D F=2 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_281.bin --options=\"${copt} -D F=2 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo 
./gcl_binary --input=$file --output=${file%.*}_212.bin --options=\"${copt} -D F=2 -D ON=1 -D IN=2 -D LN=2 -D UN=1 -D Fsq=4 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_222.bin --options=\"${copt} -D F=2 -D ON=2 -D IN=3 -D LN=3 -D UN=2 -D Fsq=4 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_232.bin --options=\"${copt} -D F=2 -D ON=3 -D IN=4 -D LN=4 -D UN=3 -D Fsq=4 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_242.bin --options=\"${copt} -D F=2 -D ON=4 -D IN=5 -D LN=5 -D UN=4 -D Fsq=4 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_212.bin --options=\"${copt} -D F=2 -D ON=1 -D IN=2 -D LN=2 -D UN=1 -D Fsq=4 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_222.bin --options=\"${copt} -D F=2 -D ON=2 -D IN=3 -D LN=3 -D UN=2 -D Fsq=4 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_232.bin --options=\"${copt} -D F=2 -D ON=3 -D IN=4 -D LN=4 -D UN=3 -D Fsq=4 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_242.bin --options=\"${copt} -D F=2 -D ON=4 -D IN=5 -D LN=5 -D UN=4 -D Fsq=4 -D KN=2 -DUSE_HALF -DUSE_RELU\" + + + + + echo ./gcl_binary --input=$file --output=${file%.*}_411.bin --options=\"${copt} -D F=4 -D ON=1 -D IN=4 -D LN=4 -D UN=3 -D Fsq=16 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_421.bin --options=\"${copt} -D F=4 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=16 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_431.bin --options=\"${copt} -D F=4 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=16 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_441.bin --options=\"${copt} -D F=4 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=16 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_451.bin --options=\"${copt} -D F=4 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=16 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_461.bin --options=\"${copt} -D F=4 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=16 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_471.bin --options=\"${copt} -D F=4 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=16 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_481.bin --options=\"${copt} -D F=4 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=16 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_411.bin --options=\"${copt} -D F=4 -D ON=1 -D IN=4 -D LN=4 -D UN=3 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU \" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_421.bin --options=\"${copt} -D F=4 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU \" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_431.bin --options=\"${copt} -D F=4 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_441.bin --options=\"${copt} -D F=4 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_451.bin --options=\"${copt} -D F=4 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_461.bin --options=\"${copt} -D F=4 -D 
ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_471.bin --options=\"${copt} -D F=4 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_481.bin --options=\"${copt} -D F=4 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_411.bin --options=\"${copt} -D F=4 -D ON=1 -D IN=4 -D LN=4 -D UN=3 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU6 \" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_421.bin --options=\"${copt} -D F=4 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU6 \" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_431.bin --options=\"${copt} -D F=4 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_441.bin --options=\"${copt} -D F=4 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_451.bin --options=\"${copt} -D F=4 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_461.bin --options=\"${copt} -D F=4 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_471.bin --options=\"${copt} -D F=4 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_481.bin --options=\"${copt} -D F=4 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_412.bin --options=\"${copt} -D F=4 -D ON=1 -D IN=4 -D LN=4 -D UN=3 -D Fsq=16 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_422.bin --options=\"${copt} -D F=4 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=16 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_432.bin --options=\"${copt} -D F=4 -D ON=3 -D IN=6 -D LN=6 -D UN=5 -D Fsq=16 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_442.bin --options=\"${copt} -D F=4 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=16 -D KN=2 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_412.bin --options=\"${copt} -D F=4 -D ON=1 -D IN=4 -D LN=4 -D UN=3 -D Fsq=16 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_422.bin --options=\"${copt} -D F=4 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=16 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_432.bin --options=\"${copt} -D F=4 -D ON=3 -D IN=6 -D LN=6 -D UN=5 -D Fsq=16 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_442.bin --options=\"${copt} -D F=4 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=16 -D KN=2 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_412.bin --options=\"${copt} -D F=4 -D ON=1 -D IN=4 -D LN=4 -D UN=3 -D Fsq=16 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_422.bin --options=\"${copt} -D F=4 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=16 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + 
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_432.bin --options=\"${copt} -D F=4 -D ON=3 -D IN=6 -D LN=6 -D UN=5 -D Fsq=16 -D KN=2 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_442.bin --options=\"${copt} -D F=4 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=16 -D KN=2 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\"
+
+        echo ./gcl_binary --input=$file --output=${file%.*}_511.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_521.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_531.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=1 -DUSE_HALF -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_541.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=1 -DUSE_HALF -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_551.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_561.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_571.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -D KN=1 -DUSE_HALF -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_581.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -D KN=1 -DUSE_HALF -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_511.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_521.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_531.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_541.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_551.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_561.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_571.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_581.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\"
+
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_511.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_521.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_531.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_541.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_551.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_561.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_571.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_581.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\"
+
+        echo ./gcl_binary --input=$file --output=${file%.*}_512.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=2 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_522.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -D KN=2 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_532.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -D KN=2 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_542.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=2 -DUSE_HALF -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_512.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_522.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_532.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_542.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU -D BASICE_REG\"
+
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_512.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_522.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_532.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_542.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_514.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=4 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_524.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -D KN=4 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_514.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=4 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_524.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -D KN=4 -DUSE_HALF -DUSE_RELU\"
+
+        echo ./gcl_binary --input=$file --output=${file%.*}_711.bin --options=\"${copt} -D F=7 -D ON=1 -D IN=7 -D LN=7 -D UN=6 -D Fsq=49 -D KN=1 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_721.bin --options=\"${copt} -D F=7 -D ON=2 -D IN=8 -D LN=8 -D UN=7 -D Fsq=49 -D KN=1 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_731.bin --options=\"${copt} -D F=7 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=49 -D KN=1 -DUSE_HALF -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_741.bin --options=\"${copt} -D F=7 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=49 -D KN=1 -DUSE_HALF -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_751.bin --options=\"${copt} -D F=7 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=49 -D KN=1 -DUSE_HALF -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_761.bin --options=\"${copt} -D F=7 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=49 -D KN=1 -DUSE_HALF -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_771.bin --options=\"${copt} -D F=7 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=49 -D KN=1 -DUSE_HALF -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_781.bin --options=\"${copt} -D F=7 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=49 -D KN=1 -DUSE_HALF -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_711.bin --options=\"${copt} -D F=7 -D ON=1 -D IN=7 -D LN=7 -D UN=6 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_721.bin --options=\"${copt} -D F=7 -D ON=2 -D IN=8 -D LN=8 -D UN=7 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_731.bin --options=\"${copt} -D F=7 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_741.bin --options=\"${copt} -D F=7 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_751.bin --options=\"${copt} -D F=7 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_761.bin --options=\"${copt} -D F=7 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_771.bin --options=\"${copt} -D F=7 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_781.bin --options=\"${copt} -D F=7 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\"
+
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_711.bin --options=\"${copt} -D F=7 -D ON=1 -D IN=7 -D LN=7 -D UN=6 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_721.bin --options=\"${copt} -D F=7 -D ON=2 -D IN=8 -D LN=8 -D UN=7 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_731.bin --options=\"${copt} -D F=7 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_741.bin --options=\"${copt} -D F=7 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_751.bin --options=\"${copt} -D F=7 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_761.bin --options=\"${copt} -D F=7 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_771.bin --options=\"${copt} -D F=7 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_781.bin --options=\"${copt} -D F=7 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=49 -D KN=1 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\"
+
+        echo ./gcl_binary --input=$file --output=${file%.*}_712.bin --options=\"${copt} -D F=7 -D ON=1 -D IN=7 -D LN=7 -D UN=6 -D Fsq=49 -D KN=2 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_722.bin --options=\"${copt} -D F=7 -D ON=2 -D IN=8 -D LN=8 -D UN=7 -D Fsq=49 -D KN=2 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_732.bin --options=\"${copt} -D F=7 -D ON=3 -D IN=9 -D LN=9 -D UN=8 -D Fsq=49 -D KN=2 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_742.bin --options=\"${copt} -D F=7 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=49 -D KN=2 -DUSE_HALF -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_712.bin --options=\"${copt} -D F=7 -D ON=1 -D IN=7 -D LN=7 -D UN=6 -D Fsq=49 -D KN=2 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_722.bin --options=\"${copt} -D F=7 -D ON=2 -D IN=8 -D LN=8 -D UN=7 -D Fsq=49 -D KN=2 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_732.bin --options=\"${copt} -D F=7 -D ON=3 -D IN=9 -D LN=9 -D UN=8 -D Fsq=49 -D KN=2 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_742.bin --options=\"${copt} -D F=7 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=49 -D KN=2 -DUSE_HALF -DUSE_RELU -D BASICE_REG\"
+
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_712.bin --options=\"${copt} -D F=7 -D ON=1 -D IN=7 -D LN=7 -D UN=6 -D Fsq=49 -D KN=2 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_722.bin --options=\"${copt} -D F=7 -D ON=2 -D IN=8 -D LN=8 -D UN=7 -D Fsq=49 -D KN=2 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_732.bin --options=\"${copt} -D F=7 -D ON=3 -D IN=9 -D LN=9 -D UN=8 -D Fsq=49 -D KN=2 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_742.bin --options=\"${copt} -D F=7 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=49 -D KN=2 -DUSE_HALF -DUSE_RELU6 -D BASICE_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_714.bin --options=\"${copt} -D F=7 -D ON=1 -D IN=7 -D LN=7 -D UN=6 -D Fsq=49 -D KN=4 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_724.bin --options=\"${copt} -D F=7 -D ON=2 -D IN=8 -D LN=8 -D UN=7 -D Fsq=49 -D KN=4 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_714.bin --options=\"${copt} -D F=7 -D ON=1 -D IN=7 -D LN=7 -D UN=6 -D Fsq=49 -D KN=4 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_724.bin --options=\"${copt} -D F=7 -D ON=2 -D IN=8 -D LN=8 -D UN=7 -D Fsq=49 -D KN=4 -DUSE_HALF -DUSE_RELU\"
+
+      fi
+    fi
+  done
+
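The stride-1 lists above encode filter size F, outputs per work-item ON, and output-channel unrolling KN in the binary suffix `_<F><ON><KN>`. For the fully register-cached variants the remaining defines appear to follow one rule: ON outputs need IN = ON + F - 1 inputs, with LN = IN and UN = IN - 1, while the `BASICE_REG` variants cap IN at ON and spill the rest. A minimal sketch of that rule, using a hypothetical helper name (the IN/LN/UN reading is inferred from the table, not stated in the patch):

    # gen_s1 is illustrative only, not part of the build scripts.
    gen_s1() {
        local F=$1 ON=$2 KN=$3
        local IN=$((ON + F - 1))   # stride-1 window: ON outputs need ON+F-1 inputs
        echo ./gcl_binary --input=$file --output=${file%.*}_${F}${ON}${KN}.bin \
            --options=\"${copt} -D F=$F -D ON=$ON -D IN=$IN -D LN=$IN \
            -D UN=$((IN - 1)) -D Fsq=$((F * F)) -D KN=$KN -DUSE_HALF\"
    }
    # e.g. gen_s1 5 2 1 reproduces the _521 line above (IN=6, LN=6, UN=5).
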
diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1_fn_spe.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1_fn_spe.sh
new file mode 100644
index 00000000..4e76a6c7
--- /dev/null
+++ b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1_fn_spe.sh
@@ -0,0 +1,44 @@
+for file in *
+  do
+    if [ "${file##*.}"x = "cl"x ];then
+      if [[ "${file}" == "conv_direct_s1_fn_spe.cl" ]];then
+        echo ./gcl_binary --input=$file --output=${file%.*}_18.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_18.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -DUSE_RELU -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_18.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -DUSE_RELU6 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_nchw_18.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -DUSE_NCHW -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_nchw_18.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -DUSE_RELU -DUSE_NCHW -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_nchw_18.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -DUSE_RELU6 -DUSE_NCHW -DUSE_HALF\"
+
+        echo ./gcl_binary --input=$file --output=${file%.*}_28.bin --options=\"${copt} -D F=2 -D ON=8 -D IN=9 -D LN=9 -D UN=8 -D Fsq=4 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_28.bin --options=\"${copt} -D F=2 -D ON=8 -D IN=9 -D LN=9 -D UN=8 -D Fsq=4 -DUSE_RELU -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_28.bin --options=\"${copt} -D F=2 -D ON=8 -D IN=9 -D LN=9 -D UN=8 -D Fsq=4 -DUSE_RELU6 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_nchw_28.bin --options=\"${copt} -D F=2 -D ON=8 -D IN=9 -D LN=9 -D UN=8 -D Fsq=4 -DUSE_NCHW -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_nchw_28.bin --options=\"${copt} -D F=2 -D ON=8 -D IN=9 -D LN=9 -D UN=8 -D Fsq=4 -DUSE_RELU -DUSE_NCHW -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_nchw_28.bin --options=\"${copt} -D F=2 -D ON=8 -D IN=9 -D LN=9 -D UN=8 -D Fsq=4 -DUSE_RELU6 -DUSE_NCHW -DUSE_HALF\"
+
+        echo ./gcl_binary --input=$file --output=${file%.*}_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=10 -D LN=10 -D UN=9 -D Fsq=9 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=10 -D LN=10 -D UN=9 -D Fsq=9 -DUSE_RELU -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=10 -D LN=10 -D UN=9 -D Fsq=9 -DUSE_RELU6 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_nchw_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=10 -D LN=10 -D UN=9 -D Fsq=9 -DUSE_NCHW -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_nchw_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=10 -D LN=10 -D UN=9 -D Fsq=9 -DUSE_RELU -DUSE_NCHW -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_nchw_38.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=10 -D LN=10 -D UN=9 -D Fsq=9 -DUSE_RELU6 -DUSE_NCHW -DUSE_HALF\"
+
+        echo ./gcl_binary --input=$file --output=${file%.*}_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=12 -D LN=12 -D UN=11 -D Fsq=25 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=12 -D LN=12 -D UN=11 -D Fsq=25 -DUSE_RELU -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=12 -D LN=12 -D UN=11 -D Fsq=25 -DUSE_RELU6 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_nchw_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=12 -D LN=12 -D UN=11 -D Fsq=25 -DUSE_NCHW -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_nchw_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=12 -D LN=12 -D UN=11 -D Fsq=25 -DUSE_RELU -DUSE_NCHW -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_nchw_58.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=12 -D LN=12 -D UN=11 -D Fsq=25 -DUSE_RELU6 -DUSE_NCHW -DUSE_HALF\"
+
+        echo ./gcl_binary --input=$file --output=${file%.*}_76.bin --options=\"${copt} -D F=7 -D ON=6 -D IN=12 -D LN=12 -D UN=11 -D Fsq=49 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_76.bin --options=\"${copt} -D F=7 -D ON=6 -D IN=12 -D LN=12 -D UN=11 -D Fsq=49 -DUSE_RELU -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_76.bin --options=\"${copt} -D F=7 -D ON=6 -D IN=12 -D LN=12 -D UN=11 -D Fsq=49 -DUSE_RELU6 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_nchw_76.bin --options=\"${copt} -D F=7 -D ON=6 -D IN=12 -D LN=12 -D UN=11 -D Fsq=49 -DUSE_NCHW -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_nchw_76.bin --options=\"${copt} -D F=7 -D ON=6 -D IN=12 -D LN=12 -D UN=11 -D Fsq=49 -DUSE_RELU -DUSE_NCHW -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_nchw_76.bin --options=\"${copt} -D F=7 -D ON=6 -D IN=12 -D LN=12 -D UN=11 -D Fsq=49 -DUSE_RELU6 -DUSE_NCHW -DUSE_HALF\"
+      fi
+    fi
+  done
+
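Note that these compile scripts only echo the `./gcl_binary` invocations; a driver is expected to collect and execute the emitted command list (in this tree that role belongs to sh/buildKernelBin.sh, whose exact mechanism is not shown in this hunk). A rough way to run one list by hand, under those assumptions:

    # Assumes ./gcl_binary, the .cl sources, and ${copt} are available in the
    # current directory; the real build flow may differ.
    copt="-cl-std=CL2.0"                     # placeholder compile options
    sh conv_direct_s1_fn_spe.sh > cmds.sh    # capture the echoed commands
    sh cmds.sh                               # compile each kernel variant
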
diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1_nchw_to_ncwhc4.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1_nchw_to_ncwhc4.sh
similarity index 53%
rename from gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1_nchw_to_ncwhc4.sh
rename to common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1_nchw_to_ncwhc4.sh
index 2798b4df..bb4898f0 100644
--- a/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1_nchw_to_ncwhc4.sh
+++ b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1_nchw_to_ncwhc4.sh
@@ -5,9 +5,15 @@ for file in *
         echo ./gcl_binary --input=$file --output=${file%.*}_18.bin --options=\"${copt} -D F=1 -D ON=8 -D Fsq=1 -DUSE_HALF\"
         echo ./gcl_binary --input=$file --output=${file%.*}_36.bin --options=\"${copt} -D F=3 -D ON=6 -D Fsq=9 -DUSE_HALF\"
         echo ./gcl_binary --input=$file --output=${file%.*}_54.bin --options=\"${copt} -D F=5 -D ON=4 -D Fsq=25 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_72.bin --options=\"${copt} -D F=7 -D ON=2 -D Fsq=49 -DUSE_HALF\"
         echo ./gcl_binary --input=$file --output=${file%.*}_relu_18.bin --options=\"${copt} -D F=1 -D ON=8 -D Fsq=1 -DUSE_HALF -DUSE_RELU\"
         echo ./gcl_binary --input=$file --output=${file%.*}_relu_36.bin --options=\"${copt} -D F=3 -D ON=6 -D Fsq=9 -DUSE_HALF -DUSE_RELU\"
         echo ./gcl_binary --input=$file --output=${file%.*}_relu_54.bin --options=\"${copt} -D F=5 -D ON=4 -D Fsq=25 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_72.bin --options=\"${copt} -D F=7 -D ON=2 -D Fsq=49 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_18.bin --options=\"${copt} -D F=1 -D ON=8 -D Fsq=1 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_36.bin --options=\"${copt} -D F=3 -D ON=6 -D Fsq=9 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_54.bin --options=\"${copt} -D F=5 -D ON=4 -D Fsq=25 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_72.bin --options=\"${copt} -D F=7 -D ON=2 -D Fsq=49 -DUSE_HALF -DUSE_RELU6\"
       fi
     fi
   done
diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1_spe_f1c3k1.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1_spe_f1c3k1.sh
similarity index 100%
rename from gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1_spe_f1c3k1.sh
rename to common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1_spe_f1c3k1.sh
diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s2.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s2.sh
new file mode 100644
index 00000000..b17f648e
--- /dev/null
+++ b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s2.sh
@@ -0,0 +1,213 @@
+for file in *
+  do
+    if [ "${file##*.}"x = "cl"x ];then
+      if [[ "${file}" == "conv_direct_s2.cl" ]];then
+        echo ./gcl_binary --input=$file --output=${file%.*}_111.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=1 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_131.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=1 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_151.bin --options=\"${copt} -D F=1 -D ON=5 -D IN=5 -D LN=5 -D Fsq=1 -D KN=1 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_161.bin --options=\"${copt} -D F=1 -D ON=6 -D IN=6 -D LN=6 -D Fsq=1 -D KN=1 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_171.bin --options=\"${copt} -D F=1 -D ON=7 -D IN=7 -D LN=7 -D Fsq=1 -D KN=1 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_181.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -D KN=1 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_111.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_131.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_151.bin --options=\"${copt} -D F=1 -D ON=5 -D IN=5 -D LN=5 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_161.bin --options=\"${copt} -D F=1 -D ON=6 -D IN=6 -D LN=6 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_171.bin --options=\"${copt} -D F=1 -D ON=7 -D IN=7 -D LN=7 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_181.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\"
+
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_111.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=1 -DUSE_RELU6 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_RELU6 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_131.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=1 -DUSE_RELU6 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_RELU6 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_151.bin --options=\"${copt} -D F=1 -D ON=5 -D IN=5 -D LN=5 -D Fsq=1 -D KN=1 -DUSE_RELU6 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_161.bin --options=\"${copt} -D F=1 -D ON=6 -D IN=6 -D LN=6 -D Fsq=1 -D KN=1 -DUSE_RELU6 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_171.bin --options=\"${copt} -D F=1 -D ON=7 -D IN=7 -D LN=7 -D Fsq=1 -D KN=1 -DUSE_RELU6 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_181.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -D KN=1 -DUSE_RELU6 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_112.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=2 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_132.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=2 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_112.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_132.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\"
+
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_112.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_132.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_114.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=4 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_124.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=4 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_134.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=4 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_114.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_124.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_134.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU\"
+
+        echo ./gcl_binary --input=$file --output=${file%.*}_211.bin --options=\"${copt} -D F=2 -D ON=1 -D IN=2 -D LN=2 -D UN=1 -D Fsq=4 -D KN=1 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_221.bin --options=\"${copt} -D F=2 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=4 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_231.bin --options=\"${copt} -D F=2 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=4 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_241.bin --options=\"${copt} -D F=2 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=4 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_251.bin --options=\"${copt} -D F=2 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=4 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_261.bin --options=\"${copt} -D F=2 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_271.bin --options=\"${copt} -D F=2 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=4 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_281.bin --options=\"${copt} -D F=2 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=4 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_211.bin --options=\"${copt} -D F=2 -D ON=1 -D IN=2 -D LN=2 -D UN=1 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_221.bin --options=\"${copt} -D F=2 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_231.bin --options=\"${copt} -D F=2 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_241.bin --options=\"${copt} -D F=2 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_251.bin --options=\"${copt} -D F=2 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_261.bin --options=\"${copt} -D F=2 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_271.bin --options=\"${copt} -D F=2 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_281.bin --options=\"${copt} -D F=2 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_211.bin --options=\"${copt} -D F=2 -D ON=1 -D IN=2 -D LN=2 -D UN=1 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_221.bin --options=\"${copt} -D F=2 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_231.bin --options=\"${copt} -D F=2 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_241.bin --options=\"${copt} -D F=2 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_251.bin --options=\"${copt} -D F=2 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_261.bin --options=\"${copt} -D F=2 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_271.bin --options=\"${copt} -D F=2 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_281.bin --options=\"${copt} -D F=2 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=4 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+
+        echo ./gcl_binary --input=$file --output=${file%.*}_212.bin --options=\"${copt} -D F=2 -D ON=1 -D IN=2 -D LN=2 -D UN=1 -D Fsq=4 -D KN=2 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_222.bin --options=\"${copt} -D F=2 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=4 -D KN=2 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_232.bin --options=\"${copt} -D F=2 -D ON=3 -D IN=6 -D LN=6 -D UN=5 -D Fsq=4 -D KN=2 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_242.bin --options=\"${copt} -D F=2 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=4 -D KN=2 -DUSE_HALF -DBASIC_REG\"
+
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_212.bin --options=\"${copt} -D F=2 -D ON=1 -D IN=2 -D LN=2 -D UN=1 -D Fsq=4 -D KN=2 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_222.bin --options=\"${copt} -D F=2 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=4 -D KN=2 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_232.bin --options=\"${copt} -D F=2 -D ON=3 -D IN=6 -D LN=6 -D UN=5 -D Fsq=4 -D KN=2 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_242.bin --options=\"${copt} -D F=2 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=4 -D KN=2 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_212.bin --options=\"${copt} -D F=2 -D ON=1 -D IN=2 -D LN=2 -D UN=1 -D Fsq=4 -D KN=2 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_222.bin --options=\"${copt} -D F=2 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=4 -D KN=2 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_232.bin --options=\"${copt} -D F=2 -D ON=3 -D IN=6 -D LN=6 -D UN=5 -D Fsq=4 -D KN=2 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_242.bin --options=\"${copt} -D F=2 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=4 -D KN=2 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+
+        echo ./gcl_binary --input=$file --output=${file%.*}_311.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=1 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_321.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=9 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_331.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=9 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_341.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_351.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_361.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_371.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_381.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_311.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_321.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_331.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_341.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_351.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_361.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_371.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_381.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_311.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_321.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_331.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_341.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_351.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_361.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_371.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_381.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+
+        echo ./gcl_binary --input=$file --output=${file%.*}_312.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=2 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_322.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=2 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_332.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -D KN=2 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_342.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -D KN=2 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_312.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_322.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_332.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_342.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_312.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_322.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_332.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_342.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+#        echo ./gcl_binary --input=$file --output=${file%.*}_314.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=4 -DUSE_HALF\"
+#        echo ./gcl_binary --input=$file --output=${file%.*}_324.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=4 -DUSE_HALF\"
+#        echo ./gcl_binary --input=$file --output=${file%.*}_relu_314.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=4 -DUSE_HALF -DUSE_RELU\"
+#        echo ./gcl_binary --input=$file --output=${file%.*}_relu_324.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=4 -DUSE_HALF -DUSE_RELU\"
+
+        echo ./gcl_binary --input=$file --output=${file%.*}_411.bin --options=\"${copt} -D F=4 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=16 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_421.bin --options=\"${copt} -D F=4 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=16 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_431.bin --options=\"${copt} -D F=4 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=16 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_441.bin --options=\"${copt} -D F=4 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=16 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_451.bin --options=\"${copt} -D F=4 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=16 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_461.bin --options=\"${copt} -D F=4 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=16 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_471.bin --options=\"${copt} -D F=4 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=16 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_481.bin --options=\"${copt} -D F=4 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=16 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_411.bin --options=\"${copt} -D F=4 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_421.bin --options=\"${copt} -D F=4 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_431.bin --options=\"${copt} -D F=4 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_441.bin --options=\"${copt} -D F=4 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_451.bin --options=\"${copt} -D F=4 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_461.bin --options=\"${copt} -D F=4 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_471.bin --options=\"${copt} -D F=4 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_481.bin --options=\"${copt} -D F=4 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=16 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_412.bin --options=\"${copt} -D F=4 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=16 -D KN=2 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_422.bin --options=\"${copt} -D F=4 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=16 -D KN=2 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_432.bin --options=\"${copt} -D F=4 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=16 -D KN=2 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_442.bin --options=\"${copt} -D F=4 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=16 -D KN=2 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+
+        echo ./gcl_binary --input=$file --output=${file%.*}_511.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_521.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_531.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_541.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_551.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_561.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_571.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_581.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_511.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_521.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_531.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_541.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_551.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_561.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_571.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_581.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_511.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_521.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_531.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_541.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_551.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_561.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_571.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_581.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_512.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=2 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_522.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -D KN=2 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_532.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=2 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_542.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=2 -DUSE_HALF -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_512.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_522.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_532.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_542.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU -DBASIC_REG\"
+
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_512.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_522.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_532.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_542.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\"
+#        echo ./gcl_binary --input=$file --output=${file%.*}_514.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=4 -DUSE_HALF\"
+#        echo ./gcl_binary --input=$file --output=${file%.*}_524.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -D KN=4 -DUSE_HALF\"
+#        echo ./gcl_binary --input=$file --output=${file%.*}_relu_514.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=4 -DUSE_HALF -DUSE_RELU\"
+#        echo ./gcl_binary --input=$file --output=${file%.*}_relu_524.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -D KN=4 -DUSE_HALF -DUSE_RELU\"
+      fi
+    fi
+  done
+
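The stride-2 kernels reuse the same `_<F><ON><KN>` suffix scheme, but the fully cached variants read a window of (ON - 1) * 2 + F inputs, while the F=1 and `-DBASIC_REG` variants keep IN at or near ON and stream the remainder through registers. A quick sanity check of that relation, using throwaway shell variables:

    # the _232 variant above: F=2, ON=3, KN=2
    F=2; ON=3
    echo $(( (ON - 1) * 2 + F ))   # prints 6, matching -D IN=6 in that line
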
diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s2_nchw_to_ncwhc4.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s2_nchw_to_ncwhc4.sh
similarity index 53%
rename from gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s2_nchw_to_ncwhc4.sh
rename to common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s2_nchw_to_ncwhc4.sh
index c3cf03c5..b1f03360 100644
--- a/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s2_nchw_to_ncwhc4.sh
+++ b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s2_nchw_to_ncwhc4.sh
@@ -5,9 +5,15 @@ for file in *
         echo ./gcl_binary --input=$file --output=${file%.*}_18.bin --options=\"${copt} -D F=1 -D ON=8 -D Fsq=1 -DUSE_HALF\"
         echo ./gcl_binary --input=$file --output=${file%.*}_37.bin --options=\"${copt} -D F=3 -D ON=7 -D Fsq=9 -DUSE_HALF\"
         echo ./gcl_binary --input=$file --output=${file%.*}_56.bin --options=\"${copt} -D F=5 -D ON=6 -D Fsq=25 -DUSE_HALF\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_75.bin --options=\"${copt} -D F=7 -D ON=5 -D Fsq=49 -DUSE_HALF\"
         echo ./gcl_binary --input=$file --output=${file%.*}_relu_18.bin --options=\"${copt} -D F=1 -D ON=8 -D Fsq=1 -DUSE_HALF -DUSE_RELU\"
         echo ./gcl_binary --input=$file --output=${file%.*}_relu_37.bin --options=\"${copt} -D F=3 -D ON=7 -D Fsq=9 -DUSE_HALF -DUSE_RELU\"
         echo ./gcl_binary --input=$file --output=${file%.*}_relu_56.bin --options=\"${copt} -D F=5 -D ON=6 -D Fsq=25 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_75.bin --options=\"${copt} -D F=7 -D ON=5 -D Fsq=49 -DUSE_HALF -DUSE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_18.bin --options=\"${copt} -D F=1 -D ON=8 -D Fsq=1 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_37.bin --options=\"${copt} -D F=3 -D ON=7 -D Fsq=9 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_56.bin --options=\"${copt} -D F=5 -D ON=6 -D Fsq=25 -DUSE_HALF -DUSE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_75.bin --options=\"${copt} -D F=7 -D ON=5 -D Fsq=49 -DUSE_HALF -DUSE_RELU6\"
       fi
     fi
   done
diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_spe_fwhs1.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_spe_fwhs1.sh
new file mode 100644
index 00000000..4d1249dc
--- /dev/null
+++ b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_spe_fwhs1.sh
@@ -0,0 +1,25 @@
+for file in *
+  do
+    if [ "${file##*.}"x = "cl"x ];then
+      if [[ "${file}" == "conv_direct_spe_fwhs1.cl" ]];then
+        echo ./gcl_binary --input=$file --output=${file%.*}_1.bin --options=\"${copt} -D OC=1\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_2.bin --options=\"${copt} -D OC=2\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_3.bin --options=\"${copt} -D OC=3\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_4.bin --options=\"${copt} -D OC=4\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_8.bin --options=\"${copt} -D OC=8\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_16.bin --options=\"${copt} -D OC=16\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_nobias_4.bin --options=\"${copt} -D OC=4 -D NO_BIAS\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_nobias_8.bin --options=\"${copt} -D OC=8 -D NO_BIAS\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_nobias_16.bin --options=\"${copt} -D OC=16 -D NO_BIAS\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_4.bin --options=\"${copt} -D OC=4 -D USE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_8.bin --options=\"${copt} -D OC=8 -D USE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu_16.bin --options=\"${copt} -D OC=16 -D USE_RELU\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_4.bin --options=\"${copt} -D OC=4 -D USE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_8.bin --options=\"${copt} -D OC=8 -D USE_RELU6\"
+        echo ./gcl_binary --input=$file --output=${file%.*}_relu6_16.bin --options=\"${copt} -D OC=16 -D USE_RELU6\"
+      fi
+    fi
+  done
+
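conv_direct_spe_fwhs1 appears to cover the special case where the convolved output feature map is 1x1, so the only tuning dimension left is the output-channel count OC. The list expands to one binary per OC and per bias/activation combination, for example:

    # names derived directly from the options above (no new variants implied)
    conv_direct_spe_fwhs1_16.bin          # -D OC=16
    conv_direct_spe_fwhs1_nobias_16.bin   # -D OC=16 -D NO_BIAS
    conv_direct_spe_fwhs1_relu6_16.bin    # -D OC=16 -D USE_RELU6
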
"conv_direct_trans_fltbuf.cl" ]];then echo ./gcl_binary --input=$file --output=${file%.*}_14.bin --options=\"${copt} -D C=1 -D K=4\" + echo ./gcl_binary --input=$file --output=${file%.*}_41.bin --options=\"${copt} -D C=4 -D K=1\" echo ./gcl_binary --input=$file --output=${file%.*}_44.bin --options=\"${copt} -D C=4 -D K=4\" echo ./gcl_binary --input=$file --output=${file%.*}_48.bin --options=\"${copt} -D C=4 -D K=8\" + echo ./gcl_binary --input=$file --output=${file%.*}_416.bin --options=\"${copt} -D C=4 -D K=16\" echo ./gcl_binary --input=$file --output=${file%.*}_10.bin --options=\"${copt} -D C=1 -D K=0\" echo ./gcl_binary --input=$file --output=${file%.*}_20.bin --options=\"${copt} -D C=2 -D K=0\" echo ./gcl_binary --input=$file --output=${file%.*}_30.bin --options=\"${copt} -D C=3 -D K=0\" diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_wh_s1.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_wh_s1.sh new file mode 100644 index 00000000..4a40a181 --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_wh_s1.sh @@ -0,0 +1,143 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "conv_direct_wh_s1.cl" ]];then + # W=4 H=1 + echo ./gcl_binary --input=$file --output=${file%.*}_4111.bin --options=\"${copt} -D W=4 -D H=1 -D ON=1 -D IN=4 -D LN=4 -D UN=3 -D Fsq=4 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_4121.bin --options=\"${copt} -D W=4 -D H=1 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=4 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_4131.bin --options=\"${copt} -D W=4 -D H=1 -D ON=3 -D IN=6 -D LN=6 -D UN=5 -D Fsq=4 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_4141.bin --options=\"${copt} -D W=4 -D H=1 -D ON=4 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_4151.bin --options=\"${copt} -D W=4 -D H=1 -D ON=5 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_4161.bin --options=\"${copt} -D W=4 -D H=1 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_4171.bin --options=\"${copt} -D W=4 -D H=1 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=4 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_4181.bin --options=\"${copt} -D W=4 -D H=1 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=4 -D KN=1 -DUSE_HALF -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_4111.bin --options=\"${copt} -D W=4 -D H=1 -D ON=1 -D IN=4 -D LN=4 -D UN=3 -D Fsq=4 -D KN=1 -D USE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_4121.bin --options=\"${copt} -D W=4 -D H=1 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=4 -D KN=1 -D USE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_4131.bin --options=\"${copt} -D W=4 -D H=1 -D ON=3 -D IN=6 -D LN=6 -D UN=5 -D Fsq=4 -D KN=1 -D USE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_4141.bin --options=\"${copt} -D W=4 -D H=1 -D ON=4 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=1 -D USE_RELU -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_4151.bin --options=\"${copt} -D W=4 -D H=1 -D ON=5 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=1 -D USE_RELU -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_4161.bin 
--options=\"${copt} -D W=4 -D H=1 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=1 -D USE_RELU -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_4171.bin --options=\"${copt} -D W=4 -D H=1 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=4 -D KN=1 -D USE_RELU -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_4181.bin --options=\"${copt} -D W=4 -D H=1 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=4 -D KN=1 -D USE_RELU -DUSE_HALF -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_4111.bin --options=\"${copt} -D W=4 -D H=1 -D ON=1 -D IN=4 -D LN=4 -D UN=3 -D Fsq=4 -D KN=1 -D USE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_4121.bin --options=\"${copt} -D W=4 -D H=1 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=4 -D KN=1 -D USE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_4131.bin --options=\"${copt} -D W=4 -D H=1 -D ON=3 -D IN=6 -D LN=6 -D UN=5 -D Fsq=4 -D KN=1 -D USE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_4141.bin --options=\"${copt} -D W=4 -D H=1 -D ON=4 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=1 -D USE_RELU6 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_4151.bin --options=\"${copt} -D W=4 -D H=1 -D ON=5 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=1 -D USE_RELU6 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_4161.bin --options=\"${copt} -D W=4 -D H=1 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=1 -D USE_RELU6 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_4171.bin --options=\"${copt} -D W=4 -D H=1 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=4 -D KN=1 -D USE_RELU6 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_4181.bin --options=\"${copt} -D W=4 -D H=1 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=4 -D KN=1 -D USE_RELU6 -DUSE_HALF -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_4112.bin --options=\"${copt} -D W=4 -D H=1 -D ON=1 -D IN=4 -D LN=4 -D UN=3 -D Fsq=4 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_4122.bin --options=\"${copt} -D W=4 -D H=1 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=4 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_4132.bin --options=\"${copt} -D W=4 -D H=1 -D ON=3 -D IN=6 -D LN=6 -D UN=5 -D Fsq=4 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_4142.bin --options=\"${copt} -D W=4 -D H=1 -D ON=4 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=2 -DUSE_HALF -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_4112.bin --options=\"${copt} -D W=4 -D H=1 -D ON=1 -D IN=4 -D LN=4 -D UN=3 -D Fsq=4 -D KN=2 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_4122.bin --options=\"${copt} -D W=4 -D H=1 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=4 -D KN=2 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_4132.bin --options=\"${copt} -D W=4 -D H=1 -D ON=3 -D IN=6 -D LN=6 -D UN=5 -D Fsq=4 -D KN=2 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_4142.bin --options=\"${copt} -D W=4 -D H=1 -D ON=4 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=2 -DUSE_RELU -DUSE_HALF -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_4112.bin --options=\"${copt} -D W=4 -D H=1 -D ON=1 -D IN=4 -D LN=4 -D UN=3 -D Fsq=4 -D KN=2 -DUSE_RELU6 -DUSE_HALF\" + echo 
./gcl_binary --input=$file --output=${file%.*}_relu6_4122.bin --options=\"${copt} -D W=4 -D H=1 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=4 -D KN=2 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_4132.bin --options=\"${copt} -D W=4 -D H=1 -D ON=3 -D IN=6 -D LN=6 -D UN=5 -D Fsq=4 -D KN=2 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_4142.bin --options=\"${copt} -D W=4 -D H=1 -D ON=4 -D IN=6 -D LN=5 -D UN=5 -D Fsq=4 -D KN=2 -DUSE_RELU6 -DUSE_HALF -D BASICE_REG\" + + # W=3 H=1 + echo ./gcl_binary --input=$file --output=${file%.*}_3111.bin --options=\"${copt} -D W=3 -D H=1 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=3 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_3121.bin --options=\"${copt} -D W=3 -D H=1 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=3 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_3131.bin --options=\"${copt} -D W=3 -D H=1 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=3 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_3141.bin --options=\"${copt} -D W=3 -D H=1 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=3 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_3151.bin --options=\"${copt} -D W=3 -D H=1 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=3 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_3161.bin --options=\"${copt} -D W=3 -D H=1 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=3 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_3171.bin --options=\"${copt} -D W=3 -D H=1 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=3 -D KN=1 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_3181.bin --options=\"${copt} -D W=3 -D H=1 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=3 -D KN=1 -DUSE_HALF -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3111.bin --options=\"${copt} -D W=3 -D H=1 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=3 -D KN=1 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3121.bin --options=\"${copt} -D W=3 -D H=1 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=3 -D KN=1 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3131.bin --options=\"${copt} -D W=3 -D H=1 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=3 -D KN=1 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3141.bin --options=\"${copt} -D W=3 -D H=1 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=3 -D KN=1 -DUSE_RELU -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3151.bin --options=\"${copt} -D W=3 -D H=1 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=3 -D KN=1 -DUSE_RELU -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3161.bin --options=\"${copt} -D W=3 -D H=1 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=3 -D KN=1 -DUSE_RELU -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3171.bin --options=\"${copt} -D W=3 -D H=1 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=3 -D KN=1 -DUSE_RELU -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3181.bin --options=\"${copt} -D W=3 -D H=1 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=3 -D KN=1 -DUSE_RELU -DUSE_HALF -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3111.bin --options=\"${copt} -D W=3 -D H=1 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=3 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + echo 
./gcl_binary --input=$file --output=${file%.*}_relu6_3121.bin --options=\"${copt} -D W=3 -D H=1 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=3 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3131.bin --options=\"${copt} -D W=3 -D H=1 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=3 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3141.bin --options=\"${copt} -D W=3 -D H=1 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=3 -D KN=1 -DUSE_RELU6 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3151.bin --options=\"${copt} -D W=3 -D H=1 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=3 -D KN=1 -DUSE_RELU6 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3161.bin --options=\"${copt} -D W=3 -D H=1 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=3 -D KN=1 -DUSE_RELU6 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3171.bin --options=\"${copt} -D W=3 -D H=1 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=3 -D KN=1 -DUSE_RELU6 -DUSE_HALF -D BASICE_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3181.bin --options=\"${copt} -D W=3 -D H=1 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=3 -D KN=1 -DUSE_RELU6 -DUSE_HALF -D BASICE_REG\" + + + echo ./gcl_binary --input=$file --output=${file%.*}_3112.bin --options=\"${copt} -D W=3 -D H=1 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=3 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_3122.bin --options=\"${copt} -D W=3 -D H=1 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=3 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_3132.bin --options=\"${copt} -D W=3 -D H=1 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=3 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_3142.bin --options=\"${copt} -D W=3 -D H=1 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=3 -D KN=2 -DUSE_HALF -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3112.bin --options=\"${copt} -D W=3 -D H=1 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=3 -D KN=2 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3122.bin --options=\"${copt} -D W=3 -D H=1 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=3 -D KN=2 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3132.bin --options=\"${copt} -D W=3 -D H=1 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=3 -D KN=2 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3142.bin --options=\"${copt} -D W=3 -D H=1 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=3 -D KN=2 -DUSE_RELU -DUSE_HALF -D BASICE_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3112.bin --options=\"${copt} -D W=3 -D H=1 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=3 -D KN=2 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3122.bin --options=\"${copt} -D W=3 -D H=1 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=3 -D KN=2 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3132.bin --options=\"${copt} -D W=3 -D H=1 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=3 -D KN=2 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3142.bin --options=\"${copt} -D W=3 -D H=1 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=3 -D KN=2 -DUSE_RELU6 -DUSE_HALF -D BASICE_REG\" + + # W=1 H=4 + echo ./gcl_binary --input=$file --output=${file%.*}_1411.bin --options=\"${copt} -D W=1 -D H=4 -D ON=1 -D IN=1 -D LN=1 -D Fsq=4 -D KN=1 
-DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_1421.bin --options=\"${copt} -D W=1 -D H=4 -D ON=2 -D IN=2 -D LN=2 -D Fsq=4 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_1431.bin --options=\"${copt} -D W=1 -D H=4 -D ON=3 -D IN=3 -D LN=3 -D Fsq=4 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_1441.bin --options=\"${copt} -D W=1 -D H=4 -D ON=4 -D IN=4 -D LN=4 -D Fsq=4 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_1451.bin --options=\"${copt} -D W=1 -D H=4 -D ON=5 -D IN=5 -D LN=5 -D Fsq=4 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_1461.bin --options=\"${copt} -D W=1 -D H=4 -D ON=6 -D IN=6 -D LN=6 -D Fsq=4 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_1471.bin --options=\"${copt} -D W=1 -D H=4 -D ON=7 -D IN=7 -D LN=7 -D Fsq=4 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_1481.bin --options=\"${copt} -D W=1 -D H=4 -D ON=8 -D IN=8 -D LN=8 -D Fsq=4 -D KN=1 -DUSE_HALF\" + + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_1411.bin --options=\"${copt} -D W=1 -D H=4 -D ON=1 -D IN=1 -D LN=1 -D Fsq=4 -D KN=1 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_1421.bin --options=\"${copt} -D W=1 -D H=4 -D ON=2 -D IN=2 -D LN=2 -D Fsq=4 -D KN=1 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_1431.bin --options=\"${copt} -D W=1 -D H=4 -D ON=3 -D IN=3 -D LN=3 -D Fsq=4 -D KN=1 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_1441.bin --options=\"${copt} -D W=1 -D H=4 -D ON=4 -D IN=4 -D LN=4 -D Fsq=4 -D KN=1 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_1451.bin --options=\"${copt} -D W=1 -D H=4 -D ON=5 -D IN=5 -D LN=5 -D Fsq=4 -D KN=1 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_1461.bin --options=\"${copt} -D W=1 -D H=4 -D ON=6 -D IN=6 -D LN=6 -D Fsq=4 -D KN=1 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_1471.bin --options=\"${copt} -D W=1 -D H=4 -D ON=7 -D IN=7 -D LN=7 -D Fsq=4 -D KN=1 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_1481.bin --options=\"${copt} -D W=1 -D H=4 -D ON=8 -D IN=8 -D LN=8 -D Fsq=4 -D KN=1 -DUSE_RELU -DUSE_HALF\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_1411.bin --options=\"${copt} -D W=1 -D H=4 -D ON=1 -D IN=1 -D LN=1 -D Fsq=4 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_1421.bin --options=\"${copt} -D W=1 -D H=4 -D ON=2 -D IN=2 -D LN=2 -D Fsq=4 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_1431.bin --options=\"${copt} -D W=1 -D H=4 -D ON=3 -D IN=3 -D LN=3 -D Fsq=4 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_1441.bin --options=\"${copt} -D W=1 -D H=4 -D ON=4 -D IN=4 -D LN=4 -D Fsq=4 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_1451.bin --options=\"${copt} -D W=1 -D H=4 -D ON=5 -D IN=5 -D LN=5 -D Fsq=4 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_1461.bin --options=\"${copt} -D W=1 -D H=4 -D ON=6 -D IN=6 -D LN=6 -D Fsq=4 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_1471.bin --options=\"${copt} -D W=1 -D H=4 -D ON=7 -D IN=7 -D LN=7 
-D Fsq=4 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_1481.bin --options=\"${copt} -D W=1 -D H=4 -D ON=8 -D IN=8 -D LN=8 -D Fsq=4 -D KN=1 -DUSE_RELU6 -DUSE_HALF\" + + + echo ./gcl_binary --input=$file --output=${file%.*}_1412.bin --options=\"${copt} -D W=1 -D H=4 -D ON=1 -D IN=1 -D LN=1 -D Fsq=4 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_1422.bin --options=\"${copt} -D W=1 -D H=4 -D ON=2 -D IN=2 -D LN=2 -D Fsq=4 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_1432.bin --options=\"${copt} -D W=1 -D H=4 -D ON=3 -D IN=3 -D LN=3 -D Fsq=4 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_1442.bin --options=\"${copt} -D W=1 -D H=4 -D ON=4 -D IN=4 -D LN=4 -D Fsq=4 -D KN=2 -DUSE_HALF\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_1412.bin --options=\"${copt} -D W=1 -D H=4 -D ON=1 -D IN=1 -D LN=1 -D Fsq=4 -D KN=2 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_1422.bin --options=\"${copt} -D W=1 -D H=4 -D ON=2 -D IN=2 -D LN=2 -D Fsq=4 -D KN=2 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_1432.bin --options=\"${copt} -D W=1 -D H=4 -D ON=3 -D IN=3 -D LN=3 -D Fsq=4 -D KN=2 -DUSE_RELU -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_1442.bin --options=\"${copt} -D W=1 -D H=4 -D ON=4 -D IN=4 -D LN=4 -D Fsq=4 -D KN=2 -DUSE_RELU -DUSE_HALF\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_1412.bin --options=\"${copt} -D W=1 -D H=4 -D ON=1 -D IN=1 -D LN=1 -D Fsq=4 -D KN=2 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_1422.bin --options=\"${copt} -D W=1 -D H=4 -D ON=2 -D IN=2 -D LN=2 -D Fsq=4 -D KN=2 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_1432.bin --options=\"${copt} -D W=1 -D H=4 -D ON=3 -D IN=3 -D LN=3 -D Fsq=4 -D KN=2 -DUSE_RELU6 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_1442.bin --options=\"${copt} -D W=1 -D H=4 -D ON=4 -D IN=4 -D LN=4 -D Fsq=4 -D KN=2 -DUSE_RELU6 -DUSE_HALF\" + fi + fi + done + + + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_wh_s2.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_wh_s2.sh new file mode 100644 index 00000000..7bf77fac --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_wh_s2.sh @@ -0,0 +1,54 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "conv_direct_wh_s2.cl" ]];then + # W=3 H=1 Stride = 2 + echo ./gcl_binary --input=$file --output=${file%.*}_3111.bin --options=\"${copt} -D W=3 -D H=1 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=3 -D KN=1 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_3121.bin --options=\"${copt} -D W=3 -D H=1 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=3 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_3131.bin --options=\"${copt} -D W=3 -D H=1 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=3 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_3141.bin --options=\"${copt} -D W=3 -D H=1 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=3 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_3151.bin --options=\"${copt} -D W=3 -D H=1 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=3 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_3161.bin --options=\"${copt} 
-D W=3 -D H=1 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=3 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_3171.bin --options=\"${copt} -D W=3 -D H=1 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=3 -D KN=1 -DUSE_HALF -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_3181.bin --options=\"${copt} -D W=3 -D H=1 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=3 -D KN=1 -DUSE_HALF -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3111.bin --options=\"${copt} -D W=3 -D H=1 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3121.bin --options=\"${copt} -D W=3 -D H=1 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3131.bin --options=\"${copt} -D W=3 -D H=1 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3141.bin --options=\"${copt} -D W=3 -D H=1 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3151.bin --options=\"${copt} -D W=3 -D H=1 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3161.bin --options=\"${copt} -D W=3 -D H=1 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3171.bin --options=\"${copt} -D W=3 -D H=1 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3181.bin --options=\"${copt} -D W=3 -D H=1 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3111.bin --options=\"${copt} -D W=3 -D H=1 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3121.bin --options=\"${copt} -D W=3 -D H=1 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3131.bin --options=\"${copt} -D W=3 -D H=1 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3141.bin --options=\"${copt} -D W=3 -D H=1 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3151.bin --options=\"${copt} -D W=3 -D H=1 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3161.bin --options=\"${copt} -D W=3 -D H=1 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3171.bin --options=\"${copt} -D W=3 -D H=1 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3181.bin --options=\"${copt} -D W=3 -D H=1 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=3 -D KN=1 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_3112.bin --options=\"${copt} -D W=3 -D H=1 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D 
Fsq=3 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_3122.bin --options=\"${copt} -D W=3 -D H=1 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=3 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_3132.bin --options=\"${copt} -D W=3 -D H=1 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=3 -D KN=2 -DUSE_HALF\" + echo ./gcl_binary --input=$file --output=${file%.*}_3142.bin --options=\"${copt} -D W=3 -D H=1 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=3 -D KN=2 -DUSE_HALF -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3112.bin --options=\"${copt} -D W=3 -D H=1 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=3 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3122.bin --options=\"${copt} -D W=3 -D H=1 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=3 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3132.bin --options=\"${copt} -D W=3 -D H=1 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=3 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_3142.bin --options=\"${copt} -D W=3 -D H=1 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=3 -D KN=2 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3112.bin --options=\"${copt} -D W=3 -D H=1 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=3 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3122.bin --options=\"${copt} -D W=3 -D H=1 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=3 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3132.bin --options=\"${copt} -D W=3 -D H=1 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=3 -D KN=2 -DUSE_HALF -DUSE_RELU6\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu6_3142.bin --options=\"${copt} -D W=3 -D H=1 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=3 -D KN=2 -DUSE_HALF -DUSE_RELU6 -DBASIC_REG\" + fi + fi + done + + + + + diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_gemm36_tn.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_gemm36_tn.sh similarity index 100% rename from gcl/tools/kernel_lib_compile/sh/compile/conv_wino_gemm36_tn.sh rename to common/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_gemm36_tn.sh diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_rotate_fltbuf.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_rotate_fltbuf.sh similarity index 100% rename from gcl/tools/kernel_lib_compile/sh/compile/conv_wino_rotate_fltbuf.sh rename to common/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_rotate_fltbuf.sh diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_trans_outbuf.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_trans_outbuf.sh similarity index 100% rename from gcl/tools/kernel_lib_compile/sh/compile/conv_wino_trans_outbuf.sh rename to common/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_trans_outbuf.sh diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_trans_picbuf_left.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_trans_picbuf_left.sh similarity index 100% rename from gcl/tools/kernel_lib_compile/sh/compile/conv_wino_trans_picbuf_left.sh rename to common/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_trans_picbuf_left.sh diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_trans_picbuf_right.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_trans_picbuf_right.sh similarity index 100% rename from 
gcl/tools/kernel_lib_compile/sh/compile/conv_wino_trans_picbuf_right.sh
rename to common/gcl/tools/kernel_lib_compile/sh/compile/conv_wino_trans_picbuf_right.sh
diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/copy.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/copy.sh
new file mode 100644
index 00000000..4f988f0c
--- /dev/null
+++ b/common/gcl/tools/kernel_lib_compile/sh/compile/copy.sh
@@ -0,0 +1,16 @@
+for file in *
+    do
+        if [ "${file##*.}"x = "cl"x ];then
+            if [[ "${file}" == "copy.cl" ]];then
+                echo ./gcl_binary --input=$file --output=${file%.*}_f16.bin --options=\"${copt} -D DT=f16\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_i32.bin --options=\"-D T=int -D T2=int2 -D T3=int3 -D T4=int4 -D DT=i32\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_u32.bin --options=\"-D T=uint -D T2=uint2 -D T3=uint3 -D T4=uint4 -D DT=u32\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_with_block_index_f16.bin --options=\"${copt} -D DT=f16 -D USE_BLOCK_INDEX\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_with_block_index_i32.bin --options=\"-D T=int -D T2=int2 -D T3=int3 -D T4=int4 -D DT=i32 -D USE_BLOCK_INDEX\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_with_block_index_u32.bin --options=\"-D T=uint -D T2=uint2 -D T3=uint3 -D T4=uint4 -D DT=u32 -D USE_BLOCK_INDEX\"
+            fi
+        fi
+    done
+
+
+
diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/deconv_gemm_f2s2.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/deconv_gemm_f2s2.sh
new file mode 100644
index 00000000..70634810
--- /dev/null
+++ b/common/gcl/tools/kernel_lib_compile/sh/compile/deconv_gemm_f2s2.sh
@@ -0,0 +1,30 @@
+for file in *
+    do
+        if [ "${file##*.}"x = "cl"x ];then
+            if [[ "${file}" == "deconv_gemm_f2s2.cl" ]];then
+                echo ./gcl_binary --input=$file --output=${file%.*}_12.bin --options=\"${copt} -D ON=1 -D IN=1 -D LN=1 -D KN=2 -DUSE_HALF\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_22.bin --options=\"${copt} -D ON=2 -D IN=2 -D LN=2 -D KN=2 -DUSE_HALF\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_32.bin --options=\"${copt} -D ON=3 -D IN=3 -D LN=3 -D KN=2 -DUSE_HALF\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_42.bin --options=\"${copt} -D ON=4 -D IN=4 -D LN=4 -D KN=2 -DUSE_HALF\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_14.bin --options=\"${copt} -D ON=1 -D IN=1 -D LN=1 -D KN=4 -DUSE_HALF\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_24.bin --options=\"${copt} -D ON=2 -D IN=2 -D LN=2 -D KN=4 -DUSE_HALF\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_34.bin --options=\"${copt} -D ON=3 -D IN=3 -D LN=3 -D KN=4 -DUSE_HALF\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_h_22.bin --options=\"${copt} -D ON=2 -D IN=2 -D LN=2 -D KN=2 -DUSE_HALF -DREUSE_H\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_h_42.bin --options=\"${copt} -D ON=4 -D IN=4 -D LN=4 -D KN=2 -DUSE_HALF -DREUSE_H\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_h_24.bin --options=\"${copt} -D ON=2 -D IN=2 -D LN=2 -D KN=4 -DUSE_HALF -DREUSE_H\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_relu_12.bin --options=\"${copt} -D ON=1 -D IN=1 -D LN=1 -D KN=2 -DUSE_HALF -DUSE_RELU\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_relu_22.bin --options=\"${copt} -D ON=2 -D IN=2 -D LN=2 -D KN=2 -DUSE_HALF -DUSE_RELU\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_relu_32.bin --options=\"${copt} -D ON=3 -D IN=3 -D LN=3 -D KN=2 -DUSE_HALF -DUSE_RELU\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_relu_42.bin
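(On the copy.sh hunk above: the i32/u32 variants bypass ${copt} and define the vector types T..T4 inline, while the f16 variant reuses ${copt}, which presumably already carries the half-precision typedefs; DT appears only to distinguish the output names and, by assumption, the compiled kernel's name suffix. The expected binaries under that reading:

    # illustrative only: names produced by the copy.sh generator above
    for dt in f16 i32 u32; do
        echo "copy_${dt}.bin" "copy_with_block_index_${dt}.bin"
    done
)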
--options=\"${copt} -D ON=4 -D IN=4 -D LN=4 -D KN=2 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_14.bin --options=\"${copt} -D ON=1 -D IN=1 -D LN=1 -D KN=4 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_24.bin --options=\"${copt} -D ON=2 -D IN=2 -D LN=2 -D KN=4 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_34.bin --options=\"${copt} -D ON=3 -D IN=3 -D LN=3 -D KN=4 -DUSE_HALF -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_relu_22.bin --options=\"${copt} -D ON=2 -D IN=2 -D LN=2 -D KN=2 -DUSE_HALF -DREUSE_H -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_relu_42.bin --options=\"${copt} -D ON=4 -D IN=4 -D LN=4 -D KN=2 -DUSE_HALF -DREUSE_H -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_h_relu_24.bin --options=\"${copt} -D ON=2 -D IN=2 -D LN=2 -D KN=4 -DUSE_HALF -DREUSE_H -DUSE_RELU\" + fi + fi + done + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/deconv_gemm_trans_fltbuf.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/deconv_gemm_trans_fltbuf.sh new file mode 100644 index 00000000..f8cd5dbc --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/deconv_gemm_trans_fltbuf.sh @@ -0,0 +1,13 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "deconv_gemm_trans_fltbuf.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_14.bin --options=\"${copt} -D C=1 -D K=4\" + echo ./gcl_binary --input=$file --output=${file%.*}_24.bin --options=\"${copt} -D C=2 -D K=4\" + echo ./gcl_binary --input=$file --output=${file%.*}_44.bin --options=\"${copt} -D C=4 -D K=4\" + fi + fi + done + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/eltwise.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/eltwise.sh new file mode 100644 index 00000000..c7039f76 --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/eltwise.sh @@ -0,0 +1,69 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "eltwise.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_max1.bin --options=\"${copt} -D N=1 -D TP=max -DUSE_MAX\" + echo ./gcl_binary --input=$file --output=${file%.*}_max2.bin --options=\"${copt} -D N=2 -D TP=max -DUSE_MAX\" + echo ./gcl_binary --input=$file --output=${file%.*}_max3.bin --options=\"${copt} -D N=3 -D TP=max -DUSE_MAX\" + echo ./gcl_binary --input=$file --output=${file%.*}_max4.bin --options=\"${copt} -D N=4 -D TP=max -DUSE_MAX\" + + echo ./gcl_binary --input=$file --output=${file%.*}_sum1.bin --options=\"${copt} -D N=1 -D TP=sum -DUSE_SUM\" + echo ./gcl_binary --input=$file --output=${file%.*}_sum2.bin --options=\"${copt} -D N=2 -D TP=sum -DUSE_SUM\" + echo ./gcl_binary --input=$file --output=${file%.*}_sum3.bin --options=\"${copt} -D N=3 -D TP=sum -DUSE_SUM\" + echo ./gcl_binary --input=$file --output=${file%.*}_sum4.bin --options=\"${copt} -D N=4 -D TP=sum -DUSE_SUM\" + + echo ./gcl_binary --input=$file --output=${file%.*}_prod1.bin --options=\"${copt} -D N=1 -D TP=prod -DUSE_PROD\" + echo ./gcl_binary --input=$file --output=${file%.*}_prod2.bin --options=\"${copt} -D N=2 -D TP=prod -DUSE_PROD\" + echo ./gcl_binary --input=$file --output=${file%.*}_prod3.bin --options=\"${copt} -D N=3 -D TP=prod -DUSE_PROD\" + echo ./gcl_binary --input=$file --output=${file%.*}_prod4.bin --options=\"${copt} -D N=4 -D TP=prod -DUSE_PROD\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_max1.bin 
--options=\"${copt} -D N=1 -D TP=max -DUSE_MAX -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_max2.bin --options=\"${copt} -D N=2 -D TP=max -DUSE_MAX -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_max3.bin --options=\"${copt} -D N=3 -D TP=max -DUSE_MAX -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_max4.bin --options=\"${copt} -D N=4 -D TP=max -DUSE_MAX -DUSE_RELU\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_sum1.bin --options=\"${copt} -D N=1 -D TP=sum -DUSE_SUM -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_sum2.bin --options=\"${copt} -D N=2 -D TP=sum -DUSE_SUM -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_sum3.bin --options=\"${copt} -D N=3 -D TP=sum -DUSE_SUM -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_sum4.bin --options=\"${copt} -D N=4 -D TP=sum -DUSE_SUM -DUSE_RELU\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_prod1.bin --options=\"${copt} -D N=1 -D TP=prod -DUSE_PROD -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_prod2.bin --options=\"${copt} -D N=2 -D TP=prod -DUSE_PROD -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_prod3.bin --options=\"${copt} -D N=3 -D TP=prod -DUSE_PROD -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_prod4.bin --options=\"${copt} -D N=4 -D TP=prod -DUSE_PROD -DUSE_RELU\" + + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_max1.bin --options=\"${copt} -D N=1 -D TP=max -DUSE_MAX -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_max2.bin --options=\"${copt} -D N=2 -D TP=max -DUSE_MAX -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_max3.bin --options=\"${copt} -D N=3 -D TP=max -DUSE_MAX -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_max4.bin --options=\"${copt} -D N=4 -D TP=max -DUSE_MAX -DUSE_NCHW\" + + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_sum1.bin --options=\"${copt} -D N=1 -D TP=sum -DUSE_SUM -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_sum2.bin --options=\"${copt} -D N=2 -D TP=sum -DUSE_SUM -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_sum3.bin --options=\"${copt} -D N=3 -D TP=sum -DUSE_SUM -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_sum4.bin --options=\"${copt} -D N=4 -D TP=sum -DUSE_SUM -DUSE_NCHW\" + + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_prod1.bin --options=\"${copt} -D N=1 -D TP=prod -DUSE_PROD -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_prod2.bin --options=\"${copt} -D N=2 -D TP=prod -DUSE_PROD -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_prod3.bin --options=\"${copt} -D N=3 -D TP=prod -DUSE_PROD -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_prod4.bin --options=\"${copt} -D N=4 -D TP=prod -DUSE_PROD -DUSE_NCHW\" + + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_relu_max1.bin --options=\"${copt} -D N=1 -D TP=max -DUSE_MAX -DUSE_RELU -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_relu_max2.bin --options=\"${copt} -D N=2 -D TP=max -DUSE_MAX -DUSE_RELU -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_relu_max3.bin --options=\"${copt} -D N=3 -D TP=max -DUSE_MAX -DUSE_RELU -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_relu_max4.bin 
--options=\"${copt} -D N=4 -D TP=max -DUSE_MAX -DUSE_RELU -DUSE_NCHW\" + + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_relu_sum1.bin --options=\"${copt} -D N=1 -D TP=sum -DUSE_SUM -DUSE_RELU -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_relu_sum2.bin --options=\"${copt} -D N=2 -D TP=sum -DUSE_SUM -DUSE_RELU -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_relu_sum3.bin --options=\"${copt} -D N=3 -D TP=sum -DUSE_SUM -DUSE_RELU -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_relu_sum4.bin --options=\"${copt} -D N=4 -D TP=sum -DUSE_SUM -DUSE_RELU -DUSE_NCHW\" + + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_relu_prod1.bin --options=\"${copt} -D N=1 -D TP=prod -DUSE_PROD -DUSE_RELU -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_relu_prod2.bin --options=\"${copt} -D N=2 -D TP=prod -DUSE_PROD -DUSE_RELU -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_relu_prod3.bin --options=\"${copt} -D N=3 -D TP=prod -DUSE_PROD -DUSE_RELU -DUSE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_nchw_relu_prod4.bin --options=\"${copt} -D N=4 -D TP=prod -DUSE_PROD -DUSE_RELU -DUSE_NCHW\" + fi + fi + done + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/eltwise_broadcast.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/eltwise_broadcast.sh new file mode 100644 index 00000000..37ad824c --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/eltwise_broadcast.sh @@ -0,0 +1,22 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "eltwise_broadcast.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_max0.bin --options=\"${copt} -D N=0 -D TP=max -DUSE_MAX\" + echo ./gcl_binary --input=$file --output=${file%.*}_max1.bin --options=\"${copt} -D N=1 -D TP=max -DUSE_MAX\" + echo ./gcl_binary --input=$file --output=${file%.*}_max2.bin --options=\"${copt} -D N=2 -D TP=max -DUSE_MAX\" + echo ./gcl_binary --input=$file --output=${file%.*}_max3.bin --options=\"${copt} -D N=3 -D TP=max -DUSE_MAX\" + + echo ./gcl_binary --input=$file --output=${file%.*}_sum0.bin --options=\"${copt} -D N=0 -D TP=sum -DUSE_SUM\" + echo ./gcl_binary --input=$file --output=${file%.*}_sum1.bin --options=\"${copt} -D N=1 -D TP=sum -DUSE_SUM\" + echo ./gcl_binary --input=$file --output=${file%.*}_sum2.bin --options=\"${copt} -D N=2 -D TP=sum -DUSE_SUM\" + echo ./gcl_binary --input=$file --output=${file%.*}_sum3.bin --options=\"${copt} -D N=3 -D TP=sum -DUSE_SUM\" + + echo ./gcl_binary --input=$file --output=${file%.*}_prod0.bin --options=\"${copt} -D N=0 -D TP=prod -DUSE_PROD\" + echo ./gcl_binary --input=$file --output=${file%.*}_prod1.bin --options=\"${copt} -D N=1 -D TP=prod -DUSE_PROD\" + echo ./gcl_binary --input=$file --output=${file%.*}_prod2.bin --options=\"${copt} -D N=2 -D TP=prod -DUSE_PROD\" + echo ./gcl_binary --input=$file --output=${file%.*}_prod3.bin --options=\"${copt} -D N=3 -D TP=prod -DUSE_PROD\" + fi + fi + done + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/eltwise_spe_nchw_c.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/eltwise_spe_nchw_c.sh new file mode 100644 index 00000000..c879ea4d --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/eltwise_spe_nchw_c.sh @@ -0,0 +1,13 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "eltwise_spe_nchw_c.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_max.bin --options=\"${copt} -D 
TP=max -DUSE_MAX\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_sum.bin --options=\"${copt} -D TP=sum -DUSE_SUM\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_prod.bin --options=\"${copt} -D TP=prod -DUSE_PROD\"
+            fi
+        fi
+    done
+
+
+
diff --git a/gcl/tools/kernel_lib_compile/sh/compile/fc_trans_fltbuf.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/fc_trans_fltbuf.sh
similarity index 100%
rename from gcl/tools/kernel_lib_compile/sh/compile/fc_trans_fltbuf.sh
rename to common/gcl/tools/kernel_lib_compile/sh/compile/fc_trans_fltbuf.sh
diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/fill_memory_zero.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/fill_memory_zero.sh
new file mode 100644
index 00000000..7926db3c
--- /dev/null
+++ b/common/gcl/tools/kernel_lib_compile/sh/compile/fill_memory_zero.sh
@@ -0,0 +1,12 @@
+for file in *
+    do
+        if [ "${file##*.}"x = "cl"x ];then
+            if [[ "${file}" == "fill_memory_zero.cl" ]];then
+                echo ./gcl_binary --input=$file --output=${file%.*}_f16.bin --options=\"${copt} -D DT=f16\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_i32.bin --options=\"-D T=int -D DT=i32\"
+            fi
+        fi
+    done
+
+
+
diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/fill_memory_zero_vec4.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/fill_memory_zero_vec4.sh
new file mode 100644
index 00000000..a29cf5dd
--- /dev/null
+++ b/common/gcl/tools/kernel_lib_compile/sh/compile/fill_memory_zero_vec4.sh
@@ -0,0 +1,12 @@
+for file in *
+    do
+        if [ "${file##*.}"x = "cl"x ];then
+            if [[ "${file}" == "fill_memory_zero_vec4.cl" ]];then
+                echo ./gcl_binary --input=$file --output=${file%.*}_f16.bin --options=\"${copt} -D DT=f16\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_i32.bin --options=\"-D T=int -D T2=int2 -D T3=int3 -D T4=int4 -D DT=i32\"
+            fi
+        fi
+    done
+
+
+
diff --git a/gcl/tools/kernel_lib_compile/sh/compile/gemm_nt.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/gemm_nt.sh
similarity index 100%
rename from gcl/tools/kernel_lib_compile/sh/compile/gemm_nt.sh
rename to common/gcl/tools/kernel_lib_compile/sh/compile/gemm_nt.sh
diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/gemm_tn.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/gemm_tn.sh
new file mode 100644
index 00000000..6ed69c02
--- /dev/null
+++ b/common/gcl/tools/kernel_lib_compile/sh/compile/gemm_tn.sh
@@ -0,0 +1,375 @@
+for file in *
+    do
+        if [ "${file##*.}"x = "cl"x ];then
+            if [[ "${file}" == gemm_tn.cl ]];then
+                echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_41.bin --options=\"${copt} -D LM=4 -D LN=1 -D UN=0 -DUSE_NCWHC4\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_42.bin --options=\"${copt} -D LM=4 -D LN=2 -D UN=1 -DUSE_NCWHC4\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_43.bin --options=\"${copt} -D LM=4 -D LN=3 -D UN=2 -DUSE_NCWHC4\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_44.bin --options=\"${copt} -D LM=4 -D LN=4 -D UN=3 -DUSE_NCWHC4\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_45.bin --options=\"${copt} -D LM=4 -D LN=5 -D UN=4 -DUSE_NCWHC4\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_46.bin --options=\"${copt} -D LM=4 -D LN=6 -D UN=5 -DUSE_NCWHC4\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_47.bin --options=\"${copt} -D LM=4 -D LN=7 -D UN=6 -DUSE_NCWHC4\"
+                echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_48.bin --options=\"${copt} -D LM=4 -D LN=8 -D UN=7 -DUSE_NCWHC4\"
+
+                echo ./gcl_binary --input=$file
--output=${file%.*}_ncwhc4_81.bin --options=\"${copt} -D LM=8 -D LN=1 -D UN=0 -DUSE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_82.bin --options=\"${copt} -D LM=8 -D LN=2 -D UN=1 -DUSE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_83.bin --options=\"${copt} -D LM=8 -D LN=3 -D UN=2 -DUSE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_84.bin --options=\"${copt} -D LM=8 -D LN=4 -D UN=3 -DUSE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_85.bin --options=\"${copt} -D LM=8 -D LN=5 -D UN=4 -DUSE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_86.bin --options=\"${copt} -D LM=8 -D LN=6 -D UN=5 -DUSE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_87.bin --options=\"${copt} -D LM=8 -D LN=7 -D UN=6 -DUSE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_88.bin --options=\"${copt} -D LM=8 -D LN=8 -D UN=7 -DUSE_NCWHC4\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_41.bin --options=\"${copt} -D LM=4 -D LN=1 -D UN=0 -DUSE_NCWHC4 -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_42.bin --options=\"${copt} -D LM=4 -D LN=2 -D UN=1 -DUSE_NCWHC4 -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_43.bin --options=\"${copt} -D LM=4 -D LN=3 -D UN=2 -DUSE_NCWHC4 -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_44.bin --options=\"${copt} -D LM=4 -D LN=4 -D UN=3 -DUSE_NCWHC4 -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_45.bin --options=\"${copt} -D LM=4 -D LN=5 -D UN=4 -DUSE_NCWHC4 -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_46.bin --options=\"${copt} -D LM=4 -D LN=6 -D UN=5 -DUSE_NCWHC4 -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_47.bin --options=\"${copt} -D LM=4 -D LN=7 -D UN=6 -DUSE_NCWHC4 -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_48.bin --options=\"${copt} -D LM=4 -D LN=8 -D UN=7 -DUSE_NCWHC4 -DUSE_RELU\" + + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_81.bin --options=\"${copt} -D LM=8 -D LN=1 -D UN=0 -DUSE_NCWHC4 -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_82.bin --options=\"${copt} -D LM=8 -D LN=2 -D UN=1 -DUSE_NCWHC4 -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_83.bin --options=\"${copt} -D LM=8 -D LN=3 -D UN=2 -DUSE_NCWHC4 -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_84.bin --options=\"${copt} -D LM=8 -D LN=4 -D UN=3 -DUSE_NCWHC4 -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_85.bin --options=\"${copt} -D LM=8 -D LN=5 -D UN=4 -DUSE_NCWHC4 -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_86.bin --options=\"${copt} -D LM=8 -D LN=6 -D UN=5 -DUSE_NCWHC4 -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_87.bin --options=\"${copt} -D LM=8 -D LN=7 -D UN=6 -DUSE_NCWHC4 -DUSE_RELU\" + echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_88.bin --options=\"${copt} -D LM=8 -D LN=8 -D UN=7 -DUSE_NCWHC4 -DUSE_RELU\" + +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_41.bin --options=\"${copt} -D LM=4 -D LN=1 -D UN=0 -DUSE_NCWHC4 -DUSE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_42.bin --options=\"${copt} -D LM=4 -D LN=2 -D UN=1 -DUSE_NCWHC4 
-DUSE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_43.bin --options=\"${copt} -D LM=4 -D LN=3 -D UN=2 -DUSE_NCWHC4 -DUSE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_44.bin --options=\"${copt} -D LM=4 -D LN=4 -D UN=3 -DUSE_NCWHC4 -DUSE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_45.bin --options=\"${copt} -D LM=4 -D LN=5 -D UN=4 -DUSE_NCWHC4 -DUSE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_46.bin --options=\"${copt} -D LM=4 -D LN=6 -D UN=5 -DUSE_NCWHC4 -DUSE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_47.bin --options=\"${copt} -D LM=4 -D LN=7 -D UN=6 -DUSE_NCWHC4 -DUSE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_48.bin --options=\"${copt} -D LM=4 -D LN=8 -D UN=7 -DUSE_NCWHC4 -DUSE_GELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_81.bin --options=\"${copt} -D LM=8 -D LN=1 -D UN=0 -DUSE_NCWHC4 -DUSE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_82.bin --options=\"${copt} -D LM=8 -D LN=2 -D UN=1 -DUSE_NCWHC4 -DUSE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_83.bin --options=\"${copt} -D LM=8 -D LN=3 -D UN=2 -DUSE_NCWHC4 -DUSE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_84.bin --options=\"${copt} -D LM=8 -D LN=4 -D UN=3 -DUSE_NCWHC4 -DUSE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_85.bin --options=\"${copt} -D LM=8 -D LN=5 -D UN=4 -DUSE_NCWHC4 -DUSE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_86.bin --options=\"${copt} -D LM=8 -D LN=6 -D UN=5 -DUSE_NCWHC4 -DUSE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_87.bin --options=\"${copt} -D LM=8 -D LN=7 -D UN=6 -DUSE_NCWHC4 -DUSE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_ncwhc4_88.bin --options=\"${copt} -D LM=8 -D LN=8 -D UN=7 -DUSE_NCWHC4 -DUSE_GELU\" + + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_41.bin --options=\"${copt} -D LM=4 -D LN=1 -D UN=0 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_42.bin --options=\"${copt} -D LM=4 -D LN=2 -D UN=1 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_43.bin --options=\"${copt} -D LM=4 -D LN=3 -D UN=2 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_44.bin --options=\"${copt} -D LM=4 -D LN=4 -D UN=3 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_45.bin --options=\"${copt} -D LM=4 -D LN=5 -D UN=4 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_46.bin --options=\"${copt} -D LM=4 -D LN=6 -D UN=5 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_47.bin --options=\"${copt} -D LM=4 -D LN=7 -D UN=6 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_48.bin --options=\"${copt} -D LM=4 -D LN=8 -D UN=7 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_81.bin --options=\"${copt} -D LM=8 -D LN=1 -D UN=0 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_82.bin --options=\"${copt} -D LM=8 -D LN=2 -D UN=1 
-DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_83.bin --options=\"${copt} -D LM=8 -D LN=3 -D UN=2 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_84.bin --options=\"${copt} -D LM=8 -D LN=4 -D UN=3 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_85.bin --options=\"${copt} -D LM=8 -D LN=5 -D UN=4 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_86.bin --options=\"${copt} -D LM=8 -D LN=6 -D UN=5 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_87.bin --options=\"${copt} -D LM=8 -D LN=7 -D UN=6 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise1_ncwhc4_88.bin --options=\"${copt} -D LM=8 -D LN=8 -D UN=7 -DUSE_NCWHC4 -DUSE_ELTWISE_NCHW\" + + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_41.bin --options=\"${copt} -D LM=4 -D LN=1 -D UN=0 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_42.bin --options=\"${copt} -D LM=4 -D LN=2 -D UN=1 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_43.bin --options=\"${copt} -D LM=4 -D LN=3 -D UN=2 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_44.bin --options=\"${copt} -D LM=4 -D LN=4 -D UN=3 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_45.bin --options=\"${copt} -D LM=4 -D LN=5 -D UN=4 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_46.bin --options=\"${copt} -D LM=4 -D LN=6 -D UN=5 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_47.bin --options=\"${copt} -D LM=4 -D LN=7 -D UN=6 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_48.bin --options=\"${copt} -D LM=4 -D LN=8 -D UN=7 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_81.bin --options=\"${copt} -D LM=8 -D LN=1 -D UN=0 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_82.bin --options=\"${copt} -D LM=8 -D LN=2 -D UN=1 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_83.bin --options=\"${copt} -D LM=8 -D LN=3 -D UN=2 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_84.bin --options=\"${copt} -D LM=8 -D LN=4 -D UN=3 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_85.bin --options=\"${copt} -D LM=8 -D LN=5 -D UN=4 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_86.bin --options=\"${copt} -D LM=8 -D LN=6 -D UN=5 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_87.bin --options=\"${copt} -D LM=8 -D LN=7 -D UN=6 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + echo ./gcl_binary --input=$file --output=${file%.*}_eltwise4_ncwhc4_88.bin --options=\"${copt} -D LM=8 -D LN=8 -D UN=7 -DUSE_NCWHC4 -DUSE_ELTWISE_NCWHC4\" + + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_13.bin 
--options=\"${copt} -D LM=1 -D LN=3 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_14.bin --options=\"${copt} -D LM=1 -D LN=4 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_15.bin --options=\"${copt} -D LM=1 -D LN=5 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_16.bin --options=\"${copt} -D LM=1 -D LN=6 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_17.bin --options=\"${copt} -D LM=1 -D LN=7 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_18.bin --options=\"${copt} -D LM=1 -D LN=8 -D NO_BIAS\" + + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_22.bin --options=\"${copt} -D LM=2 -D LN=2 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_23.bin --options=\"${copt} -D LM=2 -D LN=3 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_24.bin --options=\"${copt} -D LM=2 -D LN=4 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_25.bin --options=\"${copt} -D LM=2 -D LN=5 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_26.bin --options=\"${copt} -D LM=2 -D LN=6 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_27.bin --options=\"${copt} -D LM=2 -D LN=7 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_28.bin --options=\"${copt} -D LM=2 -D LN=8 -D NO_BIAS\" + + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_31.bin --options=\"${copt} -D LM=3 -D LN=1 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_32.bin --options=\"${copt} -D LM=3 -D LN=2 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_33.bin --options=\"${copt} -D LM=3 -D LN=3 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_34.bin --options=\"${copt} -D LM=3 -D LN=4 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_35.bin --options=\"${copt} -D LM=3 -D LN=5 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_36.bin --options=\"${copt} -D LM=3 -D LN=6 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_37.bin --options=\"${copt} -D LM=3 -D LN=7 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_38.bin --options=\"${copt} -D LM=3 -D LN=8 -D NO_BIAS\" + + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_41.bin --options=\"${copt} -D LM=4 -D LN=1 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_42.bin --options=\"${copt} -D LM=4 -D LN=2 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_43.bin --options=\"${copt} -D LM=4 -D LN=3 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_44.bin --options=\"${copt} -D LM=4 -D LN=4 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_45.bin --options=\"${copt} -D LM=4 -D LN=5 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_46.bin --options=\"${copt} -D LM=4 -D LN=6 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_47.bin --options=\"${copt} -D LM=4 -D LN=7 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_48.bin --options=\"${copt} -D LM=4 -D LN=8 -D NO_BIAS\" + + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_51.bin --options=\"${copt} -D LM=5 -D LN=1 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_52.bin --options=\"${copt} -D LM=5 -D LN=2 
-D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_53.bin --options=\"${copt} -D LM=5 -D LN=3 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_54.bin --options=\"${copt} -D LM=5 -D LN=4 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_55.bin --options=\"${copt} -D LM=5 -D LN=5 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_56.bin --options=\"${copt} -D LM=5 -D LN=6 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_57.bin --options=\"${copt} -D LM=5 -D LN=7 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_58.bin --options=\"${copt} -D LM=5 -D LN=8 -D NO_BIAS\" + + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_61.bin --options=\"${copt} -D LM=6 -D LN=1 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_62.bin --options=\"${copt} -D LM=6 -D LN=2 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_63.bin --options=\"${copt} -D LM=6 -D LN=3 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_64.bin --options=\"${copt} -D LM=6 -D LN=4 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_65.bin --options=\"${copt} -D LM=6 -D LN=5 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_66.bin --options=\"${copt} -D LM=6 -D LN=6 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_67.bin --options=\"${copt} -D LM=6 -D LN=7 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_68.bin --options=\"${copt} -D LM=6 -D LN=8 -D NO_BIAS\" + + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_71.bin --options=\"${copt} -D LM=7 -D LN=1 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_72.bin --options=\"${copt} -D LM=7 -D LN=2 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_73.bin --options=\"${copt} -D LM=7 -D LN=3 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_74.bin --options=\"${copt} -D LM=7 -D LN=4 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_75.bin --options=\"${copt} -D LM=7 -D LN=5 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_76.bin --options=\"${copt} -D LM=7 -D LN=6 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_77.bin --options=\"${copt} -D LM=7 -D LN=7 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_78.bin --options=\"${copt} -D LM=7 -D LN=8 -D NO_BIAS\" + + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_81.bin --options=\"${copt} -D LM=8 -D LN=1 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_82.bin --options=\"${copt} -D LM=8 -D LN=2 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_83.bin --options=\"${copt} -D LM=8 -D LN=3 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_84.bin --options=\"${copt} -D LM=8 -D LN=4 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_85.bin --options=\"${copt} -D LM=8 -D LN=5 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_86.bin --options=\"${copt} -D LM=8 -D LN=6 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_87.bin --options=\"${copt} -D LM=8 -D LN=7 -D NO_BIAS\" + echo ./gcl_binary --input=$file --output=${file%.*}_nobias_88.bin --options=\"${copt} -D LM=8 -D LN=8 -D NO_BIAS\" + + echo ./gcl_binary 
--input=$file --output=${file%.*}_13.bin --options=\"${copt} -D LM=1 -D LN=3\" + echo ./gcl_binary --input=$file --output=${file%.*}_14.bin --options=\"${copt} -D LM=1 -D LN=4\" + echo ./gcl_binary --input=$file --output=${file%.*}_15.bin --options=\"${copt} -D LM=1 -D LN=5\" + echo ./gcl_binary --input=$file --output=${file%.*}_16.bin --options=\"${copt} -D LM=1 -D LN=6\" + echo ./gcl_binary --input=$file --output=${file%.*}_17.bin --options=\"${copt} -D LM=1 -D LN=7\" + echo ./gcl_binary --input=$file --output=${file%.*}_18.bin --options=\"${copt} -D LM=1 -D LN=8\" + + echo ./gcl_binary --input=$file --output=${file%.*}_22.bin --options=\"${copt} -D LM=2 -D LN=2\" + echo ./gcl_binary --input=$file --output=${file%.*}_23.bin --options=\"${copt} -D LM=2 -D LN=3\" + echo ./gcl_binary --input=$file --output=${file%.*}_24.bin --options=\"${copt} -D LM=2 -D LN=4\" + echo ./gcl_binary --input=$file --output=${file%.*}_25.bin --options=\"${copt} -D LM=2 -D LN=5\" + echo ./gcl_binary --input=$file --output=${file%.*}_26.bin --options=\"${copt} -D LM=2 -D LN=6\" + echo ./gcl_binary --input=$file --output=${file%.*}_27.bin --options=\"${copt} -D LM=2 -D LN=7\" + echo ./gcl_binary --input=$file --output=${file%.*}_28.bin --options=\"${copt} -D LM=2 -D LN=8\" + + echo ./gcl_binary --input=$file --output=${file%.*}_31.bin --options=\"${copt} -D LM=3 -D LN=1\" + echo ./gcl_binary --input=$file --output=${file%.*}_32.bin --options=\"${copt} -D LM=3 -D LN=2\" + echo ./gcl_binary --input=$file --output=${file%.*}_33.bin --options=\"${copt} -D LM=3 -D LN=3\" + echo ./gcl_binary --input=$file --output=${file%.*}_34.bin --options=\"${copt} -D LM=3 -D LN=4\" + echo ./gcl_binary --input=$file --output=${file%.*}_35.bin --options=\"${copt} -D LM=3 -D LN=5\" + echo ./gcl_binary --input=$file --output=${file%.*}_36.bin --options=\"${copt} -D LM=3 -D LN=6\" + echo ./gcl_binary --input=$file --output=${file%.*}_37.bin --options=\"${copt} -D LM=3 -D LN=7\" + echo ./gcl_binary --input=$file --output=${file%.*}_38.bin --options=\"${copt} -D LM=3 -D LN=8\" + + echo ./gcl_binary --input=$file --output=${file%.*}_41.bin --options=\"${copt} -D LM=4 -D LN=1\" + echo ./gcl_binary --input=$file --output=${file%.*}_42.bin --options=\"${copt} -D LM=4 -D LN=2\" + echo ./gcl_binary --input=$file --output=${file%.*}_43.bin --options=\"${copt} -D LM=4 -D LN=3\" + echo ./gcl_binary --input=$file --output=${file%.*}_44.bin --options=\"${copt} -D LM=4 -D LN=4\" + echo ./gcl_binary --input=$file --output=${file%.*}_45.bin --options=\"${copt} -D LM=4 -D LN=5\" + echo ./gcl_binary --input=$file --output=${file%.*}_46.bin --options=\"${copt} -D LM=4 -D LN=6\" + echo ./gcl_binary --input=$file --output=${file%.*}_47.bin --options=\"${copt} -D LM=4 -D LN=7\" + echo ./gcl_binary --input=$file --output=${file%.*}_48.bin --options=\"${copt} -D LM=4 -D LN=8\" + + echo ./gcl_binary --input=$file --output=${file%.*}_51.bin --options=\"${copt} -D LM=5 -D LN=1\" + echo ./gcl_binary --input=$file --output=${file%.*}_52.bin --options=\"${copt} -D LM=5 -D LN=2\" + echo ./gcl_binary --input=$file --output=${file%.*}_53.bin --options=\"${copt} -D LM=5 -D LN=3\" + echo ./gcl_binary --input=$file --output=${file%.*}_54.bin --options=\"${copt} -D LM=5 -D LN=4\" + echo ./gcl_binary --input=$file --output=${file%.*}_55.bin --options=\"${copt} -D LM=5 -D LN=5\" + echo ./gcl_binary --input=$file --output=${file%.*}_56.bin --options=\"${copt} -D LM=5 -D LN=6\" + echo ./gcl_binary --input=$file --output=${file%.*}_57.bin --options=\"${copt} -D LM=5 -D LN=7\" + 
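(Naming note for gemm_tn.sh: the digit pair is the per-thread tile LM x LN, so _48 is LM=4, LN=8; nobias_ maps to -D NO_BIAS, and the ncwhc4_/eltwise1_/eltwise4_ prefixes to the corresponding USE_NCWHC4/USE_ELTWISE_* layout flags. The plain and nobias grids skip the smallest tiles (_11, _12, _21), and the GELU variants above, along with the plain RELU tiles further down, appear only as commented-out lines, so they are not built into the kernel library. A decode under that assumed layout:

    s=48                           # suffix taken from a generated .bin name
    echo "LM=${s%?} LN=${s#?}"     # -> LM=4 LN=8
)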
echo ./gcl_binary --input=$file --output=${file%.*}_58.bin --options=\"${copt} -D LM=5 -D LN=8\" + + echo ./gcl_binary --input=$file --output=${file%.*}_61.bin --options=\"${copt} -D LM=6 -D LN=1\" + echo ./gcl_binary --input=$file --output=${file%.*}_62.bin --options=\"${copt} -D LM=6 -D LN=2\" + echo ./gcl_binary --input=$file --output=${file%.*}_63.bin --options=\"${copt} -D LM=6 -D LN=3\" + echo ./gcl_binary --input=$file --output=${file%.*}_64.bin --options=\"${copt} -D LM=6 -D LN=4\" + echo ./gcl_binary --input=$file --output=${file%.*}_65.bin --options=\"${copt} -D LM=6 -D LN=5\" + echo ./gcl_binary --input=$file --output=${file%.*}_66.bin --options=\"${copt} -D LM=6 -D LN=6\" + echo ./gcl_binary --input=$file --output=${file%.*}_67.bin --options=\"${copt} -D LM=6 -D LN=7\" + echo ./gcl_binary --input=$file --output=${file%.*}_68.bin --options=\"${copt} -D LM=6 -D LN=8\" + + echo ./gcl_binary --input=$file --output=${file%.*}_71.bin --options=\"${copt} -D LM=7 -D LN=1\" + echo ./gcl_binary --input=$file --output=${file%.*}_72.bin --options=\"${copt} -D LM=7 -D LN=2\" + echo ./gcl_binary --input=$file --output=${file%.*}_73.bin --options=\"${copt} -D LM=7 -D LN=3\" + echo ./gcl_binary --input=$file --output=${file%.*}_74.bin --options=\"${copt} -D LM=7 -D LN=4\" + echo ./gcl_binary --input=$file --output=${file%.*}_75.bin --options=\"${copt} -D LM=7 -D LN=5\" + echo ./gcl_binary --input=$file --output=${file%.*}_76.bin --options=\"${copt} -D LM=7 -D LN=6\" + echo ./gcl_binary --input=$file --output=${file%.*}_77.bin --options=\"${copt} -D LM=7 -D LN=7\" + echo ./gcl_binary --input=$file --output=${file%.*}_78.bin --options=\"${copt} -D LM=7 -D LN=8\" + + echo ./gcl_binary --input=$file --output=${file%.*}_81.bin --options=\"${copt} -D LM=8 -D LN=1\" + echo ./gcl_binary --input=$file --output=${file%.*}_82.bin --options=\"${copt} -D LM=8 -D LN=2\" + echo ./gcl_binary --input=$file --output=${file%.*}_83.bin --options=\"${copt} -D LM=8 -D LN=3\" + echo ./gcl_binary --input=$file --output=${file%.*}_84.bin --options=\"${copt} -D LM=8 -D LN=4\" + echo ./gcl_binary --input=$file --output=${file%.*}_85.bin --options=\"${copt} -D LM=8 -D LN=5\" + echo ./gcl_binary --input=$file --output=${file%.*}_86.bin --options=\"${copt} -D LM=8 -D LN=6\" + echo ./gcl_binary --input=$file --output=${file%.*}_87.bin --options=\"${copt} -D LM=8 -D LN=7\" + echo ./gcl_binary --input=$file --output=${file%.*}_88.bin --options=\"${copt} -D LM=8 -D LN=8\" + +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_13.bin --options=\"${copt} -D LM=1 -D LN=3 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_14.bin --options=\"${copt} -D LM=1 -D LN=4 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_15.bin --options=\"${copt} -D LM=1 -D LN=5 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_16.bin --options=\"${copt} -D LM=1 -D LN=6 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_17.bin --options=\"${copt} -D LM=1 -D LN=7 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_18.bin --options=\"${copt} -D LM=1 -D LN=8 -D USE_RELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_22.bin --options=\"${copt} -D LM=2 -D LN=2 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_23.bin --options=\"${copt} -D LM=2 -D LN=3 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_24.bin --options=\"${copt} -D LM=2 -D LN=4 -D USE_RELU\" +# echo 
./gcl_binary --input=$file --output=${file%.*}_relu_25.bin --options=\"${copt} -D LM=2 -D LN=5 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_26.bin --options=\"${copt} -D LM=2 -D LN=6 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_27.bin --options=\"${copt} -D LM=2 -D LN=7 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_28.bin --options=\"${copt} -D LM=2 -D LN=8 -D USE_RELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_31.bin --options=\"${copt} -D LM=3 -D LN=1 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_32.bin --options=\"${copt} -D LM=3 -D LN=2 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_33.bin --options=\"${copt} -D LM=3 -D LN=3 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_34.bin --options=\"${copt} -D LM=3 -D LN=4 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_35.bin --options=\"${copt} -D LM=3 -D LN=5 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_36.bin --options=\"${copt} -D LM=3 -D LN=6 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_37.bin --options=\"${copt} -D LM=3 -D LN=7 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_38.bin --options=\"${copt} -D LM=3 -D LN=8 -D USE_RELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_41.bin --options=\"${copt} -D LM=4 -D LN=1 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_42.bin --options=\"${copt} -D LM=4 -D LN=2 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_43.bin --options=\"${copt} -D LM=4 -D LN=3 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_44.bin --options=\"${copt} -D LM=4 -D LN=4 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_45.bin --options=\"${copt} -D LM=4 -D LN=5 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_46.bin --options=\"${copt} -D LM=4 -D LN=6 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_47.bin --options=\"${copt} -D LM=4 -D LN=7 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_48.bin --options=\"${copt} -D LM=4 -D LN=8 -D USE_RELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_51.bin --options=\"${copt} -D LM=5 -D LN=1 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_52.bin --options=\"${copt} -D LM=5 -D LN=2 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_53.bin --options=\"${copt} -D LM=5 -D LN=3 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_54.bin --options=\"${copt} -D LM=5 -D LN=4 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_55.bin --options=\"${copt} -D LM=5 -D LN=5 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_56.bin --options=\"${copt} -D LM=5 -D LN=6 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_57.bin --options=\"${copt} -D LM=5 -D LN=7 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_58.bin --options=\"${copt} -D LM=5 -D LN=8 -D USE_RELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_61.bin --options=\"${copt} -D LM=6 -D LN=1 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_62.bin --options=\"${copt} -D LM=6 -D LN=2 -D USE_RELU\" +# echo ./gcl_binary --input=$file 
--output=${file%.*}_relu_63.bin --options=\"${copt} -D LM=6 -D LN=3 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_64.bin --options=\"${copt} -D LM=6 -D LN=4 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_65.bin --options=\"${copt} -D LM=6 -D LN=5 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_66.bin --options=\"${copt} -D LM=6 -D LN=6 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_67.bin --options=\"${copt} -D LM=6 -D LN=7 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_68.bin --options=\"${copt} -D LM=6 -D LN=8 -D USE_RELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_71.bin --options=\"${copt} -D LM=7 -D LN=1 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_72.bin --options=\"${copt} -D LM=7 -D LN=2 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_73.bin --options=\"${copt} -D LM=7 -D LN=3 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_74.bin --options=\"${copt} -D LM=7 -D LN=4 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_75.bin --options=\"${copt} -D LM=7 -D LN=5 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_76.bin --options=\"${copt} -D LM=7 -D LN=6 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_77.bin --options=\"${copt} -D LM=7 -D LN=7 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_78.bin --options=\"${copt} -D LM=7 -D LN=8 -D USE_RELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_81.bin --options=\"${copt} -D LM=8 -D LN=1 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_82.bin --options=\"${copt} -D LM=8 -D LN=2 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_83.bin --options=\"${copt} -D LM=8 -D LN=3 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_84.bin --options=\"${copt} -D LM=8 -D LN=4 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_85.bin --options=\"${copt} -D LM=8 -D LN=5 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_86.bin --options=\"${copt} -D LM=8 -D LN=6 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_87.bin --options=\"${copt} -D LM=8 -D LN=7 -D USE_RELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_relu_88.bin --options=\"${copt} -D LM=8 -D LN=8 -D USE_RELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_13.bin --options=\"${copt} -D LM=1 -D LN=3 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_14.bin --options=\"${copt} -D LM=1 -D LN=4 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_15.bin --options=\"${copt} -D LM=1 -D LN=5 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_16.bin --options=\"${copt} -D LM=1 -D LN=6 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_17.bin --options=\"${copt} -D LM=1 -D LN=7 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_18.bin --options=\"${copt} -D LM=1 -D LN=8 -D USE_GELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_22.bin --options=\"${copt} -D LM=2 -D LN=2 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_23.bin --options=\"${copt} -D LM=2 -D LN=3 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_24.bin 
--options=\"${copt} -D LM=2 -D LN=4 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_25.bin --options=\"${copt} -D LM=2 -D LN=5 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_26.bin --options=\"${copt} -D LM=2 -D LN=6 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_27.bin --options=\"${copt} -D LM=2 -D LN=7 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_28.bin --options=\"${copt} -D LM=2 -D LN=8 -D USE_GELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_31.bin --options=\"${copt} -D LM=3 -D LN=1 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_32.bin --options=\"${copt} -D LM=3 -D LN=2 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_33.bin --options=\"${copt} -D LM=3 -D LN=3 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_34.bin --options=\"${copt} -D LM=3 -D LN=4 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_35.bin --options=\"${copt} -D LM=3 -D LN=5 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_36.bin --options=\"${copt} -D LM=3 -D LN=6 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_37.bin --options=\"${copt} -D LM=3 -D LN=7 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_38.bin --options=\"${copt} -D LM=3 -D LN=8 -D USE_GELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_41.bin --options=\"${copt} -D LM=4 -D LN=1 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_42.bin --options=\"${copt} -D LM=4 -D LN=2 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_43.bin --options=\"${copt} -D LM=4 -D LN=3 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_44.bin --options=\"${copt} -D LM=4 -D LN=4 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_45.bin --options=\"${copt} -D LM=4 -D LN=5 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_46.bin --options=\"${copt} -D LM=4 -D LN=6 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_47.bin --options=\"${copt} -D LM=4 -D LN=7 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_48.bin --options=\"${copt} -D LM=4 -D LN=8 -D USE_GELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_51.bin --options=\"${copt} -D LM=5 -D LN=1 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_52.bin --options=\"${copt} -D LM=5 -D LN=2 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_53.bin --options=\"${copt} -D LM=5 -D LN=3 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_54.bin --options=\"${copt} -D LM=5 -D LN=4 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_55.bin --options=\"${copt} -D LM=5 -D LN=5 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_56.bin --options=\"${copt} -D LM=5 -D LN=6 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_57.bin --options=\"${copt} -D LM=5 -D LN=7 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_58.bin --options=\"${copt} -D LM=5 -D LN=8 -D USE_GELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_61.bin --options=\"${copt} -D LM=6 -D LN=1 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_62.bin --options=\"${copt} -D LM=6 -D 
LN=2 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_63.bin --options=\"${copt} -D LM=6 -D LN=3 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_64.bin --options=\"${copt} -D LM=6 -D LN=4 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_65.bin --options=\"${copt} -D LM=6 -D LN=5 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_66.bin --options=\"${copt} -D LM=6 -D LN=6 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_67.bin --options=\"${copt} -D LM=6 -D LN=7 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_68.bin --options=\"${copt} -D LM=6 -D LN=8 -D USE_GELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_71.bin --options=\"${copt} -D LM=7 -D LN=1 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_72.bin --options=\"${copt} -D LM=7 -D LN=2 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_73.bin --options=\"${copt} -D LM=7 -D LN=3 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_74.bin --options=\"${copt} -D LM=7 -D LN=4 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_75.bin --options=\"${copt} -D LM=7 -D LN=5 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_76.bin --options=\"${copt} -D LM=7 -D LN=6 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_77.bin --options=\"${copt} -D LM=7 -D LN=7 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_78.bin --options=\"${copt} -D LM=7 -D LN=8 -D USE_GELU\" +# +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_81.bin --options=\"${copt} -D LM=8 -D LN=1 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_82.bin --options=\"${copt} -D LM=8 -D LN=2 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_83.bin --options=\"${copt} -D LM=8 -D LN=3 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_84.bin --options=\"${copt} -D LM=8 -D LN=4 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_85.bin --options=\"${copt} -D LM=8 -D LN=5 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_86.bin --options=\"${copt} -D LM=8 -D LN=6 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_87.bin --options=\"${copt} -D LM=8 -D LN=7 -D USE_GELU\" +# echo ./gcl_binary --input=$file --output=${file%.*}_gelu_88.bin --options=\"${copt} -D LM=8 -D LN=8 -D USE_GELU\" + fi + fi + done + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/mem_trans_nchw_to_ncwhc4.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/mem_trans_nchw_to_ncwhc4.sh new file mode 100644 index 00000000..fe6d4969 --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/mem_trans_nchw_to_ncwhc4.sh @@ -0,0 +1,13 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "mem_trans_nchw_to_ncwhc4.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}.bin --options=\"${copt}\" + echo ./gcl_binary --input=$file --output=${file%.*}_input_tran.bin --options=\"${copt} -DINPUT_TRAN\" + echo ./gcl_binary --input=$file --output=${file%.*}_output_tran.bin --options=\"${copt} -DOUTPUT_TRAN\" + fi + fi + done + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/mem_trans_ncwhc4_to_nchw.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/mem_trans_ncwhc4_to_nchw.sh new file mode 
100644 index 00000000..ced01b7f --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/mem_trans_ncwhc4_to_nchw.sh @@ -0,0 +1,12 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "mem_trans_ncwhc4_to_nchw.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}.bin --options=\"${copt}\" + echo ./gcl_binary --input=$file --output=${file%.*}_output_tran.bin --options=\"${copt} -DOUTPUT_TRAN\" + fi + fi + done + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/mem_trans_ncwhc4_to_ncwhc4.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/mem_trans_ncwhc4_to_ncwhc4.sh new file mode 100644 index 00000000..e975b63a --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/mem_trans_ncwhc4_to_ncwhc4.sh @@ -0,0 +1,12 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "mem_trans_ncwhc4_to_ncwhc4.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}.bin --options=\"${copt}\" + echo ./gcl_binary --input=$file --output=${file%.*}_output_tran.bin --options=\"${copt} -DOUTPUT_TRAN\" + fi + fi + done + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/normalization.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/normalization.sh new file mode 100644 index 00000000..4abc0e04 --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/normalization.sh @@ -0,0 +1,12 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "normalization.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_c1.bin --options=\"${copt} -D USE_C1 \" + echo ./gcl_binary --input=$file --output=${file%.*}.bin --options=\"${copt}\" + fi + fi + done + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/power.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/power.sh new file mode 100644 index 00000000..6cb49036 --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/power.sh @@ -0,0 +1,12 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "power.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_f16.bin --options=\"${copt} -D DT=f16\" + echo ./gcl_binary --input=$file --output=${file%.*}_i32.bin --options=\"-D T=int -D T2=int2 -D T3=int3 -D T4=int4 -D DT=i32\" + fi + fi + done + + + diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/prelu.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/prelu.sh new file mode 100644 index 00000000..440302d9 --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/prelu.sh @@ -0,0 +1,12 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "prelu.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_noprop.bin --options=\"${copt} -D MD=noprop \" + echo ./gcl_binary --input=$file --output=${file%.*}_prop.bin --options=\"${copt} -D MD=prop -DUSE_SAME \" + fi + fi + done + + + diff --git a/gcl/tools/kernel_lib_compile/sh/compile/sample.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/sample.sh similarity index 100% rename from gcl/tools/kernel_lib_compile/sh/compile/sample.sh rename to common/gcl/tools/kernel_lib_compile/sh/compile/sample.sh diff --git a/gcl/tools/kernel_lib_compile/sh/compile/scale.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/scale.sh similarity index 59% rename from gcl/tools/kernel_lib_compile/sh/compile/scale.sh rename to common/gcl/tools/kernel_lib_compile/sh/compile/scale.sh index f8cdd99f..d3e3e61b 100644 --- a/gcl/tools/kernel_lib_compile/sh/compile/scale.sh +++ 
b/common/gcl/tools/kernel_lib_compile/sh/compile/scale.sh @@ -4,6 +4,8 @@ for file in * if [[ "${file}" == "scale.cl" ]];then echo ./gcl_binary --input=$file --output=${file%.*}_nobeta.bin --options=\"${copt} -D MD=nobeta \" echo ./gcl_binary --input=$file --output=${file%.*}_beta.bin --options=\"${copt} -D MD=beta -DUSE_BETA\" + echo ./gcl_binary --input=$file --output=${file%.*}1_nobeta.bin --options=\"${copt} -D MD=nobeta -DUSE_SAME\" + echo ./gcl_binary --input=$file --output=${file%.*}1_beta.bin --options=\"${copt} -D MD=beta -DUSE_BETA -DUSE_SAME\" fi fi done diff --git a/gcl/tools/kernel_lib_compile/sh/compile/slice_h.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/slice_h.sh similarity index 100% rename from gcl/tools/kernel_lib_compile/sh/compile/slice_h.sh rename to common/gcl/tools/kernel_lib_compile/sh/compile/slice_h.sh diff --git a/common/gcl/tools/kernel_lib_compile/sh/compile/transpose_nchw.sh b/common/gcl/tools/kernel_lib_compile/sh/compile/transpose_nchw.sh new file mode 100644 index 00000000..af18b26a --- /dev/null +++ b/common/gcl/tools/kernel_lib_compile/sh/compile/transpose_nchw.sh @@ -0,0 +1,14 @@ +for file in * + do + if [ "${file##*.}"x = "cl"x ];then + if [[ "${file}" == "transpose_nchw.cl" ]];then + echo ./gcl_binary --input=$file --output=${file%.*}_0231.bin --options=\"${copt} -D OC=2 -D OH=3 -D OW=1\" + echo ./gcl_binary --input=$file --output=${file%.*}_0213.bin --options=\"${copt} -D OC=2 -D OH=1 -D OW=3\" + echo ./gcl_binary --input=$file --output=${file%.*}_0312.bin --options=\"${copt} -D OC=3 -D OH=1 -D OW=2\" + echo ./gcl_binary --input=$file --output=${file%.*}_0321.bin --options=\"${copt} -D OC=3 -D OH=2 -D OW=1\" + fi + fi + done + + + diff --git a/gcl/tools/kernel_lib_compile/sh/packKernelBin.sh b/common/gcl/tools/kernel_lib_compile/sh/packKernelBin.sh similarity index 97% rename from gcl/tools/kernel_lib_compile/sh/packKernelBin.sh rename to common/gcl/tools/kernel_lib_compile/sh/packKernelBin.sh index f6d5d34a..3de6d230 100644 --- a/gcl/tools/kernel_lib_compile/sh/packKernelBin.sh +++ b/common/gcl/tools/kernel_lib_compile/sh/packKernelBin.sh @@ -39,7 +39,7 @@ for((i=1;i $srcPath/$InlineHead echo "#define _${UpperInlineHead}_H" >> $srcPath/$InlineHead - echo "#include \""type.h"\"" >> $srcPath/$InlineHead + echo "#include \""types.h"\"" >> $srcPath/$InlineHead echo >> $srcPath/$InlineHead echo "#include \""${Head}"\"" > $srcPath/$InlineCpp diff --git a/gcl/tools/kernel_lib_compile/sh/sh.config b/common/gcl/tools/kernel_lib_compile/sh/sh.config similarity index 100% rename from gcl/tools/kernel_lib_compile/sh/sh.config rename to common/gcl/tools/kernel_lib_compile/sh/sh.config diff --git a/common/gcl/tools/kernel_source_compile/CMakeLists.txt b/common/gcl/tools/kernel_source_compile/CMakeLists.txt new file mode 100644 index 00000000..11429d21 --- /dev/null +++ b/common/gcl/tools/kernel_source_compile/CMakeLists.txt @@ -0,0 +1,35 @@ +cmake_minimum_required(VERSION 3.2) + +file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake) +if (BOLT_CONFIGURE_FILE) + include(${BOLT_CONFIGURE_FILE}) +else (BOLT_CONFIGURE_FILE) + message(FATAL_ERROR " +FATAL: can not find bolt.cmake in directory, + please set shell or cmake environment variable BOLT_ROOT. 
+ ") +endif (BOLT_CONFIGURE_FILE) + +project(kernelsource) + +set_c_cxx_flags() + +execute_process( + COMMAND bash buildKernelSourceLib.sh + WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" +) + +file(GLOB source_srcs "src/cl/*.cpp") +file(GLOB option_srcs "src/option/*.cpp") +set(kernel_source_list "${source_srcs};${option_srcs}") + +add_library(${PROJECT_NAME} SHARED ${kernel_source_list}) +add_library(${PROJECT_NAME}_static STATIC ${kernel_source_list}) +set_target_properties(${PROJECT_NAME}_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}") +set_target_properties(${PROJECT_NAME} PROPERTIES CLEAN_DIRECT_OUTPUT 1) +set_target_properties(${PROJECT_NAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) +install(TARGETS ${PROJECT_NAME} ${PROJECT_NAME}_static + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) +install(FILES ${CMAKE_BINARY_DIR}/libOpenCL.so + DESTINATION lib) diff --git a/common/gcl/tools/kernel_source_compile/buildKernelSourceLib.sh b/common/gcl/tools/kernel_source_compile/buildKernelSourceLib.sh new file mode 100644 index 00000000..9d157e46 --- /dev/null +++ b/common/gcl/tools/kernel_source_compile/buildKernelSourceLib.sh @@ -0,0 +1,22 @@ +workPath=${BOLT_ROOT}/common/gcl/tools/kernel_source_compile +#echo "Build OpenCL kernel source in ${workPath}" +cd ${workPath} + +if [ -d "src" ]; then + rm -rf src +fi +mkdir src +mkdir src/cl +mkdir src/option + +if [ -d "include" ]; then + rm -rf include +fi +mkdir include + +headfile=${BOLT_ROOT}/common/uni/include/ +cd ${workPath}/kernel_cl2char/ +g++ -g -std=c++11 cl2char.cpp -o gcl_cl2char -I ${headfile} +./gcl_cl2char +rm gcl_cl2char +cd ${workPath} diff --git a/common/gcl/tools/kernel_source_compile/kernel_cl2char/cl2char.cpp b/common/gcl/tools/kernel_source_compile/kernel_cl2char/cl2char.cpp new file mode 100644 index 00000000..d6fd8fde --- /dev/null +++ b/common/gcl/tools/kernel_source_compile/kernel_cl2char/cl2char.cpp @@ -0,0 +1,540 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <unistd.h> +#include <fcntl.h> +#include <dirent.h> +#include <string> +#include <vector> +#include <map> +#include <fstream> +#include "types.h" + +typedef struct { + std::string kernel; + U32 len; + bool use_kernel_def_head; +} KernelInfo; + +typedef struct { + std::string sourceName; + std::string option; + bool use_common_opt; +} OptionInfo; + +inline std::vector<std::string> buildFileNames(std::string path, std::string postfix) +{ + struct dirent *dirTp; + DIR *handle = opendir(path.c_str()); + std::vector<std::string> names; + if (handle != NULL) { + while ((dirTp = readdir(handle)) != NULL) { + std::string clFileName = dirTp->d_name; + U32 len = clFileName.size(); + U32 postfix_len = postfix.size(); + if (len > postfix_len) { + if (clFileName.substr(len - postfix_len) == postfix) { + clFileName.erase(len - postfix_len, postfix_len); + names.push_back(clFileName); + } + } + } + } else { + UNI_ERROR_LOG("opendir %s failed\n", path.c_str()); + } + closedir(handle); + return names; +} + +inline std::map<std::string, KernelInfo> buildClMap(std::vector<std::string> clNames, + std::vector<std::string> clPaths, + std::vector<int> clNamesIndex, + std::string postfix) +{ + std::map<std::string, KernelInfo> clMap; + for (int ii = 0; ii < clPaths.size(); ii++) { + std::string clPath = clPaths[ii]; + int be = (ii - 1) < 0 ? 0 : clNamesIndex[ii - 1]; + int end = clNamesIndex[ii]; + for (int i = be; i < end; i++) { + KernelInfo kernelInfo; + std::string clName = clNames[i]; + std::string fileName = clPath + clName + postfix; + int fd = open(fileName.c_str(), O_RDONLY); + if (-1 == fd) { + UNI_ERROR_LOG("Cannot open file. Name: %s\n", fileName.c_str()); + } + + struct stat ss; + if (-1 == fstat(fd, &ss)) { + UNI_ERROR_LOG( + "Cannot get size from file descriptor. File Name: %s\n", fileName.c_str()); + } + + int fileLength = ss.st_size; + char *bytes = (char *)mmap(nullptr, fileLength, PROT_READ, MAP_SHARED, fd, 0); + if (MAP_FAILED == bytes) { + UNI_ERROR_LOG("Mmap failed. File Name: %s\n", fileName.c_str()); + } + std::string fileContent = (const char *)bytes; + int note_pos = -1; + int j = 0; + + for (; j < fileContent.size() - 1; j++) { + if (fileContent[j] == '/' && note_pos < 0) { + if (fileContent[j + 1] == '/') { + note_pos = j; + continue; + } + } + + if (fileContent[j] == '\n' && note_pos >= 0) { + fileContent.erase(note_pos, j - note_pos); + j = note_pos; + note_pos = -1; + } + } + note_pos = -1; + for (j = 0; j < fileContent.size() - 1; j++) { + if (fileContent[j] == '/' && note_pos < 0) { + if (fileContent[j + 1] == '*') { + note_pos = j; + continue; + } + } + + if (fileContent[j] == '*' && note_pos >= 0) { + if (fileContent[j + 1] == '/') { + fileContent.erase(note_pos, j - note_pos + 2); + j = note_pos; + note_pos = -1; + } + } + } + + for (j = 0; j < fileContent.size() - 1; j++) { + if (fileContent[j] == '\r') { + fileContent.erase(j, 1); + j = j - 1; + } + } + + for (j = 0; j < fileContent.size() - 1; j++) { + if (fileContent[j] == '\n') { + if (fileContent[j + 1] == '\n') { + fileContent.erase(j, 1); + j = j - 1; + } + } + } + if (fileContent[0] == '\n') { + fileContent.erase(0, 1); + } + if (fileContent[fileContent.size() - 1] == '\n') { + fileContent.erase(fileContent.size() - 1, 1); + } + kernelInfo.len = fileContent.size(); + + std::string kernel_def_head = "kernel_def.h"; + kernelInfo.use_kernel_def_head = false; + if (fileContent.find(kernel_def_head, 0) != -1) { + kernelInfo.use_kernel_def_head = true; + } + + std::string substr_a = "\\"; + std::string substr_b = "\\n\""; + std::string substr_c = "\""; + U32 sublen_a = substr_a.size(); + U32 sublen_b = substr_b.size(); + U32 sublen_c = substr_c.size(); + + for (j = 0; j < fileContent.size() - 1; j++) { + if (fileContent[j] == '\\') { + fileContent.insert(j, substr_a); + j += sublen_a + 1; + } + } + + for (j = 0; j < fileContent.size() - 1; j++) { + if (fileContent[j] == '"') { + fileContent.insert(j, substr_a); + j += sublen_a + 1; + } + } + + for (j = 0; j < fileContent.size() - 1; j++) { + if (fileContent[j] == '\n') { + fileContent.insert(j, substr_b); + j += sublen_b + 1; + fileContent.insert(j, substr_c); + j += sublen_c; + } + } + fileContent.insert(0, substr_c); + fileContent.insert(fileContent.size(), substr_b); + kernelInfo.kernel = fileContent; + clMap[clName] = kernelInfo; + munmap(bytes, fileLength); + if (-1 != fd) { + close(fd); + } + } + } + return clMap; +} + +inline std::map<std::string, std::string> buildClOptionMap( + std::vector<std::string> optionNames, std::string optionPath, std::string postfix) +{ + std::map<std::string, std::string> optionMap; + for (int i = 0; i < optionNames.size(); i++) { + std::string optionName = optionNames[i]; + std::string fileName = optionPath + optionName + postfix; + int fd = open(fileName.c_str(), O_RDONLY); + if (-1 == fd) { + UNI_ERROR_LOG("Cannot open file. Name: %s\n", fileName.c_str()); + } + + struct stat ss; + if (-1 == fstat(fd, &ss)) { + UNI_ERROR_LOG("Cannot get size from file descriptor. File Name: %s\n", fileName.c_str()); + } + + int fileLength = ss.st_size; + char *bytes = (char *)mmap(nullptr, fileLength, PROT_READ, MAP_SHARED, fd, 0); + if (MAP_FAILED == bytes) { + UNI_ERROR_LOG("Mmap failed. File Name: %s\n", fileName.c_str()); + } + std::string fileContent = (const char *)bytes; + int note_pos = -1; + int j = 0; + + for (; j < fileContent.size() - 1; j++) { + if (fileContent[j] == '#' && note_pos < 0) { + note_pos = j; + continue; + } + + if (fileContent[j] == '\n' && note_pos >= 0) { + fileContent.erase(note_pos, j - note_pos); + j = note_pos; + note_pos = -1; + } + } + + for (j = 0; j < fileContent.size() - 1; j++) { + if (fileContent[j] == '\r') { + fileContent.erase(j, 1); + j = j - 1; + } + } + + for (j = 0; j < fileContent.size() - 1; j++) { + if (fileContent[j] == '\n') { + if (fileContent[j + 1] == '\n') { + fileContent.erase(j, 1); + j = j - 1; + } + } + } + if (fileContent[0] == '\n') { + fileContent.erase(0, 1); + } + if (fileContent[fileContent.size() - 1] == '\n') { + fileContent.erase(fileContent.size() - 1, 1); + } + optionMap[optionName] = fileContent; + munmap(bytes, fileLength); + if (-1 != fd) { + close(fd); + } + } + return optionMap; +} + +inline std::map<std::string, OptionInfo> buildClOptionExpandMap( + std::map<std::string, std::string> optionMap) +{ + std::map<std::string, OptionInfo> optionMapExpand; + std::string output_flag = "--output="; + std::string option_flag = "\\\""; + std::string postfix = ".bin"; + std::string replace_name = "${file%.*}"; + std::string common_opt = "${copt}"; + for (auto p : optionMap) { + std::string name = p.first; + std::string option = p.second; + OptionInfo optionInfo; + optionInfo.sourceName = name; + int pos = option.find(output_flag, 0); + while (pos != -1) { + int be = pos + output_flag.size(); + int end = option.find(" ", be); + std::string expandName = option.substr(be, end - be); + expandName.erase(expandName.size() - postfix.size(), postfix.size()); + expandName.replace(0, replace_name.size(), name); + + pos = option.find(option_flag, end); + be = pos + option_flag.size(); + end = option.find(option_flag, be); + std::string expandOption = option.substr(be, end - be); + int common_opt_pos = expandOption.find(common_opt, 0); + if (common_opt_pos == -1) { + optionInfo.use_common_opt = false; + } else { + optionInfo.use_common_opt = true; + if (name == "common") { + expandOption.replace(0, common_opt.size(), + "-D T=half -D T2=half2 -D T3=half3 -D T4=half4 -D T8=half8 -D T16=half16 " + "-DUSE_HALF"); + } else { + expandOption.erase(common_opt_pos, common_opt.size()); + } + } + pos = option.find(output_flag, end); + optionInfo.option = expandOption; + optionMapExpand[expandName] = optionInfo; + } + } + return optionMapExpand; +} + +inline std::string produce_inline_cl_source_head(std::vector<std::string> clNames) +{ + std::string source_head = ""; + for (auto p : clNames) { + std::string func = "source_" + p; + source_head += "extern bool " + func + "_head;\n"; + source_head += "extern const unsigned int " + func + "_len;\n"; + source_head += "extern const char " + func + "[];\n"; + } + return source_head; +} + +inline std::string produce_inline_cl_option_head(std::vector<std::string> optionNamesExpand) +{ + std::string option_head = ""; + for (auto p : optionNamesExpand) { + std::string func = "option_" + p; + option_head += "extern bool " + func + "_common;\n"; + option_head += "extern const char " + func + "_source_name[];\n"; + option_head += "extern const char " + func + "[];\n"; + } + return option_head; +} + +inline std::string produce_inline_cl_source(std::vector<std::string> clNames) +{ + std::string source = ""; + for (auto p : clNames) { + std::string func = "source_" + p; + source += " put_source(\"" + p + "\", " + "{" + func + ", " + func + "_len, " + func + + "_head});\n"; + } + return source; +} + +inline std::string produce_inline_cl_option(std::vector<std::string> optionNamesExpand) +{ + std::string source = ""; + for (auto p : optionNamesExpand) { + std::string func = "option_" + p; + source += " put_option(\"" + p + "\", " + "{" + func + ", " + func + "_source_name, " + + func + "_common});\n"; + } + return source; +} + +inline std::string produce_kernel_source(std::string name, KernelInfo kernelInfo) +{ + name = "source_" + name; + std::string source = ""; + bool use_kernel_def_head = kernelInfo.use_kernel_def_head; + U32 len = kernelInfo.len; + source += "bool " + name + "_head = " + std::to_string(use_kernel_def_head) + ";\n"; + source += "const unsigned int " + name + "_len = " + std::to_string(len) + ";\n"; + source += "const char " + name + "[] = \n"; + source += kernelInfo.kernel; + source += ";\n"; + return source; +} + +inline std::string produce_option_source(std::string name, OptionInfo optionInfo) +{ + name = "option_" + name; + std::string source = ""; + source += "bool " + name + "_common = " + std::to_string(optionInfo.use_common_opt) + ";\n"; + source += "const char " + name + "_source_name[] = "; + source += "\""; + source += optionInfo.sourceName; + source += "\";\n"; + source += "const char " + name + "[] = "; + source += "\""; + source += optionInfo.option; + source += "\";\n"; + return source; +} + +inline void write_to_file(std::string str, std::string path, std::string name) +{ + std::string fileName = path + name; + std::ofstream file(fileName.c_str()); + if (file.is_open()) { + file << str.c_str(); + file.close(); + } else { + UNI_ERROR_LOG("fail to write file %s\n", fileName.c_str()); + } +} + +int main() +{ + CI8 *boltEnv = getenv("BOLT_ROOT"); + if (boltEnv == NULL) { + UNI_ERROR_LOG("BOLT_ROOT env value has not been set successfully\n"); + }; + std::string boltPath = boltEnv; + CI8 lastFlag = boltPath[boltPath.length() - 1]; + if (lastFlag != '/') { + boltPath += "/"; + } + std::string tensorComputingClPath = "compute/tensor/src/gpu/mali/cl/"; + std::string imageClPath = "compute/image/src/gpu/mali/cl/"; + tensorComputingClPath = boltPath + tensorComputingClPath; + imageClPath = boltPath + imageClPath; + + std::string clOptionPath = "common/gcl/tools/kernel_lib_compile/sh/compile/"; + clOptionPath = boltPath + clOptionPath; + + // std::string samplePath = "gcl/tools/gcl_sample/cl/"; + // samplePath = boltPath + samplePath; + + std::vector<std::string> clPath; + clPath.push_back(tensorComputingClPath); + clPath.push_back(imageClPath); + // clPath.push_back(samplePath); + + std::vector<std::string> clNames; + std::vector<std::string> headNames; + std::vector<int> clNamesIndex; + std::vector<int> headNamesIndex; + + for (auto p : clPath) { + std::vector<std::string> clName; + std::vector<std::string> headName; + headName = buildFileNames(p, ".h"); + clName = buildFileNames(p, ".cl"); + clNames.insert(clNames.end(), clName.begin(), clName.end()); + headNames.insert(headNames.end(), headName.begin(), headName.end()); + clNamesIndex.push_back(clNames.size()); + headNamesIndex.push_back(headNames.size()); + } + + std::vector<std::string> clOptionNames; + std::vector<std::string> clOptionNamesExpand; + clOptionNames = buildFileNames(clOptionPath, ".sh"); + + std::map<std::string, KernelInfo> headMap; + std::map<std::string, KernelInfo> clMap; + std::map<std::string, std::string> clOptionMap; + std::map<std::string, OptionInfo> clOptionMapExpand; + headMap = buildClMap(headNames, clPath, headNamesIndex, ".h"); + clMap = buildClMap(clNames, clPath, clNamesIndex, ".cl"); + clOptionMap = buildClOptionMap(clOptionNames, clOptionPath, ".sh"); + + std::string filePath = "common/gcl/tools/kernel_source_compile/include/"; + filePath = boltPath + filePath; + std::string
kernel_source_executor; + kernel_source_executor = "#ifndef _LIBKERNELSOURCE_H\n"; + kernel_source_executor += "#define _LIBKERNELSOURCE_H\n"; + kernel_source_executor += "#include \"gcl_kernel_source.h\"\n"; + kernel_source_executor += "class kernel_source_executor : public gcl_kernel_source {\n"; + kernel_source_executor += "public:\n"; + kernel_source_executor += " kernel_source_executor() {\n"; + kernel_source_executor += " loadKernelSource();\n"; + kernel_source_executor += " loadKernelOption();\n"; + kernel_source_executor += " }\n"; + kernel_source_executor += " void loadKernelSource();\n"; + kernel_source_executor += " void loadKernelOption();\n"; + kernel_source_executor += "};\n"; + kernel_source_executor += "#endif\n"; + write_to_file(kernel_source_executor, filePath, "libkernelsource.h"); + + filePath = "common/gcl/tools/kernel_source_compile/src/cl/"; + filePath = boltPath + filePath; + std::string inline_cl_source_head; + inline_cl_source_head = "#ifndef _INLINE_CL_SOURCE_HEAD\n"; + inline_cl_source_head += "#define _INLINE_CL_SOURCE_HEAD\n"; + inline_cl_source_head += produce_inline_cl_source_head(headNames); + inline_cl_source_head += produce_inline_cl_source_head(clNames); + inline_cl_source_head += "#endif\n "; + write_to_file(inline_cl_source_head, filePath, "inline_cl_source_head.h"); + + std::string inline_cl_source; + inline_cl_source = "#include \"libkernelsource.h\"\n"; + inline_cl_source += "#include \"inline_cl_source_head.h\"\n"; + inline_cl_source += "void kernel_source_executor::loadKernelSource() {\n"; + inline_cl_source += produce_inline_cl_source(headNames); + inline_cl_source += produce_inline_cl_source(clNames); + inline_cl_source += "}\n"; + write_to_file(inline_cl_source, filePath, "inline_cl_source.cpp"); + + std::string kernel_source = "#include \"inline_cl_source_head.h\"\n"; + for (auto p : headMap) { + std::string name = p.first; + KernelInfo kernelInfo = p.second; + kernel_source += produce_kernel_source(name, kernelInfo); + } + for (auto p : clMap) { + std::string name = p.first; + KernelInfo kernelInfo = p.second; + kernel_source += produce_kernel_source(name, kernelInfo); + } + write_to_file(kernel_source, filePath, "gcl_kernel_source.cpp"); + + clOptionMapExpand = buildClOptionExpandMap(clOptionMap); + for (auto p : clOptionMapExpand) { + clOptionNamesExpand.push_back(p.first); + } + filePath = "common/gcl/tools/kernel_source_compile/src/option/"; + filePath = boltPath + filePath; + std::string inline_cl_option_head; + inline_cl_option_head = "#ifndef _INLINE_CL_OPTION_HEAD\n"; + inline_cl_option_head += "#define _INLINE_CL_OPTION_HEAD\n"; + inline_cl_option_head += produce_inline_cl_option_head(clOptionNamesExpand); + inline_cl_option_head += "#endif\n "; + write_to_file(inline_cl_option_head, filePath, "inline_cl_option_head.h"); + + std::string inline_cl_option; + inline_cl_option = "#include \"libkernelsource.h\"\n"; + inline_cl_option += "#include \"inline_cl_option_head.h\"\n"; + inline_cl_option += "void kernel_source_executor::loadKernelOption() {\n"; + inline_cl_option += produce_inline_cl_option(clOptionNamesExpand); + inline_cl_option += "}\n"; + write_to_file(inline_cl_option, filePath, "inline_cl_option.cpp"); + + std::string option_source = "#include \"inline_cl_option_head.h\"\n"; + for (auto p : clOptionMapExpand) { + std::string name = p.first; + OptionInfo optionInfo = p.second; + option_source += produce_option_source(name, optionInfo); + } + write_to_file(option_source, filePath, "gcl_kernel_option.cpp"); + return 0; +} diff 
--git a/common/memory/include/memory.hpp b/common/memory/include/memory.hpp new file mode 100644 index 00000000..2a968edc --- /dev/null +++ b/common/memory/include/memory.hpp @@ -0,0 +1,52 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _MEMORY_H +#define _MEMORY_H + +#include <memory> +#include "tensor_desc.h" + +typedef enum { OCLMem = 0, CPUMem = 1 } MemoryType; + +class Memory { +public: + Memory() + {} + + virtual ~Memory() = default; + + virtual MemoryType get_mem_type() = 0; + + virtual std::shared_ptr<Memory> clone(bool allocate = true) = 0; + + virtual void resize(TensorDesc desc) = 0; + + virtual void alloc() = 0; + + virtual EE reuse(Memory *other) = 0; + + virtual EE copy_from(Memory *other) = 0; + + virtual EE copy_to(Memory *other) + { + return other->copy_from(this); + } + + virtual U32 length() = 0; + virtual U32 bytes() = 0; + virtual U32 capacity() = 0; + virtual std::string string(U32 num, F32 factor) = 0; + virtual F32 element(U32 index) = 0; +}; +#endif diff --git a/common/memory/include/memory_cpu.hpp b/common/memory/include/memory_cpu.hpp new file mode 100644 index 00000000..a5018345 --- /dev/null +++ b/common/memory/include/memory_cpu.hpp @@ -0,0 +1,172 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
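Memory above is the abstract contract shared by the CPU and OpenCL backends; CpuMemory below implements it for host buffers. A minimal usage sketch, assuming only the types declared in this patch (the shape is illustrative):

    #include "memory_cpu.hpp"

    void cpu_memory_demo()
    {
        TensorDesc desc;      // descriptor struct from tensor_desc.h
        desc.dt = DT_F32;     // element type
        desc.df = DF_NCHW;    // data format
        desc.nDims = 2;
        desc.dims[0] = 8;
        desc.dims[1] = 4;

        CpuMemory src, dst;
        src.resize(desc);     // records the shape; no allocation yet
        src.alloc();          // allocates bytes() = 8 * 4 * sizeof(float)
        dst.resize(desc);
        dst.copy_from(&src);  // lazily allocates dst, then copies min(src, dst) bytes
    }

Note the two-phase resize()/alloc() design: resize() only invalidates the buffer when the new shape needs more bytes than capacity(), so shrinking a tensor reuses the existing allocation.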
+ +#ifndef _MEMORY_CPU_H +#define _MEMORY_CPU_H +#include <string> +#include "memory.hpp" + +class CpuMemory : public Memory { +public: + CpuMemory() + { + this->capacitySize = 0; + this->allocated = false; + } + + ~CpuMemory() = default; + + std::shared_ptr<Memory> clone(bool allocate) override + { + CpuMemory *mem = new CpuMemory(); + mem->desc = this->desc; + if (allocate) { + mem->alloc(); + } + return std::shared_ptr<Memory>(mem); + } + + MemoryType get_mem_type() override + { + return CPUMem; + } + + void resize(TensorDesc desc) override + { + this->desc = desc; + if (tensorNumBytes(desc) > this->capacity()) { + this->allocated = false; + } + } + + void alloc() override + { + auto size = this->bytes(); + if (!this->allocated && size > this->capacity()) { + this->capacitySize = size; + this->val = std::shared_ptr<U8>((U8 *)operator new(size)); + } + this->allocated = true; + } + + TensorDesc get_desc() + { + return this->desc; + } + + void set_ptr(U8 *val) + { + this->set_shared_ptr(std::shared_ptr<U8>(val)); + } + + void *get_ptr() + { + return this->val.get(); + } + + void set_shared_ptr(std::shared_ptr<U8> val) + { + this->val = val; + this->allocated = true; + this->capacitySize = this->bytes(); + } + + std::shared_ptr<U8> get_shared_ptr() + { + return this->val; + } + + U32 length() override + { + return tensorNumElements(this->desc); + } + + U32 bytes() override + { + return tensorNumBytes(this->desc); + } + + U32 capacity() override + { + return this->capacitySize; + } + + EE reuse(Memory *other) override + { + EE ret; + if (other->get_mem_type() != CPUMem) { + ret = this->copy_from(other); + } else { + U32 other_size = other->capacity(); + if (other_size >= this->bytes()) { + this->set_shared_ptr(((CpuMemory *)other)->get_shared_ptr()); + this->capacitySize = other->capacity(); + ret = SUCCESS; + } else { + UNI_ERROR_LOG("Small CPU memory cannot meet big CPU memory demand\n"); + ret = NOT_SUPPORTED; + } + } + return ret; + } + + EE copy_from(Memory *other) override + { + if (!this->allocated) { + this->alloc(); + } + if (CPUMem == other->get_mem_type()) { + auto *src = ((CpuMemory *)other)->val.get(); + auto *dst = this->val.get(); + auto dst_size = this->bytes(); + auto src_size = other->bytes(); + U32 min_size = UNI_MIN(src_size, dst_size); + U32 max_size = UNI_MAX(src_size, dst_size); + if (min_size <= 0) { + min_size = max_size; + } + UNI_memcpy(dst, src, min_size); + } else { + //todo + } + return SUCCESS; + } + + std::string string(U32 num, F32 factor) override + { + std::string line = "desc: " + tensorDesc2Str(this->desc) + " data:"; + for (U32 i = 0; i < num; i++) { + line = line + std::to_string(this->element(i) * factor) + " "; + } + return line; + } + + F32 element(U32 index) override + { + U8 *res = (U8 *)this->get_ptr(); + U32 offset = bytesOf(this->desc.dt) * index; + F32 value; + transformToFloat(this->desc.dt, res + offset, &value, 1); + return value; + } + +private: + // actual byte capacity of val + U32 capacitySize; + std::shared_ptr<U8> val; + + TensorDesc desc; + + bool allocated; +}; +#endif diff --git a/common/memory/include/memory_ocl.hpp b/common/memory/include/memory_ocl.hpp new file mode 100644 index 00000000..9129c7d9 --- /dev/null +++ b/common/memory/include/memory_ocl.hpp @@ -0,0 +1,294 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _MEMORY_OCL_H +#define _MEMORY_OCL_H + +#include "memory.hpp" +#include "gcl.h" +#include "ocl_data_alloc.h" +#include "ocl_data_trans.h" + +class OclMemory : public Memory { +public: + OclMemory() + { + memset(&(this->desc), 0, sizeof(GCLMemDesc)); + this->desc.memFormat = DF_NCHW; + this->allocated = false; + this->mapped = false; + this->capacitySize = 0; + } + + ~OclMemory() = default; + + MemoryType get_mem_type() override + { + return OCLMem; + } + + std::shared_ptr<Memory> clone(bool allocate) override + { + OclMemory *mem = new OclMemory(); + mem->desc = this->desc; + if (allocate) { + mem->alloc(); + } + return std::shared_ptr<Memory>(mem); + } + + void resize(TensorDesc desc) override + { + this->desc.nDims = desc.nDims; + for (U32 i = 0; i < desc.nDims; i++) { + this->desc.dims[i] = desc.dims[i]; + } + this->desc.dt = desc.dt; + this->desc.df = desc.df; + if (this->desc.byteSize == 0) { + this->desc.memType = GCL_MEM_BUF; + this->desc.flags = CL_MEM_READ_WRITE; + } + if (tensorNumBytes(desc) > this->capacity()) { + this->allocated = false; + } + } + + void padding(GCLMemDesc desc) + { + if (desc.byteSize > this->capacity()) { + this->allocated = false; + } + for (U32 i = 0; i < 3; i++) { + this->desc.stride[i] = desc.stride[i]; + this->desc.offset[i] = desc.offset[i]; + } + this->desc.memType = desc.memType; + this->desc.memFormat = desc.memFormat; + this->desc.byteSize = desc.byteSize; + this->desc.num = desc.num; + this->desc.flags = desc.flags; + this->desc.imgFormat = desc.imgFormat; + this->desc.host_ptr = desc.host_ptr; + this->desc.need_pad = desc.need_pad; + } + + void alloc() override + { + if (this->desc.byteSize == 0) { + U32 num = (this->desc.nDims == 0) ? 0 : 1; + for (U32 i = 0; i < this->desc.nDims; i++) { + num *= this->desc.dims[i]; + } + this->desc.byteSize = num * bytesOf(this->desc.dt); + } + U32 size = this->desc.byteSize; + if (!this->allocated && size > this->capacity()) { + GCLMem_t mem = ocl_alloc_gclmem(this->desc); + this->val = std::shared_ptr<GCLMem>(mem, ocl_release_gclmem); + this->allocated = true; + this->capacitySize = size; + } + } + + GCLMemDesc get_desc() + { + return this->desc; + } + + EE copy_from(Memory *other) override + { + EE ret = SUCCESS; + if (other->get_mem_type() == CPUMem) { + U32 size = ((CpuMemory *)other)->bytes(); + void *host_ptr = ((CpuMemory *)other)->get_ptr(); + if (!allocated) { + U8 *tmp = nullptr; + if (size < this->desc.byteSize) { + tmp = (U8 *)operator new(this->desc.byteSize); + memset(tmp, 0, this->desc.byteSize); + memcpy(tmp, host_ptr, size); + host_ptr = tmp; + } + this->desc.host_ptr = host_ptr; + this->desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + this->alloc(); + if (tmp) { + delete tmp; + } + } else { + this->val->desc = this->desc; //TODO DELETE AFTER SPLITE DESC FROM GCLMEM + if (size > this->desc.byteSize) { + size = this->desc.byteSize; + } + CHECK_STATUS(gcl_trans_memory(OCLContext::getInstance().handle.get(), host_ptr, + this->val.get(), &size, HOST_TO_DEVICE_BUF, CL_TRUE)); + } + } else if (other->get_mem_type() == OCLMem) { + if (!allocated) { + this->alloc(); + } else { + GCLMemDesc srcDesc = ((OclMemory *)other)->get_desc(); + GCLMemType srcMt = srcDesc.memType; + GCLMemType dstMt = this->desc.memType; + void *srcPtr = ((OclMemory *)other)->get_ptr(); + void *dstPtr = this->val.get(); + if (srcMt != GCL_MEM_BUF && dstMt == GCL_MEM_BUF) { + if (srcDesc.byteSize > this->desc.byteSize) { + CHECK_STATUS(NOT_MATCH); + } + U32 region[3] = {srcDesc.stride[0], srcDesc.stride[1], srcDesc.stride[2]}; + CHECK_STATUS(gcl_trans_memory(OCLContext::getInstance().handle.get(), srcPtr, + dstPtr, region, DEVICE_IMG_TO_BUF, CL_TRUE)); + } else if (srcMt == GCL_MEM_BUF && dstMt != GCL_MEM_BUF) { + if (this->desc.byteSize > srcDesc.byteSize) { + CHECK_STATUS(NOT_MATCH); + } + U32 region[3] = { + this->desc.stride[0], this->desc.stride[1], this->desc.stride[2]}; + CHECK_STATUS(gcl_trans_memory(OCLContext::getInstance().handle.get(), srcPtr, + dstPtr, region, DEVICE_BUF_TO_IMG, CL_TRUE)); + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + } + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + return ret; + } + + void *get_ptr() + { + if (allocated) { + this->val->desc = this->desc; //TODO DELETE AFTER SPLITE DESC FROM GCLMEM + } + return this->val.get(); + } + + void set_shared_ptr(std::shared_ptr<GCLMem> val) + { + this->val = val; + this->allocated = true; + this->capacitySize = this->bytes(); + } + + std::shared_ptr<GCLMem> get_shared_ptr() + { + if (allocated) { + this->val->desc = this->desc; //TODO DELETE AFTER SPLITE DESC FROM GCLMEM + } + return this->val; + } + + void mapped_alloc() + { + this->desc.flags = CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR; + this->desc.byteSize *= 2; + this->allocated = this->mapped; + this->mapped = true; + this->alloc(); + } + + void *get_mapped_ptr() + { + if (!mapped) { + CHECK_STATUS(NOT_MATCH); + } + ocl_map_mem(OCLContext::getInstance().handle.get(), this->val.get(), this->desc); + return this->val->mapPtrArray.back(); + } + + EE reuse(Memory *other) override + { + EE ret; + if (other->get_mem_type() != OCLMem) { + ret = this->copy_from(other); + } else { + U32 size = other->capacity(); + if (size >= this->bytes()) { + this->val = ((OclMemory
*)other)->get_shared_ptr(); + this->allocated = true; + this->capacitySize = other->capacity(); + ret = SUCCESS; + } else { + UNI_ERROR_LOG("small OCL memory cannot meet big OCL memory demand\n"); + ret = NOT_SUPPORTED; + } + } + return ret; + } + + U32 length() override + { + return this->desc.num; + } + + U32 bytes() override + { + return this->desc.byteSize; + } + + U32 capacity() override + { + return this->capacitySize; + } + + std::string string(U32 num, F32 factor) override + { + std::string line = "desc: " + gclMemDesc2Str(this->desc) + "data: \n"; +#ifdef _DEBUG + DataType dt = (this->desc.dt == DT_U8) ? DT_F16 : this->desc.dt; + switch (dt) { + case DT_F16: + line += gcl_check_data<F16>( + OCLContext::getInstance().handle.get(), this->desc, get_ptr(), num, 0, false); + break; + case DT_I32: + line += gcl_check_data<I32>( + OCLContext::getInstance().handle.get(), this->desc, get_ptr(), num, 0, false); + break; + default: + UNI_ERROR_LOG("Currently do not support getting %d type OCL Memory\n", this->desc.dt); + break; + } +#else + if (mapped) { + for (U32 i = 0; i < num; i++) { + line += std::to_string(this->element(i) * factor) + " "; + } + } +#endif + return line; + } + + F32 element(U32 index) override + { + F32 result = 0; + if (this->mapped) { + F16 *res = (F16 *)this->val->mapPtrArray.back(); + result = res[index]; + } else { + UNI_ERROR_LOG("Currently do not support getting an element of unmapped OCL memory\n"); + } + return result; + } + +private: + GCLMemDesc desc; + std::shared_ptr<GCLMem> val; + U32 capacitySize; + bool allocated; + bool mapped; +}; +#endif diff --git a/common/memory/include/tensor.hpp b/common/memory/include/tensor.hpp new file mode 100644 index 00000000..25021765 --- /dev/null +++ b/common/memory/include/tensor.hpp @@ -0,0 +1,168 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
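Tensor below ties a shared Memory to a descriptor and a quantization scale, so plain copies of a Tensor share the same buffer until clone() is called. A minimal sketch of the intended call sequence, using only APIs defined in this patch (shape and values illustrative; the Arch value CPU_GENERAL is assumed from sys.h):

    #include "tensor.hpp"

    void tensor_demo()
    {
        TensorDesc desc;
        desc.dt = DT_F32;
        desc.df = DF_NCHW;
        desc.nDims = 1;
        desc.dims[0] = 16;

        Tensor t;                 // defaults to a CPUMem-backed tensor
        t.resize(desc);           // propagates the descriptor to the memory
        t.alloc();                // materializes the host buffer
        float *p = (float *)get_ptr_from_tensor(t, CPU_GENERAL);
        p[0] = 3.0f;              // fill before reading
        F32 v = t.element(0);     // reads back 3.0f, scaled by get_scale() when set
        Tensor deep = t.clone();  // deep copy: clones both memory and scale
        (void)v; (void)deep;
    }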
+ +#ifndef _TENSOR_H +#define _TENSOR_H + +#include <memory> +#include <string> +#include <vector> +#include <string.h> +#include "memory_cpu.hpp" +#ifdef _USE_MALI +#include "memory_ocl.hpp" +#endif + +class Tensor { +public: + Tensor(MemoryType memoryType = CPUMem) + { + if (memoryType == CPUMem) { + this->val = std::shared_ptr<Memory>(new CpuMemory()); + } else { +#ifdef _USE_MALI + this->val = std::shared_ptr<Memory>(new OclMemory()); +#else + UNI_ERROR_LOG("GPU Tensor is not supported in this build\n"); +#endif + } + this->scale = std::shared_ptr<F32>(new F32(-1.0)); + } + + Tensor clone(bool allocate = true) + { + Tensor tensor = *this; + tensor.val = this->val->clone(allocate); + tensor.scale = std::shared_ptr<F32>(new F32(tensor.get_scale())); + return tensor; + } + + void resize(TensorDesc desc) + { + this->desc = desc; + this->val->resize(desc); + } + + void alloc() + { + this->val->alloc(); + } + + template <MemoryType type> + static Tensor alloc_sized(TensorDesc desc) + { + Tensor tensor(type); + tensor.resize(desc); + tensor.alloc(); + return tensor; + } + + TensorDesc get_desc() + { + return this->desc; + } + + void set_scale(F32 scale) + { + *(this->scale) = scale; + } + + F32 get_scale() + { + return *(this->scale); + } + + void reuse(Tensor *other) + { + this->val->reuse(other->val.get()); + } + + void copy_from(Tensor *other) + { + this->desc = other->desc; + memcpy(this->scale.get(), other->scale.get(), sizeof(F32)); + this->val->copy_from(other->val.get()); + } + + void copy_to(Tensor *other) + { + other->copy_from(this); + } + + Memory *get_memory() + { + return this->val.get(); + } + + std::shared_ptr<Memory> get_shared_memory() + { + return this->val; + } + + U32 length() + { + return this->val->length(); + } + + U32 bytes() + { + return this->val->bytes(); + } + + U32 capacity() + { + return this->val->capacity(); + } + + std::string string(int length = -1) + { + int num = tensorNumElements(this->desc); + if (length >= 0 && length < num) { + num = length; + } + F32 factor = this->get_scale(); + factor = (factor == -1) ? 1 : factor; + std::string line = this->val->string(num, factor); + return line; + } + + F32 element(U32 index) + { + F32 factor = this->get_scale(); + factor = (factor == -1) ? 1 : factor; + return this->val->element(index) * factor; + } + +private: + TensorDesc desc; + std::shared_ptr<Memory> val; + std::shared_ptr<F32> scale; +}; + +#include "sys.h" + +// deprecated API, this will be removed +inline void *get_ptr_from_tensor(Tensor tensor, Arch arch) +{ + void *ptr = nullptr; + if (arch == MALI) { +#ifdef _USE_MALI + ptr = ((OclMemory *)(tensor.get_memory()))->get_ptr(); +#endif + } else { + ptr = ((CpuMemory *)(tensor.get_memory()))->get_ptr(); + } + return ptr; +} + +#endif // _TENSOR_H diff --git a/common/uni/CMakeLists.txt b/common/uni/CMakeLists.txt new file mode 100644 index 00000000..b4669c13 --- /dev/null +++ b/common/uni/CMakeLists.txt @@ -0,0 +1,19 @@ +cmake_minimum_required(VERSION 3.2) + +file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake) +if (BOLT_CONFIGURE_FILE) + include(${BOLT_CONFIGURE_FILE}) +else (BOLT_CONFIGURE_FILE) + message(FATAL_ERROR " +FATAL: can not find bolt.cmake in directory, + please set shell or cmake environment variable BOLT_ROOT.
+ ") +endif (BOLT_CONFIGURE_FILE) + +project(uni) + +set_c_cxx_flags() + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) + +add_subdirectory(src) diff --git a/common/uni/include/algorithm_map.h b/common/uni/include/algorithm_map.h new file mode 100644 index 00000000..80440e6c --- /dev/null +++ b/common/uni/include/algorithm_map.h @@ -0,0 +1,400 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _ALGORITHM_MAP_H +#define _ALGORITHM_MAP_H + +#include +#include +#include "thread_affinity.h" +#include "op_type.h" +#include "types.h" + +class AlgorithmMap { +public: + AlgorithmMap(Arch arch, std::string modelName, std::string deviceName, DataType dt) + { + this->algorithmFileName = "algorithmInfo_"; + this->algorithmFileName += deviceName; + this->algorithmFileName += "_"; + this->algorithmFileName += modelName; + this->algorithmFileName += "_"; + this->algorithmFileName += std::to_string(arch); + this->algorithmFileName += "_"; + this->algorithmFileName += std::to_string(dt); + this->hasAlgorithmFile = false; + this->arch = arch; + this->commonAlgoFileName = "commonAlgoInfo_"; + this->commonAlgoFileName += deviceName; + this->commonAlgoFileName += "_"; + this->commonAlgoFileName += std::to_string(arch); + this->commonAlgoFileName += "_"; + this->commonAlgoFileName += std::to_string(dt); + this->hasCommonAlgoFile = false; + } + + void setAlgorithmInfoToMap( + std::string name, I32 *algorithmArray, U32 arrayNum, bool commonAlgo = false) + { + std::string algoInfo = "/"; + for (U32 i = 0; i < arrayNum; i++) { + algoInfo += std::to_string(algorithmArray[i]); + algoInfo += "/"; + } + if (!commonAlgo) { + this->algorithmMap[name] = algoInfo; + } else { + this->commonAlgoMap[name] = algoInfo; + } + } + + bool getAlgorithmInfoFromMap( + std::string name, I32 *algorithmArray, U32 arrayNum, bool commonAlgo = false) + { + std::string algoInfo; + if (!commonAlgo) { + if (this->algorithmMap.find(name) == this->algorithmMap.end()) { + return false; + } + algoInfo = this->algorithmMap[name]; + } else { + if (this->commonAlgoMap.find(name) == this->commonAlgoMap.end()) { + return false; + } + algoInfo = this->commonAlgoMap[name]; + } + U32 be = algoInfo.find_first_of("/"); + U32 end; + for (U32 i = 0; i < arrayNum; i++) { + end = algoInfo.find("/", be + 1); + algorithmArray[i] = std::stoi(algoInfo.substr(be + 1, end - be - 1)); + be = end; + } + return true; + } + + void loadAlgorithmMapFromFileStream(const char 
*algoFileStream) + { + U32 be = 0; + be = readFileStreamForMap(algoFileStream, be, &this->algorithmMap); +#ifdef _USE_MALI + be = readFileStreamForMap(algoFileStream, be, &this->kernelThreadMap); +#endif + be = readFileStreamForMap(algoFileStream, be, &this->commonAlgoMap); + if (algorithmMap.size()) { + this->hasAlgorithmFile = true; + } + if (commonAlgoMap.size()) { + this->hasCommonAlgoFile = true; + } + } + + void loadAlgorithmMapFromText(std::string algorithmMapPath) + { + if (algorithmMapPath == std::string("")) { + UNI_DEBUG_LOG("load algorithm file failed, path is not set \n"); + return; + } + CI8 lastFlag = algorithmMapPath[algorithmMapPath.length() - 1]; + if (strcmp(&lastFlag, "/") != 0) { + algorithmMapPath += "/"; + } + this->hasAlgorithmFile = readTextForMap(algorithmFileName, algorithmMapPath, &algorithmMap); + this->hasCommonAlgoFile = + readTextForMap(commonAlgoFileName, algorithmMapPath, &commonAlgoMap); + } + + void saveAlgorithmMapToText(std::string algorithmMapPath) + { + if (algorithmMapPath == std::string("")) { + UNI_DEBUG_LOG("save algorithm file failed, path is not set \n"); + return; + } + if (this->hasAlgorithmFile) { + return; + } + CI8 lastFlag = algorithmMapPath[algorithmMapPath.length() - 1]; + if (strcmp(&lastFlag, "/") != 0) { + algorithmMapPath += "/"; + } + saveMapToText( + this->algorithmFileName, algorithmMapPath, this->algorithmMap, this->hasAlgorithmFile); + saveMapToText(this->commonAlgoFileName, algorithmMapPath, this->commonAlgoMap, + this->hasCommonAlgoFile); + } + + void getCommonAlgoMapPara(U32 *ic_step, + U32 *ihw_step, + U32 *fn_step, + U32 *ic_max, + U32 *ihw_max, + U32 *fn_max, + std::set *fwh, + std::set *stride) + { + if (ic_step) { + *ic_step = 16; + } + if (ihw_step) { + *ihw_step = 16; + } + if (fn_step) { + *fn_step = 16; + } + if (ic_max) { + *ic_max = 640; + } + if (ihw_max) { + *ihw_max = 640; + } + if (fn_max) { + *fn_max = 640; + } + if (fwh) { + (*fwh).insert(1); + (*fwh).insert(2); + (*fwh).insert(3); + (*fwh).insert(4); + (*fwh).insert(5); + (*fwh).insert(7); + } + if (stride) { + (*stride).insert(1); + (*stride).insert(2); + } + } + + void setCommonAlgoInfoToMap(OperatorType opType, + DataType dt, + U32 ic, + U32 ih, + U32 iw, + U32 fn, + U32 fh, + U32 fw, + U32 sh, + U32 sw, + I32 *algorithmArray, + U32 arrayNum) + { + std::string algoName = getCommonAlgoName(opType, dt, ic, ih, iw, fn, fh, fw, sh, sw); + setAlgorithmInfoToMap(algoName, algorithmArray, arrayNum, true); + } + + bool getCommonAlgoInfoFromMap(OperatorType opType, + DataType dt, + U32 ic, + U32 ih, + U32 iw, + U32 fn, + U32 fh, + U32 fw, + U32 sh, + U32 sw, + I32 *algorithmArray, + U32 arrayNum) + { + if (this->commonAlgoMap.size() == 0) { + return false; + } + U32 ic_step, ihw_step, fn_step, ic_max, ihw_max, fn_max; + std::set fwh; + std::set stride; + getCommonAlgoMapPara( + &ic_step, &ihw_step, &fn_step, &ic_max, &ihw_max, &fn_max, &fwh, &stride); + ic = ((ic + ic_step - 1) / ic_step) * ic_step; + ih = ((ih + ihw_step - 1) / ihw_step) * ihw_step; + iw = ((iw + ihw_step - 1) / ihw_step) * ihw_step; + fn = ((fn + fn_step - 1) / fn_step) * fn_step; + ic = (ic > ic_max) ? ic_max : ic; + ih = (ih > ihw_max) ? ihw_max : ih; + iw = (iw > ihw_max) ? ihw_max : iw; + fn = (fn > fn_max) ? fn_max : fn; + fw = (fw < fh) ? 
fh : fw; + while (fwh.find(fw) == fwh.end()) { + fw--; + } + while (stride.find(sw) == stride.end()) { + sw--; + } + std::string algoName = getCommonAlgoName(opType, dt, ic, ih, iw, fn, fh, fw, sh, sw); + return getAlgorithmInfoFromMap(algoName, algorithmArray, arrayNum, true); + } + +#ifdef _USE_MALI + void setKernelThreadInfoToMap(std::string name, U32 gs[3], U32 ls[3]) + { + std::string kernelThreadInfo = "/"; + for (U32 i = 0; i < 3; i++) { + kernelThreadInfo += std::to_string(gs[i]); + kernelThreadInfo += "/"; + } + for (U32 i = 0; i < 3; i++) { + kernelThreadInfo += std::to_string(ls[i]); + kernelThreadInfo += "/"; + } + kernelThreadMap[name] = kernelThreadInfo; + } + + bool getKernelThreadInfoFromMap(std::string name, U32 *gs, U32 *ls) + { + bool findKernelInfo = kernelThreadMap.count(name); + if (!findKernelInfo) { + return findKernelInfo; + } + std::string kernelThreadInfo = kernelThreadMap[name]; + U32 be = kernelThreadInfo.find_first_of("/"); + U32 end; + for (U32 i = 0; i < 3; i++) { + end = kernelThreadInfo.find("/", be + 1); + gs[i] = std::stoi(kernelThreadInfo.substr(be + 1, end - be - 1)); + be = end; + } + for (U32 i = 0; i < 3; i++) { + end = kernelThreadInfo.find("/", be + 1); + ls[i] = std::stoi(kernelThreadInfo.substr(be + 1, end - be - 1)); + be = end; + } + return findKernelInfo; + } +#endif + +private: + U32 readFileStreamForMap( + const char *algoFileStream, U32 be, std::map *targetMap) + { + int num; + std::string content(algoFileStream); + std::string numString = ""; + std::string nameString = ""; + std::string infoString = ""; + while (content[be] == '\n' || content[be] == '\r' || content[be] == '\t' || + content[be] == ' ') { + be++; + } + if (be >= content.size()) { + return content.size(); + } + if (content[be] == '\0') { + return be; + } + while (content[be] != '\n') { + numString += content[be]; + be++; + } + num = (numString.size()) ? 
std::stoi(numString) : 0; + for (int i = 0; i < num; i++) { + be++; + while (content[be] != ' ') { + nameString += content[be]; + be++; + } + be++; + while (content[be] != '\n') { + infoString += content[be]; + be++; + } + (*targetMap)[nameString] = infoString; + nameString = ""; + infoString = ""; + } + return be++; + } + + bool readTextForMap( + std::string fileName, std::string path, std::map *targetMap) + { + std::string fullyFileName = path + fileName; + FILE *file = fopen(fullyFileName.c_str(), "r"); + if (!file || feof(file)) { + return false; + } + UNI_INFO_LOG("load algorithmFile %s\n", fullyFileName.c_str()); + int num = 0; + fscanf(file, "%d", &num); + char operatorName[100]; + char algorithm[100]; + for (int i = 0; i < num; i++) { + fscanf(file, "%s %s", operatorName, algorithm); + (*targetMap)[operatorName] = algorithm; + } +#ifdef _USE_MALI + if (this->arch == MALI && fileName == this->algorithmFileName) { + fscanf(file, "%d", &num); + char kernelName[100]; + char kernelThreadInfo[100]; + for (int i = 0; i < num; i++) { + fscanf(file, "%s %s", kernelName, kernelThreadInfo); + kernelThreadMap[kernelName] = kernelThreadInfo; + } + } +#endif + fclose(file); + return true; + } + + void saveMapToText(std::string fileName, + std::string path, + std::map targetMap, + bool noNeedSave) + { + if (noNeedSave) { + return; + } + if (targetMap.size() > 0) { + std::string fullyFileName = path + fileName; + UNI_DEBUG_LOG("save algorithmFile %s\n", fullyFileName.c_str()); + FILE *file = fopen(fullyFileName.c_str(), "w"); + fprintf(file, "%ld\n", (I64)targetMap.size()); + for (auto iter : targetMap) { + fprintf(file, "%s %s\n", iter.first.c_str(), iter.second.c_str()); + } +#ifdef _USE_MALI + if (this->arch == MALI && fileName == this->algorithmFileName) { + fprintf(file, "%ld\n", (I64)kernelThreadMap.size()); + for (auto iter : kernelThreadMap) { + fprintf(file, "%s %s\n", iter.first.c_str(), iter.second.c_str()); + } + } +#endif + fclose(file); + } + } + + std::string getCommonAlgoName( + OperatorType opType, DataType dt, U32 ic, U32 ih, U32 iw, U32 fn, U32 fh, U32 fw, U32 sh, U32 sw) + { + std::string algoName = "op" + std::to_string(opType) + "dt" + std::to_string(dt); + algoName += "ic" + std::to_string(ic); + algoName += "ih" + std::to_string(ih); + algoName += "iw" + std::to_string(iw); + algoName += "fn" + std::to_string(fn); + algoName += "fh" + std::to_string(fh); + algoName += "fw" + std::to_string(fw); + algoName += "sh" + std::to_string(sh); + algoName += "sw" + std::to_string(sw); + return algoName; + } + + std::map algorithmMap; + std::string algorithmFileName; + Arch arch; + bool hasAlgorithmFile; +#ifdef _USE_MALI + std::map kernelThreadMap; +#endif + std::map commonAlgoMap; + std::string commonAlgoFileName; + bool hasCommonAlgoFile; +}; +#endif diff --git a/uni/include/arm_neon_expand.h b/common/uni/include/arm_neon_expand.h similarity index 59% rename from uni/include/arm_neon_expand.h rename to common/uni/include/arm_neon_expand.h index 489d76b0..74df296f 100644 --- a/uni/include/arm_neon_expand.h +++ b/common/uni/include/arm_neon_expand.h @@ -1,24 +1,23 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _H_ARM_NEON_EXPAND #define _H_ARM_NEON_EXPAND #include #include #include -#include "type.h" +#include "types.h" #include "error.h" #ifndef __aarch64__ @@ -37,7 +36,7 @@ inline float vmaxvq_f32(float32x4_t x) } #ifndef __ANDROID__ -inline float32x4_t vfmaq_f32(float32x4_t c, float32x4_t a, float32_t b) +inline float32x4_t vfmaq_f32(float32x4_t c, float32x4_t a, float32_t b) { return vmlaq_f32(c, a, vdupq_n_f32(b)); } @@ -65,42 +64,37 @@ inline unsigned int vaddvq_u32(uint32x4_t x) inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array &coeffs) { - float32x4_t A = vfmaq_f32(coeffs[0], coeffs[4], x); - float32x4_t B = vfmaq_f32(coeffs[2], coeffs[6], x); - float32x4_t C = vfmaq_f32(coeffs[1], coeffs[5], x); - float32x4_t D = vfmaq_f32(coeffs[3], coeffs[7], x); - float32x4_t x2 = vmulq_f32(x, x); - float32x4_t x4 = vmulq_f32(x2, x2); - float32x4_t res = vfmaq_f32(vfmaq_f32(A, B, x2), - vfmaq_f32(C, D, x2), - x4); + float32x4_t A = vfmaq_f32(coeffs[0], coeffs[4], x); + float32x4_t B = vfmaq_f32(coeffs[2], coeffs[6], x); + float32x4_t C = vfmaq_f32(coeffs[1], coeffs[5], x); + float32x4_t D = vfmaq_f32(coeffs[3], coeffs[7], x); + float32x4_t x2 = vmulq_f32(x, x); + float32x4_t x4 = vmulq_f32(x2, x2); + float32x4_t res = vfmaq_f32(vfmaq_f32(A, B, x2), vfmaq_f32(C, D, x2), x4); return res; } inline float32x4_t vexpq_f32_03_percent_error(float32x4_t x) { - const std::array exp_tab = - { - { - vdupq_n_f32(1.f), - vdupq_n_f32(0.0416598916054f), - vdupq_n_f32(0.500000596046f), - vdupq_n_f32(0.0014122662833f), - vdupq_n_f32(1.00000011921f), - vdupq_n_f32(0.00833693705499f), - vdupq_n_f32(0.166665703058f), - vdupq_n_f32(0.000195780929062f), - } - }; + const std::array exp_tab = {{ + vdupq_n_f32(1.f), + vdupq_n_f32(0.0416598916054f), + vdupq_n_f32(0.500000596046f), + 
vdupq_n_f32(0.0014122662833f), + vdupq_n_f32(1.00000011921f), + vdupq_n_f32(0.00833693705499f), + vdupq_n_f32(0.166665703058f), + vdupq_n_f32(0.000195780929062f), + }}; x = vminq_f32(x, vdupq_n_f32(88.3762626647949f)); - static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); - static const float32x4_t CONST_INV_LN2 = vdupq_n_f32(1.4426950408f); - static const float32x4_t CONST_0 = vdupq_n_f32(0.f); - static const int32x4_t CONST_NEGATIVE_14 = vdupq_n_s32(-14); + static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); + static const float32x4_t CONST_INV_LN2 = vdupq_n_f32(1.4426950408f); + static const float32x4_t CONST_0 = vdupq_n_f32(0.f); + static const int32x4_t CONST_NEGATIVE_14 = vdupq_n_s32(-14); - int32x4_t m = vcvtq_s32_f32(vmulq_f32(x, CONST_INV_LN2)); + int32x4_t m = vcvtq_s32_f32(vmulq_f32(x, CONST_INV_LN2)); float32x4_t val = vfmsq_f32(x, vcvtq_f32_s32(m), CONST_LN2); float32x4_t poly = vtaylor_polyq_f32(val, exp_tab); @@ -111,6 +105,32 @@ inline float32x4_t vexpq_f32_03_percent_error(float32x4_t x) return poly; } +inline float32x4_t vlogq_f32(float32x4_t x) +{ + uint32x4_t ux = vreinterpretq_u32_f32(x); + float32x4_t fx = vcvtq_f32_u32(ux); + // fx * (1.0f / (1 << 23)) + fx = vmulq_f32(fx, vdivq_f32(vdupq_n_f32(1.0f), vcvtq_f32_u32(vshlq_n_u32(vdupq_n_u32(1), 23)))); + + uint32x4_t umx = + vorrq_u32(vandq_u32(ux, vdupq_n_u32(0x007FFFFF)), vshlq_n_u32(vdupq_n_u32(0x7e), 23)); + float32x4_t mx = vreinterpretq_f32_u32(umx); + + const float32x4_t c_124_22551499 = vdupq_n_f32(124.22551499f); + const float32x4_t c_1_498030302 = vdupq_n_f32(1.498030302f); + const float32x4_t c_1_725877999 = vdupq_n_f32(1.72587999f); + const float32x4_t c_0_3520087068 = vdupq_n_f32(0.3520887068f); + + float32x4_t tmp = vdivq_f32(c_1_725877999, vaddq_f32(c_0_3520087068, mx)); + tmp = vaddq_f32(c_124_22551499, tmp); + tmp = vfmaq_f32(tmp, c_1_498030302, mx); + const float32x4_t c_0_69314718 = vdupq_n_f32(0.69314718f); + float32x4_t result_v = vmulq_f32(vsubq_f32(fx, tmp), c_0_69314718); + result_v = vbslq_f32(vcltq_f32(x, vdupq_n_f32(0)), vdupq_n_f32(NAN), result_v); + result_v = vbslq_f32(vceqq_f32(x, vdupq_n_f32(0)), vdupq_n_f32(-INFINITY), result_v); + return result_v; +} + inline float32x4_t vsigmoidq_f32(float32x4_t x) { float32x4_t one_v = vdupq_n_f32(1.f); @@ -122,7 +142,7 @@ inline float32x4_t vtanhq_f32(float32x4_t x) float32x4_t one_v = vdupq_n_f32(1.f); float32x4_t two_v = vdupq_n_f32(2.f); float32x4_t e_2G_v = vexpq_f32_03_percent_error(vmulq_f32(two_v, x)); - //float32x4_t result_v = vfmsq_f32(one_v, two_v, vrecpeq_f32(vaddq_f32(e_2G_v, one_v))); + // float32x4_t result_v = vfmsq_f32(one_v, two_v, vrecpeq_f32(vaddq_f32(e_2G_v, one_v))); float32x4_t result_v = vsubq_f32(one_v, vdivq_f32(two_v, vaddq_f32(one_v, e_2G_v))); return result_v; } @@ -144,42 +164,37 @@ inline float16x8_t vaddq_f16_f32(float16x8_t a, float16x8_t b) inline float16x8_t vtaylor_polyq_f16(float16x8_t x, const std::array &coeffs) { - float16x8_t A = vfmaq_f16(coeffs[0], coeffs[4], x); - float16x8_t B = vfmaq_f16(coeffs[2], coeffs[6], x); - float16x8_t C = vfmaq_f16(coeffs[1], coeffs[5], x); - float16x8_t D = vfmaq_f16(coeffs[3], coeffs[7], x); - float16x8_t x2 = vmulq_f16(x, x); - float16x8_t x4 = vmulq_f16(x2, x2); - float16x8_t res = vfmaq_f16(vfmaq_f16(A, B, x2), - vfmaq_f16(C, D, x2), - x4); + float16x8_t A = vfmaq_f16(coeffs[0], coeffs[4], x); + float16x8_t B = vfmaq_f16(coeffs[2], coeffs[6], x); + float16x8_t C = vfmaq_f16(coeffs[1], coeffs[5], x); + float16x8_t D = vfmaq_f16(coeffs[3], coeffs[7], 
x); + float16x8_t x2 = vmulq_f16(x, x); + float16x8_t x4 = vmulq_f16(x2, x2); + float16x8_t res = vfmaq_f16(vfmaq_f16(A, B, x2), vfmaq_f16(C, D, x2), x4); return res; } inline float16x8_t vexpq_f16_03_percent_error(float16x8_t x) { - const std::array exp_tab = - { - { - vdupq_n_f16(1.f), - vdupq_n_f16(0.0416598916054f), - vdupq_n_f16(0.500000596046f), - vdupq_n_f16(0.0014122662833f), - vdupq_n_f16(1.00000011921f), - vdupq_n_f16(0.00833693705499f), - vdupq_n_f16(0.166665703058f), - vdupq_n_f16(0.000195780929062f), - } - }; + const std::array exp_tab = {{ + vdupq_n_f16(1.f), + vdupq_n_f16(0.0416598916054f), + vdupq_n_f16(0.500000596046f), + vdupq_n_f16(0.0014122662833f), + vdupq_n_f16(1.00000011921f), + vdupq_n_f16(0.00833693705499f), + vdupq_n_f16(0.166665703058f), + vdupq_n_f16(0.000195780929062f), + }}; x = vminq_f16(x, vdupq_n_f16(11.0898664884f)); - static const float16x8_t CONST_LN2 = vdupq_n_f16(0.6931471805f); - static const float16x8_t CONST_INV_LN2 = vdupq_n_f16(1.4426950408f); - static const float16x8_t CONST_0 = vdupq_n_f16(0.f); - static const int16x8_t CONST_NEGATIVE_14 = vdupq_n_s16(-14); + static const float16x8_t CONST_LN2 = vdupq_n_f16(0.6931471805f); + static const float16x8_t CONST_INV_LN2 = vdupq_n_f16(1.4426950408f); + static const float16x8_t CONST_0 = vdupq_n_f16(0.f); + static const int16x8_t CONST_NEGATIVE_14 = vdupq_n_s16(-14); - int16x8_t m = vcvtq_s16_f16(vmulq_f16(x, CONST_INV_LN2)); + int16x8_t m = vcvtq_s16_f16(vmulq_f16(x, CONST_INV_LN2)); float16x8_t val = vfmsq_f16(x, vcvtq_f16_s16(m), CONST_LN2); float16x8_t poly = vtaylor_polyq_f16(val, exp_tab); @@ -204,25 +219,31 @@ inline float16x8_t vexpq_f16_4_percent_error_half_time(float16x8_t x) return in3; } - inline float16x8_t vexpq_f16_f32(float16x8_t a) { #ifdef _USE_F16_MIX_PRECISION float32x4_t a0 = vcvt_f32_f16(vget_low_f16(a)); float32x4_t a1 = vcvt_f32_f16(vget_high_f16(a)); - return vcombine_f16(vcvt_f16_f32(vexpq_f32_03_percent_error(a0)), vcvt_f16_f32(vexpq_f32_03_percent_error(a1))); + return vcombine_f16( + vcvt_f16_f32(vexpq_f32_03_percent_error(a0)), vcvt_f16_f32(vexpq_f32_03_percent_error(a1))); #else return vexpq_f16_03_percent_error(a); #endif } +inline float16x8_t vlogq_f16(float16x8_t x) +{ + float32x4_t a0 = vcvt_f32_f16(vget_low_f16(x)); + float32x4_t a1 = vcvt_f32_f16(vget_high_f16(x)); + return vcombine_f16(vcvt_f16_f32(vlogq_f32(a0)), vcvt_f16_f32(vlogq_f32(a1))); +} + inline float16x8_t vsigmoidq_f16(float16x8_t x) { #ifdef _USE_F16_MIX_PRECISION float32x4_t x0 = vcvt_f32_f16(vget_low_f16(x)); float32x4_t x1 = vcvt_f32_f16(vget_high_f16(x)); - float16x8_t y = vcombine_f16(vcvt_f16_f32(vsigmoidq_f32(x0)), - vcvt_f16_f32(vsigmoidq_f32(x1))); + float16x8_t y = vcombine_f16(vcvt_f16_f32(vsigmoidq_f32(x0)), vcvt_f16_f32(vsigmoidq_f32(x1))); return y; #else float16x8_t one_v = vdupq_n_f16(1.f); @@ -235,14 +256,13 @@ inline float16x8_t vtanhq_f16(float16x8_t x) #ifdef _USE_F16_MIX_PRECISION float32x4_t x0 = vcvt_f32_f16(vget_low_f16(x)); float32x4_t x1 = vcvt_f32_f16(vget_high_f16(x)); - float16x8_t y = vcombine_f16(vcvt_f16_f32(vtanhq_f32(x0)), - vcvt_f16_f32(vtanhq_f32(x1))); + float16x8_t y = vcombine_f16(vcvt_f16_f32(vtanhq_f32(x0)), vcvt_f16_f32(vtanhq_f32(x1))); return y; #else float16x8_t one_v = vdupq_n_f16(1.f); float16x8_t two_v = vdupq_n_f16(2.f); float16x8_t e_2G_v = vexpq_f16_03_percent_error(vmulq_f16(two_v, x)); - //float16x8_t result_v = vfmsq_f16(one_v, two_v, vrecpeq_f16(vaddq_f16(e_2G_v, one_v))); + // float16x8_t result_v = vfmsq_f16(one_v, two_v, 
vrecpeq_f16(vaddq_f16(e_2G_v, one_v)));
     float16x8_t result_v = vsubq_f16(one_v, vdivq_f16(two_v, vaddq_f16(one_v, e_2G_v)));
     return result_v;
 #endif
 }
@@ -256,7 +276,8 @@ inline F32 vaddvq_f16(float16x8_t x)
     return sum;
 }
 
-inline void vst1q_lane_f16_builtin(F16* address, float16x8_t vec, const int laneId) {
+inline void vst1q_lane_f16_builtin(F16 *address, float16x8_t vec, const int laneId)
+{
     switch (laneId) {
         case 0:
             vst1q_lane_f16(address, vec, 0);
@@ -289,19 +310,28 @@ inline void vst1q_lane_f16_builtin(F16* address, float16x8_t vec, const int lane
 #endif
 
 #ifdef _USE_INT8
-inline int32x4_t vdotq_laneq_s32_builtin(int32x4_t c, int8x16_t a, int8x16_t b, const int laneId) {
+inline int32x4_t vdotq_laneq_s32_builtin(int32x4_t c, int8x16_t a, int8x16_t b, const int laneId)
+{
+    int32x4_t ret;
     switch (laneId) {
         case 0:
-            return vdotq_laneq_s32(c, a, b, 0);
+            ret = vdotq_laneq_s32(c, a, b, 0);
+            break;
         case 1:
-            return vdotq_laneq_s32(c, a, b, 1);
+            ret = vdotq_laneq_s32(c, a, b, 1);
+            break;
        case 2:
-            return vdotq_laneq_s32(c, a, b, 2);
+            ret = vdotq_laneq_s32(c, a, b, 2);
+            break;
        case 3:
-            return vdotq_laneq_s32(c, a, b, 3);
+            ret = vdotq_laneq_s32(c, a, b, 3);
+            break;
        default:
            CHECK_REQUIREMENT(0);
+            ret = vdupq_n_s32(0);
+            break;
    }
+    return ret;
 }
 #endif
 #endif
diff --git a/common/uni/include/error.h b/common/uni/include/error.h
new file mode 100644
index 00000000..af899235
--- /dev/null
+++ b/common/uni/include/error.h
@@ -0,0 +1,183 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _H_ERROR
+#define _H_ERROR
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+
+#ifdef __GLIBC__
+#define UNI_THREADID pid_t tid = syscall(SYS_gettid);
+#else
+#ifdef _USE_IOS
+#include <pthread.h>
+#define UNI_THREADID \
+    uint64_t tid64; \
+    pthread_threadid_np(NULL, &tid64); \
+    pid_t tid = (pid_t)tid64;
+#else
+#define UNI_THREADID pid_t tid = gettid();
+#endif
+#endif
+
+#ifdef _THREAD_SAFE
+extern pthread_mutex_t uniThreadMutex;
+#endif
+
+#ifdef _USE_ANDROID_LOG
+#include <android/log.h>
+#define UNI_LOGD(...) \
+    { \
+        __android_log_print(ANDROID_LOG_DEBUG, "Bolt", __VA_ARGS__); \
+        printf(__VA_ARGS__); \
+        fflush(stdout); \
+    }
+#define UNI_EXIT
+#else
+#define UNI_LOGD(...)
\ + { \ + printf(__VA_ARGS__); \ + fflush(stdout); \ + } +#define UNI_EXIT exit(1); +#endif + +#ifdef __cplusplus +extern "C" { +#endif +#ifdef _THREAD_SAFE +#define UNI_THREAD_SAFE(func) \ + pthread_mutex_lock(&uniThreadMutex); \ + func; \ + pthread_mutex_unlock(&uniThreadMutex); +#else +#define UNI_THREAD_SAFE(func) func; +#endif +#define UNI_CI_LOG(...) printf(__VA_ARGS__); +#define UNI_INFO_LOG(...) \ + { \ + UNI_THREADID \ + UNI_THREAD_SAFE({ \ + UNI_LOGD("[INFO] thread %d ", tid); \ + UNI_LOGD(__VA_ARGS__); \ + }) \ + } +#define UNI_WARNING_LOG(...) \ + { \ + UNI_THREADID \ + UNI_THREAD_SAFE({ \ + UNI_LOGD("[WARNING] thread %d ", tid); \ + UNI_LOGD(__VA_ARGS__); \ + }) \ + } +#define UNI_ERROR_LOG(...) \ + { \ + UNI_THREADID \ + UNI_THREAD_SAFE({ \ + UNI_LOGD("[ERROR] thread %d ", tid); \ + UNI_LOGD(__VA_ARGS__); \ + }) \ + UNI_EXIT; \ + } +#ifdef _DEBUG +#define UNI_DEBUG_LOG(...) \ + { \ + UNI_THREADID \ + UNI_THREAD_SAFE({ \ + UNI_LOGD("[DEBUG] thread %d ", tid); \ + UNI_LOGD(__VA_ARGS__); \ + }) \ + } +#else +#define UNI_DEBUG_LOG(...) +#endif +#define CHECK_REQUIREMENT(status) \ + if (!(status)) { \ + UNI_ERROR_LOG("%s %s line %d requirement mismatch\n", __FILE__, __func__, __LINE__); \ + } +#define CHECK_STATUS(ee) \ + { \ + EE status = (ee); \ + if (status != SUCCESS) { \ + UNI_ERROR_LOG( \ + "%s %s line %d got an error: %s\n", __FILE__, __func__, __LINE__, ee2str(status)); \ + } \ + } + +inline void UNI_PROFILE_INFO(const char *name, const char *category, long start, long duration) +{ +#ifdef _PROFILE + int pid = 0; + UNI_THREADID; + UNI_THREAD_SAFE({ + UNI_LOGD("[PROFILE] thread %d ", tid); + UNI_LOGD("{\"name\": \"%s\", \"cat\": \"%s\", \"ph\": \"X\", \"pid\": \"%d\", \"tid\": " + "\"%d\", \"ts\": %ld, \"dur\": %ld},\n", + name, category, pid, tid, start, duration); + }); +#endif +} + +typedef enum { + SUCCESS = 0, + NULL_POINTER = 1, + NOT_MATCH = 2, + NOT_FOUND = 3, + ALLOC_FAILED = 4, + NOT_IMPLEMENTED = 50, + NOT_SUPPORTED = 51, + GCL_ERROR = 52, + FILE_ERROR = 53, + UNKNOWN = 99 +} EE; + +inline const char *ee2str(EE ee) +{ + const char *ret = 0; + switch (ee) { + case SUCCESS: + ret = "SUCCESS"; + break; + case NULL_POINTER: + ret = "Null Pointer"; + break; + case NOT_MATCH: + ret = "Not Match"; + break; + case NOT_FOUND: + ret = "Not Found"; + break; + case NOT_IMPLEMENTED: + ret = "Not Implemented"; + break; + case NOT_SUPPORTED: + ret = "Not Supported"; + break; + case FILE_ERROR: + ret = "Error with file system"; + break; + default: + ret = "Unknown"; + break; + } + return ret; +} +#ifdef __cplusplus +} +#endif + +#endif diff --git a/common/uni/include/graph.h b/common/uni/include/graph.h new file mode 100644 index 00000000..8e3c6081 --- /dev/null +++ b/common/uni/include/graph.h @@ -0,0 +1,294 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef UNI_INCLUDE_GRAPH_H_ +#define UNI_INCLUDE_GRAPH_H_ + +#include +#include +#include +#include +#include +#include + +#ifdef _USE_XCODE +#include "coded_stream.h" +#include "zero_copy_stream_impl.h" +#include "text_format.h" +#include "message.h" +#else +#include +#include +#include +#include +#endif + +#include "error.h" +#include "tensor_desc.h" +#include "thread_affinity.h" + +template +class Graph { +public: + Graph() + {} + + ~Graph() + {} + + Graph clone() + { + UNI_DEBUG_LOG("graph %s clone begin\n", this->name.c_str()); + Graph graph = *this; + for (unsigned int i = 0; i < this->nodes.size(); i++) { + this->nodes[i] = this->nodes[i].clone(); + } + CHECK_STATUS(graph.manageDataTensors()); + CHECK_STATUS(graph.manageTmpBuffer()); + UNI_DEBUG_LOG("graph %s clone end\n", this->name.c_str()); + return graph; + } + + void init(std::string graphPath) + { + UNI_DEBUG_LOG("load and build graph from %s begin\n", graphPath.c_str()); + GraphParameter graphParameter; + CHECK_REQUIREMENT(load(graphPath, (google::protobuf::Message *)(&graphParameter))); + this->name = graphParameter.name(); + + for (int i = 0; i < graphParameter.output_size(); i++) { + this->outputs.insert(graphParameter.output(i)); + } + for (int i = 0, index = 0; i < graphParameter.node_size(); i++) { + ComputeNode node; + auto nodeParameter = graphParameter.node(i); + node.setNodeParameter(nodeParameter); + if (nodeParameter.type() == std::string("Input")) { + DataTensor *tensor = new DataTensor(); + tensor->resize(extractInputTensorDescFromNode(node)); + CHECK_REQUIREMENT(nodeParameter.output_size() == 1); + this->tensors[nodeParameter.output(0)] = std::shared_ptr(tensor); + continue; + } + + this->nodes.push_back(node); + index++; + } + UNI_DEBUG_LOG("load and build graph from %s end\n", graphPath.c_str()); + } + + EE ready(DataType precision, AffinityPolicy affinityPolicy, int gpuId) + { + UNI_DEBUG_LOG("graph %s ready begin\n", this->name.c_str()); + CHECK_STATUS(managePrecision(precision)); + if (gpuId >= 0) { + affinityPolicy = AFFINITY_GPU; + } + CHECK_STATUS(initInference(affinityPolicy)); + CHECK_STATUS(manageDataTensors()); + CHECK_STATUS(manageTmpBuffer()); + for (unsigned int i = 0; i < this->nodes.size(); i++) { + this->nodes[i].ready(); + } + UNI_DEBUG_LOG("graph %s ready end\n", this->name.c_str()); + return SUCCESS; + } + + EE setRuntime(int cpuId, Arch arch) + { + UNI_DEBUG_LOG( + "graph %s setRuntime(core:%d arch:%d) begin\n", this->name.c_str(), cpuId, arch); + for (unsigned int i = 0; i < this->nodes.size(); i++) { + this->nodes[i].setRuntime(cpuId, arch); + } + UNI_DEBUG_LOG("graph %s setRuntime end\n", this->name.c_str()); + return SUCCESS; + } + + EE run(std::map> tensors) + { + UNI_DEBUG_LOG("graph %s run begin\n", this->name.c_str()); + CHECK_STATUS(setData(tensors)); + for (unsigned int i = 0; i < this->nodes.size(); i++) { + this->nodes[i].run(); + } + UNI_DEBUG_LOG("graph %s run end\n", this->name.c_str()); + return SUCCESS; + } + +private: + std::string name; + std::vector nodes; + std::map> tensors; + std::shared_ptr 
tmpDataTensor; + std::set outputs; + + bool load(std::string graphPath, google::protobuf::Message *message) + { + std::ifstream fileStream(graphPath, std::ifstream::in); + bool ret = false; + if (fileStream.is_open()) { + google::protobuf::io::IstreamInputStream input(&fileStream); + ret = google::protobuf::TextFormat::Parse(&input, message); + fileStream.close(); + } else { + UNI_ERROR_LOG("can not load graph from %s\n", graphPath.c_str()); + } + return ret; + } + + TensorDesc extractInputTensorDescFromNode(ComputeNode node) + { + auto nodeParameter = node.getNodeParameter(); + std::map types = {{"FLOAT32", DT_F32}, {"FLOAT16", DT_F16}, + {"UINT32", DT_U32}, {"INT8", DT_I8}, {"UINT8", DT_U8}}; + std::map formats = { + {"NCHW", DF_NCHW}, {"NCHWC8", DF_NCHWC8}, {"MTK", DF_MTK}, {"NORMAL", DF_NORMAL}}; + TensorDesc desc; + if (types.find(nodeParameter.input_type()) != types.end()) { + desc.dt = types[nodeParameter.input_type()]; + } else { + UNI_ERROR_LOG( + "graph unsupported input data type %s\n", nodeParameter.input_type().c_str()); + } + if (formats.find(nodeParameter.input_format()) != formats.end()) { + desc.df = formats[nodeParameter.input_format()]; + } else { + UNI_ERROR_LOG( + "graph unsupported input data format %s\n", nodeParameter.input_format().c_str()); + } + desc.nDims = nodeParameter.input_dim_size(); + for (unsigned int i = 0; i < desc.nDims; i++) { + desc.dims[i] = nodeParameter.input_dim(desc.nDims - 1 - i); + } + return desc; + } + + EE inferOutputSize() + { + UNI_DEBUG_LOG("graph %s infer output size begin\n", this->name.c_str()); + for (unsigned int i = 0; i < this->nodes.size(); i++) { + CHECK_STATUS(this->nodes[i].inferOutputSize()); + } + UNI_DEBUG_LOG("graph %s infer output size end\n", this->name.c_str()); + return SUCCESS; + } + + EE setNodeInputOutput() + { + UNI_DEBUG_LOG("graph %s set node input and output begin\n", this->name.c_str()); + for (unsigned int i = 0; i < this->nodes.size(); i++) { + auto nodeParameter = this->nodes[i].getNodeParameter(); + std::map> nodeInputs, nodeOutputs; + for (int j = 0; j < nodeParameter.input_size(); j++) { + std::string nodeInputName = nodeParameter.input(j); + nodeInputs[nodeInputName] = tensors[nodeInputName]; + } + this->nodes[i].setInput(nodeInputs); + + for (int j = 0; j < nodeParameter.output_size(); j++) { + std::string nodeOutputName = nodeParameter.output(j); + nodeOutputs[nodeOutputName] = tensors[nodeOutputName]; + } + this->nodes[i].setOutput(nodeOutputs); + } + CHECK_STATUS(inferOutputSize()); + UNI_DEBUG_LOG("graph %s set node input and output end\n", this->name.c_str()); + return SUCCESS; + } + + EE manageDataTensors() + { + UNI_DEBUG_LOG("graph %s manage tensors begin\n", this->name.c_str()); + for (unsigned int i = 0; i < this->nodes.size(); i++) { + auto nodeParameter = this->nodes[i].getNodeParameter(); + for (int j = 0; j < nodeParameter.output_size(); j++) { + DataTensor *tensor = new DataTensor(); + std::string nodeOutputName = nodeParameter.output(j); + this->tensors[nodeOutputName] = std::shared_ptr(tensor); + } + } + CHECK_STATUS(setNodeInputOutput()); + for (auto tensor : this->tensors) { + if (this->outputs.find(tensor.first) == this->outputs.end()) { + tensor.second->alloc(); + } + } + UNI_DEBUG_LOG("graph %s manage tensors end\n", this->name.c_str()); + return SUCCESS; + } + + EE managePrecision(DataType dataType) + { + UNI_DEBUG_LOG("graph %s manage precision(%d) begin\n", this->name.c_str(), dataType); + for (unsigned int i = 0; i < this->nodes.size(); i++) { + 
this->nodes[i].setPrecision(dataType); + } + UNI_DEBUG_LOG("graph %s manage precision end\n", this->name.c_str()); + return SUCCESS; + } + + EE initInference(AffinityPolicy affinityPolicy) + { + UNI_DEBUG_LOG("graph %s init inference(%d) begin\n", this->name.c_str(), affinityPolicy); + for (unsigned int i = 0; i < this->nodes.size(); i++) { + this->nodes[i].initInference(affinityPolicy); + } + UNI_DEBUG_LOG("graph %s init inference end\n", this->name.c_str()); + return SUCCESS; + } + + unsigned int inferTmpBufferSize() + { + UNI_DEBUG_LOG("graph %s infer tmp buffer size begin\n", this->name.c_str()); + unsigned int maxTmpBufferSize = 0; + for (unsigned int i = 0; i < this->nodes.size(); i++) { + unsigned int tmpBufferSize = this->nodes[i].getTmpBufferSize(); + if (tmpBufferSize > maxTmpBufferSize) { + maxTmpBufferSize = tmpBufferSize; + } + } + UNI_DEBUG_LOG("graph %s infer tmp buffer size end\n", this->name.c_str()); + return maxTmpBufferSize; + } + + EE manageTmpBuffer() + { + UNI_DEBUG_LOG("graph %s manage tmp buffer begin\n", this->name.c_str()); + unsigned int maxTmpBufferSize = inferTmpBufferSize(); + this->tmpDataTensor = std::shared_ptr(new DataTensor()); + this->tmpDataTensor->resize(tensor1d(DT_U8, maxTmpBufferSize)); + for (unsigned int i = 0; i < this->nodes.size(); i++) { + this->nodes[i].setTmpBuffer(this->tmpDataTensor); + } + UNI_DEBUG_LOG("graph %s manage tmp buffer end\n", this->name.c_str()); + return SUCCESS; + } + + EE setData(std::map> tensors) + { + UNI_DEBUG_LOG("graph %s set data from upper begin\n", this->name.c_str()); + for (auto tensor : tensors) { + if (this->tensors.find(tensor.first) != this->tensors.end()) { + this->tensors[tensor.first] = tensor.second; + } else { + UNI_ERROR_LOG("graph %s can not find %s to set as input or output\n", + this->name.c_str(), tensor.first.c_str()); + } + } + CHECK_STATUS(setNodeInputOutput()); + UNI_DEBUG_LOG("graph %s set data from upper end\n", this->name.c_str()); + return SUCCESS; + } +}; +#endif // UNI_INCLUDE_GRAPH_H_ diff --git a/model-tools/include/model_print.h b/common/uni/include/model_print.h similarity index 84% rename from model-tools/include/model_print.h rename to common/uni/include/model_print.h index 7d8f95a8..f486e342 100644 --- a/model-tools/include/model_print.h +++ b/common/uni/include/model_print.h @@ -1,29 +1,27 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _H_MODEL_PRINT #define _H_MODEL_PRINT -#include "model_tools.h" +#include "types.h" #ifdef __cplusplus extern "C" { #endif - void print_header(const ModelSpec ms); -void print_operator_tensor_relationship(const ModelSpec ms, bool delete_deprecated_op=false); +void print_operator_tensor_relationship(const ModelSpec ms, bool delete_deprecated_op = false); void print_weights(const ModelSpec ms); diff --git a/common/uni/include/model_serialize_deserialize.hpp b/common/uni/include/model_serialize_deserialize.hpp new file mode 100644 index 00000000..0c6bd471 --- /dev/null +++ b/common/uni/include/model_serialize_deserialize.hpp @@ -0,0 +1,70 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
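+
+// NOTE (editorial, illustrative only -- not part of the original patch): the
+// serialize_* functions below each emit one section of the bolt model format, and
+// serialize_model() presumably chains header, operators, and weights into a single
+// byte string. A hedged writer-side sketch (the file name is a placeholder, and the
+// writer APIs are only compiled when a converter backend is enabled, per the #if):
+//
+//     ModelSpec spec;
+//     // ... populate spec via the model-tools converters ...
+//     std::string bytes;
+//     CHECK_STATUS(serialize_model(&spec, &bytes));
+//     CHECK_STATUS(write_to_file(&bytes, "model.bolt"));
+//
+// serialize_model_to_file() appears to bundle the same two steps.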
+
+#ifndef _H_MODEL_SERIALIZE_DESERIALIZE
+#define _H_MODEL_SERIALIZE_DESERIALIZE
+
+#include <string>
+#include "types.h"
+
+int get_operator_parameter_size(OperatorType operatorType);
+
+#if defined(_BUILD_TEST) || defined(_USE_CAFFE) || defined(_USE_ONNX) || defined(_USE_TFLITE) || \
+    defined(_USE_TENSORFLOW)
+EE serialize_header(const ModelSpec *spec, std::string *tmp);
+
+EE serialize_operators(const ModelSpec *spec, std::string *tmp);
+
+EE serialize_weights(const ModelSpec *spec, std::string *tmp);
+
+EE serialize_model(const ModelSpec *spec, std::string *bytes);
+
+EE write_to_file(std::string *bytes, const char *fn);
+
+EE serialize_model_to_file(const ModelSpec *spec, const char *fn);
+
+EE ms_datatype_converter(ModelSpec *original_ms,
+    ModelSpec *target_ms,
+    DataConvertType convert_mode,
+    std::string storageMode);
+#endif
+
+EE deserialize_header(char *bytes, ModelSpec *spec, U32 *pos);
+
+EE deserialize_operator(char *bytes, ModelSpec *spec, U32 *pos);
+
+EE deserialize_weight(char *bytes, ModelSpec *spec, U32 *pos);
+
+EE operator_relationship(ModelSpec *spec);
+
+EE deserialize_model_from_file(const char *fn, ModelSpec *spec, bool useFileStream = false);
+
+inline std::string concat_dir_file(std::string dir, std::string file)
+{
+    std::string ret;
+    if (!dir.empty()) {
+        int len = dir.size();
+        char &last = dir.at(len - 1);
+        if ('/' != last) {
+            ret = dir + '/';
+        } else {
+            ret = dir;
+        }
+        ret += file;
+    } else {
+        ret = file;
+    }
+
+    return ret;
+}
+#endif
diff --git a/common/uni/include/op_type.h b/common/uni/include/op_type.h
new file mode 100644
index 00000000..9804af72
--- /dev/null
+++ b/common/uni/include/op_type.h
@@ -0,0 +1,127 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
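+
+// NOTE (editorial, illustrative only -- not part of the original patch): the header
+// below pairs the OperatorType enum with a string table returned by
+// OperatorTypeName(), indexed by the enum value, which is why it insists the two be
+// updated together. Typical lookup:
+//
+//     OperatorType t = OT_Conv;
+//     printf("%s\n", OperatorTypeName()[t]);   // prints "OT_Conv"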
+ +#ifndef _H_OP_TYPE +#define _H_OP_TYPE + +#ifdef __cplusplus +extern "C" { +#endif + +// please add OperatorType and OperatorTypeName at the same time +typedef enum { + OT_None = 0, + OT_Input = 1, + OT_Conv = 2, + OT_Deconvolution = 3, + OT_FC = 4, + OT_RNN = 5, + OT_MatMul = 6, + OT_Resize = 7, + OT_BilateralSliceApply = 8, + OT_Pooling = 9, + OT_Scale = 10, + OT_PRelu = 11, + OT_BatchNorm = 12, + OT_LayerNorm = 13, + OT_L2Normalization = 14, + OT_Reduction = 15, + OT_ArgMax = 16, + OT_Softmax = 17, + OT_SoftmaxWithLoss = 18, + OT_LogSoftmax = 19, + + OT_Clip = 20, + OT_Power = 21, + OT_Sigmoid = 22, + OT_Relu = 23, + OT_Relu6 = 24, + OT_HSwish = 25, + OT_HSigmoid = 26, + OT_Gelu = 27, + OT_TanH = 28, + OT_Mish = 29, + OT_Erf = 30, + + OT_Gather = 31, + OT_Embedding = 32, + OT_Pad = 33, + OT_Eltwise = 34, + OT_Concat = 35, + OT_Slice = 36, + OT_TfSlice = 37, + + OT_Cast = 38, + OT_Shape = 39, + OT_ConstantOfShape = 40, + OT_Transpose = 41, + OT_Reshape = 42, + OT_Squeeze = 43, + OT_Unsqueeze = 44, + OT_Space2Depth = 45, + OT_Depth2Space = 46, + OT_Constant = 47, + + OT_ChannelResize = 48, + OT_PreAllocatedMemory = 49, + OT_SharedWeight = 50, + OT_Copy = 51, + OT_Check = 52, + OT_Repeat = 53, + OT_Jump = 54, + OT_Attention = 55, + OT_AttentionMask = 56, + OT_RelativePositionEmbedding = 57, + OT_RelativeShift = 58, + OT_PriorBox = 59, + OT_DetectionOutput = 60, + OT_Yolov3DetectionOutput = 61, + OT_MultiHeadAttention = 62, + OT_SqDiff = 63, + OT_Tile = 64, + OT_Splice = 65, + OT_Neg = 66, + OT_Greater = 67 // Temporary support for special case +} OperatorType; + +inline const char *const *OperatorTypeName() +{ + static const char *const names[] = {"OT_None", "OT_Input", "OT_Conv", "OT_Deconvolution", + "OT_FC", "OT_RNN", "OT_MatMul", "OT_Resize", "OT_BilateralSliceApply", "OT_Pooling", + + "OT_Scale", "OT_PRelu", "OT_BatchNorm", "OT_LayerNorm", "OT_L2Normalization", + "OT_Reduction", "OT_ArgMax", "OT_Softmax", "OT_SoftmaxWithLoss", "OT_LogSoftmax", + + "OT_Clip", "OT_Power", "OT_Sigmoid", "OT_Relu", "OT_Relu6", "OT_HSwish", "OT_HSigmoid", + "OT_Gelu", "OT_TanH", "OT_Mish", + + "OT_Erf", "OT_Gather", "OT_Embedding", "OT_Pad", "OT_Eltwise", "OT_Concat", "OT_Slice", + "OT_TfSlice", "OT_Cast", "OT_Shape", + + "OT_ConstantOfShape", "OT_Transpose", "OT_Reshape", "OT_Squeeze", "OT_Unsqueeze", + "OT_Space2Depth", "OT_Depth2Space", "OT_Constant", "OT_ChannelResize", + "OT_PreAllocatedMemory", + + "OT_SharedWeight", "OT_Copy", "OT_Check", "OT_Repeat", "OT_Jump", "OT_Attention", + "OT_AttentionMask", "OT_RelativePositionEmbedding", "OT_RelativeShift", "OT_PriorBox", + + "OT_DetectionOutput", "OT_Yolov3DetectionOutput", "OT_MultiHeadAttention", "OT_SqDiff", + "OT_Tile", "OT_Splice", "OT_Neg", "OT_Greater"}; + return names; +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/common/uni/include/parse_command.h b/common/uni/include/parse_command.h new file mode 100644 index 00000000..d5d03c1a --- /dev/null +++ b/common/uni/include/parse_command.h @@ -0,0 +1,312 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _H_PARSE_COMMAND
+#define _H_PARSE_COMMAND
+#include <getopt.h>
+#include <iostream>
+#include <string>
+#include "types.h"
+#include "error.h"
+
+#ifdef _USE_FP16
+
+inline U32 getBinFileSize(CI8 *dataPath, CI8 *dataName)
+{
+    std::string filePath = dataPath;
+    CI8 lastFlag = filePath[filePath.length() - 1];
+    if (lastFlag != '/') {
+        filePath += "/";
+    }
+    std::string fileName = dataName;
+    fileName = filePath + fileName;
+    FILE *file = fopen(fileName.c_str(), "rb");
+    if (file == NULL) {
+        UNI_WARNING_LOG("fopen %s failed\n", fileName.c_str());
+        return 0;
+    }
+    fseek(file, 0, SEEK_END);
+    U32 size = (U32)ftell(file);
+    fseek(file, 0, SEEK_SET);
+    fclose(file);
+    return size;
+}
+
+inline void writeF16ToF32Bin(F16 *data, U32 num, CI8 *dataPath, CI8 *dataName)
+{
+    std::string filePath = dataPath;
+    CI8 lastFlag = filePath[filePath.length() - 1];
+    if (lastFlag != '/') {
+        filePath += "/";
+    }
+    std::string fileName = dataName;
+    fileName = filePath + fileName;
+    FILE *outfile = fopen(fileName.c_str(), "wb");
+    if (outfile == NULL) {
+        UNI_WARNING_LOG("fopen %s failed\n", fileName.c_str());
+        return;
+    }
+    F32 *dataTran = new F32[num];
+    for (U32 i = 0; i < num; i++) {
+        dataTran[i] = (F32)data[i];
+    }
+    fwrite(dataTran, sizeof(float), num, outfile);
+    fclose(outfile);
+    delete[] dataTran;
+}
+
+inline void readF32BinToF16(F16 *data, U32 num, CI8 *dataPath, CI8 *dataName)
+{
+    std::string filePath = dataPath;
+    CI8 lastFlag = filePath[filePath.length() - 1];
+    if (lastFlag != '/') {
+        filePath += "/";
+    }
+    std::string fileName = dataName;
+    fileName = filePath + fileName;
+    FILE *infile = fopen(fileName.c_str(), "rb");
+    if (infile == NULL) {
+        UNI_WARNING_LOG("fopen %s failed\n", fileName.c_str());
+        return;
+    }
+    F32 *dataTran = new F32[num];
+    fread(dataTran, sizeof(float), num, infile);
+    for (U32 i = 0; i < num; i++) {
+        data[i] = (F16)dataTran[i];
+    }
+    fclose(infile);
+    delete[] dataTran;
+}
+
+#endif
+const struct option long_options[]{
+    {"model", 1, nullptr, 'm'},
+    {"inputPath", 1, nullptr, 'i'},
+    {"archInfo", 1, nullptr, 'a'},
+    {"algoPath", 1, nullptr, 'p'},
+    {"imageFormat", 1, nullptr, 'f'},
+    {"scaleValue", 1, nullptr, 's'},
+    {"topK", 1, nullptr, 't'},
+    {"correctLable", 1, nullptr, 'c'},
+    {"loopTime", 1, nullptr, 'l'},
+    {"subNetworkName", 1, nullptr, 'S'},
+    {"help", 1, nullptr, 'h'},
+    {"readInputBinName", 1, nullptr, 1},
+    {"writeOutputBinName", 1, nullptr, 2},
+};
+
+const char
optstring[] = "m:i:a:p:f:s:t:c:l:S:h:";
+
+typedef struct {
+    std::pair<std::string, bool> model;
+    std::pair<std::string, bool> inputPath;
+    std::pair<std::string, bool> archInfo;
+    std::pair<std::string, bool> algoPath;
+    std::pair<ImageFormat, bool> imageFormat;
+    std::pair<F32, bool> scaleValue;
+    std::pair<int, bool> topK;
+    std::pair<int, bool> correctLable;
+    std::pair<int, bool> loopTime;
+    std::pair<std::string, bool> subNetworkName;
+    std::pair<std::string, bool> readInputBinName;
+    std::pair<std::string, bool> writeOutputBinName;
+} ParseRes;
+typedef ParseRes *ParseRes_t;
+
+inline void init_parse_res(ParseRes_t parse_res)
+{
+    parse_res->model.second = false;
+    parse_res->inputPath.second = false;
+    parse_res->archInfo.second = false;
+    parse_res->algoPath.second = false;
+    parse_res->imageFormat.second = false;
+    parse_res->scaleValue.second = false;
+    parse_res->topK.second = false;
+    parse_res->correctLable.second = false;
+    parse_res->loopTime.second = false;
+    parse_res->subNetworkName.second = false;
+    parse_res->readInputBinName.second = false;
+    parse_res->writeOutputBinName.second = false;
+}
+
+inline void help_examples()
+{
+    std::cout << "<<<<<<<<<<<<<<<<<<<< Parameters specification for examples >>>>>>>>>>>>>>>>>>>>"
+              << std::endl;
+    std::cout << "--model "
+              << " or -m: "
+              << "--required-- "
+              << "specify bolt model" << std::endl;
+    std::cout << "--archInfo "
+              << " or -a: "
+              << "--optional-- "
+              << "specify running arch: CPU_AFFINITY_HIGH_PERFORMANCE/CPU_AFFINITY_LOW_POWER/GPU,"
+              << " the default value is CPU_AFFINITY_HIGH_PERFORMANCE" << std::endl;
+    std::cout << "--inputPath "
+              << " or -i: "
+              << "--optional-- "
+              << "specify file path to read input data" << std::endl;
+    std::cout << "--algoPath "
+              << " or -p: "
+              << "--optional-- "
+              << "specify file path to read or write algorithm auto tuning result" << std::endl;
+    std::cout << "--imageFormat "
+              << " or -f: "
+              << "--optional-- "
+              << "specify imageFormat if the input is an image: "
+                 "BGR/RGB/RGB_SC/BGR_SC_RAW/RGB_SC_RAW,"
+              << " the default value is RGB" << std::endl;
+    std::cout << "--scaleValue "
+              << " or -s: "
+              << "--optional-- "
+              << "specify scaleValue for image classification, the default value is 1" << std::endl;
+    std::cout << "--topK "
+              << " or -t: "
+              << "--optional-- "
+              << "specify topK value for image classification, the default value is 5" << std::endl;
+    std::cout << "--correctLable "
+              << " or -c: "
+              << "--optional-- "
+              << "specify correctLable for image classification, the default value is -1"
+              << std::endl;
+    std::cout << "--loopTime "
+              << " or -l: "
+              << "--optional-- "
+              << "specify loopTime for running set_input + run + get_output, the default value is 1"
+              << std::endl;
+    std::cout << "--subNetworkName"
+              << " or -S: "
+              << "--optional-- "
+              << "specify subNetworkName for:" << std::endl;
+    std::cout << "    asr convolution transformer: encoder/prediction_net_ln/joint_net, the "
+                 "default value is encoder"
+              << std::endl;
+    std::cout << "    nmt_tsc : encoder/decoder" << std::endl;
+    std::cout << "    tts : "
+                 "encoder_decoder/postnet/melgan_vocoder/tinybert, the default value is "
+                 "encoder_decoder"
+              << std::endl;
+    std::cout << "--readInputBinName "
+              << "--optional-- "
+              << "specify reading input as binary; the binary should be float values in nchw format"
+              << std::endl;
+    std::cout << "--writeOutputBinName "
+              << "--optional-- "
+              << "specify saving output as binary; the binary will be float values in nchw format"
+              << std::endl;
+}
+
+inline void help(std::string name)
+{
+    if (name == "examples") {
+        help_examples();
+    }
+}
+
+inline void parseCommandLine(int argc, char *argv[], ParseRes_t parse_res, std::string name)
+{
+    int c = 0;
+    int optionIndex;
+
ImageFormat imageFormat; + std::cout << "[PARAMETERS INFO]:" << std::endl; + if (argc == 1) { + help(name); + } + while ((c = getopt_long(argc, argv, optstring, long_options, &optionIndex)) != -1) { + switch (c) { + case 'm': + parse_res->model.first = optarg; + parse_res->model.second = true; + std::cout << " - " << parse_res->model.first << std::endl; + break; + case 'i': + parse_res->inputPath.first = optarg; + parse_res->inputPath.second = true; + std::cout << " - " << parse_res->inputPath.first << std::endl; + break; + case 'a': + parse_res->archInfo.first = optarg; + parse_res->archInfo.second = true; + std::cout << " - " << parse_res->archInfo.first << std::endl; + break; + case 'p': + parse_res->algoPath.first = optarg; + parse_res->algoPath.second = true; + std::cout << " - " << parse_res->algoPath.first << std::endl; + break; + case 'f': + if (std::string(optarg) == std::string("RGB")) { + imageFormat = RGB; + } else if (std::string(optarg) == std::string("BGR")) { + imageFormat = BGR; + } else if (std::string(optarg) == std::string("RGB_SC")) { + imageFormat = RGB_SC; + } else if (std::string(optarg) == std::string("BGR_SC_RAW")) { + imageFormat = BGR_SC_RAW; + } else if (std::string(optarg) == std::string("RGB_SC_RAW")) { + imageFormat = RGB_SC_RAW; + } else { + imageFormat = RGB; + std::cout << "Unsupported image format, default to be RGB" << std::endl; + } + parse_res->imageFormat.first = imageFormat; + parse_res->imageFormat.second = true; + std::cout << " - " << optarg << std::endl; + break; + case 's': + parse_res->scaleValue.first = atof(optarg); + parse_res->scaleValue.second = true; + std::cout << " - " << parse_res->scaleValue.first << std::endl; + break; + case 't': + parse_res->topK.first = atoi(optarg); + parse_res->topK.second = true; + std::cout << " - " << parse_res->topK.first << std::endl; + break; + case 'l': + parse_res->loopTime.first = atoi(optarg); + parse_res->loopTime.second = true; + std::cout << " - " << parse_res->loopTime.first << std::endl; + break; + case 'c': + parse_res->correctLable.first = atoi(optarg); + parse_res->correctLable.second = true; + std::cout << " - " << parse_res->correctLable.first << std::endl; + break; + case 'S': + parse_res->subNetworkName.first = optarg; + parse_res->subNetworkName.second = true; + std::cout << " - " << parse_res->subNetworkName.first << std::endl; + break; + case 1: + parse_res->readInputBinName.first = optarg; + parse_res->readInputBinName.second = true; + std::cout << " - " << parse_res->readInputBinName.first + << std::endl; + break; + case 2: + parse_res->writeOutputBinName.first = optarg; + parse_res->writeOutputBinName.second = true; + std::cout << " - " << parse_res->writeOutputBinName.first + << std::endl; + break; + case 'h': + help(name); + break; + default: + help(name); + break; + } + } +} +#endif diff --git a/common/uni/include/profiling.h b/common/uni/include/profiling.h new file mode 100644 index 00000000..6d601a28 --- /dev/null +++ b/common/uni/include/profiling.h @@ -0,0 +1,49 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _H_PROFILING
+#define _H_PROFILING
+
+#include "ut_util.h"
+
+std::string extract_class_function(std::string &&pretty_function);
+std::string extract_file_function(std::string &&pretty_function);
+
+#define __CLASS_FUNCTION__ extract_class_function(std::string(__PRETTY_FUNCTION__))
+#define __FILE_FUNCTION__ \
+    extract_file_function(std::string(__FILE__) + "::" + std::string(__FUNCTION__))
+
+void ut_time_init();
+void ut_time_process(
+    const std::string &name, const std::string &category, double time_start_ms, double time_end_ms);
+void ut_time_statistics();
+
+#ifdef _PROFILE_STATISTICS
+#define UNI_TIME_INIT ut_time_init();
+#define UNI_TIME_STATISTICS ut_time_statistics();
+#else
+#define UNI_TIME_INIT
+#define UNI_TIME_STATISTICS
+#endif
+
+#ifdef _PROFILE
+// the braces keep the timing locals scoped, so UNI_PROFILE can appear more than
+// once in a function and stays safe inside unbraced if/else bodies
+#define UNI_PROFILE(func, name, category)                                            \
+    {                                                                                \
+        double profile_time_start_ms = ut_time_ms();                                 \
+        func;                                                                        \
+        double profile_time_end_ms = ut_time_ms();                                   \
+        ut_time_process(name, category, profile_time_start_ms, profile_time_end_ms); \
+    }
+#else
+#define UNI_PROFILE(func, name, category) func;
+#endif
+#endif  // _H_PROFILING
diff --git a/common/uni/include/schedule.h b/common/uni/include/schedule.h
new file mode 100644
index 00000000..c9bd0441
--- /dev/null
+++ b/common/uni/include/schedule.h
@@ -0,0 +1,245 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
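+// A minimal usage sketch of the thread-pool scheduler below (the graph file
+// name and input map are illustrative, not part of this header):
+//   Schedule<GraphParameter> schedule;
+//   schedule.init({"encoder.prototxt"}, DT_F16, AFFINITY_CPU_HIGH_PERFORMANCE, 2, false);
+//   Task task("encoder.prototxt", inputTensors);  // status becomes TASK_READY
+//   schedule.enqueue(&task);                      // a worker sets TASK_END when done
+//   schedule.end();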
+
+#ifndef FLOW_INCLUDE_SCHEDULE_H_
+#define FLOW_INCLUDE_SCHEDULE_H_
+
+#define _USE_WEIGHT_SHARE
+
+#include <pthread.h>
+#include <map>
+#include <queue>
+#include <vector>
+
+#include "graph.h"
+#include "task.h"
+
+template <typename GraphParameter>
+class Schedule {
+public:
+    Schedule()
+    {
+        pthread_mutex_init(&(this->taskQueueLock), NULL);
+        pthread_cond_init(&(this->condition), NULL);
+        this->threadNum = 0;
+        this->threads = NULL;
+        this->stop = false;
+    }
+
+    ~Schedule()
+    {
+        // make sure no worker still holds the lock before destroying it
+        pthread_mutex_lock(&(this->taskQueueLock));
+        pthread_mutex_unlock(&(this->taskQueueLock));
+        pthread_mutex_destroy(&(this->taskQueueLock));
+        pthread_cond_destroy(&(this->condition));
+        delete[] this->threads;
+    }
+
+    int init(std::vector<std::string> graphPath,
+        DataType dataType,
+        AffinityPolicy affinityPolicy,
+        int threadNum,
+        bool useGPU)
+    {
+        UNI_DEBUG_LOG("schedule init begin\n");
+        if (threadNum <= 0) {
+            return 1;
+        }
+        if (pthread_mutex_init(&(this->taskQueueLock), NULL)) {
+            return 1;
+        }
+        if (pthread_cond_init(&(this->condition), NULL)) {
+            return 1;
+        }
+        this->precision = dataType;
+        this->deviceInfo = get_cpu_info(affinityPolicy);
+        this->graphPath = graphPath;
+
+#ifdef _USE_WEIGHT_SHARE
+        for (unsigned int i = 0; i < graphPath.size(); i++) {
+            this->graph[graphPath[i]].init(graphPath[i]);
+            this->graph[graphPath[i]].ready(this->precision, this->deviceInfo.affinityPolicy, -1);
+        }
+#endif
+        int cpuId;
+        if (this->deviceInfo.affinityPolicy == AFFINITY_CPU_LOW_POWER) {
+            cpuId = 3;
+        } else {
+            cpuId = 4;
+        }
+        set_thread_affinity(0, &cpuId, 1);
+        this->threadNum = threadNum;
+        this->threads = new pthread_t[threadNum];
+        for (int i = 0; i < threadNum; i++) {
+            if (pthread_create(this->threads + i, NULL, worker, reinterpret_cast<void *>(this)) !=
+                0) {
+                this->end();
+                UNI_ERROR_LOG("schedule create thread pool fail\n");
+                return 1;
+            }
+        }
+        this->useGPU = useGPU;
+        UNI_DEBUG_LOG("schedule init end\n");
+        return 0;
+    }
+
+    int end()
+    {
+        UNI_DEBUG_LOG("schedule exit begin\n");
+        if (pthread_mutex_lock(&(this->taskQueueLock)) != 0) {
+            return 1;
+        }
+
+        this->stop = true;
+
+        if ((pthread_cond_broadcast(&(this->condition)) != 0) ||
+            (pthread_mutex_unlock(&(this->taskQueueLock)) != 0)) {
+            return 1;
+        }
+
+        for (int i = 0; i < this->threadNum; i++) {
+            if (pthread_join(this->threads[i], NULL) != 0) {
+                return 1;
+            }
+        }
+        UNI_DEBUG_LOG("schedule exit end\n");
+        return 0;
+    }
+
+    int enqueue(Task *task)
+    {
+        UNI_DEBUG_LOG("schedule enqueue task begin\n");
+        if (this->threadNum == 0 || task == nullptr) {
+            UNI_WARNING_LOG("schedule enqueue task failed because the task is null or the "
+                            "schedule is not initialized\n");
+            return 1;
+        }
+        if (pthread_mutex_lock(&(this->taskQueueLock)) != 0) {
+            UNI_WARNING_LOG("schedule enqueue task failed because the task queue lock can not "
+                            "be acquired\n");
+            return 1;
+        }
+        if (this->stop) {
+            pthread_mutex_unlock(&(this->taskQueueLock));
+            UNI_WARNING_LOG("schedule enqueue task failed because the schedule has ended\n");
+            return 1;
+        }
+        this->taskQueue.push(task);
+        if (pthread_cond_signal(&(this->condition)) != 0) {
+            pthread_mutex_unlock(&(this->taskQueueLock));
+            UNI_WARNING_LOG("schedule enqueue task failed because no worker can be signaled\n");
+            return 1;
+        }
+        pthread_mutex_unlock(&(this->taskQueueLock));
+        UNI_DEBUG_LOG("schedule enqueue task end\n");
+        return 0;
+    }
+
+private:
+    int threadNum;
+    pthread_mutex_t taskQueueLock;
+    std::queue<Task *> taskQueue;
+    pthread_cond_t condition;
+    pthread_t *threads;
+    int stop;
+
+    std::vector<std::string> graphPath;
+    std::map<std::string, Graph<GraphParameter>> graph;
+
+    bool useGPU;
+    DeviceInfo deviceInfo;
+    DataType precision;
+
+    int getThreadId(pthread_t tid)
+    {
+        for (int i = 0; i < this->threadNum; i++) {
+            if (this->threads[i] == tid) {
+                return i;
+            }
+        }
+        return -1;
+    }
+
+    static void *worker(void *_schedule)
+    {
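+        // Each worker binds itself to one core and owns a private copy of every
+        // graph: thread 0 adopts the shared instances prepared in init(), later
+        // threads clone() them, and when useGPU is set the last thread readies
+        // GPU instances instead.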
+        Schedule *schedule = reinterpret_cast<Schedule *>(_schedule);
+        int threadId = schedule->getThreadId(pthread_self());
+        UNI_DEBUG_LOG("worker(%d) begin\n", threadId);
+        std::map<std::string, Graph<GraphParameter>> threadPrivateGraph;
+        double timeStart = ut_time_ms();
+#ifdef _USE_WEIGHT_SHARE
+        int gpuId = -1, cpuId = -1;
+        Arch arch = MALI;
+        if (schedule->useGPU && threadId == schedule->threadNum - 1) {
+            gpuId = 0;
+            for (unsigned int i = 0; i < schedule->graphPath.size(); i++) {
+                threadPrivateGraph[schedule->graphPath[i]].init(schedule->graphPath[i]);
+                threadPrivateGraph[schedule->graphPath[i]].ready(
+                    schedule->precision, schedule->deviceInfo.affinityPolicy, gpuId);
+            }
+        }
+        if (gpuId < 0) {
+            if (schedule->deviceInfo.affinityPolicy == AFFINITY_CPU_HIGH_PERFORMANCE) {
+                cpuId = schedule->deviceInfo.cpuNum - 1 - threadId;
+            } else {
+                cpuId = threadId;
+            }
+            arch = schedule->deviceInfo.archs[cpuId];
+            if (threadId == 0) {
+                threadPrivateGraph = schedule->graph;
+                for (unsigned int i = 0; i < schedule->graphPath.size(); i++) {
+                    threadPrivateGraph[schedule->graphPath[i]].setRuntime(cpuId, arch);
+                }
+            } else {
+                for (unsigned int i = 0; i < schedule->graphPath.size(); i++) {
+                    threadPrivateGraph[schedule->graphPath[i]] =
+                        schedule->graph[schedule->graphPath[i]].clone();
+                    threadPrivateGraph[schedule->graphPath[i]].setRuntime(cpuId, arch);
+                }
+            }
+        }
+#else
+        for (unsigned int i = 0; i < schedule->graphPath.size(); i++) {
+            threadPrivateGraph[schedule->graphPath[i]].init(schedule->graphPath[i]);
+            threadPrivateGraph[schedule->graphPath[i]].ready(
+                schedule->precision, schedule->deviceInfo.affinityPolicy, -1);
+            threadPrivateGraph[schedule->graphPath[i]].setRuntime(6, ARM_A76);
+        }
+#endif
+        UNI_DEBUG_LOG("start to wait task\n");
+        double timeEnd = ut_time_ms();
+        UNI_PROFILE_INFO("graphs init", "init", timeStart * 1000, (timeEnd - timeStart) * 1000);
+        while (1) {
+            pthread_mutex_lock(&(schedule->taskQueueLock));
+            while (schedule->taskQueue.empty() && !(schedule->stop)) {
+                pthread_cond_wait(&(schedule->condition), &(schedule->taskQueueLock));
+            }
+            if (schedule->stop) {
+                break;
+            }
+
+            Task *task = nullptr;
+            if (!(schedule->taskQueue.empty())) {
+                task = schedule->taskQueue.front();
+                schedule->taskQueue.pop();
+            }
+            pthread_mutex_unlock(&(schedule->taskQueueLock));
+            if (task != nullptr) {
+                threadPrivateGraph[task->graphPath].run(task->data);
+                task->status = TASK_END;
+            }
+        }
+
+        pthread_mutex_unlock(&(schedule->taskQueueLock));
+        UNI_DEBUG_LOG("worker end\n");
+        pthread_exit(NULL);
+        return (NULL);
+    }
+};
+#endif  // FLOW_INCLUDE_SCHEDULE_H_
diff --git a/common/uni/include/sys.h b/common/uni/include/sys.h
new file mode 100644
index 00000000..097e13c7
--- /dev/null
+++ b/common/uni/include/sys.h
@@ -0,0 +1,54 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_SYS +#define _H_SYS + +#if defined(_USE_GENERAL) || defined(_USE_NEON) || defined(_USE_X86) +#define _USE_CPU +#endif +#define IS_GENERAL(arch) (arch == CPU_GENERAL) +#define IS_X86_AVX2(arch) (arch == X86_AVX2) +#define IS_ARM_V7(arch) (arch == ARM_V7) +#define IS_ARM_V8(arch) (arch == ARM_V8) +#define IS_ARM_A55(arch) (arch == ARM_A55) +#define IS_ARM_A76(arch) (arch == ARM_A76) +#define IS_ARM_LG_V8(arch) (IS_ARM_A55(arch) || IS_ARM_A76(arch)) +#define IS_ARM(arch) (IS_ARM_LG_V8(arch) || IS_ARM_V8(arch) || IS_ARM_V7(arch)) +#define IS_CPU(arch) (IS_GENERAL(arch) || IS_X86_AVX2(arch) || IS_ARM(arch)) +#define IS_MALI_GPU(arch) (arch == MALI) + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + CPU_GENERAL = 1, + MALI = 2, + ARM_V7 = 3, + ARM_V8 = 4, + ARM_A55 = 5, + ARM_A76 = 6, + X86_AVX2 = 7, +} Arch; + +typedef struct { + Arch arch; + void *archPara; +} ArchInfo; +typedef ArchInfo *ArchInfo_t; +#ifdef __cplusplus +} +#endif + +#endif diff --git a/common/uni/include/task.h b/common/uni/include/task.h new file mode 100644 index 00000000..963b7385 --- /dev/null +++ b/common/uni/include/task.h @@ -0,0 +1,130 @@ +/** + * @file + * @brief Task API Document + * + * @copyright + * @code + * Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE + * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * @endcode
+ */
+
+#ifndef UNI_INCLUDE_TASK_H_
+#define UNI_INCLUDE_TASK_H_
+
+#include <iostream>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensor.hpp"
+#include "profiling.h"
+
+/** task status */
+typedef enum TaskStatus {
+    TASK_CREATE,  ///< task is created
+    TASK_READY,   ///< task can be processed
+    TASK_RUN,     ///< task is being processed
+    TASK_END      ///< task has been finished
+} TaskStatus;
+
+class Task {
+public:
+    /**
+     * @brief Task constructor
+     *
+     * @return
+     */
+    Task()
+    {
+        this->status = TASK_CREATE;
+    }
+
+    /**
+     * @brief Task constructor
+     * @param graphPath predefined flow graph file path
+     * @param data graph input data
+     *
+     * @return
+     */
+    Task(std::string graphPath, std::map<std::string, std::shared_ptr<Tensor>> data)
+    {
+        this->set(ut_time_ms(), graphPath, data, TASK_READY);
+    }
+
+    /**
+     * @brief Task constructor
+     * @param id time series data stamp
+     * @param graphPath predefined flow graph file path
+     * @param data graph input data map
+     *
+     * @return
+     */
+    Task(int id, std::string graphPath, std::map<std::string, std::shared_ptr<Tensor>> data)
+    {
+        this->set(id, graphPath, data, TASK_READY);
+    }
+
+    /**
+     * @brief Task copy constructor
+     * @param task copy from task to generate new Task
+     *
+     * @return
+     */
+    Task(Task *task)
+    {
+        this->set(task->id, task->graphPath, task->data, task->status);
+    }
+
+    /**
+     * @brief Task set function
+     * @param id time series data stamp
+     * @param graphPath predefined flow graph file path
+     * @param data graph input data map
+     * @param status task status
+     *
+     * @return
+     */
+    void set(int id,
+        std::string graphPath,
+        std::map<std::string, std::shared_ptr<Tensor>> data,
+        TaskStatus status)
+    {
+        this->id = id;
+        this->graphPath = graphPath;
+        this->data = data;
+        this->status = status;
+    }
+
+    friend std::ostream &operator<<(std::ostream &os, const Task &task)
+    {
+        os << "Task " << task.id << "(timestamp " << task.id << ", status " << task.status
+           << ", graph " << task.graphPath << ", data " << std::endl;
+        for (auto iter : task.data) {
+            os << "tensor name " << iter.first << " " << iter.second->string(1) << std::endl;
+        }
+        os << ")";
+        return os;
+    }
+
+    /** time stamp */
+    int id;
+    /** task status */
+    TaskStatus status;
+    /** predefined flow graph file path */
+    std::string graphPath;
+    /** graph data */
+    std::map<std::string, std::shared_ptr<Tensor>> data;
+};
+#endif  // UNI_INCLUDE_TASK_H_
diff --git a/common/uni/include/tensor_desc.h b/common/uni/include/tensor_desc.h
new file mode 100644
index 00000000..c2d4ade0
--- /dev/null
+++ b/common/uni/include/tensor_desc.h
@@ -0,0 +1,516 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _H_TENSOR_DESC
+#define _H_TENSOR_DESC
+#include <limits.h>
+#include <string.h>
+#include <string>
+#include <vector>
+#include "error.h"
+
+#define UNUSED(x) (void)x
+#define UNI_MIN(a, b) (((a) < (b)) ? (a) : (b))
+#define UNI_MAX(a, b) (((a) > (b)) ? (a) : (b))
+#define UNI_ABS(a) (((a) > 0) ? (a) : (-1 * (a)))
+#define UNI_F16_MIN -65504.0f
+#define UNI_F16_MAX 65504.0f
+#define NAME_LEN 128
+#ifdef __cplusplus
+extern "C" {
+#endif
+int UNI_ISNAN(float a);
+int UNI_ISINF(float a);
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef _USE_X86
+#include <immintrin.h>
+#endif
+#if defined(_USE_NEON) || defined(_USE_MALI)
+#include <arm_neon.h>
+#ifdef __aarch64__
+typedef __fp16 F16;
+#endif
+typedef int8_t INT8;
+#else
+typedef char INT8;
+#endif
+typedef unsigned char U8;
+typedef const unsigned char CU8;
+typedef char I8;
+typedef const char CI8;
+typedef unsigned int U32;
+typedef const unsigned int CU32;
+typedef int I32;
+typedef const int CI32;
+typedef float F32;
+typedef double F64;
+typedef long I64;
+typedef unsigned char BIN8;
+
+typedef enum {
+    RGB_SC = 0,  // scale and center crop
+    RGB = 1,
+    BGR = 2,
+    RGB_RAW = 3,
+    RGB_SC_RAW = 4,
+    BGR_SC_RAW = 5
+} ImageFormat;
+
+typedef enum {
+    DT_U8 = 0,
+    DT_I8 = 1,
+    DT_U32 = 2,
+    DT_I32 = 3,
+    DT_F16 = 4,
+    DT_F16_8Q = 5,
+    DT_F32 = 6,
+    DT_BIN01 = 7,
+    DT_BIN11 = 8,
+    DT_NUM = 9
+} DataType;
+
+inline U32 bytesOf(DataType dt)
+{
+    // binary data types pack 8 elements per byte, so divide the element count by 8 first
+    U32 bytes[] = {1, 1, 4, 4, 2, 2, 4, 1, 1, 8};
+    return dt < DT_NUM ? bytes[dt] : 0;
+}
+
+typedef enum {
+    DF_NCHW,
+    DF_NCHWN16,     // vectorize for N=16, for filter
+    DF_NCHWC8,      // vectorize for C=8, for input and output
+    DF_HWNCN16,     // vectorize for N=16, for filter in winograd
+    DF_NHWCN16,     // im2col + GEMM, for filter
+    DF_NHWCN8,      // vectorize for N=8, not used
+    DF_HWNCN8C4,    // int8 filter for winograd
+    DF_NCHWN8C4,    // int8 im2col + GEMM, for filter
+    DF_NCHWN8HW4,   // int8 im2col + GEMM in the first layer, for filter
+    DF_NCHWN16C8,   // bnn im2col + GEMM, for filter
+    DF_NCHWCxN32,   // x86 AVX2 direct conv, for filter
+    DF_NCHWCxN24,   // x86 AVX2 conv 1x1, for filter
+    DF_NCHWC24,     // x86 AVX2 depthwise conv, for filter
+    DF_TRANSPOSE,   // vectorize for COL_MAJOR
+    DF_NORMAL,      // vectorize for ROW_MAJOR
+    DF_MTK,         // RNN input, M: batch, T: step, K: x_dim
+    DF_MKT,         // RNN input, M: batch, T: step, K: x_dim
+    DF_NK,          // MMM/MVM filter, N: col_num, K: row_num
+    DF_NKN16,       // MMM/MVM filter, vectorized for N=16
+    DF_NKN32,       // MMM/MVM filter, vectorized for N=32
+    DF_NKN64,       // MMM/MVM filter, vectorized for N=64
+    DF_NKN32K4,     // int8 MVM filter, vectorized for N=32
+    DF_NCWHC4,      // ocl mali input and output
+    DF_NCHWC3,      // ocl mali support input rgb
+    DF_NHWC,        // ocl mali support input/output
+    DF_NCHWN4C4,    // ocl mali conv filter
+    DF_NCHWN4,      // ocl mali conv filter
+    DF_HWCN,        // ocl mali filter
+    DF_NCWHN4C4,    // ocl mali fc filter
+    DF_NHWCN4,      // ocl mali filter
+    DF_CHWNC4,      // ocl mali filter
+    DF_CHWNC8,      // ocl mali filter
+    DF_CHWNC16,     // ocl mali filter
+    DF_CHWC8_NCN8,  // fp32 dw_conv, vectorized for C8 and N8
+    DF_RGB,
+    DF_HWNCN8,  // fp32 filter for winograd
+    DF_NKN24,   // Optimized MMM filter for FP16
+#ifdef __aarch64__
+    DF_NKN12,  // Optimized MMM filter for FP32
+#else
+    DF_NKN8,  // Optimized MMM filter for FP32
+#endif
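+    // (the __aarch64__ switch above presumably picks the FP32 GEMM tiling that
+    //  fits the wider 32-register aarch64 NEON file; armv7 falls back to N=8)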
+ DF_NKN12K4, // Optimized MMM filter for INT8 + DF_NCTHW, // conv 3d +} DataFormat; + +typedef struct { + DataType dt = DT_U8; + DataFormat df; + U32 nDims = 0; + U32 dims[6] = {0}; +} TensorDesc; + +inline TensorDesc tensor5df( + DataType dt, DataFormat df, U32 num, U32 numChannels, U32 height, U32 width, U32 align) +{ + TensorDesc ret; + ret.dt = dt; + ret.df = df; + ret.nDims = 5; + ret.dims[0] = align; + ret.dims[1] = width; + ret.dims[2] = height; + ret.dims[3] = numChannels; + ret.dims[4] = num; + return ret; +} + +inline TensorDesc tensor5d(DataType dt, U32 num, U32 numChannels, U32 height, U32 width, U32 align) +{ + return tensor5df(dt, DF_NCHW, num, numChannels, height, width, align); +} + +inline TensorDesc tensor4df( + DataType dt, DataFormat df, U32 num, U32 numChannels, U32 height, U32 width) +{ + TensorDesc ret; + ret.dt = dt; + ret.df = df; + ret.nDims = 4; + ret.dims[0] = width; + ret.dims[1] = height; + ret.dims[2] = numChannels; + ret.dims[3] = num; + return ret; +} + +inline TensorDesc tensor4d(DataType dt, U32 num, U32 numChannels, U32 height, U32 width) +{ + return tensor4df(dt, DF_NCHW, num, numChannels, height, width); +} + +inline TensorDesc tensor3df(DataType dt, DataFormat df, U32 numChannels, U32 height, U32 width) +{ + TensorDesc ret = tensor4df(dt, df, 1, numChannels, height, width); + ret.nDims = 3; + return ret; +} + +inline TensorDesc tensor3d(DataType dt, U32 numChannels, U32 height, U32 width) +{ + return tensor3df(dt, DF_NCHW, numChannels, height, width); +} + +inline TensorDesc tensor2df(DataType dt, DataFormat df, U32 numRows, U32 numColumns) +{ + TensorDesc ret = tensor3df(dt, df, 1, numRows, numColumns); + ret.nDims = 2; + return ret; +} + +inline TensorDesc tensor2d(DataType dt, U32 numRows, U32 numColumns) +{ + TensorDesc ret = tensor3d(dt, 1, numRows, numColumns); + ret.nDims = 2; + return ret; +} + +inline TensorDesc tensor1d(DataType dt, U32 len) +{ + TensorDesc ret = tensor2d(dt, 1, len); + ret.nDims = 1; + return ret; +} + +inline EE tensor1dGet(TensorDesc desc, DataType *dt, DataFormat *df, U32 *len) +{ + if (nullptr == len || nullptr == dt || nullptr == df) { + return NULL_POINTER; + } + if (1 != desc.nDims) { + return NOT_MATCH; + } + + *df = desc.df; + *dt = desc.dt; + *len = desc.dims[0]; + return SUCCESS; +} + +inline EE tensor2dGet(TensorDesc desc, DataType *dt, DataFormat *df, U32 *numRows, U32 *numColumns) +{ + if (nullptr == numColumns || nullptr == numRows || nullptr == dt || nullptr == df) { + return NULL_POINTER; + } + if (2 != desc.nDims) { + return NOT_MATCH; + } + + *df = desc.df; + *dt = desc.dt; + *numColumns = desc.dims[0]; + *numRows = desc.dims[1]; + return SUCCESS; +} + +inline EE tensor3dGet( + TensorDesc desc, DataType *dt, DataFormat *df, U32 *numChannels, U32 *height, U32 *width) +{ + if (nullptr == numChannels || nullptr == height || nullptr == width || nullptr == dt || + nullptr == df) { + return NULL_POINTER; + } + if (3 != desc.nDims) { + return NOT_MATCH; + } + + *dt = desc.dt; + *df = desc.df; + *width = desc.dims[0]; + *height = desc.dims[1]; + *numChannels = desc.dims[2]; + return SUCCESS; +} + +inline EE tensor4dGet( + TensorDesc desc, DataType *dt, DataFormat *df, U32 *num, U32 *numChannels, U32 *height, U32 *width) +{ + if (nullptr == num || nullptr == numChannels || nullptr == height || nullptr == width || + nullptr == dt || nullptr == df) { + return NULL_POINTER; + } + if (4 != desc.nDims) { + return NOT_MATCH; + } + + *dt = desc.dt; + *df = desc.df; + *width = desc.dims[0]; + *height = desc.dims[1]; + 
*numChannels = desc.dims[2];
+    *num = desc.dims[3];
+    return SUCCESS;
+}
+
+inline EE tensorSelectGet(TensorDesc desc,
+    DataType *dt,
+    DataFormat *df,
+    U32 *num,
+    U32 *numChannels,
+    U32 *height,
+    U32 *width,
+    U32 *time = NULL)
+{
+    U32 ndims = desc.nDims;
+    if (dt) {
+        *dt = desc.dt;
+    }
+    if (df) {
+        *df = desc.df;
+    }
+    if (time && ndims < 5) {
+        *time = 1;
+    }
+    if (desc.df == DF_MKT) {
+        if (num) {
+            *num = desc.dims[2];
+        }
+        if (numChannels) {
+            *numChannels = desc.dims[1];
+        }
+        if (height) {
+            *height = desc.dims[0];
+        }
+        if (width) {
+            *width = 1;
+        }
+    } else if (desc.df == DF_MTK) {
+        if (num) {
+            *num = desc.dims[2];
+        }
+        if (numChannels) {
+            *numChannels = desc.dims[0];
+        }
+        if (height) {
+            *height = desc.dims[1];
+        }
+        if (width) {
+            *width = 1;
+        }
+    } else if (desc.df == DF_NCTHW) {
+        if (width) {
+            *width = desc.dims[0];
+        }
+        if (height) {
+            *height = desc.dims[1];
+        }
+        if (time) {
+            *time = desc.dims[2];
+        }
+        if (numChannels) {
+            *numChannels = desc.dims[3];
+        }
+        if (num) {
+            *num = desc.dims[4];
+        }
+    } else {
+        if (width) {
+            *width = desc.dims[0];
+        }
+        if (height) {
+            *height = (ndims > 1) ? desc.dims[1] : 1;
+        }
+        if (numChannels) {
+            *numChannels = (ndims > 2) ? desc.dims[2] : 1;
+        }
+        if (num) {
+            *num = (ndims > 3) ? desc.dims[3] : 1;
+        }
+    }
+    return SUCCESS;
+}
+
+inline U32 tensorNumElements(TensorDesc desc)
+{
+    if (desc.nDims == 0) {
+        return 0;
+    }
+    U32 ret = 1;
+    for (U32 i = 0; i < desc.nDims; i++) {
+        ret *= desc.dims[i];
+    }
+    return ret;
+}
+
+inline U32 tensorNumBytes(TensorDesc desc)
+{
+    if (desc.dt == DT_BIN01 || desc.dt == DT_BIN11) {
+        return tensorNumElements(desc) / 8;
+    } else {
+        return tensorNumElements(desc) * bytesOf(desc.dt);
+    }
+}
+
+inline U8 tensorIs1d(TensorDesc desc)
+{
+    return 1 == desc.nDims;
+}
+
+inline U8 tensorIs2d(TensorDesc desc)
+{
+    return 2 == desc.nDims;
+}
+
+inline U8 tensorIs3d(TensorDesc desc)
+{
+    return 3 == desc.nDims;
+}
+
+inline U8 tensorIs4d(TensorDesc desc)
+{
+    return 4 == desc.nDims;
+}
+
+inline std::string tensorDesc2Str(TensorDesc desc)
+{
+    std::string descStr = "dt:" + std::to_string(desc.dt) + " df:" + std::to_string(desc.df) +
+        " dims:" + std::to_string(desc.nDims);
+
+    if (desc.nDims > 0) {
+        descStr += "(";
+    }
+    for (I32 i = int(desc.nDims) - 1; i >= 0; i--) {
+        descStr += std::to_string(desc.dims[i]);
+        if (i > 0) {
+            descStr += ",";
+        } else {
+            descStr += ")";
+        }
+    }
+
+    return descStr;
+}
+
+inline int tensorDescIsValid(TensorDesc desc)
+{
+    if (desc.dt < 0 || desc.dt >= 10) {
+        return 0;
+    }
+
+    if (desc.df < 0 || desc.df >= 30) {
+        return 0;
+    }
+
+    if (desc.nDims > 6) {
+        return 0;
+    }
+
+    for (U32 i = 0; i < desc.nDims; i++) {
+        if (desc.dims[i] > INT_MAX) {
+            return 0;
+        }
+    }
+
+    return 1;
+}
+
+inline DataFormat getTensorDefaultDataFormat(int nDims)
+{
+    DataFormat df = DF_NORMAL;
+    switch (nDims) {
+        case 2:
+            df = DF_NORMAL;
+            break;
+        case 3:
+            df = DF_MTK;
+            break;
+        case 4:
+            df = DF_NCHW;
+            break;
+        default:
+            break;
+    }
+    return df;
+}
+
+inline std::vector<U32> calculateLocalIndex(U32 index, U32 *dims, U32 nDims)
+{
+    std::vector<U32> indexes(nDims);
+    for (U32 i = 0; i < nDims; i++) {
+        indexes[i] = index % dims[i];
+        index /= dims[i];
+    }
+    return indexes;
+}
+
+inline U32 calculateGlobalIndex(U32 *indexes, U32 *dims, U32 nDims)
+{
+    U32 index = 0;
+    for (int i = ((int)nDims) - 1; i >= 0; i--) {
+        index = index * dims[i] + indexes[i];
+    }
+    return index;
+}
+
+void UNI_memcpy(void *dst, const void *src, int size);
+
+void UNI_init(U32 num, DataType dt, F32 val, void
*dst); + +EE array_transpose(DataType dt, + U32 *inputDims, + const void *input, + U32 *outputDims, + void *output, + U32 *transposeDims, + int dimsNum); + +void transformFromFloat(DataType dataType, float *src, void *dst, int num, float scale = 1.0); + +void transformToFloat(DataType dataType, void *src, float *dst, int num, float scale = 1.0); + +EE transformToNCHW(TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output); + +EE transformToNHWC(TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output); + +EE transformNCHWToNCHWC8( + TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output); + +EE transformNCHWC8ToNCHWC8ByGroup( + TensorDesc inputDesc, const void *input, int group, TensorDesc outputDesc, void *output); + +EE transposeFilter(TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output); +#endif diff --git a/common/uni/include/thread_affinity.h b/common/uni/include/thread_affinity.h new file mode 100644 index 00000000..c133f2a0 --- /dev/null +++ b/common/uni/include/thread_affinity.h @@ -0,0 +1,535 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
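+// A minimal usage sketch of the affinity helpers below (the worker thread id
+// is illustrative): sample the CPUs once, then let each worker bind itself and
+// read back the arch it should dispatch kernels for:
+//   DeviceInfo info = get_cpu_info(AFFINITY_CPU_HIGH_PERFORMANCE);
+//   set_cpu_dynamic(&info, 0);  // re-samples load, sorts cores, binds thread 0
+//   Arch arch = info.schedule;  // kernels should target this core's arch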
+
+#ifndef _H_THREAD_AFFINITY
+#define _H_THREAD_AFFINITY
+
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include "sys.h"
+#include "error.h"
+#include "tensor_desc.h"
+
+#ifdef _USE_X86
+#define __cpuid(data, eaxIn, ecxIn)                                           \
+    __asm__ __volatile__("cpuid\n"                                            \
+                         : "=a"(data[0]), "=b"(data[1]), "=c"(data[2]), "=d"(data[3]) \
+                         : "0"(eaxIn), "2"(ecxIn))
+#endif
+
+const int CPU_MAX_NUMBER = 64;
+#ifdef _USE_OPENMP
+const int OMP_NUM_THREADS = 2;
+#else
+const int OMP_NUM_THREADS = 1;
+#endif
+
+typedef enum {
+    AFFINITY_CPU_LOW_POWER = 0,
+    AFFINITY_CPU_HIGH_PERFORMANCE = 1,
+    AFFINITY_GPU = 2
+} AffinityPolicy;
+
+typedef struct CpuStat {
+    unsigned long idle;
+    unsigned long total;
+} CpuStat;
+
+typedef struct DeviceInfo {
+    int cpuNum;
+    Arch archs[CPU_MAX_NUMBER];
+    long freqs[CPU_MAX_NUMBER];
+    float occupys[CPU_MAX_NUMBER];
+    int cpuids[CPU_MAX_NUMBER];
+    CpuStat cpuStats[CPU_MAX_NUMBER];
+
+    float maxOccupy;
+    AffinityPolicy affinityPolicy;
+    Arch schedule;
+} DeviceInfo;
+
+inline const char *const *AffinityPolicyNames()
+{
+    static const char *const names[] = {
+        "CPU_AFFINITY_LOW_POWER", "CPU_AFFINITY_HIGH_PERFORMANCE", "GPU"};
+    return names;
+}
+
+inline const AffinityPolicy *AffinityPolicies()
+{
+    static const AffinityPolicy policies[] = {
+        AFFINITY_CPU_LOW_POWER, AFFINITY_CPU_HIGH_PERFORMANCE, AFFINITY_GPU};
+    return policies;
+}
+
+inline int get_cpus_num()
+{
+#ifdef _USE_IOS
+    return 6;
+#else
+    const int bufferSize = 1024;
+    char buffer[bufferSize];
+    FILE *fp = fopen("/proc/cpuinfo", "rb");
+    if (!fp) {
+        return 1;
+    }
+
+    int cpuNum = 0;
+    while (!feof(fp)) {
+        char *status = fgets(buffer, bufferSize, fp);
+        if (!status) {
+            break;
+        }
+
+        if (memcmp(buffer, "processor", 9) == 0) {
+            cpuNum++;
+        }
+    }
+    fclose(fp);
+    if (cpuNum > CPU_MAX_NUMBER) {
+        cpuNum = CPU_MAX_NUMBER;
+    }
+    return cpuNum;
+#endif
+}
+
+inline void get_cpus_arch(Arch *archs, int cpuNum)
+{
+#ifdef _USE_IOS
+    for (int cpuid = 0; cpuid < cpuNum; cpuid++) {
+        archs[cpuid] = ARM_A76;
+    }
+    return;
+#endif
+    FILE *fp = fopen("/proc/cpuinfo", "rb");
+    *archs = CPU_GENERAL;
+    if (!fp) {
+        return;
+    }
+
+#if defined(_USE_FP32) && defined(_USE_X86)
+    U32 data[4] = {};
+    const U32 &ebx = data[1];
+    const U32 &ecx = data[2];
+
+    const U32 osxsave = 1U << 0;
+    const U32 avx = 1U << 1;
+    const U32 avx2 = 1U << 2;
+
+    U32 cpuArch = 0;
+    __cpuid(data, 0, 0);
+    __cpuid(data, 1, 0);
+    if (ecx & (1U << 27)) {
+        cpuArch |= osxsave;
+    }
+    if (cpuArch & osxsave) {
+        if (ecx & (1U << 28)) {
+            cpuArch |= avx;
+        }
+    }
+    __cpuid(data, 7, 0);
+    if ((cpuArch & avx) && (ebx & (1U << 5))) {
+        cpuArch |= avx2;
+    }
+
+    if (cpuArch & avx2) {
+        archs[0] = X86_AVX2;
+    } else {
+        UNI_WARNING_LOG("AVX2 is not available, use general implementation.\n");
+    }
+#endif
+
+    int cpuid = 0;
+#ifdef _USE_NEON
+    const int bufferSize = 1024;
+    char buffer[bufferSize];
+    while (!feof(fp)) {
+        char *status = fgets(buffer, bufferSize, fp);
+        if (!status) {
+            break;
+        }
+
+        if (memcmp(buffer, "CPU part", 8) == 0) {
+            Arch arch = ARM_V8;
+            int id = 0;
+            sscanf(buffer, "CPU part\t: %x", &id);
+            switch (id) {
+                case 0xc07:
+                    arch = ARM_V7;
+                    break;
+                case 0xc0f:
+                    arch = ARM_V7;
+                    break;
+                case 0xd01:
+                    arch = ARM_A76;
+                    break;
+                case 0xd03:
+                    arch = ARM_V8;
+                    break;
+                case 0xd05:
+                    arch = ARM_A55;
+                    break;
+                case 0xd07:
+                    arch = ARM_V8;
+                    break;
+                case 0xd08:
+                    arch = ARM_V8;
+                    break;
+                case 0xd09:
+                    arch = ARM_V8;
+                    break;
+                case 0xd0a:
+                    arch = ARM_A76;
+                    break;
+                case 0xd0b:
+                    arch = ARM_A76;
+                    break;
+                case 0xd0d:
+                    arch = ARM_A76;
+                    break;
+                case
0xd40: + arch = ARM_A76; + break; + case 0xd41: + arch = ARM_A76; + break; + case 0xd44: + arch = ARM_A76; + break; + case 0x804: + arch = ARM_A76; + break; + case 0x805: + arch = ARM_A55; + break; + case 0x802: + arch = ARM_A76; + break; + case 0x803: + arch = ARM_A55; + break; + case 0x801: + arch = ARM_V8; + break; + case 0x800: + arch = ARM_V8; + break; + case 0x205: + arch = ARM_V8; + break; + default: + UNI_WARNING_LOG("unknown CPU %d arch %x, set to ARM_V8\n", cpuid, id); + break; + } + archs[cpuid++] = arch; + } + } +#endif + for (; cpuid < cpuNum; cpuid++) { + archs[cpuid] = archs[0]; + } + fclose(fp); +} + +inline long get_cpu_freq(int cpuid) +{ + char path[256]; + FILE *fp = NULL; + if (fp == NULL) { + snprintf( + path, sizeof(path), "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid); + fp = fopen(path, "rb"); + } + if (fp == NULL) { + snprintf( + path, sizeof(path), "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state", cpuid); + fp = fopen(path, "rb"); + } + if (fp == NULL) { + snprintf( + path, sizeof(path), "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid); + fp = fopen(path, "rb"); + } + + long maxFrequency = -1; + if (fp == NULL) { + printf("[WARNING] can not get CPU max frequency\n"); + } else { + fscanf(fp, "%ld", &maxFrequency); + fclose(fp); + } + return maxFrequency; +} + +inline void get_cpus_freq(long *freqs, int cpuNum) +{ + for (int i = 0; i < cpuNum; i++) { + freqs[i] = get_cpu_freq(i); + } +} + +inline void get_cpus_occupy(CpuStat *cpuStat, float *cpuOccupy, int cpuNum) +{ + const int bufferSize = 1024; + char buffer[bufferSize]; + char name[32]; + unsigned long user, nice, system, idle, iowait, irq, softirq, total; + FILE *fp = fopen("/proc/stat", "rb"); + if (!fp) { + for (int i = 0; i < cpuNum; i++) { + cpuOccupy[i] = 0; + } + return; + } + + // skip total statistics + fgets(buffer, bufferSize, fp); + + for (int i = 0; i < cpuNum; i++) { + fgets(buffer, bufferSize, fp); + sscanf(buffer, "%s %lu %lu %lu %lu %lu %lu %lu", name, &user, &nice, &system, &idle, + &iowait, &irq, &softirq); + total = user + nice + system + idle + iowait + irq + softirq; + cpuOccupy[i] = 0; + if (cpuStat[i].total != 0) { + float idleTime = idle - cpuStat[i].idle; + float totalTime = total - cpuStat[i].total; + if (totalTime != 0) { + cpuOccupy[i] = 1.0 - idleTime / totalTime; + } + } + cpuStat[i].idle = idle; + cpuStat[i].total = total; + } + fclose(fp); +} + +inline void swap_variable(void *a, void *b, const int size) +{ + char buffer[size]; + memcpy(buffer, a, size); + memcpy(a, b, size); + memcpy(b, buffer, size); +} + +inline void disable_cpus(float *occupys, int *cpuids, int cpuNum, float cpuOccupyMax) +{ + for (int i = 0; i < cpuNum; i++) { + if (occupys[i] > cpuOccupyMax) { + cpuids[i] = -1; + } + } +} + +inline void sort_cpus_by_arch_freq_occupy( + Arch *archs, long *freqs, float *occupys, int *cpuids, int cpuNum, float cpuOccupyMax) +{ + for (int i = 0; i < cpuNum; i++) { + cpuids[i] = i; + } + + for (int i = 1; i < cpuNum; i++) { + for (int j = i - 1; j >= 0; j--) { + if (archs[j + 1] < archs[j]) { + swap_variable(&archs[j], &archs[j + 1], sizeof(Arch)); + swap_variable(&freqs[j], &freqs[j + 1], sizeof(long)); + swap_variable(&cpuids[j], &cpuids[j + 1], sizeof(int)); + swap_variable(&occupys[j], &occupys[j + 1], sizeof(float)); + continue; + } + if (archs[j + 1] == archs[j]) { + if (freqs[j + 1] < freqs[j]) { + swap_variable(&archs[j], &archs[j + 1], sizeof(Arch)); + swap_variable(&freqs[j], &freqs[j + 1], sizeof(long)); + 
swap_variable(&cpuids[j], &cpuids[j + 1], sizeof(int)); + swap_variable(&occupys[j], &occupys[j + 1], sizeof(float)); + continue; + } + if (freqs[j + 1] >= freqs[j]) { + continue; + } + } + if (archs[j + 1] > archs[j]) { + continue; + } + } + } + disable_cpus(occupys, cpuids, cpuNum, cpuOccupyMax); +} + +inline int set_thread_affinity(int threadid, const int *cpuids, int num) +{ +#ifndef _USE_IOS +#ifdef __GLIBC__ + pid_t tid = syscall(SYS_gettid); +#else + pid_t tid = gettid(); +#endif + cpu_set_t mask; + CPU_ZERO(&mask); + for (int i = 0; i < num; i++) { + UNI_DEBUG_LOG("bind thread %d to core %d\n", threadid, cpuids[i]); + CPU_SET(cpuids[i], &mask); + } + int status = syscall(__NR_sched_setaffinity, tid, sizeof(mask), &mask); + if (status) { + UNI_WARNING_LOG("fail to set affinity %d\n", status); + return -1; + } +#endif + return 0; +} + +inline AffinityPolicy thread_affinity_get_policy_by_name(const char *name) +{ + int nameLength = strlen(name); + for (int i = 0; i < 3; i++) { + const char *target = AffinityPolicyNames()[i]; + int targetLength = strlen(target); + if (nameLength < targetLength) { + continue; + } + int match = 1; + for (int j = 0; j < targetLength; j++) { + if (name[j] == target[j] || name[j] == target[j] + 32) { + continue; + } else { + match = 0; + break; + } + } + if (match) { + return AffinityPolicies()[i]; + } + } + return AFFINITY_CPU_HIGH_PERFORMANCE; +} + +inline Arch thread_affinity_set_by_policy( + Arch *archs, int *cpuids, int cpuNum, AffinityPolicy policy, int threadId) +{ + if (threadId >= cpuNum) { + UNI_WARNING_LOG("can not allocate more cores for thread %d\n", threadId); + return CPU_GENERAL; + } + if (policy == AFFINITY_GPU) { + return MALI; + } +#ifndef _USE_OPENMP + int cpuid; + Arch arch; + int i = cpuNum - 1 - threadId; + switch (policy) { + case AFFINITY_CPU_LOW_POWER: { + i = threadId; + while (cpuids[i] == -1 && i < cpuNum - 1) { + i++; + } + break; + } + case AFFINITY_CPU_HIGH_PERFORMANCE: { + i = cpuNum - 1 - threadId; + while (cpuids[i] == -1 && i > 0) { + i--; + } + break; + } + default: { + break; + } + } + cpuid = cpuids[i]; + arch = archs[i]; + set_thread_affinity(threadId, &cpuid, 1); +#else + int index = 0; + for (int i = 0; i < cpuNum; i++) { + if (policy == AFFINITY_CPU_LOW_POWER && archs[index] > archs[i]) { + index = i; + } + if (policy == AFFINITY_CPU_HIGH_PERFORMANCE && archs[index] < archs[i]) { + index = i; + } + } + int count = 0; + int candidates[CPU_MAX_NUMBER]; + for (int i = 0; i < cpuNum; i++) { + if (archs[index] == archs[i]) { + candidates[count++] = i; + } + } + set_thread_affinity(threadId, candidates, count); + Arch arch = archs[index]; +#endif + return arch; +} + +inline void thread_affinity_set_by_arch( + Arch *archs, int *cpuids, int cpuNum, Arch arch, int threadId) +{ + if (threadId >= cpuNum) { + UNI_WARNING_LOG("can not allocate more cores for thread %d\n", threadId); + return; + } + if (IS_MALI_GPU(arch)) { + return; + } + int count = 0; + int cpuid = -1; + for (int i = 0; i < cpuNum; i++) { + if (archs[i] == arch && cpuids[i] != -1) { + if (count == threadId) { + cpuid = cpuids[i]; + break; + } else { + count++; + } + } + } + if (cpuid != -1) { + set_thread_affinity(threadId, &cpuid, 1); + } else { + UNI_WARNING_LOG("there is not enough %d arch cores for thread %d", arch, threadId); + } +} + +inline DeviceInfo get_cpu_info(AffinityPolicy affinityPolicy) +{ + DeviceInfo deviceInfo; + deviceInfo.affinityPolicy = affinityPolicy; + deviceInfo.cpuNum = get_cpus_num(); + deviceInfo.maxOccupy = 0.5; + 
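+    // Occupancy needs two /proc/stat samples: zeroing the totals below makes the
+    // first get_cpus_occupy() call act purely as the baseline sample.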
get_cpus_arch(deviceInfo.archs, deviceInfo.cpuNum); + get_cpus_freq(deviceInfo.freqs, deviceInfo.cpuNum); + for (int i = 0; i < deviceInfo.cpuNum; i++) { + deviceInfo.cpuStats[i].total = 0; + } + get_cpus_occupy(deviceInfo.cpuStats, deviceInfo.occupys, deviceInfo.cpuNum); + return deviceInfo; +} + +inline void set_cpu_dynamic(DeviceInfo *deviceInfo, int threadId) +{ + if (deviceInfo->affinityPolicy == AFFINITY_GPU) { + deviceInfo->schedule = MALI; + return; + } + get_cpus_occupy(deviceInfo->cpuStats, deviceInfo->occupys, deviceInfo->cpuNum); + sort_cpus_by_arch_freq_occupy(deviceInfo->archs, deviceInfo->freqs, deviceInfo->occupys, + deviceInfo->cpuids, deviceInfo->cpuNum, deviceInfo->maxOccupy); + deviceInfo->schedule = thread_affinity_set_by_policy(deviceInfo->archs, deviceInfo->cpuids, + deviceInfo->cpuNum, deviceInfo->affinityPolicy, threadId); +} +#endif diff --git a/common/uni/include/types.h b/common/uni/include/types.h new file mode 100644 index 00000000..6ae1a128 --- /dev/null +++ b/common/uni/include/types.h @@ -0,0 +1,618 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
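+// A small sketch of how the parameter specs below are typically filled for a
+// 3x3 convolution (all values are illustrative only):
+//   ConvolutionParamSpec p;
+//   memset(&p, 0, sizeof(p));
+//   p.num_outputs = 64;
+//   p.kernel_h = p.kernel_w = 3;
+//   p.stride_h = p.stride_w = 1;
+//   p.padding_top = p.padding_bottom = p.padding_left = p.padding_right = 1;
+//   p.convolution_type = Convolution_Pointwise;
+//   p.dw_activation_type = p.pw_activation_type = ACTIVATION_NULL;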
+
+#ifndef _H_TYPES
+#define _H_TYPES
+
+#include <math.h>
+#include "tensor_desc.h"
+#include "op_type.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static const int sg_boltVersion = 20201120;
+static const int sg_magicNumber = 1141119;
+
+typedef enum { POOLING_MAX, POOLING_MEAN } PoolingMode;
+
+typedef enum { CEIL, FLOOR } RoundMode;
+
+typedef enum {
+    ELTWISE_SUM,
+    ELTWISE_MAX,
+    ELTWISE_MIN,
+    ELTWISE_PROD,
+    ELTWISE_SUB,
+    ELTWISE_DIV,
+    ELTWISE_SQRT,
+    ELTWISE_ERF
+} EltwiseMode;
+
+typedef enum {
+    ACTIVATION_NULL,
+    ACTIVATION_RELU,
+    ACTIVATION_RELU6,
+    ACTIVATION_H_SWISH,
+    ACTIVATION_H_SIGMOID,
+    ACTIVATION_SIGMOID,
+    ACTIVATION_TANH,
+    ACTIVATION_GELU,
+    ACTIVATION_MISH,
+    ACTIVATION_GREATER
+} ActivationMode;
+
+typedef enum { BSliceApply_NULL, BSliceApply_CONV } BilateralSliceApplyMode;
+
+typedef enum {
+    Convolution_Pointwise,
+    Convolution_Dilation,
+    Convolution_Depthwise,
+    Convolution_Depthwise_Pointwise,
+    Convolution_Deconvolution,
+    Convolution_Depthwise_Deconvolution
+} ConvolutionMode;
+
+typedef enum { Pad_Constant, Pad_Reflect, Pad_Edge, Pad_Symmetric } PadMode;
+
+typedef enum { CHECK_EQUAL, CHECK_GREATEQUAL, CHECK_GREAT } CheckMode;
+
+typedef enum {
+    REDUCTION_SUM,
+    REDUCTION_MEAN,
+    REDUCTION_STD_DEVIATION,
+    REDUCTION_SCALAR_PRODUCT
+} ReductionMode;
+
+typedef enum { KeepPrecision, ToFloat, ToInt } CastPrecisionMode;
+
+typedef enum { F32_to_F32, F32_to_F16, F32_to_I8 } DataConvertType;
+
+typedef enum { RNN_RNN, RNN_LSTM, RNN_GRU, RNN_GRU_LBR } RNNMode;
+
+#pragma pack(8)
+typedef struct {
+    ActivationMode mode;
+    float value[4] = {0, 0, 0, 0};
+} ActivationParamSpec;
+
+typedef struct {
+    bool propagate_down;
+} PReLUParamSpec;
+
+typedef enum {
+    CONVOLUTION_NO_TMP_MEM,
+    CONVOLUTION_FASTEST,
+    CONVOLUTION_TUNNING,
+    CONVOLUTION_LIBRARY_SEARCH,
+} ConvolutionPolicy;
+
+typedef enum {
+    CONVOLUTION_ALGORITHM_POINTWISE,
+    CONVOLUTION_ALGORITHM_DIRECT,
+    CONVOLUTION_ALGORITHM_IM2COL_GEMM,
+    CONVOLUTION_ALGORITHM_GEMM,
+    CONVOLUTION_ALGORITHM_GEMM_ICNCHW,
+    CONVOLUTION_ALGORITHM_WINOGRAD,
+    CONVOLUTION_ALGORITHM_BNN,
+    CONVOLUTION_ALGORITHM_DIRECT_SPE_CK,
+    CONVOLUTION_ALGORITHM_GROUP_DECONV,
+    CONVOLUTION_ALGORITHM_NULL
+} ConvolutionForwardAlgorithm;
+
+typedef struct {
+    F32 xmin;
+    F32 ymin;
+    F32 xmax;
+    F32 ymax;
+    U32 label;
+} BoxRect;
+
+typedef struct {
+    U32 label;
+    I64 box_index;
+} BoxInfo;
+
+typedef struct {
+    U32 max_output_boxes_per_class;
+    F32 iou_threshold;
+    F32 score_threshold;
+} NonMaxSuppressionParamSpec;
+
+typedef struct {
+    U32 output_h;
+    U32 output_w;
+    U32 sampling_ratio;
+    F32 spatial_scale;
+} RoiAlignParamSpec;
+
+typedef enum {
+    DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT,
+    DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT,
+    DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT_NO_PADDING,
+    DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_3X3S1P1,
+    DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_GEMM,
+    DEPTHWISE_CONVOLUTION_ALGORITHM_NULL
+} DepthwiseConvolutionForwardAlgorithm;
+
+typedef struct {
+    char mode[NAME_LEN];
+    U32 sizes[2];
+    float scales[4];
+    U32 num_sizes;
+    U32 num_scales;
+} ResizeParamSpec;
+
+typedef struct {
+    int gather_axis;
+} GatherParamSpec;
+
+typedef struct {
+    int axes[8];
+    int axes_num;
+} SqueezeParamSpec;
+
+typedef struct {
+    int axes[8];
+    int axes_num;
+} UnsqueezeParamSpec;
+
+typedef struct {
+    CastPrecisionMode castPrecision;
+} CastParamSpec;
+
+typedef struct {
+    int axis;
+    int num_concat;
+} ScaleParamSpec;
+
+typedef struct {
+    float neg_slope;
+} ReLUParamSpec;
+
+typedef struct {
+    float
coeff_values[8]; + int coeff_size; +} EltwiseSumSpec; + +typedef struct { + float min; + float max; +} ClipParamSpec; + +typedef union { + ReLUParamSpec relu_spec; + ClipParamSpec clip_spec; +} ActivationSpec; + +typedef struct { + EltwiseMode elt_mode; + EltwiseSumSpec elt_sum_spec; + ActivationMode activation_type; + ActivationSpec activation_spec; +} EltwiseParamSpec; + +typedef struct { + U32 num_outputs; + U32 kernel_t; + U32 kernel_h; + U32 kernel_w; + U32 stride_t; + U32 stride_h; + U32 stride_w; + U32 padding_before; + U32 padding_after; + U32 padding_top; + U32 padding_bottom; + U32 padding_left; + U32 padding_right; + U32 group; + U32 dilatedRate_t; + U32 dilatedRate_h; + U32 dilatedRate_w; + U32 num_outputs_origin; + ConvolutionMode convolution_type; + ActivationMode dw_activation_type; + ActivationMode pw_activation_type; + ActivationSpec activation_spec; +} ConvolutionParamSpec; + +typedef struct { + U32 kernel_t; + U32 kernel_h; + U32 kernel_w; + U32 stride_t; + U32 stride_h; + U32 stride_w; + U32 padding_before; + U32 padding_after; + U32 padding_top; + U32 padding_bottom; + U32 padding_left; + U32 padding_right; + RoundMode rm; + PoolingMode mode; +} PoolingParamSpec; + +typedef struct { + U32 num_outputs; + U32 num_slices; + I32 slice_point[32]; +} FullyConnectedParamSpec; + +typedef struct { + int axis; + F32 eps; + F32 gama; + F32 momentum; +} BatchNormParamSpec; + +typedef struct { + U32 before; + U32 after; + U32 top; + U32 bottom; + U32 left; + U32 right; + F32 constant_value; + PadMode pad_mode; +} PadParamSpec; + +typedef struct { + U32 input_dim; + U32 num_output; + bool bias_term; + bool transpose; + int axis; +} EmbedParamSpec; + +typedef struct { + float scale; + float shift; + float power; +} PowerParamSpec; + +typedef struct { + I32 shape_dims[8]; + I32 shape_size; + I32 axis; + I32 num_axes; +} ReshapeParamSpec; + +typedef struct { + I32 slice_points[8]; + U32 slice_size; + I32 axis; +} SliceParamSpec; + +typedef struct { + U32 trans_dims[8]; + U32 trans_size; +} TransposeParamSpec; + +typedef struct { + U32 num_heads; + U32 from_sequence_length; + U32 to_sequence_length; +} AttentionParamSpec; + +typedef struct { + RNNMode mode; + U32 numOutput; + I32 steps; + I32 numProjection; + float zoneoutCell; + float zoneoutOutput; + + bool biDirection; + float forgetBias; + ActivationMode activationMode; +} RNNParamSpec; + +typedef struct { + U32 coefficient_len; + BilateralSliceApplyMode mode; + bool has_offset; +} BilateralSliceApplyParamSpec; + +typedef struct { + I32 axes[8]; + I32 axes_num; + ReductionMode reduction_mode; + float coeff; + bool keep_dim; +} ReductionParamSpec; + +typedef struct { + I32 axis; +} ArgMaxParamSpec; + +typedef struct { + I32 src_dims[3]; + I32 dst_dims[3]; + I32 length; +} CopyParamSpec; + +typedef struct { + CheckMode check_mode; +} CheckParamSpec; + +typedef struct { + int loops; + int axis; +} RepeatParamSpec; + +typedef struct { + TensorDesc desc; +} PreAllocatedMemoryParamSpec; + +typedef struct { + TensorDesc desc; +} SharedWeightParamSpec; + +typedef struct { + bool transpose_a; + bool transpose_b; +} MatMulParamSpec; + +typedef struct { + int attention_length; + float mask; + bool same_length; +} AttentionMaskParamSpec; + +typedef struct { + int axis; + int shift_length; +} RelativeShiftParamSpec; + +typedef struct { + int axis; + int num_concat; +} ConcatParamSpec; + +typedef struct { + int axis; +} SoftmaxParamSpec; + +typedef struct { + int begin[8]; + int end[8]; + int strides[8]; + char begin_mask[8]; + char end_mask[8]; 
+ char ellipsis_mask[8]; + char new_axis_mask[8]; + char shrink_axis_mask[8]; + U32 dim_size; +} TfSliceParamSpec; + +typedef struct { + F32 min_sizes[2]; + F32 max_sizes[2]; + F32 aspect_ratios[2]; + U32 flip; + U32 clip; + F32 variances[4]; + U32 image_h; + U32 image_w; + F32 step_h; + F32 step_w; + F32 offset; +} PriorBoxParamSpec; + +typedef struct { + U32 num_class; + F32 nms_threshold; + U32 nms_top_k; + U32 keep_top_k; + F32 confidence_threshold; +} DetectionOutputParamSpec; + +typedef struct { + U32 num_class; + U32 num_box; + F32 confidence_threshold; + F32 nms_threshold; + F32 biases[18]; + U32 anchors_scale[3]; + U32 mask_group_num; + U32 mask[9]; +} Yolov3DetectionOutputParamSpec; + +typedef struct { + char symmetric[NAME_LEN]; + int group; + int channel_before; + int channel_after; +} ChannelResizeParamSpec; + +typedef struct { + int blockSize; +} Space2DepthParamSpec; + +typedef struct { + int blockSize; + I8 reMode[8]; +} Depth2SpaceParamSpec; + +typedef struct { + int repeatsInfo[8]; + int dimsSize; + int axis; +} TileParamSpec; + +typedef struct { + U32 numIndices; + int outputDim; +} SpliceParamSpec; + +typedef struct { + FullyConnectedParamSpec fc_desc[6]; + PowerParamSpec power_spec; + bool eltwiseWithLayerNormIn[2]; + ActivationMode actiMode; + ReshapeParamSpec reshapeDesc[4]; + EltwiseParamSpec eltwiseDesc[2]; +} MultiheadAttentionParamSpec; + +typedef union ParameterSpec { + ParameterSpec() + {} + ConvolutionParamSpec conv_spec; + FullyConnectedParamSpec fc_spec; + RNNParamSpec rnn_spec; + MatMulParamSpec matmul_spec; + ResizeParamSpec resize_spec; + BilateralSliceApplyParamSpec bilateral_slice_apply_spec; + PoolingParamSpec pooling_spec; + ScaleParamSpec scale_spec; + BatchNormParamSpec bn_spec; + ReductionParamSpec reduction_spec; + ArgMaxParamSpec argmax_spec; + SoftmaxParamSpec softmax_spec; + ClipParamSpec clip_spec; + PowerParamSpec power_spec; + ReLUParamSpec relu_spec; + GatherParamSpec gather_spec; + EmbedParamSpec embed_spec; + PadParamSpec pad_spec; + EltwiseParamSpec eltwise_spec; + ConcatParamSpec concat_spec; + SliceParamSpec slice_spec; + TfSliceParamSpec tfslice_spec; + CastParamSpec cast_spec; + TransposeParamSpec transpose_spec; + ReshapeParamSpec reshape_spec; + SqueezeParamSpec squeeze_spec; + UnsqueezeParamSpec unsqueeze_spec; + Space2DepthParamSpec space2depth_spec; + Depth2SpaceParamSpec depth2space_spec; + ChannelResizeParamSpec channel_resize_spec; + PreAllocatedMemoryParamSpec preallocated_memory_spec; + SharedWeightParamSpec shared_weight_spec; + CopyParamSpec copy_spec; + CheckParamSpec check_spec; + RepeatParamSpec repeat_spec; + AttentionParamSpec attention_spec; + AttentionMaskParamSpec attention_mask_spec; + RelativeShiftParamSpec relative_shift_spec; + PriorBoxParamSpec prior_box_spec; + DetectionOutputParamSpec detection_output_spec; + Yolov3DetectionOutputParamSpec yolov3_detection_output_spec; + MultiheadAttentionParamSpec multiheadAttention_spec; + TileParamSpec tile_spec; + SpliceParamSpec splice_spec; +} ParameterSpec; + +typedef struct { + int num_scale; + F32 *scale; +} QuantSpec; + +typedef struct { + I8 name[NAME_LEN]; + OperatorType type; + U32 num_inputs; + I8 **input_tensors_name; + U32 num_outputs; + I8 **output_tensors_name; + I32 *tensor_positions; + U32 num_quant_feature; + QuantSpec *feature_scale; + ParameterSpec ps; +} OperatorSpec; + +typedef struct { + I8 op_name[NAME_LEN]; + DataType mdt = DT_U8; + U32 bytes_of_weight = 0; + U8 *weight; + U32 bytes_of_vec = 0; + U8 *vec; + U32 num_quant_scale; // Merged FC may 
have multiple weight scales + QuantSpec *weight_scale; +} WeightSpec; + +typedef struct { + I8 op[NAME_LEN]; + U32 num_inputs; + I8 **input_op_names; + U32 num_outputs; + I8 **output_op_names; +} OperatorRelationshipMapEntry; + +typedef struct { + I32 version; + I32 magic_number; + + I8 model_name[NAME_LEN]; + DataType dt; + + I32 num_inputs; + I8 **input_names; + TensorDesc *input_dims; + + I32 num_outputs; + I8 **output_names; + + I32 num_operator_specs; + OperatorSpec *ops; + + I32 num_weight_specs; + WeightSpec *ws; + + I32 num_op_tensor_entries; + OperatorRelationshipMapEntry *op_relationship_entries; +} ModelSpec; +#pragma pack() + +#ifdef __cplusplus +} +#endif + +OperatorSpec mt_create_operator( + const char *name, OperatorType type, U32 num_inputs, U32 num_outputs); + +EE mt_insert_operator(ModelSpec *ms, int index, OperatorSpec newOperator); + +WeightSpec mt_create_weight( + const char *name, DataType dataType, U32 bytesOfWeight, U32 bytesOfVec, U32 numQuantScale); + +bool isDeprecatedOp(OperatorType opType); + +bool isDeprecatedOpWeight(const ModelSpec *spec, int index); + +EE str_copy(I8 *dst, const I8 *src, I32 src_len, I32 dst_len = NAME_LEN); + +void *mt_new_storage(size_t size); + +inline INT8 round_towards_zero(F32 num, bool clamp = true) +{ + INT8 ret; + if (clamp) { + if (num > 127.0) { + return 127; + } else if (num < -127.0) { + return -127; + } + } + if (num > 0) { + ret = floor(num); + } else { + ret = ceil(num); + } + return ret; +} + +#endif diff --git a/uni/include/ut_util.h b/common/uni/include/ut_util.h similarity index 58% rename from uni/include/ut_util.h rename to common/uni/include/ut_util.h index 5ac8dde7..f5660668 100644 --- a/uni/include/ut_util.h +++ b/common/uni/include/ut_util.h @@ -1,17 +1,16 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _H_UT_UTIL #define _H_UT_UTIL @@ -22,27 +21,34 @@ #include #include "sys.h" -#include "type.h" +#include "types.h" #include "error.h" +#if defined(_USE_NEON) const Arch UT_ARCH = ARM_A76; +#elif defined(_USE_X86) +const Arch UT_ARCH = X86_AVX2; +#else +const Arch UT_ARCH = CPU_GENERAL; +#endif // whether to check right const int UT_CHECK = 1; -// toop times to benchmark +// loop times to benchmark const int UT_LOOPS = 6; // init data type -typedef enum UT_RANDOM_TYPE{ - UT_INIT_RANDOM, // random - UT_INIT_NEG, // random & < 0 - UT_INIT_POS, // random & > 0 - UT_INIT_ZERO // 0 +typedef enum UT_RANDOM_TYPE { + UT_INIT_RANDOM, // random + UT_INIT_NEG, // random & < 0 + UT_INIT_POS, // random & > 0 + UT_INIT_ZERO // 0 } UT_RANDOM_TYPE; // generate random data -inline F32 ut_init_s(DataType dt, UT_RANDOM_TYPE type) { +inline F32 ut_init_s(DataType dt, UT_RANDOM_TYPE type) +{ if (type == UT_INIT_ZERO) { return 0; } @@ -70,11 +76,12 @@ inline F32 ut_init_s(DataType dt, UT_RANDOM_TYPE type) { return s; } - // generate random array -inline void ut_init_v(U8* data, U32 len, DataType dt, UT_RANDOM_TYPE type) { - if (data == nullptr) +inline void ut_init_v(U8 *data, U32 len, DataType dt, UT_RANDOM_TYPE type) +{ + if (data == nullptr) { return; + } for (U32 i = 0; i < len; i++) { switch (dt) { @@ -118,33 +125,31 @@ inline void ut_init_v(U8* data, U32 len, DataType dt, UT_RANDOM_TYPE type) { break; } default: - std::cerr << "[ERROR] unsupported data type in ut_init_v" << std::endl; - exit(1); + UNI_ERROR_LOG("unsupported data type in ut_init_v\n"); } } } -inline U8* ut_input_v(U32 len, DataType dt, UT_RANDOM_TYPE type) { - U8* data = (U8*)malloc(len * bytesOf(dt)); +inline U8 *ut_input_v(U32 len, DataType dt, UT_RANDOM_TYPE type) +{ + U8 *data = (U8 *)malloc(len * bytesOf(dt)); ut_init_v(data, len, dt, type); return data; } - // unit test element check -inline void ut_check_s(F32 a, F32 b, F32 threshold, std::string file, int line) +inline void ut_check_s(F32 a, F32 b, F32 threshold, std::string file, int line, int index) { if (!((a <= b + threshold) && (a >= b - threshold))) { - std::cerr << "[ERROR] check in " << file << " at line " << line << " " \ - << a << " " << b << std::endl; - exit(1); + UNI_ERROR_LOG("check in %s at line %d, %d @ %f %f\n", file.c_str(), line, index, a, b); } } - // unit test array check -inline void ut_check_v(void *A, void *B, U32 len, DataType dt, F32 threshold, std::string file, int line) { +inline void ut_check_v( + void *A, void *B, U32 len, DataType dt, F32 threshold, std::string file, int line) +{ F32 a = 0, b = 0; for (U32 i = 0; i < len; i++) { switch (dt) { @@ -181,14 +186,14 @@ inline void ut_check_v(void *A, void *B, U32 len, DataType dt, F32 threshold, st b = ((BIN8 *)B)[i]; break; default: - std::cerr << "[ERROR] unsupported data type in ut_check_v(array, array) " << std::endl; - exit(1); + UNI_ERROR_LOG("unsupported data type in ut_check_v(array, array)\n"); } - ut_check_s(a, b, threshold, file, line); + ut_check_s(a, b, threshold, file, line, i); } } -inline void ut_check_v(void *A, F32 val, U32 len, DataType dt, std::string file, int line) { +inline void ut_check_v(void *A, F32 val, U32 len, DataType dt, std::string file, int line) +{ F32 a; for (U32 i = 0; i < len; i++) { switch (dt) { @@ 
-215,116 +220,180 @@ inline void ut_check_v(void *A, F32 val, U32 len, DataType dt, std::string file, a = ((BIN8 *)A)[i]; break; default: - std::cerr << "[ERROR] unsupported data type in ut_check_v(array, scalar) " << std::endl; - exit(1); + UNI_ERROR_LOG("unsupported data type in ut_check_v(array, scalar)\n"); } - ut_check_s(a, val, 0, file, line); + ut_check_s(a, val, 0, file, line, i); } } -inline void ut_check_a(void* A, void* B, U32 len, DataType dt) { +inline void ut_check_a(void *A, void *B, U32 len, DataType dt) +{ U32 e0, e1, e2, e3, e4, e5, e6; - e0 = 0; e1 = 0; e2 = 0; e3 = 0; e4 = 0; e5 = 0; e6 = 0; - F32 a, b, diff; + e0 = 0; + e1 = 0; + e2 = 0; + e3 = 0; + e4 = 0; + e5 = 0; + e6 = 0; + F32 a = 1, b = 0, diff; F32 d0, d1, d2, d3, d4, d5; - F32 maxrel = -1.0; - F32 maxabs = -1.0; + F32 maxrel = -1.0; + F32 maxabs = -1.0; F32 max_a0, max_b0, max_a1, max_b1; U32 max_n0, max_n1; - - - switch(dt) { -#ifdef _USE_FP16 + switch (dt) { + case DT_F32: + d0 = 1; + d1 = 0.1; + d2 = 0.01; + d3 = 0.001; + d4 = 0.0001; + d5 = 0.00001; + break; +#ifdef _USE_FP16 case DT_F16: - d0 = 1; d1 = 0.1; d2 = 0.01; d3 = 0.001; d4 = 0.0001; d5 = 0.00001; - break; + d0 = 1; + d1 = 0.1; + d2 = 0.01; + d3 = 0.001; + d4 = 0.0001; + d5 = 0.00001; + break; #endif case DT_U8: - d0 = 30; d1 = 20; d2 = 10; d3 = 5; d4 = 3; d5 = 2; - break; - + d0 = 30; + d1 = 20; + d2 = 10; + d3 = 5; + d4 = 3; + d5 = 2; + break; default: - std::cerr << "[ERROR] unsupported data type in ut_check_a(array, array) " << std::endl; - exit(1); + UNI_ERROR_LOG("unsupported data type in ut_check_a(array, array)\n"); } - for(U32 i = 0; i < len; i++) { - switch(dt) { -#ifdef _USE_FP16 + for (U32 i = 0; i < len; i++) { + switch (dt) { + case DT_F32: + a = ((F32 *)A)[i]; + b = ((F32 *)B)[i]; + break; +#ifdef _USE_FP16 case DT_F16: - a = ((F16*)A)[i]; - b = ((F16*)B)[i]; - - break; + a = ((F16 *)A)[i]; + b = ((F16 *)B)[i]; + break; #endif case DT_U8: - a = ((U8*)A)[i]; - b = ((U8*)B)[i]; - diff = a - b; + a = ((U8 *)A)[i]; + b = ((U8 *)B)[i]; break; default: break; } + + if (UNI_ISNAN((float)a) || UNI_ISINF((float)a)) { + UNI_ERROR_LOG("nan or inf value in ut_check_a of input A\n"); + return; + } + if (UNI_ISNAN((float)b) || UNI_ISINF((float)b)) { + UNI_ERROR_LOG("nan or inf value in ut_check_a of input B\n"); + return; + } + diff = a - b; - if(diff < 0) diff = -diff; - if(diff > maxabs) { + if (diff < 0) { + diff = -diff; + } + if (diff > maxabs) { maxabs = diff; max_a0 = a; max_b0 = b; max_n0 = i; } F32 tmp = diff * 2 / (a + b + 0.000001); - if(tmp > maxrel) { + if (tmp > maxrel) { maxrel = tmp; max_a1 = a; max_b1 = b; max_n1 = i; } - if(diff >= d0) {e0++; continue;} - if(diff >= d1) {e1++; continue;} - if(diff >= d2) {e2++; continue;} - if(diff >= d3) {e3++; continue;} - if(diff >= d4) {e4++; continue;} - if(diff >= d5) {e5++; continue;} + if (diff >= d0) { + e0++; + continue; + } + if (diff >= d1) { + e1++; + continue; + } + if (diff >= d2) { + e2++; + continue; + } + if (diff >= d3) { + e3++; + continue; + } + if (diff >= d4) { + e4++; + continue; + } + if (diff >= d5) { + e5++; + continue; + } e6++; } - std::cout << "abs(diff) >= " << std::scientific << d0 << " number = " << std::dec << e0 << std::endl; - std::cout << "abs(diff) >= " << std::scientific << d1 << " number = " << std::dec << e1 << std::endl; - std::cout << "abs(diff) >= " << std::scientific << d2 << " number = " << std::dec << e2 << std::endl; - std::cout << "abs(diff) >= " << std::scientific << d3 << " number = " << std::dec << e3 << std::endl; - std::cout << "abs(diff) >= 
" << std::scientific << d4 << " number = " << std::dec << e4 << std::endl; - std::cout << "abs(diff) >= " << std::scientific << d5 << " number = " << std::dec << e5 << std::endl; - std::cout << "others number = " << e6 << std::endl; - std::cout << "number " << max_n0 << " is "<< "maxabs = " << std::fixed << maxabs << " a = " << max_a0 << " b = " << max_b0 <= " << std::scientific << d0 << " number = " << std::dec << e0 + << std::endl; + std::cout << "abs(diff) >= " << std::scientific << d1 << " number = " << std::dec << e1 + << std::endl; + std::cout << "abs(diff) >= " << std::scientific << d2 << " number = " << std::dec << e2 + << std::endl; + std::cout << "abs(diff) >= " << std::scientific << d3 << " number = " << std::dec << e3 + << std::endl; + std::cout << "abs(diff) >= " << std::scientific << d4 << " number = " << std::dec << e4 + << std::endl; + std::cout << "abs(diff) >= " << std::scientific << d5 << " number = " << std::dec << e5 + << std::endl; + std::cout << "others number = " << e6 << std::endl; + std::cout << "number " << max_n0 << " is " + << "maxabs = " << std::fixed << maxabs << " a = " << max_a0 << " b = " << max_b0 + << std::endl; + std::cout << "number " << max_n1 << " is " + << "maxrel = " << std::fixed << maxrel << " a = " << max_a1 << " b = " << max_b1 + << std::endl; } // benchmark time -inline double ut_time_ms() { +inline double ut_time_ms() +{ struct timeval tv; gettimeofday(&tv, NULL); double time = tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0; return time; } -inline double ut_time_s() { +inline double ut_time_s() +{ return ut_time_ms() / 1000.0; } // calculate GFLOPS -inline double ut_gflops(double ops, double time_ms) { +inline double ut_gflops(double ops, double time_ms) +{ return 1e-6 * ops / time_ms; } // uniform log message -inline void ut_log(DataType dt, char *call, double ops, double time_ms) { - char buffer[200]; - sprintf(buffer, "%ubit, %s,\tTIME %10.6lfms,\tGFLOPS %10.6lf", - (U32)bytesOf(dt)*8, call, time_ms, - ut_gflops(ops, time_ms)); - std::cout << buffer << std::endl; +inline void ut_log(DataType dt, char *call, double ops, double time_ms) +{ + UNI_INFO_LOG("%ubit, %s,\tTIME %10.6lfms,\tGFLOPS %10.6lf\n", (U32)bytesOf(dt) * 8, call, + time_ms, ut_gflops(ops, time_ms)); } -inline void initialization_zero(void* ptr, int bytesOfNum) { +inline void initialization_zero(void *ptr, int bytesOfNum) +{ memset(ptr, 0, bytesOfNum); return; } diff --git a/common/uni/include/x86_avx2_expand.h b/common/uni/include/x86_avx2_expand.h new file mode 100644 index 00000000..880f2431 --- /dev/null +++ b/common/uni/include/x86_avx2_expand.h @@ -0,0 +1,140 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef CHEETAH_X86_AVX2_EXPAND_H +#define CHEETAH_X86_AVX2_EXPAND_H +#include +#include +#include "types.h" +#include "error.h" + +//horizontal add u32 +inline unsigned int _mm256_hadd_u32(__m256i x) +{ + __m128i low = _mm256_extracti128_si256(x, 0); + __m128i high = _mm256_extracti128_si256(x, 1); + __m128i sum = _mm_add_epi32(low, high); + int one = _mm_extract_epi32(sum, 0); + int two = _mm_extract_epi32(sum, 1); + int three = _mm_extract_epi32(sum, 2); + int four = _mm_extract_epi32(sum, 3); + + return (one + two + three + four); +} + +inline __m256 _mm256_log_ps(__m256 x) +{ + static const __m256 CONST_one = _mm256_set1_ps(1.0f); + static const __m256 CONST_two = _mm256_set1_ps(2.0f); + static const __m256 CONST_neg_one = _mm256_set1_ps(-1.0f); + F32 i = 30; + __m256 n = _mm256_set1_ps(i); + __m256 nk = _mm256_add_ps(_mm256_mul_ps(CONST_two, n), CONST_one); + x = _mm256_div_ps(_mm256_add_ps(x, CONST_neg_one), _mm256_add_ps(x, CONST_one)); + __m256 xx = _mm256_mul_ps(x, x); + __m256 y = _mm256_div_ps(CONST_one, nk); + for (; i > 0; i--) { + nk = _mm256_sub_ps(nk, CONST_two); + y = _mm256_add_ps(_mm256_div_ps(CONST_one, nk), _mm256_mul_ps(xx, y)); + } + + y = _mm256_mul_ps(CONST_two, _mm256_mul_ps(x, y)); + return y; +} + +inline __m256 _mm256_exp_ps(__m256 x) +{ + // the max and min x in exp(x) in 32-bit float range + __m256 max_upper_bound = _mm256_set1_ps(88.3762626647949f); + __m256 min_lower_bound = _mm256_set1_ps(-87.3365447504019f); + + x = _mm256_min_ps(x, max_upper_bound); + x = _mm256_max_ps(x, min_lower_bound); + + __m256 t, f, p, r; + __m256i i, j; + + const __m256 l2e = _mm256_set1_ps(1.442695041f); /* log2(e) */ + const __m256 l2h = _mm256_set1_ps(-6.93145752e-1f); /* -log(2)_hi */ + const __m256 l2l = _mm256_set1_ps(-1.42860677e-6f); /* -log(2)_lo */ + const __m256 c0 = _mm256_set1_ps(0.008301110f); + const __m256 c1 = _mm256_set1_ps(0.041906696f); + const __m256 c2 = _mm256_set1_ps(0.166674897f); + const __m256 c3 = _mm256_set1_ps(0.499990642f); + const __m256 c4 = _mm256_set1_ps(0.999999762f); + const __m256 c5 = _mm256_set1_ps(1.000000000f); + + /* exp(x) = 2^i * e^f; i = rint (log2(e) * x), f = x - log(2) * i */ + t = _mm256_mul_ps(x, l2e); /* t = log2(e) * x */ + r = _mm256_round_ps(t, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); /* r = rint (t) */ + + f = _mm256_fmadd_ps(r, l2h, x); /* x - log(2)_hi * r */ + f = _mm256_fmadd_ps(r, l2l, f); /* f = x - log(2)_hi * r - log(2)_lo * r */ + + i = _mm256_cvtps_epi32(t); /* i = (int)rint(t) */ + + /* p ~= exp (f), -log(2)/2 <= f <= log(2)/2 */ + p = c0; /* c0 */ + p = _mm256_fmadd_ps(p, f, c1); /* c0*f+c1 */ + p = _mm256_fmadd_ps(p, f, c2); /* (c0*f+c1)*f+c2 */ + p = _mm256_fmadd_ps(p, f, c3); /* ((c0*f+c1)*f+c2)*f+c3 */ + p = _mm256_fmadd_ps(p, f, c4); /* (((c0*f+c1)*f+c2)*f+c3)*f+c4 ~= exp(f) */ + p = _mm256_fmadd_ps(p, f, c5); /* (((c0*f+c1)*f+c2)*f+c3)*f+c4 ~= exp(f) */ + /* exp(x) = 2^i * p */ + j = _mm256_slli_epi32(i, 23); /* i << 23 */ + r = _mm256_castsi256_ps(_mm256_add_epi32(j, _mm256_castps_si256(p))); /* r = p * 2^i */ + + return r; +} + +inline __m256 _mm256_sigmod_ps(__m256 x) +{ + __m256 one_v = _mm256_set1_ps(1.f); + __m256 neg_one_v = _mm256_set1_ps(-1.f); + return _mm256_rcp_ps(_mm256_add_ps(_mm256_exp_ps(_mm256_mul_ps(x, neg_one_v)), 
one_v)); +} + +inline __m256 _mm256_tanh_ps(__m256 x) +{ + __m256 one_v = _mm256_set1_ps(1.f); + __m256 two_v = _mm256_set1_ps(2.f); + __m256 e_2G_v = _mm256_exp_ps(_mm256_mul_ps(two_v, x)); + __m256 result_v = _mm256_sub_ps(one_v, _mm256_div_ps(two_v, _mm256_add_ps(one_v, e_2G_v))); + return result_v; +} + +// horizontal add, sum array to f32 +inline F32 _mm256_sum_ps(__m256 x) +{ + __m128 low = _mm256_extractf128_ps(x, 0); + __m128 high = _mm256_extractf128_ps(x, 1); + __m128 sum = _mm_hadd_ps(low, high); + low = _mm_hadd_ps(sum, sum); + high = _mm_permute_ps(low, 0b01); + sum = _mm_add_ss(low, high); + return _mm_cvtss_f32(sum); +} + +// horizontal max +inline F32 _mm256_hmax_ps(__m256 x) +{ + __m128 low = _mm256_extractf128_ps(x, 0); + __m128 high = _mm256_extractf128_ps(x, 1); + __m128 max = _mm_max_ps(low, high); + high = _mm_permute_ps(max, 0b1110); + low = _mm_max_ps(max, high); + high = _mm_permute_ps(low, 0b01); + max = _mm_max_ss(low, high); + return _mm_cvtss_f32(max); +} +#endif //CHEETAH_X86_AVX2_EXPAND_H diff --git a/common/uni/src/CMakeLists.txt b/common/uni/src/CMakeLists.txt new file mode 100644 index 00000000..ef8301af --- /dev/null +++ b/common/uni/src/CMakeLists.txt @@ -0,0 +1,14 @@ +file(GLOB srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) + +# shared library +add_library(${PROJECT_NAME} SHARED ${srcs}) + +# static library +add_library(${PROJECT_NAME}_static STATIC ${srcs}) + +set_target_properties(${PROJECT_NAME}_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}") +set_target_properties(${PROJECT_NAME} PROPERTIES CLEAN_DIRECT_OUTPUT 1) +set_target_properties(${PROJECT_NAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) +install(TARGETS ${PROJECT_NAME} ${PROJECT_NAME}_static + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) diff --git a/model-tools/src/model_deserialize.cpp b/common/uni/src/model_deserialize.cpp similarity index 56% rename from model-tools/src/model_deserialize.cpp rename to common/uni/src/model_deserialize.cpp index 30c4695c..d06929ff 100644 --- a/model-tools/src/model_deserialize.cpp +++ b/common/uni/src/model_deserialize.cpp @@ -1,52 +1,70 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include -#include #include -#include -#include #include #include #include -#include "model_serialize_deserialize.hpp" -#include "model_tools.h" #include #include #include #include +#include "model_serialize_deserialize.hpp" +#include "profiling.h" -EE str_copy(I8* dst, const I8* src, I32 srcLen) { - memset(dst, 0, NAME_LEN); - I32 copyLen = NAME_LEN - 1; - if (copyLen > srcLen) - copyLen = srcLen; - memcpy(dst, src, copyLen*sizeof(I8)); - return SUCCESS; -} - -void* mt_new_storage(size_t size) +int get_operator_parameter_size(OperatorType operatorType) { - if (size == 0) { - return nullptr; + std::map operatorParameterSizeMap = {{OT_Conv, sizeof(ConvolutionParamSpec)}, + {OT_Deconvolution, sizeof(ConvolutionParamSpec)}, {OT_FC, sizeof(FullyConnectedParamSpec)}, + {OT_RNN, sizeof(RNNParamSpec)}, {OT_MatMul, sizeof(MatMulParamSpec)}, + {OT_Resize, sizeof(ResizeParamSpec)}, + {OT_BilateralSliceApply, sizeof(BilateralSliceApplyParamSpec)}, + {OT_Pooling, sizeof(PoolingParamSpec)}, {OT_Scale, sizeof(ScaleParamSpec)}, + {OT_BatchNorm, sizeof(BatchNormParamSpec)}, {OT_Reduction, sizeof(ReductionParamSpec)}, + {OT_ArgMax, sizeof(ArgMaxParamSpec)}, {OT_Softmax, sizeof(SoftmaxParamSpec)}, + {OT_Clip, sizeof(ClipParamSpec)}, {OT_Power, sizeof(PowerParamSpec)}, + {OT_Relu, sizeof(ReLUParamSpec)}, {OT_Gather, sizeof(GatherParamSpec)}, + {OT_Embedding, sizeof(EmbedParamSpec)}, {OT_Pad, sizeof(PadParamSpec)}, + {OT_Eltwise, sizeof(EltwiseParamSpec)}, {OT_Concat, sizeof(ConcatParamSpec)}, + {OT_Slice, sizeof(SliceParamSpec)}, {OT_TfSlice, sizeof(TfSliceParamSpec)}, + {OT_Cast, sizeof(CastParamSpec)}, {OT_Transpose, sizeof(TransposeParamSpec)}, + {OT_Reshape, sizeof(ReshapeParamSpec)}, {OT_Squeeze, sizeof(SqueezeParamSpec)}, + {OT_Unsqueeze, sizeof(UnsqueezeParamSpec)}, {OT_Space2Depth, sizeof(Space2DepthParamSpec)}, + {OT_Depth2Space, sizeof(Depth2SpaceParamSpec)}, + {OT_ChannelResize, sizeof(ChannelResizeParamSpec)}, + {OT_PreAllocatedMemory, sizeof(PreAllocatedMemoryParamSpec)}, + {OT_SharedWeight, sizeof(SharedWeightParamSpec)}, {OT_Copy, sizeof(CopyParamSpec)}, + {OT_Check, sizeof(CheckParamSpec)}, {OT_Repeat, sizeof(RepeatParamSpec)}, + {OT_Attention, sizeof(AttentionParamSpec)}, + {OT_AttentionMask, sizeof(AttentionMaskParamSpec)}, + {OT_RelativePositionEmbedding, sizeof(EmbedParamSpec)}, + {OT_RelativeShift, sizeof(RelativeShiftParamSpec)}, {OT_PriorBox, sizeof(PriorBoxParamSpec)}, + {OT_DetectionOutput, sizeof(DetectionOutputParamSpec)}, + {OT_Yolov3DetectionOutput, sizeof(Yolov3DetectionOutputParamSpec)}, + {OT_MultiHeadAttention, sizeof(MultiheadAttentionParamSpec)}, + {OT_Tile, sizeof(TileParamSpec)}, {OT_Splice, sizeof(SpliceParamSpec)}}; + int size; + if (operatorParameterSizeMap.find(operatorType) == operatorParameterSizeMap.end()) { + size = 0; } else { - U8* s = new U8[size]; - return (void*)s; + size = operatorParameterSizeMap[operatorType]; } + 
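+    // A size of 0 for operator types not listed above means no
+    // operator-specific parameter payload is read or written for them;
+    // only listed types carry a ParameterSpec block of exactly this
+    // size in the .bolt stream.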
return size; } -EE operator_relationship(ModelSpec* spec) { +EE operator_relationship(ModelSpec *spec) +{ std::map opCanInChange; std::set inplaceTensors; std::map inplaceTensorInNum; @@ -81,45 +99,43 @@ EE operator_relationship(ModelSpec* spec) { inId = 1; inplaceTensorInNum.insert(std::make_pair(tmpInTensor, inId)); opCanInChange[tmpInTensor] = true; - }else{ + } else { if (opCanInChange[tmpInTensor] == false) { - inId = inplaceTensorInNum[tmpInTensor]+1; + inId = inplaceTensorInNum[tmpInTensor] + 1; // inplaceTensorInNum.insert(std::make_pair(tmpInTensor, inId)); inplaceTensorInNum[tmpInTensor] = inId; opCanInChange[tmpInTensor] = true; - }else{ + } else { inId = inplaceTensorInNum[tmpInTensor]; opCanInChange[tmpInTensor] = true; } } - std::ostringstream stream; - stream << inId; - std::string tmpInTensorChanged = tmpInTensor + "_" + stream.str(); + std::string tmpInTensorChanged = tmpInTensor + "_" + std::to_string(inId); inTensorVec.push_back(tmpInTensorChanged); if (tensorFlowsToOpSet.find(tmpInTensorChanged) == tensorFlowsToOpSet.end()) { std::vector tmpVector; tmpVector.push_back(currentOpName); tensorFlowsToOpSet.insert(std::make_pair(tmpInTensorChanged, tmpVector)); - }else{ + } else { tensorFlowsToOpSet[tmpInTensorChanged].push_back(currentOpName); } - }else{ + } else { inTensorVec.push_back(tmpInTensor); if (tensorFlowsToOpSet.find(tmpInTensor) == tensorFlowsToOpSet.end()) { std::vector tmpVector; tmpVector.push_back(currentOpName); tensorFlowsToOpSet.insert(std::make_pair(tmpInTensor, tmpVector)); - }else{ + } else { tensorFlowsToOpSet[tmpInTensor].push_back(currentOpName); } } } opInTensorNew.insert(std::make_pair(currentOpName, inTensorVec)); - // dealing with the relationship of op -- output tensors + // dealing with the relationship of op -- output tensors std::string tmpOutTensor = spec->ops[i].output_tensors_name[0]; if (inplaceTensors.find(tmpOutTensor) != inplaceTensors.end()) { // todo @@ -128,18 +144,16 @@ EE operator_relationship(ModelSpec* spec) { outId = 1; inplaceTensorOutNum.insert(std::make_pair(tmpOutTensor, outId)); opCanInChange[tmpOutTensor] = false; - }else{ + } else { outId = inplaceTensorOutNum[tmpOutTensor] + 1; // inplaceTensorOutNum.insert(std::make_pair(tmpOutTensor, outId)); can not update inplaceTensorOutNum[tmpOutTensor] = outId; opCanInChange[tmpOutTensor] = false; } - std::ostringstream stream; - stream << outId; - std::string tmpOutTensorChanged = tmpOutTensor + "_" + stream.str(); + std::string tmpOutTensorChanged = tmpOutTensor + "_" + std::to_string(outId); opOutTensorNew.insert(std::make_pair(currentOpName, tmpOutTensorChanged)); tensorOpMapping.insert(std::make_pair(tmpOutTensorChanged, currentOpName)); - }else{ + } else { opOutTensorNew.insert(std::make_pair(currentOpName, tmpOutTensor)); tensorOpMapping.insert(std::make_pair(tmpOutTensor, currentOpName)); } @@ -148,11 +162,12 @@ EE operator_relationship(ModelSpec* spec) { // assign op-op relationship int opNum = spec->num_operator_specs; spec->num_op_tensor_entries = opNum; - OperatorSpec* opsPtr2 = spec->ops; - OperatorRelationshipMapEntry* oprmePtr = (OperatorRelationshipMapEntry*)mt_new_storage(sizeof(OperatorRelationshipMapEntry) * opNum); + OperatorSpec *opsPtr2 = spec->ops; + OperatorRelationshipMapEntry *oprmePtr = (OperatorRelationshipMapEntry *)mt_new_storage( + sizeof(OperatorRelationshipMapEntry) * opNum); spec->op_relationship_entries = oprmePtr; for (int j = 0; j < opNum; j++) { - str_copy(oprmePtr[j].op, opsPtr2[j].name, NAME_LEN); + str_copy(oprmePtr[j].op, 
opsPtr2[j].name, NAME_LEN); int opInOpNum = opInTensorNew[opsPtr2[j].name].size(); oprmePtr[j].num_inputs = opInOpNum; oprmePtr[j].input_op_names = (I8 **)mt_new_storage(opInOpNum * sizeof(I8 *)); @@ -175,8 +190,8 @@ EE operator_relationship(ModelSpec* spec) { return SUCCESS; } -template -void dequantize_int8_weight(int num, F32 scale, INT8* q, T* d) +template +void dequantize_int8_weight(int num, F32 scale, INT8 *q, T *d) { F32 factor = 1 / scale; T table[255]; @@ -191,42 +206,86 @@ void dequantize_int8_weight(int num, F32 scale, INT8* q, T* d) } } -EE deserialize_header(const char* bytes, ModelSpec* spec, U32* pos) +inline void dequantize_fp16(int num, unsigned short *q, F32 *d) { - const char* pointer = bytes + *pos; +#if defined(_USE_NEON) && defined(__aarch64__) + F16 *half = (F16 *)q; +#else + U32 *word = (U32 *)d; +#endif + + for (int i = 0; i < num; i++) { +#if defined(_USE_NEON) && defined(__aarch64__) + d[i] = half[i]; +#else + unsigned short value = q[i]; + unsigned short sign = (value & 0x8000) >> 15; + unsigned short exponent = (value & 0x7c00) >> 10; + unsigned short significand = value & 0x03FF; + + U32 u; + if (exponent == 0) { + if (significand == 0) { + u = sign << 31; + } else { + exponent = 0; + while (0 == (significand & 0x200)) { + significand <<= 1; + exponent++; + } + significand <<= 1; + significand &= 0x3FF; + u = (sign << 31) | ((-exponent + (-15 + 127)) << 23) | (significand << 13); + } + } else if (exponent == 0x1F) { + u = (sign << 31) | (0xFF << 23) | (significand << 13); + } else { + u = (sign << 31) | ((exponent + (-15 + 127)) << 23) | (significand << 13); + } + word[i] = u; +#endif + } +} + +EE deserialize_header(const char *bytes, ModelSpec *spec, U32 *pos) +{ + const char *pointer = bytes + *pos; memcpy(&spec->version, pointer, sizeof(I32)); pointer += sizeof(I32); *pos += sizeof(I32); - if (spec->version != mt_version()) { - std::cerr << "[ERROR] version not_match: code " << mt_version() << \ - "bolt model " << spec->version << std::endl; + if (spec->version != sg_boltVersion) { + UNI_ERROR_LOG("X2bolt version is [%d], but your model version is : [%d].\n Please update " + "X2bolt to version[%d].\n", + sg_boltVersion, spec->version, spec->version); CHECK_STATUS(NOT_MATCH); + return NOT_MATCH; } memcpy(&spec->magic_number, pointer, sizeof(I32)); pointer += sizeof(I32); *pos += sizeof(I32); - if (spec->magic_number != mt_magic_number()) { - std::cerr << "[ERROR] magic_number not_match: code " << mt_magic_number() << \ - "bolt model " << spec->version << std::endl; + if (spec->magic_number != sg_magicNumber) { + UNI_ERROR_LOG( + "magic_number not_match: code %d bolt model %d\n", sg_magicNumber, spec->magic_number); CHECK_STATUS(NOT_MATCH); + return NOT_MATCH; } str_copy(spec->model_name, pointer, NAME_LEN); pointer += NAME_LEN; *pos += NAME_LEN; - spec->dt = *((DataType*)pointer); + spec->dt = *((DataType *)pointer); pointer += sizeof(DataType); *pos += sizeof(DataType); - spec->num_inputs = *((I32*)pointer); + spec->num_inputs = *((I32 *)pointer); pointer += sizeof(I32); *pos += sizeof(I32); - spec->input_names = (I8**)mt_new_storage(spec->num_inputs * sizeof(I8*)); + spec->input_names = (I8 **)mt_new_storage(spec->num_inputs * sizeof(I8 *)); for (int i = 0; i < spec->num_inputs; i++) { - spec->input_names[i] = (I8*)mt_new_storage(NAME_LEN * sizeof(I8)); + spec->input_names[i] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); str_copy(spec->input_names[i], pointer, NAME_LEN); pointer += NAME_LEN; *pos += NAME_LEN; @@ -237,11 +296,11 @@ EE 
deserialize_header(const char* bytes, ModelSpec* spec, U32* pos) pointer += spec->num_inputs * sizeof(TensorDesc); *pos += spec->num_inputs * sizeof(TensorDesc); - spec->num_outputs = *((I32*)pointer); + spec->num_outputs = *((I32 *)pointer); pointer += sizeof(I32); *pos += sizeof(I32); - spec->output_names = (I8**)mt_new_storage(spec->num_outputs * NAME_LEN); + spec->output_names = (I8 **)mt_new_storage(spec->num_outputs * NAME_LEN); for (int i = 0; i < spec->num_outputs; i++) { spec->output_names[i] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); str_copy(spec->output_names[i], pointer, NAME_LEN); @@ -251,15 +310,16 @@ EE deserialize_header(const char* bytes, ModelSpec* spec, U32* pos) return SUCCESS; } -EE deserialize_operator(const char* bytes, ModelSpec* spec, U32* pos) +EE deserialize_operator(const char *bytes, ModelSpec *spec, U32 *pos) { - const char* pointer = bytes + *pos; - I32* p4numOperatorSpecs = (I32 *)pointer; + const char *pointer = bytes + *pos; + I32 *p4numOperatorSpecs = (I32 *)pointer; spec->num_operator_specs = *p4numOperatorSpecs; pointer += sizeof(U32); *pos += sizeof(U32); - OperatorSpec *ptr = (OperatorSpec*)mt_new_storage(spec->num_operator_specs * sizeof(OperatorSpec)); + OperatorSpec *ptr = + (OperatorSpec *)mt_new_storage(spec->num_operator_specs * sizeof(OperatorSpec)); spec->ops = ptr; for (int i = 0; i < spec->num_operator_specs; i++) { str_copy(ptr[i].name, pointer, NAME_LEN); @@ -275,7 +335,7 @@ EE deserialize_operator(const char* bytes, ModelSpec* spec, U32* pos) *pos += sizeof(U32); ptr[i].input_tensors_name = (I8 **)mt_new_storage(ptr[i].num_inputs * sizeof(I8 *)); - for (U32 j = 0; jnum_weight_specs = *p4numWeightSpecs; pointer += sizeof(U32); *pos += sizeof(U32); - WeightSpec* ptr = (WeightSpec*)mt_new_storage(spec->num_weight_specs * sizeof(WeightSpec)); + WeightSpec *ptr = (WeightSpec *)mt_new_storage(spec->num_weight_specs * sizeof(WeightSpec)); spec->ws = ptr; for (int i = 0; i < spec->num_weight_specs; i++) { - U32* length = (U32*)pointer; + U32 *length = (U32 *)pointer; pointer += sizeof(U32); *pos += sizeof(U32); U32 weightBiasBytes = 0; @@ -367,22 +415,30 @@ EE deserialize_weight(const char* bytes, ModelSpec* spec, U32* pos) pointer += sizeof(U32); *pos += sizeof(U32); - bool quantWeight = false; - if (DT_I8 == ptr[i].mdt && DT_I8 != spec->dt) { + bool quantFP16 = false; + bool quantInt8 = false; + if (DT_F16 == ptr[i].mdt && DT_F32 == spec->dt) { + ptr[i].mdt = DT_F32; + quantFP16 = true; + } else if (DT_I8 == ptr[i].mdt && DT_I8 != spec->dt) { ptr[i].mdt = (spec->dt == DT_F16_8Q) ? 
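+            // int8 weights dequantize to F16 when the model computes in
+            // F16_8Q, otherwise to the model's own compute type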
DT_F16 : spec->dt; - quantWeight = true; + quantInt8 = true; } memcpy(&(ptr[i].bytes_of_weight), pointer, sizeof(U32)); U32 alignSize = ptr[i].bytes_of_weight; - if (quantWeight) { + + if (quantFP16) { + ptr[i].bytes_of_weight *= 2; + } + if (quantInt8) { ptr[i].bytes_of_weight *= bytesOf(ptr[i].mdt); } pointer += sizeof(U32); *pos += sizeof(U32); - ptr[i].weight = (U8*)mt_new_storage(ptr[i].bytes_of_weight); - INT8 *serialWeight = (INT8*)pointer; + ptr[i].weight = (U8 *)mt_new_storage(ptr[i].bytes_of_weight); + U8 *serialWeight = (U8 *)pointer; pointer += alignSize; *pos += alignSize; @@ -392,20 +448,29 @@ EE deserialize_weight(const char* bytes, ModelSpec* spec, U32* pos) pointer += sizeof(U32); *pos += sizeof(U32); - U8* ppp4 = (U8*)mt_new_storage(ptr[i].bytes_of_vec); - memcpy(ppp4, pointer, ptr[i].bytes_of_vec); - ptr[i].vec = ppp4; - - pointer += ptr[i].bytes_of_vec; - *pos += ptr[i].bytes_of_vec; - weightBiasBytes += ptr[i].bytes_of_vec; + alignSize = ptr[i].bytes_of_vec; + if (quantFP16) { + ptr[i].bytes_of_vec *= 2; + } + U8 *serialBias = nullptr; + if (0 != ptr[i].bytes_of_vec) { + serialBias = (U8 *)pointer; + ptr[i].vec = (U8 *)mt_new_storage(ptr[i].bytes_of_vec); + } else { + ptr[i].vec = nullptr; + } + + pointer += alignSize; + *pos += alignSize; + weightBiasBytes += alignSize; memcpy(&(ptr[i].num_quant_scale), pointer, sizeof(U32)); pointer += sizeof(U32); *pos += sizeof(U32); if (0 != ptr[i].num_quant_scale) { - ptr[i].weight_scale = (QuantSpec*)mt_new_storage(ptr[i].num_quant_scale * sizeof(QuantSpec)); + ptr[i].weight_scale = + (QuantSpec *)mt_new_storage(ptr[i].num_quant_scale * sizeof(QuantSpec)); } for (U32 j = 0; j < ptr[i].num_quant_scale; j++) { ptr[i].weight_scale[j].num_scale = *((int *)pointer); @@ -413,32 +478,40 @@ EE deserialize_weight(const char* bytes, ModelSpec* spec, U32* pos) pointer += sizeof(int); *pos += sizeof(int); - ptr[i].weight_scale[j].scale = (F32*)mt_new_storage(num * sizeof(F32)); + ptr[i].weight_scale[j].scale = (F32 *)mt_new_storage(num * sizeof(F32)); memcpy(ptr[i].weight_scale[j].scale, pointer, num * sizeof(F32)); pointer += num * sizeof(F32); *pos += num * sizeof(F32); } - + CHECK_REQUIREMENT(*length == weightBiasBytes); - - if (quantWeight) { - CHECK_REQUIREMENT(1 == ptr[i].num_quant_scale && 1 == ptr[i].weight_scale[0].num_scale); - F32 scale = ptr[i].weight_scale[0].scale[0]; - if (DT_F32 == ptr[i].mdt) { - dequantize_int8_weight(alignSize, scale, serialWeight, (F32*)ptr[i].weight); + + if (quantFP16) { + dequantize_fp16(ptr[i].bytes_of_weight / 4, (unsigned short *)serialWeight, (F32 *)ptr[i].weight); + dequantize_fp16(ptr[i].bytes_of_vec / 4, (unsigned short *)serialBias, (F32 *)ptr[i].vec); + } else { + if (quantInt8) { + CHECK_REQUIREMENT(1 == ptr[i].num_quant_scale && 1 == ptr[i].weight_scale[0].num_scale); + F32 scale = ptr[i].weight_scale[0].scale[0]; + if (DT_F32 == ptr[i].mdt) { + dequantize_int8_weight( + ptr[i].bytes_of_weight / 4, scale, (INT8 *)serialWeight, (F32 *)ptr[i].weight); + } else { + #ifdef __aarch64__ + dequantize_int8_weight( + ptr[i].bytes_of_weight / 2, scale, (INT8 *)serialWeight, (F16 *)ptr[i].weight); + #endif + } } else { -#ifdef __aarch64__ - dequantize_int8_weight(alignSize, scale, serialWeight, (F16*)ptr[i].weight); -#endif + memcpy(ptr[i].weight, serialWeight, ptr[i].bytes_of_weight); } - } else { - memcpy(ptr[i].weight, serialWeight, ptr[i].bytes_of_weight); + memcpy(ptr[i].vec, serialBias, ptr[i].bytes_of_vec); } } return SUCCESS; } -EE deserialize_model(const char* bytes, ModelSpec* spec) +EE 
deserialize_model(const char *bytes, ModelSpec *spec) { U32 pos = 0; CHECK_STATUS(deserialize_header(bytes, spec, &pos)); @@ -448,27 +521,45 @@ EE deserialize_model(const char* bytes, ModelSpec* spec) return SUCCESS; } -int read_from_file(const char* fn, char** bytes) +EE deserialize_model_from_file(const char *fn, ModelSpec *spec, bool useFileStream) { - int fd = open(fn, O_RDONLY); - CHECK_REQUIREMENT(-1 != fd); - - struct stat ss; - CHECK_REQUIREMENT(fstat(fd, &ss) != -1); - - int fileLength = ss.st_size; - *bytes = (char*)mmap(nullptr, fileLength, PROT_READ, - MAP_SHARED, fd, 0); - CHECK_REQUIREMENT(MAP_FAILED != bytes); - close(fd); - return fileLength; -} + UNI_PROFILE( + { + char *bytes = nullptr; + int fd; + int fileLength; + if (useFileStream) { + bytes = (char *)fn; + } else { + fd = open(fn, O_RDONLY); + if (-1 == fd) { + UNI_ERROR_LOG("Cannot open .bolt file. Name: %s\n", fn); + return FILE_ERROR; + } -EE deserialize_model_from_file(const char* fn, ModelSpec* spec) -{ - char *bytes = nullptr; - int fileLength = read_from_file(fn, &bytes); - CHECK_STATUS(deserialize_model(bytes, spec)); - munmap(bytes, fileLength); + struct stat ss; + if (-1 == fstat(fd, &ss)) { + UNI_ERROR_LOG("Cannot get size from file descriptor. File Name: %s\n", fn); + return FILE_ERROR; + } + + fileLength = ss.st_size; + bytes = (char *)mmap(nullptr, fileLength, PROT_READ, MAP_SHARED, fd, 0); + if (MAP_FAILED == bytes) { + UNI_ERROR_LOG("Mmap failed. File Name: %s\n", fn); + return FILE_ERROR; + } + } + + CHECK_STATUS(deserialize_model(bytes, spec)); + + if (!useFileStream) { + munmap(bytes, fileLength); + if (-1 != fd) { + close(fd); + } + } + }, + std::string("deserialize_model_from_file"), std::string("prepare")); return SUCCESS; } diff --git a/common/uni/src/model_print.cpp b/common/uni/src/model_print.cpp new file mode 100644 index 00000000..847856bc --- /dev/null +++ b/common/uni/src/model_print.cpp @@ -0,0 +1,127 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
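[editor's note] A condensed sketch of the read-only mmap pattern used by deserialize_model_from_file above; the helper name is illustrative and error handling is elided for brevity:

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

// Illustrative helper mirroring the mmap path above (not part of the patch).
char *map_model_file(const char *path, int *fd, size_t *length)
{
    *fd = open(path, O_RDONLY);        // read-only descriptor
    struct stat ss;
    fstat(*fd, &ss);                   // query file size
    *length = ss.st_size;
    // map the whole file; pages are faulted in lazily on first access
    return (char *)mmap(nullptr, *length, PROT_READ, MAP_SHARED, *fd, 0);
}
// afterwards: munmap(bytes, length); close(fd);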
+ +#include +#include "model_print.h" +#include "types.h" + +void print_header(const ModelSpec ms) +{ + printf("[Model] %s\n [Input]", ms.model_name); + for (int i = 0; i < ms.num_inputs; i++) { + printf(" %s(%s)", ms.input_names[i], tensorDesc2Str(ms.input_dims[i]).c_str()); + } + printf("\n [Output]"); + for (int i = 0; i < ms.num_outputs; i++) { + printf(" %s", ms.output_names[i]); + } + printf("\n"); +} + +void print_operator_tensor_relationship(const ModelSpec ms, bool deleteDeprecatedOp) +{ + int number = ms.num_operator_specs; + printf(" [Ops] %d\n", number); + for (int i = 0; i < number; i++) { + if (deleteDeprecatedOp) { + if (isDeprecatedOp(ms.ops[i].type)) { + continue; + } + } + printf(" Op %3d %32s %16s|", i, ms.ops[i].name, OperatorTypeName()[ms.ops[i].type]); + for (U32 j = 0; j < ms.ops[i].num_inputs; j++) { + printf(" %s", ms.ops[i].input_tensors_name[j]); + } + printf(" ->"); + for (U32 j = 0; j < ms.ops[i].num_outputs; j++) { + printf(" %s", ms.ops[i].output_tensors_name[j]); + } + if (nullptr != ms.ops[i].tensor_positions) { + printf(" tensor position:"); + for (U32 j = 0; j < ms.ops[i].num_inputs + ms.ops[i].num_outputs; j++) { + printf(" %d", ms.ops[i].tensor_positions[j]); + } + } + if (nullptr != ms.ops[i].feature_scale) { + printf(" quant scale:"); + for (U32 j = 0; j < ms.ops[i].num_quant_feature; j++) { + printf(" %f", ms.ops[i].feature_scale[j].scale[0]); + } + } + printf("\n"); + } +} + +void print_weights(const ModelSpec ms) +{ + int number = ms.num_weight_specs; + printf(" [Weights] %d\n", number); + for (int i = 0; i < number; i++) { + if (isDeprecatedOpWeight(&ms, i)) { + printf(" Weight %3d %32s | Delete mdt %d weight: %p %uB bias: %p %uB\n", i, + ms.ws[i].op_name, ms.ws[i].mdt, ms.ws[i].weight, ms.ws[i].bytes_of_weight, + ms.ws[i].vec, ms.ws[i].bytes_of_vec); + continue; + } + + printf(" Weight %3d %32s | Retain mdt %d weight: %p %uB bias: %p %uB example: ", i, + ms.ws[i].op_name, ms.ws[i].mdt, ms.ws[i].weight, ms.ws[i].bytes_of_weight, ms.ws[i].vec, + ms.ws[i].bytes_of_vec); + if (ms.ws[i].bytes_of_weight > 0 && ms.ws[i].weight != nullptr) { + F32 value; + transformToFloat(ms.ws[i].mdt, ms.ws[i].weight, &value, 1); + printf("%f", value); + } else if ((ms.ws[i].bytes_of_weight == 0 && ms.ws[i].weight != nullptr) || + (ms.ws[i].bytes_of_weight != 0 && ms.ws[i].weight == nullptr)) { + UNI_ERROR_LOG("weight is null but size is not zero\n"); + } + if (ms.ws[i].bytes_of_vec > 0 && ms.ws[i].vec != nullptr) { + DataType dt = ms.ws[i].mdt; + if (DT_BIN01 == ms.ws[i].mdt || DT_BIN11 == ms.ws[i].mdt) { + dt = DT_F16; + } + F32 value; + transformToFloat(dt, ms.ws[i].vec, &value, 1); + printf(",%f", value); + } else if ((ms.ws[i].bytes_of_vec == 0 && ms.ws[i].vec != nullptr) || + (ms.ws[i].bytes_of_vec != 0 && ms.ws[i].vec == nullptr)) { + UNI_ERROR_LOG("vec is null but size is not zero\n"); + } + printf("\n"); + } +} + +void print_relationship(const ModelSpec ms) +{ + int number = ms.num_op_tensor_entries; + printf(" [Relationships] %d\n", number); + for (int i = 0; i < number; i++) { + printf(" Relation %3d %32s |", i, ms.op_relationship_entries[i].op); + for (U32 j = 0; j < ms.op_relationship_entries[i].num_inputs; j++) { + printf(" %s", ms.op_relationship_entries[i].input_op_names[j]); + } + printf(" ->"); + for (U32 j = 0; j < ms.op_relationship_entries[i].num_outputs; j++) { + printf(" %s", ms.op_relationship_entries[i].output_op_names[j]); + } + printf("\n"); + } +} + +void print_ms(const ModelSpec ms) +{ + print_header(ms); + 
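+    // print_operator_tensor_relationship is called with a single argument
+    // here, so deleteDeprecatedOp presumably defaults to false in
+    // model_print.h (not shown in this patch)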
print_operator_tensor_relationship(ms); + print_weights(ms); + print_relationship(ms); +} diff --git a/model-tools/src/model_serialize.cpp b/common/uni/src/model_serialize.cpp similarity index 56% rename from model-tools/src/model_serialize.cpp rename to common/uni/src/model_serialize.cpp index 37c4311a..92bf670f 100644 --- a/model-tools/src/model_serialize.cpp +++ b/common/uni/src/model_serialize.cpp @@ -1,177 +1,160 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
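[editor's note] As a usage note for the printing helpers above, a typical inspection flow might be the following sketch; the file name is illustrative:

// Sketch: load a .bolt file and dump its structure with the printers above.
ModelSpec spec;
if (SUCCESS == deserialize_model_from_file("model.bolt", &spec, false)) {
    print_ms(spec);  // header, operators, weights, op relationships
}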
- #include -#include #include -#include -#include #include "model_serialize_deserialize.hpp" -#include "model_tools.h" -#include "model_optimizer.hpp" -#include "OPOptimizers/DeprecatedOPOptimizer.hpp" - -EE serialize_header(const ModelSpec* spec, std::string* tmp) { - U32 bufSize = sizeof(I32) * 2 \ - + sizeof(I8) * NAME_LEN + sizeof(DataType) + sizeof(I32) \ - + sizeof(I8) * NAME_LEN * spec->num_inputs + sizeof(TensorDesc) * spec->num_inputs \ - + sizeof(I32) + sizeof(I8) * NAME_LEN * spec->num_outputs; - I8* data = (I8*)mt_new_storage(bufSize); - - I32* pointer4version = (I32*)data; +#include "types.h" + +EE serialize_header(const ModelSpec *spec, std::string *tmp) +{ + U32 bufSize = sizeof(I32) * 2 + sizeof(I8) * NAME_LEN + sizeof(DataType) + sizeof(I32) + + sizeof(I8) * NAME_LEN * spec->num_inputs + sizeof(TensorDesc) * spec->num_inputs + + sizeof(I32) + sizeof(I8) * NAME_LEN * spec->num_outputs; + I8 *data = (I8 *)mt_new_storage(bufSize); + + I32 *pointer4version = (I32 *)data; memcpy(pointer4version, &spec->version, sizeof(I32)); - pointer4version += 1; // the pointer datatype(I32) of add 1 means 4 steps + pointer4version += 1; // the pointer datatype(I32) of add 1 means 4 steps - I32* pointer4magicNumber = (I32*)pointer4version; + I32 *pointer4magicNumber = (I32 *)pointer4version; memcpy(pointer4magicNumber, &spec->magic_number, sizeof(I32)); pointer4magicNumber += 1; - I8* pointer4modelName = (I8*)pointer4magicNumber; + I8 *pointer4modelName = (I8 *)pointer4magicNumber; str_copy(pointer4modelName, spec->model_name, NAME_LEN); pointer4modelName += NAME_LEN; - DataType* pointer4dt = (DataType*)pointer4modelName; + DataType *pointer4dt = (DataType *)pointer4modelName; *pointer4dt = spec->dt; pointer4dt++; - I32* pointer4numInputs = (I32*)pointer4dt; + I32 *pointer4numInputs = (I32 *)pointer4dt; *pointer4numInputs = spec->num_inputs; pointer4numInputs++; - I8* pointer4InputNames = (I8*)pointer4numInputs; + I8 *pointer4InputNames = (I8 *)pointer4numInputs; for (int i = 0; i < spec->num_inputs; i++) { str_copy(pointer4InputNames, spec->input_names[i], NAME_LEN); pointer4InputNames += NAME_LEN; } - TensorDesc* pointer4TensorDesc = (TensorDesc*)pointer4InputNames; + TensorDesc *pointer4TensorDesc = (TensorDesc *)pointer4InputNames; memcpy(pointer4TensorDesc, spec->input_dims, sizeof(TensorDesc) * spec->num_inputs); pointer4TensorDesc += spec->num_inputs; - I32* pointer4numOutputs = (I32 *)pointer4TensorDesc; + I32 *pointer4numOutputs = (I32 *)pointer4TensorDesc; *pointer4numOutputs = spec->num_outputs; pointer4numOutputs++; - I8* pointer4outputNames = (I8 *)pointer4numOutputs; + I8 *pointer4outputNames = (I8 *)pointer4numOutputs; for (int i = 0; i < spec->num_outputs; i++) { str_copy(pointer4outputNames, spec->output_names[i], NAME_LEN); pointer4outputNames += NAME_LEN; } tmp->clear(); - CHECK_REQUIREMENT(pointer4outputNames - data == bufSize); + CHECK_REQUIREMENT((U32)(pointer4outputNames - data) == bufSize); tmp->assign(data, data + bufSize); - delete [] data; + delete data; return SUCCESS; } - -U32 operator_memory_size(OperatorSpec* ops) +U32 operator_memory_size(OperatorSpec *ops) { // sizeof(U32) * 4 : type + num_inputs + num_output + num_quant_feature - U32 allocatedBufferSize = sizeof(I8) * NAME_LEN + sizeof(U32) * 4 - + ops->num_inputs * NAME_LEN * sizeof(I8) - + ops->num_outputs * NAME_LEN * sizeof(I8) - + (ops->num_inputs + ops->num_outputs) * sizeof(I32) - + sizeof(ParameterSpec); + U32 allocatedBufferSize = sizeof(I8) * NAME_LEN + sizeof(U32) * 4 + + ops->num_inputs * NAME_LEN 
* sizeof(I8) + ops->num_outputs * NAME_LEN * sizeof(I8) + + (ops->num_inputs + ops->num_outputs) * sizeof(I32) + get_operator_parameter_size(ops->type); for (U32 i = 0; i < ops->num_quant_feature; i++) { allocatedBufferSize += sizeof(int); // num_scale allocatedBufferSize += ops->feature_scale[i].num_scale * sizeof(F32); } - switch (ops->type) { - case OT_Eltwise: { - if (ops->ps.eltwise_spec.elt_mode == ELTWISE_SUM) - allocatedBufferSize += ops->ps.eltwise_spec.elt_sum_spec.coeff_size * sizeof(float); - break; - } - default: - break; - } return allocatedBufferSize; } - -EE serialize_operators(const ModelSpec* spec, std::string* tmp) { - OperatorSpec* opsTmp = spec->ops; +EE serialize_operators(const ModelSpec *spec, std::string *tmp) +{ + OperatorSpec *opsTmp = spec->ops; int removeOpNum = 0; U32 bufSize = sizeof(I32); for (int i = 0; i < spec->num_operator_specs; i++) { - if (DeprecatedOPOptimizer::isDeprecatedOp(opsTmp->type)) { + if (isDeprecatedOp(opsTmp->type)) { removeOpNum++; - } - else { + } else { bufSize += operator_memory_size(opsTmp); } opsTmp++; } - char* data = (char*)mt_new_storage(bufSize); + char *data = (char *)mt_new_storage(bufSize); - I32* pointer4numOperatorSpecs = (I32 *)data; + I32 *pointer4numOperatorSpecs = (I32 *)data; *pointer4numOperatorSpecs = spec->num_operator_specs - removeOpNum; // attention pointer4numOperatorSpecs++; - OperatorSpec* opsPointer = spec->ops; - I8* pointer4opsName = (I8*)pointer4numOperatorSpecs; + OperatorSpec *opsPointer = spec->ops; + I8 *pointer4opsName = (I8 *)pointer4numOperatorSpecs; for (int i = 0; i < spec->num_operator_specs; i++) { - if (DeprecatedOPOptimizer::isDeprecatedOp(opsPointer[i].type)) { + if (isDeprecatedOp(opsPointer[i].type)) { continue; } - str_copy(pointer4opsName, opsPointer[i].name, NAME_LEN); // to copy the name of op + str_copy(pointer4opsName, opsPointer[i].name, NAME_LEN); // to copy the name of op pointer4opsName += NAME_LEN; - U32* pointer4opsType = (U32 *)pointer4opsName; + U32 *pointer4opsType = (U32 *)pointer4opsName; *pointer4opsType = opsPointer[i].type; pointer4opsType++; - U32* pointer4opsNumInputs = pointer4opsType; + U32 *pointer4opsNumInputs = pointer4opsType; *pointer4opsNumInputs = opsPointer[i].num_inputs; pointer4opsNumInputs++; - I8* pointer4opsInputTensorsName = (I8 *)pointer4opsNumInputs; + I8 *pointer4opsInputTensorsName = (I8 *)pointer4opsNumInputs; for (U32 j = 0; j < opsPointer[i].num_inputs; j++) { str_copy(pointer4opsInputTensorsName, opsPointer[i].input_tensors_name[j], NAME_LEN); pointer4opsInputTensorsName += NAME_LEN; } - U32* pointer4opsNumOutputs = (U32 *)pointer4opsInputTensorsName; + U32 *pointer4opsNumOutputs = (U32 *)pointer4opsInputTensorsName; *pointer4opsNumOutputs = opsPointer[i].num_outputs; pointer4opsNumOutputs++; - I8* pointer4opsOutputTensorsName = (I8 *)pointer4opsNumOutputs; + I8 *pointer4opsOutputTensorsName = (I8 *)pointer4opsNumOutputs; for (U32 j = 0; j < opsPointer[i].num_outputs; j++) { str_copy(pointer4opsOutputTensorsName, opsPointer[i].output_tensors_name[j], NAME_LEN); pointer4opsOutputTensorsName += NAME_LEN; } - I32* pointer4tensorPos = (I32*)pointer4opsOutputTensorsName; + I32 *pointer4tensorPos = (I32 *)pointer4opsOutputTensorsName; U32 numTensors = opsPointer[i].num_inputs + opsPointer[i].num_outputs; if (nullptr != opsPointer[i].tensor_positions) { - memcpy(pointer4tensorPos, opsPointer[i].tensor_positions, numTensors*sizeof(I32)); + memcpy(pointer4tensorPos, opsPointer[i].tensor_positions, numTensors * sizeof(I32)); } else { - 
memset(pointer4tensorPos, 0, numTensors*sizeof(I32)); + for (U32 j = 0; j < numTensors; j++) { + pointer4tensorPos[j] = -1; + } } pointer4tensorPos += numTensors; - U32* pointer4numint8 = (U32*)pointer4tensorPos; + U32 *pointer4numint8 = (U32 *)pointer4tensorPos; *pointer4numint8 = opsPointer[i].num_quant_feature; pointer4numint8++; - int* pointer4quant = (int*)pointer4numint8; + int *pointer4quant = (int *)pointer4numint8; for (U32 j = 0; j < opsPointer[i].num_quant_feature; j++) { *pointer4quant = opsPointer[i].feature_scale[j].num_scale; int num = *pointer4quant; @@ -180,46 +163,33 @@ EE serialize_operators(const ModelSpec* spec, std::string* tmp) { pointer4quant += num; } - char* pointer4parameterSpecs = (char *)pointer4quant; - memcpy(pointer4parameterSpecs, &(opsPointer[i].ps), sizeof(ParameterSpec)); - if (opsPointer[i].type == OT_Eltwise) { - memset(&(pointer4parameterSpecs[(char*)(&(opsPointer[i].ps.eltwise_spec.elt_sum_spec.coeff_values)) - (char*)(&(opsPointer[i].ps))]), 0, sizeof(float*)); - } - pointer4parameterSpecs += sizeof(ParameterSpec); - switch (opsPointer[i].type) { - case OT_Eltwise: { - if (opsPointer[i].ps.eltwise_spec.elt_mode == ELTWISE_SUM) { - U32 bytes = opsPointer[i].ps.eltwise_spec.elt_sum_spec.coeff_size * sizeof(float); - memcpy(pointer4parameterSpecs, opsPointer[i].ps.eltwise_spec.elt_sum_spec.coeff_values, bytes); - pointer4parameterSpecs += bytes; - } - break; - } - default: - break; - } + char *pointer4parameterSpecs = (char *)pointer4quant; + int operatorParameterSize = get_operator_parameter_size(opsPointer[i].type); + memcpy(pointer4parameterSpecs, &(opsPointer[i].ps), operatorParameterSize); + pointer4parameterSpecs += operatorParameterSize; pointer4opsName = (I8 *)pointer4parameterSpecs; } tmp->clear(); - CHECK_REQUIREMENT(pointer4opsName - data == bufSize); + CHECK_REQUIREMENT((U32)(pointer4opsName - data) == bufSize); tmp->assign(data, data + bufSize); - delete [] data; + delete data; return SUCCESS; } -EE serialize_weights(const ModelSpec* spec, std::string* tmp) +EE serialize_weights(const ModelSpec *spec, std::string *tmp) { - WeightSpec* tmpPointer = spec->ws; + WeightSpec *tmpPointer = spec->ws; U32 bufSize = sizeof(I32); U32 weightCount = 0; for (int i = 0; i < spec->num_weight_specs; i++) { - if (DeprecatedOPOptimizer::isDeprecatedOpWeight(spec, i)) { + if (isDeprecatedOpWeight(spec, i)) { continue; } // U32 x 5: length, mdt, bytes_of_weight, bytes_of_vec, num_quant_scale - bufSize += sizeof(I8) * NAME_LEN + sizeof(U32) * 5 + tmpPointer[i].bytes_of_weight + tmpPointer[i].bytes_of_vec; + bufSize += sizeof(I8) * NAME_LEN + sizeof(U32) * 5 + tmpPointer[i].bytes_of_weight + + tmpPointer[i].bytes_of_vec; for (U32 j = 0; j < tmpPointer[i].num_quant_scale; j++) { bufSize += sizeof(int); // num_scale bufSize += tmpPointer[i].weight_scale[j].num_scale * sizeof(F32); @@ -227,20 +197,20 @@ EE serialize_weights(const ModelSpec* spec, std::string* tmp) weightCount++; } - char* data = (char*)mt_new_storage(bufSize); + char *data = (char *)mt_new_storage(bufSize); - I32* pointer4numWeightSpecs = (I32*)data; + I32 *pointer4numWeightSpecs = (I32 *)data; *pointer4numWeightSpecs = weightCount; pointer4numWeightSpecs++; - WeightSpec* wsPointer = spec -> ws; - char* pointer4wsOpName = (char*)pointer4numWeightSpecs; + WeightSpec *wsPointer = spec->ws; + char *pointer4wsOpName = (char *)pointer4numWeightSpecs; for (int i = 0; i < spec->num_weight_specs; i++) { - if (DeprecatedOPOptimizer::isDeprecatedOpWeight(spec, i)) { + if (isDeprecatedOpWeight(spec, i)) { 
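+            // skip weights owned by deprecated operators here as well, so
+            // the stream stays consistent with the weightCount written above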
continue; } - U32* length = (U32*)pointer4wsOpName; + U32 *length = (U32 *)pointer4wsOpName; U32 len; len = wsPointer[i].bytes_of_weight + wsPointer[i].bytes_of_vec; *length = len; @@ -249,31 +219,31 @@ EE serialize_weights(const ModelSpec* spec, std::string* tmp) str_copy(pointer4wsOpName, wsPointer[i].op_name, NAME_LEN); pointer4wsOpName += NAME_LEN; - U32* pointer4wsMdt = (U32*)pointer4wsOpName; + U32 *pointer4wsMdt = (U32 *)pointer4wsOpName; *pointer4wsMdt = wsPointer[i].mdt; pointer4wsMdt++; - U32* pointer4wsBytesOfWeight = (U32*)pointer4wsMdt; + U32 *pointer4wsBytesOfWeight = (U32 *)pointer4wsMdt; *pointer4wsBytesOfWeight = wsPointer[i].bytes_of_weight; pointer4wsBytesOfWeight++; - U8* pointer4wsWeight = (U8*)pointer4wsBytesOfWeight; + U8 *pointer4wsWeight = (U8 *)pointer4wsBytesOfWeight; memcpy(pointer4wsWeight, wsPointer[i].weight, wsPointer[i].bytes_of_weight); - pointer4wsWeight += wsPointer[i].bytes_of_weight; + pointer4wsWeight += wsPointer[i].bytes_of_weight; - U32* pointer4wsBytesOfVec = (U32*)pointer4wsWeight; + U32 *pointer4wsBytesOfVec = (U32 *)pointer4wsWeight; *pointer4wsBytesOfVec = wsPointer[i].bytes_of_vec; pointer4wsBytesOfVec++; - U8* pointer4wsVec = (U8*)pointer4wsBytesOfVec; + U8 *pointer4wsVec = (U8 *)pointer4wsBytesOfVec; memcpy(pointer4wsVec, wsPointer[i].vec, wsPointer[i].bytes_of_vec); pointer4wsVec += wsPointer[i].bytes_of_vec; - U32* pointer4numquant = (U32*)pointer4wsVec; + U32 *pointer4numquant = (U32 *)pointer4wsVec; *pointer4numquant = wsPointer[i].num_quant_scale; pointer4numquant++; - int* pointer4quant = (int*)pointer4numquant; + int *pointer4quant = (int *)pointer4numquant; for (U32 j = 0; j < wsPointer[i].num_quant_scale; j++) { *pointer4quant = wsPointer[i].weight_scale[j].num_scale; int num = *pointer4quant; @@ -282,17 +252,18 @@ EE serialize_weights(const ModelSpec* spec, std::string* tmp) pointer4quant += num; } - pointer4wsOpName = (char*)pointer4quant; + pointer4wsOpName = (char *)pointer4quant; } tmp->clear(); - CHECK_REQUIREMENT(pointer4wsOpName - data == bufSize); + CHECK_REQUIREMENT((U32)(pointer4wsOpName - data) == bufSize); tmp->assign(data, data + bufSize); - delete [] data; + delete data; return SUCCESS; } -EE serialize_model(const ModelSpec* spec, std::string* bytes) { +EE serialize_model(const ModelSpec *spec, std::string *bytes) +{ bytes->clear(); std::string tmp; @@ -307,16 +278,30 @@ EE serialize_model(const ModelSpec* spec, std::string* bytes) { return SUCCESS; } -EE write_to_file(std::string* bytes, const char* fn) { +EE write_to_file(std::string *bytes, const char *fn) +{ std::ofstream out(fn); + if (!out) { + return FILE_ERROR; + } out << *bytes; out.close(); return SUCCESS; } -EE serialize_model_to_file(const ModelSpec* spec, const char* fn) { +EE serialize_model_to_file(const ModelSpec *spec, const char *fn) +{ std::string bytes = ""; CHECK_STATUS(serialize_model(spec, &bytes)); CHECK_STATUS(write_to_file(&bytes, fn)); return SUCCESS; } + +#if defined(_USE_CAFFE) || defined(_USE_ONNX) || defined(_USE_TFLITE) || defined(_USE_TENSORFLOW) +EE mt_store(CI8 *dir, CI8 *mfn, const ModelSpec *md) +{ + std::string completePath = concat_dir_file(dir, mfn); + serialize_model_to_file(md, completePath.c_str()); + return SUCCESS; +} +#endif diff --git a/common/uni/src/profiling.cpp b/common/uni/src/profiling.cpp new file mode 100644 index 00000000..a3bbca68 --- /dev/null +++ b/common/uni/src/profiling.cpp @@ -0,0 +1,91 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
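[editor's note] One usage note on the serializer above: serialize_model concatenates the header, operator, and weight sections in order, so a round trip through a file is simply the following sketch; the file name is illustrative:

// Sketch: write a ModelSpec to disk and read it back.
ModelSpec spec;  // assume populated by a converter or a previous load
CHECK_STATUS(serialize_model_to_file(&spec, "out.bolt"));
ModelSpec reloaded;
CHECK_STATUS(deserialize_model_from_file("out.bolt", &reloaded, false));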
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include +#include +#include + +#include "profiling.h" + +#ifdef _THREAD_SAFE +pthread_mutex_t uniThreadMutex = PTHREAD_MUTEX_INITIALIZER; +#endif + +std::string extract_class_function(std::string &&pretty_function) +{ + auto pos = pretty_function.find('('); + if (pos != std::string::npos) { + pretty_function.erase(pretty_function.begin() + pos, pretty_function.end()); + } + + pos = pretty_function.rfind(' '); + if (pos != std::string::npos) { + pretty_function.erase(pretty_function.begin(), pretty_function.begin() + pos + 1); + } + + return std::move(pretty_function); +} + +std::string extract_file_function(std::string &&pretty_function) +{ + auto pos = pretty_function.find('('); + if (pos != std::string::npos) { + pretty_function.erase(pretty_function.begin() + pos, pretty_function.end()); + } + + pos = pretty_function.rfind('/'); + if (pos != std::string::npos) { + pretty_function.erase(pretty_function.begin(), pretty_function.begin() + pos + 1); + } + + return std::move(pretty_function); +} + +std::map time_statistics; + +void ut_time_init() +{ + UNI_THREAD_SAFE(time_statistics.clear()); +} + +void ut_time_process( + const std::string &name, const std::string &category, double time_start_ms, double time_end_ms) +{ +#ifdef _PROFILE + UNI_PROFILE_INFO( + name.c_str(), category.c_str(), time_start_ms * 1000, (time_end_ms - time_start_ms) * 1000); +#endif +#ifdef _PROFILE_STATISTICS + double duration = time_end_ms - time_start_ms; + UNI_THREAD_SAFE({ + if (time_statistics.find(category) == time_statistics.end()) { + time_statistics[category] = duration; + } else { + time_statistics[category] += duration; + } + }); +#endif +} + +void ut_time_statistics() +{ + std::vector> vec(time_statistics.begin(), time_statistics.end()); + sort(vec.begin(), vec.end(), + [&](const std::pair &a, const std::pair &b) { + return (a.second > b.second); + }); + for (U32 i = 0; i < vec.size(); ++i) { + UNI_INFO_LOG("%s\t%lfms\n", vec[i].first.c_str(), vec[i].second); + } +} diff --git a/common/uni/src/tensor_desc.cpp b/common/uni/src/tensor_desc.cpp new file mode 100644 index 00000000..9f46fd6b --- /dev/null +++ b/common/uni/src/tensor_desc.cpp @@ -0,0 +1,614 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
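[editor's note] A minimal sketch of how the profiling hooks above are driven; note the per-category totals are only accumulated when _PROFILE_STATISTICS is defined, and ut_time_ms comes from the test utilities earlier in this patch:

// Sketch: time one region and print per-category totals.
ut_time_init();
double begin = ut_time_ms();
// ... code under measurement ...
double end = ut_time_ms();
ut_time_process("deserialize_model_from_file", "prepare", begin, end);
ut_time_statistics();  // categories sorted by accumulated time, largest first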
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include +#include "tensor_desc.h" + +void UNI_memcpy(void *dst, const void *src, int size) +{ + if (src == dst || size <= 0 || dst == nullptr || src == nullptr) { + return; + } + memcpy(dst, src, size); +} + +void UNI_init(U32 num, DataType dt, F32 val, void *dst) +{ + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: { + F16 v = val; + F16 *arr = (F16 *)dst; + for (U32 i = 0; i < num; i++) { + arr[i] = v; + } + break; + } +#endif + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } +} + +void transformFromFloat(DataType dataType, float *src, void *dst, int num, float scale) +{ + switch (dataType) { + case DT_F32: { + UNI_memcpy(dst, src, sizeof(float) * num); + break; + } + case DT_U32: { + U32 *ptr = (U32 *)dst; + for (int i = 0; i < num; i++) { + ptr[i] = src[i]; + } + break; + } + case DT_I32: { + I32 *ptr = (I32 *)dst; + for (int i = 0; i < num; i++) { + ptr[i] = src[i]; + } + break; + } +#ifdef __aarch64__ + case DT_F16: { + F16 *ptr = (F16 *)dst; + for (int i = 0; i < num; i++) { + ptr[i] = src[i]; + } + break; + } + case DT_F16_8Q: { + F16 *ptr = (F16 *)dst; + for (int i = 0; i < num; i++) { + ptr[i] = src[i]; + } + break; + } +#endif + case DT_I8: { + INT8 *ptr = (INT8 *)dst; + for (int i = 0; i < num; i++) { + ptr[i] = src[i] * scale; + } + break; + } + case DT_U8: { + U8 *ptr = (U8 *)dst; + for (int i = 0; i < num; i++) { + ptr[i] = src[i]; + } + break; + } + default: { + UNI_ERROR_LOG("not unsupport transform float to %d type data\n", dataType); + break; + } + } +} + +void transformToFloat(DataType dataType, void *src, float *dst, int num, float scale) +{ + switch (dataType) { + case DT_F32: { + UNI_memcpy(dst, src, sizeof(float) * num); + break; + } + case DT_U32: { + U32 *ptr = (U32 *)src; + for (int i = 0; i < num; i++) { + dst[i] = ptr[i]; + } + break; + } + case DT_I32: { + I32 *ptr = (I32 *)src; + for (int i = 0; i < num; i++) { + dst[i] = ptr[i]; + } + break; + } +#ifdef __aarch64__ + case DT_F16: { + F16 *ptr = (F16 *)src; + for (int i = 0; i < num; i++) { + dst[i] = ptr[i]; + } + break; + } + case DT_F16_8Q: { + F16 *ptr = (F16 *)src; + for (int i = 0; i < num; i++) { + dst[i] = ptr[i]; + } + break; + } +#endif + case DT_I8: { + INT8 *ptr = (INT8 *)src; + for (int i = 0; i < num; i++) { + dst[i] = ptr[i] / scale; + } + break; + } + case DT_U8: { + U8 *ptr = (U8 *)src; + for (int i = 0; i < num; i++) { + dst[i] = ptr[i]; + } + break; + } + case DT_BIN01: { + BIN8 *ptr = (BIN8 *)src; + for (int i = 0; i < num; i++) { + 
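+            // DT_BIN01 and DT_BIN11 pack one value per bit, MSB-first within
+            // each byte: a set bit decodes to 1.0, a clear bit to 0.0 (BIN01)
+            // or -1.0 (BIN11).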
std::bitset<8> Val(((BIN8 *)ptr)[i / 8]); + if (Val.test(7 - (i % 8))) { + dst[i] = 1.0; + } else { + dst[i] = 0; + } + } + break; + } + case DT_BIN11: { + BIN8 *ptr = (BIN8 *)src; + for (int i = 0; i < num; i++) { + std::bitset<8> Val(((BIN8 *)ptr)[i / 8]); + if (Val.test(7 - (i % 8))) { + dst[i] = 1.0; + } else { + dst[i] = -1.0; + } + } + break; + } + default: { + UNI_ERROR_LOG("not unsupport transform %d type data to float\n", dataType); + break; + } + } +} + +template +static void transformToNCHWKernel( + TensorDesc inputDesc, const T *input, TensorDesc outputDesc, T *output) +{ + DataType idt, odt; + DataFormat idf, odf; + U32 in, ic, ih, iw, on, oc, oh, ow; + if (tensorIs2d(inputDesc)) { + CHECK_STATUS(tensor2dGet(inputDesc, &idt, &idf, &in, &iw)); + ic = 1; + ih = 1; + } else if (tensorIs3d(inputDesc)) { + CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &in, &ih, &iw)); + ic = 1; + } else if (tensorIs4d(inputDesc)) { + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + } else { + UNI_ERROR_LOG("not support transform %d-dim tensor to NCHW format\n", (int)inputDesc.nDims); + return; + } + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 ihiw = ih * iw; + U32 size = tensorNumElements(outputDesc); + switch (idf) { + case DF_NCHW: { + CHECK_REQUIREMENT(tensorNumElements(inputDesc) == size); + if (output != input) { + memcpy(output, input, size); + } + break; + } + case DF_NCHWC8: { + CHECK_REQUIREMENT(ic % 8 == 0); + ic /= 8; + for (U32 n = 0, srcIndex = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 hw = 0; hw < ihiw; hw++) { + for (U32 c8 = 0; c8 < 8; c8++, srcIndex++) { + U32 c_o = c * 8 + c8; + // support channel cut + if (c_o < oc) { + U32 dstIndex = (n * oc + c_o) * ihiw + hw; + output[dstIndex] = input[srcIndex]; + } + } + } + } + } + break; + } + case DF_NHWCN8: { + CHECK_REQUIREMENT(in % 8 == 0); + in /= 8; + for (U32 o = 0, srcIndex = 0; o < in; o++) { + for (U32 hw = 0; hw < ihiw; hw++) { + for (U32 c = 0; c < ic; c++) { + for (U32 o8 = 0; o8 < 8; o8++, srcIndex++) { + U32 dstIndex = ((o * 8 + o8) * ic + c) * ihiw + hw; + output[dstIndex] = input[srcIndex]; + } + } + } + } + break; + } + case DF_NHWC: { + CHECK_REQUIREMENT(tensorNumElements(inputDesc) == size); + for (U32 o = 0, srcIndex = 0; o < in; o++) { + for (U32 hw = 0; hw < ihiw; hw++) { + for (U32 cc = 0; cc < ic; cc++, srcIndex++) { + U32 dstIndex = (o * ic + cc) * ihiw + hw; + output[dstIndex] = input[srcIndex]; + } + } + } + break; + } + default: { + UNI_ERROR_LOG("not support transform %d format tensor to NCHW format\n", idf); + } + } +} + +EE transformToNCHW(TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output) +{ + if (nullptr == input || nullptr == output) { + return NULL_POINTER; + } + switch (inputDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + transformToNCHWKernel(inputDesc, (F32 *)input, outputDesc, (F32 *)output); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + transformToNCHWKernel(inputDesc, (F16 *)input, outputDesc, (F16 *)output); + break; + } +#endif +#ifdef _USE_INT8 + case DT_I8: { + transformToNCHWKernel(inputDesc, (INT8 *)input, outputDesc, (INT8 *)output); + break; + } +#endif + default: { + return NOT_SUPPORTED; + } + } + return SUCCESS; +} + +template +static void transformToNHWCKernel( + TensorDesc inputDesc, const T *input, TensorDesc outputDesc, T *output) +{ + DataType idt, odt; + DataFormat idf, odf; + U32 in, ic, ih, iw, on, oc, oh, ow; + if (tensorIs2d(inputDesc)) { + 
CHECK_STATUS(tensor2dGet(inputDesc, &idt, &idf, &in, &iw)); + ic = 1; + ih = 1; + } else if (tensorIs3d(inputDesc)) { + CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &in, &ih, &iw)); + ic = 1; + } else if (tensorIs4d(inputDesc)) { + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + } else { + UNI_ERROR_LOG("not support transform %d-dim tensor to NHWC format\n", (int)inputDesc.nDims); + return; + } + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 size = tensorNumElements(outputDesc); + U32 ihiw = ih * iw; + switch (idf) { + case DF_NHWC: { + CHECK_REQUIREMENT(tensorNumElements(inputDesc) == size); + if (input != output) { + memcpy(output, input, size); + } + break; + } + case DF_NCHW: { + CHECK_REQUIREMENT(tensorNumElements(inputDesc) == size); + for (U32 o = 0, srcIndex = 0; o < in; o++) { + for (U32 cc = 0; cc < ic; cc++) { + for (U32 hw = 0; hw < ihiw; hw++, srcIndex++) { + U32 dstIndex = (o * ihiw + hw) * ic + cc; + output[dstIndex] = input[srcIndex]; + } + } + } + break; + } + case DF_NCHWC8: { + CHECK_REQUIREMENT(ic % 8 == 0); + ic /= 8; + for (U32 n = 0, srcIndex = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 hw = 0; hw < ihiw; hw++) { + for (U32 c8 = 0; c8 < 8; c8++, srcIndex++) { + U32 dstIndex = ((n * ihiw + hw) * ic + c) * 8 + c8; + output[dstIndex] = input[srcIndex]; + } + } + } + } + break; + } + default: { + UNI_ERROR_LOG("not support transform %d format tensor to NHWC format\n", idf); + } + } +} + +EE transformToNHWC(TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output) +{ + if (nullptr == input || nullptr == output) { + return NULL_POINTER; + } + switch (inputDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + transformToNHWCKernel(inputDesc, (F32 *)input, outputDesc, (F32 *)output); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + transformToNHWCKernel(inputDesc, (F16 *)input, outputDesc, (F16 *)output); + break; + } +#endif +#ifdef _USE_INT8 + case DT_I8: { + transformToNHWCKernel(inputDesc, (INT8 *)input, outputDesc, (INT8 *)output); + break; + } +#endif + default: { + return NOT_SUPPORTED; + } + } + return SUCCESS; +} + +EE transformNCHWToNCHWC8( + TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output) +{ + if (input == NULL || output == NULL) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, odt; + DataFormat idf, odf; + U32 in, ic, ih, iw, on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + CHECK_REQUIREMENT(in == on && idf == DF_NCHW && odf == DF_NCHWC8 && idt == odt && ic <= oc && + ih == oh && iw == ow); + int elementSize = bytesOf(idt); + oc /= 8; + U32 ohow = oh * ow; + const U8 *inputPtr = (const U8 *)input; + U8 *outputPtr = (U8 *)output; + for (U32 n = 0, dstIndex = 0; n < on; n++) { + for (U32 c = 0; c < oc; c++) { + for (U32 hw = 0; hw < ohow; hw++) { + for (U32 c8 = 0; c8 < 8; c8++, dstIndex += elementSize) { + U32 c_i = c * 8 + c8; + // support channel padding + if (c_i < ic) { + U32 srcIndex = ((n * ic + c_i) * ohow + hw) * elementSize; + memcpy(outputPtr + dstIndex, inputPtr + srcIndex, elementSize); + } else { + memset(outputPtr + dstIndex, 0, elementSize); + } + } + } + } + } + return SUCCESS; +} + +EE transformNCHWC8ToNCHWC8ByGroup( + TensorDesc inputDesc, const void *input, int group, TensorDesc outputDesc, void *output) +{ + U32 inputSize = tensorNumElements(inputDesc); + U32 outputSize = tensorNumElements(outputDesc); 
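+    // NCHWC8 stores channels in tiles of 8: element (n, c, h, w) lives at
+    // (((n*C/8 + c/8)*H + h)*W + w)*8 + c%8. When channel/group is not a
+    // multiple of 8, group boundaries fall inside a tile, so the loop below
+    // moves each channel to its new tile and lane individually.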
+ if (group <= 1 || inputSize == outputSize) { + if (input != output) { + memcpy(output, input, outputSize); + } + return SUCCESS; + } + + U32 channelAlignSize = 8; + DataType dtBefore, dtAfter; + DataFormat dfBefore, dfAfter; + U32 batch, channelBefore, hBefore, wBefore; + U32 batchAfter, channelAfter, hAfter, wAfter; + CHECK_STATUS( + tensor4dGet(inputDesc, &dtBefore, &dfBefore, &batch, &channelBefore, &hBefore, &wBefore)); + CHECK_STATUS( + tensor4dGet(outputDesc, &dtAfter, &dfAfter, &batchAfter, &channelAfter, &hAfter, &wAfter)); + CHECK_REQUIREMENT(dtBefore == dtAfter); + CHECK_REQUIREMENT(dfBefore == DF_NCHWC8 && dfAfter == DF_NCHWC8); + CHECK_REQUIREMENT(batch == batchAfter); + CHECK_REQUIREMENT(hBefore == hAfter); + CHECK_REQUIREMENT(wBefore == wAfter); + U32 channelGroupSizeBefore = channelBefore / group; + U32 channelGroupSizeAfter = channelAfter / group; + U32 channelTileSizeBefore = channelBefore / channelAlignSize; + U32 channelTileSizeAfter = channelAfter / channelAlignSize; + U32 elementSize = bytesOf(dtBefore); + U32 hw = hBefore * wBefore; + for (U32 n = 0; n < batch; n++) { + for (I32 g = 0, channelIdAfter = 0; g < group; g++) { + for (U32 c = 0; c < channelGroupSizeAfter; c++, channelIdAfter++) { + U32 channelIdBefore = g * channelGroupSizeBefore + c; + U32 channelTileBefore = channelIdBefore / channelAlignSize; + U32 channelTileAfter = channelIdAfter / channelAlignSize; + U32 channelLocalBefore = channelIdBefore % channelAlignSize; + U32 channelLocalAfter = channelIdAfter % channelAlignSize; + U32 indexBefore = + (((n * channelTileSizeBefore + channelTileBefore) * hw) * channelAlignSize + + channelLocalBefore) * + elementSize; + U32 indexAfter = + (((n * channelTileSizeAfter + channelTileAfter) * hw) * channelAlignSize + + channelLocalAfter) * + elementSize; + U32 stepSize = channelAlignSize * elementSize; + U32 indexBeforeUpper = indexBefore + stepSize * hw; + while (indexBefore < indexBeforeUpper) { + memcpy((U8 *)output + indexAfter, (const U8 *)input + indexBefore, elementSize); + indexBefore += stepSize; + indexAfter += stepSize; + } + } + } + } + return SUCCESS; +} + +EE transposeFilter(TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output) +{ + if (input == NULL || output == NULL) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, odt; + DataFormat idf, odf; + U32 in, ic, ih, iw, on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + CHECK_REQUIREMENT(idf == odf); + const U8 *inputPtr = (const U8 *)input; + U8 *outputPtr = (U8 *)output; + + switch (idf) { + case DF_NHWCN8: { + CHECK_REQUIREMENT(in % 8 == 0); + in /= 8; + U32 hwMax = ih * iw - 1; + + U32 innerSize = bytesOf(idt) * ic * 8; + + for (U32 o = 0; o < in; o++) { + for (U32 hw = 0; hw < ih * iw; hw++) { + U32 srcIndex = o * ih * iw * innerSize + hw * innerSize; + U32 dstIndex = o * ih * iw * innerSize + (hwMax - hw) * innerSize; + memcpy(outputPtr + dstIndex, inputPtr + srcIndex, innerSize); + } + } + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + return SUCCESS; +} + +EE array_transpose(DataType dt, + U32 *inputDims, + const void *input, + U32 *outputDims, + void *output, + U32 *transposeDims, + int dimsNum) +{ + U32 sizeInner = 1; + I32 sizeInnerIndex = 0; + for (I32 i = dimsNum - 1; i >= 0; i--) { + if ((I32)transposeDims[i] == i) { + sizeInner *= inputDims[dimsNum - 1 - i]; + sizeInnerIndex++; + } else { + break; + } + } + U32 inputSize = 1, outputSize 
= 1; + for (int i = 0; i < dimsNum; i++) { + inputSize *= inputDims[i]; + outputSize *= outputDims[i]; + } + CHECK_REQUIREMENT(inputSize == outputSize); + outputSize = outputSize / sizeInner; + + std::vector inputLocalIndex(dimsNum); + const U8 *inputPtr = (const U8 *)input; + U8 *outputPtr = (U8 *)output; + U32 tileSize = sizeInner * bytesOf(dt); + for (U32 i = 0; i < outputSize; i++) { + U32 outputIndex = i; + for (I32 j = sizeInnerIndex; j < dimsNum; j++) { + U32 value = outputIndex % outputDims[j]; + outputIndex /= outputDims[j]; + inputLocalIndex[dimsNum - 1 - transposeDims[dimsNum - 1 - j]] = value; + } + U32 inputIndex = 0; + for (I32 j = dimsNum - 1; j > sizeInnerIndex; j--) { + inputIndex = (inputIndex + inputLocalIndex[j]) * inputDims[j - 1]; + } + inputIndex += inputLocalIndex[sizeInnerIndex]; + memcpy(outputPtr + i * tileSize, inputPtr + inputIndex * tileSize, tileSize); + } + + return SUCCESS; +} + +EE array_transpose_naive(DataType dt, + U32 *inputDims, + const void *input, + U32 *outputDims, + void *output, + U32 *transposeDims, + int dimsNum) +{ + if (dimsNum <= 1) { + return SUCCESS; + } + U32 inputSize = 1, outputSize = 1; + for (int i = 0; i < dimsNum; i++) { + inputSize *= inputDims[i]; + outputSize *= outputDims[i]; + } + std::vector inputLocalIndex(dimsNum); + const U8 *inputPtr = (const U8 *)input; + U8 *outputPtr = (U8 *)output; + U32 tileSize = bytesOf(dt); + for (U32 i = 0; i < outputSize; i++) { + U32 outputIndex = i; + for (I32 j = 0; j < dimsNum; j++) { + U32 value = outputIndex % outputDims[j]; + outputIndex /= outputDims[j]; + inputLocalIndex[dimsNum - 1 - transposeDims[dimsNum - 1 - j]] = value; + } + U32 inputIndex = 0; + for (I32 j = dimsNum - 1; j > 0; j--) { + inputIndex = (inputIndex + inputLocalIndex[j]) * inputDims[j - 1]; + } + inputIndex += inputLocalIndex[0]; + memcpy(outputPtr + i * tileSize, inputPtr + inputIndex * tileSize, tileSize); + } + + return SUCCESS; +} diff --git a/common/uni/src/types.cpp b/common/uni/src/types.cpp new file mode 100644 index 00000000..0bfdec06 --- /dev/null +++ b/common/uni/src/types.cpp @@ -0,0 +1,126 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
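A worked example of array_transpose_naive above. Note the conventions implied by the index arithmetic: the dimension arrays are stored innermost-first (matching TensorDesc), while transposeDims is the outermost-first axis permutation:

    float in[6] = {1, 2, 3, 4, 5, 6};     // 2 x 3, row-major
    float out[6];                         // becomes 3 x 2
    U32 inputDims[2] = {3, 2};            // innermost (width) first
    U32 outputDims[2] = {2, 3};
    U32 perm[2] = {1, 0};                 // swap the two axes, outermost-first
    array_transpose_naive(DT_F32, inputDims, in, outputDims, out, perm, 2);
    // out == {1, 4, 2, 5, 3, 6}
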
+ +#include +#include +#include +#include +#include +#include +#include "types.h" +#include "ut_util.h" + +OperatorSpec mt_create_operator(const char *name, OperatorType type, U32 num_inputs, U32 num_outputs) +{ + OperatorSpec newOperator; + initialization_zero(&(newOperator), sizeof(OperatorSpec)); + U32 length = UNI_MIN(strlen(name), NAME_LEN - 1); + str_copy(newOperator.name, name, length); + if (length < NAME_LEN) { + newOperator.name[length] = '\0'; + } + newOperator.type = type; + newOperator.num_inputs = num_inputs; + newOperator.input_tensors_name = (I8 **)mt_new_storage(num_inputs * sizeof(I8 *)); + for (U32 i = 0; i < num_inputs; i++) { + newOperator.input_tensors_name[i] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + } + newOperator.num_outputs = num_outputs; + newOperator.output_tensors_name = (I8 **)mt_new_storage(num_outputs * sizeof(I8 *)); + for (U32 i = 0; i < num_outputs; i++) { + newOperator.output_tensors_name[i] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + } + newOperator.tensor_positions = NULL; + newOperator.num_quant_feature = 0; + newOperator.feature_scale = NULL; + return newOperator; +} + +EE mt_insert_operator(ModelSpec *ms, int index, OperatorSpec newOperator) +{ + if (nullptr == ms) { + return NULL_POINTER; + } + OperatorSpec *operatorList = + (OperatorSpec *)mt_new_storage(sizeof(OperatorSpec) * (ms->num_operator_specs + 1)); + for (int i = 0; i < index; i++) { + operatorList[i] = ms->ops[i]; + } + operatorList[index] = newOperator; + for (int i = index; i < ms->num_operator_specs; i++) { + operatorList[i + 1] = ms->ops[i]; + } + delete ms->ops; + ms->ops = operatorList; + ms->num_operator_specs++; + return SUCCESS; +} + +WeightSpec mt_create_weight( + const char *name, DataType dataType, U32 bytesOfWeight, U32 bytesOfVec, U32 numQuantScale) +{ + WeightSpec newWeight; + initialization_zero(&(newWeight), sizeof(WeightSpec)); + U32 length = UNI_MIN(strlen(name), NAME_LEN - 1); + str_copy(newWeight.op_name, name, length); + if (length < NAME_LEN) { + newWeight.op_name[length] = '\0'; + } + newWeight.mdt = dataType; + newWeight.bytes_of_weight = bytesOfWeight; + newWeight.weight = (U8 *)mt_new_storage(bytesOfWeight); + newWeight.bytes_of_vec = bytesOfVec; + newWeight.vec = (U8 *)mt_new_storage(bytesOfVec); + newWeight.num_quant_scale = numQuantScale; + newWeight.weight_scale = (QuantSpec *)mt_new_storage(sizeof(QuantSpec) * numQuantScale); + return newWeight; +} + +bool isDeprecatedOp(OperatorType opType) +{ + return (opType == OT_None) ? true : false; +} + +bool isDeprecatedOpWeight(const ModelSpec *spec, int index) +{ + if (index >= spec->num_weight_specs) { + return true; + } else { + if (spec->ws[index].bytes_of_weight == 0 && spec->ws[index].bytes_of_vec == 0) { + return true; + } else { + return false; + } + } +} + +EE str_copy(I8 *dst, const I8 *src, I32 srcLen, I32 dstLen) +{ + memset(dst, 0, dstLen); + I32 copyLen = NAME_LEN - 1; + if (copyLen > srcLen) { + copyLen = srcLen; + } + memcpy(dst, src, copyLen * sizeof(I8)); + return SUCCESS; +} + +void *mt_new_storage(size_t size) +{ + if (size == 0) { + return nullptr; + } else { + U8 *s = (U8 *)operator new(size); + return (void *)s; + } +} diff --git a/common/uni/src/uni.cpp b/common/uni/src/uni.cpp new file mode 100644 index 00000000..506f4260 --- /dev/null +++ b/common/uni/src/uni.cpp @@ -0,0 +1,42 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
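A hypothetical sketch of building an operator and splicing it into an existing ModelSpec with the helpers above (OT_Conv and the three-argument str_copy call with a defaulted destination length are assumptions from elsewhere in the codebase; ms and idx are caller-provided):

    OperatorSpec op = mt_create_operator("conv1", OT_Conv, 1, 1);
    str_copy(op.input_tensors_name[0], "data", 4);
    str_copy(op.output_tensors_name[0], "conv1_out", 9);
    CHECK_STATUS(mt_insert_operator(&ms, idx, op));
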
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_desc.h" + +#include + +extern "C" int UNI_ISINF(float a) +{ +#ifdef isinf + return isinf(a); +#else +#if __cplusplus < 201103L + return isinf(a); +#else + return std::isinf(a); +#endif +#endif +} + +extern "C" int UNI_ISNAN(float a) +{ +#ifdef isnan + return isnan(a); +#else +#if __cplusplus < 201103L + return isnan(a); +#else + return std::isnan(a); +#endif +#endif +} diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt new file mode 100644 index 00000000..1ed03bab --- /dev/null +++ b/compute/CMakeLists.txt @@ -0,0 +1,17 @@ +cmake_minimum_required(VERSION 3.2) + +file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake) +if (BOLT_CONFIGURE_FILE) + include(${BOLT_CONFIGURE_FILE}) +else (BOLT_CONFIGURE_FILE) + message(FATAL_ERROR " +FATAL: can not find bolt.cmake in /common/cmakes directory, + please set shell or cmake environment variable BOLT_ROOT. + ") +endif (BOLT_CONFIGURE_FILE) + +project(compute) + +add_subdirectory(blas_enhance) +add_subdirectory(tensor) +add_subdirectory(image) diff --git a/compute/blas_enhance/CMakeLists.txt b/compute/blas_enhance/CMakeLists.txt new file mode 100644 index 00000000..70601eef --- /dev/null +++ b/compute/blas_enhance/CMakeLists.txt @@ -0,0 +1,20 @@ +cmake_minimum_required(VERSION 3.2) + +file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake) +if (BOLT_CONFIGURE_FILE) + include(${BOLT_CONFIGURE_FILE}) +else (BOLT_CONFIGURE_FILE) + message(FATAL_ERROR " +FATAL: can not find bolt.cmake in directory, + please set shell or cmake environment variable BOLT_ROOT. + ") +endif (BOLT_CONFIGURE_FILE) + +project(blas_enhance) + +set_c_cxx_flags() + +include_blas_enhance() + +add_subdirectory(src) +add_subdirectory(tests) diff --git a/compute/blas_enhance/include/blas_enhance.h b/compute/blas_enhance/include/blas_enhance.h new file mode 100644 index 00000000..7cae3bcc --- /dev/null +++ b/compute/blas_enhance/include/blas_enhance.h @@ -0,0 +1,105 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
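The two wrappers above pick whichever isinf/isnan is visible (C macro, pre-C++11 function, or the std:: overload). A minimal usage sketch, with a hypothetical caller value:

    float v = output[i];                // hypothetical tensor element
    if (UNI_ISNAN(v) || UNI_ISINF(v)) {
        continue;                       // skip non-finite values
    }
    sum += v;
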
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_BLAS_ENHANCE +#define _H_BLAS_ENHANCE + +#include "sys.h" +#include "tensor_desc.h" +#include "types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +EE matrix_matrix_multiply_tmp_bytes( + TensorDesc matrixADesc, TensorDesc matrixBDesc, U32 *bytes, Arch arch); + +EE matrix_matrix_multiply(TensorDesc matrixADesc, + const void *matrixA, + TensorDesc matrixBDesc, + const void *matrixB, + U32 bytes, + void *tmp, + TensorDesc matrixCDesc, + void *matrixC, + Arch arch); + +EE matrix_vector_multiply_tmp_bytes(TensorDesc matrixDesc, TensorDesc vectorDesc, U32 *bytes, Arch); + +EE matrix_vector_multiply(TensorDesc matrixDesc, + const void *matrix, + TensorDesc vectorDesc, + const void *vector, + U32 bytes, + void *tmp, + TensorDesc resultDesc, + void *result, + Arch arch); + +inline DataFormat targetFormat4MatrixB(DataType dt) +{ + switch (dt) { + case DT_F16: { + return DF_NKN24; + } + case DT_F32: { +#ifdef __aarch64__ + return DF_NKN12; +#else + return DF_NKN8; +#endif + } + case DT_I8: { + return DF_NKN12K4; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + return DF_NCHWC8; + } + } +} + +inline DataFormat targetFormat4mvmMatrix(DataType dt) +{ + switch (dt) { + case DT_I8: { + return DF_NKN32K4; + } + case DT_F16: { + return DF_NKN64; + } + case DT_F32: { + return DF_NKN16; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + return DF_NCHWC8; + } + } +} + +EE matrix_matrix_multiply_transform_rhs( + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst, Arch arch); + +EE matrix_vector_multiply_transform_weight( + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst, Arch arch); + +EE vector_vector_axpby( + F32 a, TensorDesc xDesc, const void *x, F32 b, TensorDesc yDesc, void *y, Arch arch); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/compute/blas_enhance/src/CMakeLists.txt b/compute/blas_enhance/src/CMakeLists.txt new file mode 100644 index 00000000..23572214 --- /dev/null +++ b/compute/blas_enhance/src/CMakeLists.txt @@ -0,0 +1,44 @@ +if (USE_GENERAL) + file(GLOB general_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/general/*.cpp) +endif (USE_GENERAL) + +if (USE_X86) + file(GLOB x86_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/x86/*.cpp) + if (USE_FP32) + file(GLOB x86_fp32_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/x86/fp32/*.cpp) + endif (USE_FP32) + set(x86_srcs "${x86_srcs};${x86_fp32_srcs};") +endif (USE_X86) + +if (USE_NEON) + if (USE_FP16) + file(GLOB arm_fp16_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/fp16/*.cpp) + endif 
(USE_FP16) + if (USE_FP32) + file(GLOB arm_fp32_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/fp32/*.cpp) + endif (USE_FP32) + if (USE_INT8) + file(GLOB arm_int8_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/int8/*.cpp) + endif (USE_INT8) + file(GLOB arm_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/*.cpp) + set(arm_srcs "${arm_srcs};${arm_fp16_srcs};${arm_fp32_srcs};${arm_int8_srcs}") +endif (USE_NEON) + +file(GLOB srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) +set(srcs "${srcs};${general_srcs};${arm_srcs};${x86_srcs}") + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) + +# shared library +add_library(${PROJECT_NAME} SHARED ${srcs}) +target_link_libraries(${PROJECT_NAME} LINK_PUBLIC uni) + +# static library +add_library(${PROJECT_NAME}_static STATIC ${srcs}) + +set_target_properties(${PROJECT_NAME}_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}") +set_target_properties(${PROJECT_NAME} PROPERTIES CLEAN_DIRECT_OUTPUT 1) +set_target_properties(${PROJECT_NAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) +install(TARGETS ${PROJECT_NAME} ${PROJECT_NAME}_static + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) diff --git a/compute/blas_enhance/src/axpby.cpp b/compute/blas_enhance/src/axpby.cpp new file mode 100644 index 00000000..6f7cb448 --- /dev/null +++ b/compute/blas_enhance/src/axpby.cpp @@ -0,0 +1,53 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "blas_enhance.h" +#ifdef _USE_GENERAL +#include "cpu/general/blas_general.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/blas_arm.h" +#endif + +EE vector_vector_axpby( + F32 a, TensorDesc xDesc, const void *x, F32 b, TensorDesc yDesc, void *y, Arch arch) +{ + if (nullptr == x || nullptr == y) { + CHECK_STATUS(NULL_POINTER); + } + DataType xDataType, yDataType; + DataFormat xDataFormat, yDataFormat; + U32 xLen, yLen; + CHECK_STATUS(tensor1dGet(xDesc, &xDataType, &xDataFormat, &xLen)); + CHECK_STATUS(tensor1dGet(yDesc, &yDataType, &yDataFormat, &yLen)); + + if (xDataType != yDataType) { + CHECK_STATUS(NOT_MATCH); + } + + if (xLen != yLen) { + CHECK_STATUS(NOT_MATCH); + } + + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = axpby_general(yLen, yDataType, a, x, b, y); +#endif +#ifdef _USE_NEON + } else { + ret = axpby_arm(yLen, yDataType, a, x, b, y, arch); +#endif + } + return ret; +} diff --git a/compute/blas_enhance/src/cpu/arm/axpby.cpp b/compute/blas_enhance/src/cpu/arm/axpby.cpp new file mode 100644 index 00000000..89bc34a2 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/axpby.cpp @@ -0,0 +1,46 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "error.h" +#include "types.h" +#include "cpu/arm/blas_arm.h" +#ifdef _USE_FP16 +#include "cpu/arm/fp16/blas_fp16.h" +#endif +#ifdef _USE_FP32 +#include "cpu/arm/fp32/blas_fp32.h" +#endif + +EE axpby_arm(U32 len, DataType dt, F32 a, const void *x, F32 b, void *y, Arch arch) +{ + EE ret = SUCCESS; + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + if (ARM_A55 != arch && ARM_A76 != arch) { + return NOT_SUPPORTED; + } + ret = axpby_fp16(len, a, (F16 *)x, b, (F16 *)y); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + ret = axpby_fp32(len, a, (F32 *)x, b, (F32 *)y); + break; +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/blas-enhance/src/cpu/arm/blas_arm.h b/compute/blas_enhance/src/cpu/arm/blas_arm.h similarity index 53% rename from blas-enhance/src/cpu/arm/blas_arm.h rename to compute/blas_enhance/src/cpu/arm/blas_arm.h index e69e3c2b..e52b72fa 100644 --- a/blas-enhance/src/cpu/arm/blas_arm.h +++ b/compute/blas_enhance/src/cpu/arm/blas_arm.h @@ -1,42 +1,54 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
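A scalar reference for vector_vector_axpby above and the architecture-specific kernels it dispatches to, i.e. y := a*x + b*y elementwise:

    static void axpby_reference(U32 len, F32 a, const F32 *x, F32 b, F32 *y)
    {
        for (U32 i = 0; i < len; ++i) {
            y[i] = a * x[i] + b * y[i];
        }
    }
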
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _H_BLAS_ARM #define _H_BLAS_ARM #include "error.h" #include "sys.h" -#include "type.h" +#include "tensor_desc.h" + +EE matrix_vector_multiply_tmp_bytes_arm(bool transpose, DataType dt, U32 *bytes); -EE matrix_vector_multiply_tmp_bytes_arm(bool transpose, - DataType dt, U32 *bytes); +EE matrix_vector_multiply_transform_weight_arm( + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst); -EE mvm_arm(U32 row, U32 col, DataType dt, bool transpose, - const void *matrix, const void *vector, +EE mvm_arm(U32 row, + U32 col, + DataType dt, + DataFormat df, + const void *matrix, + const void *vector, void *tmp, void *result, Arch arch); -EE matrix_matrix_multiply_tmp_bytes_arm(U32 matrixA_M, U32 matrixA_K, U32 matrixB_K, U32 matrixB_N, - DataType dt, U32 *bytes); +EE matrix_matrix_multiply_tmp_bytes_arm( + U32 matrixA_M, U32 matrixA_K, U32 matrixB_K, U32 matrixB_N, DataType dt, U32 *bytes); + +EE matrix_matrix_multiply_transform_rhs_arm( + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst); -EE mmm_arm(U32 matrixC_N, U32 matrixC_M, U32 matrixA_K, - DataType matrixADataType, - const void* matrixAData, const void* matrixBData, - void* tmp, - void* matrixCData, - Arch arch); +EE mmm_arm(U32 matrixC_N, + U32 matrixC_M, + U32 matrixA_K, + DataType matrixADataType, + bool transposeA, + const void *matrixAData, + const void *matrixBData, + void *tmp, + void *matrixCData, + Arch arch); inline U32 pad_to_4_multiple(U32 k) { @@ -47,4 +59,6 @@ inline U32 pad_to_4_multiple(U32 k) } } +EE axpby_arm(U32 len, DataType dt, F32 a, const void *x, F32 b, void *y, Arch arch); + #endif diff --git a/compute/blas_enhance/src/cpu/arm/fp16/axpby.cpp b/compute/blas_enhance/src/cpu/arm/fp16/axpby.cpp new file mode 100644 index 00000000..9f3589af --- /dev/null +++ 
b/compute/blas_enhance/src/cpu/arm/fp16/axpby.cpp @@ -0,0 +1,34 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "error.h" +#include "cpu/arm/fp16/blas_fp16.h" + +EE axpby_fp16(U32 len, F32 a, const F16 *x, F32 b, F16 *y) +{ + EE ret = SUCCESS; + float16x8_t alpha = vdupq_n_f16(a); + float16x8_t beta = vdupq_n_f16(b); + I32 i = 0; + for (; i < ((I32)len) - 7; i += 8) { + float16x8_t out = vld1q_f16(y + i); + float16x8_t in = vld1q_f16(x + i); + out = vmulq_f16(out, beta); + out = vfmaq_f16(out, alpha, in); + vst1q_f16(y + i, out); + } + for (; i < (I32)len; i++) { + y[i] = a * x[i] + b * y[i]; + } + return ret; +} diff --git a/blas-enhance/src/cpu/arm/fp16/blas_fp16.h b/compute/blas_enhance/src/cpu/arm/fp16/blas_fp16.h similarity index 62% rename from blas-enhance/src/cpu/arm/fp16/blas_fp16.h rename to compute/blas_enhance/src/cpu/arm/fp16/blas_fp16.h index 1f5c57f8..fbc144d7 100644 --- a/blas-enhance/src/cpu/arm/fp16/blas_fp16.h +++ b/compute/blas_enhance/src/cpu/arm/fp16/blas_fp16.h @@ -1,34 +1,40 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
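For comparison, a hypothetical fp32 analogue of axpby_fp16 above, assuming the same vector-main-loop-plus-scalar-tail structure (a sketch, not the actual blas_fp32 kernel):

    #include <arm_neon.h>

    EE axpby_fp32_sketch(U32 len, F32 a, const F32 *x, F32 b, F32 *y)
    {
        float32x4_t alpha = vdupq_n_f32(a);
        float32x4_t beta = vdupq_n_f32(b);
        I32 i = 0;
        for (; i < ((I32)len) - 3; i += 4) {
            float32x4_t out = vld1q_f32(y + i);
            float32x4_t in = vld1q_f32(x + i);
            out = vmulq_f32(out, beta);       // out = b * y
            out = vfmaq_f32(out, alpha, in);  // out += a * x
            vst1q_f32(y + i, out);
        }
        for (; i < (I32)len; i++) {           // scalar tail
            y[i] = a * x[i] + b * y[i];
        }
        return SUCCESS;
    }
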
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - +#ifdef _USE_FP16 #ifndef _H_BLAS_FP16 #define _H_BLAS_FP16 #include "sys.h" -#include "type.h" +#include "types.h" #include "error.h" #include "tensor_desc.h" +EE matrix_vector_multiply_transform_weight_fp16(TensorDesc desc, F16 *src, F16 *dst); + +EE mvm_fp16(U32 row, U32 col, DataFormat df, F16 *matrix, F16 *vector, F16 *result, Arch arch); -EE mvm_fp16(U32 row, U32 col, bool transpose, F16* matrix, F16* vector, F16* result, Arch arch); +void matrix_matrix_multiply_tmp_bytes_fp16( + U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes); -void matrix_matrix_multiply_tmp_bytes_fp16(U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes); +EE matrix_matrix_multiply_transform_rhsN_fp16(TensorDesc desc, F16 *src, F16 *dst); -EE matrix_matrix_multiply_transform_rhsN_fp16(TensorDesc desc, F16* src, F16* dst); +EE matrix_matrix_multiply_transform_rhsT_fp16(TensorDesc desc, F16 *src, F16 *dst); -EE matrix_matrix_multiply_transform_rhsT_fp16(TensorDesc desc, F16* src, F16* dst); +EE mmm_fp16( + int M, int N, int K, bool transposeA, F16 *matrix1, F16 *matrix2, F16 *tmp, F16 *result, Arch arch); -EE mmm_fp16(int M, int N, int K, F16* matrix1, F16* matrix2, F16* tmp, F16* result, Arch arch); +EE axpby_fp16(U32 len, F32 a, const F16 *x, F32 b, F16 *y); #endif +#endif diff --git a/blas-enhance/src/cpu/arm/fp16/mmm.cpp b/compute/blas_enhance/src/cpu/arm/fp16/mmm.cpp similarity index 71% rename from blas-enhance/src/cpu/arm/fp16/mmm.cpp rename to compute/blas_enhance/src/cpu/arm/fp16/mmm.cpp index 778bc70e..8b7fc590 100644 --- a/blas-enhance/src/cpu/arm/fp16/mmm.cpp +++ b/compute/blas_enhance/src/cpu/arm/fp16/mmm.cpp @@ -1,36 +1,36 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#include "type.h" +#include "types.h" #include "error.h" #include "cpu/arm/fp16/blas_fp16.h" #include "mmm.h" #include "mmm_common.h" - -void matrix_matrix_multiply_tmp_bytes_fp16(U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes) +void matrix_matrix_multiply_tmp_bytes_fp16( + U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes) { *bytes = row1 * col1 + row2 * col2; *bytes *= bytesOf(dt); *bytes += 32; } -EE matrix_matrix_multiply_transform_rhsN_fp16(TensorDesc desc, F16* src, F16* dst) +EE matrix_matrix_multiply_transform_rhsN_fp16(TensorDesc desc, F16 *src, F16 *dst) { DataType dt; + DataFormat df; U32 N, K; - CHECK_STATUS(tensor2dGet(desc, &dt, &K, &N)); + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &K, &N)); int i = 0; for (; i < (int)N - 23; i += 24) { matrix2_trans(24, K, N, src + i, dst + i * K); @@ -47,11 +47,12 @@ EE matrix_matrix_multiply_transform_rhsN_fp16(TensorDesc desc, F16* src, F16* ds return SUCCESS; } -EE matrix_matrix_multiply_transform_rhsT_fp16(TensorDesc desc, F16* src, F16* dst) +EE matrix_matrix_multiply_transform_rhsT_fp16(TensorDesc desc, F16 *src, F16 *dst) { DataType dt; + DataFormat df; U32 N, K; - CHECK_STATUS(tensor2dGet(desc, &dt, &N, &K)); + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &N, &K)); int i = 0; for (; i < (int)N - 23; i += 24) { matrix1_trans(24, K, K, src + i * K, dst + i * K); @@ -68,20 +69,20 @@ EE matrix_matrix_multiply_transform_rhsT_fp16(TensorDesc desc, F16* src, F16* ds return SUCCESS; } - -EE mmm_fp16(int M, int N, int K, F16* matrix1, F16* matrix2, F16* tmp, F16* result, Arch arch) +EE mmm_fp16( + int M, int N, int K, bool transposeA, F16 *matrix1, F16 *matrix2, F16 *tmp, F16 *result, Arch arch) { EE ret = SUCCESS; switch (arch) { - case ARM_A55: - mmm_A55(M, N, K, matrix1, matrix2, tmp, result); - break; - case ARM_A76: - mmm_A76(M, N, K, matrix1, matrix2, tmp, result); - break; - default: - ret = NOT_SUPPORTED; - break; + case ARM_A55: + mmm_A55(M, N, K, transposeA, matrix1, matrix2, tmp, result); + break; + case ARM_A76: + mmm_A76(M, N, K, transposeA, matrix1, matrix2, tmp, result); + break; + default: + ret = NOT_SUPPORTED; + break; } return ret; } diff --git a/blas-enhance/src/cpu/arm/int8/mmm.h b/compute/blas_enhance/src/cpu/arm/fp16/mmm.h similarity index 71% rename from blas-enhance/src/cpu/arm/int8/mmm.h rename to compute/blas_enhance/src/cpu/arm/fp16/mmm.h index 81a9683c..a724ea98 100644 --- a/blas-enhance/src/cpu/arm/int8/mmm.h +++ b/compute/blas_enhance/src/cpu/arm/fp16/mmm.h @@ -1,23 +1,23 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
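The rhsN/rhsT transforms above pack the right-hand matrix into 24-column panels (DF_NKN24 for fp16), with narrower panels for the tail, so that each micro-kernel iteration can load 24 consecutive B elements for one k. One panel copy, conceptually (a sketch; matrix2_trans semantics inferred from its call site):

    // Copy panel columns [i, i+24) of a row-major K x N source into
    // contiguous K x 24 storage:
    for (U32 k = 0; k < K; ++k) {
        for (U32 n = 0; n < 24; ++n) {
            dst[k * 24 + n] = src[k * N + i + n];
        }
    }
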
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _H_MMM #define _H_MMM +#include "types.h" -#include "type.h" - -void mmm_A55(int M, int N, int K, INT8* matrix1, INT8* matrix2, INT8* tmp, I32* result); +void mmm_A55( + int M, int N, int K, bool transposeA, F16 *matrix1, F16 *matrix2, F16 *tmp, F16 *result); -void mmm_A76(int M, int N, int K, INT8* matrix1, INT8* matrix2, INT8* tmp, I32* result); +void mmm_A76( + int M, int N, int K, bool transposeA, F16 *matrix1, F16 *matrix2, F16 *tmp, F16 *result); #endif diff --git a/compute/blas_enhance/src/cpu/arm/fp16/mmm_A55.cpp b/compute/blas_enhance/src/cpu/arm/fp16/mmm_A55.cpp new file mode 100644 index 00000000..9166bde4 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/fp16/mmm_A55.cpp @@ -0,0 +1,783 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
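mmm_A55 and mmm_A76 compute fixed-size output tiles; the largest fp16 tile is 8 rows by 24 columns, held entirely in accumulator registers v5-v28. A scalar model of one such tile update under the packed layouts above (ldc is the output row stride in elements; a sketch, not the actual kernel):

    for (int k = 0; k < K; ++k) {
        for (int m = 0; m < 8; ++m) {
            for (int n = 0; n < 24; ++n) {
                out[m * ldc + n] += a_pack[k * 8 + m] * b_pack[k * 24 + n];
            }
        }
    }
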
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include +#include + +#include "types.h" +#include "error.h" +#include "cpu/arm/fp16/mmm_common.h" +#include "cpu/arm/fp16/mmm.h" + +inline void mmm_4x24_A55(U32 M, U32 K, F16 *w, F16 *in, F16 *out) +{ + U32 KTail = K % 2; + U32 KInner = K - KTail; + asm volatile( + // init in0- > v1, w- > v0 + "ld1 {v1.4h}, [%1], #8\n" + "ldr x22, [%1], #8\n" + "ins v1.d[1], x22\n" + "ld1 {v0.4h}, [%2], #8\n" + "mov x26, %0\n" + "ld1 {v5.8h, v6.8h, v7.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v8.8h, v9.8h, v10.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v11.8h, v12.8h, v13.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v14.8h, v15.8h, v16.8h}, [x26]\n" + + "mov x20, %3\n" + + "0:\n" + // w- > v4, in0- > v2/v3/v1, out0=v5~v28 + "fmla v5.8h, v1.8h, v0.h[0]\n" + "ld1 {v2.4h}, [%1], #8\n" + "fmla v8.8h, v1.8h, v0.h[1]\n" + "ldr x23, [%1], #8\n" + "fmla v11.8h, v1.8h, v0.h[2]\n" + "ins v2.d[1], x23\n" + "fmla v14.8h, v1.8h, v0.h[3]\n" + "ld1 {v4.4h}, [%2], #8\n" + "fmla v6.8h, v2.8h, v0.h[0]\n" + "ld1 {v3.4h}, [%1], #8\n" + "fmla v9.8h, v2.8h, v0.h[1]\n" + "ldr x24, [%1], #8\n" + "fmla v12.8h, v2.8h, v0.h[2]\n" + "ins v3.d[1], x24\n" + "fmla v15.8h, v2.8h, v0.h[3]\n" + "fmla v7.8h, v3.8h, v0.h[0]\n" + "ld1 {v1.4h}, [%1], #8\n" + "fmla v10.8h, v3.8h, v0.h[1]\n" + "ldr x22, [%1], #8\n" + "fmla v13.8h, v3.8h, v0.h[2]\n" + "ins v1.d[1], x22\n" + "fmla v16.8h, v3.8h, v0.h[3]\n" + + // w- > v0, in0- > v2/v3/v1, out0- > v5~v28 + "fmla v5.8h, v1.8h, v4.h[0]\n" + "ld1 {v2.4h}, [%1], #8\n" + "fmla v8.8h, v1.8h, v4.h[1]\n" + "ldr x23, [%1], #8\n" + "fmla v11.8h, v1.8h, v4.h[2]\n" + "ins v2.d[1], x23\n" + "fmla v14.8h, v1.8h, v4.h[3]\n" + "ld1 {v0.4h}, [%2], #8\n" + "fmla v6.8h, v2.8h, v4.h[0]\n" + "ld1 {v3.4h}, [%1], #8\n" + "fmla v9.8h, v2.8h, v4.h[1]\n" + "ldr x24, [%1], #8\n" + "fmla v12.8h, v2.8h, v4.h[2]\n" + "ins v3.d[1], x24\n" + "fmla v15.8h, v2.8h, v4.h[3]\n" + "fmla v7.8h, v3.8h, v4.h[0]\n" + "ld1 {v1.4h}, [%1], #8\n" + "fmla v10.8h, v3.8h, v4.h[1]\n" + "ldr x22, [%1], #8\n" + "fmla v13.8h, v3.8h, v4.h[2]\n" + "ins v1.d[1], x22\n" + "fmla v16.8h, v3.8h, v4.h[3]\n" + + "subs x20, x20, #0x2\n" + "bne 0b\n" + + "cbz %5, 1f\n" + "fmla v5.8h, v1.8h, v0.h[0]\n" + "ld1 {v2.4h}, [%1], #8\n" + "fmla v8.8h, v1.8h, v0.h[1]\n" + "ldr x23, [%1], #8\n" + "fmla v11.8h, v1.8h, v0.h[2]\n" + "ins v2.d[1], x23\n" + "fmla v14.8h, v1.8h, v0.h[3]\n" + "fmla v6.8h, v2.8h, v0.h[0]\n" + "ld1 {v3.4h}, [%1], #8\n" + "fmla v9.8h, v2.8h, v0.h[1]\n" + "ldr x24, [%1], #8\n" + "fmla v12.8h, v2.8h, v0.h[2]\n" + "ins v3.d[1], x24\n" + "fmla v15.8h, v2.8h, v0.h[3]\n" + "fmla v7.8h, v3.8h, v0.h[0]\n" + "fmla v10.8h, v3.8h, v0.h[1]\n" + "fmla v13.8h, v3.8h, v0.h[2]\n" + "fmla v16.8h, v3.8h, v0.h[3]\n" + + "1:\n" + "mov x26, %0\n" + "st1 {v5.8h, v6.8h, v7.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v8.8h, v9.8h, v10.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.8h, v12.8h, v13.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v14.8h, v15.8h, v16.8h}, [x26]\n" + : "+r"(out), "+r"(in), "+r"(w) + : "r"((I64)KInner), "r"((I64)M), "r"((I64)KTail) + : "memory", "cc", "x0", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", + "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16"); +} + +inline void mmm_8x4_A55(U32 M, 
U32 K, F16 *w, F16 *in, F16 *out) +{ + U32 KTail = K % 2; + U32 KInner = K - KTail; + asm volatile("mov x26, %0\n" + "ld1 {v5.h}[0], [x26], #2\n" + "ld1 {v6.h}[0], [x26], #2\n" + "ld1 {v7.h}[0], [x26], #2\n" + "ld1 {v8.h}[0], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "ld1 {v5.h}[1], [x26], #2\n" + "ld1 {v6.h}[1], [x26], #2\n" + "ld1 {v7.h}[1], [x26], #2\n" + "ld1 {v8.h}[1], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "ld1 {v5.h}[2], [x26], #2\n" + "ld1 {v6.h}[2], [x26], #2\n" + "ld1 {v7.h}[2], [x26], #2\n" + "ld1 {v8.h}[2], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "ld1 {v5.h}[3], [x26], #2\n" + "ld1 {v6.h}[3], [x26], #2\n" + "ld1 {v7.h}[3], [x26], #2\n" + "ld1 {v8.h}[3], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "ld1 {v5.h}[4], [x26], #2\n" + "ld1 {v6.h}[4], [x26], #2\n" + "ld1 {v7.h}[4], [x26], #2\n" + "ld1 {v8.h}[4], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "ld1 {v5.h}[5], [x26], #2\n" + "ld1 {v6.h}[5], [x26], #2\n" + "ld1 {v7.h}[5], [x26], #2\n" + "ld1 {v8.h}[5], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "ld1 {v5.h}[6], [x26], #2\n" + "ld1 {v6.h}[6], [x26], #2\n" + "ld1 {v7.h}[6], [x26], #2\n" + "ld1 {v8.h}[6], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "ld1 {v5.h}[7], [x26], #2\n" + "ld1 {v6.h}[7], [x26], #2\n" + "ld1 {v7.h}[7], [x26], #2\n" + "ld1 {v8.h}[7], [x26], #2\n" + "add x26, x26, %4\n" + + "mov x20, %3\n" + + "ld1 {v1.4h}, [%2], #8\n" + "ldr x24, [%2], #8\n" + "ins v1.d[1], x24\n" + "ld1 {v2.4h}, [%1], #8\n" + + "0:\n" + "fmla v5.8h, v1.8h, v2.h[0]\n" + "ld1 {v3.4h}, [%2], #8\n" + "fmla v6.8h, v1.8h, v2.h[1]\n" + "ldr x25, [%2], #8\n" + "ld1 {v4.4h}, [%1], #8\n" + "fmla v7.8h, v1.8h, v2.h[2]\n" + "ins v3.d[1], x25\n" + "fmla v8.8h, v1.8h, v2.h[3]\n" + + "fmla v5.8h, v3.8h, v4.h[0]\n" + "ld1 {v1.4h}, [%2], #8\n" + "fmla v6.8h, v3.8h, v4.h[1]\n" + "ldr x24, [%2], #8\n" + "ld1 {v2.4h}, [%1], #8\n" + "fmla v7.8h, v3.8h, v4.h[2]\n" + "ins v1.d[1], x24\n" + "fmla v8.8h, v3.8h, v4.h[3]\n" + + "subs x20, x20, 0x2\n" + "bne 0b\n" + + "cbz %5, 1f\n" + "fmla v5.8h, v1.8h, v2.h[0]\n" + "fmla v6.8h, v1.8h, v2.h[1]\n" + "fmla v7.8h, v1.8h, v2.h[2]\n" + "fmla v8.8h, v1.8h, v2.h[3]\n" + + "1:\n" + "mov x26, %0\n" + "st1 {v5.h}[0], [x26], #2\n" + "st1 {v6.h}[0], [x26], #2\n" + "st1 {v7.h}[0], [x26], #2\n" + "st1 {v8.h}[0], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "st1 {v5.h}[1], [x26], #2\n" + "st1 {v6.h}[1], [x26], #2\n" + "st1 {v7.h}[1], [x26], #2\n" + "st1 {v8.h}[1], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "st1 {v5.h}[2], [x26], #2\n" + "st1 {v6.h}[2], [x26], #2\n" + "st1 {v7.h}[2], [x26], #2\n" + "st1 {v8.h}[2], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "st1 {v5.h}[3], [x26], #2\n" + "st1 {v6.h}[3], [x26], #2\n" + "st1 {v7.h}[3], [x26], #2\n" + "st1 {v8.h}[3], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "st1 {v5.h}[4], [x26], #2\n" + "st1 {v6.h}[4], [x26], #2\n" + "st1 {v7.h}[4], [x26], #2\n" + "st1 {v8.h}[4], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "st1 {v5.h}[5], [x26], #2\n" + "st1 {v6.h}[5], [x26], #2\n" + "st1 {v7.h}[5], [x26], #2\n" + "st1 {v8.h}[5], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "st1 {v5.h}[6], [x26], #2\n" + "st1 {v6.h}[6], [x26], #2\n" + "st1 {v7.h}[6], [x26], #2\n" + "st1 {v8.h}[6], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "st1 {v5.h}[7], [x26], #2\n" + "st1 {v6.h}[7], [x26], #2\n" + "st1 {v7.h}[7], 
[x26], #2\n" + "st1 {v8.h}[7], [x26], #2\n" + "add x26, x26, %4\n" + : "+r"(out), "+r"(in), "+r"(w) + : "r"((I64)KInner), "r"((I64)M), "r"((I64)KTail) + : "memory", "cc", "x20", "x24", "x25", "x26", "x27", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8"); +} + +inline void mmm_4x8_A55(U32 M, U32 K, F16 *w, F16 *in, F16 *out) +{ + U32 KTail = K % 2; + U32 KInner = K - KTail; + asm volatile("mov x26, %0\n" + "ld1 {v5.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v6.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v7.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v8.8h}, [x26]\n" + + "mov x20, %3\n" + + "ld1 {v1.4h}, [%1], #8\n" + "ldr x24, [%1], #8\n" + "ins v1.d[1], x24\n" + "ld1 {v2.4h}, [%2], #8\n" + + "0:\n" + "fmla v5.8h, v1.8h, v2.h[0]\n" + "ld1 {v3.4h}, [%1], #8\n" + "fmla v6.8h, v1.8h, v2.h[1]\n" + "ldr x25, [%1], #8\n" + "ld1 {v4.4h}, [%2], #8\n" + "fmla v7.8h, v1.8h, v2.h[2]\n" + "ins v3.d[1], x25\n" + "fmla v8.8h, v1.8h, v2.h[3]\n" + "fmla v5.8h, v3.8h, v4.h[0]\n" + "ld1 {v1.4h}, [%1], #8\n" + "fmla v6.8h, v3.8h, v4.h[1]\n" + "ldr x24, [%1], #8\n" + "ld1 {v2.4h}, [%2], #8\n" + "fmla v7.8h, v3.8h, v4.h[2]\n" + "ins v1.d[1], x24\n" + "fmla v8.8h, v3.8h, v4.h[3]\n" + + "subs x20, x20, 0x2\n" + "bne 0b\n" + + "cbz %5, 1f\n" + "fmla v5.8h, v1.8h, v2.h[0]\n" + "fmla v6.8h, v1.8h, v2.h[1]\n" + "fmla v7.8h, v1.8h, v2.h[2]\n" + "fmla v8.8h, v1.8h, v2.h[3]\n" + + "1:\n" + "mov x26, %0\n" + "st1 {v5.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v6.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v7.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v8.8h}, [x26]\n" + : "+r"(out), "+r"(in), "+r"(w) + : "r"((I64)KInner), "r"((I64)M), "r"((I64)KTail) + : "memory", "cc", "x20", "x24", "x25", "x26", "x27", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8"); +} + +inline void mmm_4x4_A55(U32 M, U32 K, F16 *w, F16 *in, F16 *out) +{ + U32 KTail = K % 2; + U32 KInner = K - KTail; + asm volatile("mov x26, %0\n" + "ld1 {v5.4h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v6.4h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v7.4h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v8.4h}, [x26]\n" + + "mov x20, %3\n" + + "ld1 {v1.4h}, [%1], #8\n" + "ld1 {v2.4h}, [%2], #8\n" + + "0:\n" + "fmla v5.4h, v1.4h, v2.h[0]\n" + "ld1 {v3.4h}, [%1], #8\n" + "fmla v6.4h, v1.4h, v2.h[1]\n" + "ld1 {v4.4h}, [%2], #8\n" + "fmla v7.4h, v1.4h, v2.h[2]\n" + "fmla v8.4h, v1.4h, v2.h[3]\n" + "fmla v5.4h, v3.4h, v4.h[0]\n" + "ld1 {v1.4h}, [%1], #8\n" + "fmla v6.4h, v3.4h, v4.h[1]\n" + "ld1 {v2.4h}, [%2], #8\n" + "fmla v7.4h, v3.4h, v4.h[2]\n" + "fmla v8.4h, v3.4h, v4.h[3]\n" + + "subs x20, x20, 0x2\n" + "bne 0b\n" + + "cbz %5, 1f\n" + "fmla v5.4h, v1.4h, v2.h[0]\n" + "fmla v6.4h, v1.4h, v2.h[1]\n" + "fmla v7.4h, v1.4h, v2.h[2]\n" + "fmla v8.4h, v1.4h, v2.h[3]\n" + + "1:\n" + "mov x26, %0\n" + "st1 {v5.4h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v6.4h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v7.4h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v8.4h}, [x26]\n" + : "+r"(out), "+r"(in), "+r"(w) + : "r"((I64)KInner), "r"((I64)M), "r"((I64)KTail) + : "memory", "cc", "x20", "x26", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8"); +} + +inline void mmm_8x8_A55(U32 M, U32 K, F16 *w, F16 *in, F16 *out) +{ + U32 KTail = K % 2; + U32 KInner = K - KTail; + asm volatile("mov x26, %0\n" + "ld1 {v5.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v6.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v7.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v8.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v9.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v10.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 
{v11.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v12.8h}, [x26]\n" + + "mov x20, %3\n" + + "ld1 {v1.4h}, [%1], #8\n" + "ldr x24, [%1], #8\n" + "ins v1.d[1], x24\n" + "ld1 {v2.4h}, [%2], #8\n" + "ldr x22, [%2], #8\n" + "ins v2.d[1], x22\n" + + "0:\n" + "fmla v5.8h, v1.8h, v2.h[0]\n" + "ld1 {v3.4h}, [%1], #8\n" + "fmla v6.8h, v1.8h, v2.h[1]\n" + "ldr x25, [%1], #8\n" + "fmla v7.8h, v1.8h, v2.h[2]\n" + "ins v3.d[1], x25\n" + "fmla v8.8h, v1.8h, v2.h[3]\n" + "ld1 {v4.4h}, [%2], #8\n" + "fmla v9.8h, v1.8h, v2.h[4]\n" + "ldr x23, [%2], #8\n" + "fmla v10.8h, v1.8h, v2.h[5]\n" + "ins v4.d[1], x23\n" + "fmla v11.8h, v1.8h, v2.h[6]\n" + "fmla v12.8h, v1.8h, v2.h[7]\n" + + "fmla v5.8h, v3.8h, v4.h[0]\n" + "ld1 {v1.4h}, [%1], #8\n" + "fmla v6.8h, v3.8h, v4.h[1]\n" + "ldr x24, [%1], #8\n" + "fmla v7.8h, v3.8h, v4.h[2]\n" + "ins v1.d[1], x24\n" + "fmla v8.8h, v3.8h, v4.h[3]\n" + "ld1 {v2.4h}, [%2], #8\n" + "fmla v9.8h, v3.8h, v4.h[4]\n" + "ldr x22, [%2], #8\n" + "fmla v10.8h, v3.8h, v4.h[5]\n" + "ins v2.d[1], x22\n" + "fmla v11.8h, v3.8h, v4.h[6]\n" + "fmla v12.8h, v3.8h, v4.h[7]\n" + + "subs x20, x20, 0x2\n" + "bne 0b\n" + + "cbz %5, 1f\n" + "fmla v5.8h, v1.8h, v2.h[0]\n" + "fmla v6.8h, v1.8h, v2.h[1]\n" + "fmla v7.8h, v1.8h, v2.h[2]\n" + "fmla v8.8h, v1.8h, v2.h[3]\n" + "fmla v9.8h, v1.8h, v2.h[4]\n" + "fmla v10.8h, v1.8h, v2.h[5]\n" + "fmla v11.8h, v1.8h, v2.h[6]\n" + "fmla v12.8h, v1.8h, v2.h[7]\n" + + "1:\n" + "mov x26, %0\n" + "st1 {v5.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v6.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v7.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v8.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v9.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v10.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v12.8h}, [x26]\n" + : "+r"(out), "+r"(in), "+r"(w) + : "r"((I64)KInner), "r"((I64)M), "r"((I64)KTail) + : "memory", "cc", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "v1", + "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12"); +} + +inline void mmm_8x24_A55(U32 M, U32 K, F16 *w, F16 *in, F16 *out) +{ + U32 KTail = K % 2; + U32 KInner = K - KTail; + asm volatile( + // init in0- > v1, w- > v0 + "ld1 {v1.4h}, [%1], #8\n" + "ldr x22, [%1], #8\n" + "ins v1.d[1], x22\n" + "ld1 {v0.4h}, [%2], #8\n" + "ldr x21, [%2], #8\n" + "ins v0.d[1], x21\n" + + "mov x26, %0\n" + "ld1 {v5.8h, v6.8h, v7.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v8.8h, v9.8h, v10.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v11.8h, v12.8h, v13.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v14.8h, v15.8h, v16.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v17.8h, v18.8h, v19.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v20.8h, v21.8h, v22.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v23.8h, v24.8h, v25.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v26.8h, v27.8h, v28.8h}, [x26]\n" + + "mov x20, %3\n" + + "0:\n" + // w- > v4, in0- > v2/v3/v1, out0=v5~v28 + "ld1 {v2.4h}, [%1], #8\n" + "fmla v5.8h, v1.8h, v0.h[0]\n" + "fmla v8.8h, v1.8h, v0.h[1]\n" + "ldr x23, [%1], #8\n" + "fmla v11.8h, v1.8h, v0.h[2]\n" + "fmla v14.8h, v1.8h, v0.h[3]\n" + "ins v2.d[1], x23\n" + "fmla v17.8h, v1.8h, v0.h[4]\n" + "fmla v20.8h, v1.8h, v0.h[5]\n" + "ld1 {v4.4h}, [%2], #8\n" + "fmla v23.8h, v1.8h, v0.h[6]\n" + "fmla v26.8h, v1.8h, v0.h[7]\n" + + "ld1 {v3.4h}, [%1], #8\n" + "fmla v6.8h, v2.8h, v0.h[0]\n" + "fmla v9.8h, v2.8h, v0.h[1]\n" + "ldr x24, [%1], #8\n" + "fmla v12.8h, v2.8h, v0.h[2]\n" + "fmla v15.8h, v2.8h, v0.h[3]\n" + "ins v3.d[1], x24\n" + "fmla v18.8h, v2.8h, 
v0.h[4]\n" + "fmla v21.8h, v2.8h, v0.h[5]\n" + "ldr x25, [%2], #8\n" + "fmla v24.8h, v2.8h, v0.h[6]\n" + "fmla v27.8h, v2.8h, v0.h[7]\n" + + "ld1 {v1.4h}, [%1], #8\n" + "fmla v7.8h, v3.8h, v0.h[0]\n" + "fmla v10.8h, v3.8h, v0.h[1]\n" + "ldr x22, [%1], #8\n" + "fmla v13.8h, v3.8h, v0.h[2]\n" + "fmla v16.8h, v3.8h, v0.h[3]\n" + "ins v1.d[1], x22\n" + "fmla v19.8h, v3.8h, v0.h[4]\n" + "fmla v22.8h, v3.8h, v0.h[5]\n" + "ins v4.d[1], x25\n" + "fmla v25.8h, v3.8h, v0.h[6]\n" + "fmla v28.8h, v3.8h, v0.h[7]\n" + + // w- > v0, in0- > v2/v3/v1, out0- > v5~v28 + "ld1 {v2.4h}, [%1], #8\n" + "fmla v5.8h, v1.8h, v4.h[0]\n" + "fmla v8.8h, v1.8h, v4.h[1]\n" + "ldr x23, [%1], #8\n" + "fmla v11.8h, v1.8h, v4.h[2]\n" + "fmla v14.8h, v1.8h, v4.h[3]\n" + "ins v2.d[1], x23\n" + "fmla v17.8h, v1.8h, v4.h[4]\n" + "fmla v20.8h, v1.8h, v4.h[5]\n" + "ld1 {v0.4h}, [%2], #8\n" + "fmla v23.8h, v1.8h, v4.h[6]\n" + "fmla v26.8h, v1.8h, v4.h[7]\n" + + "ld1 {v3.4h}, [%1], #8\n" + "fmla v6.8h, v2.8h, v4.h[0]\n" + "fmla v9.8h, v2.8h, v4.h[1]\n" + "ldr x24, [%1], #8\n" + "fmla v12.8h, v2.8h, v4.h[2]\n" + "fmla v15.8h, v2.8h, v4.h[3]\n" + "ins v3.d[1], x24\n" + "fmla v18.8h, v2.8h, v4.h[4]\n" + "fmla v21.8h, v2.8h, v4.h[5]\n" + "ldr x21, [%2], #8\n" + "fmla v24.8h, v2.8h, v4.h[6]\n" + "fmla v27.8h, v2.8h, v4.h[7]\n" + + "ld1 {v1.4h}, [%1], #8\n" + "fmla v7.8h, v3.8h, v4.h[0]\n" + "fmla v10.8h, v3.8h, v4.h[1]\n" + "ldr x22, [%1], #8\n" + "fmla v13.8h, v3.8h, v4.h[2]\n" + "fmla v16.8h, v3.8h, v4.h[3]\n" + "ins v1.d[1], x22\n" + "fmla v19.8h, v3.8h, v4.h[4]\n" + "fmla v22.8h, v3.8h, v4.h[5]\n" + "ins v0.d[1], x21\n" + "fmla v25.8h, v3.8h, v4.h[6]\n" + "subs x20, x20, #0x2\n" + "fmla v28.8h, v3.8h, v4.h[7]\n" + + "bne 0b\n" + + "cbz %5, 1f\n" + "ld1 {v2.4h}, [%1], #8\n" + "fmla v5.8h, v1.8h, v0.h[0]\n" + "fmla v8.8h, v1.8h, v0.h[1]\n" + "ldr x23, [%1], #8\n" + "fmla v11.8h, v1.8h, v0.h[2]\n" + "fmla v14.8h, v1.8h, v0.h[3]\n" + "ins v2.d[1], x23\n" + "fmla v17.8h, v1.8h, v0.h[4]\n" + "fmla v20.8h, v1.8h, v0.h[5]\n" + "fmla v23.8h, v1.8h, v0.h[6]\n" + "fmla v26.8h, v1.8h, v0.h[7]\n" + + "ld1 {v3.4h}, [%1], #8\n" + "fmla v6.8h, v2.8h, v0.h[0]\n" + "fmla v9.8h, v2.8h, v0.h[1]\n" + "ldr x24, [%1], #8\n" + "fmla v12.8h, v2.8h, v0.h[2]\n" + "fmla v15.8h, v2.8h, v0.h[3]\n" + "ins v3.d[1], x24\n" + "fmla v18.8h, v2.8h, v0.h[4]\n" + "fmla v21.8h, v2.8h, v0.h[5]\n" + "fmla v24.8h, v2.8h, v0.h[6]\n" + "fmla v27.8h, v2.8h, v0.h[7]\n" + + "fmla v7.8h, v3.8h, v0.h[0]\n" + "fmla v10.8h, v3.8h, v0.h[1]\n" + "fmla v13.8h, v3.8h, v0.h[2]\n" + "fmla v16.8h, v3.8h, v0.h[3]\n" + "fmla v19.8h, v3.8h, v0.h[4]\n" + "fmla v22.8h, v3.8h, v0.h[5]\n" + "fmla v25.8h, v3.8h, v0.h[6]\n" + "fmla v28.8h, v3.8h, v0.h[7]\n" + + "1:\n" + "mov x26, %0\n" + "st1 {v5.8h, v6.8h, v7.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v8.8h, v9.8h, v10.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.8h, v12.8h, v13.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v14.8h, v15.8h, v16.8h}, [x26]\n" + "add x26, x26, %4\n" + + "st1 {v17.8h, v18.8h, v19.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v20.8h, v21.8h, v22.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v23.8h, v24.8h, v25.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v26.8h, v27.8h, v28.8h}, [x26]\n" + : "+r"(out), "+r"(in), "+r"(w) + : "r"((I64)KInner), "r"((I64)M), "r"((I64)KTail) + : "memory", "cc", "x0", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", + "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", 
"v25", "v26", "v27", "v28"); +} + +void mmm_A55(int M, int N, int K, bool transposeA, F16 *matrix1, F16 *matrix2, F16 *tmp, F16 *result) +{ + int blockK = K; + int blockM = 192; + F16 *matrix1Trans = tmp; + F16 *resultCurrent = result; + int KInner, MInner, m, n; + for (int k = 0; k < K; k += blockK) { + KInner = UNI_MIN(blockK, K - k); + for (int i = 0; i < M; i += blockM) { + MInner = UNI_MIN(blockM, M - i); + + for (n = 0; n <= N - 8; n += 8) { + if (i == 0) { + if (transposeA) { + matrix2_trans(8, KInner, N, matrix1 + n, matrix1Trans + n * KInner); + } else { + matrix1_trans(8, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + } + } + for (m = 0; m <= (MInner - 24); m += 24) { + resultCurrent = result + n * M + m + i; + mmm_8x24_A55(M * 2, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + for (; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_8x8_A55(M * 2, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + + if ((MInner - m) >= 4) { + resultCurrent = result + n * M + m + i; + mmm_8x4_A55(M * 2, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + m += 4; + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_N8_MTail(MInner - m, M, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + } + + if ((N - n) >= 4) { + if (i == 0) { + if (transposeA) { + matrix2_trans(4, KInner, N, matrix1 + n, matrix1Trans + n * KInner); + } else { + matrix1_trans(4, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + } + } + + for (m = 0; m <= (MInner - 24); m += 24) { + resultCurrent = result + n * M + m + i; + mmm_4x24_A55(M * 2, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + + for (; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_4x8_A55(M * 2, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + + if ((MInner - m) >= 4) { + resultCurrent = result + n * M + m + i; + mmm_4x4_A55(M * 2, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + m += 4; + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_N4_MTail(MInner - m, M, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + + n += 4; + } + + if (N - n) { + if (i == 0) { + if (transposeA) { + matrix2_trans(N - n, KInner, N, matrix1 + n, matrix1Trans + n * KInner); + } else { + matrix1_trans( + N - n, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + } + } + + for (m = 0; m <= (MInner - 24); m += 24) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M24(M, N - n, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + + for (; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M8(M, N - n, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + + if ((MInner - m) >= 4) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M4(M, N - n, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + m += 4; + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M(MInner - m, M, N - n, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + } + } + } +} diff --git a/compute/blas_enhance/src/cpu/arm/fp16/mmm_A76.cpp b/compute/blas_enhance/src/cpu/arm/fp16/mmm_A76.cpp new file mode 100644 
index 00000000..74df49e2 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/fp16/mmm_A76.cpp @@ -0,0 +1,592 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include + +#include "types.h" +#include "error.h" +#include "cpu/arm/fp16/mmm_common.h" +#include "cpu/arm/fp16/mmm.h" + +#define MMM_FMA_4x8_V5V14s3_V1xV0 \ + "fmla v5.8h, v1.8h, v0.h[0]\n" \ + "fmla v8.8h, v1.8h, v0.h[1]\n" \ + "fmla v11.8h, v1.8h, v0.h[2]\n" \ + "fmla v14.8h, v1.8h, v0.h[3]\n" +#define MMM_FMA_4x8_V17V26s3_V1xV0 \ + "fmla v17.8h, v1.8h, v0.h[4]\n" \ + "fmla v20.8h, v1.8h, v0.h[5]\n" \ + "fmla v23.8h, v1.8h, v0.h[6]\n" \ + "fmla v26.8h, v1.8h, v0.h[7]\n" +#define MMM_FMA_4x8_V6V15s3_V2xV0 \ + "fmla v6.8h, v2.8h, v0.h[0]\n" \ + "fmla v9.8h, v2.8h, v0.h[1]\n" \ + "fmla v12.8h, v2.8h, v0.h[2]\n" \ + "fmla v15.8h, v2.8h, v0.h[3]\n" +#define MMM_FMA_4x8_V18V27s3_V2xV0 \ + "fmla v18.8h, v2.8h, v0.h[4]\n" \ + "fmla v21.8h, v2.8h, v0.h[5]\n" \ + "fmla v24.8h, v2.8h, v0.h[6]\n" \ + "fmla v27.8h, v2.8h, v0.h[7]\n" +#define MMM_FMA_4x8_V7V16s3_V3xV0 \ + "fmla v7.8h, v3.8h, v0.h[0]\n" \ + "fmla v10.8h, v3.8h, v0.h[1]\n" \ + "fmla v13.8h, v3.8h, v0.h[2]\n" \ + "fmla v16.8h, v3.8h, v0.h[3]\n" +#define MMM_FMA_4x8_V19V28s3_V3xV0 \ + "fmla v19.8h, v3.8h, v0.h[4]\n" \ + "fmla v22.8h, v3.8h, v0.h[5]\n" \ + "fmla v25.8h, v3.8h, v0.h[6]\n" \ + "fmla v28.8h, v3.8h, v0.h[7]\n" +#define MMM_FMA_4x8_V5V14s3_V29xV4 \ + "fmla v5.8h, v29.8h, v4.h[0]\n" \ + "fmla v8.8h, v29.8h, v4.h[1]\n" \ + "fmla v11.8h, v29.8h, v4.h[2]\n" \ + "fmla v14.8h, v29.8h, v4.h[3]\n" +#define MMM_FMA_4x8_V17V26s3_V29xV4 \ + "fmla v17.8h, v29.8h, v4.h[4]\n" \ + "fmla v20.8h, v29.8h, v4.h[5]\n" \ + "fmla v23.8h, v29.8h, v4.h[6]\n" \ + "fmla v26.8h, v29.8h, v4.h[7]\n" +#define MMM_FMA_4x8_V6V15s3_V30xV4 \ + "fmla v6.8h, v30.8h, v4.h[0]\n" \ + "fmla v9.8h, v30.8h, v4.h[1]\n" \ + "fmla v12.8h, v30.8h, v4.h[2]\n" \ + "fmla v15.8h, v30.8h, v4.h[3]\n" +#define MMM_FMA_4x8_V18V27s3_V30xV4 \ + "fmla v18.8h, v30.8h, v4.h[4]\n" \ + "fmla v21.8h, v30.8h, v4.h[5]\n" \ + "fmla v24.8h, v30.8h, v4.h[6]\n" \ + "fmla v27.8h, v30.8h, v4.h[7]\n" +#define MMM_FMA_4x8_V7V16s3_V31xV4 \ + "fmla v7.8h, v31.8h, v4.h[0]\n" \ + "fmla v10.8h, v31.8h, v4.h[1]\n" \ + "fmla v13.8h, v31.8h, v4.h[2]\n" \ + "fmla v16.8h, v31.8h, v4.h[3]\n" +#define MMM_FMA_4x8_V19V28s3_V31xV4 \ + "fmla v19.8h, v31.8h, v4.h[4]\n" \ + "fmla v22.8h, v31.8h, v4.h[5]\n" \ + "fmla v25.8h, v31.8h, v4.h[6]\n" \ + "fmla v28.8h, v31.8h, 
v4.h[7]\n" + +inline void mmm_4x24_A76(U32 M, U32 K, F16 *w, F16 *in, F16 *out) +{ + U32 KTail = K % 2; + U32 KInner = K - KTail; + asm volatile( + // init in0- > v1, w- > v0 + "ld1 {v1.8h}, [%1], #16\n" + "ld1 {v0.4h}, [%2], #8\n" + "mov x26, %0\n" + "ld1 {v5.8h, v6.8h, v7.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v8.8h, v9.8h, v10.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v11.8h, v12.8h, v13.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v14.8h, v15.8h, v16.8h}, [x26]\n" + + "mov x20, %3\n" + + "0:\n" + // w- > v4, in0- > v2/v3/v1, out0=v5~v28 + "ld1 {v2.8h}, [%1], #16\n" MMM_FMA_4x8_V5V14s3_V1xV0 "ld1 {v3.8h}, [%1], #16\n" + "ld1 {v29.8h}, [%1], #16\n" MMM_FMA_4x8_V6V15s3_V2xV0 "ld1 {v4.4h}, [%2], " + "#8\n" MMM_FMA_4x8_V7V16s3_V3xV0 + + // w- > v0, in0- > v2/v3/v1, out0- > v5~v28 + "ld1 {v30.8h}, [%1], #16\n" MMM_FMA_4x8_V5V14s3_V29xV4 + "ld1 {v31.8h}, [%1], #16\n" MMM_FMA_4x8_V6V15s3_V30xV4 "ld1 {v1.8h}, [%1], #16\n" + "ld1 {v0.4h}, [%2], #8\n" MMM_FMA_4x8_V7V16s3_V31xV4 + + "subs x20, x20, #0x2\n" + "bne 0b\n" + + "cbz %5, 1f\n" + "ld1 {v2.8h}, [%1], #16\n" MMM_FMA_4x8_V5V14s3_V1xV0 + "ld1 {v3.8h}, [%1], #16\n" MMM_FMA_4x8_V6V15s3_V2xV0 MMM_FMA_4x8_V7V16s3_V3xV0 + + "1:\n" + "mov x26, %0\n" + "st1 {v5.8h, v6.8h, v7.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v8.8h, v9.8h, v10.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.8h, v12.8h, v13.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v14.8h, v15.8h, v16.8h}, [x26]\n" + : "+r"(out), "+r"(in), "+r"(w) + : "r"((I64)KInner), "r"((I64)M), "r"((I64)KTail) + : "memory", "cc", "x20", "x26", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v29", "v30", "v31"); +} +inline void mmm_8x4_A76(U32 M, U32 K, F16 *w, F16 *in, F16 *out) +{ + U32 KTail = K % 2; + U32 KInner = K - KTail; + asm volatile("ld1 {v1.8h}, [%2], #16\n" + "ld1 {v0.4h}, [%1], #8\n" + + "mov x26, %0\n" + "ld1 {v5.h}[0], [x26], #2\n" + "ld1 {v8.h}[0], [x26], #2\n" + "ld1 {v11.h}[0], [x26], #2\n" + "ld1 {v14.h}[0], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "ld1 {v5.h}[1], [x26], #2\n" + "ld1 {v8.h}[1], [x26], #2\n" + "ld1 {v11.h}[1], [x26], #2\n" + "ld1 {v14.h}[1], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "ld1 {v5.h}[2], [x26], #2\n" + "ld1 {v8.h}[2], [x26], #2\n" + "ld1 {v11.h}[2], [x26], #2\n" + "ld1 {v14.h}[2], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "ld1 {v5.h}[3], [x26], #2\n" + "ld1 {v8.h}[3], [x26], #2\n" + "ld1 {v11.h}[3], [x26], #2\n" + "ld1 {v14.h}[3], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "ld1 {v5.h}[4], [x26], #2\n" + "ld1 {v8.h}[4], [x26], #2\n" + "ld1 {v11.h}[4], [x26], #2\n" + "ld1 {v14.h}[4], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "ld1 {v5.h}[5], [x26], #2\n" + "ld1 {v8.h}[5], [x26], #2\n" + "ld1 {v11.h}[5], [x26], #2\n" + "ld1 {v14.h}[5], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "ld1 {v5.h}[6], [x26], #2\n" + "ld1 {v8.h}[6], [x26], #2\n" + "ld1 {v11.h}[6], [x26], #2\n" + "ld1 {v14.h}[6], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "ld1 {v5.h}[7], [x26], #2\n" + "ld1 {v8.h}[7], [x26], #2\n" + "ld1 {v11.h}[7], [x26], #2\n" + "ld1 {v14.h}[7], [x26], #2\n" + + "mov x20, %3\n" + + "0:\n" + "ld1 {v4.4h}, [%1], #8\n" + "ld1 {v29.8h}, [%2], #16\n" MMM_FMA_4x8_V5V14s3_V1xV0 "ld1 {v1.8h}, [%2], #16\n" + "ld1 {v0.4h}, [%1], #8\n" MMM_FMA_4x8_V5V14s3_V29xV4 + + "subs x20, x20, 0x2\n" + "bne 0b\n" + + "cbz %5, 1f\n" MMM_FMA_4x8_V5V14s3_V1xV0 + + "1:\n" + "mov x26, %0\n" + 
"st1 {v5.h}[0], [x26], #2\n" + "st1 {v8.h}[0], [x26], #2\n" + "st1 {v11.h}[0], [x26], #2\n" + "st1 {v14.h}[0], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "st1 {v5.h}[1], [x26], #2\n" + "st1 {v8.h}[1], [x26], #2\n" + "st1 {v11.h}[1], [x26], #2\n" + "st1 {v14.h}[1], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "st1 {v5.h}[2], [x26], #2\n" + "st1 {v8.h}[2], [x26], #2\n" + "st1 {v11.h}[2], [x26], #2\n" + "st1 {v14.h}[2], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "st1 {v5.h}[3], [x26], #2\n" + "st1 {v8.h}[3], [x26], #2\n" + "st1 {v11.h}[3], [x26], #2\n" + "st1 {v14.h}[3], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "st1 {v5.h}[4], [x26], #2\n" + "st1 {v8.h}[4], [x26], #2\n" + "st1 {v11.h}[4], [x26], #2\n" + "st1 {v14.h}[4], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "st1 {v5.h}[5], [x26], #2\n" + "st1 {v8.h}[5], [x26], #2\n" + "st1 {v11.h}[5], [x26], #2\n" + "st1 {v14.h}[5], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "st1 {v5.h}[6], [x26], #2\n" + "st1 {v8.h}[6], [x26], #2\n" + "st1 {v11.h}[6], [x26], #2\n" + "st1 {v14.h}[6], [x26], #2\n" + "sub x26, x26, #8\n" + "add x26, x26, %4\n" + "st1 {v5.h}[7], [x26], #2\n" + "st1 {v8.h}[7], [x26], #2\n" + "st1 {v11.h}[7], [x26], #2\n" + "st1 {v14.h}[7], [x26], #2\n" + : "+r"(out), "+r"(in), "+r"(w) + : "r"((I64)KInner), "r"((I64)M), "r"((I64)KTail) + : "memory", "cc", "x20", "x26", "v0", "v1", "v4", "v29", "v5", "v8", "v11", "v14"); +} + +inline void mmm_4x8_A76(U32 M, U32 K, F16 *w, F16 *in, F16 *out) +{ + U32 KTail = K % 2; + U32 KInner = K - KTail; + asm volatile("ld1 {v1.8h}, [%1], #16\n" + "ld1 {v0.4h}, [%2], #8\n" + "mov x26, %0\n" + "ld1 {v5.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v8.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v11.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v14.8h}, [x26]\n" + + "mov x20, %3\n" + + "0:\n" + "ld1 {v29.8h}, [%1], #16\n" + "ld1 {v4.4h}, [%2], #8\n" MMM_FMA_4x8_V5V14s3_V1xV0 "ld1 {v1.8h}, [%1], #16\n" + "ld1 {v0.4h}, [%2], #8\n" MMM_FMA_4x8_V5V14s3_V29xV4 + + "subs x20, x20, 0x2\n" + "bne 0b\n" + + "cbz %5, 1f\n" MMM_FMA_4x8_V5V14s3_V1xV0 + + "1:\n" + "mov x26, %0\n" + "st1 {v5.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v8.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v14.8h}, [x26]\n" + : "+r"(out), "+r"(in), "+r"(w) + : "r"((I64)KInner), "r"((I64)M), "r"((I64)KTail) + : "memory", "cc", "x20", "x26", "v0", "v1", "v4", "v5", "v8", "v11", "v14", "v29"); +} + +inline void mmm_4x4_A76(U32 M, U32 K, F16 *w, F16 *in, F16 *out) +{ + U32 KTail = K % 2; + U32 KInner = K - KTail; + asm volatile("ld1 {v1.4h}, [%1], #8\n" + "ld1 {v0.4h}, [%2], #8\n" + "mov x26, %0\n" + "ld1 {v5.4h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v8.4h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v11.4h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v14.4h}, [x26]\n" + + "mov x20, %3\n" + + "0:\n" + "ld1 {v29.4h}, [%1], #8\n" + "ld1 {v4.4h}, [%2], #8\n" MMM_FMA_4x8_V5V14s3_V1xV0 "ld1 {v1.4h}, [%1], #8\n" + "ld1 {v0.4h}, [%2], #8\n" MMM_FMA_4x8_V5V14s3_V29xV4 + + "subs x20, x20, 0x2\n" + "bne 0b\n" + + "cbz %5, 1f\n" MMM_FMA_4x8_V5V14s3_V1xV0 + + "1:\n" + "mov x26, %0\n" + "st1 {v5.4h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v8.4h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.4h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v14.4h}, [x26]\n" + : "+r"(out), "+r"(in), "+r"(w) + : "r"((I64)KInner), "r"((I64)M), "r"((I64)KTail) + : "memory", "cc", "x20", "x26", "v0", "v1", "v4", "v29", "v5", "v8", "v11", 
"v14"); +} + +inline void mmm_8x8_A76(U32 M, U32 K, F16 *w, F16 *in, F16 *out) +{ + U32 KTail = K % 2; + U32 KInner = K - KTail; + asm volatile("mov x26, %0\n" + "ld1 {v5.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v8.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v11.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v14.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v17.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v20.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v23.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v26.8h}, [x26]\n" + + "mov x20, %3\n" + + "ld1 {v1.8h}, [%1], #16\n" + "ld1 {v0.8h}, [%2], #16\n" + + "0:\n" + "ld1 {v29.8h}, [%1], #16\n" + "ld1 {v4.8h}, [%2], #16\n" MMM_FMA_4x8_V5V14s3_V1xV0 MMM_FMA_4x8_V17V26s3_V1xV0 + + "ld1 {v1.8h}, [%1], #16\n" + "ld1 {v0.8h}, [%2], #16\n" MMM_FMA_4x8_V5V14s3_V29xV4 MMM_FMA_4x8_V17V26s3_V29xV4 + + "subs x20, x20, 0x2\n" + "bne 0b\n" + + "cbz %5, 1f\n" MMM_FMA_4x8_V5V14s3_V1xV0 MMM_FMA_4x8_V17V26s3_V1xV0 + + "1:\n" + "mov x26, %0\n" + "st1 {v5.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v8.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v14.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v17.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v20.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v23.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v26.8h}, [x26]\n" + : "+r"(out), "+r"(in), "+r"(w) + : "r"((I64)KInner), "r"((I64)M), "r"((I64)KTail) + : "memory", "cc", "x20", "x26", "v1", "v0", "v29", "v4", "v5", "v8", "v11", "v14", + "v17", "v20", "v23", "v26"); +} + +inline void mmm_8x24_A76(U32 M, U32 K, F16 *w, F16 *in, F16 *out) +{ + U32 KTail = K % 2; + U32 KInner = K - KTail; + asm volatile( + // init in0- > v1, w- > v0 + "ld1 {v1.8h}, [%1], #16\n" + "ld1 {v0.8h}, [%2], #16\n" + "mov x26, %0\n" + "ld1 {v5.8h, v6.8h, v7.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v8.8h, v9.8h, v10.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v11.8h, v12.8h, v13.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v14.8h, v15.8h, v16.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v17.8h, v18.8h, v19.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v20.8h, v21.8h, v22.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v23.8h, v24.8h, v25.8h}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v26.8h, v27.8h, v28.8h}, [x26]\n" + + "mov x20, %3\n" + + "0:\n" + // w- > v4, in0- > v2/v3/v1, out0=v5~v28 + "ld1 {v2.8h}, [%1], #16\n" + "ld1 {v3.8h}, [%1], #16\n" MMM_FMA_4x8_V5V14s3_V1xV0 MMM_FMA_4x8_V17V26s3_V1xV0 + + "ld1 {v4.8h}, [%2], #16\n" MMM_FMA_4x8_V6V15s3_V2xV0 MMM_FMA_4x8_V18V27s3_V2xV0 + + "ld1 {v29.8h}, [%1], #16\n" MMM_FMA_4x8_V7V16s3_V3xV0 MMM_FMA_4x8_V19V28s3_V3xV0 + + // w- > v0, in0- > v2/v3/v1, out0- > v5~v28 + "ld1 {v30.8h}, [%1], #16\n" + "ld1 {v0.8h}, [%2], #16\n" MMM_FMA_4x8_V5V14s3_V29xV4 MMM_FMA_4x8_V17V26s3_V29xV4 + + "ld1 {v31.8h}, [%1], #16\n" MMM_FMA_4x8_V6V15s3_V30xV4 MMM_FMA_4x8_V18V27s3_V30xV4 + + "ld1 {v1.8h}, [%1], #16\n" MMM_FMA_4x8_V7V16s3_V31xV4 "subs x20, x20, " + "#0x2\n" MMM_FMA_4x8_V19V28s3_V31xV4 + + "bne 0b\n" + + "cbz %5, 1f\n" + "ld1 {v2.8h}, [%1], #16\n" + "ld1 {v3.8h}, [%1], #16\n" MMM_FMA_4x8_V5V14s3_V1xV0 MMM_FMA_4x8_V17V26s3_V1xV0 + MMM_FMA_4x8_V6V15s3_V2xV0 MMM_FMA_4x8_V18V27s3_V2xV0 MMM_FMA_4x8_V7V16s3_V3xV0 + MMM_FMA_4x8_V19V28s3_V3xV0 + + "1:\n" + "mov x26, %0\n" + "st1 {v5.8h, v6.8h, v7.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v8.8h, v9.8h, v10.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.8h, v12.8h, v13.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v14.8h, v15.8h, v16.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 
{v17.8h, v18.8h, v19.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v20.8h, v21.8h, v22.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v23.8h, v24.8h, v25.8h}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v26.8h, v27.8h, v28.8h}, [x26]\n" + : "+r"(out), "+r"(in), "+r"(w) + : "r"((I64)KInner), "r"((I64)M), "r"((I64)KTail) + : "memory", "cc", "x0", "x20", "x26", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); +} + +void mmm_A76(int M, int N, int K, bool transposeA, F16 *matrix1, F16 *matrix2, F16 *tmp, F16 *result) +{ + int blockK = K; + int blockM = 192; + F16 *matrix1Trans = tmp; + F16 *resultCurrent = result; + int KInner, MInner, m, n; + for (int k = 0; k < K; k += blockK) { + KInner = UNI_MIN(blockK, K - k); + for (int i = 0; i < M; i += blockM) { + MInner = UNI_MIN(blockM, M - i); + for (n = 0; n <= N - 8; n += 8) { + if (i == 0) { + if (transposeA) { + matrix2_trans(8, KInner, N, matrix1 + n, matrix1Trans + n * KInner); + } else { + matrix1_trans(8, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + } + } + for (m = 0; m <= (MInner - 24); m += 24) { + resultCurrent = result + n * M + m + i; + mmm_8x24_A76(M * 2, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + for (; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_8x8_A76(M * 2, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + + if ((MInner - m) >= 4) { + resultCurrent = result + n * M + m + i; + mmm_8x4_A76(M * 2, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + m += 4; + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_N8_MTail(MInner - m, M, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + } + + if ((N - n) >= 4) { + if (i == 0) { + if (transposeA) { + matrix2_trans(4, KInner, N, matrix1 + n, matrix1Trans + n * KInner); + } else { + matrix1_trans(4, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + } + } + + for (m = 0; m <= (MInner - 24); m += 24) { + resultCurrent = result + n * M + m + i; + mmm_4x24_A76(M * 2, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + + for (; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_4x8_A76(M * 2, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + + if ((MInner - m) >= 4) { + resultCurrent = result + n * M + m + i; + mmm_4x4_A76(M * 2, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + m += 4; + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_N4_MTail(MInner - m, M, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + + n += 4; + } + + if (N - n) { + if (i == 0) { + if (transposeA) { + matrix2_trans(N - n, KInner, N, matrix1 + n, matrix1Trans + n * KInner); + } else { + matrix1_trans( + N - n, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + } + } + + for (m = 0; m <= (MInner - 24); m += 24) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M24(M, N - n, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + + for (; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M8(M, N - n, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, 
resultCurrent); + } + + if ((MInner - m) >= 4) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M4(M, N - n, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + m += 4; + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M(MInner - m, M, N - n, KInner, matrix1Trans + n * KInner, + matrix2 + (i + m) * KInner, resultCurrent); + } + } + } + } +} diff --git a/blas-enhance/src/cpu/arm/fp16/mmm_common.h b/compute/blas_enhance/src/cpu/arm/fp16/mmm_common.h similarity index 63% rename from blas-enhance/src/cpu/arm/fp16/mmm_common.h rename to compute/blas_enhance/src/cpu/arm/fp16/mmm_common.h index 9934677e..2538f643 100644 --- a/blas-enhance/src/cpu/arm/fp16/mmm_common.h +++ b/compute/blas_enhance/src/cpu/arm/fp16/mmm_common.h @@ -1,64 +1,58 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
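// Illustrative note (not from the patch): the mmm_common.h hunk below carries
// the packing and tail routines shared by the A55/A76 kernels. matrix1_trans
// packs a `size`-row strip of a row-major [N x K] source into [k][row] order,
// so a micro-kernel can stream `size` consecutive operands per K step. A
// scalar equivalent, minus the pldl2keep software prefetch:
static void matrix1_trans_ref(
    unsigned size, unsigned blockK, unsigned K, const __fp16 *src, __fp16 *dst)
{
    for (unsigned k = 0; k < blockK; ++k) {
        for (unsigned j = 0; j < size; ++j) {
            *dst++ = src[j * K + k];  // element k of each row in the strip
        }
    }
}
// matrix2_trans is the same idea for a source that is already k-major, which
// reduces to a strided memcpy.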
- #ifndef _H_MMM_COMMON #define _H_MMM_COMMON #include <string.h> #include <arm_neon.h> -#include "type.h" - +#include "types.h" -inline void matrix1_trans(U32 size, U32 blockK, U32 K, F16* src, F16* dst) { - F16* src1 = src; +inline void matrix1_trans(U32 size, U32 blockK, U32 K, F16 *src, F16 *dst) +{ + F16 *src1 = src; U32 offset; for (U32 i = 0; i < blockK; i++) { for (U32 j = 0; j < size; j++) { src1 = src + j * K; offset = 8 * blockK; if (i % 32) { - asm volatile( - "prfm pldl2keep, [%0, %1]\n" - :"+r" (src1) - :"r"((I64)offset) - :"memory","cc" - ); + asm volatile("prfm pldl2keep, [%0, %1]\n" + : "+r"(src1) + : "r"((I64)offset) + : "memory", "cc"); } *dst++ = *(src1 + i); } } } -inline void matrix2_trans(U32 size, U32 blockK, U32 M, F16* src, F16* dst) { +inline void matrix2_trans(U32 size, U32 blockK, U32 M, F16 *src, F16 *dst) +{ for (U32 i = 0; i < blockK; i++) { - asm volatile( - "prfm pldl2keep, [%0, #48]\n" - :"+r" (src) - : - :"memory","cc" - ); - memcpy(dst, src, size * sizeof (F16)); + asm volatile("prfm pldl2keep, [%0, #48]\n" : "+r"(src) : : "memory", "cc"); + memcpy(dst, src, size * sizeof(F16)); dst += size; src += M; } } -inline void mmm_NTail_M24(U32 M, U32 N, U32 K, F16* matrix1, F16* matrix2, F16* result) { +inline void mmm_NTail_M24(U32 M, U32 N, U32 K, F16 *matrix1, F16 *matrix2, F16 *result) +{ float16x8x3_t mat2, res; for (U32 i = 0; i < N; i++) { res = vld3q_f16(result + i * M); - for (U32 q = 0; q < K; q+=1) { + for (U32 q = 0; q < K; q += 1) { mat2 = vld3q_f16(matrix2 + q * 24); res.val[0] = vfmaq_n_f16(res.val[0], mat2.val[0], matrix1[q * N + i]); res.val[1] = vfmaq_n_f16(res.val[1], mat2.val[1], matrix1[q * N + i]); @@ -68,11 +62,12 @@ inline void mmm_NTail_M24(U32 M, U32 N, U32 K, F16* matrix1, F16* matrix2, F16* } } -inline void mmm_NTail_M8(U32 M, U32 N, U32 K, F16* matrix1, F16* matrix2, F16* result) { +inline void mmm_NTail_M8(U32 M, U32 N, U32 K, F16 *matrix1, F16 *matrix2, F16 *result) +{ float16x8_t mat2, res; for (U32 i = 0; i < N; i++) { res = vld1q_f16(result + i * M); - for (U32 q = 0; q < K; q+=1) { + for (U32 q = 0; q < K; q += 1) { mat2 = vld1q_f16(matrix2 + q * 8); res = vfmaq_n_f16(res, mat2, matrix1[q * N + i]); } @@ -80,11 +75,12 @@ inline void mmm_NTail_M8(U32 M, U32 N, U32 K, F16* matrix1, F16* matrix2, F16* r } } -inline void mmm_NTail_M4(U32 M, U32 N, U32 K, F16* matrix1, F16* matrix2, F16* result) { +inline void mmm_NTail_M4(U32 M, U32 N, U32 K, F16 *matrix1, F16 *matrix2, F16 *result) +{ float16x4_t mat2, res; for (U32 i = 0; i < N; i++) { res = vld1_f16(result + i * M); - for (U32 q = 0; q < K; q+=1) { + for (U32 q = 0; q < K; q += 1) { mat2 = vld1_f16(matrix2 + q * 4); res = vfma_n_f16(res, mat2, matrix1[q * N + i]); } @@ -92,51 +88,53 @@ inline void mmm_NTail_M4(U32 M, U32 N, U32 K, F16* matrix1, F16* matrix2, F16* r } } -inline void mmm_NTail_M(U32 MInner, U32 M, U32 N, U32 K, F16* matrix1, F16* matrix2, F16* result) { - for(U32 i = 0; i < N; i++) { - for(U32 j = 0; j < MInner; j++) { - for(U32 k = 0; k < K; k++) { +inline void mmm_NTail_M(U32 MInner, U32 M, U32 N, U32 K, F16 *matrix1, F16 *matrix2, F16 *result) +{ + for (U32 i = 0; i < N; i++) { + for (U32 j = 0; j < MInner; j++) { + for (U32 k = 0; k < K; k++) { result[i * M + j] += *(matrix1 + k * N + i) * *(matrix2 + k * MInner + j); } - } } } -inline void mmm_N8_MTail(U32 MInner, U32 M, U32 K, F16* matrix1, F16* matrix2, F16* result) { +inline void mmm_N8_MTail(U32 MInner, U32 M, U32 K, F16 *matrix1, F16 *matrix2, F16 *result) +{ float16x8_t mat1 = {0}, res[4] = {0}; F16 tmp[8] = {0};
CHECK_REQUIREMENT(MInner < 4); - for(U32 i = 0; i < K; i++) { + for (U32 i = 0; i < K; i++) { mat1 = vld1q_f16(matrix1 + i * 8); - for(U32 j = 0; j < MInner; j++) { + for (U32 j = 0; j < MInner; j++) { res[j] = vfmaq_n_f16(res[j], mat1, matrix2[j + i * MInner]); } } - for(U32 p = 0; p < MInner; p++) { + for (U32 p = 0; p < MInner; p++) { vst1q_f16(tmp, res[p]); - for(U32 q = 0; q < 8; q++) { + for (U32 q = 0; q < 8; q++) { result[q * M + p] += tmp[q]; } res[p] = vdupq_n_f16(0); } } -inline void mmm_N4_MTail(U32 MInner, U32 M, U32 K, F16* matrix1, F16* matrix2, F16* result) { +inline void mmm_N4_MTail(U32 MInner, U32 M, U32 K, F16 *matrix1, F16 *matrix2, F16 *result) +{ float16x4_t mat1 = {0}, res[4] = {0}; F16 tmp[4] = {0}; CHECK_REQUIREMENT(MInner < 4); - for(U32 i = 0; i < K; i++) { + for (U32 i = 0; i < K; i++) { mat1 = vld1_f16(matrix1 + i * 4); - for(U32 j = 0; j < MInner; j++) { + for (U32 j = 0; j < MInner; j++) { res[j] = vfma_n_f16(res[j], mat1, matrix2[j + i * MInner]); } } - for(U32 p = 0; p < MInner; p++) { + for (U32 p = 0; p < MInner; p++) { vst1_f16(tmp, res[p]); - for(U32 q = 0; q < 4; q++) { + for (U32 q = 0; q < 4; q++) { result[q * M + p] += tmp[q]; } res[p] = vdup_n_f16(0); diff --git a/compute/blas_enhance/src/cpu/arm/fp16/mvm.cpp b/compute/blas_enhance/src/cpu/arm/fp16/mvm.cpp new file mode 100644 index 00000000..1dc5c366 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/fp16/mvm.cpp @@ -0,0 +1,119 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
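// Illustrative note (not from the patch): mvm.cpp, which starts here, packs
// MVM weights into the DF_NKN64 layout: rows are grouped into blocks of 64,
// each block stored k-major with the 64 row lanes contiguous. Element (n, k)
// of an [N x K] weight therefore lands at the offset computed below (helper
// name is hypothetical):
static inline unsigned nkn64_offset(unsigned n, unsigned k, unsigned K)
{
    return (n / 64) * 64 * K + k * 64 + (n % 64);
}
// mvm_kernel_fp16 consumes exactly this layout, reading 64 consecutive lanes
// per k step; rows left over after the last full block stay row-major and are
// handled by mvm_row_tail.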
+ +#include "error.h" +#include "cpu/arm/fp16/blas_fp16.h" +#include "cpu/arm/fp16/mvm.h" +#include "cpu/arm/fp16/mmm_common.h" +#include "cpu/arm/fp16/mvm_common.h" + +EE matrix_vector_multiply_transform_weight_fp16(TensorDesc desc, F16 *src, F16 *dst) +{ + DataType dt; + DataFormat df; + U32 N, K; + EE ret = SUCCESS; + int i = 0; + switch (desc.df) { + case DF_NORMAL: { + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &N, &K)); + for (; i < (int)N - 63; i += 64) { + matrix1_trans(64, K, K, src + i * K, dst + i * K); + } + if (i < (int)N) { + memcpy(dst + i * K, src + i * K, (N - i) * K * bytesOf(DT_F16)); + } + break; + } + case DF_TRANSPOSE: { + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &K, &N)); + for (; i < (int)N - 63; i += 64) { + matrix2_trans(64, K, N, src + i, dst + i * K); + } + if (i < (int)N) { + int base = i; + F16 *basePtr = dst + i * K; + for (int j = 0; j < (int)K; j++) { + for (; i < (int)N; i++) { + basePtr[(i - base) * K + j] = src[j * N + i]; + } + } + } + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +void mvm_kernel_fp16(U32 rounds, U32 K, F16 *matrix, F16 *vector, F16 *result) +{ + U32 N = rounds * 64; + float16x8_t mat[8]; + F16 v; + float16x8_t res[8]; + + for (U32 n = 0; n < N; n += 64) { + F16 *bufMov = matrix + n * K; + for (int i = 0; i < 8; i++) { + res[i] = vld1q_f16(result + n + i * 8); + } + for (U32 k = 0; k < K; k++) { + v = vector[k]; + for (int i = 0; i < 8; i++) { + mat[i] = vld1q_f16(bufMov + i * 8); + } + for (int i = 0; i < 8; i++) { + res[i] = vfmaq_n_f16(res[i], mat[i], v); + } + bufMov += 64; + } + for (int i = 0; i < 8; i++) { + vst1q_f16(result + n + i * 8, res[i]); + } + } +} + +void mvm_pack(U32 row, U32 col, F16 *matrix, F16 *vector, F16 *result) +{ + U32 rounds = row / 64; + U32 nTail = row % 64; + + mvm_kernel_fp16(rounds, col, matrix, vector, result); + if (0 != nTail) { + mvm_row_tail(nTail, col, matrix + (row - nTail) * col, vector, result + (row - nTail)); + } +} + +EE mvm_fp16(U32 row, U32 col, DataFormat df, F16 *matrix, F16 *vector, F16 *result, Arch arch) +{ + EE ret = SUCCESS; + if (DF_NKN64 == df) { + mvm_pack(row, col, matrix, vector, result); + return ret; + } + switch (arch) { + case ARM_A55: + mvm_A55(row, col, DF_TRANSPOSE == df, matrix, vector, result); + break; + case ARM_A76: + mvm_A76(row, col, DF_TRANSPOSE == df, matrix, vector, result); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/blas-enhance/src/cpu/arm/fp16/mvm.h b/compute/blas_enhance/src/cpu/arm/fp16/mvm.h similarity index 72% rename from blas-enhance/src/cpu/arm/fp16/mvm.h rename to compute/blas_enhance/src/cpu/arm/fp16/mvm.h index a4b059d1..6764411c 100644 --- a/blas-enhance/src/cpu/arm/fp16/mvm.h +++ b/compute/blas_enhance/src/cpu/arm/fp16/mvm.h @@ -1,23 +1,22 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _H_MVM #define _H_MVM -#include "type.h" +#include "types.h" -void mvm_A55(U32 row, U32 col, bool transpose, F16* matrix, F16* vector, F16* result); +void mvm_A55(U32 row, U32 col, bool transpose, F16 *matrix, F16 *vector, F16 *result); -void mvm_A76(U32 row, U32 col, bool transpose, F16* matrix, F16* vector, F16* result); +void mvm_A76(U32 row, U32 col, bool transpose, F16 *matrix, F16 *vector, F16 *result); #endif diff --git a/compute/blas_enhance/src/cpu/arm/fp16/mvm_A55.cpp b/compute/blas_enhance/src/cpu/arm/fp16/mvm_A55.cpp new file mode 100644 index 00000000..4a23f27c --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/fp16/mvm_A55.cpp @@ -0,0 +1,138 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
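// Illustrative reference (not from the patch): the row kernel below computes
// four dot products per call. w0..w3 point at rows spaced a quarter of the
// matrix apart, full-width fmla covers the K body, and the faddp tree folds
// the eight fp16 lanes into one sum per row. Each call is semantically the
// scalar loop below (numerics differ slightly, since the asm accumulates in
// fp16 lanes):
static void mvm_row_ref(unsigned rows, unsigned cols, const __fp16 *matrix,
    const __fp16 *vector, __fp16 *result)
{
    for (unsigned n = 0; n < rows; ++n) {
        float acc = 0.0f;
        for (unsigned k = 0; k < cols; ++k) {
            acc += (float)matrix[n * cols + k] * (float)vector[k];
        }
        result[n] += (__fp16)acc;  // the kernels add to the existing result
    }
}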
+ +#include + +#include "mvm_common.h" +#include "mvm.h" + +inline void mvm_row_kernel_A55(U32 N, U32 K, F16 *matrix, F16 *vector, F16 *result) +{ + U32 KTail = K % 8; + U32 KInner = K - KTail; + F16 *w0 = matrix; + F16 *w1 = matrix + K * N / 2; + F16 *w2 = matrix + K * 2 * N / 2; + F16 *w3 = matrix + K * 3 * N / 2; + + asm volatile("mov x19, %5\n" + "ld1 {v18.h}[0], [x19]\n" + "add x19, x19, %8\n" + "ld1 {v18.h}[1], [x19]\n" + "add x19, x19, %8\n" + "ld1 {v18.h}[2], [x19]\n" + "add x19, x19, %8\n" + "ld1 {v18.h}[3], [x19]\n" + + "movi v17.8h, #0x0\n" + "movi v16.8h, #0x0\n" + "movi v9.8h, #0x0\n" + "movi v10.8h, #0x0\n" + "movi v11.8h, #0x0\n" + "movi v12.8h, #0x0\n" + "mov x20, %6\n" + "cmp x20, #0x0\n" + "beq 3f\n" + "0:\n" + + "ld1 {v0.4h}, [%0], #8\n" + "ldr x15, [%0], #8\n" + "ins v0.d[1], x15\n" + + "ld1 {v1.4h}, [%1], #8\n" + "ld1 {v2.4h}, [%2], #8\n" + "ldr x21, [%1], #8\n" + "ldr x22, [%2], #8\n" + "ins v1.d[1], x21\n" + "ins v2.d[1], x22\n" + + "ld1 {v3.4h}, [%3], #8\n" + "fmla v9.8h, v1.8h, v0.8h\n" + "ld1 {v4.4h}, [%4], #8\n" + "fmla v10.8h, v2.8h, v0.8h\n" + "ldr x23, [%3], #8\n" + "ldr x24, [%4], #8\n" + "ins v3.d[1], x23\n" + "ins v4.d[1], x24\n" + "fmla v11.8h, v3.8h, v0.8h\n" + "fmla v12.8h, v4.8h, v0.8h\n" + + "subs x20, x20, 0x8\n" + "bne 0b\n" + + "faddp v13.8h, v9.8h, v10.8h\n" + "faddp v14.8h, v11.8h, v12.8h\n" + "faddp v15.8h, v13.8h, v14.8h\n" + "faddp v17.8h, v15.8h, v15.8h\n" + "3:\n" + "mov x16, %7\n" + "cmp x16, #0x0\n" + "beq 2f\n" + + "1:\n" + "ld1 {v8.h}[0], [%0], #2\n" + + "ld1 {v1.h}[0], [%1], #2\n" + "ld1 {v1.h}[1], [%2], #2\n" + "ld1 {v1.h}[2], [%3], #2\n" + "ld1 {v1.h}[3], [%4], #2\n" + "fmla v16.8h, v1.8h, v8.h[0]\n" + + "subs x16, x16, 0x1\n" + "bne 1b\n" + + "fadd v17.8h, v17.8h, v16.8h\n" + + "2:\n" + + "fadd v17.8h, v17.8h, v18.8h\n" + + "mov x19, %5\n" + "st1 {v17.h}[0], [x19]\n" + "add x19, x19, %8\n" + "st1 {v17.h}[1], [x19]\n" + "add x19, x19, %8\n" + "st1 {v17.h}[2], [x19]\n" + "add x19, x19, %8\n" + "st1 {v17.h}[3], [x19]\n" + + : "+r"(vector), "+r"(w0), "+r"(w1), "+r"(w2), "+r"(w3), "+r"(result) + : "r"((I64)KInner), "r"((I64)KTail), "r"((I64)N) + : "memory", "cc", "x19", "x20", "x21", "x22", "x23", "x24", "x15", "x16", "v0", + "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18"); +} + +inline void mvm_row_A55(U32 numRows, U32 numColumns, F16 *matrix, F16 *vector, F16 *result) +{ + // Actual layout is NK, and vector is K + U32 N = numRows; + U32 K = numColumns; + U32 NTail = N % 4; + U32 NInner = N / 4; + for (U32 i = 0; i < NInner; i++) { + mvm_row_kernel_A55(NInner * 2, K, matrix + i * K, vector, result + i); + } + if (NTail != 0) { + mvm_row_tail(NTail, K, matrix + (N - NTail) * K, vector, result + (N - NTail)); + } +} + +void mvm_A55(U32 row, U32 col, bool transpose, F16 *matrix, F16 *vector, F16 *result) +{ + if (transpose) { + mvm_col(row, col, matrix, vector, result); + } else { + mvm_row_A55(row, col, matrix, vector, result); + } +} diff --git a/compute/blas_enhance/src/cpu/arm/fp16/mvm_A76.cpp b/compute/blas_enhance/src/cpu/arm/fp16/mvm_A76.cpp new file mode 100644 index 00000000..99450729 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/fp16/mvm_A76.cpp @@ -0,0 +1,124 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
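// Note: mvm_A76.cpp below implements the same algorithm as mvm_A55.cpp above;
// the two files differ only in load scheduling. On the in-order A55 a 128-bit
// NEON load is split into a 64-bit ld1, a scalar ldr and an ins so the pieces
// can dual-issue between fmla ops:
//
//     "ld1 {v1.4h}, [%1], #8\n"
//     "ldr x21, [%1], #8\n"
//     "ins v1.d[1], x21\n"
//
// while the out-of-order A76 issues the full-width load directly:
//
//     "ld1 {v1.8h}, [%1], #16\n"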
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include + +#include "mvm_common.h" +#include "mvm.h" + +inline void mvm_row_kernel_A76(U32 N, U32 K, F16 *matrix, F16 *vector, F16 *result) +{ + U32 KTail = K % 8; + U32 KInner = K - KTail; + F16 *w0 = matrix; + F16 *w1 = matrix + K * N / 2; + F16 *w2 = matrix + K * 2 * N / 2; + F16 *w3 = matrix + K * 3 * N / 2; + asm volatile("mov x19, %5\n" + "ld1 {v18.h}[0], [x19]\n" + "add x19, x19, %8\n" + "ld1 {v18.h}[1], [x19]\n" + "add x19, x19, %8\n" + "ld1 {v18.h}[2], [x19]\n" + "add x19, x19, %8\n" + "ld1 {v18.h}[3], [x19]\n" + + "movi v17.8h, #0x0\n" + "movi v16.8h, #0x0\n" + "movi v9.8h, #0x0\n" + "movi v10.8h, #0x0\n" + "movi v11.8h, #0x0\n" + "movi v12.8h, #0x0\n" + "mov x20, %6\n" + "cmp x20, #0x0\n" + "beq 3f\n" + "0:\n" + + "ld1 {v0.8h}, [%0], #16\n" + "ld1 {v1.8h}, [%1], #16\n" + "ld1 {v2.8h}, [%2], #16\n" + "ld1 {v3.8h}, [%3], #16\n" + "ld1 {v4.8h}, [%4], #16\n" + + "fmla v9.8h, v1.8h, v0.8h\n" + "fmla v10.8h, v2.8h, v0.8h\n" + "fmla v11.8h, v3.8h, v0.8h\n" + "fmla v12.8h, v4.8h, v0.8h\n" + + "subs x20, x20, 0x8\n" + "bne 0b\n" + + "faddp v13.8h, v9.8h, v10.8h\n" + "faddp v14.8h, v11.8h, v12.8h\n" + "faddp v15.8h, v13.8h, v14.8h\n" + "faddp v17.8h, v15.8h, v15.8h\n" + "3:\n" + "mov x16, %7\n" + "cmp x16, #0x0\n" + "beq 2f\n" + + "1:\n" + "ld1 {v8.h}[0], [%0], #2\n" + + "ld1 {v1.h}[0], [%1], #2\n" + "ld1 {v1.h}[1], [%2], #2\n" + "ld1 {v1.h}[2], [%3], #2\n" + "ld1 {v1.h}[3], [%4], #2\n" + "fmla v16.8h, v1.8h, v8.h[0]\n" + + "subs x16, x16, 0x1\n" + "bne 1b\n" + + "fadd v17.8h, v17.8h, v16.8h\n" + + "2:\n" + + "fadd v17.8h, v17.8h, v18.8h\n" + "mov x19, %5\n" + "st1 {v17.h}[0], [x19]\n" + "add x19, x19, %8\n" + "st1 {v17.h}[1], [x19]\n" + "add x19, x19, %8\n" + "st1 {v17.h}[2], [x19]\n" + "add x19, x19, %8\n" + "st1 {v17.h}[3], [x19]\n" + : "+r"(vector), "+r"(w0), "+r"(w1), "+r"(w2), "+r"(w3), "+r"(result) + : "r"((I64)KInner), "r"((I64)KTail), "r"((I64)N) + : "memory", "cc", "x19", "x20", "x21", "x22", "x23", "x24", "x15", "x16", "v0", + "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18"); +} + +inline void mvm_row_A76(U32 numRows, U32 numColumns, F16 *matrix, F16 *vector, F16 *result) +{ + // Actual layout is NK, and vector is K + U32 N = numRows; + U32 K = numColumns; + U32 NTail = N % 4; + U32 NInner = N / 4; + for (U32 i = 0; i < NInner; i++) { + mvm_row_kernel_A76(NInner * 2, K, matrix + i * K, vector, result + i); + } + if (NTail != 0) { + mvm_row_tail(NTail, K, matrix + (N - NTail) * K, 
vector, result + (N - NTail)); + } +} + +void mvm_A76(U32 row, U32 col, bool transpose, F16 *matrix, F16 *vector, F16 *result) +{ + if (transpose) { + mvm_col(row, col, matrix, vector, result); + } else { + mvm_row_A76(row, col, matrix, vector, result); + } +} diff --git a/compute/blas_enhance/src/cpu/arm/fp16/mvm_common.h b/compute/blas_enhance/src/cpu/arm/fp16/mvm_common.h new file mode 100644 index 00000000..d8f8ed02 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/fp16/mvm_common.h @@ -0,0 +1,253 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_MVM_COMMON +#define _H_MVM_COMMON + +#include <arm_neon.h> +#include "types.h" +#include "arm_neon_expand.h" + +inline void mvm_row_tail(U32 N, U32 K, F16 *matrix, F16 *vector, F16 *result) +{ + float16x8_t vec, res, mat; + U32 KTail = K % 8; + U32 KInner = K - KTail; + + for (U32 i = 0; i < N; i += 1) { + res = vdupq_n_f16(0); + + for (U32 j = 0; j < KInner; j += 8) { + vec = vld1q_f16(&vector[j]); + mat = vld1q_f16(&matrix[j + K * i]); + res = vfmaq_f16(res, vec, mat); + } + result[i] += vaddvq_f16(res); + + if (KTail != 0) { + for (U32 p = 0; p < KTail; p += 1) { + result[i] += vector[p + KInner] * matrix[KInner + p + K * i]; + } + } + } +} + +inline void mvm_col_tail(U32 N, U32 K, F16 *matrix, F16 *vector, F16 *result) +{ + float16x8_t tmp, res, mat; + U32 NTail = N % 8; + U32 NInner = N - NTail; + + for (U32 i = 0; i < K; i += 1) { + for (U32 j = 0; j < NInner; j += 8) { + tmp = vld1q_f16(result + j); + mat = vld1q_f16(&matrix[j + N * i]); + res = vfmaq_n_f16(tmp, mat, vector[i]); + vst1q_f16(result + j, res); + } + if (NTail != 0) { + for (U32 p = 0; p < NTail; p += 1) { + result[NInner + p] += vector[i] * matrix[NInner + N * i + p]; + } + } + } +} + +inline void mvm_col_kernel(U32 N, U32 K, F16 *matrix, F16 *vector, F16 *result) +{ + float16x8_t mat[4] = {0}; + + F16 *w0 = matrix; + F16 *w1 = matrix + K * N; + F16 *w2 = matrix + 2 * K * N; + F16 *w3 = matrix + 3 * K * N; + + U32 N_tail = N % 8; + U32 N_inner = N - N_tail; + + for (U32 i = 0; i < K; i += 1) { + for (U32 j = 0; j < N_inner; j += 8) { + float16x8_t res[4] = {0}; + + res[3] = vld1q_f16(result + j); + mat[0] = vld1q_f16(w0); + mat[1] = vld1q_f16(w1); + mat[2] = vld1q_f16(w2); + mat[3] = vld1q_f16(w3); + + res[0] = vfmaq_n_f16(res[3], mat[0], vector[i]); + res[1] = vfmaq_n_f16(res[0], mat[1], vector[K + i]); + res[2] = vfmaq_n_f16(res[1], mat[2], vector[2 * K + i]); + res[3] = vfmaq_n_f16(res[2], mat[3],
vector[3 * K + i]); + + w0 += 8; + w1 += 8; + w2 += 8; + w3 += 8; + vst1q_f16(result + j, res[3]); + } + if (N_tail != 0) { + for (U32 p = 0; p < N_tail; p += 1) { + result[N_inner + p] += vector[i] * *w0++; + result[N_inner + p] += vector[i + K] * *w1++; + result[N_inner + p] += vector[i + 2 * K] * *w2++; + result[N_inner + p] += vector[i + 3 * K] * *w3++; + } + } + } +} + +inline void mvm_col_kernel_4x8(U32 N, U32 K, F16 *matrix, F16 *vector, F16 *result) +{ + F16 *result_end8 = result + N / 8 * 8; + F16 *result_end = result + N; + asm volatile("mov x20, %0\n" + "add x21, x20, %5\n" + "add x22, x21, %5\n" + "add x23, x22, %5\n" + "mov x24, %1\n" + "add x25, x24, %6\n" + "add x26, x25, %6\n" + "add x27, x26, %6\n" + "mov x29, x21\n" + + "00:\n" + "cmp x20, x29\n" + "bge 01f\n" + "ldr h0, [x20], 2\n" + "dup v0.8h, v0.h[0]\n" + "ldr h1, [x21], 2\n" + "dup v1.8h, v1.h[0]\n" + "ldr h2, [x22], 2\n" + "dup v2.8h, v2.h[0]\n" + "ldr h3, [x23], 2\n" + "dup v3.8h, v3.h[0]\n" + + "mov x28, %2\n" + + "10:\n" + "cmp x28, %3\n" + "bge 11f\n" + "ldr q4, [x28]\n" + "ldr q8, [x24], 16\n" + "ldr q9, [x25], 16\n" + "ldr q10, [x26], 16\n" + "fmla v4.8h, v8.8h, v0.8h\n" + "ldr q11, [x27], 16\n" + "fmla v4.8h, v9.8h, v1.8h\n" + "fmla v4.8h, v10.8h, v2.8h\n" + "fmla v4.8h, v11.8h, v3.8h\n" + "str q4, [x28], 16\n" + "b 10b\n" + + "11:\n" + "cmp x28, %4\n" + "bge 12f\n" + "ldr h4, [x28]\n" + "ldr h8, [x24], 2\n" + "ldr h9, [x25], 2\n" + "ldr h10, [x26], 2\n" + "fmla h4, h8, v0.h[0]\n" + "ldr h11, [x27], 2\n" + "fmla h4, h9, v1.h[0]\n" + "fmla h4, h10, v2.h[0]\n" + "fmla h4, h11, v3.h[0]\n" + "str h4, [x28], 2\n" + "b 11b\n" + + "12:\n" + "b 00b\n" + "01:\n" + : "+r"(vector), "+r"(matrix), "+r"(result), "+r"(result_end8), "+r"(result_end) + : "r"((I64)K * 2), "r"((I64)K * N * 2) + : "memory", "cc", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", + "x29", "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11"); +} + +inline void mvm_row_kernel(U32 N, U32 K, F16 *matrix, F16 *vector, F16 *result) +{ + float16x8_t res[4] = {0}, mat[4] = {0}, vec; + float16x8_t tmp[6] = {0}; + + F16 *w0 = matrix; + F16 *w1 = matrix + K * N; + F16 *w2 = matrix + 2 * K * N; + F16 *w3 = matrix + 3 * K * N; + + U32 K_tail = K % 8; + U32 K_inner = K - K_tail; + + for (U32 i = 0; i < N; i += 1) { + for (U32 j = 0; j < K_inner; j += 8) { + vec = vld1q_f16(&vector[j]); + + mat[0] = vld1q_f16(w0); + mat[1] = vld1q_f16(w1); + mat[2] = vld1q_f16(w2); + mat[3] = vld1q_f16(w3); + for (U32 k = 0; k < 4; k++) { + res[k] = vfmaq_f16(res[k], vec, mat[k]); + } + w0 += 8; + w1 += 8; + w2 += 8; + w3 += 8; + } + + for (U32 m = 0; m < 2; m++) { + tmp[m] = vpaddq_f16(res[m * 2], res[m * 2 + 1]); + } + tmp[4] = vpaddq_f16(tmp[0], tmp[1]); + tmp[5] = vpaddq_f16(tmp[4], tmp[3]); + F16 addbias; + for (U32 n = 0; n < 4; n++) { + vst1q_lane_f16_builtin(&addbias, tmp[5], n); + result[i + N * n] += addbias; + res[n] = vdupq_n_f16(0); + } + + if (K_tail != 0) { + for (U32 p = 0; p < K_tail; p += 1) { + *(result + i) += vector[p + K_inner] * *w0++; + *(result + N + i) += vector[p + K_inner] * *w1++; + *(result + 2 * N + i) += vector[p + K_inner] * *w2++; + *(result + 3 * N + i) += vector[p + K_inner] * *w3++; + } + } + } +} + +inline void mvm_col(U32 numRows, U32 numColumns, F16 *matrix, F16 *vector, F16 *result) +{ + // Actual layout is KN, and vector is K + U32 N = numRows; + U32 K = numColumns; + U32 KInner = K / 4; + U32 KTail = K % 4; + mvm_col_kernel_4x8(N, KInner, matrix, vector, result); + if (KTail != 0) { + mvm_col_tail(N, KTail, matrix + (K 
- KTail) * N, vector + (K - KTail), result); + } +} + +// N is number of rows, K for columns +inline void mvm_row(U32 N, U32 K, F16 *matrix, F16 *vector, F16 *result) +{ + U32 NInner = (N / 4); + U32 NTail = N % 4; + mvm_row_kernel(NInner, K, matrix, vector, result); + if (NTail != 0) { + mvm_row_tail(NTail, K, matrix + (N - NTail) * K, vector, result + N - NTail); + } +} +#endif diff --git a/compute/blas_enhance/src/cpu/arm/fp32/axpby.cpp b/compute/blas_enhance/src/cpu/arm/fp32/axpby.cpp new file mode 100644 index 00000000..a1761246 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/fp32/axpby.cpp @@ -0,0 +1,34 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "error.h" +#include "cpu/arm/fp32/blas_fp32.h" + +EE axpby_fp32(U32 len, F32 a, const F32 *x, F32 b, F32 *y) +{ + EE ret = SUCCESS; + float32x4_t alpha = vdupq_n_f32(a); + float32x4_t beta = vdupq_n_f32(b); + I32 i = 0; + for (; i < ((I32)len) - 3; i += 4) { + float32x4_t out = vld1q_f32(y + i); + float32x4_t in = vld1q_f32(x + i); + out = vmulq_f32(out, beta); + out = vmlaq_f32(out, alpha, in); + vst1q_f32(y + i, out); + } + for (; i < (I32)len; i++) { + y[i] = a * x[i] + b * y[i]; + } + return ret; +} diff --git a/compute/blas_enhance/src/cpu/arm/fp32/blas_fp32.h b/compute/blas_enhance/src/cpu/arm/fp32/blas_fp32.h new file mode 100644 index 00000000..0834ca99 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/fp32/blas_fp32.h @@ -0,0 +1,97 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_BLAS_FP32 +#define _H_BLAS_FP32 + +#include "sys.h" +#include "types.h" +#include "error.h" +#include "tensor_desc.h" +#include "arm_neon_expand.h" + +EE matrix_vector_multiply_transform_weight_fp32(TensorDesc desc, F32 *src, F32 *dst); + +void mvm_col_fp32(U32 row, U32 col, F32 *matrix, F32 *vector, F32 *result); + +void mvm_row_fp32(U32 row, U32 col, F32 *matrix, F32 *vector, F32 *result); + +EE mvm_fp32(U32 row, U32 col, DataFormat df, F32 *matrix, F32 *vector, F32 *result); + +void matrix_matrix_multiply_tmp_bytes_fp32( + U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes); + +EE matrix_matrix_multiply_transform_rhsN_fp32(TensorDesc desc, F32 *src, F32 *dst); + +EE matrix_matrix_multiply_transform_rhsT_fp32(TensorDesc desc, F32 *src, F32 *dst); + +#ifdef __aarch64__ +EE mmm_fp32_V8( + int M, int N, int K, bool transposeA, F32 *matrix1, F32 *matrix2, F32 *tmp, F32 *result); +#else +EE mmm_fp32_V7( + int M, int N, int K, bool transposeA, F32 *matrix1, F32 *matrix2, F32 *tmp, F32 *result); +#endif + +EE axpby_fp32(U32 len, F32 a, const F32 *x, F32 b, F32 *y); + +inline void matrix1_trans(U32 size, U32 blockK, U32 K, F32 *src, F32 *dst) +{ + F32 *src1 = src; + for (U32 i = 0; i < blockK; i++) { + for (U32 j = 0; j < size; j++) { + src1 = src + j * K; + if (i % 16 == 0) { + __builtin_prefetch(src1 + 16); + } + *dst++ = *(src1 + i); + } + } +} + +inline void matrix2_trans(U32 size, U32 blockK, U32 M, F32 *src, F32 *dst) +{ + for (U32 i = 0; i < blockK; i++) { + if (i % 16 == 0) { + __builtin_prefetch(src + 16); + } + memcpy(dst, src, size * sizeof(F32)); + dst += size; + src += M; + } +} + +inline void mvm_row_tail(U32 N, U32 K, F32 *matrix, F32 *vector, F32 *result) +{ + float32x4_t vec, res, mat; + U32 KTail = K % 4; + U32 KInner = K - KTail; + + for (U32 i = 0; i < N; i++) { + res = vdupq_n_f32(0); + + for (U32 j = 0; j < KInner; j += 4) { + vec = vld1q_f32(&vector[j]); + mat = vld1q_f32(&matrix[j + K * i]); + res = vfmaq_f32(res, vec, mat); + } + result[i] += vaddvq_f32(res); + + if (KTail != 0) { + for (U32 p = 0; p < KTail; p++) { + result[i] += vector[p + KInner] * matrix[KInner + p + K * i]; + } + } + } +} +#endif diff --git a/blas-enhance/src/cpu/arm/fp32/mmm_V7.cpp b/compute/blas_enhance/src/cpu/arm/fp32/mmm_V7.cpp similarity index 57% rename from blas-enhance/src/cpu/arm/fp32/mmm_V7.cpp rename to compute/blas_enhance/src/cpu/arm/fp32/mmm_V7.cpp index e1adf11b..30845d43 100644 --- a/blas-enhance/src/cpu/arm/fp32/mmm_V7.cpp +++ b/compute/blas_enhance/src/cpu/arm/fp32/mmm_V7.cpp @@ -15,7 +15,7 @@ #include #include "cpu/arm/fp32/blas_fp32.h" #include "error.h" -#include "type.h" +#include "types.h" void matrix_matrix_multiply_tmp_bytes_fp32( U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes) @@ -25,37 +25,12 @@ void matrix_matrix_multiply_tmp_bytes_fp32( *bytes += 32; } -void matrix1_trans(U32 size, U32 blockK, U32 K, F32 *src, F32 *dst) -{ - F32 *src1 = src; - for (U32 i = 0; i < blockK; i++) { - for (U32 j = 0; j < size; j++) { - src1 = src + j * K; - if (i % 16 == 0) { - __builtin_prefetch(src1 + 16); - } - *dst++ = *(src1 + i); - } - } -} - -void matrix2_trans(U32 size, U32 blockK, U32 M, F32 *src, F32 *dst) -{ - for (U32 i = 0; i < blockK; i++) { - if (i % 16 == 0) { - 
__builtin_prefetch(src + 16); - } - memcpy(dst, src, size * sizeof(F32)); - dst += size; - src += M; - } -} - EE matrix_matrix_multiply_transform_rhsN_fp32(TensorDesc desc, F32 *src, F32 *dst) { DataType dt; + DataFormat df; U32 N, K; - CHECK_STATUS(tensor2dGet(desc, &dt, &K, &N)); + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &K, &N)); int i = 0; for (; i < (int)N - 7; i += 8) { matrix2_trans(8, K, N, src + i, dst + i * K); @@ -72,8 +47,9 @@ EE matrix_matrix_multiply_transform_rhsN_fp32(TensorDesc desc, F32 *src, F32 *ds EE matrix_matrix_multiply_transform_rhsT_fp32(TensorDesc desc, F32 *src, F32 *dst) { DataType dt; + DataFormat df; U32 N, K; - CHECK_STATUS(tensor2dGet(desc, &dt, &N, &K)); + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &N, &K)); int i = 0; for (; i < (int)N - 7; i += 8) { matrix1_trans(8, K, K, src + i * K, dst + i * K); @@ -173,54 +149,53 @@ void mmm_N4_MTail(U32 MInner, U32 M, U32 K, F32 *matrix1, F32 *matrix2, F32 *res void mmm_4x4(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) { - asm volatile( - "vld1.f32 {d0-d1}, [%[in]]!\n" + asm volatile("vld1.f32 {d0-d1}, [%[in]]!\n" - "vld1.f32 {d4-d5}, [%[w]]!\n" + "vld1.f32 {d4-d5}, [%[w]]!\n" - //K- > r2 - "mov r2, %[K]\n" + // K- > r2 + "mov r2, %[K]\n" - //give out address to r1 - "mov r1, %[out]\n" + // give out address to r1 + "mov r1, %[out]\n" - //load in bias - "vld1.f32 {d8-d9}, [r1]\n" - "add r1, r1, %[offset]\n" - "vld1.f32 {d12-d13}, [r1]\n" - "add r1, r1, %[offset]\n" - "vld1.f32 {d16-d17}, [r1]\n" - "add r1, r1, %[offset]\n" - "vld1.f32 {d20-d21}, [r1]\n" + // load in bias + "vld1.f32 {d8-d9}, [r1]\n" + "add r1, r1, %[offset]\n" + "vld1.f32 {d12-d13}, [r1]\n" + "add r1, r1, %[offset]\n" + "vld1.f32 {d16-d17}, [r1]\n" + "add r1, r1, %[offset]\n" + "vld1.f32 {d20-d21}, [r1]\n" - //Computation loop - "0:\n" + // Computation loop + "0:\n" - "vmla.f32 q4, q2, d0[0]\n" - "vmla.f32 q6, q2, d0[1]\n" - "vmla.f32 q8, q2, d1[0]\n" - "vmla.f32 q10, q2, d1[1]\n" + "vmla.f32 q4, q2, d0[0]\n" + "vmla.f32 q6, q2, d0[1]\n" + "vmla.f32 q8, q2, d1[0]\n" + "vmla.f32 q10, q2, d1[1]\n" - "vld1.f32 {d4-d5}, [%[w]]!\n" - "subs r2, r2, #1\n" + "vld1.f32 {d4-d5}, [%[w]]!\n" + "subs r2, r2, #1\n" - "vld1.f32 {d0-d1}, [%[in]]!\n" - "bne 0b\n" + "vld1.f32 {d0-d1}, [%[in]]!\n" + "bne 0b\n" - //give out address to r1 - "mov r1, %[out]\n" + // give out address to r1 + "mov r1, %[out]\n" - "vst1.f32 {d8-d9}, [r1]\n" - "add r1, r1, %[offset]\n" - "vst1.f32 {d12-d13}, [r1]\n" - "add r1, r1, %[offset]\n" - "vst1.f32 {d16-d17}, [r1]\n" - "add r1, r1, %[offset]\n" - "vst1.f32 {d20-d21}, [r1]\n" + "vst1.f32 {d8-d9}, [r1]\n" + "add r1, r1, %[offset]\n" + "vst1.f32 {d12-d13}, [r1]\n" + "add r1, r1, %[offset]\n" + "vst1.f32 {d16-d17}, [r1]\n" + "add r1, r1, %[offset]\n" + "vst1.f32 {d20-d21}, [r1]\n" - : [in]"+r"(in), [w]"+r"(w), [out]"+r"(out) - : [K]"r"(K), [offset]"r"(offset) - : "memory", "cc", "q0", "q2", "q4", "q6", "q8", "q10", "r1", "r2"); + : [in] "+r"(in), [w] "+r"(w), [out] "+r"(out) + : [K] "r"(K), [offset] "r"(offset) + : "memory", "cc", "q0", "q2", "q4", "q6", "q8", "q10", "r1", "r2"); } void mmm_6x4(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) @@ -230,13 +205,13 @@ void mmm_6x4(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) "vld1.f32 {d4-d5}, [%[w]]!\n" - //K- > r2 + // K- > r2 "mov r2, %[K]\n" - //give out address to r1 + // give out address to r1 "mov r1, %[out]\n" - //load in bias + // load in bias "vld1.f32 {d8-d9}, [r1]\n" "add r1, r1, %[offset]\n" "vld1.f32 {d12-d13}, [r1]\n" @@ -249,7 +224,7 @@ void mmm_6x4(U32 offset, U32 K, F32 *in, F32 
*w, F32 *out) "add r1, r1, %[offset]\n" "vld1.f32 {d28-d29}, [r1]\n" - //Computation loop + // Computation loop "0:\n" "vmla.f32 q4, q2, d0[0]\n" @@ -265,7 +240,7 @@ void mmm_6x4(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) "vld1.f32 {d0-d2}, [%[in]]!\n" "bne 0b\n" - //give out address to r1 + // give out address to r1 "mov r1, %[out]\n" "vst1.f32 {d8-d9}, [r1]\n" @@ -280,145 +255,143 @@ void mmm_6x4(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) "add r1, r1, %[offset]\n" "vst1.f32 {d28-d29}, [r1]\n" - : [in]"+r"(in), [w]"+r"(w), [out]"+r"(out) - : [K]"r"(K), [offset]"r"(offset) - : "memory", "cc", "q0", "q1", "q2", "q4", "q6", "q8", "q10", - "q12", "q14", "r1", "r2"); + : [in] "+r"(in), [w] "+r"(w), [out] "+r"(out) + : [K] "r"(K), [offset] "r"(offset) + : "memory", "cc", "q0", "q1", "q2", "q4", "q6", "q8", "q10", "q12", "q14", "r1", "r2"); } void mmm_4x8(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) { - asm volatile( - "vld1.f32 {d0-d1}, [%[in]]!\n" + asm volatile("vld1.f32 {d0-d1}, [%[in]]!\n" - "vld1.f32 {d4-d7}, [%[w]]!\n" + "vld1.f32 {d4-d7}, [%[w]]!\n" - //K- > r2 - "mov r2, %[K]\n" + // K- > r2 + "mov r2, %[K]\n" - //give out address to r1 - "mov r1, %[out]\n" + // give out address to r1 + "mov r1, %[out]\n" - //load in bias - "vld1.f32 {d8-d11}, [r1]\n" - "add r1, r1, %[offset]\n" - "vld1.f32 {d12-d15}, [r1]\n" - "add r1, r1, %[offset]\n" - "vld1.f32 {d16-d19}, [r1]\n" - "add r1, r1, %[offset]\n" - "vld1.f32 {d20-d23}, [r1]\n" + // load in bias + "vld1.f32 {d8-d11}, [r1]\n" + "add r1, r1, %[offset]\n" + "vld1.f32 {d12-d15}, [r1]\n" + "add r1, r1, %[offset]\n" + "vld1.f32 {d16-d19}, [r1]\n" + "add r1, r1, %[offset]\n" + "vld1.f32 {d20-d23}, [r1]\n" - //Computation loop - "0:\n" + // Computation loop + "0:\n" - "vmla.f32 q4, q2, d0[0]\n" - "vmla.f32 q6, q2, d0[1]\n" - "vmla.f32 q8, q2, d1[0]\n" - "vmla.f32 q10, q2, d1[1]\n" + "vmla.f32 q4, q2, d0[0]\n" + "vmla.f32 q6, q2, d0[1]\n" + "vmla.f32 q8, q2, d1[0]\n" + "vmla.f32 q10, q2, d1[1]\n" - "vld1.f32 {d4-d5}, [%[w]]!\n" + "vld1.f32 {d4-d5}, [%[w]]!\n" - "vmla.f32 q5, q3, d0[0]\n" - "vmla.f32 q7, q3, d0[1]\n" - "vmla.f32 q9, q3, d1[0]\n" - "vmla.f32 q11, q3, d1[1]\n" + "vmla.f32 q5, q3, d0[0]\n" + "vmla.f32 q7, q3, d0[1]\n" + "vmla.f32 q9, q3, d1[0]\n" + "vmla.f32 q11, q3, d1[1]\n" - "vld1.f32 {d6-d7}, [%[w]]!\n" - "subs r2, r2, #1\n" + "vld1.f32 {d6-d7}, [%[w]]!\n" + "subs r2, r2, #1\n" - "vld1.f32 {d0-d1}, [%[in]]!\n" - "bne 0b\n" + "vld1.f32 {d0-d1}, [%[in]]!\n" + "bne 0b\n" - //give out address to r1 - "mov r1, %[out]\n" + // give out address to r1 + "mov r1, %[out]\n" - "vst1.f32 {d8-d11}, [r1]\n" - "add r1, r1, %[offset]\n" - "vst1.f32 {d12-d15}, [r1]\n" - "add r1, r1, %[offset]\n" - "vst1.f32 {d16-d19}, [r1]\n" - "add r1, r1, %[offset]\n" - "vst1.f32 {d20-d23}, [r1]\n" + "vst1.f32 {d8-d11}, [r1]\n" + "add r1, r1, %[offset]\n" + "vst1.f32 {d12-d15}, [r1]\n" + "add r1, r1, %[offset]\n" + "vst1.f32 {d16-d19}, [r1]\n" + "add r1, r1, %[offset]\n" + "vst1.f32 {d20-d23}, [r1]\n" - : [in]"+r"(in), [w]"+r"(w), [out]"+r"(out) - : [K]"r"(K), [offset]"r"(offset) - : "memory", "cc", "q0", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "r1", "r2"); + : [in] "+r"(in), [w] "+r"(w), [out] "+r"(out) + : [K] "r"(K), [offset] "r"(offset) + : "memory", "cc", "q0", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", + "q11", "r1", "r2"); } void mmm_6x8(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) { - asm volatile( - "vld1.f32 {d0-d2}, [%[in]]!\n" - - "vld1.f32 {d4-d7}, [%[w]]!\n" - - //K- > r2 - "mov r2, %[K]\n" - - //give out 
address to r1 - "mov r1, %[out]\n" - - //load in bias - "vld1.f32 {d8-d11}, [r1]\n" - "add r1, r1, %[offset]\n" - "vld1.f32 {d12-d15}, [r1]\n" - "add r1, r1, %[offset]\n" - "vld1.f32 {d16-d19}, [r1]\n" - "add r1, r1, %[offset]\n" - "vld1.f32 {d20-d23}, [r1]\n" - "add r1, r1, %[offset]\n" - "vld1.f32 {d24-d27}, [r1]\n" - "add r1, r1, %[offset]\n" - "vld1.f32 {d28-d31}, [r1]\n" - - //Computation loop - "0:\n" - - "vmla.f32 q4, q2, d0[0]\n" - "vmla.f32 q6, q2, d0[1]\n" - "vmla.f32 q8, q2, d1[0]\n" - "vmla.f32 q10, q2, d1[1]\n" - "vmla.f32 q12, q2, d2[0]\n" - "vmla.f32 q14, q2, d2[1]\n" - - "vld1.f32 {d4-d5}, [%[w]]!\n" - - "vmla.f32 q5, q3, d0[0]\n" - "vmla.f32 q7, q3, d0[1]\n" - "vmla.f32 q9, q3, d1[0]\n" - "vmla.f32 q11, q3, d1[1]\n" - "vmla.f32 q13, q3, d2[0]\n" - "vmla.f32 q15, q3, d2[1]\n" - - "vld1.f32 {d6-d7}, [%[w]]!\n" - "subs r2, r2, #1\n" - - "vld1.f32 {d0-d2}, [%[in]]!\n" - "bne 0b\n" - - //give out address to r1 - "mov r1, %[out]\n" - - "vst1.f32 {d8-d11}, [r1]\n" - "add r1, r1, %[offset]\n" - "vst1.f32 {d12-d15}, [r1]\n" - "add r1, r1, %[offset]\n" - "vst1.f32 {d16-d19}, [r1]\n" - "add r1, r1, %[offset]\n" - "vst1.f32 {d20-d23}, [r1]\n" - "add r1, r1, %[offset]\n" - "vst1.f32 {d24-d27}, [r1]\n" - "add r1, r1, %[offset]\n" - "vst1.f32 {d28-d31}, [r1]\n" - - : [in]"+r"(in), [w]"+r"(w), [out]"+r"(out) - : [K]"r"(K), [offset]"r"(offset) - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "q12", "q13", "q14", "q15", "r1", "r2"); + asm volatile("vld1.f32 {d0-d2}, [%[in]]!\n" + + "vld1.f32 {d4-d7}, [%[w]]!\n" + + // K- > r2 + "mov r2, %[K]\n" + + // give out address to r1 + "mov r1, %[out]\n" + + // load in bias + "vld1.f32 {d8-d11}, [r1]\n" + "add r1, r1, %[offset]\n" + "vld1.f32 {d12-d15}, [r1]\n" + "add r1, r1, %[offset]\n" + "vld1.f32 {d16-d19}, [r1]\n" + "add r1, r1, %[offset]\n" + "vld1.f32 {d20-d23}, [r1]\n" + "add r1, r1, %[offset]\n" + "vld1.f32 {d24-d27}, [r1]\n" + "add r1, r1, %[offset]\n" + "vld1.f32 {d28-d31}, [r1]\n" + + // Computation loop + "0:\n" + + "vmla.f32 q4, q2, d0[0]\n" + "vmla.f32 q6, q2, d0[1]\n" + "vmla.f32 q8, q2, d1[0]\n" + "vmla.f32 q10, q2, d1[1]\n" + "vmla.f32 q12, q2, d2[0]\n" + "vmla.f32 q14, q2, d2[1]\n" + + "vld1.f32 {d4-d5}, [%[w]]!\n" + + "vmla.f32 q5, q3, d0[0]\n" + "vmla.f32 q7, q3, d0[1]\n" + "vmla.f32 q9, q3, d1[0]\n" + "vmla.f32 q11, q3, d1[1]\n" + "vmla.f32 q13, q3, d2[0]\n" + "vmla.f32 q15, q3, d2[1]\n" + + "vld1.f32 {d6-d7}, [%[w]]!\n" + "subs r2, r2, #1\n" + + "vld1.f32 {d0-d2}, [%[in]]!\n" + "bne 0b\n" + + // give out address to r1 + "mov r1, %[out]\n" + + "vst1.f32 {d8-d11}, [r1]\n" + "add r1, r1, %[offset]\n" + "vst1.f32 {d12-d15}, [r1]\n" + "add r1, r1, %[offset]\n" + "vst1.f32 {d16-d19}, [r1]\n" + "add r1, r1, %[offset]\n" + "vst1.f32 {d20-d23}, [r1]\n" + "add r1, r1, %[offset]\n" + "vst1.f32 {d24-d27}, [r1]\n" + "add r1, r1, %[offset]\n" + "vst1.f32 {d28-d31}, [r1]\n" + + : [in] "+r"(in), [w] "+r"(w), [out] "+r"(out) + : [K] "r"(K), [offset] "r"(offset) + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15", "r1", "r2"); } -EE mmm_fp32_V7(int M, int N, int K, F32 *matrix1, F32 *matrix2, F32 *tmp, F32 *result) +EE mmm_fp32_V7( + int M, int N, int K, bool transposeA, F32 *matrix1, F32 *matrix2, F32 *tmp, F32 *result) { int blockK = K; int blockM = 96; @@ -431,7 +404,11 @@ EE mmm_fp32_V7(int M, int N, int K, F32 *matrix1, F32 *matrix2, F32 *tmp, F32 *r MInner = UNI_MIN(blockM, M - i); for (n = 0; n <= N - 6; n += 6) { if (i == 
0) { - matrix1_trans(6, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + if (transposeA) { + matrix2_trans(6, KInner, N, matrix1 + n, matrix1Trans + n * KInner); + } else { + matrix1_trans(6, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + } } for (m = 0; m <= (MInner - 8); m += 8) { resultCurrent = result + n * M + m + i; @@ -455,7 +432,11 @@ EE mmm_fp32_V7(int M, int N, int K, F32 *matrix1, F32 *matrix2, F32 *tmp, F32 *r if ((N - n) >= 4) { if (i == 0) { - matrix1_trans(4, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + if (transposeA) { + matrix2_trans(4, KInner, N, matrix1 + n, matrix1Trans + n * KInner); + } else { + matrix1_trans(4, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + } } for (m = 0; m <= (MInner - 8); m += 8) { @@ -482,7 +463,12 @@ EE mmm_fp32_V7(int M, int N, int K, F32 *matrix1, F32 *matrix2, F32 *tmp, F32 *r if (N - n) { if (i == 0) { - matrix1_trans(N - n, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + if (transposeA) { + matrix2_trans(N - n, KInner, N, matrix1 + n, matrix1Trans + n * KInner); + } else { + matrix1_trans( + N - n, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + } } for (m = 0; m <= (MInner - 8); m += 8) { diff --git a/blas-enhance/src/cpu/arm/fp32/mmm_V8.cpp b/compute/blas_enhance/src/cpu/arm/fp32/mmm_V8.cpp similarity index 87% rename from blas-enhance/src/cpu/arm/fp32/mmm_V8.cpp rename to compute/blas_enhance/src/cpu/arm/fp32/mmm_V8.cpp index e305cf78..d9d7c3a4 100644 --- a/blas-enhance/src/cpu/arm/fp32/mmm_V8.cpp +++ b/compute/blas_enhance/src/cpu/arm/fp32/mmm_V8.cpp @@ -15,7 +15,7 @@ #include #include "cpu/arm/fp32/blas_fp32.h" #include "error.h" -#include "type.h" +#include "types.h" void matrix_matrix_multiply_tmp_bytes_fp32( U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes) @@ -25,42 +25,12 @@ void matrix_matrix_multiply_tmp_bytes_fp32( *bytes += 32; } -void matrix1_trans(U32 size, U32 blockK, U32 K, F32 *src, F32 *dst) -{ - F32 *src1 = src; - U32 offset; - for (U32 i = 0; i < blockK; i++) { - for (U32 j = 0; j < size; j++) { - src1 = src + j * K; - offset = 64; - if (i % 16 == 0) { - asm volatile("prfm pldl2keep, [%0, %1]\n" - : "+r"(src1) - : "r"((I64)offset) - : "memory", "cc"); - } - *dst++ = *(src1 + i); - } - } -} - -void matrix2_trans(U32 size, U32 blockK, U32 M, F32 *src, F32 *dst) -{ - for (U32 i = 0; i < blockK; i++) { - if (i % 16 == 0) { - asm volatile("prfm pldl2keep, [%0, #64]\n" : "+r"(src) : : "memory", "cc"); - } - memcpy(dst, src, size * sizeof(F32)); - dst += size; - src += M; - } -} - EE matrix_matrix_multiply_transform_rhsN_fp32(TensorDesc desc, F32 *src, F32 *dst) { DataType dt; + DataFormat df; U32 N, K; - CHECK_STATUS(tensor2dGet(desc, &dt, &K, &N)); + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &K, &N)); int i = 0; for (; i < (int)N - 11; i += 12) { matrix2_trans(12, K, N, src + i, dst + i * K); @@ -80,8 +50,9 @@ EE matrix_matrix_multiply_transform_rhsN_fp32(TensorDesc desc, F32 *src, F32 *ds EE matrix_matrix_multiply_transform_rhsT_fp32(TensorDesc desc, F32 *src, F32 *dst) { DataType dt; + DataFormat df; U32 N, K; - CHECK_STATUS(tensor2dGet(desc, &dt, &N, &K)); + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &N, &K)); int i = 0; for (; i < (int)N - 11; i += 12) { matrix1_trans(12, K, K, src + i * K, dst + i * K); @@ -197,24 +168,24 @@ void mmm_N4_MTail(U32 MInner, U32 M, U32 K, F32 *matrix1, F32 *matrix2, F32 *res void mmm_4x4(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) { asm volatile( - //init in- > v1, w- > v0 + // init in- > 
v1, w- > v0 "ldr q1, [%0]\n" "ldr q0, [%1]\n" - //give in address to x3 + // give in address to x3 "mov x3, %0\n" - //give w address to x0 + // give w address to x0 "mov x0, %1\n" - //K- > x2 + // K- > x2 "mov x2, %3\n" - //give out address to x26 + // give out address to x26 "mov x26, %2\n" - //load in bias + // load in bias "ldr q5, [x26]\n" "add x26, x26, %4\n" @@ -226,7 +197,7 @@ void mmm_4x4(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) "ldr q11, [x26]\n" - //Computation loop + // Computation loop "0:\n" "ldr q3, [x3, 16]!\n" @@ -242,7 +213,7 @@ void mmm_4x4(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) "1:\n" - //give out address to x26 + // give out address to x26 "mov x26, %2\n" "str q5, [x26]\n" @@ -265,24 +236,24 @@ void mmm_4x4(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) void mmm_8x4(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) { asm volatile( - //init in- > v1, w- > v0 + // init in- > v1, w- > v0 "ldr q1, [%0]\n" "ldr q0, [%1]\n" - //give in address to x3 + // give in address to x3 "mov x3, %0\n" - //give w address to x0 + // give w address to x0 "mov x0, %1\n" - //K- > x2 + // K- > x2 "mov x2, %3\n" - //give out address to x26 + // give out address to x26 "mov x26, %2\n" - //load in bias + // load in bias "ldr q5, [x26]\n" "add x26, x26, %4\n" @@ -306,7 +277,7 @@ void mmm_8x4(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) "ldr q19, [x26]\n" - //Computation loop + // Computation loop "0:\n" "ldr q3, [x3, 16]\n" @@ -325,7 +296,7 @@ void mmm_8x4(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) "mov v0.16b, v29.16b\n" "bne 0b\n" - //give out address to x26 + // give out address to x26 "mov x26, %2\n" "str q5, [x26]\n" @@ -360,21 +331,21 @@ void mmm_8x4(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) void mmm_4x8(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) { asm volatile( - //init in- > v1, w- > v0 + // init in- > v1, w- > v0 "ldr q1, [%0]\n" "ldr q0, [%1]\n" - //give in address to x3 + // give in address to x3 "mov x3, %0\n" - //give w address to x0 + // give w address to x0 "mov x0, %1\n" - //K- > x2 + // K- > x2 "mov x2, %3\n" - //give out address to x26 + // give out address to x26 "mov x26, %2\n" "ld1 {v5.4s, v6.4s}, [x26]\n" @@ -386,13 +357,13 @@ void mmm_4x8(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) "ld1 {v11.4s, v12.4s}, [x26]\n" /* Layout - 5 6 - 7 8 - 9 10 - 11 12 - */ + * 5 6 + * 7 8 + * 9 10 + * 11 12 + */ - //Computation loop + // Computation loop "0:\n" "ldr q29, [x0, 16]\n" @@ -411,7 +382,7 @@ void mmm_4x8(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) "mov v1.16b, v3.16b\n" "bne 0b\n" - //give out address to x26 + // give out address to x26 "mov x26, %2\n" "st1 {v5.4s, v6.4s}, [x26]\n" @@ -431,24 +402,24 @@ void mmm_4x8(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) void mmm_8x8(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) { asm volatile( - //init in- > v1, w- > v0 + // init in- > v1, w- > v0 "ldr q1, [%0]\n" "ldr q0, [%1]\n" - //give in address to x3 + // give in address to x3 "mov x3, %0\n" - //give w address to x0 + // give w address to x0 "mov x0, %1\n" - //K- > x2 + // K- > x2 "mov x2, %3\n" - //give out address to x26 + // give out address to x26 "mov x26, %2\n" - //load in bias + // load in bias "ld1 {v5.4s, v6.4s}, [x26]\n" "add x26, x26, %4\n" "ld1 {v7.4s, v8.4s}, [x26]\n" @@ -466,18 +437,18 @@ void mmm_8x8(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) "ld1 {v19.4s, v20.4s}, [x26]\n" /* Layout - 5 6 - 7 8 - 9 10 - 11 12 - - 13 14 - 15 16 - 17 18 - 19 20 - */ - - //Computation loop + 5 6 + 7 8 + 9 10 + 11 12 + + 13 14 + 15 16 + 17 18 + 19 20 + */ + + // 
Computation loop "0:\n" "fmla v5.4s, v0.4s, v1.s[0]\n" @@ -507,7 +478,7 @@ void mmm_8x8(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) "bne 0b\n" - //give out address to x26 + // give out address to x26 "mov x26, %2\n" "st1 {v5.4s, v6.4s}, [x26]\n" @@ -535,26 +506,26 @@ void mmm_8x8(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) void mmm_4x12(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) { asm volatile( - //init in->v1, w->v0 + // init in->v1, w->v0 "ldr q1, [%0]\n" "ldr q0, [%1]\n" "ldr q29, [%1, 16]\n" // prefetch one more w - //give in address to x3 + // give in address to x3 "mov x3, %0\n" - //give w address to x0 + // give w address to x0 "mov x0, %1\n" - //K->x2 + // K->x2 "mov x2, %3\n" - //give out address to x26 + // give out address to x26 "mov x26, %2\n" - //load in bias + // load in bias "ld1 {v5.4s, v6.4s, v7.4s}, [x26]\n" "add x26, x26, %4\n" "ld1 {v8.4s, v9.4s, v10.4s}, [x26]\n" @@ -564,13 +535,13 @@ void mmm_4x12(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) "ld1 {v14.4s, v15.4s, v16.4s}, [x26]\n" /* Layout - 5 6 7 - 8 9 10 - 11 12 13 - 14 15 16 - */ + 5 6 7 + 8 9 10 + 11 12 13 + 14 15 16 + */ - //Computation loop + // Computation loop "0:\n" // in(x3): v1 // w(x0): v0 v29 v30 @@ -598,7 +569,7 @@ void mmm_4x12(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) "mov v1.16b, v2.16b\n" "bne 0b\n" - //give out address to x26 + // give out address to x26 "mov x26, %2\n" "st1 {v5.4s, v6.4s, v7.4s}, [x26]\n" @@ -618,26 +589,26 @@ void mmm_4x12(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) void mmm_8x12(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) { asm volatile( - //init in->v1, w->v0 + // init in->v1, w->v0 "ldr q1, [%0]\n" "ldr q0, [%1]\n" "ldr q29, [%1, 16]\n" // prefetch one more w - //give in address to x3 + // give in address to x3 "mov x3, %0\n" - //give w address to x0 + // give w address to x0 "mov x0, %1\n" - //K->x2 + // K->x2 "mov x2, %3\n" - //give out address to x26 + // give out address to x26 "mov x26, %2\n" - //load in bias + // load in bias "ld1 {v5.4s, v6.4s, v7.4s}, [x26]\n" "add x26, x26, %4\n" "ld1 {v8.4s, v9.4s, v10.4s}, [x26]\n" @@ -655,18 +626,18 @@ void mmm_8x12(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) "ld1 {v26.4s, v27.4s, v28.4s}, [x26]\n" /* Layout - 5 6 7 - 8 9 10 - 11 12 13 - 14 15 16 - - 17 18 19 - 20 21 22 - 23 24 25 - 26 27 28 - */ - - //Computation loop + 5 6 7 + 8 9 10 + 11 12 13 + 14 15 16 + + 17 18 19 + 20 21 22 + 23 24 25 + 26 27 28 + */ + + // Computation loop "0:\n" // in(x3): v1 v2 // w(x0): v0 v29 v30 @@ -710,7 +681,7 @@ void mmm_8x12(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) "bne 0b\n" - //give out address to x26 + // give out address to x26 "mov x26, %2\n" "st1 {v5.4s, v6.4s, v7.4s}, [x26]\n" @@ -736,7 +707,8 @@ void mmm_8x12(U32 offset, U32 K, F32 *in, F32 *w, F32 *out) "v7", "v6", "v5", "v3", "v2", "v1", "v0", "x26", "x3", "x2", "x0"); } -EE mmm_fp32_V8(int M, int N, int K, F32 *matrix1, F32 *matrix2, F32 *tmp, F32 *result) +EE mmm_fp32_V8( + int M, int N, int K, bool transposeA, F32 *matrix1, F32 *matrix2, F32 *tmp, F32 *result) { int blockK = K; int blockM = 96; @@ -749,7 +721,11 @@ EE mmm_fp32_V8(int M, int N, int K, F32 *matrix1, F32 *matrix2, F32 *tmp, F32 *r MInner = UNI_MIN(blockM, M - i); for (n = 0; n <= N - 8; n += 8) { if (i == 0) { - matrix1_trans(8, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + if (transposeA) { + matrix2_trans(8, KInner, N, matrix1 + n, matrix1Trans + n * KInner); + } else { + matrix1_trans(8, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + } } for (m = 0; m <= (MInner - 
12); m += 12) { resultCurrent = result + n * M + m + i; @@ -778,7 +754,11 @@ EE mmm_fp32_V8(int M, int N, int K, F32 *matrix1, F32 *matrix2, F32 *tmp, F32 *r if ((N - n) >= 4) { if (i == 0) { - matrix1_trans(4, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + if (transposeA) { + matrix2_trans(4, KInner, N, matrix1 + n, matrix1Trans + n * KInner); + } else { + matrix1_trans(4, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + } } for (m = 0; m <= (MInner - 12); m += 12) { @@ -811,7 +791,12 @@ EE mmm_fp32_V8(int M, int N, int K, F32 *matrix1, F32 *matrix2, F32 *tmp, F32 *r if (N - n) { if (i == 0) { - matrix1_trans(N - n, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + if (transposeA) { + matrix2_trans(N - n, KInner, N, matrix1 + n, matrix1Trans + n * KInner); + } else { + matrix1_trans( + N - n, KInner, K, matrix1 + n * K + k, matrix1Trans + n * KInner); + } } for (m = 0; m <= (MInner - 12); m += 12) { diff --git a/compute/blas_enhance/src/cpu/arm/fp32/mvm.cpp b/compute/blas_enhance/src/cpu/arm/fp32/mvm.cpp new file mode 100644 index 00000000..ec364bcf --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/fp32/mvm.cpp @@ -0,0 +1,118 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
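+
+// Weight layouts handled by the fp32 matrix-vector multiply in this file:
+//   DF_NORMAL:    row-major (N x K) weights, dispatched to mvm_row_fp32;
+//   DF_TRANSPOSE: column-major (K x N) weights, dispatched to mvm_col_fp32;
+//   DF_NKN16:     the packed layout produced by
+//                 matrix_vector_multiply_transform_weight_fp32 below, which
+//                 interleaves blocks of 16 rows along K so that mvm_kernel_fp32
+//                 can accumulate 16 output rows per K step; the remaining
+//                 (row % 16) rows keep their row-major layout and are handled
+//                 by mvm_row_tail.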
+ +#include "error.h" +#include "cpu/arm/fp32/blas_fp32.h" + +EE matrix_vector_multiply_transform_weight_fp32(TensorDesc desc, F32 *src, F32 *dst) +{ + DataType dt; + DataFormat df; + U32 N, K; + EE ret = SUCCESS; + int i = 0; + switch (desc.df) { + case DF_NORMAL: { + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &N, &K)); + for (; i < (int)N - 15; i += 16) { + matrix1_trans(16, K, K, src + i * K, dst + i * K); + } + if (i < (int)N) { + memcpy(dst + i * K, src + i * K, (N - i) * K * bytesOf(DT_F32)); + } + break; + } + case DF_TRANSPOSE: { + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &K, &N)); + for (; i < (int)N - 15; i += 16) { + matrix2_trans(16, K, N, src + i, dst + i * K); + } + if (i < (int)N) { + int base = i; + F32 *basePtr = dst + i * K; + for (int j = 0; j < (int)K; j++) { + for (; i < (int)N; i++) { + basePtr[(i - base) * K + j] = src[j * N + i]; + } + } + } + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +void mvm_kernel_fp32(U32 rounds, U32 K, F32 *matrix, F32 *vector, F32 *result) +{ + U32 N = rounds * 16; + float32x4_t mat[4]; + F32 v; + float32x4_t res[4]; + + for (U32 n = 0; n < N; n += 16) { + F32 *bufMov = matrix + n * K; + for (int i = 0; i < 4; i++) { + res[i] = vld1q_f32(result + n + i * 4); + } + for (U32 k = 0; k < K; k++) { + v = vector[k]; + for (int i = 0; i < 4; i++) { + mat[i] = vld1q_f32(bufMov + i * 4); + } + for (int i = 0; i < 4; i++) { + res[i] = vfmaq_n_f32(res[i], mat[i], v); + } + bufMov += 16; + } + for (int i = 0; i < 4; i++) { + vst1q_f32(result + n + i * 4, res[i]); + } + } +} + +void mvm_pack_fp32(U32 row, U32 col, F32 *matrix, F32 *vector, F32 *result) +{ + U32 rounds = row / 16; + U32 nTail = row % 16; + + mvm_kernel_fp32(rounds, col, matrix, vector, result); + if (0 != nTail) { + mvm_row_tail(nTail, col, matrix + (row - nTail) * col, vector, result + (row - nTail)); + } +} + +EE mvm_fp32(U32 row, U32 col, DataFormat df, F32 *matrix, F32 *vector, F32 *result) +{ + EE ret = SUCCESS; + switch (df) { + case DF_NKN16: { + mvm_pack_fp32(row, col, matrix, vector, result); + break; + } + case DF_NORMAL: { + mvm_row_fp32(row, col, matrix, vector, result); + break; + } + case DF_TRANSPOSE: { + mvm_col_fp32(row, col, matrix, vector, result); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/blas-enhance/src/cpu/arm/fp32/mvm_col_V8.cpp b/compute/blas_enhance/src/cpu/arm/fp32/mvm_col.cpp similarity index 70% rename from blas-enhance/src/cpu/arm/fp32/mvm_col_V8.cpp rename to compute/blas_enhance/src/cpu/arm/fp32/mvm_col.cpp index 90174e72..9dc1e256 100644 --- a/blas-enhance/src/cpu/arm/fp32/mvm_col_V8.cpp +++ b/compute/blas_enhance/src/cpu/arm/fp32/mvm_col.cpp @@ -1,28 +1,26 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include -#include "type.h" +#include "types.h" #include "blas_fp32.h" - -inline void mvm_col_tail(U32 N, U32 K, F32* matrix, F32* vector, F32* result) +inline void mvm_col_tail(U32 N, U32 K, F32 *matrix, F32 *vector, F32 *result) { float32x4_t tmp, res, mat; U32 NTail = N % 4; U32 NInner = N - NTail; - + for (U32 i = 0; i < K; i++) { for (U32 j = 0; j < NInner; j += 4) { tmp = vld1q_f32(result + j); @@ -38,21 +36,20 @@ inline void mvm_col_tail(U32 N, U32 K, F32* matrix, F32* vector, F32* result) } } -void mvm_col_kernel(U32 N, U32 K, F32* matrix, F32* vector, F32* result) +void mvm_col_kernel(U32 N, U32 K, F32 *matrix, F32 *vector, F32 *result) { float32x4_t mat[4] = {0}; - - F32* w0 = matrix; - F32* w1 = matrix + K * N; - F32* w2 = matrix + 2 * K * N; - F32* w3 = matrix + 3 * K * N; - + + F32 *w0 = matrix; + F32 *w1 = matrix + K * N; + F32 *w2 = matrix + 2 * K * N; + F32 *w3 = matrix + 3 * K * N; + U32 N_tail = N % 4; U32 N_inner = N - N_tail; - - for(U32 i = 0; i < K; i++) { - for(U32 j = 0; j < N_inner; j += 4) { + for (U32 i = 0; i < K; i++) { + for (U32 j = 0; j < N_inner; j += 4) { float32x4_t res[4] = {0}; res[3] = vld1q_f32(result + j); @@ -71,21 +68,21 @@ void mvm_col_kernel(U32 N, U32 K, F32* matrix, F32* vector, F32* result) w2 += 4; w3 += 4; vst1q_f32(result + j, res[3]); - } - if (N_tail != 0) { - for(U32 p = 0; p < N_tail; p++) { - result[N_inner + p] += vector[i] * *w0++; - result[N_inner + p] += vector[i + K] * *w1++; - result[N_inner + p] += vector[i + 2 * K] * *w2++; - result[N_inner + p] += vector[i + 3 * K] * *w3++; - } - } - } + } + if (N_tail != 0) { + for (U32 p = 0; p < N_tail; p++) { + result[N_inner + p] += vector[i] * *w0++; + result[N_inner + p] += vector[i + K] * *w1++; + result[N_inner + p] += vector[i + 2 * K] * *w2++; + result[N_inner + p] += vector[i 
+ 3 * K] * *w3++; + } + } + } } -void mvm_col_V8(U32 numRows, U32 numColumns, F32* matrix, F32* vector, F32* result) +void mvm_col_fp32(U32 numRows, U32 numColumns, F32 *matrix, F32 *vector, F32 *result) { - //Actual layout is KN, and vector is K + // Actual layout is KN, and vector is K U32 N = numRows; U32 K = numColumns; U32 KInner = K / 4; diff --git a/compute/blas_enhance/src/cpu/arm/fp32/mvm_row.cpp b/compute/blas_enhance/src/cpu/arm/fp32/mvm_row.cpp new file mode 100644 index 00000000..ac5efee5 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/fp32/mvm_row.cpp @@ -0,0 +1,182 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "blas_fp32.h" + +void mvm_row_kernel(U32 N, U32 K, F32 *matrix, F32 *vector, F32 *result) +{ + I32 KTail = K % 4; + I32 KInner = K - KTail; + F32 *w0 = matrix; + F32 *w1 = matrix + K * N; + F32 *w2 = matrix + K * 2 * N; + F32 *w3 = matrix + K * 3 * N; +#ifdef __aarch64__ + asm volatile("mov x19, %5\n" + "ld1 {v18.s}[0], [x19]\n" + "add x19, x19, %8\n" + "ld1 {v18.s}[1], [x19]\n" + "add x19, x19, %8\n" + "ld1 {v18.s}[2], [x19]\n" + "add x19, x19, %8\n" + "ld1 {v18.s}[3], [x19]\n" + + "movi v17.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "mov x20, %6\n" + "cmp x20, #0x0\n" + "beq 3f\n" + "0:\n" + + "ld1 {v0.4s}, [%0], #16\n" + "ld1 {v1.4s}, [%1], #16\n" + "ld1 {v2.4s}, [%2], #16\n" + "ld1 {v3.4s}, [%3], #16\n" + "ld1 {v4.4s}, [%4], #16\n" + + "fmla v9.4s, v1.4s, v0.4s\n" + "fmla v10.4s, v2.4s, v0.4s\n" + "fmla v11.4s, v3.4s, v0.4s\n" + "fmla v12.4s, v4.4s, v0.4s\n" + + "subs x20, x20, #4\n" + "bne 0b\n" + + "faddp v13.4s, v9.4s, v10.4s\n" + "faddp v14.4s, v11.4s, v12.4s\n" + "faddp v17.4s, v13.4s, v14.4s\n" + "3:\n" + "mov x16, %7\n" + "cmp x16, #0x0\n" + "beq 2f\n" + + "1:\n" + "ld1 {v8.s}[0], [%0], #4\n" + + "ld1 {v1.s}[0], [%1], #4\n" + "ld1 {v1.s}[1], [%2], #4\n" + "ld1 {v1.s}[2], [%3], #4\n" + "ld1 {v1.s}[3], [%4], #4\n" + "fmla v16.4s, v1.4s, v8.s[0]\n" + + "subs x16, x16, 0x1\n" + "bne 1b\n" + + "fadd v17.4s, v17.4s, v16.4s\n" + + "2:\n" + + "fadd v17.4s, v17.4s, v18.4s\n" + "mov x19, %5\n" + "st1 {v17.s}[0], [x19]\n" + "add x19, x19, %8\n" + "st1 {v17.s}[1], [x19]\n" + "add x19, x19, %8\n" + "st1 {v17.s}[2], [x19]\n" + "add x19, x19, %8\n" + "st1 {v17.s}[3], [x19]\n" + : "+r"(vector), "+r"(w0), "+r"(w1), "+r"(w2), "+r"(w3), "+r"(result) + : "r"((I64)KInner), "r"((I64)KTail), "r"((I64)N * 
4) + : "memory", "cc", "x19", "x20", "x21", "x22", "x23", "x24", "x15", "x16", "v0", + "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", + "v17", "v18"); +#else + asm volatile("mov r3, %[result]\n" + "vld1.f32 {d30[0]}, [r3], %[stride]\n" + "vld1.f32 {d30[1]}, [r3], %[stride]\n" + "vld1.f32 {d31[0]}, [r3], %[stride]\n" + "vld1.f32 {d31[1]}, [r3]\n" + + "veor q6, q6, q6\n" + "veor q5, q5, q5\n" + "veor q9, q9, q9\n" + "veor q10, q10, q10\n" + "veor q11, q11, q11\n" + "veor q12, q12, q12\n" + "mov r3, %[KInner]\n" + "cmp r3, #0\n" + "beq 3f\n" + "0:\n" + + "vld1.f32 {d0-d1}, [%[vector]]!\n" + "vld1.f32 {d2-d3}, [%[w0]]!\n" + "vld1.f32 {d4-d5}, [%[w1]]!\n" + "vld1.f32 {d6-d7}, [%[w2]]!\n" + "vld1.f32 {d8-d9}, [%[w3]]!\n" + + "vmla.f32 q9, q1, q0\n" + "vmla.f32 q10, q2, q0\n" + "vmla.f32 q11, q3, q0\n" + "vmla.f32 q12, q4, q0\n" + + "subs r3, r3, #4\n" + "bne 0b\n" + + "vpadd.f32 d26, d18, d20\n" + "vpadd.f32 d27, d19, d21\n" + "vpadd.f32 d28, d22, d24\n" + "vpadd.f32 d29, d23, d25\n" + "vadd.f32 d12, d26, d27\n" + "vadd.f32 d13, d28, d29\n" + "3:\n" + "mov r3, %[KTail]\n" + "cmp r3, #0\n" + "beq 2f\n" + + "1:\n" + "vld1.f32 {d0[0]}, [%[vector]]!\n" + "vld1.f32 {d2[0]}, [%[w0]]!\n" + "vld1.f32 {d2[1]}, [%[w1]]!\n" + "vld1.f32 {d3[0]}, [%[w2]]!\n" + "vld1.f32 {d3[1]}, [%[w3]]!\n" + "vmla.f32 q5, q1, d0[0]\n" + + "subs r3, r3, #1\n" + "bne 1b\n" + + "vadd.f32 q6, q6, q5\n" + + "2:\n" + + "vadd.f32 q6, q6, q15\n" + "mov r3, %[result]\n" + "vst1.f32 {d12[0]}, [r3], %[stride]\n" + "vst1.f32 {d12[1]}, [r3], %[stride]\n" + "vst1.f32 {d13[0]}, [r3], %[stride]\n" + "vst1.f32 {d13[1]}, [r3]\n" + : [vector] "+r"(vector), [w0] "+r"(w0), [w1] "+r"(w1), [w2] "+r"(w2), + [w3] "+r"(w3), [result] "+r"(result) + : [KInner] "r"(KInner), [KTail] "r"(KTail), [stride] "r"(N * 4) + : "memory", "cc", "r3", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); +#endif +} + +void mvm_row_fp32(U32 numRows, U32 numColumns, F32 *matrix, F32 *vector, F32 *result) +{ + // Actual layout is NK, and vector is K + U32 N = numRows; + U32 K = numColumns; + U32 NTail = N % 4; + U32 NInner = N / 4; + for (U32 i = 0; i < NInner; i++) { + mvm_row_kernel(NInner, K, matrix + i * K, vector, result + i); + } + if (NTail != 0) { + mvm_row_tail(NTail, K, matrix + (N - NTail) * K, vector, result + (N - NTail)); + } +} diff --git a/blas-enhance/src/cpu/arm/int8/blas_int8.h b/compute/blas_enhance/src/cpu/arm/int8/blas_int8.h similarity index 63% rename from blas-enhance/src/cpu/arm/int8/blas_int8.h rename to compute/blas_enhance/src/cpu/arm/int8/blas_int8.h index b81996ac..035a770f 100644 --- a/blas-enhance/src/cpu/arm/int8/blas_int8.h +++ b/compute/blas_enhance/src/cpu/arm/int8/blas_int8.h @@ -1,33 +1,43 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _H_BLAS_INT8 #define _H_BLAS_INT8 #include "sys.h" -#include "type.h" +#include "types.h" #include "error.h" #include "tensor_desc.h" -EE mvm_int8(U32 row, U32 col, bool transpose, INT8* matrix, INT8* vector, I32* tmp, I32* result); +EE matrix_vector_multiply_transform_weight_int8(TensorDesc desc, INT8 *src, INT8 *dst); + +EE mvm_int8(U32 row, U32 col, DataFormat df, INT8 *matrix, INT8 *vector, I32 *tmp, I32 *result); -void matrix_matrix_multiply_tmp_bytes_int8(U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes); +void matrix_matrix_multiply_tmp_bytes_int8( + U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes); -EE matrix_matrix_multiply_transform_rhsN_int8(TensorDesc desc, INT8* src, INT8* dst); +EE matrix_matrix_multiply_transform_rhsN_int8(TensorDesc desc, INT8 *src, INT8 *dst); -EE matrix_matrix_multiply_transform_rhsT_int8(TensorDesc desc, INT8* src, INT8* dst); +EE matrix_matrix_multiply_transform_rhsT_int8(TensorDesc desc, INT8 *src, INT8 *dst); -EE mmm_int8(int M, int N, int K, INT8* matrix1, INT8* matrix2, INT8* tmp, I32* result, Arch arch); +EE mmm_int8(int M, + int N, + int K, + bool transposeA, + INT8 *matrix1, + INT8 *matrix2, + INT8 *tmp, + I32 *result, + Arch arch); #endif diff --git a/blas-enhance/src/cpu/arm/int8/mmm.cpp b/compute/blas_enhance/src/cpu/arm/int8/mmm.cpp similarity index 73% rename from blas-enhance/src/cpu/arm/int8/mmm.cpp rename to compute/blas_enhance/src/cpu/arm/int8/mmm.cpp index 211b8e56..89d0d667 100644 --- a/blas-enhance/src/cpu/arm/int8/mmm.cpp +++ b/compute/blas_enhance/src/cpu/arm/int8/mmm.cpp @@ -1,17 +1,16 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include "sys.h" #include "error.h" #include "cpu/arm/blas_arm.h" @@ -19,8 +18,8 @@ #include "cpu/arm/int8/mmm.h" #include "cpu/arm/int8/mmm_common.h" - -void matrix_matrix_multiply_tmp_bytes_int8(U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes) +void matrix_matrix_multiply_tmp_bytes_int8( + U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes) { col1 = pad_to_4_multiple(col1); row2 = pad_to_4_multiple(row2); @@ -29,11 +28,12 @@ void matrix_matrix_multiply_tmp_bytes_int8(U32 row1, U32 col1, U32 row2, U32 col *bytes += 32; } -EE matrix_matrix_multiply_transform_rhsN_int8(TensorDesc desc, INT8* src, INT8* dst) +EE matrix_matrix_multiply_transform_rhsN_int8(TensorDesc desc, INT8 *src, INT8 *dst) { DataType dt; + DataFormat df; U32 N, K; - CHECK_STATUS(tensor2dGet(desc, &dt, &K, &N)); + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &K, &N)); U32 K4 = pad_to_4_multiple(K); int i = 0; for (; i < (int)N - 11; i += 12) { @@ -51,11 +51,12 @@ EE matrix_matrix_multiply_transform_rhsN_int8(TensorDesc desc, INT8* src, INT8* return SUCCESS; } -EE matrix_matrix_multiply_transform_rhsT_int8(TensorDesc desc, INT8* src, INT8* dst) +EE matrix_matrix_multiply_transform_rhsT_int8(TensorDesc desc, INT8 *src, INT8 *dst) { DataType dt; + DataFormat df; U32 N, K; - CHECK_STATUS(tensor2dGet(desc, &dt, &N, &K)); + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &N, &K)); U32 K4 = pad_to_4_multiple(K); int i = 0; for (; i < (int)N - 11; i += 12) { @@ -73,18 +74,20 @@ EE matrix_matrix_multiply_transform_rhsT_int8(TensorDesc desc, INT8* src, INT8* return SUCCESS; } -EE mmm_int8(int M, int N, int K, INT8* matrix1, INT8* matrix2, INT8* tmp, I32* result, Arch arch) { +EE mmm_int8( + int M, int N, int K, bool transposeA, INT8 *matrix1, INT8 *matrix2, INT8 *tmp, I32 *result, Arch arch) +{ EE ret = 
SUCCESS; switch (arch) { - case ARM_A55: - mmm_A55(M, N, K, matrix1, matrix2, tmp, result); - break; - case ARM_A76: - mmm_A76(M, N, K, matrix1, matrix2, tmp, result); - break; - default: - ret = NOT_SUPPORTED; - break; + case ARM_A55: + mmm_A55(M, N, K, transposeA, matrix1, matrix2, tmp, result); + break; + case ARM_A76: + mmm_A76(M, N, K, transposeA, matrix1, matrix2, tmp, result); + break; + default: + ret = NOT_SUPPORTED; + break; } return ret; } diff --git a/compute/blas_enhance/src/cpu/arm/int8/mmm.h b/compute/blas_enhance/src/cpu/arm/int8/mmm.h new file mode 100644 index 00000000..9a433664 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/int8/mmm.h @@ -0,0 +1,24 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_MMM +#define _H_MMM + +#include "types.h" + +void mmm_A55( + int M, int N, int K, bool transposeA, INT8 *matrix1, INT8 *matrix2, INT8 *tmp, I32 *result); + +void mmm_A76( + int M, int N, int K, bool transposeA, INT8 *matrix1, INT8 *matrix2, INT8 *tmp, I32 *result); +#endif diff --git a/compute/blas_enhance/src/cpu/arm/int8/mmm_A55.cpp b/compute/blas_enhance/src/cpu/arm/int8/mmm_A55.cpp new file mode 100644 index 00000000..479f726b --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/int8/mmm_A55.cpp @@ -0,0 +1,741 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
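+
+// The A55 micro-kernels below are built around the ARMv8.2 SDOT instruction:
+// each "sdot vd.4s, vn.16b, vm.4b[i]" accumulates four 4-element int8 dot
+// products into the 32-bit lanes of vd. This is why the loops consume K in
+// steps of 4 ("subs x2, x2, #4") and why the packed operands are padded to a
+// multiple of 4 via pad_to_4_multiple in mmm.cpp.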
+ +#ifdef _USE_INT8 +#include <string.h> +#include <arm_neon.h> +#include "cpu/arm/blas_arm.h" +#include "cpu/arm/int8/mmm_common.h" +#include "cpu/arm/int8/mmm.h" + +inline void mmm_4x4_A55(U32 offset, U32 K, INT8 *in, INT8 *w, I32 *out) +{ + asm volatile( + // init in->v1, w->v0 + "ldr d1, [%0]\n" + "ldr x16, [%0, 8]\n" + "ins v1.d[1], x16\n" + + "ldr d0, [%1]\n" + "ldr x17, [%1, 8]\n" + "ins v0.d[1], x17\n" + + // give in address to x3 + "mov x3, %0\n" + + // give w address to x0 + "mov x0, %1\n" + + // K->x2 + "mov x2, %3\n" + + // give out address to x26 + "mov x26, %2\n" + + // load in bias + "ldr q5, [x26]\n" + "add x26, x26, %4\n" + + "ldr q7, [x26]\n" + "add x26, x26, %4\n" + + "ldr q9, [x26]\n" + "add x26, x26, %4\n" + + "ldr q11, [x26]\n" + + // Computation loop + "0:\n" + + "ldr d3, [x3, 16]!\n" + "ldr x16, [x3, 8]\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr d29, [x0, 16]!\n" + "ldr x17, [x0, 8]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "ins v3.d[1], x16\n" + "subs x2, x2, #4\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "ins v29.d[1], x17\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + "mov v1.16b, v3.16b\n" + "mov v0.16b, v29.16b\n" + "bne 0b\n" + + "1:\n" + + // give out address to x26 + "mov x26, %2\n" + + "str q5, [x26]\n" + "add x26, x26, %4\n" + + "str q7, [x26]\n" + "add x26, x26, %4\n" + + "str q9, [x26]\n" + "add x26, x26, %4\n" + + "str q11, [x26]\n" + + : "+r"(in), "+r"(w), "+r"(out) + : "r"((I64)K), "r"((I64)offset) + : "memory", "cc", "v30", "v29", "v11", "v9", "v7", "v5", "v3", "v1", "v0", "x26", "x16", + "x17", "x3", "x2", "x0"); +} + +inline void mmm_8x4_A55(U32 offset, U32 K, INT8 *in, INT8 *w, I32 *out) +{ + asm volatile( + // init in->v1, w->v0 + "ldr q1, [%0]\n" + "ldr q0, [%1]\n" + + // give in address to x3 + "mov x3, %0\n" + + // give w address to x0 + "mov x0, %1\n" + + // K->x2 + "mov x2, %3\n" + + // give out address to x26 + "mov x26, %2\n" + + // load in bias + "ldr q5, [x26]\n" + "add x26, x26, %4\n" + + "ldr q7, [x26]\n" + "add x26, x26, %4\n" + + "ldr q9, [x26]\n" + "add x26, x26, %4\n" + + "ldr q11, [x26]\n" + "add x26, x26, %4\n" + + "ldr q13, [x26]\n" + "add x26, x26, %4\n" + + "ldr q15, [x26]\n" + "add x26, x26, %4\n" + + "ldr q17, [x26]\n" + "add x26, x26, %4\n" + + "ldr q19, [x26]\n" + + // Computation loop + "0:\n" + + "ldr d3, [x3, 16]\n" + "ldr x16, [x3, 24]\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr d29, [x0, 16]!\n" + "ldr x17, [x0, 8]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "ins v3.d[1], x16\n" + "ldr d30, [x3, 32]!\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "ins v29.d[1], x17\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + + "ldr x16, [x3, 8]\n" + "subs x2, x2, #4\n" + "sdot v13.4s, v0.16b, v3.4b[0]\n" + "ins v30.d[1], x16\n" + "sdot v15.4s, v0.16b, v3.4b[1]\n" + "sdot v17.4s, v0.16b, v3.4b[2]\n" + "mov v1.16b, v30.16b\n" + "sdot v19.4s, v0.16b, v3.4b[3]\n" + "mov v0.16b, v29.16b\n" + "bne 0b\n" + + // give out address to x26 + "mov x26, %2\n" + + "str q5, [x26]\n" + "add x26, x26, %4\n" + + "str q7, [x26]\n" + "add x26, x26, %4\n" + + "str q9, [x26]\n" + "add x26, x26, %4\n" + + "str q11, [x26]\n" + "add x26, x26, %4\n" + + "str q13, [x26]\n" + "add x26, x26, %4\n" + + "str q15, [x26]\n" + "add x26, x26, %4\n" + + "str q17, [x26]\n" + "add x26, x26, %4\n" + + "str q19, [x26]\n" + + : "+r"(in), "+r"(w), "+r"(out) + : "r"((I64)K), "r"((I64)offset) + : "memory", "cc", "v30", "v29", "v19", "v17", "v15", "v13", "v11", "v9", "v7", "v5", "v3", + "v1", "v0", "x26", "x16", "x17", "x3", "x2", "x0"); +} + +inline void mmm_4x8_A55(U32 offset, U32 K, INT8 *in, INT8 *w, I32
*out) +{ + asm volatile( + // init in->v1, w->v0 + "ldr d1, [%0]\n" + "ldr x16, [%0, 8]\n" + "ins v1.d[1], x16\n" + + "ldr d0, [%1]\n" + "ldr x17, [%1, 8]\n" + "ins v0.d[1], x17\n" + + // give in address to x3 + "mov x3, %0\n" + + // give w address to x0 + "mov x0, %1\n" + + // K->x2 + "mov x2, %3\n" + + // give out address to x26 + "mov x26, %2\n" + + "ld1 {v5.4s, v6.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v7.4s, v8.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v9.4s, v10.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v11.4s, v12.4s}, [x26]\n" + + /* Layout + * 5 6 + * 7 8 + * 9 10 + * 11 12 + */ + + // Computation loop + "0:\n" + + "ldr d29, [x0, 16]\n" + "ldr x17, [x0, 24]\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr d3, [x3, 16]!\n" + "ldr x16, [x3, 8]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "ins v29.d[1], x17\n" + "subs x2, x2, #4\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "ins v3.d[1], x16\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "ldr d0, [x0, 32]!\n" + "ldr x17, [x0, 8]\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "ins v0.d[1], x17\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + "mov v1.16b, v3.16b\n" + "bne 0b\n" + + // give out address to x26 + "mov x26, %2\n" + + "st1 {v5.4s, v6.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v7.4s, v8.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v9.4s, v10.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.4s, v12.4s}, [x26]\n" + + : "+r"(in), "+r"(w), "+r"(out) + : "r"((I64)K), "r"((I64)offset) + : "memory", "cc", "v29", "v12", "v11", "v10", "v9", "v8", "v7", "v6", "v5", "v3", "v1", + "v0", "x26", "x16", "x17", "x3", "x2", "x0"); +} + +inline void mmm_8x8_A55(U32 offset, U32 K, INT8 *in, INT8 *w, I32 *out) +{ + asm volatile( + // init in->v1, w->v0 + "ldr d1, [%0]\n" + "ldr x16, [%0, 8]\n" + "ins v1.d[1], x16\n" + + "ldr d0, [%1]\n" + "ldr x17, [%1, 8]\n" + "ins v0.d[1], x17\n" + + // give in address to x3 + "mov x3, %0\n" + + // give w address to x0 + "mov x0, %1\n" + + // K->x2 + "mov x2, %3\n" + + // give out address to x26 + "mov x26, %2\n" + + // load in bias + "ld1 {v5.4s, v6.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v7.4s, v8.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v9.4s, v10.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v11.4s, v12.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v13.4s, v14.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v15.4s, v16.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v17.4s, v18.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v19.4s, v20.4s}, [x26]\n" + + /* Layout + 5 6 + 7 8 + 9 10 + 11 12 + + 13 14 + 15 16 + 17 18 + 19 20 + */ + + // Computation loop + "0:\n" + + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr d3, [x3, 16]!\n" + "ldr x16, [x3, 8]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "ldr d29, [x0, 16]\n" + "ldr x17, [x0, 24]\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "ins v3.d[1], x16\n" + "ldr d30, [x3, 16]!\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + "ins v29.d[1], x17\n" + + "sdot v13.4s, v0.16b, v3.4b[0]\n" + "ldr x16, [x3, 8]\n" + "subs x2, x2, #4\n" + "sdot v15.4s, v0.16b, v3.4b[1]\n" + "sdot v17.4s, v0.16b, v3.4b[2]\n" + "ins v30.d[1], x16\n" + "sdot v19.4s, v0.16b, v3.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + "ldr d0, [x0, 32]!\n" + "ldr x17, [x0, 8]\n" + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + + "sdot v14.4s, v29.16b, v3.4b[0]\n" + "ins v0.d[1], x17\n" + "mov v1.16b, v30.16b\n" + "sdot v16.4s, v29.16b, v3.4b[1]\n" + "sdot v18.4s, v29.16b, v3.4b[2]\n" + "sdot
v20.4s, v29.16b, v3.4b[3]\n" + + "bne 0b\n" + + // give out address to x26 + "mov x26, %2\n" + + "st1 {v5.4s, v6.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v7.4s, v8.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v9.4s, v10.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.4s, v12.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v13.4s, v14.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v15.4s, v16.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v17.4s, v18.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v19.4s, v20.4s}, [x26]\n" + + : "+r"(in), "+r"(w), "+r"(out) + : "r"((I64)K), "r"((I64)offset) + : "memory", "cc", "v30", "v29", "v20", "v19", "v18", "v17", "v16", "v15", "v14", "v13", + "v12", "v11", "v10", "v9", "v8", "v7", "v6", "v5", "v3", "v1", "v0", "x26", "x16", "x17", + "x3", "x2", "x0"); +} + +inline void mmm_4x12_A55(U32 offset, U32 K, INT8 *in, INT8 *w, I32 *out) +{ + asm volatile( + // init in->v1, w->v0 + "ldr q1, [%0]\n" + + "ldr q0, [%1]\n" + + "ldr q29, [%1, 16]\n" // prefetch one more w + + // give in address to x3 + "mov x3, %0\n" + + // give w address to x0 + "mov x0, %1\n" + + // K->x2 + "mov x2, %3\n" + + // give out address to x26 + "mov x26, %2\n" + + // load in bias + "ld1 {v5.4s, v6.4s, v7.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v8.4s, v9.4s, v10.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v11.4s, v12.4s, v13.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v14.4s, v15.4s, v16.4s}, [x26]\n" + + /* Layout + 5 6 7 + 8 9 10 + 11 12 13 + 14 15 16 + */ + + // Computation loop + "0:\n" + // in(x3): v1 + // w(x0): v0 v29 v30 + + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr d30, [x0, 32]\n" + "sdot v8.4s, v0.16b, v1.4b[1]\n" + "ldr x16, [x0, 40]\n" + "sdot v11.4s, v0.16b, v1.4b[2]\n" + "ldr d2, [x3, 16]!\n" // input of next round + "sdot v14.4s, v0.16b, v1.4b[3]\n" + "ldr x17, [x3, 8]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "ins v30.d[1], x16\n" + "sdot v9.4s, v29.16b, v1.4b[1]\n" + "ldr d0, [x0, 48]!\n" // first w of next round + "sdot v12.4s, v29.16b, v1.4b[2]\n" + "ins v2.d[1], x17\n" + "sdot v15.4s, v29.16b, v1.4b[3]\n" + "ldr x16, [x0, 8]\n" + + "sdot v7.4s, v30.16b, v1.4b[0]\n" + "ldr d29, [x0, 16]\n" + "sdot v10.4s, v30.16b, v1.4b[1]\n" + "ldr x19, [x0, 24]\n" + "ins v0.d[1], x16\n" + "sdot v13.4s, v30.16b, v1.4b[2]\n" + "subs x2, x2, #4\n" + "sdot v16.4s, v30.16b, v1.4b[3]\n" + + "mov v1.16b, v2.16b\n" + "ins v29.d[1], x19\n" + "bne 0b\n" + + // give out address to x26 + "mov x26, %2\n" + + "st1 {v5.4s, v6.4s, v7.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v8.4s, v9.4s, v10.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.4s, v12.4s, v13.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v14.4s, v15.4s, v16.4s}, [x26]\n" + + : "+r"(in), "+r"(w), "+r"(out) + : "r"((I64)K), "r"((I64)offset) + : "memory", "cc", "v30", "v29", "v16", "v15", "v14", "v13", "v12", "v11", "v10", "v9", "v8", + "v7", "v6", "v5", "v3", "v2", "v1", "v0", "x26", "x19", "x16", "x17", "x3", "x2", "x0"); +} + +inline void mmm_8x12_A55(U32 offset, U32 K, INT8 *in, INT8 *w, I32 *out) +{ + asm volatile( + // init in->v1, w->v0 + "ldr q1, [%0]\n" + + "ldr q0, [%1]\n" + + "ldr q29, [%1, 16]\n" // prefetch one more w + + // give in address to x3 + "mov x3, %0\n" + + // give w address to x0 + "mov x0, %1\n" + + // K->x2 + "mov x2, %3\n" + + // give out address to x26 + "mov x26, %2\n" + + // load in bias + "ld1 {v5.4s, v6.4s, v7.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v8.4s, v9.4s, v10.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v11.4s, v12.4s, v13.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v14.4s, v15.4s, v16.4s}, 
[x26]\n" + "add x26, x26, %4\n" + "ld1 {v17.4s, v18.4s, v19.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v20.4s, v21.4s, v22.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v23.4s, v24.4s, v25.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v26.4s, v27.4s, v28.4s}, [x26]\n" + + /* Layout + 5 6 7 + 8 9 10 + 11 12 13 + 14 15 16 + + 17 18 19 + 20 21 22 + 23 24 25 + 26 27 28 + */ + + // Computation loop + "0:\n" + // in(x3): v1 v2 + // w(x0): v0 v29 v30 + + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr d30, [x0, 32]\n" + "sdot v8.4s, v0.16b, v1.4b[1]\n" + "ldr x16, [x0, 40]\n" + "sdot v11.4s, v0.16b, v1.4b[2]\n" + "ldr d2, [x3, 16]\n" + "sdot v14.4s, v0.16b, v1.4b[3]\n" + "ldr x17, [x3, 24]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "ins v30.d[1], x16\n" + "sdot v9.4s, v29.16b, v1.4b[1]\n" + "ldr d3, [x0, 48]!\n" // first w of next round + "sdot v12.4s, v29.16b, v1.4b[2]\n" + "ins v2.d[1], x17\n" + "sdot v15.4s, v29.16b, v1.4b[3]\n" + "ldr x16, [x0, 8]\n" + + "sdot v7.4s, v30.16b, v1.4b[0]\n" + "subs x2, x2, #4\n" + "sdot v10.4s, v30.16b, v1.4b[1]\n" + "ins v3.d[1], x16\n" + "sdot v13.4s, v30.16b, v1.4b[2]\n" + "sdot v16.4s, v30.16b, v1.4b[3]\n" + + "sdot v17.4s, v0.16b, v2.4b[0]\n" + "ldr d1, [x3, 32]!\n" + "sdot v20.4s, v0.16b, v2.4b[1]\n" + "ldr x17, [x3, 8]\n" + "sdot v23.4s, v0.16b, v2.4b[2]\n" + "sdot v26.4s, v0.16b, v2.4b[3]\n" + + "sdot v18.4s, v29.16b, v2.4b[0]\n" + "mov v0.16b, v3.16b\n" + "sdot v21.4s, v29.16b, v2.4b[1]\n" + "ins v1.d[1], x17\n" + "sdot v24.4s, v29.16b, v2.4b[2]\n" + "sdot v27.4s, v29.16b, v2.4b[3]\n" + + "sdot v19.4s, v30.16b, v2.4b[0]\n" + "ldr d29, [x0, 16]\n" + "sdot v22.4s, v30.16b, v2.4b[1]\n" + "ldr x16, [x0, 24]\n" + "sdot v25.4s, v30.16b, v2.4b[2]\n" + "sdot v28.4s, v30.16b, v2.4b[3]\n" + "ins v29.d[1], x16\n" + + "bne 0b\n" + + // give out address to x26 + "mov x26, %2\n" + + "st1 {v5.4s, v6.4s, v7.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v8.4s, v9.4s, v10.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.4s, v12.4s, v13.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v14.4s, v15.4s, v16.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v17.4s, v18.4s, v19.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v20.4s, v21.4s, v22.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v23.4s, v24.4s, v25.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v26.4s, v27.4s, v28.4s}, [x26]\n" + + : "+r"(in), "+r"(w), "+r"(out) + : "r"((I64)K), "r"((I64)offset) + : "memory", "cc", "v30", "v29", "v28", "v27", "v26", "v25", "v24", "v23", "v22", "v21", + "v20", "v19", "v18", "v17", "v16", "v15", "v14", "v13", "v12", "v11", "v10", "v9", "v8", + "v7", "v6", "v5", "v3", "v2", "v1", "v0", "x26", "x16", "x17", "x3", "x2", "x0"); +} + +void mmm_A55( + int M, int N, int K, bool transposeA, INT8 *matrix1, INT8 *matrix2, INT8 *tmp, I32 *result) +{ + int blockK = K; + int K4 = pad_to_4_multiple(K); + int blockM = 96; + INT8 *matrix1Trans = tmp; + I32 *resultCurrent = result; + + int KInner, MInner, m, n; + for (int k = 0; k < K; k += blockK) { + KInner = UNI_MIN(blockK, K - k); // K for this inner iteration + for (int i = 0; i < M; i += blockM) { + MInner = UNI_MIN(blockM, M - i); // M for this inner iteration + for (n = 0; n <= N - 8; n += 8) { + if (i == 0) { + if (transposeA) { + matrix2_trans_int8(8, KInner, N, matrix1 + n, matrix1Trans + n * K4); + } else { + matrix1_trans_n8(KInner, K, matrix1 + n * K + k, matrix1Trans + n * K4); + } + } + + for (m = 0; m <= (MInner - 12); m += 12) { + resultCurrent = result + n * M + m + i; + mmm_8x12_A55( + M * 4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, 
resultCurrent); + } + for (; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_8x8_A55( + M * 4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + } + + if ((MInner - m) >= 4) { + resultCurrent = result + n * M + m + i; + mmm_8x4_A55( + M * 4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + m += 4; + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_N8_MTail(MInner - m, M, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, + resultCurrent); + } + } + + if ((N - n) >= 4) { + if (i == 0) { + if (transposeA) { + matrix2_trans_int8(4, KInner, N, matrix1 + n, matrix1Trans + n * K4); + } else { + matrix1_trans_int8(4, KInner, K, matrix1 + n * K + k, matrix1Trans + n * K4); + } + } + + for (m = 0; m <= (MInner - 12); m += 12) { + resultCurrent = result + n * M + m + i; + mmm_4x12_A55( + M * 4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + } + + for (; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_4x8_A55( + M * 4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + } + + if ((MInner - m) >= 4) { + resultCurrent = result + n * M + m + i; + mmm_4x4_A55( + M * 4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + m += 4; + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_N4_MTail(MInner - m, M, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, + resultCurrent); + } + n += 4; + } + + if (N - n) { + if (i == 0) { + if (transposeA) { + matrix2_trans_int8(N - n, KInner, N, matrix1 + n, matrix1Trans + n * K4); + } else { + matrix1_trans_int8( + N - n, KInner, K, matrix1 + n * K + k, matrix1Trans + n * K4); + } + } + + for (m = 0; m <= (MInner - 12); m += 12) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M12( + M, N - n, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + } + + for (; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M8( + M, N - n, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + } + + if ((MInner - m) >= 4) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M4( + M, N - n, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + m += 4; + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M(MInner - m, M, N - n, K4, matrix1Trans + n * K4, + matrix2 + (i + m) * K4, resultCurrent); + } + } + } + } +} +#endif diff --git a/compute/blas_enhance/src/cpu/arm/int8/mmm_A76.cpp b/compute/blas_enhance/src/cpu/arm/int8/mmm_A76.cpp new file mode 100644 index 00000000..5abe7852 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/int8/mmm_A76.cpp @@ -0,0 +1,685 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifdef _USE_INT8 +#include <string.h> +#include <arm_neon.h> +#include "cpu/arm/blas_arm.h" +#include "cpu/arm/int8/mmm_common.h" +#include "cpu/arm/int8/mmm.h" + +inline void mmm_4x4_A76(U32 offset, U32 K, INT8 *in, INT8 *w, I32 *out) +{ + asm volatile( + // init in->v1, w->v0 + "ldr q1, [%0]\n" + + "ldr q0, [%1]\n" + + // give in address to x3 + "mov x3, %0\n" + + // give w address to x0 + "mov x0, %1\n" + + // K->x2 + "mov x2, %3\n" + + // give out address to x26 + "mov x26, %2\n" + + // load in bias + "ldr q5, [x26]\n" + "add x26, x26, %4\n" + + "ldr q7, [x26]\n" + "add x26, x26, %4\n" + + "ldr q9, [x26]\n" + "add x26, x26, %4\n" + + "ldr q11, [x26]\n" + + // Computation loop + "0:\n" + + "ldr q3, [x3, 16]!\n" + "ldr q29, [x0, 16]!\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "subs x2, x2, #4\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + "mov v1.16b, v3.16b\n" + "mov v0.16b, v29.16b\n" + "bne 0b\n" + + "1:\n" + + // give out address to x26 + "mov x26, %2\n" + + "str q5, [x26]\n" + "add x26, x26, %4\n" + + "str q7, [x26]\n" + "add x26, x26, %4\n" + + "str q9, [x26]\n" + "add x26, x26, %4\n" + + "str q11, [x26]\n" + + : "+r"(in), "+r"(w), "+r"(out) + : "r"((I64)K), "r"((I64)offset) + : "memory", "cc", "v30", "v29", "v11", "v9", "v7", "v5", "v3", "v1", "v0", "x26", "x3", + "x2", "x0"); +} + +inline void mmm_8x4_A76(U32 offset, U32 K, INT8 *in, INT8 *w, I32 *out) +{ + asm volatile( + // init in->v1, w->v0 + "ldr q1, [%0]\n" + + "ldr q0, [%1]\n" + + // give in address to x3 + "mov x3, %0\n" + + // give w address to x0 + "mov x0, %1\n" + + // K->x2 + "mov x2, %3\n" + + // give out address to x26 + "mov x26, %2\n" + + // load in bias + "ldr q5, [x26]\n" + "add x26, x26, %4\n" + + "ldr q7, [x26]\n" + "add x26, x26, %4\n" + + "ldr q9, [x26]\n" + "add x26, x26, %4\n" + + "ldr q11, [x26]\n" + "add x26, x26, %4\n" + + "ldr q13, [x26]\n" + "add x26, x26, %4\n" + + "ldr q15, [x26]\n" + "add x26, x26, %4\n" + + "ldr q17, [x26]\n" + "add x26, x26, %4\n" + + "ldr q19, [x26]\n" + + // Computation loop + "0:\n" + + "ldr q3, [x3, 16]\n" + "ldr q29, [x0, 16]!\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + + "subs x2, x2, #4\n" + "sdot v13.4s, v0.16b, v3.4b[0]\n" + "sdot v15.4s, v0.16b, v3.4b[1]\n" + "ldr q1, [x3, 32]!\n" + "sdot v17.4s, v0.16b, v3.4b[2]\n" + "sdot v19.4s, v0.16b, v3.4b[3]\n" + "mov v0.16b, v29.16b\n" + "bne 0b\n" + + // give out address to x26 + "mov x26, %2\n" + + "str q5, [x26]\n" + "add x26, x26, %4\n" + + "str q7, [x26]\n" + "add x26, x26, %4\n" + + "str q9, [x26]\n" + "add x26, x26, %4\n" + + "str q11, [x26]\n" + "add x26, x26, %4\n" + + "str q13, [x26]\n" + "add x26, x26, %4\n" + + "str q15, [x26]\n" + "add x26, x26, %4\n" + + "str q17, [x26]\n" + "add x26, x26, %4\n" + + "str q19, [x26]\n" + + : "+r"(in), "+r"(w), "+r"(out) + : "r"((I64)K), "r"((I64)offset) + : "memory", "cc", "v30", "v29", "v19", "v17", "v15", "v13", "v11", "v9", "v7", "v5", "v3", + "v1", "v0",
"x26", "x3", "x2", "x0"); +} + +inline void mmm_4x8_A76(U32 offset, U32 K, INT8 *in, INT8 *w, I32 *out) +{ + asm volatile( + // init in- > v1, w- > v0 + "ldr q1, [%0]\n" + + "ldr q0, [%1]\n" + + // give in address to x3 + "mov x3, %0\n" + + // give w address to x0 + "mov x0, %1\n" + + // K- > x2 + "mov x2, %3\n" + + // give out address to x26 + "mov x26, %2\n" + + "ld1 {v5.4s, v6.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v7.4s, v8.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v9.4s, v10.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v11.4s, v12.4s}, [x26]\n" + + /* Layout + * 5 6 + * 7 8 + * 9 10 + * 11 12 + */ + + // Computation loop + "0:\n" + + "ldr q29, [x0, 16]\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr q3, [x3, 16]!\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "subs x2, x2, #4\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "ldr q0, [x0, 32]!\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + "mov v1.16b, v3.16b\n" + "bne 0b\n" + + // give out address to x26 + "mov x26, %2\n" + + "st1 {v5.4s, v6.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v7.4s, v8.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v9.4s, v10.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.4s, v12.4s}, [x26]\n" + + : "+r"(in), "+r"(w), "+r"(out) + : "r"((I64)K), "r"((I64)offset) + : "memory", "cc", "v29", "v12", "v11", "v10", "v9", "v8", "v7", "v6", "v5", "v3", "v1", + "v0", "x26", "x3", "x2", "x0"); +} + +inline void mmm_8x8_A76(U32 offset, U32 K, INT8 *in, INT8 *w, I32 *out) +{ + asm volatile( + // init in- > v1, w- > v0 + "ldr q1, [%0]\n" + + "ldr q0, [%1]\n" + + // give in address to x3 + "mov x3, %0\n" + + // give w address to x0 + "mov x0, %1\n" + + // K- > x2 + "mov x2, %3\n" + + // give out address to x26 + "mov x26, %2\n" + + // load in bias + "ld1 {v5.4s, v6.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v7.4s, v8.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v9.4s, v10.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v11.4s, v12.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v13.4s, v14.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v15.4s, v16.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v17.4s, v18.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v19.4s, v20.4s}, [x26]\n" + + /* Layout + 5 6 + 7 8 + 9 10 + 11 12 + + 13 14 + 15 16 + 17 18 + 19 20 + */ + + // Computation loop + "0:\n" + + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr q3, [x3, 16]!\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "ldr q29, [x0, 16]\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + + "sdot v13.4s, v0.16b, v3.4b[0]\n" + "subs x2, x2, #4\n" + "sdot v15.4s, v0.16b, v3.4b[1]\n" + "sdot v17.4s, v0.16b, v3.4b[2]\n" + "sdot v19.4s, v0.16b, v3.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + "ldr q0, [x0, 32]!\n" + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + + "sdot v14.4s, v29.16b, v3.4b[0]\n" + "sdot v16.4s, v29.16b, v3.4b[1]\n" + "ldr q1, [x3, 16]!\n" + "sdot v18.4s, v29.16b, v3.4b[2]\n" + "sdot v20.4s, v29.16b, v3.4b[3]\n" + + "bne 0b\n" + + // give out address to x26 + "mov x26, %2\n" + + "st1 {v5.4s, v6.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v7.4s, v8.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v9.4s, v10.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.4s, v12.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v13.4s, v14.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v15.4s, v16.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v17.4s, v18.4s}, 
[x26]\n" + "add x26, x26, %4\n" + "st1 {v19.4s, v20.4s}, [x26]\n" + + : "+r"(in), "+r"(w), "+r"(out) + : "r"((I64)K), "r"((I64)offset) + : "memory", "cc", "v29", "v20", "v19", "v18", "v17", "v16", "v15", "v14", "v13", "v12", + "v11", "v10", "v9", "v8", "v7", "v6", "v5", "v3", "v1", "v0", "x26", "x3", "x2", "x0"); +} + +inline void mmm_4x12_A76(U32 offset, U32 K, INT8 *in, INT8 *w, I32 *out) +{ + asm volatile( + // init in->v1, w->v0 + "ldr q1, [%0]\n" + + "ldr q0, [%1]\n" + + "ldr q29, [%1, 16]\n" // prefetch one more w + + // give in address to x3 + "mov x3, %0\n" + + // give w address to x0 + "mov x0, %1\n" + + // K->x2 + "mov x2, %3\n" + + // give out address to x26 + "mov x26, %2\n" + + // load in bias + "ld1 {v5.4s, v6.4s, v7.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v8.4s, v9.4s, v10.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v11.4s, v12.4s, v13.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v14.4s, v15.4s, v16.4s}, [x26]\n" + + /* Layout + 5 6 7 + 8 9 10 + 11 12 13 + 14 15 16 + */ + + // Computation loop + "0:\n" + // in(x3): v1 + // w(x0): v0 v29 v30 + + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr q30, [x0, 32]\n" + "sdot v8.4s, v0.16b, v1.4b[1]\n" + "sdot v11.4s, v0.16b, v1.4b[2]\n" + "ldr q2, [x3, 16]!\n" // input of next round + "sdot v14.4s, v0.16b, v1.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "sdot v9.4s, v29.16b, v1.4b[1]\n" + "ldr q0, [x0, 48]!\n" // first w of next round + "sdot v12.4s, v29.16b, v1.4b[2]\n" + "sdot v15.4s, v29.16b, v1.4b[3]\n" + + "sdot v7.4s, v30.16b, v1.4b[0]\n" + "ldr q29, [x0, 16]\n" + "sdot v10.4s, v30.16b, v1.4b[1]\n" + "sdot v13.4s, v30.16b, v1.4b[2]\n" + "subs x2, x2, #4\n" + "sdot v16.4s, v30.16b, v1.4b[3]\n" + + "mov v1.16b, v2.16b\n" + "bne 0b\n" + + // give out address to x26 + "mov x26, %2\n" + + "st1 {v5.4s, v6.4s, v7.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v8.4s, v9.4s, v10.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.4s, v12.4s, v13.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v14.4s, v15.4s, v16.4s}, [x26]\n" + + : "+r"(in), "+r"(w), "+r"(out) + : "r"((I64)K), "r"((I64)offset) + : "memory", "cc", "v30", "v29", "v16", "v15", "v14", "v13", "v12", "v11", "v10", "v9", "v8", + "v7", "v6", "v5", "v3", "v2", "v1", "v0", "x26", "x19", "x3", "x2", "x0"); +} + +inline void mmm_8x12_A76(U32 offset, U32 K, INT8 *in, INT8 *w, I32 *out) +{ + asm volatile( + // init in->v1, w->v0 + "ldr q1, [%0]\n" + + "ldr q0, [%1]\n" + + "ldr q29, [%1, 16]\n" // prefetch one more w + + // give in address to x3 + "mov x3, %0\n" + + // give w address to x0 + "mov x0, %1\n" + + // K->x2 + "mov x2, %3\n" + + // give out address to x26 + "mov x26, %2\n" + + // load in bias + "ld1 {v5.4s, v6.4s, v7.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v8.4s, v9.4s, v10.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v11.4s, v12.4s, v13.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v14.4s, v15.4s, v16.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v17.4s, v18.4s, v19.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v20.4s, v21.4s, v22.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v23.4s, v24.4s, v25.4s}, [x26]\n" + "add x26, x26, %4\n" + "ld1 {v26.4s, v27.4s, v28.4s}, [x26]\n" + + /* Layout + 5 6 7 + 8 9 10 + 11 12 13 + 14 15 16 + + 17 18 19 + 20 21 22 + 23 24 25 + 26 27 28 + */ + + // Computation loop + "0:\n" + // in(x3): v1 v2 + // w(x0): v0 v29 v30 + + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr q30, [x0, 32]\n" + "sdot v8.4s, v0.16b, v1.4b[1]\n" + "sdot v11.4s, v0.16b, v1.4b[2]\n" + "ldr q2, [x3, 16]\n" + "sdot v14.4s, v0.16b, v1.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + 
"sdot v9.4s, v29.16b, v1.4b[1]\n" + "ldr q3, [x0, 48]!\n" // first w of next round + "sdot v12.4s, v29.16b, v1.4b[2]\n" + "sdot v15.4s, v29.16b, v1.4b[3]\n" + + "sdot v7.4s, v30.16b, v1.4b[0]\n" + "subs x2, x2, #4\n" + "sdot v10.4s, v30.16b, v1.4b[1]\n" + "sdot v13.4s, v30.16b, v1.4b[2]\n" + "sdot v16.4s, v30.16b, v1.4b[3]\n" + + "sdot v17.4s, v0.16b, v2.4b[0]\n" + "ldr q1, [x3, 32]!\n" + "sdot v20.4s, v0.16b, v2.4b[1]\n" + "sdot v23.4s, v0.16b, v2.4b[2]\n" + "sdot v26.4s, v0.16b, v2.4b[3]\n" + + "sdot v18.4s, v29.16b, v2.4b[0]\n" + "mov v0.16b, v3.16b\n" + "sdot v21.4s, v29.16b, v2.4b[1]\n" + "sdot v24.4s, v29.16b, v2.4b[2]\n" + "sdot v27.4s, v29.16b, v2.4b[3]\n" + + "sdot v19.4s, v30.16b, v2.4b[0]\n" + "ldr q29, [x0, 16]\n" + "sdot v22.4s, v30.16b, v2.4b[1]\n" + "sdot v25.4s, v30.16b, v2.4b[2]\n" + "sdot v28.4s, v30.16b, v2.4b[3]\n" + + "bne 0b\n" + + // give out address to x26 + "mov x26, %2\n" + + "st1 {v5.4s, v6.4s, v7.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v8.4s, v9.4s, v10.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v11.4s, v12.4s, v13.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v14.4s, v15.4s, v16.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v17.4s, v18.4s, v19.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v20.4s, v21.4s, v22.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v23.4s, v24.4s, v25.4s}, [x26]\n" + "add x26, x26, %4\n" + "st1 {v26.4s, v27.4s, v28.4s}, [x26]\n" + + : "+r"(in), "+r"(w), "+r"(out) + : "r"((I64)K), "r"((I64)offset) + : "memory", "cc", "v30", "v29", "v28", "v27", "v26", "v25", "v24", "v23", "v22", "v21", + "v20", "v19", "v18", "v17", "v16", "v15", "v14", "v13", "v12", "v11", "v10", "v9", "v8", + "v7", "v6", "v5", "v3", "v2", "v1", "v0", "x26", "x3", "x2", "x0"); +} + +void mmm_A76( + int M, int N, int K, bool transposeA, INT8 *matrix1, INT8 *matrix2, INT8 *tmp, I32 *result) +{ + int blockK = K; + U32 K4 = pad_to_4_multiple(K); + int blockM = 96; + INT8 *matrix1Trans = tmp; + I32 *resultCurrent = result; + + int KInner, MInner, m, n; + for (int k = 0; k < K; k += blockK) { + KInner = UNI_MIN(blockK, K - k); // K for this inner iteration + for (int i = 0; i < M; i += blockM) { + MInner = UNI_MIN(blockM, M - i); // M for this inner iteration + for (n = 0; n <= N - 8; n += 8) { + if (i == 0) { + if (transposeA) { + matrix2_trans_int8(8, KInner, N, matrix1 + n, matrix1Trans + n * K4); + } else { + matrix1_trans_n8(KInner, K, matrix1 + n * K + k, matrix1Trans + n * K4); + } + } + + for (m = 0; m <= (MInner - 12); m += 12) { + resultCurrent = result + n * M + m + i; + mmm_8x12_A76( + M * 4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + } + for (; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_8x8_A76( + M * 4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + } + + if ((MInner - m) >= 4) { + resultCurrent = result + n * M + m + i; + mmm_8x4_A76( + M * 4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + m += 4; + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_N8_MTail(MInner - m, M, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, + resultCurrent); + } + } + + if ((N - n) >= 4) { + if (i == 0) { + if (transposeA) { + matrix2_trans_int8(4, KInner, N, matrix1 + n, matrix1Trans + n * K4); + } else { + matrix1_trans_int8(4, KInner, K, matrix1 + n * K + k, matrix1Trans + n * K4); + } + } + + for (m = 0; m <= (MInner - 12); m += 12) { + resultCurrent = result + n * M + m + i; + mmm_4x12_A76( + M * 4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, 
resultCurrent); + } + + for (; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_4x8_A76( + M * 4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + } + + if ((MInner - m) >= 4) { + resultCurrent = result + n * M + m + i; + mmm_4x4_A76( + M * 4, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + m += 4; + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_N4_MTail(MInner - m, M, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, + resultCurrent); + } + n += 4; + } + + if (N - n) { + if (i == 0) { + if (transposeA) { + matrix2_trans_int8(N - n, KInner, N, matrix1 + n, matrix1Trans + n * K4); + } else { + matrix1_trans_int8( + N - n, KInner, K, matrix1 + n * K + k, matrix1Trans + n * K4); + } + } + + for (m = 0; m <= (MInner - 12); m += 12) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M12( + M, N - n, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + } + + for (; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M8( + M, N - n, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + } + + if ((MInner - m) >= 4) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M4( + M, N - n, K4, matrix1Trans + n * K4, matrix2 + (i + m) * K4, resultCurrent); + m += 4; + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M(MInner - m, M, N - n, K4, matrix1Trans + n * K4, + matrix2 + (i + m) * K4, resultCurrent); + } + } + } + } +} +#endif diff --git a/compute/blas_enhance/src/cpu/arm/int8/mmm_common.h b/compute/blas_enhance/src/cpu/arm/int8/mmm_common.h new file mode 100644 index 00000000..b5c213c2 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/int8/mmm_common.h @@ -0,0 +1,455 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
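+ +// Reviewer note (inferred from the transforms below): matrix1_trans_* repack the LHS from row-major NK into NKn(size)k4 and matrix2_trans_* repack the RHS from KM into MKm(size)k4, zero-padding K to a multiple of 4 so sdot always sees full groups of four int8 values; the mmm_*Tail helpers cover sub-tile edges with vdot intrinsics instead of hand-written assembly.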
+ +#ifndef _H_MMM_COMMON +#define _H_MMM_COMMON + +#ifdef _USE_INT8 +#include <string.h> +#include <arm_neon.h> + +#include "types.h" +#include "error.h" +#include "arm_neon_expand.h" + +inline void matrix1_trans_n8(U32 blockK, U32 K, INT8 *src, INT8 *dst) +{ + // Move k4 as one I32 + I32 *dst1 = (I32 *)dst; + + I32 *in[8]; + for (U32 i = 0; i < 8; i++) { + in[i] = (I32 *)(src + i * K); + } + U32 k = 0; + for (; k < blockK - 7; k += 8) { + if (k % 64 == 0) { + asm volatile("prfm pldl2keep, [%[in0], 64]\n" + "prfm pldl2keep, [%[in1], 64]\n" + "prfm pldl2keep, [%[in2], 64]\n" + "prfm pldl2keep, [%[in3], 64]\n" + "prfm pldl2keep, [%[in4], 64]\n" + "prfm pldl2keep, [%[in5], 64]\n" + "prfm pldl2keep, [%[in6], 64]\n" + "prfm pldl2keep, [%[in7], 64]\n" + : [in0] "+r"(in[0]), [in1] "+r"(in[1]), [in2] "+r"(in[2]), + [in3] "+r"(in[3]), [in4] "+r"(in[4]), [in5] "+r"(in[5]), [in6] "+r"(in[6]), + [in7] "+r"(in[7]) + : + : "memory", "cc"); + } + asm volatile("ldr d0, [%[in0]], 8\n" + "ldr d1, [%[in1]], 8\n" + "ldr d2, [%[in2]], 8\n" + "ldr d3, [%[in3]], 8\n" + "ldr d4, [%[in4]], 8\n" + "ldr d5, [%[in5]], 8\n" + "ldr d6, [%[in6]], 8\n" + "ldr d7, [%[in7]], 8\n" + + "zip1 v8.2s, v0.2s, v1.2s\n" + "zip2 v12.2s, v0.2s, v1.2s\n" + "zip1 v9.2s, v2.2s, v3.2s\n" + "zip2 v13.2s, v2.2s, v3.2s\n" + "zip1 v10.2s, v4.2s, v5.2s\n" + "zip2 v14.2s, v4.2s, v5.2s\n" + "zip1 v11.2s, v6.2s, v7.2s\n" + "zip2 v15.2s, v6.2s, v7.2s\n" + + "str d8, [%[out]]\n" + "str d9, [%[out], 8]\n" + "str d10, [%[out], 16]\n" + "str d11, [%[out], 24]\n" + "str d12, [%[out], 32]\n" + "str d13, [%[out], 40]\n" + "str d14, [%[out], 48]\n" + "str d15, [%[out], 56]\n" + : [in0] "+r"(in[0]), [in1] "+r"(in[1]), [in2] "+r"(in[2]), [in3] "+r"(in[3]), + [in4] "+r"(in[4]), [in5] "+r"(in[5]), [in6] "+r"(in[6]), [in7] "+r"(in[7]) + : [out] "r"(dst1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15"); + dst1 += 16; + } + + if (k < blockK - 3) { + for (U32 i = 0; i < 8; i++) { + dst1[0] = in[i][0]; + dst1++; + in[i]++; + } + k += 4; + } + + if (k < blockK) { + U32 kTail = blockK - k; + INT8 *dstI8 = (INT8 *)dst1; + INT8 *inI[8]; + for (U32 i = 0; i < 8; i++) { + inI[i] = (INT8 *)in[i]; + } + for (U32 i = 0; i < 8; i++) { + for (U32 j = 0; j < 4; j++) { + if (j < kTail) { + dstI8[i * 4 + j] = inI[i][j]; + } else { + dstI8[i * 4 + j] = 0; + } + } + } + } +} + +// Trans from NK to NKn(size)k4 +inline void matrix1_trans_int8(U32 size, U32 blockK, U32 K, INT8 *src, INT8 *dst) +{ + // Move k4 as one I32 + I32 *src1; + I32 *dst1 = (I32 *)dst; + U32 offset = 64; + + U32 i = 0; + for (; i < blockK / 4; i++) { + for (U32 j = 0; j < size; j++) { + src1 = (I32 *)(src + j * K); + + if (i % 16 == 0) { + asm volatile("prfm pldl2keep, [%0, %1]\n" + : "+r"(src1) + : "r"((I64)offset) + : "memory", "cc"); + } + *dst1++ = *(src1 + i); + } + } + U32 kTail = blockK % 4; + if (kTail > 0) { + INT8 *srcI8; + INT8 *dstI8 = (INT8 *)dst1; + for (U32 j = 0; j < size; j++) { + srcI8 = src + j * K + i * 4; + for (U32 k = 0; k < 4; k++) { + if (k < kTail) { + dstI8[j * 4 + k] = srcI8[k]; + } else { + dstI8[j * 4 + k] = 0; + } + } + } + } +} + +inline void matrix2_trans_m12(U32 blockK, U32 M, INT8 *src, INT8 *dst) +{ + INT8 *src1 = src; + INT8 *dst1 = dst; + U32 offset = 4 * M; + + U32 i = 0; + for (; i < blockK - 3; i += 4) { + // Prefetch for the next iteration + asm volatile("prfm pldl2keep, [%0, %1]\n" : "+r"(src1) : "r"((I64)offset) : "memory", "cc"); + + INT8 *in12[4]; + for (U32 j = 0; j < 4; j++) { + in12[j] = src1 + j * M; + } +
src1 += offset; + + asm volatile("ldr d0, [%[in0]]\n" + "ldr d1, [%[in1]]\n" + "ldr d2, [%[in2]]\n" + "ldr d3, [%[in3]]\n" + "zip1 v4.8b, v0.8b, v1.8b\n" + "zip2 v5.8b, v0.8b, v1.8b\n" + "zip1 v6.8b, v2.8b, v3.8b\n" + "zip2 v7.8b, v2.8b, v3.8b\n" + + "zip1 v0.4h, v4.4h, v6.4h\n" + "zip2 v1.4h, v4.4h, v6.4h\n" + "zip1 v2.4h, v5.4h, v7.4h\n" + "zip2 v3.4h, v5.4h, v7.4h\n" + "str d0, [%[out]]\n" + "str d1, [%[out], 8]\n" + "str d2, [%[out], 16]\n" + "str d3, [%[out], 24]\n" + : + : [in0] "r"(in12[0]), [in1] "r"(in12[1]), [in2] "r"(in12[2]), + [in3] "r"(in12[3]), [out] "r"(dst1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); + + for (U32 j = 0; j < 4; j++) { + for (U32 k = 0; k < 4; k++) { + dst1[32 + j * 4 + k] = in12[k][8 + j]; + } + } + + dst1 += 48; + } + if (i < blockK) { + U32 kTail = blockK - i; + + INT8 *in12[4]; + INT8 zero[12] = {0}; + for (U32 j = 0; j < 4; j++) { + if (j < kTail) { + in12[j] = src1 + j * M; + } else { + in12[j] = zero; + } + } + + asm volatile("ldr d0, [%[in0]]\n" + "ldr d1, [%[in1]]\n" + "ldr d2, [%[in2]]\n" + "ldr d3, [%[in3]]\n" + "zip1 v4.8b, v0.8b, v1.8b\n" + "zip2 v5.8b, v0.8b, v1.8b\n" + "zip1 v6.8b, v2.8b, v3.8b\n" + "zip2 v7.8b, v2.8b, v3.8b\n" + + "zip1 v0.4h, v4.4h, v6.4h\n" + "zip2 v1.4h, v4.4h, v6.4h\n" + "zip1 v2.4h, v5.4h, v7.4h\n" + "zip2 v3.4h, v5.4h, v7.4h\n" + "str d0, [%[out]]\n" + "str d1, [%[out], 8]\n" + "str d2, [%[out], 16]\n" + "str d3, [%[out], 24]\n" + : + : [in0] "r"(in12[0]), [in1] "r"(in12[1]), [in2] "r"(in12[2]), + [in3] "r"(in12[3]), [out] "r"(dst1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); + + for (U32 j = 0; j < 4; j++) { + for (U32 k = 0; k < 4; k++) { + dst1[32 + j * 4 + k] = in12[k][8 + j]; + } + } + } +} + +// Trans from KM to MKm(size)k4 +inline void matrix2_trans_int8(U32 size, U32 blockK, U32 M, INT8 *src, INT8 *dst) +{ + INT8 *src1 = src; + INT8 *dst1 = dst; + U32 offset = 4 * M; + + U32 i = 0; + for (; i < blockK - 3; i += 4) { + src1 = src + i * M; + asm volatile("prfm pldl2keep, [%0, %1]\n" : "+r"(src1) : "r"((I64)offset) : "memory", "cc"); + for (U32 j = 0; j < size; j++) { + src1 = src + i * M + j; + for (U32 k = 0; k < 4; k++) { + *dst1 = *src1; + dst1++; + src1 += M; + } + } + } + if (i < blockK) { + U32 kTail = blockK - i; + for (U32 j = 0; j < size; j++) { + src1 = src + i * M + j; + for (U32 k = 0; k < 4; k++) { + if (k < kTail) { + *dst1 = *src1; + dst1++; + src1 += M; + } else { + *dst1 = 0; + dst1++; + } + } + } + } +} + +inline void mmm_N8_MTail(U32 MInner, U32 M, U32 K, INT8 *matrix1, INT8 *matrix2, I32 *result) +{ + int8x16_t mat1[2]; + int8x16_t mat2; + int32x4_t res[4][2] = {{0}}; + I32 tmp[8] = {0}; + + CHECK_REQUIREMENT(MInner < 4); + + for (U32 i = 0; i < K; i += 4) { + mat1[0] = vld1q_s8(matrix1 + i * 8); + mat1[1] = vld1q_s8(matrix1 + i * 8 + 16); + + mat2 = vld1q_s8(matrix2 + i * MInner); + + for (U32 j = 0; j < MInner; j++) { + res[j][0] = vdotq_laneq_s32_builtin(res[j][0], mat1[0], mat2, j); + res[j][1] = vdotq_laneq_s32_builtin(res[j][1], mat1[1], mat2, j); + } + } + for (U32 p = 0; p < MInner; p++) { + vst1q_s32(tmp, res[p][0]); + vst1q_s32(tmp + 4, res[p][1]); + for (U32 q = 0; q < 8; q++) { + result[q * M + p] += tmp[q]; + } + res[p][0] = vdupq_n_s32(0); + res[p][1] = vdupq_n_s32(0); + } +} + +inline void mmm_N4_MTail(U32 MInner, U32 M, U32 K, INT8 *matrix1, INT8 *matrix2, I32 *result) +{ + int8x16_t mat1 = {0}; + int8x16_t mat2 = {0}; + int32x4_t res[4] = {0}; + I32 tmp[8] = {0}; + + CHECK_REQUIREMENT(MInner < 4); + + for (U32 i = 0; i < K; i 
+= 4) { + mat1 = vld1q_s8(matrix1 + i * 4); + + mat2 = vld1q_s8(matrix2 + i * MInner); + + for (U32 j = 0; j < MInner; j++) { + res[j] = vdotq_laneq_s32_builtin(res[j], mat1, mat2, j); + } + } + for (U32 p = 0; p < MInner; p++) { + vst1q_s32(tmp, res[p]); + for (U32 q = 0; q < 4; q++) { + result[q * M + p] += tmp[q]; + } + res[p] = vdupq_n_s32(0); + } +} + +inline void mmm_NTail_M12(U32 M, U32 N, U32 K, INT8 *matrix1, INT8 *matrix2, I32 *result) +{ + int8x16_t mat1 = {0}; + int8x16_t mat2[3] = {0}; + int32x4_t res[4][3] = {{0}}; + + for (U32 i = 0; i < N; i++) { + res[i][0] = vld1q_s32(result + i * M); + res[i][1] = vld1q_s32(result + i * M + 4); + res[i][2] = vld1q_s32(result + i * M + 8); + } + + for (U32 q = 0; q < K; q += 4) { + mat1 = vld1q_s8(matrix1 + q * N); + + mat2[0] = vld1q_s8(matrix2 + q * 12); + mat2[1] = vld1q_s8(matrix2 + q * 12 + 16); + mat2[2] = vld1q_s8(matrix2 + q * 12 + 32); + + for (U32 n = 0; n < N; n++) { + res[n][0] = vdotq_laneq_s32_builtin(res[n][0], mat2[0], mat1, n); + res[n][1] = vdotq_laneq_s32_builtin(res[n][1], mat2[1], mat1, n); + res[n][2] = vdotq_laneq_s32_builtin(res[n][2], mat2[2], mat1, n); + } + } + + for (U32 i = 0; i < N; i++) { + vst1q_s32(result + i * M, res[i][0]); + vst1q_s32(result + i * M + 4, res[i][1]); + vst1q_s32(result + i * M + 8, res[i][2]); + } +} + +inline void mmm_NTail_M8(U32 M, U32 N, U32 K, INT8 *matrix1, INT8 *matrix2, I32 *result) +{ + int8x16_t mat1 = {0}; + int8x16_t mat2[2] = {0}; + int32x4_t res[4][2] = {{0}}; + + for (U32 i = 0; i < N; i++) { + res[i][0] = vld1q_s32(result + i * M); + res[i][1] = vld1q_s32(result + i * M + 4); + } + + for (U32 q = 0; q < K; q += 4) { + mat1 = vld1q_s8(matrix1 + q * N); + + mat2[0] = vld1q_s8(matrix2 + q * 8); + mat2[1] = vld1q_s8(matrix2 + q * 8 + 16); + + for (U32 n = 0; n < N; n++) { + res[n][0] = vdotq_laneq_s32_builtin(res[n][0], mat2[0], mat1, n); + res[n][1] = vdotq_laneq_s32_builtin(res[n][1], mat2[1], mat1, n); + } + } + + for (U32 i = 0; i < N; i++) { + vst1q_s32(result + i * M, res[i][0]); + vst1q_s32(result + i * M + 4, res[i][1]); + } +} + +inline void mmm_NTail_M4(U32 M, U32 N, U32 K, INT8 *matrix1, INT8 *matrix2, I32 *result) +{ + int8x16_t mat1 = {0}; + int8x16_t mat2 = {0}; + int32x4_t res[4] = {0}; + + for (U32 i = 0; i < N; i++) { + res[i] = vld1q_s32(result + i * M); + } + + for (U32 q = 0; q < K; q += 4) { + mat1 = vld1q_s8(matrix1 + q * N); + + mat2 = vld1q_s8(matrix2 + q * 4); + + for (U32 n = 0; n < N; n++) { + res[n] = vdotq_laneq_s32_builtin(res[n], mat2, mat1, n); + } + } + + for (U32 i = 0; i < N; i++) { + vst1q_s32(result + i * M, res[i]); + } +} + +// matrix2 has been transformed to MKm(MInner)K4 +inline void mmm_NTail_M(U32 MInner, U32 M, U32 N, U32 K, INT8 *matrix1, INT8 *matrix2, I32 *result) +{ + int8x16_t mat1 = {0}; + int8x16_t mat2 = {0}; + int32x4_t res[3] = {0}; + I32 buf[4]; + + // for (U32 i = 0; i < N; i++) { + // res[i] = vld1q_s32(result + i*M); + // } + + for (U32 q = 0; q < K; q += 4) { + mat1 = vld1q_s8(matrix1 + q * N); + + mat2 = vld1q_s8(matrix2 + q * MInner); + + for (U32 n = 0; n < N; n++) { + res[n] = vdotq_laneq_s32_builtin(res[n], mat2, mat1, n); + } + } + + for (U32 i = 0; i < N; i++) { + vst1q_s32(buf, res[i]); + for (U32 j = 0; j < MInner; j++) { + result[i * M + j] += buf[j]; + } + } +} +#endif +#endif diff --git a/compute/blas_enhance/src/cpu/arm/int8/mvm.cpp b/compute/blas_enhance/src/cpu/arm/int8/mvm.cpp new file mode 100644 index 00000000..97e9fb6a --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/int8/mvm.cpp @@ -0,0 +1,168
@@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifdef _USE_INT8 +#include "cpu/arm/blas_arm.h" +#include "cpu/arm/int8/blas_int8.h" +#include "cpu/arm/int8/mvm.h" +#include "cpu/arm/int8/mmm_common.h" + +EE matrix_vector_multiply_transform_weight_int8(TensorDesc desc, INT8 *src, INT8 *dst) +{ + DataType dt; + DataFormat df; + U32 N, K; + EE ret = SUCCESS; + int i = 0; + switch (desc.df) { + case DF_NORMAL: { + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &N, &K)); + U32 K4 = pad_to_4_multiple(K); + for (; i < (int)N - 31; i += 32) { + matrix1_trans_int8(32, K, K, src + i * K, dst + i * K4); + } + if (i < (int)N) { + memcpy(dst + i * K4, src + i * K, (N - i) * K * bytesOf(DT_I8)); + } + break; + } + case DF_TRANSPOSE: { + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &K, &N)); + U32 K4 = pad_to_4_multiple(K); + for (; i < (int)N - 31; i += 32) { + matrix2_trans_int8(32, K, N, src + i, dst + i * K4); + } + if (i < (int)N) { + int base = i; + INT8 *basePtr = dst + i * K4; + for (int j = 0; j < (int)K; j++) { + for (int n = base; n < (int)N; n++) { + basePtr[(n - base) * K + j] = src[j * N + n]; + } + } + } + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +void mvm_row_pack(U32 Nbatch, U32 K, INT8 *matrix, INT8 *vector, I32 *result) +{ + U32 N = Nbatch * 32; + int8x16_t mat[16]; + int8x8_t v; + int32x4_t res[8]; + U32 K_tail = K % 4; + U32 K_inner = K - K_tail; + U32 K4 = pad_to_4_multiple(K); + + for (U32 n = 0; n < N; n += 32) { + INT8 *bufMov = matrix + n * K4; + if (K_inner > 0) { + for (int i = 0; i < 8; i++) { + res[i] = vld1q_s32(result + n + i * 4); + } + U32 k = 0; + for (; k + 7 < K_inner; k += 8) { + v = vld1_s8(vector + k); + for (int i = 0; i < 8; i++) { + mat[i] = vld1q_s8(bufMov + i * 16); + } + for (int i = 0; i < 8; i++) { + res[i] = vdotq_lane_s32(res[i], mat[i], v, 0); + } + for (int i = 8; i < 16; i++) { + mat[i] = vld1q_s8(bufMov + i * 16); + } + for (int i = 0; i < 8; i++) { + res[i] = vdotq_lane_s32(res[i], mat[i + 8], v, 1); + } + bufMov += 256; + } + if (K_inner > k) { + // leftover group of 4: lane 1 of v covers vector[k .. k + 3] + v = vld1_s8(vector + k - 4); + for (int i = 0; i < 8; i++) { + mat[i] = vld1q_s8(bufMov + i * 16); + } + for (int i = 0; i < 8; i++) { + res[i] = vdotq_lane_s32(res[i], mat[i], v, 1); + } + bufMov += 128; + } + + for (int i = 0; i < 8; i++) { + vst1q_s32(result + n + i * 4, res[i]); + } + } + if (K_tail > 0) { + for (int i = 0; i < 32; i++) { + I32 tmp = 0; + for (int j = 0; j < (int)K_tail; j++) { + tmp += vector[K_inner +
j] * bufMov[j]; + } + result[n + i] += tmp; + bufMov += 4; + } + } + } +} + +void mvm_row(U32 numRows, U32 numColumns, DataFormat df, INT8 *matrix, INT8 *vector, I32 *result) +{ + // Actual layout is NK, and vector is K + U32 N = numRows; + U32 K = numColumns; + switch (df) { + case DF_NORMAL: { + U32 Nbatch = N / 8; + U32 NTail = N % 8; + + mvm_row_unpack(Nbatch, K, matrix, vector, result); + + if (NTail != 0) { + mvm_row_tail(NTail, K, matrix + (N - NTail) * K, vector, result + N - NTail); + } + break; + } + case DF_NKN32K4: { + U32 Nbatch = N / 32; + U32 NTail = N % 32; + + mvm_row_pack(Nbatch, K, matrix, vector, result); + + if (NTail != 0) { + U32 K4 = pad_to_4_multiple(K); + mvm_row_tail(NTail, K, matrix + (N - NTail) * K4, vector, result + N - NTail); + } + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } +} + +EE mvm_int8(U32 row, U32 col, DataFormat df, INT8 *matrix, INT8 *vector, I32 *tmp, I32 *result) +{ + if (DF_TRANSPOSE == df) { + mvm_col(row, col, matrix, vector, tmp, result); + } else { + mvm_row(row, col, df, matrix, vector, result); + } + return SUCCESS; +} +#endif diff --git a/compute/blas_enhance/src/cpu/arm/int8/mvm.h b/compute/blas_enhance/src/cpu/arm/int8/mvm.h new file mode 100644 index 00000000..ef9ad23e --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/int8/mvm.h @@ -0,0 +1,128 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
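+ +// Reviewer note (inferred from mvm.cpp and the helpers below): mvm_row_unpack walks the plain NK layout eight rows at a time, mvm_row_pack consumes the packed NKN32K4 layout produced by the weight transform, and mvm_col handles the transposed KN layout through the I32 scratch buffer passed as tmp; the *_tail helpers are scalar fallbacks for leftover rows.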
+ +#ifndef _H_MVM +#define _H_MVM + +#ifdef _USE_INT8 +#include <string.h> +#include <arm_neon.h> + +inline void mvm_col_tail(U32 N, U32 K, INT8 *matrix, INT8 *vector, I32 *result) +{ + for (U32 n = 0; n < N; n++) { + I32 tmp = 0; + for (U32 k = 0; k < K; k++) { + tmp += vector[k] * matrix[k * N + n]; + } + result[n] += tmp; + } +} + +inline void mvm_row_tail(U32 N, U32 K, INT8 *matrix, INT8 *vector, I32 *result) +{ + INT8 *cur_row = matrix; + for (U32 n = 0; n < N; n++) { + I32 tmp = 0; + for (U32 k = 0; k < K; k++) { + tmp += vector[k] * cur_row[k]; + } + result[n] += tmp; + cur_row += K; + } +} + +inline void mvm_row_unpack(U32 Nbatch, U32 K, INT8 *matrix, INT8 *vector, I32 *result) +{ + U32 N = Nbatch * 8; + int8x16_t mat[8], v; + U32 K_tail = K % 16; + U32 K_inner = K - K_tail; + for (U32 n = 0; n < N; n += 8) { + int32x4_t res[8] = {0}; + int32x4_t bias[2]; + + INT8 *w[8]; + for (int i = 0; i < 8; i++) { + w[i] = matrix + (n + i) * K; + } + + for (U32 k = 0; k < K_inner; k += 16) { + v = vld1q_s8(vector + k); + for (int i = 0; i < 8; i++) { + mat[i] = vld1q_s8(w[i] + k); + } + for (int i = 0; i < 8; i++) { + res[i] = vdotq_s32(res[i], mat[i], v); + } + } + bias[0] = vld1q_s32(result + n); + bias[1] = vld1q_s32(result + n + 4); + + res[0] = vpaddq_s32(res[0], res[1]); + res[4] = vpaddq_s32(res[4], res[5]); + res[2] = vpaddq_s32(res[2], res[3]); + res[6] = vpaddq_s32(res[6], res[7]); + res[0] = vpaddq_s32(res[0], res[2]); + res[4] = vpaddq_s32(res[4], res[6]); + res[0] = vaddq_s32(res[0], bias[0]); + res[4] = vaddq_s32(res[4], bias[1]); + + vst1q_s32(result + n, res[0]); + vst1q_s32(result + n + 4, res[4]); + + if (K_tail != 0) { + for (int i = 0; i < 8; i++) { + I32 tmp = 0; + for (U32 p = K_inner; p < K; p++) { + tmp += vector[p] * w[i][p]; + } + result[n + i] += tmp; + } + } + } +} + +inline void mvm_col(U32 numRows, U32 numColumns, INT8 *matrix, INT8 *vector, I32 *tmp, I32 *result) +{ + // Actual layout is KN, and vector is K + U32 N = numRows; + U32 K = numColumns; + U32 NTail = N % 64; + U32 NInner = N - NTail; + + for (U32 n = 0; n < NInner; n += 64) { + memset(tmp, 0, sizeof(I32) * 64); + for (U32 k = 0; k < K; k++) { + for (U32 i = 0; i < 64; i++) { + tmp[i] += vector[k] * matrix[k * N + n + i]; + } + } + + for (U32 i = 0; i < 64; i++) { + result[n + i] += tmp[i]; + } + } + + memset(tmp, 0, sizeof(I32) * 64); + for (U32 k = 0; k < K; k++) { + for (U32 i = 0; i < NTail; i++) { + tmp[i] += vector[k] * matrix[k * N + NInner + i]; + } + } + for (U32 i = 0; i < NTail; i++) { + result[NInner + i] += tmp[i]; + } +} +#endif +#endif diff --git a/blas-enhance/src/cpu/arm/mmm.cpp b/compute/blas_enhance/src/cpu/arm/mmm.cpp similarity index 63% rename from blas-enhance/src/cpu/arm/mmm.cpp rename to compute/blas_enhance/src/cpu/arm/mmm.cpp index 8ba0abab..e2d57d2e 100644 --- a/blas-enhance/src/cpu/arm/mmm.cpp +++ b/compute/blas_enhance/src/cpu/arm/mmm.cpp @@ -1,20 +1,19 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include "error.h" -#include "type.h" -#include "blas-enhance.h" +#include "types.h" +#include "blas_enhance.h" #include "cpu/arm/blas_arm.h" #ifdef _USE_FP16 #include "cpu/arm/fp16/blas_fp16.h" @@ -26,27 +25,29 @@ #include "cpu/arm/int8/blas_int8.h" #endif - -EE matrix_matrix_multiply_tmp_bytes_arm(U32 matrixA_M, U32 matrixA_K, U32 matrixB_K, U32 matrixB_N, - DataType dt, U32 *bytes) +EE matrix_matrix_multiply_tmp_bytes_arm( + U32 matrixA_M, U32 matrixA_K, U32 matrixB_K, U32 matrixB_N, DataType dt, U32 *bytes) { EE ret = SUCCESS; switch (dt) { #ifdef _USE_FP16 case DT_F16: { - matrix_matrix_multiply_tmp_bytes_fp16(matrixA_M, matrixA_K, matrixB_K, matrixB_N, dt, bytes); + matrix_matrix_multiply_tmp_bytes_fp16( + matrixA_M, matrixA_K, matrixB_K, matrixB_N, dt, bytes); break; } #endif #ifdef _USE_FP32 case DT_F32: { - matrix_matrix_multiply_tmp_bytes_fp32(matrixA_M, matrixA_K, matrixB_K, matrixB_N, dt, bytes); + matrix_matrix_multiply_tmp_bytes_fp32( + matrixA_M, matrixA_K, matrixB_K, matrixB_N, dt, bytes); break; } #endif #ifdef _USE_INT8 case DT_I8: { - matrix_matrix_multiply_tmp_bytes_int8(matrixA_M, matrixA_K, matrixB_K, matrixB_N, dt, bytes); + matrix_matrix_multiply_tmp_bytes_int8( + matrixA_M, matrixA_K, matrixB_K, matrixB_N, dt, bytes); break; } #endif @@ -58,25 +59,26 @@ EE matrix_matrix_multiply_tmp_bytes_arm(U32 matrixA_M, U32 matrixA_K, U32 matrix return ret; } -EE matrix_matrix_multiply_transform_rhsN(TensorDesc desc, const void* src, TensorDesc* descTran, void* dst) +static EE matrix_matrix_multiply_transform_rhsN( + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst) { EE ret = SUCCESS; switch (desc.dt) { #ifdef _USE_FP16 case DT_F16: { - ret = matrix_matrix_multiply_transform_rhsN_fp16(desc, (F16*)src, (F16*)dst); + ret = 
matrix_matrix_multiply_transform_rhsN_fp16(desc, (F16 *)src, (F16 *)dst); break; } #endif #ifdef _USE_FP32 case DT_F32: { - ret = matrix_matrix_multiply_transform_rhsN_fp32(desc, (F32*)src, (F32*)dst); + ret = matrix_matrix_multiply_transform_rhsN_fp32(desc, (F32 *)src, (F32 *)dst); break; } #endif #ifdef _USE_INT8 case DT_I8: { - ret = matrix_matrix_multiply_transform_rhsN_int8(desc, (INT8*)src, (INT8*)dst); + ret = matrix_matrix_multiply_transform_rhsN_int8(desc, (INT8 *)src, (INT8 *)dst); break; } #endif @@ -89,25 +91,26 @@ EE matrix_matrix_multiply_transform_rhsN(TensorDesc desc, const void* src, Tens return ret; } -EE matrix_matrix_multiply_transform_rhsT(TensorDesc desc, const void* src, TensorDesc* descTran, void* dst) +static EE matrix_matrix_multiply_transform_rhsT( + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst) { EE ret = SUCCESS; switch (desc.dt) { #ifdef _USE_FP16 case DT_F16: { - ret = matrix_matrix_multiply_transform_rhsT_fp16(desc, (F16*)src, (F16*)dst); + ret = matrix_matrix_multiply_transform_rhsT_fp16(desc, (F16 *)src, (F16 *)dst); break; } #endif #ifdef _USE_FP32 case DT_F32: { - ret = matrix_matrix_multiply_transform_rhsT_fp32(desc, (F32*)src, (F32*)dst); + ret = matrix_matrix_multiply_transform_rhsT_fp32(desc, (F32 *)src, (F32 *)dst); break; } #endif #ifdef _USE_INT8 case DT_I8: { - ret = matrix_matrix_multiply_transform_rhsT_int8(desc, (INT8*)src, (INT8*)dst); + ret = matrix_matrix_multiply_transform_rhsT_int8(desc, (INT8 *)src, (INT8 *)dst); break; } #endif @@ -117,10 +120,12 @@ EE matrix_matrix_multiply_transform_rhsT(TensorDesc desc, const void* src, Tens } (*descTran) = desc; (*descTran).df = targetFormat4MatrixB(desc.dt); + std::swap((*descTran).dims[0], (*descTran).dims[1]); return ret; } -EE matrix_matrix_multiply_transform_rhs(TensorDesc desc, const void* src, TensorDesc* descTran, void* dst) +EE matrix_matrix_multiply_transform_rhs_arm( + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst) { if (desc.df == targetFormat4MatrixB(desc.dt)) { return SUCCESS; @@ -142,34 +147,42 @@ EE matrix_matrix_multiply_transform_rhs(TensorDesc desc, const void* src, Tenso return ret; } -EE mmm_arm(U32 matrixC_N, U32 matrixC_M, U32 matrixA_K, - DataType dt, - const void* matrixAData, const void* matrixBData, - void* tmp, - void* matrixCData, - Arch arch) +EE mmm_arm(U32 matrixC_N, + U32 matrixC_M, + U32 matrixA_K, + DataType dt, + bool transposeA, + const void *matrixAData, + const void *matrixBData, + void *tmp, + void *matrixCData, + Arch arch) { EE ret = SUCCESS; switch (dt) { #ifdef _USE_FP16 case DT_F16: { - ret = mmm_fp16(matrixC_N, matrixC_M, matrixA_K, (F16*)matrixAData, (F16*)matrixBData, (F16*)tmp, (F16*)matrixCData, arch); + ret = mmm_fp16(matrixC_N, matrixC_M, matrixA_K, transposeA, (F16 *)matrixAData, + (F16 *)matrixBData, (F16 *)tmp, (F16 *)matrixCData, arch); break; } #endif #ifdef _USE_FP32 case DT_F32: { #ifdef __aarch64__ - ret = mmm_fp32_V8(matrixC_N, matrixC_M, matrixA_K, (F32*)matrixAData, (F32*)matrixBData, (F32*)tmp, (F32*)matrixCData); + ret = mmm_fp32_V8(matrixC_N, matrixC_M, matrixA_K, transposeA, (F32 *)matrixAData, + (F32 *)matrixBData, (F32 *)tmp, (F32 *)matrixCData); #else - ret = mmm_fp32_V7(matrixC_N, matrixC_M, matrixA_K, (F32*)matrixAData, (F32*)matrixBData, (F32*)tmp, (F32*)matrixCData); + ret = mmm_fp32_V7(matrixC_N, matrixC_M, matrixA_K, transposeA, (F32 *)matrixAData, + (F32 *)matrixBData, (F32 *)tmp, (F32 *)matrixCData); #endif break; } #endif #ifdef _USE_INT8 case DT_I8: { - ret = mmm_int8(matrixC_N, 
matrixC_M, matrixA_K, (INT8*)matrixAData, (INT8*)matrixBData, (INT8*)tmp, (I32*)matrixCData, arch); + ret = mmm_int8(matrixC_N, matrixC_M, matrixA_K, transposeA, (INT8 *)matrixAData, + (INT8 *)matrixBData, (INT8 *)tmp, (I32 *)matrixCData, arch); break; } #endif diff --git a/blas-enhance/src/cpu/arm/mvm.cpp b/compute/blas_enhance/src/cpu/arm/mvm.cpp similarity index 52% rename from blas-enhance/src/cpu/arm/mvm.cpp rename to compute/blas_enhance/src/cpu/arm/mvm.cpp index 58de421e..1329ceaa 100644 --- a/blas-enhance/src/cpu/arm/mvm.cpp +++ b/compute/blas_enhance/src/cpu/arm/mvm.cpp @@ -1,19 +1,19 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
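How the mmm entry points above chain together is worth one sketch (hedged: tensor2df is assumed from bolt's tensor_desc.h, DF_NORMAL is assumed to tag the unpacked layout, and buffer sizing is simplified):

#include <vector>
#include "blas_enhance.h"

// Sketch: C[M x N] += A[M x K] * B[K x N] through the ARM dispatch above.
void gemm_arm_sketch(U32 M, U32 N, U32 K, const F32 *A, const F32 *B, F32 *C, Arch arch)
{
    // Pack B once into the target format for matrix B.
    TensorDesc bDesc = tensor2df(DT_F32, DF_NORMAL, K, N);  // assumed helper
    TensorDesc bTranDesc;
    std::vector<F32> packedB(K * N + 8);  // small pad; exact sizing simplified here
    CHECK_STATUS(matrix_matrix_multiply_transform_rhs_arm(bDesc, B, &bTranDesc, packedB.data()));

    // Scratch for the kernel, sized by the query above.
    U32 tmpBytes = 0;
    CHECK_STATUS(matrix_matrix_multiply_tmp_bytes_arm(M, K, K, N, DT_F32, &tmpBytes));
    std::vector<unsigned char> tmp(tmpBytes);

    CHECK_STATUS(mmm_arm(N, M, K, DT_F32, false, A, packedB.data(), tmp.data(), C, arch));
}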
- #include "error.h" -#include "type.h" +#include "types.h" +#include "blas_enhance.h" #include "cpu/arm/blas_arm.h" #ifdef _USE_FP16 #include "cpu/arm/fp16/blas_fp16.h" @@ -25,11 +25,11 @@ #include "cpu/arm/int8/blas_int8.h" #endif -EE matrix_vector_multiply_tmp_bytes_arm(bool transpose, - DataType dt, U32 *bytes) +EE matrix_vector_multiply_tmp_bytes_arm(bool transpose, DataType dt, U32 *bytes) { - if (nullptr == bytes) + if (nullptr == bytes) { CHECK_STATUS(NULL_POINTER); + } switch (dt) { #ifdef _USE_FP16 case DT_F16: @@ -43,8 +43,9 @@ EE matrix_vector_multiply_tmp_bytes_arm(bool transpose, #endif #ifdef _USE_INT8 case DT_I8: { - if (transpose) + if (transpose) { *bytes = 64 * sizeof(I32); + } break; } #endif @@ -54,8 +55,50 @@ EE matrix_vector_multiply_tmp_bytes_arm(bool transpose, return SUCCESS; } -EE mvm_arm(U32 row, U32 col, DataType dt, bool transpose, - const void *matrix, const void *vector, +EE matrix_vector_multiply_transform_weight_arm( + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst) +{ + if (desc.df == targetFormat4mvmMatrix(desc.dt)) { + return SUCCESS; + } + EE ret = SUCCESS; + switch (desc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = matrix_vector_multiply_transform_weight_fp32(desc, (F32 *)src, (F32 *)dst); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = matrix_vector_multiply_transform_weight_fp16(desc, (F16 *)src, (F16 *)dst); + break; + } +#endif +#ifdef _USE_INT8 + case DT_I8: { + ret = matrix_vector_multiply_transform_weight_int8(desc, (INT8 *)src, (INT8 *)dst); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + *descTran = desc; + if (DF_TRANSPOSE == desc.df) { + std::swap((*descTran).dims[0], (*descTran).dims[1]); + } + descTran->df = targetFormat4mvmMatrix(desc.dt); + return ret; +} + +EE mvm_arm(U32 row, + U32 col, + DataType dt, + DataFormat df, + const void *matrix, + const void *vector, void *tmp, void *result, Arch arch) @@ -64,17 +107,17 @@ EE mvm_arm(U32 row, U32 col, DataType dt, bool transpose, switch (dt) { #ifdef _USE_FP16 case DT_F16: - ret = mvm_fp16(row, col, transpose, (F16*)matrix, (F16*)vector, (F16*)result, arch); + ret = mvm_fp16(row, col, df, (F16 *)matrix, (F16 *)vector, (F16 *)result, arch); break; #endif #ifdef _USE_FP32 case DT_F32: - ret = mvm_fp32(row, col, transpose, (F32*)matrix, (F32*)vector, (F32*)result); + ret = mvm_fp32(row, col, df, (F32 *)matrix, (F32 *)vector, (F32 *)result); break; #endif #ifdef _USE_INT8 case DT_I8: - ret = mvm_int8(row, col, transpose, (INT8*)matrix, (INT8*)vector, (I32*)tmp, (I32*)result); + ret = mvm_int8(row, col, df, (INT8 *)matrix, (INT8 *)vector, (I32 *)tmp, (I32 *)result); break; #endif default: diff --git a/compute/blas_enhance/src/cpu/general/axpby.cpp b/compute/blas_enhance/src/cpu/general/axpby.cpp new file mode 100644 index 00000000..a24a23fe --- /dev/null +++ b/compute/blas_enhance/src/cpu/general/axpby.cpp @@ -0,0 +1,45 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
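The new weight transform and the reworked mvm_arm above pair up the same way (same caveats as the mmm sketch; fp32 path shown):

#include <vector>
#include "blas_enhance.h"

// Sketch: y += A[row x col] * x through the ARM mvm dispatch above.
void mvm_arm_sketch(U32 row, U32 col, const F32 *A, const F32 *x, F32 *y, Arch arch)
{
    TensorDesc mDesc = tensor2df(DT_F32, DF_NORMAL, row, col);  // assumed helper
    TensorDesc mTranDesc;
    std::vector<F32> packed(row * col);
    CHECK_STATUS(matrix_vector_multiply_transform_weight_arm(mDesc, A, &mTranDesc, packed.data()));

    U32 tmpBytes = 0;
    CHECK_STATUS(matrix_vector_multiply_tmp_bytes_arm(false, DT_F32, &tmpBytes));
    std::vector<unsigned char> tmp(tmpBytes);

    CHECK_STATUS(mvm_arm(row, col, DT_F32, mTranDesc.df, packed.data(), x, tmp.data(), y, arch));
}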
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "error.h" +#include "types.h" +#include "cpu/general/blas_general.h" + +template <typename T> +inline void axpby(U32 len, F32 a, T *x, F32 b, T *y) +{ + for (U32 i = 0; i < len; i++) { + y[i] = a * x[i] + b * y[i]; + } +} + +EE axpby_general(U32 len, DataType dt, F32 a, const void *x, F32 b, void *y) +{ + EE ret = SUCCESS; + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + axpby(len, a, (F16 *)x, b, (F16 *)y); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + axpby(len, a, (F32 *)x, b, (F32 *)y); + break; +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/blas-enhance/src/cpu/general/blas_general.h b/compute/blas_enhance/src/cpu/general/blas_general.h similarity index 62% rename from blas-enhance/src/cpu/general/blas_general.h rename to compute/blas_enhance/src/cpu/general/blas_general.h index fe978a35..aa1a9b44 100644 --- a/blas-enhance/src/cpu/general/blas_general.h +++ b/compute/blas_enhance/src/cpu/general/blas_general.h @@ -1,30 +1,40 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _H_BLAS_GENERAL #define _H_BLAS_GENERAL #include "sys.h" -#include "type.h" - - -EE mvm_general(U32 row, U32 col, DataType dt, bool transpose, const void *matrix, const void *vector, void *result); - -EE mmm_general(U32 matrixC_N, U32 matrixC_M, U32 matrixA_K, - bool transposeA, bool transposeB, - DataType matrixADataType, - const void* matrixAData, const void* matrixBData, - void* matrixCData); +#include "types.h" + +EE mvm_general(U32 row, + U32 col, + DataType dt, + bool transpose, + const void *matrix, + const void *vector, + void *result); + +EE mmm_general(U32 matrixC_N, + U32 matrixC_M, + U32 matrixA_K, + bool transposeA, + bool transposeB, + DataType matrixADataType, + const void *matrixAData, + const void *matrixBData, + void *matrixCData); + +EE axpby_general(U32 len, DataType dt, F32 a, const void *x, F32 b, void *y); #endif diff --git a/blas-enhance/src/cpu/general/mmm.cpp b/compute/blas_enhance/src/cpu/general/mmm.cpp similarity index 65% rename from blas-enhance/src/cpu/general/mmm.cpp rename to compute/blas_enhance/src/cpu/general/mmm.cpp index 4b6cc208..c041110c 100644 --- a/blas-enhance/src/cpu/general/mmm.cpp +++ b/compute/blas_enhance/src/cpu/general/mmm.cpp @@ -1,37 +1,39 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include "error.h" -#include "type.h" +#include "types.h" #include "cpu/general/blas_general.h" - -template <typename T1, typename T2> -inline void mmm(U32 N, U32 M, U32 K, bool transposeA, bool transposeB, T1* matrixA, T1* matrixB, T2* matrixC) { - for (U32 i =0; i < M; i++) { - for (U32 n = 0; n < N; n++) { +template <typename T1, typename T2> +inline void mmm( + U32 N, U32 M, U32 K, bool transposeA, bool transposeB, T1 *matrixA, T1 *matrixB, T2 *matrixC) +{ + for (U32 i = 0; i < M; i++) { + for (U32 n = 0; n < N; n++) { F32 value = 0; for (U32 j = 0; j < K; j++) { U32 indexA = 0, indexB = 0; - if (transposeA) + if (transposeA) { indexA = j * M + i; - else + } else { indexA = i * K + j; - if (transposeB) + } + if (transposeB) { indexB = n * K + j; - else + } else { indexB = j * N + n; + } value += matrixA[indexA] * matrixB[indexB]; } matrixC[i * N + n] += value; @@ -39,29 +41,36 @@ inline void mmm(U32 N, U32 M, U32 K, bool transposeA, bool transposeB, T1* matri } } -EE mmm_general(U32 matrixC_N, U32 matrixC_M, U32 matrixA_K, - bool transposeA, bool transposeB, - DataType dt, - const void* matrixAData, const void* matrixBData, - void* matrixCData) +EE mmm_general(U32 matrixC_N, + U32 matrixC_M, + U32 matrixA_K, + bool transposeA, + bool transposeB, + DataType dt, + const void *matrixAData, + const void *matrixBData, + void *matrixCData) { EE ret = SUCCESS; switch (dt) { #ifdef _USE_FP16 case DT_F16: { - mmm(matrixC_N, matrixC_M, matrixA_K, transposeA, transposeB, (F16*)matrixAData, (F16*)matrixBData, (F16*)matrixCData); + mmm(matrixC_N, matrixC_M, matrixA_K, transposeA, transposeB, + (F16 *)matrixAData, (F16 *)matrixBData, (F16 *)matrixCData); break; } #endif #ifdef _USE_INT8 case DT_I8: { - mmm(matrixC_N, matrixC_M, matrixA_K, transposeA, transposeB, (INT8*)matrixAData, (INT8*)matrixBData, (I32*)matrixCData); + mmm(matrixC_N, matrixC_M, matrixA_K, transposeA, transposeB, + (INT8 *)matrixAData, (INT8 *)matrixBData, (I32 *)matrixCData); break; } #endif #ifdef _USE_FP32 case DT_F32: { - mmm(matrixC_N, matrixC_M, matrixA_K, transposeA, transposeB, (F32*)matrixAData, (F32*)matrixBData, (F32*)matrixCData); + mmm(matrixC_N, matrixC_M, matrixA_K, transposeA, transposeB, + (F32 *)matrixAData, (F32 *)matrixBData, (F32 *)matrixCData); break; } #endif diff --git a/blas-enhance/src/cpu/general/mvm.cpp b/compute/blas_enhance/src/cpu/general/mvm.cpp similarity index 70% rename from blas-enhance/src/cpu/general/mvm.cpp rename to compute/blas_enhance/src/cpu/general/mvm.cpp index a8f9a75f..7decf55c 100644 --- a/blas-enhance/src/cpu/general/mvm.cpp +++ b/compute/blas_enhance/src/cpu/general/mvm.cpp @@ -1,24 +1,24 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
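A toy case makes the transpose indexing in mmm above concrete (a throwaway check that would have to sit in the same translation unit as the template):

#include <cassert>

// With B = identity, C must equal A whichever way A is stored:
// row-major reads matrixA[i * K + j], transposed storage reads matrixA[j * M + i].
void toy_mmm_check()
{
    float aRowMajor[4] = {1, 2, 3, 4};    // A = [[1, 2], [3, 4]], M = K = N = 2
    float aTransposed[4] = {1, 3, 2, 4};  // the same A stored K x M
    float identity[4] = {1, 0, 0, 1};
    float c1[4] = {0}, c2[4] = {0};
    mmm<float, float>(2, 2, 2, false, false, aRowMajor, identity, c1);
    mmm<float, float>(2, 2, 2, true, false, aTransposed, identity, c2);
    for (int i = 0; i < 4; i++) {
        assert(c1[i] == aRowMajor[i] && c2[i] == aRowMajor[i]);
    }
}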
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include "error.h" -#include "type.h" +#include "types.h" #include "cpu/general/blas_general.h" -template <typename T1, typename T2> -inline void mvm(U32 M, U32 K, bool transpose, T1* mat, T1 *vec, T2* res) { - if (! transpose) { +template <typename T1, typename T2> +inline void mvm(U32 M, U32 K, bool transpose, T1 *mat, T1 *vec, T2 *res) +{ + if (!transpose) { for (U32 i = 0; i < M; i++) { F32 out_f = 0; for (U32 j = 0; j < K; j++) { @@ -37,22 +37,24 @@ inline void mvm(U32 M, U32 K, bool transpose, T1* mat, T1 *vec, T2* res) { } } -EE mvm_general(U32 row, U32 col, DataType dt, bool transpose, const void *matrix, const void *vector, void *result) { +EE mvm_general( + U32 row, U32 col, DataType dt, bool transpose, const void *matrix, const void *vector, void *result) +{ EE ret = SUCCESS; switch (dt) { #ifdef _USE_FP16 case DT_F16: - mvm(row, col, transpose, (F16*)matrix, (F16*)vector, (F16*)result); + mvm(row, col, transpose, (F16 *)matrix, (F16 *)vector, (F16 *)result); break; #endif #ifdef _USE_INT8 case DT_I8: - mvm(row, col, transpose, (INT8*)matrix, (INT8*)vector, (I32*)result); + mvm(row, col, transpose, (INT8 *)matrix, (INT8 *)vector, (I32 *)result); break; #endif #ifdef _USE_FP32 case DT_F32: - mvm(row, col, transpose, (F32*)matrix, (F32*)vector, (F32*)result); + mvm(row, col, transpose, (F32 *)matrix, (F32 *)vector, (F32 *)result); break; #endif default: diff --git a/compute/blas_enhance/src/cpu/x86/blas_x86.h b/compute/blas_enhance/src/cpu/x86/blas_x86.h new file mode 100644 index 00000000..ff6a3792 --- /dev/null +++ b/compute/blas_enhance/src/cpu/x86/blas_x86.h @@ -0,0 +1,47 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
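The vector path has the same flavor; assuming the elided transposed branch reads mat[j * M + i], a two-by-two example:

// y = A * x with A = [[1, 2], [3, 4]] and x = [1, 1] gives y = {3, 7}
// from either storage order.
void toy_mvm_check()
{
    float aRowMajor[4] = {1, 2, 3, 4};  // transpose = false: mat[i * K + j]
    float aColMajor[4] = {1, 3, 2, 4};  // transpose = true: assumed mat[j * M + i]
    float x[2] = {1, 1};
    float y1[2] = {0}, y2[2] = {0};
    mvm<float, float>(2, 2, false, aRowMajor, x, y1);  // y1 = {3, 7}
    mvm<float, float>(2, 2, true, aColMajor, x, y2);   // y2 = {3, 7}
}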
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_BLAS_X86 +#define _H_BLAS_X86 + +#include "error.h" +#include "sys.h" +#include "tensor_desc.h" + +EE matrix_vector_multiply_tmp_bytes_x86(bool transpose, DataType dt, U32 *bytes); + +EE mvm_x86(U32 row, + U32 col, + DataType dt, + bool transpose, + const void *matrix, + const void *vector, + void *result); + +EE matrix_matrix_multiply_tmp_bytes_x86( + U32 matrixA_M, U32 matrixA_K, U32 matrixB_K, U32 matrixB_N, DataType dt, U32 *bytes); + +EE matrix_matrix_multiply_transform_rhs_x86( + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst); + +EE mmm_x86(U32 matrixC_N, + U32 matrixC_M, + U32 matrixA_K, + DataType matrixADataType, + bool transposeA, + const void *matrixAData, + const void *matrixBData, + void *tmp, + void *matrixCData); + +#endif diff --git a/blas-enhance/src/cpu/arm/fp32/blas_fp32.h b/compute/blas_enhance/src/cpu/x86/fp32/blas_fp32.h similarity index 57% rename from blas-enhance/src/cpu/arm/fp32/blas_fp32.h rename to compute/blas_enhance/src/cpu/x86/fp32/blas_fp32.h index 2fc08be2..1f839440 100644 --- a/blas-enhance/src/cpu/arm/fp32/blas_fp32.h +++ b/compute/blas_enhance/src/cpu/x86/fp32/blas_fp32.h @@ -1,50 +1,46 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _H_BLAS_FP32 #define _H_BLAS_FP32 #include "sys.h" -#include "type.h" +#include "types.h" #include "error.h" #include "tensor_desc.h" -#include "arm_neon_expand.h" -void mvm_col_V8(U32 row, U32 col, F32* matrix, F32* vector, F32* result); +void mvm_col_fp32(U32 row, U32 col, F32 *matrix, F32 *vector, F32 *result); -void mvm_row_V8(U32 row, U32 col, F32* matrix, F32* vector, F32* result); +void mvm_row_fp32(U32 row, U32 col, F32 *matrix, F32 *vector, F32 *result); -inline EE mvm_fp32(U32 row, U32 col, bool transpose, F32* matrix, F32* vector, F32* result) +inline EE mvm_avx2_fp32(U32 row, U32 col, bool transpose, F32 *matrix, F32 *vector, F32 *result) { if (transpose) { - mvm_col_V8(row, col, matrix, vector, result); + mvm_col_fp32(row, col, matrix, vector, result); } else { - mvm_row_V8(row, col, matrix, vector, result); + mvm_row_fp32(row, col, matrix, vector, result); } return SUCCESS; } -void matrix_matrix_multiply_tmp_bytes_fp32(U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes); +void matrix_matrix_multiply_tmp_bytes_fp32( + U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes); -EE matrix_matrix_multiply_transform_rhsN_fp32(TensorDesc desc, F32* src, F32* dst); +EE matrix_matrix_multiply_transform_rhsN_fp32(TensorDesc desc, F32 *src, F32 *dst); -EE matrix_matrix_multiply_transform_rhsT_fp32(TensorDesc desc, F32* src, F32* dst); +EE matrix_matrix_multiply_transform_rhsT_fp32(TensorDesc desc, F32 *src, F32 *dst); -#ifdef __aarch64__ -EE mmm_fp32_V8(int M, int N, int K, F32* matrix1, F32* matrix2, F32* tmp, F32* result); -#else -EE mmm_fp32_V7(int M, int N, int K, F32* matrix1, F32* matrix2, F32* tmp, F32* result); -#endif +EE mmm_avx2_fp32( + int M, int N, int K, bool transposeA, F32 *matrix1, F32 *matrix2, F32 *tmp, F32 *result); -#endif +#endif \ No newline at end of file diff --git a/compute/blas_enhance/src/cpu/x86/fp32/mmm_avx2.cpp b/compute/blas_enhance/src/cpu/x86/fp32/mmm_avx2.cpp new file mode 100644 index 00000000..ae62add8 --- /dev/null +++ b/compute/blas_enhance/src/cpu/x86/fp32/mmm_avx2.cpp @@ -0,0 +1,1445 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
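The renamed header above only declares the AVX2 entry points; how they chain is spelled out below as a guess (the matrix1/matrix2-to-A/B mapping and the role of tmp are assumptions; tensor2df again assumed):

#include <vector>

// Sketch: fp32 GEMM through the x86 path. B is packed once by the rhsN
// transform; scratch size comes from the tmp-bytes query.
void gemm_avx2_sketch(int M, int N, int K, F32 *A, F32 *B, F32 *C)
{
    U32 bytes = 0;
    matrix_matrix_multiply_tmp_bytes_fp32(M, K, K, N, DT_F32, &bytes);
    std::vector<unsigned char> scratch(bytes);

    TensorDesc bDesc = tensor2df(DT_F32, DF_NORMAL, K, N);  // assumed helper
    std::vector<F32> packedB(K * N + 8);  // +8 floats of slack for 32-byte alignment
    matrix_matrix_multiply_transform_rhsN_fp32(bDesc, B, packedB.data());

    // Assumed mapping: matrix1 = A, matrix2 = packed B, tmp = packing scratch.
    mmm_avx2_fp32(M, N, K, false, A, packedB.data(), (F32 *)scratch.data(), C);
}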
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/x86/fp32/blas_fp32.h" +#include "error.h" +#include "types.h" + +#define UNROLL_K 4 +#define UNROLL_N 24 +#define UNROLL_M 4 +#define BOLCK_M_DIM 768 +#define BOLCK_K_DIM 768 +#define align_addr(addr, unit) (((uintptr_t)addr + unit - 1) / unit * unit) + +typedef void (*kernel_func)( + U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 ldc); + +void matrix_matrix_multiply_tmp_bytes_fp32( + U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes) +{ + *bytes = row1 * col1 + row2 * col2; + *bytes *= sizeof(dt); + *bytes += 32; +} + +void matrix1_trans(U32 size, U32 blockK, U32 K, F32 *src, F32 *dst) +{ + U32 remain = size % 4; + size = size / 4 * 4; + __m128i vindex = _mm_set_epi32(K * 3, K * 2, K, 0); + for (U32 i = 0; i < blockK; ++i) { + U32 j; + for (j = 0; j < size; j += 4) { + if (i % 16 == 0) { + _mm_prefetch(src + i + j * K + 16, _MM_HINT_NTA); + _mm_prefetch(src + i + (j + 1) * K + 16, _MM_HINT_NTA); + _mm_prefetch(src + i + (j + 2) * K + 16, _MM_HINT_NTA); + _mm_prefetch(src + i + (j + 3) * K + 16, _MM_HINT_NTA); + } + _mm_store_ps(dst, _mm_i32gather_ps(src + i + j * K, vindex, 4)); + dst += 4; + } + for (; j < remain; ++j) { + if (i % 16 == 0) { + _mm_prefetch(src + i + (j + size) * K + 16, _MM_HINT_NTA); + } + *(dst++) = *(src + i + j * K); + } + } +} + +void matrix2_trans(U32 size, U32 blockK, U32 M, F32 *src, F32 *dst) +{ + for (U32 i = 0; i < blockK; i++) { + for (U32 j = 0; j < size; j += 16) { + _mm_prefetch(src + M + j, _MM_HINT_NTA); + } + memcpy(dst, src, size * sizeof(F32)); + dst += size; + src += M; + } +} + +EE matrix_matrix_multiply_transform_rhsN_fp32(TensorDesc desc, F32 *src, F32 *dst) +{ + DataType dt; + DataFormat df; + U32 N, K, blockSizeK, unrollSizeN; + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &K, &N)); + F32 unrollSize[4] = {4, 8, 16, 24}; + + // buffer addr algined to 32 + F32 *packB = (F32 *)align_addr(dst, 32); + for (U32 bk = 0; bk < K; bk += blockSizeK) { + blockSizeK = UNI_MIN(BOLCK_K_DIM, K - bk); + for (U32 un = 0; un < N; un += unrollSizeN) { + unrollSizeN = UNI_MIN(UNROLL_N, N - un); + unrollSizeN = UNI_MIN(unrollSize[unrollSizeN / 8], unrollSizeN); + matrix2_trans(unrollSizeN, blockSizeK, N, src + un, packB); + packB += unrollSizeN * blockSizeK; + } + src += blockSizeK * N; + } + return SUCCESS; +} + +EE matrix_matrix_multiply_transform_rhsT_fp32(TensorDesc desc, F32 *src, F32 *dst) +{ + DataType dt; + DataFormat df; + U32 N, K, blockSizeK, unrollSizeN; + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &N, &K)); + F32 unrollSize[4] = {4, 8, 16, 24}; + + // buffer addr aligned to 32 + F32 *packB = (F32 *)align_addr(dst, 32); + for (U32 bk = 0; bk < K; bk += blockSizeK) { + blockSizeK = UNI_MIN(BOLCK_K_DIM, K - bk); + for (U32 un = 0; un < N; un += unrollSizeN) { + unrollSizeN = UNI_MIN(UNROLL_N, N - un); + unrollSizeN = UNI_MIN(unrollSize[unrollSizeN >> 3], unrollSizeN); + matrix1_trans(unrollSizeN, blockSizeK, K, src + un * K, packB); + packB += unrollSizeN * blockSizeK; + } + src += blockSizeK; + } + return SUCCESS; +} + +void 
mmm_avx2_4x24_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) +{ + __asm__ __volatile__("vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" + "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorps %%ymm7, %%ymm7, %%ymm7 \n\t" + "vxorps %%ymm8, %%ymm8, %%ymm8 \n\t" + "vxorps %%ymm9, %%ymm9, %%ymm9 \n\t" + "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" + "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" + + "mov %0, %%ecx \n\t" + "shr $2, %%ecx \n\t" + "je .k_loop_4x24_end \n\t" + ".align 16 \n\t" + ".k_loop_4x24: \n\t" + + "prefetcht0 0x140(%1) \n\t" + "prefetcht0 0x180(%1) \n\t" + "prefetcht0 0x140(%2) \n\t" + + "vmovaps (%1), %%ymm12 \n\t" + "vmovaps 0x20(%1), %%ymm13 \n\t" + "vmovaps 0x40(%1), %%ymm14 \n\t" + "vbroadcastss 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vbroadcastss 0x4(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vbroadcastss 0x8(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss 0xC(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "prefetcht0 0x1C0(%1) \n\t" + + "vmovaps 0x60(%1), %%ymm12 \n\t" + "vmovaps 0x80(%1), %%ymm13 \n\t" + "vmovaps 0xA0(%1), %%ymm14 \n\t" + "vbroadcastss 0x10(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vbroadcastss 0x14(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vbroadcastss 0x18(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss 0x1C(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "prefetcht0 0x200(%1) \n\t" + "prefetcht0 0x240(%1) \n\t" + + "vmovaps 0xC0(%1), %%ymm12 \n\t" + "vmovaps 0xE0(%1), %%ymm13 \n\t" + "vmovaps 0x100(%1), %%ymm14 \n\t" + "vbroadcastss 0x20(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vbroadcastss 0x24(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vbroadcastss 0x28(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss 0x2C(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "prefetcht0 0x280(%1) \n\t" + + "vmovaps 0x120(%1), %%ymm12 \n\t" + "vmovaps 0x140(%1), %%ymm13 \n\t" + "vmovaps 0x160(%1), %%ymm14 \n\t" + "vbroadcastss 0x30(%2), %%ymm15 \n\t" + "vfmadd231ps 
%%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vbroadcastss 0x34(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vbroadcastss 0x38(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss 0x3C(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add $0x180, %1 \n\t" + "add $0x40, %2 \n\t" + + "sub $1, %%ecx \n\t" + "jg .k_loop_4x24 \n\t" + ".align 16 \n\t" + ".k_loop_4x24_end: \n\t" + + "mov %0, %%ecx \n\t" + "and $3, %%ecx \n\t" + "je .k_loop_4x24_remain_end \n\t" + ".k_loop_4x24_remain: \n\t" + "vmovaps (%1), %%ymm12 \n\t" + "vmovaps 0x20(%1), %%ymm13 \n\t" + "vmovaps 0x40(%1), %%ymm14 \n\t" + "vbroadcastss 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vbroadcastss 0x4(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vbroadcastss 0x8(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss 0xC(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + "add $0x60, %1 \n\t" + "add $0x10, %2 \n\t" + "sub $1, %%ecx \n\t" + "jg .k_loop_4x24_remain \n\t" + + ".k_loop_4x24_remain_end: \n\t" + "mov %4, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + "prefetcht0 0x40(%3) \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "prefetcht0 0x40(%3, %%rax) \n\t" + "vaddps (%3), %%ymm0, %%ymm0 \n\t" + "vaddps 0x20(%3), %%ymm1, %%ymm1 \n\t" + "vaddps 0x40(%3), %%ymm2, %%ymm2 \n\t" + "vmovups %%ymm0, (%3) \n\t" + "vmovups %%ymm1, 0x20(%3) \n\t" + "vmovups %%ymm2, 0x40(%3) \n\t" + "add %%rax, %3 \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "prefetcht0 0x40(%3, %%rax) \n\t" + "vaddps (%3), %%ymm3, %%ymm3 \n\t" + "vaddps 0x20(%3), %%ymm4, %%ymm4 \n\t" + "vaddps 0x40(%3), %%ymm5, %%ymm5 \n\t" + "vmovups %%ymm3, (%3) \n\t" + "vmovups %%ymm4, 0x20(%3) \n\t" + "vmovups %%ymm5, 0x40(%3) \n\t" + "add %%rax, %3 \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "prefetcht0 0x40(%3, %%rax) \n\t" + "vaddps (%3), %%ymm6, %%ymm6 \n\t" + "vaddps 0x20(%3), %%ymm7, %%ymm7 \n\t" + "vaddps 0x40(%3), %%ymm8, %%ymm8 \n\t" + "vmovups %%ymm6, (%3) \n\t" + "vmovups %%ymm7, 0x20(%3) \n\t" + "vmovups %%ymm8, 0x40(%3) \n\t" + "add %%rax, %3 \n\t" + "prefetcht0 0x40(%3) \n\t" + "vaddps (%3), %%ymm9, %%ymm9 \n\t" + "vaddps 0x20(%3), %%ymm10, %%ymm10 \n\t" + "vaddps 0x40(%3), %%ymm11, %%ymm11 \n\t" + "vmovups %%ymm9, (%3) \n\t" + "vmovups %%ymm10, 0x20(%3) \n\t" + "vmovups %%ymm11, 0x40(%3) \n\t" + : + : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) + : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", + "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", + "%ymm13", "%ymm14", "%ymm15", "memory"); +} + +void mmm_avx2_4x16_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) +{ + __asm__ __volatile__("vxorps %%ymm0, %%ymm0, 
%%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" + "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorps %%ymm7, %%ymm7, %%ymm7 \n\t" + + "mov %0, %%ecx \n\t" + "shr $2, %%ecx \n\t" + "je .k_loop_4x16_end \n\t" + ".align 16 \n\t" + ".k_loop_4x16: \n\t" + + "prefetcht0 0x140(%1) \n\t" + "prefetcht0 0x140(%2) \n\t" + + "vmovaps (%1), %%ymm8 \n\t" + "vmovaps 0x20(%1), %%ymm9 \n\t" + "vbroadcastss 0x0(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm0 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm1 \n\t" + "vbroadcastss 0x4(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm2 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm3 \n\t" + "vbroadcastss 0x8(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm4 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm5 \n\t" + "vbroadcastss 0xC(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm6 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm7 \n\t" + + "prefetcht0 0x180(%1) \n\t" + + "vmovaps 0x40(%1), %%ymm8 \n\t" + "vmovaps 0x60(%1), %%ymm9 \n\t" + "vbroadcastss 0x10(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm0 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm1 \n\t" + "vbroadcastss 0x14(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm2 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm3 \n\t" + "vbroadcastss 0x18(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm4 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm5 \n\t" + "vbroadcastss 0x1C(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm6 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm7 \n\t" + + "prefetcht0 0x1C0(%1) \n\t" + + "vmovaps 0x80(%1), %%ymm8 \n\t" + "vmovaps 0xA0(%1), %%ymm9 \n\t" + "vbroadcastss 0x20(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm0 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm1 \n\t" + "vbroadcastss 0x24(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm2 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm3 \n\t" + "vbroadcastss 0x28(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm4 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm5 \n\t" + "vbroadcastss 0x2C(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm6 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm7 \n\t" + + "prefetcht0 0x200(%1) \n\t" + + "vmovaps 0xC0(%1), %%ymm8 \n\t" + "vmovaps 0xE0(%1), %%ymm9 \n\t" + "vbroadcastss 0x30(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm0 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm1 \n\t" + "vbroadcastss 0x34(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm2 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm3 \n\t" + "vbroadcastss 0x38(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm4 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm5 \n\t" + "vbroadcastss 0x3C(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm6 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm7 \n\t" + + "add $0x100, %1 \n\t" + "add $0x40, %2 \n\t" + + "sub $1, %%ecx \n\t" + "jg .k_loop_4x16 \n\t" + ".align 16 \n\t" + ".k_loop_4x16_end: \n\t" + + "mov %0, %%ecx \n\t" + "and $3, %%ecx \n\t" + "je .k_loop_4x16_remain_end \n\t" + ".k_loop_4x16_remain: \n\t" + "vmovaps (%1), %%ymm8 \n\t" + "vmovaps 0x20(%1), %%ymm9 \n\t" + "vbroadcastss 0x0(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm0 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm1 \n\t" + "vbroadcastss 0x4(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm2 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm3 \n\t" + "vbroadcastss 0x8(%2), %%ymm10 \n\t" + "vfmadd231ps 
%%ymm10, %%ymm8, %%ymm4 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm5 \n\t" + "vbroadcastss 0xC(%2), %%ymm10 \n\t" + "vfmadd231ps %%ymm10, %%ymm8, %%ymm6 \n\t" + "vfmadd231ps %%ymm10, %%ymm9, %%ymm7 \n\t" + "add $0x40, %1 \n\t" + "add $0x10, %2 \n\t" + "sub $1, %%ecx \n\t" + "jg .k_loop_4x16_remain \n\t" + + ".k_loop_4x16_remain_end: \n\t" + "mov %4, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "vaddps (%3), %%ymm0, %%ymm0 \n\t" + "vaddps 0x20(%3), %%ymm1, %%ymm1 \n\t" + "vmovups %%ymm0, (%3) \n\t" + "vmovups %%ymm1, 0x20(%3) \n\t" + "add %%rax, %3 \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "vaddps (%3), %%ymm2, %%ymm2 \n\t" + "vaddps 0x20(%3), %%ymm3, %%ymm3 \n\t" + "vmovups %%ymm2, (%3) \n\t" + "vmovups %%ymm3, 0x20(%3) \n\t" + "add %%rax, %3 \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "vaddps (%3), %%ymm4, %%ymm4 \n\t" + "vaddps 0x20(%3), %%ymm5, %%ymm5 \n\t" + "vmovups %%ymm4, (%3) \n\t" + "vmovups %%ymm5, 0x20(%3) \n\t" + "add %%rax, %3 \n\t" + "vaddps (%3), %%ymm6, %%ymm6 \n\t" + "vaddps 0x20(%3), %%ymm7, %%ymm7 \n\t" + "vmovups %%ymm6, (%3) \n\t" + "vmovups %%ymm7, 0x20(%3) \n\t" + : + : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) + : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", + "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "memory"); +} + +void mmm_avx2_4x8_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) +{ + __asm__ __volatile__( + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + + "mov %0, %%ecx \n\t" + "shr $2, %%ecx \n\t" + "je .k_loop_4x8_end \n\t" + ".align 16 \n\t" + ".k_loop_4x8: \n\t" + + "prefetcht0 0x140(%1) \n\t" + "prefetcht0 0x140(%2) \n\t" + + "vmovaps (%1), %%ymm4 \n\t" + "vbroadcastss 0x0(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm0 \n\t" + "vbroadcastss 0x4(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm1 \n\t" + "vbroadcastss 0x8(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm2 \n\t" + "vbroadcastss 0xC(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm3 \n\t" + + "vmovaps 0x20(%1), %%ymm4 \n\t" + "vbroadcastss 0x10(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm0 \n\t" + "vbroadcastss 0x14(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm1 \n\t" + "vbroadcastss 0x18(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm2 \n\t" + "vbroadcastss 0x1C(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm3 \n\t" + + "prefetcht0 0x180(%1) \n\t" + + "vmovaps 0x40(%1), %%ymm4 \n\t" + "vbroadcastss 0x20(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm0 \n\t" + "vbroadcastss 0x24(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm1 \n\t" + "vbroadcastss 0x28(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm2 \n\t" + "vbroadcastss 0x2C(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm3 \n\t" + + "vmovaps 0x60(%1), %%ymm4 \n\t" + "vbroadcastss 0x30(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm0 \n\t" + "vbroadcastss 0x34(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm1 \n\t" + "vbroadcastss 0x38(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm2 \n\t" + "vbroadcastss 0x3C(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm3 \n\t" + + "add $0x80, %1 \n\t" + "add $0x40, %2 \n\t" + + "sub $1, %%ecx \n\t" + "jg .k_loop_4x8 \n\t" + ".align 16 \n\t" + ".k_loop_4x8_end: \n\t" + + "mov %0, %%ecx \n\t" + "and $3, %%ecx \n\t" + "je .k_loop_4x8_remain_end \n\t" + 
".k_loop_4x8_remain: \n\t" + "vmovaps (%1), %%ymm4 \n\t" + "vbroadcastss 0x0(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm0 \n\t" + "vbroadcastss 0x4(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm1 \n\t" + "vbroadcastss 0x8(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm2 \n\t" + "vbroadcastss 0xC(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm4, %%ymm3 \n\t" + "add $0x20, %1 \n\t" + "add $0x10, %2 \n\t" + "sub $1, %%ecx \n\t" + "jg .k_loop_4x8_remain \n\t" + + ".k_loop_4x8_remain_end: \n\t" + "mov %4, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "vaddps (%3), %%ymm0, %%ymm0 \n\t" + "vmovups %%ymm0, (%3) \n\t" + "add %%rax, %3 \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "vaddps (%3), %%ymm1, %%ymm1 \n\t" + "vmovups %%ymm1, (%3) \n\t" + "add %%rax, %3 \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "vaddps (%3), %%ymm2, %%ymm2 \n\t" + "vmovups %%ymm2, (%3) \n\t" + "add %%rax, %3 \n\t" + "vaddps (%3), %%ymm3, %%ymm3 \n\t" + "vmovups %%ymm3, (%3) \n\t" + : + : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) + : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "memory"); +} + +void mmm_avx2_4x4_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) +{ + __asm__ __volatile__( + "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vxorps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vxorps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vxorps %%xmm3, %%xmm3, %%xmm3 \n\t" + + "mov %0, %%ecx \n\t" + "shr $2, %%ecx \n\t" + "je .k_loop_4x4_end \n\t" + ".align 16 \n\t" + ".k_loop_4x4: \n\t" + + "prefetcht0 0x140(%1) \n\t" + "prefetcht0 0x140(%2) \n\t" + + "vmovaps (%1), %%xmm4 \n\t" + "vbroadcastss 0x0(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm0 \n\t" + "vbroadcastss 0x4(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm1 \n\t" + "vbroadcastss 0x8(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm2 \n\t" + "vbroadcastss 0xC(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm3 \n\t" + + "vmovaps 0x10(%1), %%xmm4 \n\t" + "vbroadcastss 0x10(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm0 \n\t" + "vbroadcastss 0x14(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm1 \n\t" + "vbroadcastss 0x18(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm2 \n\t" + "vbroadcastss 0x1C(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm3 \n\t" + + "vmovaps 0x20(%1), %%xmm4 \n\t" + "vbroadcastss 0x20(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm0 \n\t" + "vbroadcastss 0x24(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm1 \n\t" + "vbroadcastss 0x28(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm2 \n\t" + "vbroadcastss 0x2C(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm3 \n\t" + + "vmovaps 0x30(%1), %%xmm4 \n\t" + "vbroadcastss 0x30(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm0 \n\t" + "vbroadcastss 0x34(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm1 \n\t" + "vbroadcastss 0x38(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm2 \n\t" + "vbroadcastss 0x3C(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm3 \n\t" + + "add $0x40, %1 \n\t" + "add $0x40, %2 \n\t" + + "sub $1, %%ecx \n\t" + "jg .k_loop_4x4 \n\t" + ".align 16 \n\t" + ".k_loop_4x4_end: \n\t" + + "mov %0, %%ecx \n\t" + "and $3, %%ecx \n\t" + "je .k_loop_4x4_remain_end \n\t" + + ".k_loop_4x4_remain: \n\t" + "vmovaps (%1), %%xmm4 \n\t" + "vbroadcastss 0x0(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm0 \n\t" + "vbroadcastss 0x4(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm1 
\n\t" + "vbroadcastss 0x8(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm2 \n\t" + "vbroadcastss 0xC(%2), %%xmm5 \n\t" + "vfmadd231ps %%xmm5, %%xmm4, %%xmm3 \n\t" + "add $0x10, %1 \n\t" + "add $0x10, %2 \n\t" + "sub $1, %%ecx \n\t" + "jg .k_loop_4x4_remain \n\t" + + ".k_loop_4x4_remain_end: \n\t" + "mov %4, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "vaddps (%3), %%xmm0, %%xmm0 \n\t" + "vmovups %%xmm0, (%3) \n\t" + "add %%rax, %3 \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "vaddps (%3), %%xmm1, %%xmm1 \n\t" + "vmovups %%xmm1, (%3) \n\t" + "add %%rax, %3 \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "vaddps (%3), %%xmm2, %%xmm2 \n\t" + "vmovups %%xmm2, (%3) \n\t" + "add %%rax, %3 \n\t" + "vaddps (%3), %%xmm3, %%xmm3 \n\t" + "vmovups %%xmm3, (%3) \n\t" + : + : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) + : "%eax", "%rax", "%ecx", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "memory"); +} + +void mmm_avx2_2x24_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) +{ + __asm__ __volatile__("mov %4, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" + + "mov %0, %%ecx \n\t" + "shr $2, %%ecx \n\t" + "je .k_loop_2x24_end \n\t" + + ".align 16 \n\t" + ".k_loop_2x24: \n\t" + + "prefetcht0 0x140(%1) \n\t" + "prefetcht0 0x180(%1) \n\t" + + "vmovaps (%1), %%ymm6 \n\t" + "vmovaps 0x20(%1), %%ymm7 \n\t" + "vmovaps 0x40(%1), %%ymm8 \n\t" + "vbroadcastss 0x0(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" + "vfmadd231ps %%ymm9, %%ymm8, %%ymm2 \n\t" + "vbroadcastss 0x4(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" + "vfmadd231ps %%ymm9, %%ymm8, %%ymm5 \n\t" + + "prefetcht0 0x1C0(%1) \n\t" + + "vmovaps 0x60(%1), %%ymm6 \n\t" + "vmovaps 0x80(%1), %%ymm7 \n\t" + "vmovaps 0xA0(%1), %%ymm8 \n\t" + "vbroadcastss 0x8(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" + "vfmadd231ps %%ymm9, %%ymm8, %%ymm2 \n\t" + "vbroadcastss 0xC(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" + "vfmadd231ps %%ymm9, %%ymm8, %%ymm5 \n\t" + + "prefetcht0 0x200(%1) \n\t" + "prefetcht0 0x240(%1) \n\t" + + "vmovaps 0xC0(%1), %%ymm6 \n\t" + "vmovaps 0xE0(%1), %%ymm7 \n\t" + "vmovaps 0x100(%1), %%ymm8 \n\t" + "vbroadcastss 0x10(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" + "vfmadd231ps %%ymm9, %%ymm8, %%ymm2 \n\t" + "vbroadcastss 0x14(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" + "vfmadd231ps %%ymm9, %%ymm8, %%ymm5 \n\t" + + "prefetcht0 0x280(%1) \n\t" + + "vmovaps 0x120(%1), %%ymm6 \n\t" + "vmovaps 0x140(%1), %%ymm7 \n\t" + "vmovaps 0x160(%1), %%ymm8 \n\t" + "vbroadcastss 0x18(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" + "vfmadd231ps %%ymm9, %%ymm8, %%ymm2 \n\t" + "vbroadcastss 0x1C(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" + "vfmadd231ps %%ymm9, %%ymm8, %%ymm5 \n\t" + + "add $0x180, %1 \n\t" + "add $0x20, %2 \n\t" + + "sub $1, 
%%ecx \n\t" + "jg .k_loop_2x24 \n\t" + ".align 16 \n\t" + ".k_loop_2x24_end: \n\t" + + "mov %0, %%ecx \n\t" + "and $3, %%ecx \n\t" + "je .k_loop_2x24_remain_end \n\t" + + ".align 16 \n\t" + ".k_loop_2x24_remain: \n\t" + "vmovaps (%1), %%ymm6 \n\t" + "vmovaps 0x20(%1), %%ymm7 \n\t" + "vmovaps 0x40(%1), %%ymm8 \n\t" + "vbroadcastss 0x0(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" + "vfmadd231ps %%ymm9, %%ymm8, %%ymm2 \n\t" + "vbroadcastss 0x4(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" + "vfmadd231ps %%ymm9, %%ymm8, %%ymm5 \n\t" + "add $0x60, %1 \n\t" + "add $0x8, %2 \n\t" + "sub $1, %%ecx \n\t" + "jg .k_loop_2x24_remain \n\t" + + ".align 16 \n\t" + ".k_loop_2x24_remain_end: \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "prefetcht0 0x40(%3, %%rax) \n\t" + "vaddps (%3), %%ymm0, %%ymm0 \n\t" + "vaddps 0x20(%3), %%ymm1, %%ymm1 \n\t" + "vaddps 0x40(%3), %%ymm2, %%ymm2 \n\t" + "vmovups %%ymm0, (%3) \n\t" + "vmovups %%ymm1, 0x20(%3) \n\t" + "vmovups %%ymm2, 0x40(%3) \n\t" + "add %%rax, %3 \n\t" + "vaddps (%3), %%ymm3, %%ymm3 \n\t" + "vaddps 0x20(%3), %%ymm4, %%ymm4 \n\t" + "vaddps 0x40(%3), %%ymm5, %%ymm5 \n\t" + "vmovups %%ymm3, (%3) \n\t" + "vmovups %%ymm4, 0x20(%3) \n\t" + "vmovups %%ymm5, 0x40(%3) \n\t" + : + : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) + : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", + "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "memory"); +} + +void mmm_avx2_2x16_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) +{ + __asm__ __volatile__("mov %4, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" + + "mov %0, %%ecx \n\t" + "shr $2, %%ecx \n\t" + "je .k_loop_2x16_end \n\t" + + ".align 16 \n\t" + ".k_loop_2x16: \n\t" + + "prefetcht0 0x140(%1) \n\t" + + "vmovaps (%1), %%ymm6 \n\t" + "vmovaps 0x20(%1), %%ymm7 \n\t" + "vbroadcastss 0x0(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" + "vbroadcastss 0x4(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" + + "prefetcht0 0x180(%1) \n\t" + + "vmovaps 0x40(%1), %%ymm6 \n\t" + "vmovaps 0x60(%1), %%ymm7 \n\t" + "vbroadcastss 0x8(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" + "vbroadcastss 0xC(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" + + "prefetcht0 0x1C0(%1) \n\t" + + "vmovaps 0x80(%1), %%ymm6 \n\t" + "vmovaps 0xA0(%1), %%ymm7 \n\t" + "vbroadcastss 0x10(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" + "vbroadcastss 0x14(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" + + "prefetcht0 0x200(%1) \n\t" + + "vmovaps 0xC0(%1), %%ymm6 \n\t" + "vmovaps 0xE0(%1), %%ymm7 \n\t" + "vbroadcastss 0x18(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" + "vbroadcastss 0x1C(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" + + "add $0x100, %1 \n\t" + "add $0x20, %2 \n\t" + + "sub $1, %%ecx \n\t" + "jg .k_loop_2x16 \n\t" + ".align 16 \n\t" + 
".k_loop_2x16_end: \n\t" + + "mov %0, %%ecx \n\t" + "and $3, %%ecx \n\t" + "je .k_loop_2x16_remain_end \n\t" + + ".align 16 \n\t" + ".k_loop_2x16_remain: \n\t" + "vmovaps (%1), %%ymm6 \n\t" + "vmovaps 0x20(%1), %%ymm7 \n\t" + "vbroadcastss 0x0(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" + "vbroadcastss 0x4(%2), %%ymm9 \n\t" + "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" + "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" + "add $0x40, %1 \n\t" + "add $0x8, %2 \n\t" + "sub $1, %%ecx \n\t" + "jg .k_loop_2x16_remain \n\t" + + ".align 16 \n\t" + ".k_loop_2x16_remain_end: \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "vaddps (%3), %%ymm0, %%ymm0 \n\t" + "vaddps 0x20(%3), %%ymm1, %%ymm1 \n\t" + "vmovups %%ymm0, (%3) \n\t" + "vmovups %%ymm1, 0x20(%3) \n\t" + "add %%rax, %3 \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "vaddps (%3), %%ymm3, %%ymm3 \n\t" + "vaddps 0x20(%3), %%ymm4, %%ymm4 \n\t" + "vmovups %%ymm3, (%3) \n\t" + "vmovups %%ymm4, 0x20(%3) \n\t" + : + : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) + : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm3", "%ymm4", "%ymm6", + "%ymm7", "%ymm9", "memory"); +} + +void mmm_avx2_2x8_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) +{ + __asm__ __volatile__("mov %4, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + + "mov %0, %%ecx \n\t" + "shr $2, %%ecx \n\t" + "je .k_loop_2x8_end \n\t" + + ".align 16 \n\t" + ".k_loop_2x8: \n\t" + + "prefetcht0 0x140(%1) \n\t" + "vmovaps (%1), %%ymm2 \n\t" + "vbroadcastss 0x0(%2), %%ymm3 \n\t" + "vfmadd231ps %%ymm3, %%ymm2, %%ymm0 \n\t" + "vbroadcastss 0x4(%2), %%ymm3 \n\t" + "vfmadd231ps %%ymm3, %%ymm2, %%ymm1 \n\t" + + "vmovaps 0x20(%1), %%ymm2 \n\t" + "vbroadcastss 0x8(%2), %%ymm3 \n\t" + "vfmadd231ps %%ymm3, %%ymm2, %%ymm0 \n\t" + "vbroadcastss 0xC(%2), %%ymm3 \n\t" + "vfmadd231ps %%ymm3, %%ymm2, %%ymm1 \n\t" + + "prefetcht0 0x180(%1) \n\t" + "vmovaps 0x40(%1), %%ymm2 \n\t" + "vbroadcastss 0x10(%2), %%ymm3 \n\t" + "vfmadd231ps %%ymm3, %%ymm2, %%ymm0 \n\t" + "vbroadcastss 0x14(%2), %%ymm3 \n\t" + "vfmadd231ps %%ymm3, %%ymm2, %%ymm1 \n\t" + + "vmovaps 0x60(%1), %%ymm2 \n\t" + "vbroadcastss 0x18(%2), %%ymm3 \n\t" + "vfmadd231ps %%ymm3, %%ymm2, %%ymm0 \n\t" + "vbroadcastss 0x1C(%2), %%ymm3 \n\t" + "vfmadd231ps %%ymm3, %%ymm2, %%ymm1 \n\t" + + "add $0x80, %1 \n\t" + "add $0x20, %2 \n\t" + + "sub $1, %%ecx \n\t" + "jg .k_loop_2x8 \n\t" + ".align 16 \n\t" + ".k_loop_2x8_end: \n\t" + + "mov %0, %%ecx \n\t" + "and $3, %%ecx \n\t" + "je .k_loop_2x8_remain_end \n\t" + + ".align 16 \n\t" + ".k_loop_2x8_remain: \n\t" + "vmovaps (%1), %%ymm2 \n\t" + "vbroadcastss 0x0(%2), %%ymm3 \n\t" + "vfmadd231ps %%ymm3, %%ymm2, %%ymm0 \n\t" + "vbroadcastss 0x4(%2), %%ymm3 \n\t" + "vfmadd231ps %%ymm3, %%ymm2, %%ymm1 \n\t" + "add $0x20, %1 \n\t" + "add $0x8, %2 \n\t" + "sub $1, %%ecx \n\t" + "jg .k_loop_2x8_remain \n\t" + + ".align 16 \n\t" + ".k_loop_2x8_remain_end: \n\t" + + "vaddps (%3), %%ymm0, %%ymm0 \n\t" + "vmovups %%ymm0, (%3) \n\t" + "add %%rax, %3 \n\t" + "vaddps (%3), %%ymm1, %%ymm1 \n\t" + "vmovups %%ymm1, (%3) \n\t" + : + : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) + : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "memory"); +} + +void mmm_avx2_2x4_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) +{ + __asm__ __volatile__("mov %4, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + + 
"vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vxorps %%xmm1, %%xmm1, %%xmm1 \n\t" + + "mov %0, %%ecx \n\t" + "shr $2, %%ecx \n\t" + "je .k_loop_2x4_end \n\t" + + ".align 16 \n\t" + ".k_loop_2x4: \n\t" + + "prefetcht0 0x140(%1) \n\t" + "vmovaps (%1), %%xmm2 \n\t" + "vbroadcastss 0x0(%2), %%xmm3 \n\t" + "vfmadd231ps %%xmm3, %%xmm2, %%xmm0 \n\t" + "vbroadcastss 0x4(%2), %%xmm3 \n\t" + "vfmadd231ps %%xmm3, %%xmm2, %%xmm1 \n\t" + + "vmovaps 0x10(%1), %%xmm2 \n\t" + "vbroadcastss 0x8(%2), %%xmm3 \n\t" + "vfmadd231ps %%xmm3, %%xmm2, %%xmm0 \n\t" + "vbroadcastss 0xC(%2), %%xmm3 \n\t" + "vfmadd231ps %%xmm3, %%xmm2, %%xmm1 \n\t" + + "vmovaps 0x20(%1), %%xmm2 \n\t" + "vbroadcastss 0x10(%2), %%xmm3 \n\t" + "vfmadd231ps %%xmm3, %%xmm2, %%xmm0 \n\t" + "vbroadcastss 0x14(%2), %%xmm3 \n\t" + "vfmadd231ps %%xmm3, %%xmm2, %%xmm1 \n\t" + + "vmovaps 0x30(%1), %%xmm2 \n\t" + "vbroadcastss 0x18(%2), %%xmm3 \n\t" + "vfmadd231ps %%xmm3, %%xmm2, %%xmm0 \n\t" + "vbroadcastss 0x1C(%2), %%xmm3 \n\t" + "vfmadd231ps %%xmm3, %%xmm2, %%xmm1 \n\t" + + "add $0x40, %1 \n\t" + "add $0x20, %2 \n\t" + + "sub $1, %%ecx \n\t" + "jg .k_loop_2x4 \n\t" + ".align 16 \n\t" + ".k_loop_2x4_end: \n\t" + + "mov %0, %%ecx \n\t" + "and $3, %%ecx \n\t" + "je .k_loop_2x4_remain_end \n\t" + + ".align 16 \n\t" + ".k_loop_2x4_remain: \n\t" + "vmovaps (%1), %%xmm2 \n\t" + "vbroadcastss 0x0(%2), %%xmm3 \n\t" + "vfmadd231ps %%xmm3, %%xmm2, %%xmm0 \n\t" + "vbroadcastss 0x4(%2), %%xmm3 \n\t" + "vfmadd231ps %%xmm3, %%xmm2, %%xmm1 \n\t" + "add $0x10, %1 \n\t" + "add $0x8, %2 \n\t" + "sub $1, %%ecx \n\t" + "jg .k_loop_2x4_remain \n\t" + + ".align 16 \n\t" + ".k_loop_2x4_remain_end: \n\t" + + "vaddps (%3), %%xmm0, %%xmm0 \n\t" + "vmovups %%xmm0, (%3) \n\t" + "add %%rax, %3 \n\t" + "vaddps (%3), %%xmm1, %%xmm1 \n\t" + "vmovups %%xmm1, (%3) \n\t" + : + : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) + : "%eax", "%rax", "%ecx", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "memory"); +} + +void mmm_avx2_1x24_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) +{ + __asm__ __volatile__("mov %4, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + + "mov %0, %%ecx \n\t" + "shr $2, %%ecx \n\t" + "je .k_loop_1x24_end \n\t" + + ".align 16 \n\t" + ".k_loop_1x24: \n\t" + + "prefetcht0 0x140(%1) \n\t" + "prefetcht0 0x180(%1) \n\t" + + "vmovaps (%1), %%ymm3 \n\t" + "vmovaps 0x20(%1), %%ymm4 \n\t" + "vmovaps 0x40(%1), %%ymm5 \n\t" + "vbroadcastss 0x0(%2), %%ymm6 \n\t" + "vfmadd231ps %%ymm6, %%ymm3, %%ymm0 \n\t" + "vfmadd231ps %%ymm6, %%ymm4, %%ymm1 \n\t" + "vfmadd231ps %%ymm6, %%ymm5, %%ymm2 \n\t" + + "prefetcht0 0x1C0(%1) \n\t" + + "vmovaps 0x60(%1), %%ymm3 \n\t" + "vmovaps 0x80(%1), %%ymm4 \n\t" + "vmovaps 0xA0(%1), %%ymm5 \n\t" + "vbroadcastss 0x4(%2), %%ymm6 \n\t" + "vfmadd231ps %%ymm6, %%ymm3, %%ymm0 \n\t" + "vfmadd231ps %%ymm6, %%ymm4, %%ymm1 \n\t" + "vfmadd231ps %%ymm6, %%ymm5, %%ymm2 \n\t" + + "prefetcht0 0x200(%1) \n\t" + "prefetcht0 0x240(%1) \n\t" + + "vmovaps 0xC0(%1), %%ymm3 \n\t" + "vmovaps 0xE0(%1), %%ymm4 \n\t" + "vmovaps 0x100(%1), %%ymm5 \n\t" + "vbroadcastss 0x8(%2), %%ymm6 \n\t" + "vfmadd231ps %%ymm6, %%ymm3, %%ymm0 \n\t" + "vfmadd231ps %%ymm6, %%ymm4, %%ymm1 \n\t" + "vfmadd231ps %%ymm6, %%ymm5, %%ymm2 \n\t" + + "prefetcht0 0x280(%1) \n\t" + + "vmovaps 0x120(%1), %%ymm3 \n\t" + "vmovaps 0x140(%1), %%ymm4 \n\t" + "vmovaps 0x160(%1), %%ymm5 \n\t" + "vbroadcastss 0xC(%2), %%ymm6 \n\t" + "vfmadd231ps 
%%ymm6, %%ymm3, %%ymm0 \n\t" + "vfmadd231ps %%ymm6, %%ymm4, %%ymm1 \n\t" + "vfmadd231ps %%ymm6, %%ymm5, %%ymm2 \n\t" + + "add $0x180, %1 \n\t" + "add $0x10, %2 \n\t" + + "sub $1, %%ecx \n\t" + "jg .k_loop_1x24 \n\t" + ".align 16 \n\t" + ".k_loop_1x24_end: \n\t" + + "mov %0, %%ecx \n\t" + "and $3, %%ecx \n\t" + "je .k_loop_1x24_remain_end \n\t" + + ".align 16 \n\t" + ".k_loop_1x24_remain: \n\t" + "vmovaps (%1), %%ymm3 \n\t" + "vmovaps 0x20(%1), %%ymm4 \n\t" + "vmovaps 0x40(%1), %%ymm5 \n\t" + "vbroadcastss (%2), %%ymm6 \n\t" + "vfmadd231ps %%ymm6, %%ymm3, %%ymm0 \n\t" + "vfmadd231ps %%ymm6, %%ymm4, %%ymm1 \n\t" + "vfmadd231ps %%ymm6, %%ymm5, %%ymm2 \n\t" + "add $0x60, %1 \n\t" + "add $0x4, %2 \n\t" + "sub $1, %%ecx \n\t" + "jg .k_loop_1x24_remain \n\t" + + ".align 16 \n\t" + ".k_loop_1x24_remain_end: \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "prefetcht0 0x40(%3, %%rax) \n\t" + "vaddps (%3), %%ymm0, %%ymm0 \n\t" + "vaddps 0x20(%3), %%ymm1, %%ymm1 \n\t" + "vaddps 0x40(%3), %%ymm2, %%ymm2 \n\t" + "vmovups %%ymm0, (%3) \n\t" + "vmovups %%ymm1, 0x20(%3) \n\t" + "vmovups %%ymm2, 0x40(%3) \n\t" + : + : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) + : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", + "%ymm5", "%ymm6", "memory"); +} + +void mmm_avx2_1x16_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) +{ + __asm__ __volatile__( + "mov %4, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + + "mov %0, %%ecx \n\t" + "shr $2, %%ecx \n\t" + "je .k_loop_1x16_end \n\t" + + ".align 16 \n\t" + ".k_loop_1x16: \n\t" + + "prefetcht0 0x140(%1) \n\t" + + "vmovaps (%1), %%ymm2 \n\t" + "vmovaps 0x20(%1), %%ymm3 \n\t" + "vbroadcastss (%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm2, %%ymm0 \n\t" + "vfmadd231ps %%ymm5, %%ymm3, %%ymm1 \n\t" + + "prefetcht0 0x180(%1) \n\t" + + "vmovaps 0x40(%1), %%ymm2 \n\t" + "vmovaps 0x60(%1), %%ymm3 \n\t" + "vbroadcastss 0x4(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm2, %%ymm0 \n\t" + "vfmadd231ps %%ymm5, %%ymm3, %%ymm1 \n\t" + + "prefetcht0 0x1C0(%1) \n\t" + + "vmovaps 0x80(%1), %%ymm2 \n\t" + "vmovaps 0xA0(%1), %%ymm3 \n\t" + "vbroadcastss 0x8(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm2, %%ymm0 \n\t" + "vfmadd231ps %%ymm5, %%ymm3, %%ymm1 \n\t" + + "prefetcht0 0x200(%1) \n\t" + + "vmovaps 0xC0(%1), %%ymm2 \n\t" + "vmovaps 0xE0(%1), %%ymm3 \n\t" + "vbroadcastss 0xC(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm2, %%ymm0 \n\t" + "vfmadd231ps %%ymm5, %%ymm3, %%ymm1 \n\t" + + "add $0x100, %1 \n\t" + "add $0x10, %2 \n\t" + + "sub $1, %%ecx \n\t" + "jg .k_loop_1x16 \n\t" + ".align 16 \n\t" + ".k_loop_1x16_end: \n\t" + + "mov %0, %%ecx \n\t" + "and $3, %%ecx \n\t" + "je .k_loop_1x16_remain_end \n\t" + + ".align 16 \n\t" + ".k_loop_1x16_remain: \n\t" + "vmovaps (%1), %%ymm2 \n\t" + "vmovaps 0x20(%1), %%ymm3 \n\t" + "vbroadcastss 0x0(%2), %%ymm5 \n\t" + "vfmadd231ps %%ymm5, %%ymm2, %%ymm0 \n\t" + "vfmadd231ps %%ymm5, %%ymm3, %%ymm1 \n\t" + "add $0x40, %1 \n\t" + "add $0x4, %2 \n\t" + "sub $1, %%ecx \n\t" + "jg .k_loop_1x16_remain \n\t" + + ".align 16 \n\t" + ".k_loop_1x16_remain_end: \n\t" + "prefetcht0 (%3, %%rax) \n\t" + "vaddps (%3), %%ymm0, %%ymm0 \n\t" + "vaddps 0x20(%3), %%ymm1, %%ymm1 \n\t" + "vmovups %%ymm0, (%3) \n\t" + "vmovups %%ymm1, 0x20(%3) \n\t" + : + : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) + : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm5", "memory"); +} + +void 
mmm_avx2_1x8_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) +{ + __asm__ __volatile__("mov %4, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + + "mov %0, %%ecx \n\t" + "shr $2, %%ecx \n\t" + "je .k_loop_1x8_end \n\t" + + ".align 16 \n\t" + ".k_loop_1x8: \n\t" + + "prefetcht0 0x140(%1) \n\t" + "vmovaps (%1), %%ymm1 \n\t" + "vbroadcastss (%2), %%ymm2 \n\t" + "vfmadd231ps %%ymm2, %%ymm1, %%ymm0 \n\t" + + "vmovaps 0x20(%1), %%ymm1 \n\t" + "vbroadcastss 0x4(%2), %%ymm2 \n\t" + "vfmadd231ps %%ymm2, %%ymm1, %%ymm0 \n\t" + + "prefetcht0 0x180(%1) \n\t" + "vmovaps 0x40(%1), %%ymm1 \n\t" + "vbroadcastss 0x8(%2), %%ymm2 \n\t" + "vfmadd231ps %%ymm2, %%ymm1, %%ymm0 \n\t" + + "vmovaps 0x60(%1), %%ymm1 \n\t" + "vbroadcastss 0xC(%2), %%ymm2 \n\t" + "vfmadd231ps %%ymm2, %%ymm1, %%ymm0 \n\t" + + "add $0x80, %1 \n\t" + "add $0x10, %2 \n\t" + + "sub $1, %%ecx \n\t" + "jg .k_loop_1x8 \n\t" + ".align 16 \n\t" + ".k_loop_1x8_end: \n\t" + + "mov %0, %%ecx \n\t" + "and $3, %%ecx \n\t" + "je .k_loop_1x8_remain_end \n\t" + + ".align 16 \n\t" + ".k_loop_1x8_remain: \n\t" + "vmovaps (%1), %%ymm1 \n\t" + "vbroadcastss (%2), %%ymm2 \n\t" + "vfmadd231ps %%ymm2, %%ymm1, %%ymm0 \n\t" + "add $0x20, %1 \n\t" + "add $0x4, %2 \n\t" + "sub $1, %%ecx \n\t" + "jg .k_loop_1x8_remain \n\t" + + ".align 16 \n\t" + ".k_loop_1x8_remain_end: \n\t" + + "vaddps (%3), %%ymm0, %%ymm0 \n\t" + "vmovups %%ymm0, (%3) \n\t" + : + : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) + : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "memory"); +} + +void mmm_avx2_1x4_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) +{ + __asm__ __volatile__("mov %4, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + + "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" + + "mov %0, %%ecx \n\t" + "shr $2, %%ecx \n\t" + "je .k_loop_1x4_end \n\t" + ".align 16 \n\t" + ".k_loop_1x4: \n\t" + + "prefetcht0 0x40(%1) \n\t" + + "vmovaps (%1), %%xmm1 \n\t" + "vbroadcastss 0x0(%2), %%xmm2 \n\t" + "vfmadd231ps %%xmm2, %%xmm1, %%xmm0 \n\t" + + "vmovaps 0x10(%1), %%xmm1 \n\t" + "vbroadcastss 0x4(%2), %%xmm2 \n\t" + "vfmadd231ps %%xmm2, %%xmm1, %%xmm0 \n\t" + + "vmovaps 0x20(%1), %%xmm1 \n\t" + "vbroadcastss 0x8(%2), %%xmm2 \n\t" + "vfmadd231ps %%xmm2, %%xmm1, %%xmm0 \n\t" + + "vmovaps 0x30(%1), %%xmm1 \n\t" + "vbroadcastss 0xC(%2), %%xmm2 \n\t" + "vfmadd231ps %%xmm2, %%xmm1, %%xmm0 \n\t" + + "add $0x40, %1 \n\t" + "add $0x10, %2 \n\t" + + "sub $1, %%ecx \n\t" + "jg .k_loop_1x4 \n\t" + ".align 16 \n\t" + ".k_loop_1x4_end: \n\t" + + "mov %0, %%ecx \n\t" + "and $3, %%ecx \n\t" + "je .k_loop_1x4_remain_end \n\t" + + ".align 16 \n\t" + ".k_loop_1x4_remain: \n\t" + "vmovaps (%1), %%xmm1 \n\t" + "vbroadcastss 0x0(%2), %%xmm2 \n\t" + "vfmadd231ps %%xmm2, %%xmm1, %%xmm0 \n\t" + "add $0x10, %1 \n\t" + "add $0x4, %2 \n\t" + "sub $1, %%ecx \n\t" + "jg .k_loop_1x4_remain \n\t" + + ".align 16 \n\t" + ".k_loop_1x4_remain_end: \n\t" + + "vaddps (%3), %%xmm0, %%xmm0 \n\t" + "vmovups %%xmm0, (%3) \n\t" + "add %%rax, %3 \n\t" + : + : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) + : "%eax", "%rax", "%ecx", "%xmm0", "%xmm1", "%xmm2", "memory"); +} + +void mmm_avx2_n_mtail(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) +{ + for (U32 i = 0; i < um; ++i) { + for (U32 j = 0; j < un; ++j) { + for (U32 k = 0; k < bk; ++k) { + matrixC[i * N + j] += matrixA[k * um + i] * matrixB[k * un + j]; + } + } + } +} + +EE mmm_avx2_fp32( + int N, int M, int 
K, bool transposeA, F32 *matrix1, F32 *matrix2, F32 *tmp, F32 *result) +{ + // buffer address aligned to 32 bytes + F32 *packA = (F32 *)align_addr(tmp, 32); + F32 *packB = (F32 *)align_addr(matrix2, 32); + U32 blockSizeM, blockSizeK, blockSizeN, unrollSizeM; + F32 *curA, *curB, *curC; + kernel_func kernel[3][5] = { + {mmm_avx2_n_mtail, mmm_avx2_1x4_asm, mmm_avx2_1x8_asm, mmm_avx2_1x16_asm, mmm_avx2_1x24_asm}, + {mmm_avx2_n_mtail, mmm_avx2_2x4_asm, mmm_avx2_2x8_asm, mmm_avx2_2x16_asm, mmm_avx2_2x24_asm}, + {mmm_avx2_n_mtail, mmm_avx2_4x4_asm, mmm_avx2_4x8_asm, mmm_avx2_4x16_asm, mmm_avx2_4x24_asm}}; + U32 unrollNSize[4] = {4, 8, 16, 24}; + U32 unrollMSize[3] = {1, 2, 4}; + + for (int k = 0; k < K; k += blockSizeK) { + blockSizeK = UNI_MIN(BOLCK_K_DIM, K - k); + for (int j = 0; j < M; j += blockSizeM) { + blockSizeM = UNI_MIN(BOLCK_M_DIM, M - j); + for (int n = 0; n < N; n += blockSizeN) { + blockSizeN = UNI_MIN(UNROLL_N, N - n); + blockSizeN = UNI_MIN(unrollNSize[blockSizeN >> 3], blockSizeN); + curB = packB + k * N + n * blockSizeK; + for (U32 m = 0; m < blockSizeM; m += unrollSizeM) { + unrollSizeM = UNI_MIN(UNROLL_M, blockSizeM - m); + unrollSizeM = unrollMSize[unrollSizeM >> 1]; + curA = packA + m * blockSizeK; + if (n == 0) { + if (transposeA) { + matrix2_trans( + unrollSizeM, blockSizeK, M, matrix1 + (j + m) + k * M, curA); + } else { + matrix1_trans( + unrollSizeM, blockSizeK, K, matrix1 + k + (j + m) * K, curA); + } + } + curC = result + (m + j) * N + n; + kernel[unrollSizeM >> 1][(blockSizeN >> 3) + (blockSizeN > 3)]( + unrollSizeM, blockSizeN, blockSizeK, curA, curB, curC, N); + } + } + } + } + return SUCCESS; +} diff --git a/compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_col.cpp b/compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_col.cpp new file mode 100644 index 00000000..0ea6b860 --- /dev/null +++ b/compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_col.cpp @@ -0,0 +1,565 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
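The micro-kernel table in mmm_avx2_fp32 above is picked with compact bit arithmetic. The sketch below restates that mapping in plain C++ for reference; select_kernel is an illustrative helper, not part of the patch:

// Row index: unrollSizeM in {1, 2, 4} maps to {0, 1, 2} via >> 1.
// Column index: blockSizeN in {1, 2, 3} maps to 0 (the scalar mtail kernel);
// 4 -> 1, 8 -> 2, 16 -> 3, 24 -> 4 via (blockSizeN >> 3) + (blockSizeN > 3).
static inline kernel_func select_kernel(U32 unrollSizeM, U32 blockSizeN, kernel_func kernel[3][5])
{
    return kernel[unrollSizeM >> 1][(blockSizeN >> 3) + (blockSizeN > 3)];
}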
+ +#include "cpu/x86/fp32/blas_fp32.h" +#include "error.h" +#include "types.h" + +#define UNROLL_K 4 + +typedef void (*kernel_func)(U32 N, F32 *matrix, F32 *vector, F32 *result); + +void mvm_col_avx2_4_32(U32 N, F32 *matrix, F32 *vector, F32 *result) +{ + __asm__ __volatile__("mov %0, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + "mov %1, %%rdx \n\t" + "add %%rax, %%rdx \n\t" + "mov %%rdx, %%r9 \n\t" + "add %%rax, %%r9 \n\t" + "mov %%r9, %%r10 \n\t" + "add %%rax, %%r10 \n\t" + + "mov %0, %%ecx \n\t" + "cmp $0x20, %%ecx \n\t" + "jl .n_loop_32_end \n\t" + ".align 16 \n\t" + ".n_loop_32: \n\t" + "prefetcht0 0x100(%3) \n\t" + "prefetcht0 0x140(%3) \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups 0x20(%3), %%ymm1 \n\t" + "vmovups 0x40(%3), %%ymm2 \n\t" + "vmovups 0x60(%3), %%ymm3 \n\t" + + "prefetcht0 0x140(%1) \n\t" + "prefetcht0 0x180(%1) \n\t" + "vmovups (%1), %%ymm12 \n\t" + "vmovups 0x20(%1), %%ymm13 \n\t" + "vmovups 0x40(%1), %%ymm14 \n\t" + "vmovups 0x60(%1), %%ymm11 \n\t" + "vbroadcastss 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm11, %%ymm3 \n\t" + + "prefetcht0 0x100(%%rdx) \n\t" + "prefetcht0 0x140(%%rdx) \n\t" + "vmovups (%%rdx), %%ymm12 \n\t" + "vmovups 0x20(%%rdx), %%ymm13 \n\t" + "vmovups 0x40(%%rdx), %%ymm14 \n\t" + "vmovups 0x60(%%rdx), %%ymm11 \n\t" + "vbroadcastss 0x4(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm11, %%ymm3 \n\t" + + "prefetcht0 0x100(%%r9) \n\t" + "prefetcht0 0x140(%%r9) \n\t" + "vmovups (%%r9), %%ymm12 \n\t" + "vmovups 0x20(%%r9), %%ymm13 \n\t" + "vmovups 0x40(%%r9), %%ymm14 \n\t" + "vmovups 0x60(%%r9), %%ymm11 \n\t" + "vbroadcastss 0x8(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm11, %%ymm3 \n\t" + + "prefetcht0 0x100(%%r10) \n\t" + "prefetcht0 0x140(%%r10) \n\t" + "vmovups (%%r10), %%ymm12 \n\t" + "vmovups 0x20(%%r10), %%ymm13 \n\t" + "vmovups 0x40(%%r10), %%ymm14 \n\t" + "vmovups 0x60(%%r10), %%ymm11 \n\t" + "vbroadcastss 0xC(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm11, %%ymm3 \n\t" + + "vmovups %%ymm0, (%3) \n\t" + "vmovups %%ymm1, 0x20(%3) \n\t" + "vmovups %%ymm2, 0x40(%3) \n\t" + "vmovups %%ymm3, 0x60(%3) \n\t" + + "add $0x80, %1 \n\t" + "add $0x80, %%rdx \n\t" + "add $0x80, %%r9 \n\t" + "add $0x80, %%r10 \n\t" + "add $0x80, %3 \n\t" + + "sub $0x20, %%ecx \n\t" + "cmp $0x20, %%ecx \n\t" + "jge .n_loop_32 \n\t" + + ".align 16 \n\t" + ".n_loop_32_end: \n\t" + "cmp $0x10, %%ecx \n\t" + "jl .n_loop_remain_16_end \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups 0x20(%3), %%ymm1 \n\t" + "vmovups (%1), %%ymm12 \n\t" + "vmovups 0x20(%1), %%ymm13 \n\t" + "vbroadcastss 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vmovups (%%rdx), %%ymm12 \n\t" + "vmovups 0x20(%%rdx), %%ymm13 \n\t" + "vbroadcastss 0x4(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vmovups (%%r9), %%ymm12 \n\t" + "vmovups 0x20(%%r9), %%ymm13 \n\t" + "vbroadcastss 0x8(%2), %%ymm15 
\n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vmovups (%%r10), %%ymm12 \n\t" + "vmovups 0x20(%%r10), %%ymm13 \n\t" + "vbroadcastss 0xC(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vmovups %%ymm0, (%3) \n\t" + "vmovups %%ymm1, 0x20(%3) \n\t" + + "add $0x40, %1 \n\t" + "add $0x40, %%rdx \n\t" + "add $0x40, %%r9 \n\t" + "add $0x40, %%r10 \n\t" + "add $0x40, %3 \n\t" + "sub $0x10, %%ecx \n\t" + + ".n_loop_remain_16_end: \n\t" + "cmp $0x8, %%ecx \n\t" + "jl .n_loop_remain_8_end \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups (%1), %%ymm12 \n\t" + "vbroadcastss 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vmovups (%%rdx), %%ymm12 \n\t" + "vbroadcastss 0x4(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vmovups (%%r9), %%ymm12 \n\t" + "vbroadcastss 0x8(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vmovups (%%r10), %%ymm12 \n\t" + "vbroadcastss 0xC(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vmovups %%ymm0, (%3) \n\t" + + "add $0x20, %1 \n\t" + "add $0x20, %%rdx \n\t" + "add $0x20, %%r9 \n\t" + "add $0x20, %%r10 \n\t" + "add $0x20, %3 \n\t" + "sub $0x8, %%ecx \n\t" + + ".align 16 \n\t" + ".n_loop_remain_8_end: \n\t" + "cmp $0x4, %%ecx \n\t" + "jl .n_loop_remain_4_end \n\t" + "vmovups (%3), %%xmm0 \n\t" + "vmovups (%1), %%xmm12 \n\t" + "vbroadcastss 0x0(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovups (%%rdx), %%xmm12 \n\t" + "vbroadcastss 0x4(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovups (%%r9), %%xmm12 \n\t" + "vbroadcastss 0x8(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovups (%%r10), %%xmm12 \n\t" + "vbroadcastss 0xC(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovups %%xmm0, (%3) \n\t" + + "add $0x10, %1 \n\t" + "add $0x10, %%rdx \n\t" + "add $0x10, %%r9 \n\t" + "add $0x10, %%r10 \n\t" + "add $0x10, %3 \n\t" + "sub $0x4, %%ecx \n\t" + + ".align 16 \n\t" + ".n_loop_remain_4_end: \n\t" + "cmp $0x2, %%ecx \n\t" + "jl .n_loop_remain_2_end \n\t" + "vmovsd (%3), %%xmm0 \n\t" + "vmovsd (%1), %%xmm12 \n\t" + "vbroadcastss 0x0(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovsd (%%rdx), %%xmm12 \n\t" + "vbroadcastss 0x4(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovsd (%%r9), %%xmm12 \n\t" + "vbroadcastss 0x8(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovsd (%%r10), %%xmm12 \n\t" + "vbroadcastss 0xC(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovsd %%xmm0, (%3) \n\t" + + "add $0x8, %1 \n\t" + "add $0x8, %%rdx \n\t" + "add $0x8, %%r9 \n\t" + "add $0x8, %%r10 \n\t" + "add $0x8, %3 \n\t" + "sub $0x2, %%ecx \n\t" + + ".align 16 \n\t" + ".n_loop_remain_2_end: \n\t" + "and $0x1, %%ecx \n\t" + "je .n_loop_remain_1_end \n\t" + "vmovss (%3), %%xmm0 \n\t" + "vmovss (%1), %%xmm12 \n\t" + "vmovss 0x0(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovss (%%rdx), %%xmm12 \n\t" + "vmovss 0x4(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovss (%%r9), %%xmm12 \n\t" + "vmovss 0x8(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovss (%%r10), %%xmm12 \n\t" + "vmovss 0xC(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovss %%xmm0, (%3) \n\t" + + ".align 16 \n\t" + 
".n_loop_remain_1_end: \n\t" + : + : "r"(N), "r"(matrix), "r"(vector), "r"(result) + : "%eax", "%rax", "%ecx", "%rdx", "%r9", "%r10", "%ymm0", "%ymm1", "%ymm2", + "%ymm12", "%ymm13", "%ymm14", "%ymm15", "%xmm0", "%xmm12", "%xmm15", + "memory"); +} + +void mvm_col_avx2_2_32(U32 N, F32 *matrix, F32 *vector, F32 *result) +{ + __asm__ __volatile__("mov %0, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + "mov %1, %%rdx \n\t" + "add %%rax, %%rdx \n\t" + + "mov %0, %%ecx \n\t" + "cmp $0x20, %%ecx \n\t" + "jl .k2_n_loop_32_end \n\t" + ".align 16 \n\t" + ".k2_n_loop_32: \n\t" + "prefetcht0 0x100(%3) \n\t" + "prefetcht0 0x140(%3) \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups 0x20(%3), %%ymm1 \n\t" + "vmovups 0x40(%3), %%ymm2 \n\t" + "vmovups 0x60(%3), %%ymm3 \n\t" + + "prefetcht0 0x100(%1) \n\t" + "prefetcht0 0x140(%1) \n\t" + "vmovups (%1), %%ymm12 \n\t" + "vmovups 0x20(%1), %%ymm13 \n\t" + "vmovups 0x40(%1), %%ymm14 \n\t" + "vmovups 0x60(%1), %%ymm11 \n\t" + "vbroadcastss 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm11, %%ymm3 \n\t" + + "prefetcht0 0x100(%%rdx) \n\t" + "prefetcht0 0x140(%%rdx) \n\t" + "vmovups (%%rdx), %%ymm12 \n\t" + "vmovups 0x20(%%rdx), %%ymm13 \n\t" + "vmovups 0x40(%%rdx), %%ymm14 \n\t" + "vmovups 0x60(%%rdx), %%ymm11 \n\t" + "vbroadcastss 0x4(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm11, %%ymm3 \n\t" + + "vmovups %%ymm0, (%3) \n\t" + "vmovups %%ymm1, 0x20(%3) \n\t" + "vmovups %%ymm2, 0x40(%3) \n\t" + "vmovups %%ymm3, 0x60(%3) \n\t" + + "add $0x80, %1 \n\t" + "add $0x80, %%rdx \n\t" + "add $0x80, %3 \n\t" + + "sub $0x20, %%ecx \n\t" + "cmp $0x20, %%ecx \n\t" + "jge .k2_n_loop_32 \n\t" + + ".align 16 \n\t" + ".k2_n_loop_32_end: \n\t" + "cmp $0x10, %%ecx \n\t" + "jl .k2_n_loop_remain_16_end \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups 0x20(%3), %%ymm1 \n\t" + "vmovups (%1), %%ymm12 \n\t" + "vmovups 0x20(%1), %%ymm13 \n\t" + "vbroadcastss 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vmovups (%%rdx), %%ymm12 \n\t" + "vmovups 0x20(%%rdx), %%ymm13 \n\t" + "vbroadcastss 0x4(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vmovups %%ymm0, (%3) \n\t" + "vmovups %%ymm1, 0x20(%3) \n\t" + + "add $0x40, %1 \n\t" + "add $0x40, %%rdx \n\t" + "add $0x40, %3 \n\t" + "sub $0x10, %%ecx \n\t" + + ".k2_n_loop_remain_16_end: \n\t" + "cmp $0x8, %%ecx \n\t" + "jl .k2_n_loop_remain_8_end \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups (%1), %%ymm12 \n\t" + "vbroadcastss 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vmovups (%%rdx), %%ymm12 \n\t" + "vbroadcastss 0x4(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vmovups %%ymm0, (%3) \n\t" + + "add $0x20, %1 \n\t" + "add $0x20, %%rdx \n\t" + "add $0x20, %3 \n\t" + "sub $0x8, %%ecx \n\t" + + ".align 16 \n\t" + ".k2_n_loop_remain_8_end: \n\t" + "cmp $0x4, %%ecx \n\t" + "jl .k2_n_loop_remain_4_end \n\t" + "vmovups (%3), %%xmm0 \n\t" + "vmovups (%1), %%xmm12 \n\t" + "vbroadcastss 0x0(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovups (%%rdx), %%xmm12 \n\t" + "vbroadcastss 0x4(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + 
"vmovups %%xmm0, (%3) \n\t" + + "add $0x10, %1 \n\t" + "add $0x10, %%rdx \n\t" + "add $0x10, %3 \n\t" + "sub $0x4, %%ecx \n\t" + + ".align 16 \n\t" + ".k2_n_loop_remain_4_end: \n\t" + "cmp $0x2, %%ecx \n\t" + "jl .k2_n_loop_remain_2_end \n\t" + "vmovsd (%3), %%xmm0 \n\t" + "vmovsd (%1), %%xmm12 \n\t" + "vbroadcastss 0x0(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovsd (%%rdx), %%xmm12 \n\t" + "vbroadcastss 0x4(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + "vmovsd %%xmm0, (%3) \n\t" + + "add $0x8, %1 \n\t" + "add $0x8, %%rdx \n\t" + "add $0x8, %3 \n\t" + "sub $0x2, %%ecx \n\t" + + ".align 16 \n\t" + ".k2_n_loop_remain_2_end: \n\t" + "and $0x1, %%ecx \n\t" + "je .k2_n_loop_remain_1_end \n\t" + "vmovss (%3), %%xmm0 \n\t" + "vmovss (%1), %%xmm12 \n\t" + "vmovss 0x0(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vmovss (%%rdx), %%xmm12 \n\t" + "vmovss 0x4(%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + "vmovss %%xmm0, (%3) \n\t" + + ".align 16 \n\t" + ".k2_n_loop_remain_1_end: \n\t" + : + : "r"(N), "r"(matrix), "r"(vector), "r"(result) + : "%eax", "%rax", "%ecx", "%rdx", "%ymm0", "%ymm1", "%ymm2", "%ymm12", + "%ymm13", "%ymm14", "%ymm15", "%xmm0", "%xmm12", "%xmm15", "memory"); +} + +void mvm_col_avx2_1_32(U32 N, F32 *matrix, F32 *vector, F32 *result) +{ + __asm__ __volatile__("mov %0, %%ecx \n\t" + "cmp $0x20, %%ecx \n\t" + "jl .k1_n_loop_32_end \n\t" + ".align 16 \n\t" + ".k1_n_loop_32: \n\t" + "prefetcht0 0x100(%3) \n\t" + "prefetcht0 0x140(%3) \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups 0x20(%3), %%ymm1 \n\t" + "vmovups 0x40(%3), %%ymm2 \n\t" + "vmovups 0x60(%3), %%ymm3 \n\t" + + "prefetcht0 0x100(%1) \n\t" + "prefetcht0 0x140(%1) \n\t" + "vmovups (%1), %%ymm12 \n\t" + "vmovups 0x20(%1), %%ymm13 \n\t" + "vmovups 0x40(%1), %%ymm14 \n\t" + "vmovups 0x60(%1), %%ymm11 \n\t" + "vbroadcastss (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm11, %%ymm3 \n\t" + + "vmovups %%ymm0, (%3) \n\t" + "vmovups %%ymm1, 0x20(%3) \n\t" + "vmovups %%ymm2, 0x40(%3) \n\t" + "vmovups %%ymm3, 0x60(%3) \n\t" + + "add $0x80, %1 \n\t" + "add $0x80, %3 \n\t" + + "sub $0x20, %%ecx \n\t" + "cmp $0x20, %%ecx \n\t" + "jge .k1_n_loop_32 \n\t" + + ".align 16 \n\t" + ".k1_n_loop_32_end: \n\t" + "cmp $0x10, %%ecx \n\t" + "jl .k1_n_loop_remain_16_end \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups 0x20(%3), %%ymm1 \n\t" + "vmovups (%1), %%ymm12 \n\t" + "vmovups 0x20(%1), %%ymm13 \n\t" + "vbroadcastss 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vmovups %%ymm0, (%3) \n\t" + "vmovups %%ymm1, 0x20(%3) \n\t" + + "add $0x40, %1 \n\t" + "add $0x40, %3 \n\t" + "sub $0x10, %%ecx \n\t" + + ".k1_n_loop_remain_16_end: \n\t" + "cmp $0x8, %%ecx \n\t" + "jl .k1_n_loop_remain_8_end \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups (%1), %%ymm12 \n\t" + "vbroadcastss (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vaddps (%3), %%ymm0, %%ymm0 \n\t" + "vmovups %%ymm0, (%3) \n\t" + + "add $0x20, %1 \n\t" + "add $0x20, %3 \n\t" + "sub $0x8, %%ecx \n\t" + + ".align 16 \n\t" + ".k1_n_loop_remain_8_end: \n\t" + "cmp $0x4, %%ecx \n\t" + "jl .k1_n_loop_remain_4_end \n\t" + "vmovups (%3), %%xmm0 \n\t" + "vmovups (%1), %%xmm12 \n\t" + "vbroadcastss (%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vaddps (%3), 
%%xmm0, %%xmm0 \n\t" + "vmovups %%xmm0, (%3) \n\t" + + "add $0x10, %1 \n\t" + "add $0x10, %3 \n\t" + "sub $0x4, %%ecx \n\t" + + ".align 16 \n\t" + ".k1_n_loop_remain_4_end: \n\t" + "cmp $0x2, %%ecx \n\t" + "jl .k1_n_loop_remain_2_end \n\t" + "vmovsd (%3), %%xmm0 \n\t" + "vmovsd (%1), %%xmm12 \n\t" + "vbroadcastss (%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + + "vaddps (%3), %%xmm0, %%xmm0 \n\t" + "vmovsd %%xmm0, (%3) \n\t" + + "add $0x8, %1 \n\t" + "add $0x8, %3 \n\t" + "sub $0x2, %%ecx \n\t" + + ".align 16 \n\t" + ".k1_n_loop_remain_2_end: \n\t" + "and $0x1, %%ecx \n\t" + "je .k1_n_loop_remain_1_end \n\t" + "vmovss (%3), %%xmm0 \n\t" + "vmovss (%1), %%xmm12 \n\t" + "vmovss (%2), %%xmm15 \n\t" + "vfmadd231ps %%xmm15, %%xmm12, %%xmm0 \n\t" + "vmovss %%xmm0, (%3) \n\t" + + ".align 16 \n\t" + ".k1_n_loop_remain_1_end: \n\t" + : + : "r"(N), "r"(matrix), "r"(vector), "r"(result) + : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "%xmm0", "%xmm12", "%xmm15", "memory"); +} + +void mvm_col_fp32(U32 numRows, U32 numColumns, F32 *matrix, F32 *vector, F32 *result) +{ + // Actual layout is KN, and vector is K + U32 blockKSize = 0; + kernel_func kernel[3] = {mvm_col_avx2_1_32, mvm_col_avx2_2_32, mvm_col_avx2_4_32}; + U32 unrollKSize[3] = {1, 2, 4}; + for (U32 bk = 0; bk < numColumns; bk += blockKSize) { + blockKSize = UNI_MIN(numColumns - bk, 4); + blockKSize = unrollKSize[blockKSize >> 1]; + kernel[blockKSize >> 1](numRows, matrix + bk * numRows, vector + bk, result); + } +} diff --git a/compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_row.cpp b/compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_row.cpp new file mode 100644 index 00000000..6030bc9f --- /dev/null +++ b/compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_row.cpp @@ -0,0 +1,540 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
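For orientation before the row-major variant that follows: in scalar form, the column-major path above (mvm_col_fp32 and its kernels) is equivalent to the reference sketch below, written against the repo's U32/F32 typedefs and not part of the patch. Note that the kernels accumulate into result, so the caller is expected to have initialized it.

// Scalar reference for mvm_col_fp32: the matrix is stored K x N, i.e. all
// numRows outputs for a given k are contiguous, and result accumulates.
void mvm_col_reference(U32 numRows, U32 numColumns, const F32 *matrix, const F32 *vector, F32 *result)
{
    for (U32 k = 0; k < numColumns; ++k) {  // numColumns is K
        for (U32 n = 0; n < numRows; ++n) {  // numRows is the output length
            result[n] += vector[k] * matrix[k * numRows + n];
        }
    }
}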
+ +#include "cpu/x86/fp32/blas_fp32.h" +#include "error.h" +#include "types.h" + +#define UNROLL_N 4 +#define BOLCK_K_DIM 512 + +typedef void (*kernel_func)(U32 bk, U32 lda, F32 *matrix, F32 *vector, F32 *result); + +void mvm_row_avx_4_32(U32 bk, U32 lda, F32 *matrix, F32 *vector, F32 *result) +{ + __asm__ __volatile__("vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + + "mov %4, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + "mov %1, %%rdx \n\t" + "add %%rax, %%rdx \n\t" + "mov %%rdx, %%r9 \n\t" + "add %%rax, %%r9 \n\t" + "mov %%r9, %%r10 \n\t" + "add %%rax, %%r10 \n\t" + + "mov %0, %%ecx \n\t" + "shr $5, %%ecx \n\t" + "je .k_loop_32_end \n\t" + ".align 16 \n\t" + ".k_loop_32: \n\t" + + "prefetcht0 0x100(%1) \n\t" + "prefetcht0 0x140(%1) \n\t" + "prefetcht0 0x100(%%rdx) \n\t" + "prefetcht0 0x140(%%rdx) \n\t" + + "vmovups (%2), %%ymm12 \n\t" + "vmovups 0x20(%2), %%ymm13 \n\t" + "vmovups 0x40(%2), %%ymm14 \n\t" + "vmovups 0x60(%2), %%ymm15 \n\t" + + "vmovups (%1), %%ymm4 \n\t" + "vmovups 0x20(%1), %%ymm5 \n\t" + "vmovups 0x40(%1), %%ymm6 \n\t" + "vmovups 0x60(%1), %%ymm7 \n\t" + "vmovups (%%rdx), %%ymm8 \n\t" + "vmovups 0x20(%%rdx), %%ymm9 \n\t" + "vmovups 0x40(%%rdx), %%ymm10 \n\t" + "vmovups 0x60(%%rdx), %%ymm11 \n\t" + "vfmadd231ps %%ymm12, %%ymm4, %%ymm0 \n\t" + "vfmadd231ps %%ymm12, %%ymm8, %%ymm1 \n\t" + "vfmadd231ps %%ymm13, %%ymm5, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm9, %%ymm1 \n\t" + "vfmadd231ps %%ymm14, %%ymm6, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm10, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm7, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm11, %%ymm1 \n\t" + + "prefetcht0 0x100(%%r9) \n\t" + "prefetcht0 0x140(%%r9) \n\t" + "prefetcht0 0x100(%%r10) \n\t" + "prefetcht0 0x140(%%r10) \n\t" + + "vmovups (%%r9), %%ymm4 \n\t" + "vmovups 0x20(%%r9), %%ymm5 \n\t" + "vmovups 0x40(%%r9), %%ymm6 \n\t" + "vmovups 0x60(%%r9), %%ymm7 \n\t" + "vmovups (%%r10), %%ymm8 \n\t" + "vmovups 0x20(%%r10), %%ymm9 \n\t" + "vmovups 0x40(%%r10), %%ymm10 \n\t" + "vmovups 0x60(%%r10), %%ymm11 \n\t" + "vfmadd231ps %%ymm12, %%ymm4, %%ymm2 \n\t" + "vfmadd231ps %%ymm12, %%ymm8, %%ymm3 \n\t" + "vfmadd231ps %%ymm13, %%ymm5, %%ymm2 \n\t" + "vfmadd231ps %%ymm13, %%ymm9, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm6, %%ymm2 \n\t" + "vfmadd231ps %%ymm14, %%ymm10, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm7, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm11, %%ymm3 \n\t" + + "add $0x80, %1 \n\t" + "add $0x80, %2 \n\t" + "add $0x80, %%rdx \n\t" + "add $0x80, %%r9 \n\t" + "add $0x80, %%r10 \n\t" + "sub $1, %%ecx \n\t" + "jg .k_loop_32 \n\t" + + ".align 16 \n\t" + ".k_loop_32_end: \n\t" + "mov %0, %%ecx \n\t" + "and $0x1F, %%ecx \n\t" + "cmp $0x10, %%ecx \n\t" + "jl .k_loop_remain_16_end \n\t" + + "vmovups (%2), %%ymm12 \n\t" + "vmovups 0x20(%2), %%ymm13 \n\t" + + "vmovups (%1), %%ymm4 \n\t" + "vmovups 0x20(%1), %%ymm5 \n\t" + "vmovups (%%rdx), %%ymm8 \n\t" + "vmovups 0x20(%%rdx), %%ymm9 \n\t" + "vmovups (%%r9), %%ymm6 \n\t" + "vmovups 0x20(%%r9), %%ymm7 \n\t" + "vmovups (%%r10), %%ymm10 \n\t" + "vmovups 0x20(%%r10), %%ymm11 \n\t" + "vfmadd231ps %%ymm12, %%ymm4, %%ymm0 \n\t" + "vfmadd231ps %%ymm12, %%ymm8, %%ymm1 \n\t" + "vfmadd231ps %%ymm12, %%ymm6, %%ymm2 \n\t" + "vfmadd231ps %%ymm12, %%ymm10, %%ymm3 \n\t" + "vfmadd231ps %%ymm13, %%ymm5, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm9, %%ymm1 \n\t" + "vfmadd231ps %%ymm13, %%ymm7, %%ymm2 \n\t" + "vfmadd231ps %%ymm13, %%ymm11, %%ymm3 \n\t" + + "add $0x40, %1 
\n\t" + "add $0x40, %2 \n\t" + "add $0x40, %%rdx \n\t" + "add $0x40, %%r9 \n\t" + "add $0x40, %%r10 \n\t" + "sub $0x10, %%ecx \n\t" + + ".align 16 \n\t" + ".k_loop_remain_16_end: \n\t" + "cmp $0x8, %%ecx \n\t" + "jl .k_loop_remain_8_end \n\t" + "vmovups (%2), %%ymm12 \n\t" + "vmovups (%1), %%ymm4 \n\t" + "vmovups (%%rdx), %%ymm6 \n\t" + "vmovups (%%r9), %%ymm8 \n\t" + "vmovups (%%r10), %%ymm10 \n\t" + "vfmadd231ps %%ymm4, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm6, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm8, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm10, %%ymm12, %%ymm3 \n\t" + "add $0x20, %1 \n\t" + "add $0x20, %2 \n\t" + "add $0x20, %%rdx \n\t" + "add $0x20, %%r9 \n\t" + "add $0x20, %%r10 \n\t" + "sub $0x8, %%ecx \n\t" + + ".align 16 \n\t" + ".k_loop_remain_8_end: \n\t" + "vperm2f128 $0x1, %%ymm0, %%ymm0, %%ymm12 \n\t" + "vperm2f128 $0x1, %%ymm1, %%ymm1, %%ymm13 \n\t" + "vperm2f128 $0x1, %%ymm2, %%ymm2, %%ymm14 \n\t" + "vperm2f128 $0x1, %%ymm3, %%ymm3, %%ymm15 \n\t" + "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vaddps %%ymm13, %%ymm1, %%ymm1 \n\t" + "vaddps %%ymm14, %%ymm2, %%ymm2 \n\t" + "vaddps %%ymm15, %%ymm3, %%ymm3 \n\t" + + "cmp $0x4, %%ecx \n\t" + "jl .k_loop_remain_4_end \n\t" + "vmovups (%2), %%xmm12 \n\t" + "vmovups (%1), %%xmm4 \n\t" + "vmovups (%%rdx), %%xmm6 \n\t" + "vmovups (%%r9), %%xmm8 \n\t" + "vmovups (%%r10), %%xmm10 \n\t" + "vfmadd231ps %%xmm4, %%xmm12, %%xmm0 \n\t" + "vfmadd231ps %%xmm6, %%xmm12, %%xmm1 \n\t" + "vfmadd231ps %%xmm8, %%xmm12, %%xmm2 \n\t" + "vfmadd231ps %%xmm10,%%xmm12, %%xmm3 \n\t" + "add $0x10, %1 \n\t" + "add $0x10, %2 \n\t" + "add $0x10, %%rdx \n\t" + "add $0x10, %%r9 \n\t" + "add $0x10, %%r10 \n\t" + "sub $0x4, %%ecx \n\t" + + ".align 16 \n\t" + ".k_loop_remain_4_end: \n\t" + "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t" + + "cmp $0x2, %%ecx \n\t" + "jl .k_loop_remain_2_end \n\t" + "vxorps %%xmm12, %%xmm12, %%xmm12 \n\t" + "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorps %%xmm8, %%xmm8, %%xmm8 \n\t" + "vxorps %%xmm10, %%xmm10, %%xmm10 \n\t" + "vmovsd (%2), %%xmm12 \n\t" + "vmovsd (%1), %%xmm4 \n\t" + "vmovsd (%%rdx), %%xmm6 \n\t" + "vmovsd (%%r9), %%xmm8 \n\t" + "vmovsd (%%r10), %%xmm10 \n\t" + "vfmadd231ps %%xmm4, %%xmm12, %%xmm0 \n\t" + "vfmadd231ps %%xmm6, %%xmm12, %%xmm1 \n\t" + "vfmadd231ps %%xmm8, %%xmm12, %%xmm2 \n\t" + "vfmadd231ps %%xmm10, %%xmm12, %%xmm3 \n\t" + "add $0x8, %1 \n\t" + "add $0x8, %2 \n\t" + "add $0x8, %%rdx \n\t" + "add $0x8, %%r9 \n\t" + "add $0x8, %%r10 \n\t" + "sub $0x2, %%ecx \n\t" + + ".align 16 \n\t" + ".k_loop_remain_2_end: \n\t" + "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" + "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t" + "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t" + + "and $0x1, %%ecx \n\t" + "je .k_loop_remain_1_end \n\t" + "vxorps %%xmm12,%%xmm12, %%xmm12 \n\t" + "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vxorps %%xmm8, %%xmm8, %%xmm8 \n\t" + "vxorps %%xmm10, %%xmm10, %%xmm10 \n\t" + "vmovss (%2), %%xmm12 \n\t" + "vmovss (%1), %%xmm4 \n\t" + "vmovss (%%rdx), %%xmm6 \n\t" + "vmovss (%%r9), %%xmm8 \n\t" + "vmovss (%%r10), %%xmm10 \n\t" + "vfmadd231ps %%xmm4, %%xmm12, %%xmm0 \n\t" + "vfmadd231ps %%xmm6, %%xmm12, %%xmm1 \n\t" + "vfmadd231ps %%xmm8, %%xmm12, %%xmm2 \n\t" + "vfmadd231ps %%xmm10, %%xmm12, %%xmm3 \n\t" + + ".align 16 \n\t" + ".k_loop_remain_1_end: \n\t" + "vaddps (%3), %%xmm0, %%xmm0 \n\t" + "vmovss 
%%xmm0, (%3) \n\t" + "vaddps 0x4(%3), %%xmm1, %%xmm1 \n\t" + "vmovss %%xmm1, 0x4(%3) \n\t" + "vaddps 0x8(%3), %%xmm2, %%xmm2 \n\t" + "vmovss %%xmm2, 0x8(%3) \n\t" + "vaddps 0xC(%3), %%xmm3, %%xmm3 \n\t" + "vmovss %%xmm3, 0xC(%3) \n\t" + : + : "r"(bk), "r"(matrix), "r"(vector), "r"(result), "r"(lda) + : "%eax", "%rax", "%ecx", "%rdx", "%r9", "%r10", "%ymm0", "%ymm1", "%ymm2", + "%ymm3", "%ymm4", "%ymm6", "%ymm8", "%ymm10", "%ymm12", "%ymm13", "%ymm14", + "%ymm15", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm6", "%xmm8", + "%xmm10", "%xmm12", "memory"); +} + +void mvm_row_avx_2_32(U32 bk, U32 lda, F32 *matrix, F32 *vector, F32 *result) +{ + __asm__ __volatile__("vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + + "mov %4, %%eax \n\t" + "shl $2, %%eax \n\t" + "mov %%eax, %%eax \n\t" + "mov %1, %%rdx \n\t" + "add %%rax, %%rdx \n\t" + + "mov %0, %%ecx \n\t" + "shr $5, %%ecx \n\t" + "je .n2_k_loop_32_end \n\t" + ".align 16 \n\t" + ".n2_k_loop_32: \n\t" + + "prefetcht0 0x100(%1) \n\t" + "prefetcht0 0x140(%1) \n\t" + "prefetcht0 0x100(%%rdx) \n\t" + "prefetcht0 0x140(%%rdx) \n\t" + + "vmovups (%2), %%ymm12 \n\t" + "vmovups 0x20(%2), %%ymm13 \n\t" + "vmovups 0x40(%2), %%ymm14 \n\t" + "vmovups 0x60(%2), %%ymm15 \n\t" + + "vmovups (%1), %%ymm4 \n\t" + "vmovups 0x20(%1), %%ymm5 \n\t" + "vmovups 0x40(%1), %%ymm6 \n\t" + "vmovups 0x60(%1), %%ymm7 \n\t" + "vmovups (%%rdx), %%ymm8 \n\t" + "vmovups 0x20(%%rdx), %%ymm9 \n\t" + "vmovups 0x40(%%rdx), %%ymm10 \n\t" + "vmovups 0x60(%%rdx), %%ymm11 \n\t" + "vfmadd231ps %%ymm12, %%ymm4, %%ymm0 \n\t" + "vfmadd231ps %%ymm12, %%ymm8, %%ymm1 \n\t" + "vfmadd231ps %%ymm13, %%ymm5, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm9, %%ymm1 \n\t" + "vfmadd231ps %%ymm14, %%ymm6, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm10, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm7, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm11, %%ymm1 \n\t" + + "add $0x80, %1 \n\t" + "add $0x80, %2 \n\t" + "add $0x80, %%rdx \n\t" + "sub $1, %%ecx \n\t" + "jg .n2_k_loop_32 \n\t" + + ".align 16 \n\t" + ".n2_k_loop_32_end: \n\t" + "mov %0, %%ecx \n\t" + "and $0x1F, %%ecx \n\t" + "cmp $0x10, %%ecx \n\t" + "jl .n2_k_loop_remain_16_end \n\t" + + "vmovups (%2), %%ymm12 \n\t" + "vmovups 0x20(%2), %%ymm13 \n\t" + + "vmovups (%1), %%ymm4 \n\t" + "vmovups 0x20(%1), %%ymm5 \n\t" + "vmovups (%%rdx), %%ymm8 \n\t" + "vmovups 0x20(%%rdx), %%ymm9 \n\t" + "vfmadd231ps %%ymm12, %%ymm4, %%ymm0 \n\t" + "vfmadd231ps %%ymm12, %%ymm8, %%ymm1 \n\t" + "vfmadd231ps %%ymm13, %%ymm5, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm9, %%ymm1 \n\t" + + "add $0x40, %1 \n\t" + "add $0x40, %2 \n\t" + "add $0x40, %%rdx \n\t" + "sub $0x10, %%ecx \n\t" + + ".align 16 \n\t" + ".n2_k_loop_remain_16_end: \n\t" + "cmp $0x8, %%ecx \n\t" + "jl .n2_k_loop_remain_8_end \n\t" + "vmovups (%2), %%ymm12 \n\t" + "vmovups (%1), %%ymm4 \n\t" + "vmovups (%%rdx), %%ymm6 \n\t" + "vfmadd231ps %%ymm4, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm6, %%ymm12, %%ymm1 \n\t" + "add $0x20, %1 \n\t" + "add $0x20, %2 \n\t" + "add $0x20, %%rdx \n\t" + "sub $0x8, %%ecx \n\t" + + ".align 16 \n\t" + ".n2_k_loop_remain_8_end: \n\t" + "vperm2f128 $0x1, %%ymm0, %%ymm0, %%ymm12 \n\t" + "vperm2f128 $0x1, %%ymm1, %%ymm1, %%ymm13 \n\t" + "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vaddps %%ymm13, %%ymm1, %%ymm1 \n\t" + + "cmp $0x4, %%ecx \n\t" + "jl .n2_k_loop_remain_4_end \n\t" + "vmovups (%2), %%xmm12 \n\t" + "vmovups (%1), %%xmm4 \n\t" + "vmovups (%%rdx), %%xmm6 \n\t" + "vfmadd231ps %%xmm4, %%xmm12, %%xmm0 \n\t" + "vfmadd231ps %%xmm6, %%xmm12, %%xmm1 \n\t" + 
"add $0x10, %1 \n\t" + "add $0x10, %2 \n\t" + "add $0x10, %%rdx \n\t" + "sub $0x4, %%ecx \n\t" + + ".align 16 \n\t" + ".n2_k_loop_remain_4_end: \n\t" + "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" + + "cmp $0x2, %%ecx \n\t" + "jl .n2_k_loop_remain_2_end \n\t" + "vxorps %%xmm12, %%xmm12, %%xmm12 \n\t" + "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vmovsd (%2), %%xmm12 \n\t" + "vmovsd (%1), %%xmm4 \n\t" + "vmovsd (%%rdx), %%xmm6 \n\t" + "vfmadd231ps %%xmm4, %%xmm12, %%xmm0 \n\t" + "vfmadd231ps %%xmm6, %%xmm12, %%xmm1 \n\t" + "add $0x8, %1 \n\t" + "add $0x8, %2 \n\t" + "add $0x8, %%rdx \n\t" + "sub $0x2, %%ecx \n\t" + + ".align 16 \n\t" + ".n2_k_loop_remain_2_end: \n\t" + "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" + "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" + "and $1, %%ecx \n\t" + "je .n2_k_loop_remain_1_end \n\t" + "vxorps %%xmm12, %%xmm12, %%xmm12 \n\t" + "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" + "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" + "vmovss (%2), %%xmm12 \n\t" + "vmovss (%1), %%xmm4 \n\t" + "vmovss (%%rdx), %%xmm6 \n\t" + "vfmadd231ps %%xmm4, %%xmm12, %%xmm0 \n\t" + "vfmadd231ps %%xmm6, %%xmm12, %%xmm1 \n\t" + + ".align 16 \n\t" + ".n2_k_loop_remain_1_end: \n\t" + "vaddps (%3), %%xmm0, %%xmm0 \n\t" + "vmovss %%xmm0, (%3) \n\t" + "vaddps 0x4(%3), %%xmm1, %%xmm1 \n\t" + "vmovss %%xmm1, 0x4(%3) \n\t" + : + : "r"(bk), "r"(matrix), "r"(vector), "r"(result), "r"(lda) + : "%eax", "%rax", "%ecx", "%rdx", "%r9", "%r10", "%ymm0", "%ymm1", "%ymm4", + "%ymm6", "%ymm12", "%ymm13", "%xmm0", "%xmm1", "%xmm4", "%xmm6", "%xmm12", + "%xmm13", "memory"); +} + +void mvm_row_avx_1_32(U32 bk, U32 lda, F32 *matrix, F32 *vector, F32 *result) +{ + __asm__ __volatile__("vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + + "mov %0, %%ecx \n\t" + "shr $5, %%ecx \n\t" + "je .n1_k_loop_32_end \n\t" + ".align 16 \n\t" + ".n1_k_loop_32: \n\t" + + "prefetcht0 0x100(%1) \n\t" + "prefetcht0 0x140(%1) \n\t" + + "vmovups (%2), %%ymm12 \n\t" + "vmovups 0x20(%2), %%ymm13 \n\t" + "vmovups 0x40(%2), %%ymm14 \n\t" + "vmovups 0x60(%2), %%ymm15 \n\t" + + "vmovups (%1), %%ymm4 \n\t" + "vmovups 0x20(%1), %%ymm5 \n\t" + "vmovups 0x40(%1), %%ymm6 \n\t" + "vmovups 0x60(%1), %%ymm7 \n\t" + "vfmadd231ps %%ymm12, %%ymm4, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm5, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm6, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm7, %%ymm0 \n\t" + + "add $0x80, %1 \n\t" + "add $0x80, %2 \n\t" + + "sub $1, %%ecx \n\t" + "jg .n1_k_loop_32 \n\t" + ".align 16 \n\t" + ".n1_k_loop_32_end: \n\t" + "mov %0, %%ecx \n\t" + "and $0x1F, %%ecx \n\t" + "cmp $0x10, %%ecx \n\t" + "jl .n1_k_loop_remain_16_end \n\t" + + "vmovups (%2), %%ymm12 \n\t" + "vmovups 0x20(%2), %%ymm13 \n\t" + + "vmovups (%1), %%ymm4 \n\t" + "vmovups 0x20(%1), %%ymm5 \n\t" + "vfmadd231ps %%ymm12, %%ymm4, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm5, %%ymm0 \n\t" + + "add $0x40, %1 \n\t" + "add $0x40, %2 \n\t" + "sub $0x10, %%ecx \n\t" + + ".align 16 \n\t" + ".n1_k_loop_remain_16_end: \n\t" + "cmp $0x8, %%ecx \n\t" + "jl .n1_k_loop_remain_8_end \n\t" + "vmovups (%2), %%ymm12 \n\t" + "vmovups (%1), %%ymm4 \n\t" + "vfmadd231ps %%ymm4, %%ymm12, %%ymm0 \n\t" + "add $0x20, %1 \n\t" + "add $0x20, %2 \n\t" + "sub $0x8, %%ecx \n\t" + + ".align 16 \n\t" + ".n1_k_loop_remain_8_end: \n\t" + "vperm2f128 $0x1, %%ymm0, %%ymm0, %%ymm13 \n\t" + "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" + + "cmp $0x4, %%ecx \n\t" + "jl .n1_k_loop_remain_4_end \n\t" + "vmovups (%2), %%xmm12 \n\t" + "vmovups (%1), %%xmm4 \n\t" + "vfmadd231ps %%xmm4, %%xmm12, 
%%xmm0 \n\t" + "add $0x10, %1 \n\t" + "add $0x10, %2 \n\t" + "sub $0x4, %%ecx \n\t" + + ".align 16 \n\t" + ".n1_k_loop_remain_4_end: \n\t" + "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" + + "cmp $0x2, %%ecx \n\t" + "jl .n1_k_loop_remain_2_end \n\t" + "vxorps %%ymm12, %%ymm12, %%ymm12 \n\t" + "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" + "vmovsd (%2), %%xmm12 \n\t" + "vmovsd (%1), %%xmm4 \n\t" + "vfmadd231ps %%xmm4, %%xmm12, %%xmm0 \n\t" + "add $0x8, %1 \n\t" + "add $0x8, %2 \n\t" + "sub $0x2, %%ecx \n\t" + + ".align 16 \n\t" + ".n1_k_loop_remain_2_end: \n\t" + "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" + "and $1, %%ecx \n\t" + "je .n1_k_loop_remain_1_end \n\t" + "vxorps %%ymm12, %%ymm12, %%ymm12 \n\t" + "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" + "vmovss (%2), %%xmm12 \n\t" + "vmovss (%1), %%xmm4 \n\t" + "vfmadd231ps %%xmm4, %%xmm12, %%xmm0 \n\t" + + ".align 16 \n\t" + ".n1_k_loop_remain_1_end: \n\t" + "vaddps (%3), %%xmm0, %%xmm0 \n\t" + "vmovss %%xmm0, (%3) \n\t" + : + : "r"(bk), "r"(matrix), "r"(vector), "r"(result), "r"(lda) + : "%eax", "%rax", "%ecx", "%rdx", "%r9", "%r10", "%ymm0", "%ymm4", + "%ymm12", "%ymm13", "%xmm0", "%xmm4", "%xmm12", "memory"); +} + +void mvm_row_fp32(U32 numRows, U32 numColumns, F32 *matrix, F32 *vector, F32 *result) +{ + // Actual layout is NK, and vector is K + U32 blockKSize = 0, blockNSize = 0; + kernel_func kernel[3] = {mvm_row_avx_1_32, mvm_row_avx_2_32, mvm_row_avx_4_32}; + U32 unrollNSize[3] = {1, 2, 4}; + for (U32 bk = 0; bk < numColumns; bk += blockKSize) { + blockKSize = UNI_MIN(numColumns - bk, BOLCK_K_DIM); + for (U32 bn = 0; bn < numRows; bn += blockNSize) { + blockNSize = UNI_MIN(numRows - bn, UNROLL_N); + blockNSize = unrollNSize[blockNSize >> 1]; + kernel[blockNSize >> 1]( + blockKSize, numColumns, matrix + bn * numColumns + bk, vector + bk, result + bn); + } + } +} \ No newline at end of file diff --git a/compute/blas_enhance/src/cpu/x86/mmm.cpp b/compute/blas_enhance/src/cpu/x86/mmm.cpp new file mode 100644 index 00000000..b1f7e436 --- /dev/null +++ b/compute/blas_enhance/src/cpu/x86/mmm.cpp @@ -0,0 +1,130 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "error.h" +#include "types.h" +#include "blas_enhance.h" +#include "cpu/x86/blas_x86.h" +#ifdef _USE_FP32 +#include "cpu/x86/fp32/blas_fp32.h" +#endif + +EE matrix_matrix_multiply_tmp_bytes_x86( + U32 matrixA_M, U32 matrixA_K, U32 matrixB_K, U32 matrixB_N, DataType dt, U32 *bytes) +{ + EE ret = SUCCESS; + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: { + matrix_matrix_multiply_tmp_bytes_fp32( + matrixA_M, matrixA_K, matrixB_K, matrixB_N, dt, bytes); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + + return ret; +} + +static EE matrix_matrix_multiply_transform_rhsN( + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst) +{ + EE ret = SUCCESS; + switch (desc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = matrix_matrix_multiply_transform_rhsN_fp32(desc, (F32 *)src, (F32 *)dst); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + (*descTran) = desc; + (*descTran).df = targetFormat4MatrixB(desc.dt); + return ret; +} + +static EE matrix_matrix_multiply_transform_rhsT( + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst) +{ + EE ret = SUCCESS; + switch (desc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = matrix_matrix_multiply_transform_rhsT_fp32(desc, (F32 *)src, (F32 *)dst); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + (*descTran) = desc; + (*descTran).df = targetFormat4MatrixB(desc.dt); + std::swap((*descTran).dims[0], (*descTran).dims[1]); + return ret; +} + +EE matrix_matrix_multiply_transform_rhs_x86( + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst) +{ + if (desc.df == targetFormat4MatrixB(desc.dt)) { + return SUCCESS; + } + EE ret = SUCCESS; + switch (desc.df) { + case DF_NORMAL: { + ret = matrix_matrix_multiply_transform_rhsN(desc, src, descTran, dst); + break; + } + case DF_TRANSPOSE: { + ret = matrix_matrix_multiply_transform_rhsT(desc, src, descTran, dst); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE mmm_x86(U32 matrixC_N, + U32 matrixC_M, + U32 matrixA_K, + DataType dt, + bool transposeA, + const void *matrixAData, + const void *matrixBData, + void *tmp, + void *matrixCData) +{ + EE ret = SUCCESS; + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = mmm_avx2_fp32(matrixC_N, matrixC_M, matrixA_K, transposeA, (F32 *)matrixAData, + (F32 *)matrixBData, (F32 *)tmp, (F32 *)matrixCData); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/blas_enhance/src/cpu/x86/mvm.cpp b/compute/blas_enhance/src/cpu/x86/mvm.cpp new file mode 100644 index 00000000..740d793f --- /dev/null +++ b/compute/blas_enhance/src/cpu/x86/mvm.cpp @@ -0,0 +1,55 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "error.h" +#include "types.h" +#include "blas_enhance.h" +#include "cpu/x86/blas_x86.h" +#ifdef _USE_FP32 +#include "cpu/x86/fp32/blas_fp32.h" +#endif + +EE matrix_vector_multiply_tmp_bytes_x86(bool transpose, DataType dt, U32 *bytes) +{ + if (nullptr == bytes) { + CHECK_STATUS(NULL_POINTER); + } + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: + *bytes = 0; + break; +#endif + default: + break; + } + return SUCCESS; +} + +EE mvm_x86( + U32 row, U32 col, DataType dt, bool transpose, const void *matrix, const void *vector, void *result) +{ + EE ret = SUCCESS; + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = mvm_avx2_fp32(row, col, transpose, (F32 *)matrix, (F32 *)vector, (F32 *)result); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/blas_enhance/src/mmm.cpp b/compute/blas_enhance/src/mmm.cpp new file mode 100644 index 00000000..8a7b7612 --- /dev/null +++ b/compute/blas_enhance/src/mmm.cpp @@ -0,0 +1,167 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
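The architecture dispatchers in the file that follows (together with the x86 mmm/mvm wrappers above) are easiest to read with the intended call sequence in mind. A hedged usage sketch: tensor2df, tensorNumBytes, CHECK_STATUS and the U8/F32 types are taken from the repo's common headers, and the packed-B buffer sizing here is illustrative only:

#include <cstdlib>
#include <cstring>
#include "blas_enhance.h"

void mmm_example(U32 M, U32 N, U32 K, F32 *A, F32 *B, F32 *C, Arch arch)
{
    TensorDesc aDesc = tensor2df(DT_F32, DF_NORMAL, M, K);
    TensorDesc bDesc = tensor2df(DT_F32, DF_NORMAL, K, N);
    TensorDesc cDesc = tensor2df(DT_F32, DF_NORMAL, M, N);
    U32 bytes = 0;
    CHECK_STATUS(matrix_matrix_multiply_tmp_bytes(aDesc, bDesc, &bytes, arch));
    U8 *tmp = (U8 *)malloc(bytes);
    // Pack/transform B once; a real caller should query the exact packed size.
    U8 *bPacked = (U8 *)malloc(tensorNumBytes(bDesc) + 64);
    TensorDesc bTranDesc;
    CHECK_STATUS(matrix_matrix_multiply_transform_rhs(bDesc, B, &bTranDesc, bPacked, arch));
    memset(C, 0, M * N * sizeof(F32));  // the kernels accumulate into C
    CHECK_STATUS(matrix_matrix_multiply(aDesc, A, bTranDesc, bPacked, bytes, tmp, cDesc, C, arch));
    free(bPacked);
    free(tmp);
}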
+ +#include "blas_enhance.h" +#ifdef _USE_GENERAL +#include "cpu/general/blas_general.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/blas_arm.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/blas_x86.h" +#endif + +EE matrix_matrix_multiply_tmp_bytes( + TensorDesc matrixADesc, TensorDesc matrixBDesc, U32 *bytes, Arch arch) +{ + DataType matrixADataType, matrixBDataType; + DataFormat matrixADataFormat, matrixBDataFormat; + U32 matrixA_M, matrixA_K, matrixB_K, matrixB_N; + CHECK_STATUS( + tensor2dGet(matrixADesc, &matrixADataType, &matrixADataFormat, &matrixA_M, &matrixA_K)); + CHECK_STATUS( + tensor2dGet(matrixBDesc, &matrixBDataType, &matrixBDataFormat, &matrixB_K, &matrixB_N)); + if (matrixBDesc.df == DF_TRANSPOSE) { + std::swap(matrixB_K, matrixB_N); + } + + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = SUCCESS; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = matrix_matrix_multiply_tmp_bytes_x86( + matrixA_M, matrixA_K, matrixB_K, matrixB_N, matrixADataType, bytes); +#endif +#ifdef _USE_NEON + } else { + ret = matrix_matrix_multiply_tmp_bytes_arm( + matrixA_M, matrixA_K, matrixB_K, matrixB_N, matrixADataType, bytes); +#endif + } + return ret; +} + +EE matrix_matrix_multiply_transform_rhs( + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst, Arch arch) +{ + EE ret = NOT_SUPPORTED; +#ifdef _USE_NEON + if (IS_ARM(arch)) { + ret = matrix_matrix_multiply_transform_rhs_arm(desc, src, descTran, dst); + } +#endif +#ifdef _USE_GENERAL + if (IS_GENERAL(arch)) { + memcpy(dst, src, tensorNumBytes(desc)); + (*descTran) = desc; + ret = SUCCESS; + } +#endif +#ifdef _USE_X86 + if (IS_X86_AVX2(arch)) { + ret = matrix_matrix_multiply_transform_rhs_x86(desc, src, descTran, dst); + } +#endif + return ret; +} + +EE matrix_matrix_multiply(TensorDesc matrixADesc, + const void *matrixAData, + TensorDesc matrixBDesc, + const void *matrixBData, + U32 bytes, + void *tmp, + TensorDesc matrixCDesc, + void *matrixCData, + Arch arch) +{ + if (bytes != 0 && tmp == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (nullptr == matrixAData || nullptr == matrixBData || nullptr == matrixCData) { + CHECK_STATUS(NULL_POINTER); + } + + DataType matrixADataType, matrixBDataType, matrixCDataType; + DataFormat matrixADataFormat, matrixBDataFormat, matrixCDataFormat; + U32 matrixA_M, matrixA_K, matrixB_K, matrixB_N, matrixC_M, matrixC_N; + CHECK_STATUS( + tensor2dGet(matrixADesc, &matrixADataType, &matrixADataFormat, &matrixA_M, &matrixA_K)); + CHECK_STATUS( + tensor2dGet(matrixBDesc, &matrixBDataType, &matrixBDataFormat, &matrixB_K, &matrixB_N)); + CHECK_STATUS( + tensor2dGet(matrixCDesc, &matrixCDataType, &matrixCDataFormat, &matrixC_M, &matrixC_N)); + + if (matrixADataType != matrixBDataType) { + CHECK_STATUS(NOT_MATCH); + } + if (matrixADataType != matrixCDataType) { + if (matrixADataType != DT_I8 || matrixCDataType != DT_I32) { + CHECK_STATUS(NOT_MATCH); + } + } + + bool transposeA = false, transposeB = false; + if (matrixADataFormat == DF_TRANSPOSE) { + std::swap(matrixA_M, matrixA_K); + transposeA = true; + } + if (matrixBDataFormat == DF_TRANSPOSE) { + std::swap(matrixB_K, matrixB_N); + transposeB = true; + } + if (matrixA_M != matrixC_M || matrixB_N != matrixC_N || matrixA_K != matrixB_K) { + CHECK_STATUS(NOT_MATCH); + } + + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = mmm_general(matrixC_N, matrixC_M, matrixA_K, transposeA, transposeB, matrixADataType, + matrixAData, matrixBData, matrixCData); +#endif +#ifdef _USE_X86 + } 
else if (IS_X86_AVX2(arch)) { + TensorDesc tranDescB; + U8 *dataB = (U8 *)matrixBData; + if (matrixBDataFormat != targetFormat4MatrixB(matrixBDataType)) { + dataB = ((U8 *)tmp) + matrixA_M * matrixA_K * bytesOf(matrixADataType); + ret = matrix_matrix_multiply_transform_rhs_x86( + matrixBDesc, matrixBData, &tranDescB, dataB); + } + ret = mmm_x86(matrixC_N, matrixC_M, matrixA_K, matrixADataType, transposeA, matrixAData, + dataB, tmp, matrixCData); +#endif +#ifdef _USE_NEON + } else { + TensorDesc tranDescB; + U8 *dataB = (U8 *)matrixBData; + if (matrixBDataFormat != targetFormat4MatrixB(matrixBDataType)) { + U32 K = matrixA_K; + if (DT_I8 == matrixADataType) { + K = pad_to_4_multiple(K); + } + dataB = ((U8 *)tmp) + matrixA_M * K * bytesOf(matrixADataType); + ret = matrix_matrix_multiply_transform_rhs_arm( + matrixBDesc, matrixBData, &tranDescB, dataB); + } + ret = mmm_arm(matrixC_N, matrixC_M, matrixA_K, matrixADataType, transposeA, matrixAData, + dataB, tmp, matrixCData, arch); +#endif + } + return ret; +} diff --git a/blas-enhance/src/mvm.cpp b/compute/blas_enhance/src/mvm.cpp similarity index 51% rename from blas-enhance/src/mvm.cpp rename to compute/blas_enhance/src/mvm.cpp index bcdddeb1..518b5e02 100644 --- a/blas-enhance/src/mvm.cpp +++ b/compute/blas_enhance/src/mvm.cpp @@ -1,48 +1,77 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
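+// Architecture dispatch for the matrix-vector multiply API. For INT8 on ARM the
+// weight matrix can be packed once with matrix_vector_multiply_transform_weight
+// and the packed tensor reused for every call, as
+// compute/blas_enhance/tests/test_mvm_int8.cpp does.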
- -#include "blas-enhance.h" +#include "blas_enhance.h" #ifdef _USE_GENERAL #include "cpu/general/blas_general.h" #endif #ifdef _USE_NEON #include "cpu/arm/blas_arm.h" #endif +#ifdef _USE_X86 +#include "cpu/x86/blas_x86.h" +#endif - -EE matrix_vector_multiply_tmp_bytes(TensorDesc matrixDesc, TensorDesc vectorDesc, U32* bytes, Arch arch) +EE matrix_vector_multiply_tmp_bytes( + TensorDesc matrixDesc, TensorDesc vectorDesc, U32 *bytes, Arch arch) { UNUSED(vectorDesc); bool transpose = (matrixDesc.df == DF_TRANSPOSE); EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { + if (IS_GENERAL(arch)) { #ifdef _USE_GENERAL ret = SUCCESS; #endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = matrix_vector_multiply_tmp_bytes_x86(transpose, matrixDesc.dt, bytes); +#endif #ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { + } else if (IS_ARM(arch)) { ret = matrix_vector_multiply_tmp_bytes_arm(transpose, matrixDesc.dt, bytes); #endif } return ret; } -EE matrix_vector_multiply(TensorDesc matrixDesc, const void* matrix, - TensorDesc vectorDesc, const void* vector, - U32 bytes, void* tmp, - TensorDesc resultDesc, void* result, +EE matrix_vector_multiply_transform_weight( + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst, Arch arch) +{ + EE ret = NOT_SUPPORTED; +#ifdef _USE_NEON + if (IS_ARM(arch)) { + ret = matrix_vector_multiply_transform_weight_arm(desc, src, descTran, dst); + } +#endif +#ifdef _USE_GENERAL + if (IS_GENERAL(arch)) { + memcpy(dst, src, tensorNumBytes(desc)); + (*descTran) = desc; + ret = SUCCESS; + } +#endif + return ret; +} + +EE matrix_vector_multiply(TensorDesc matrixDesc, + const void *matrix, + TensorDesc vectorDesc, + const void *vector, + U32 bytes, + void *tmp, + TensorDesc resultDesc, + void *result, Arch arch) { if (bytes != 0 && tmp == nullptr) { @@ -52,30 +81,44 @@ EE matrix_vector_multiply(TensorDesc matrixDesc, const void* matrix, CHECK_STATUS(NULL_POINTER); } DataType matrixDataType, vectorDataType, resultDataType; - DataFormat matrixDataFormat; + DataFormat matrixDataFormat, vectorDataFormat, resultDataFormat; U32 matrixRow, matrixColumn, vectorColumn, resultColumn; - CHECK_STATUS(tensor2dfGet(matrixDesc, &matrixDataType, &matrixDataFormat, &matrixRow, &matrixColumn)); - CHECK_STATUS(tensor1dGet(vectorDesc, &vectorDataType, &vectorColumn)); - CHECK_STATUS(tensor1dGet(resultDesc, &resultDataType, &resultColumn)); + CHECK_STATUS( + tensor2dGet(matrixDesc, &matrixDataType, &matrixDataFormat, &matrixRow, &matrixColumn)); + CHECK_STATUS(tensor1dGet(vectorDesc, &vectorDataType, &vectorDataFormat, &vectorColumn)); + CHECK_STATUS(tensor1dGet(resultDesc, &resultDataType, &resultDataFormat, &resultColumn)); - if (matrixDataType != vectorDataType) + if (matrixDataType != vectorDataType) { CHECK_STATUS(NOT_MATCH); - if (matrixDataType != resultDataType) - if (matrixDataType != DT_I8 || resultDataType != DT_I32) + } + if (matrixDataType != resultDataType) { + if (matrixDataType != DT_I8 || resultDataType != DT_I32) { CHECK_STATUS(NOT_MATCH); + } + } - if (matrixRow != resultColumn || matrixColumn != vectorColumn) + bool transpose = (matrixDataFormat == DF_TRANSPOSE); + if (transpose) { + std::swap(matrixRow, matrixColumn); + } + if (matrixRow != resultColumn || matrixColumn != vectorColumn) { CHECK_STATUS(NOT_MATCH); + } - bool transpose = (matrixDataFormat == DF_TRANSPOSE); EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { + if (IS_GENERAL(arch)) { #ifdef _USE_GENERAL - ret = mvm_general(matrixRow, 
matrixColumn, matrixDataType, transpose, matrix, vector, result);
+        ret =
+            mvm_general(matrixRow, matrixColumn, matrixDataType, transpose, matrix, vector, result);
+#endif
+#ifdef _USE_X86
+    } else if (IS_X86_AVX2(arch)) {
+        ret = mvm_x86(matrixRow, matrixColumn, matrixDataType, transpose, matrix, vector, result);
 #endif
 #ifdef _USE_NEON
     } else {
-        ret = mvm_arm(matrixRow, matrixColumn, matrixDataType, transpose, matrix, vector, tmp, result, arch);
+        ret = mvm_arm(matrixRow, matrixColumn, matrixDataType, matrixDataFormat, matrix, vector,
+            tmp, result, arch);
 #endif
     }
     return ret;
diff --git a/tests/test_mmm.cpp b/compute/blas_enhance/tests/test_mmm.cpp
rename from tests/test_mmm.cpp
rename to compute/blas_enhance/tests/test_mmm.cpp
--- a/tests/test_mmm.cpp
+++ b/compute/blas_enhance/tests/test_mmm.cpp
-#include "blas-enhance.h"
+#include "blas_enhance.h"
 #include "ut_util.h"
-int mmmTest(int argc, char** argv, DataType dt)
+int mmmTest(int argc, char **argv, DataType dt)
 {
     CHECK_REQUIREMENT(argc == 4);
     U32 m = atoi(argv[1]);
     U32 k = atoi(argv[2]);
     U32 n = atoi(argv[3]);
-    TensorDesc A_desc = tensor2df(dt, DF_NORMAL, m, k);
+    TensorDesc A_desc = tensor2df(dt, DF_TRANSPOSE, k, m);
     TensorDesc B_desc = tensor2df(dt, DF_NORMAL, k, n);
     TensorDesc tranDescB;
     TensorDesc C_desc = tensor2df(dt, DF_NORMAL, m, n);
     U32 bytes = 0;
-    U8* A = ut_input_v(m * k, dt, UT_INIT_RANDOM);
-    U8* B = ut_input_v(k * n, dt, UT_INIT_RANDOM);
-    U8* B_tran = ut_input_v(k * n + 32, dt, UT_INIT_ZERO);
-    U8* C = ut_input_v(m * n, dt, UT_INIT_ZERO);
-    U8* C_ref = ut_input_v(m * n, dt, UT_INIT_ZERO);
+    U8 *A = ut_input_v(m * k, dt, UT_INIT_RANDOM);
+    U8 *B = ut_input_v(k * n, dt, UT_INIT_RANDOM);
+    U8 *B_tran = ut_input_v(k * n + 32, dt, UT_INIT_ZERO);
+    U8 *C = ut_input_v(m * n, dt, UT_INIT_ZERO);
+    U8 *C_ref = ut_input_v(m * n, dt, UT_INIT_ZERO);
     CHECK_STATUS(matrix_matrix_multiply_tmp_bytes(A_desc, B_desc, &bytes, UT_ARCH));
-    U8* tmp = ut_input_v(bytes/bytesOf(dt), dt, UT_INIT_ZERO);
+    U8 *tmp = ut_input_v(bytes / bytesOf(dt), dt, UT_INIT_ZERO);
-    matrix_matrix_multiply_transform_rhs(B_desc, B, &tranDescB, B_tran);
+    matrix_matrix_multiply_transform_rhs(B_desc, B, &tranDescB, B_tran, UT_ARCH);
     if (UT_CHECK) {
-        CHECK_STATUS(matrix_matrix_multiply(A_desc, A, tranDescB, B_tran, bytes, tmp, C_desc, C, UT_ARCH));
+        CHECK_STATUS(
+            matrix_matrix_multiply(A_desc, A, tranDescB, B_tran, bytes, tmp, C_desc, C, UT_ARCH));
         // naive implement
-        CHECK_STATUS(matrix_matrix_multiply(A_desc, A, B_desc, B, bytes, tmp, C_desc, C_ref, CPU_GENERAL));
+        CHECK_STATUS(
+            matrix_matrix_multiply(A_desc, A, B_desc, B, bytes, tmp, C_desc, C_ref, CPU_GENERAL));
         // check
-        ut_check_v(C, C_ref, m*n, dt, 10, __FILE__, __LINE__);
+        ut_check_v(C, C_ref, m * n, dt, 10, __FILE__, __LINE__);
     }
-    // benchmark
+    // benchmark
     double time_start = ut_time_ms();
     for (int iter = 0; iter < UT_LOOPS; iter++) {
         matrix_matrix_multiply(A_desc, A, tranDescB, B_tran, bytes, tmp, C_desc, C, UT_ARCH);
@@ -59,8 +60,7 @@ int mmmTest(int argc, char** argv, DataType dt)
     // log performance data
     char buffer[150];
     char params[120];
-    sprintf(params, "(%u %u)+(%u
%u)=(%u %u)", m, k, k, n, m, n); sprintf(buffer, "%20s, %80s", "MatrixMultiply", params); double ops = 2.0 * m * n * k + 1.0 * m * n; ut_log(dt, buffer, ops, time); @@ -71,11 +71,11 @@ int mmmTest(int argc, char** argv, DataType dt) free(C); free(C_ref); free(tmp); - + return 0; } -int main(int argc, char** argv) +int main(int argc, char **argv) { #ifdef _USE_FP16 mmmTest(argc, argv, DT_F16); diff --git a/tests/test_mmm_int8.cpp b/compute/blas_enhance/tests/test_mmm_int8.cpp similarity index 66% rename from tests/test_mmm_int8.cpp rename to compute/blas_enhance/tests/test_mmm_int8.cpp index 51686b1a..c32a7352 100644 --- a/tests/test_mmm_int8.cpp +++ b/compute/blas_enhance/tests/test_mmm_int8.cpp @@ -1,24 +1,23 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- #include -#include "blas-enhance.h" +#include "blas_enhance.h" #include "ut_util.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { +#ifdef _USE_INT8 CHECK_REQUIREMENT(argc == 4); U32 m = atoi(argv[1]); U32 k = atoi(argv[2]); @@ -26,7 +25,7 @@ int main(int argc, char** argv) DataType dt = DT_I8; DataType odt = DT_I32; - TensorDesc A_desc = tensor2df(dt, DF_NORMAL, m, k); + TensorDesc A_desc = tensor2df(dt, DF_TRANSPOSE, k, m); TensorDesc B_desc = tensor2df(dt, DF_NORMAL, k, n); TensorDesc tranDescB; TensorDesc C_desc = tensor2df(odt, DF_NORMAL, m, n); @@ -36,26 +35,28 @@ int main(int argc, char** argv) if (k4 % 4 != 0) { k4 = (k4 / 4) * 4 + 4; } - INT8* A = (INT8*)ut_input_v(m * k, DT_I8, UT_INIT_RANDOM); - INT8* B = (INT8*)ut_input_v(k * n, DT_I8, UT_INIT_RANDOM); - INT8* B_tran = (INT8*)ut_input_v(k4 * n + 32, DT_I8, UT_INIT_ZERO); - I32* C = (I32*)ut_input_v(m * n, DT_I32, UT_INIT_ZERO); - I32* C_ref = (I32*)ut_input_v(m * n, DT_I32, UT_INIT_ZERO); + INT8 *A = (INT8 *)ut_input_v(m * k, DT_I8, UT_INIT_RANDOM); + INT8 *B = (INT8 *)ut_input_v(k * n, DT_I8, UT_INIT_RANDOM); + INT8 *B_tran = (INT8 *)ut_input_v(k4 * n + 32, DT_I8, UT_INIT_ZERO); + I32 *C = (I32 *)ut_input_v(m * n, DT_I32, UT_INIT_ZERO); + I32 *C_ref = (I32 *)ut_input_v(m * n, DT_I32, UT_INIT_ZERO); CHECK_STATUS(matrix_matrix_multiply_tmp_bytes(A_desc, B_desc, &bytes, UT_ARCH)); - INT8* tmp = (INT8 *)ut_input_v(bytes, DT_I8, UT_INIT_ZERO); + INT8 *tmp = (INT8 *)ut_input_v(bytes, DT_I8, UT_INIT_ZERO); - matrix_matrix_multiply_transform_rhs(B_desc, B, &tranDescB, B_tran); - if (UT_CHECK){ - CHECK_STATUS(matrix_matrix_multiply(A_desc, A, tranDescB, B_tran, bytes, tmp, C_desc, C, UT_ARCH)); + matrix_matrix_multiply_transform_rhs(B_desc, B, &tranDescB, B_tran, UT_ARCH); + if (UT_CHECK) { + CHECK_STATUS( + matrix_matrix_multiply(A_desc, A, tranDescB, B_tran, bytes, tmp, C_desc, C, UT_ARCH)); // naive implement - CHECK_STATUS(matrix_matrix_multiply(A_desc, A, B_desc, B, bytes, tmp, C_desc, C_ref, CPU_GENERAL)); + CHECK_STATUS( + matrix_matrix_multiply(A_desc, A, B_desc, B, bytes, tmp, C_desc, C_ref, CPU_GENERAL)); // check - ut_check_v(C, C_ref, m*n, DT_I32, 1, __FILE__, __LINE__); + ut_check_v(C, C_ref, m * n, DT_I32, 1, __FILE__, __LINE__); } - // benchmark + // benchmark double time_start = ut_time_ms(); for (int iter = 0; iter < UT_LOOPS; iter++) { matrix_matrix_multiply(A_desc, A, tranDescB, B_tran, bytes, tmp, C_desc, C, UT_ARCH); @@ -66,8 +67,7 @@ int main(int argc, char** argv) // log performance data char buffer[150]; char params[120]; - sprintf(params, "(%u %u)+(%u %u)=(%u %u)", - m, k, k, n, m, n); + sprintf(params, "(%u %u)+(%u %u)=(%u %u)", m, k, k, n, m, n); sprintf(buffer, "%20s, %80s", "MatrixMultiply", params); double ops = 2.0 * m * n * k + 1.0 * m * n; ut_log(DT_I8, buffer, ops, time); @@ -78,6 +78,6 @@ int main(int argc, char** argv) free(C); free(C_ref); free(tmp); - +#endif return 0; } diff --git a/tests/test_mvm.cpp b/compute/blas_enhance/tests/test_mvm.cpp similarity index 67% rename from tests/test_mvm.cpp rename to compute/blas_enhance/tests/test_mvm.cpp index 9d8b55c4..5d6443ca 100644 --- a/tests/test_mvm.cpp +++ b/compute/blas_enhance/tests/test_mvm.cpp @@ -1,62 +1,65 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include -#include "blas-enhance.h" +#include "blas_enhance.h" #include "ut_util.h" - -int mvmTest(int argc, char** argv, DataType dt) +int mvmTest(int argc, char **argv, DataType dt) { CHECK_REQUIREMENT(argc == 3); U32 m = atoi(argv[1]); U32 k = atoi(argv[2]); + float threshold = 0.0001; + if (dt == DT_F16) { + threshold = 0.05; + } DataFormat df = DF_NORMAL; U32 vc, rc; if (df == DF_NORMAL) { vc = k; rc = m; - } - else { + } else { vc = m; rc = k; } - TensorDesc mat_desc = tensor2df(dt, df, m, k); + TensorDesc mat_desc = tensor2df(dt, df, rc, vc); TensorDesc vec_desc = tensor1d(dt, vc); TensorDesc res_desc = tensor1d(dt, rc); - U8* mat = ut_input_v(m * k, dt, UT_INIT_RANDOM); - U8* vec = ut_input_v(vc, dt, UT_INIT_RANDOM); - U8* res = ut_input_v(rc, dt, UT_INIT_ZERO); - U8* res_ref = ut_input_v(rc, dt, UT_INIT_ZERO); + U8 *mat = ut_input_v(m * k, dt, UT_INIT_RANDOM); + U8 *vec = ut_input_v(vc, dt, UT_INIT_RANDOM); + U8 *res = ut_input_v(rc, dt, UT_INIT_ZERO); + U8 *res_ref = ut_input_v(rc, dt, UT_INIT_ZERO); U32 bytes = 0; CHECK_STATUS(matrix_vector_multiply_tmp_bytes(mat_desc, vec_desc, &bytes, UT_ARCH)); - U8* tmp = ut_input_v(bytes/bytesOf(dt), dt, UT_INIT_ZERO); + U8 *tmp = ut_input_v(bytes / bytesOf(dt), dt, UT_INIT_ZERO); // check if (UT_CHECK) { - CHECK_STATUS(matrix_vector_multiply(mat_desc, mat, vec_desc, vec, bytes, tmp, res_desc, res, UT_ARCH)); + CHECK_STATUS(matrix_vector_multiply( + mat_desc, mat, vec_desc, vec, bytes, tmp, res_desc, res, UT_ARCH)); // naive implement - CHECK_STATUS(matrix_vector_multiply(mat_desc, mat, vec_desc, vec, bytes, tmp, res_desc, res_ref, CPU_GENERAL)); + CHECK_STATUS(matrix_vector_multiply( + mat_desc, mat, vec_desc, vec, bytes, tmp, res_desc, res_ref, CPU_GENERAL)); - ut_check_v(res, res_ref, rc, dt, 1, __FILE__, __LINE__); + 
ut_check_v(res, res_ref, rc, dt, threshold, __FILE__, __LINE__); } - // benchmark + // benchmark double time_start = ut_time_ms(); for (int iter = 0; iter < UT_LOOPS; iter++) { matrix_vector_multiply(mat_desc, mat, vec_desc, vec, bytes, tmp, res_desc, res, UT_ARCH); @@ -67,8 +70,7 @@ int mvmTest(int argc, char** argv, DataType dt) // log performance data char buffer[150]; char params[120]; - sprintf(params, "(%u %u)+(%u)=(%u)", - m, k, vc, rc); + sprintf(params, "(%u %u)+(%u)=(%u)", m, k, vc, rc); sprintf(buffer, "%20s, %80s", "MatrixVectorMultiply", params); double ops = 2.0 * m * k; ut_log(dt, buffer, ops, time); @@ -78,11 +80,11 @@ int mvmTest(int argc, char** argv, DataType dt) free(tmp); free(res); free(res_ref); - + return 0; } -int main(int argc, char** argv) +int main(int argc, char **argv) { #ifdef _USE_FP16 mvmTest(argc, argv, DT_F16); diff --git a/tests/test_mvm_int8.cpp b/compute/blas_enhance/tests/test_mvm_int8.cpp similarity index 60% rename from tests/test_mvm_int8.cpp rename to compute/blas_enhance/tests/test_mvm_int8.cpp index 955540bf..5e5a82aa 100644 --- a/tests/test_mvm_int8.cpp +++ b/compute/blas_enhance/tests/test_mvm_int8.cpp @@ -1,23 +1,23 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
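+// The weight is packed once with matrix_vector_multiply_transform_weight and the
+// packed tensor (tranDesc, matTran) is what the checked and timed runs multiply;
+// the unpacked matrix is kept only for the CPU_GENERAL reference result.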
- #include -#include "blas-enhance.h" +#include "blas_enhance.h" #include "ut_util.h" - -int main(int argc, char** argv) { +int main(int argc, char **argv) +{ +#ifdef _USE_INT8 CHECK_REQUIREMENT(argc == 3); U32 m = atoi(argv[1]); U32 k = atoi(argv[2]); @@ -29,38 +29,48 @@ int main(int argc, char** argv) { if (df == DF_NORMAL) { vc = k; rc = m; - } - else { + } else { vc = m; rc = k; } TensorDesc mat_desc = tensor2df(dt, df, m, k); + TensorDesc tranDesc; TensorDesc vec_desc = tensor1d(dt, vc); TensorDesc res_desc = tensor1d(odt, rc); - INT8* mat = (INT8*)ut_input_v(m * k, DT_I8, UT_INIT_RANDOM); - INT8* vec = (INT8*)ut_input_v(vc, DT_I8, UT_INIT_RANDOM); - I32* res = (I32*)ut_input_v(rc, DT_I32, UT_INIT_ZERO); - I32* res_ref = (I32*)ut_input_v(rc, DT_I32, UT_INIT_ZERO); + U32 k4 = k; + if (k4 % 4 != 0) { + k4 = (k4 / 4) * 4 + 4; + } + + INT8 *mat = (INT8 *)ut_input_v(m * k, DT_I8, UT_INIT_RANDOM); + INT8 *matTran = (INT8 *)ut_input_v(m * k4, DT_I8, UT_INIT_ZERO); + INT8 *vec = (INT8 *)ut_input_v(vc, DT_I8, UT_INIT_RANDOM); + I32 *res = (I32 *)ut_input_v(rc, DT_I32, UT_INIT_ZERO); + I32 *res_ref = (I32 *)ut_input_v(rc, DT_I32, UT_INIT_ZERO); + + matrix_vector_multiply_transform_weight(mat_desc, mat, &tranDesc, matTran, UT_ARCH); U32 bytes; CHECK_STATUS(matrix_vector_multiply_tmp_bytes(mat_desc, vec_desc, &bytes, UT_ARCH)); - I32* tmp = (I32*)ut_input_v(bytes/bytesOf(DT_I32), DT_I32, UT_INIT_ZERO); + I32 *tmp = (I32 *)ut_input_v(bytes / bytesOf(DT_I32), DT_I32, UT_INIT_ZERO); // check if (UT_CHECK) { - CHECK_STATUS(matrix_vector_multiply(mat_desc, mat, vec_desc, vec, bytes, tmp, res_desc, res, UT_ARCH)); + CHECK_STATUS(matrix_vector_multiply( + tranDesc, matTran, vec_desc, vec, bytes, tmp, res_desc, res, UT_ARCH)); // naive implement - CHECK_STATUS(matrix_vector_multiply(mat_desc, mat, vec_desc, vec, bytes, tmp, res_desc, res_ref, CPU_GENERAL)); + CHECK_STATUS(matrix_vector_multiply( + mat_desc, mat, vec_desc, vec, bytes, tmp, res_desc, res_ref, CPU_GENERAL)); - ut_check_v(res, res_ref, rc, DT_I32, 10, __FILE__, __LINE__); + ut_check_v(res, res_ref, rc, DT_I32, 1, __FILE__, __LINE__); } - // benchmark + // benchmark double time_start = ut_time_ms(); for (int iter = 0; iter < UT_LOOPS; iter++) { - matrix_vector_multiply(mat_desc, mat, vec_desc, vec, bytes, tmp, res_desc, res, UT_ARCH); + matrix_vector_multiply(tranDesc, matTran, vec_desc, vec, bytes, tmp, res_desc, res, UT_ARCH); } double time_end = ut_time_ms(); double time = (time_end - time_start) / UT_LOOPS; @@ -68,8 +78,7 @@ int main(int argc, char** argv) { // log performance data char buffer[150]; char params[120]; - sprintf(params, "(%u %u)+(%u)=(%u)", - m, k, vc, rc); + sprintf(params, "(%u %u)+(%u)=(%u)", m, k, vc, rc); sprintf(buffer, "%20s, %80s", "MatrixVectorMultiply", params); double ops = 2.0 * m * k; ut_log(DT_I8, buffer, ops, time); @@ -78,6 +87,6 @@ int main(int argc, char** argv) { free(vec); free(res); free(res_ref); - +#endif return 0; } diff --git a/image/CMakeLists.txt b/compute/image/CMakeLists.txt similarity index 61% rename from image/CMakeLists.txt rename to compute/image/CMakeLists.txt index d1266406..e9e5af57 100644 --- a/image/CMakeLists.txt +++ b/compute/image/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.2) -file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/bolt.cmake ${BOLT_ROOT}/bolt.cmake) +file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake) if (BOLT_CONFIGURE_FILE) include(${BOLT_CONFIGURE_FILE}) else (BOLT_CONFIGURE_FILE) @@ -12,14 
+12,9 @@ endif (BOLT_CONFIGURE_FILE) project(image) -set_policy() - -SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${BOLT_ROOT}/cmakes") -find_package(Uni) -find_package(Image) - -set_project_install_directory() - set_c_cxx_flags() +include_image() + add_subdirectory(src) +add_subdirectory(tests) diff --git a/compute/image/include/image.h b/compute/image/include/image.h new file mode 100644 index 00000000..274a5066 --- /dev/null +++ b/compute/image/include/image.h @@ -0,0 +1,38 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_IMAGE +#define _H_IMAGE + +#include "tensor.hpp" +#include "tensor_desc.h" +#include "sys.h" + +#ifdef _USE_MALI +#include "gcl.h" +#include "ocl_desc_trans.h" +#endif + +typedef struct { + DataType paramDT; +} ResizeDesc; + +EE resize_infer_output_size(Tensor *inputTensor, + ResizeDesc resizeDesc, + void *params, + Tensor *outputTensor, + U32 *outputBytes, + ArchInfo_t archInfo); + +EE resize(Tensor inputTensor, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo); +#endif diff --git a/image/include/image_processing.hpp b/compute/image/include/image_processing.hpp similarity index 76% rename from image/include/image_processing.hpp rename to compute/image/include/image_processing.hpp index f5ac79fd..5aa20860 100644 --- a/image/include/image_processing.hpp +++ b/compute/image/include/image_processing.hpp @@ -1,27 +1,28 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _H_IMAGEPROCESSING #define _H_IMAGEPROCESSING #include #include #include "tensor_desc.h" +#include "tensor.hpp" #include "error.h" std::shared_ptr load_fake_image(TensorDesc inputDesc); -std::shared_ptr load_resize_image(TensorDesc rgbDesc, void* rgb, TensorDesc imageDesc, ImageFormat targetImageFormat, float scaleValue); +std::shared_ptr load_resize_image( + Tensor rgbTensor, TensorDesc imageDesc, ImageFormat targetImageFormat, float scaleValue); #endif diff --git a/compute/image/src/CMakeLists.txt b/compute/image/src/CMakeLists.txt new file mode 100644 index 00000000..0fccaf61 --- /dev/null +++ b/compute/image/src/CMakeLists.txt @@ -0,0 +1,31 @@ +if (USE_GENERAL) + file(GLOB general_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/general/*.cpp) +endif (USE_GENERAL) + +if (USE_NEON) + file(GLOB arm_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/*.cpp) +endif (USE_NEON) + +if (USE_MALI) + file(GLOB mali_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/mali/*.cpp) + file(GLOB mali_fp16_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/mali/fp16/*.cpp) +endif (USE_MALI) + +file(GLOB srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) +set(srcs "${srcs};${general_srcs};${arm_srcs};${mali_srcs};${mali_fp16_srcs}") + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) + +# shared library +add_library(${PROJECT_NAME} SHARED ${srcs}) +target_link_libraries (${PROJECT_NAME} LINK_PUBLIC uni) + +# static library +add_library(${PROJECT_NAME}_static STATIC ${srcs}) + +set_target_properties(${PROJECT_NAME}_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}") +set_target_properties(${PROJECT_NAME} PROPERTIES CLEAN_DIRECT_OUTPUT 1) +set_target_properties(${PROJECT_NAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) +install(TARGETS ${PROJECT_NAME} ${PROJECT_NAME}_static + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) diff --git a/image/src/cpu/arm/image_arm.h b/compute/image/src/cpu/arm/image_arm.h similarity index 79% rename from image/src/cpu/arm/image_arm.h rename to compute/image/src/cpu/arm/image_arm.h index a67f1349..a42c596d 100644 --- a/image/src/cpu/arm/image_arm.h +++ b/compute/image/src/cpu/arm/image_arm.h @@ -1,17 +1,16 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _H_IMAGE_ARM #define _H_IMAGE_ARM @@ -21,6 +20,5 @@ #include "image.h" #include "arm_neon_expand.h" -EE resize_bilinear_arm(TensorDesc inputDesc, void* input, - TensorDesc outputDesc, void* output); +EE resize_bilinear_arm(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output); #endif diff --git a/image/src/cpu/arm/resize_bilinear.cpp b/compute/image/src/cpu/arm/resize_bilinear.cpp similarity index 67% rename from image/src/cpu/arm/resize_bilinear.cpp rename to compute/image/src/cpu/arm/resize_bilinear.cpp index 8de01a58..2c8e868d 100644 --- a/image/src/cpu/arm/resize_bilinear.cpp +++ b/compute/image/src/cpu/arm/resize_bilinear.cpp @@ -1,28 +1,26 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#include -#include -#include "type.h" +#include +#include +#include "types.h" #include "tensor_desc.h" #include "error.h" #include "image.h" #include "cpu/arm/image_arm.h" #ifdef _USE_FP16 -EE resize_bilinear_fp16(TensorDesc inputDesc, F16* inArray, - TensorDesc outputDesc, F16* outArray) +EE resize_bilinear_fp16(TensorDesc inputDesc, F16 *inArray, TensorDesc outputDesc, F16 *outArray) { DataType idt, odt; DataFormat idf, odf; @@ -42,24 +40,28 @@ EE resize_bilinear_fp16(TensorDesc inputDesc, F16* inArray, for (U32 n = 0; n < on; n++) { for (U32 c = 0; c < oc; c++) { - I32 outBase = n*oc*oh*ow + c*oh*ow*8; - I32 inBase = n*oc*ih*iw + c*ih*iw*8; + I32 outBase = n * oc * oh * ow + c * oh * ow * 8; + I32 inBase = n * oc * ih * iw + c * ih * iw * 8; for (U32 h = 0; h < oh; h++) { for (U32 w = 0; w < ow; w++) { if (h == 0 && w == 0) { - memcpy(outArray + outBase, inArray + inBase, 8*bytesOf(DT_F16)); + memcpy(outArray + outBase, inArray + inBase, 8 * bytesOf(DT_F16)); continue; } if (h == 0 && w == ow - 1) { - memcpy(outArray + outBase + w*8, inArray + inBase + (iw-1)*8, 8*bytesOf(DT_F16)); + memcpy(outArray + outBase + w * 8, inArray + inBase + (iw - 1) * 8, + 8 * bytesOf(DT_F16)); continue; } if (h == oh - 1 && w == 0) { - memcpy(outArray + outBase + h*ow*8, inArray + inBase + (ih-1)*iw*8, 8*bytesOf(DT_F16)); + memcpy(outArray + outBase + h * ow * 8, + inArray + inBase + (ih - 1) * iw * 8, 8 * bytesOf(DT_F16)); continue; } if (h == oh - 1 && w == ow - 1) { - memcpy(outArray + outBase + h*ow*8 + w*8, inArray + inBase + (ih-1)*iw*8 + (iw-1)*8, 8*bytesOf(DT_F16)); + memcpy(outArray + outBase + h * ow * 8 + w * 8, + inArray + inBase + (ih - 1) * iw * 8 + (iw - 1) * 8, + 8 * bytesOf(DT_F16)); continue; } @@ -72,32 +74,33 @@ EE resize_bilinear_fp16(TensorDesc inputDesc, F16* inArray, I32 wR = ceil(wC); if (hT == hB && wL == wR) { - memcpy(outArray + outBase + h*ow*8 + w*8, inArray + inBase + hT*iw*8 + wL*8, 8*bytesOf(DT_F16)); + memcpy(outArray + outBase + h * ow * 8 + w * 8, + inArray + inBase + hT * iw * 8 + wL * 8, 8 * bytesOf(DT_F16)); } else if (hT == hB) { float16x8_t res = {0}; - float16x8_t vecL = vld1q_f16(inArray + inBase + hT*iw*8 + wL*8); - float16x8_t vecR = vld1q_f16(inArray + inBase + hT*iw*8 + wR*8); + float16x8_t vecL = vld1q_f16(inArray + inBase + hT * iw * 8 + wL * 8); + float16x8_t vecR = vld1q_f16(inArray + inBase + hT * iw * 8 + wR * 8); res = vfmaq_n_f16(res, vecL, wR - wC); res = vfmaq_n_f16(res, vecR, wC - wL); - vst1q_f16(outArray + outBase + h*ow*8 + w*8, res); + vst1q_f16(outArray + outBase + h * ow * 8 + w * 8, res); } else if (wL == wR) { float16x8_t res = {0}; - float16x8_t vecT = vld1q_f16(inArray + inBase + hT*iw*8 + 
wL*8); - float16x8_t vecB = vld1q_f16(inArray + inBase + hB*iw*8 + wL*8); + float16x8_t vecT = vld1q_f16(inArray + inBase + hT * iw * 8 + wL * 8); + float16x8_t vecB = vld1q_f16(inArray + inBase + hB * iw * 8 + wL * 8); res = vfmaq_n_f16(res, vecT, hB - hC); res = vfmaq_n_f16(res, vecB, hC - hT); - vst1q_f16(outArray + outBase + h*ow*8 + w*8, res); + vst1q_f16(outArray + outBase + h * ow * 8 + w * 8, res); } else { float16x8_t res = {0}; - float16x8_t vecTL = vld1q_f16(inArray + inBase + hT*iw*8 + wL*8); - float16x8_t vecTR = vld1q_f16(inArray + inBase + hT*iw*8 + wR*8); - float16x8_t vecBL = vld1q_f16(inArray + inBase + hB*iw*8 + wL*8); - float16x8_t vecBR = vld1q_f16(inArray + inBase + hB*iw*8 + wR*8); + float16x8_t vecTL = vld1q_f16(inArray + inBase + hT * iw * 8 + wL * 8); + float16x8_t vecTR = vld1q_f16(inArray + inBase + hT * iw * 8 + wR * 8); + float16x8_t vecBL = vld1q_f16(inArray + inBase + hB * iw * 8 + wL * 8); + float16x8_t vecBR = vld1q_f16(inArray + inBase + hB * iw * 8 + wR * 8); res = vfmaq_n_f16(res, vecTL, (hB - hC) * (wR - wC)); res = vfmaq_n_f16(res, vecTR, (hB - hC) * (wC - wL)); res = vfmaq_n_f16(res, vecBL, (hC - hT) * (wR - wC)); res = vfmaq_n_f16(res, vecBR, (hC - hT) * (wC - wL)); - vst1q_f16(outArray + outBase + h*ow*8 + w*8, res); + vst1q_f16(outArray + outBase + h * ow * 8 + w * 8, res); } } } @@ -108,8 +111,7 @@ EE resize_bilinear_fp16(TensorDesc inputDesc, F16* inArray, #endif #ifdef _USE_FP32 -EE resize_bilinear_fp32(TensorDesc inputDesc, F32* inArray, - TensorDesc outputDesc, F32* outArray) +EE resize_bilinear_fp32(TensorDesc inputDesc, F32 *inArray, TensorDesc outputDesc, F32 *outArray) { DataType idt, odt; DataFormat idf, odf; @@ -129,24 +131,28 @@ EE resize_bilinear_fp32(TensorDesc inputDesc, F32* inArray, for (U32 n = 0; n < on; n++) { for (U32 c = 0; c < oc; c++) { - I32 outBase = n*oc*oh*ow + c*oh*ow*8; - I32 inBase = n*oc*ih*iw + c*ih*iw*8; + I32 outBase = n * oc * oh * ow + c * oh * ow * 8; + I32 inBase = n * oc * ih * iw + c * ih * iw * 8; for (U32 h = 0; h < oh; h++) { for (U32 w = 0; w < ow; w++) { if (h == 0 && w == 0) { - memcpy(outArray + outBase, inArray + inBase, 8*bytesOf(DT_F32)); + memcpy(outArray + outBase, inArray + inBase, 8 * bytesOf(DT_F32)); continue; } if (h == 0 && w == ow - 1) { - memcpy(outArray + outBase + w*8, inArray + inBase + (iw-1)*8, 8*bytesOf(DT_F32)); + memcpy(outArray + outBase + w * 8, inArray + inBase + (iw - 1) * 8, + 8 * bytesOf(DT_F32)); continue; } if (h == oh - 1 && w == 0) { - memcpy(outArray + outBase + h*ow*8, inArray + inBase + (ih-1)*iw*8, 8*bytesOf(DT_F32)); + memcpy(outArray + outBase + h * ow * 8, + inArray + inBase + (ih - 1) * iw * 8, 8 * bytesOf(DT_F32)); continue; } if (h == oh - 1 && w == ow - 1) { - memcpy(outArray + outBase + h*ow*8 + w*8, inArray + inBase + (ih-1)*iw*8 + (iw-1)*8, 8*bytesOf(DT_F32)); + memcpy(outArray + outBase + h * ow * 8 + w * 8, + inArray + inBase + (ih - 1) * iw * 8 + (iw - 1) * 8, + 8 * bytesOf(DT_F32)); continue; } @@ -159,41 +165,42 @@ EE resize_bilinear_fp32(TensorDesc inputDesc, F32* inArray, I32 wR = ceil(wC); if (hT == hB && wL == wR) { - memcpy(outArray + outBase + h*ow*8 + w*8, inArray + inBase + hT*iw*8 + wL*8, 8*bytesOf(DT_F32)); + memcpy(outArray + outBase + h * ow * 8 + w * 8, + inArray + inBase + hT * iw * 8 + wL * 8, 8 * bytesOf(DT_F32)); } else if (hT == hB) { float32x4_t res[2] = {0}; - float32x4_t vecL = vld1q_f32(inArray + inBase + hT*iw*8 + wL*8); - float32x4_t vecL1 = vld1q_f32(inArray + inBase + hT*iw*8 + wL*8 + 4); - float32x4_t vecR = 
vld1q_f32(inArray + inBase + hT*iw*8 + wR*8); - float32x4_t vecR1 = vld1q_f32(inArray + inBase + hT*iw*8 + wR*8 + 4); + float32x4_t vecL = vld1q_f32(inArray + inBase + hT * iw * 8 + wL * 8); + float32x4_t vecL1 = vld1q_f32(inArray + inBase + hT * iw * 8 + wL * 8 + 4); + float32x4_t vecR = vld1q_f32(inArray + inBase + hT * iw * 8 + wR * 8); + float32x4_t vecR1 = vld1q_f32(inArray + inBase + hT * iw * 8 + wR * 8 + 4); res[0] = vfmaq_n_f32(res[0], vecL, wR - wC); res[1] = vfmaq_n_f32(res[1], vecL1, wR - wC); res[0] = vfmaq_n_f32(res[0], vecR, wC - wL); res[1] = vfmaq_n_f32(res[1], vecR1, wC - wL); - vst1q_f32(outArray + outBase + h*ow*8 + w*8, res[0]); - vst1q_f32(outArray + outBase + h*ow*8 + w*8 + 4, res[1]); + vst1q_f32(outArray + outBase + h * ow * 8 + w * 8, res[0]); + vst1q_f32(outArray + outBase + h * ow * 8 + w * 8 + 4, res[1]); } else if (wL == wR) { float32x4_t res[2] = {0}; - float32x4_t vecT = vld1q_f32(inArray + inBase + hT*iw*8 + wL*8); - float32x4_t vecT1 = vld1q_f32(inArray + inBase + hT*iw*8 + wL*8 + 4); - float32x4_t vecB = vld1q_f32(inArray + inBase + hB*iw*8 + wL*8); - float32x4_t vecB1 = vld1q_f32(inArray + inBase + hB*iw*8 + wL*8 + 4); + float32x4_t vecT = vld1q_f32(inArray + inBase + hT * iw * 8 + wL * 8); + float32x4_t vecT1 = vld1q_f32(inArray + inBase + hT * iw * 8 + wL * 8 + 4); + float32x4_t vecB = vld1q_f32(inArray + inBase + hB * iw * 8 + wL * 8); + float32x4_t vecB1 = vld1q_f32(inArray + inBase + hB * iw * 8 + wL * 8 + 4); res[0] = vfmaq_n_f32(res[0], vecT, hB - hC); res[1] = vfmaq_n_f32(res[1], vecT1, hB - hC); res[0] = vfmaq_n_f32(res[0], vecB, hC - hT); res[1] = vfmaq_n_f32(res[1], vecB1, hC - hT); - vst1q_f32(outArray + outBase + h*ow*8 + w*8, res[0]); - vst1q_f32(outArray + outBase + h*ow*8 + w*8 + 4, res[1]); + vst1q_f32(outArray + outBase + h * ow * 8 + w * 8, res[0]); + vst1q_f32(outArray + outBase + h * ow * 8 + w * 8 + 4, res[1]); } else { float32x4_t res[2] = {0}; - float32x4_t vecTL = vld1q_f32(inArray + inBase + hT*iw*8 + wL*8); - float32x4_t vecTL1 = vld1q_f32(inArray + inBase + hT*iw*8 + wL*8 + 4); - float32x4_t vecTR = vld1q_f32(inArray + inBase + hT*iw*8 + wR*8); - float32x4_t vecTR1 = vld1q_f32(inArray + inBase + hT*iw*8 + wR*8 + 4); - float32x4_t vecBL = vld1q_f32(inArray + inBase + hB*iw*8 + wL*8); - float32x4_t vecBL1 = vld1q_f32(inArray + inBase + hB*iw*8 + wL*8 + 4); - float32x4_t vecBR = vld1q_f32(inArray + inBase + hB*iw*8 + wR*8); - float32x4_t vecBR1 = vld1q_f32(inArray + inBase + hB*iw*8 + wR*8 + 4); + float32x4_t vecTL = vld1q_f32(inArray + inBase + hT * iw * 8 + wL * 8); + float32x4_t vecTL1 = vld1q_f32(inArray + inBase + hT * iw * 8 + wL * 8 + 4); + float32x4_t vecTR = vld1q_f32(inArray + inBase + hT * iw * 8 + wR * 8); + float32x4_t vecTR1 = vld1q_f32(inArray + inBase + hT * iw * 8 + wR * 8 + 4); + float32x4_t vecBL = vld1q_f32(inArray + inBase + hB * iw * 8 + wL * 8); + float32x4_t vecBL1 = vld1q_f32(inArray + inBase + hB * iw * 8 + wL * 8 + 4); + float32x4_t vecBR = vld1q_f32(inArray + inBase + hB * iw * 8 + wR * 8); + float32x4_t vecBR1 = vld1q_f32(inArray + inBase + hB * iw * 8 + wR * 8 + 4); res[0] = vfmaq_n_f32(res[0], vecTL, (hB - hC) * (wR - wC)); res[1] = vfmaq_n_f32(res[1], vecTL1, (hB - hC) * (wR - wC)); res[0] = vfmaq_n_f32(res[0], vecTR, (hB - hC) * (wC - wL)); @@ -202,8 +209,8 @@ EE resize_bilinear_fp32(TensorDesc inputDesc, F32* inArray, res[1] = vfmaq_n_f32(res[1], vecBL1, (hC - hT) * (wR - wC)); res[0] = vfmaq_n_f32(res[0], vecBR, (hC - hT) * (wC - wL)); res[1] = vfmaq_n_f32(res[1], vecBR1, (hC - hT) * (wC - wL)); 
- vst1q_f32(outArray + outBase + h*ow*8 + w*8, res[0]); - vst1q_f32(outArray + outBase + h*ow*8 + w*8 + 4, res[1]); + vst1q_f32(outArray + outBase + h * ow * 8 + w * 8, res[0]); + vst1q_f32(outArray + outBase + h * ow * 8 + w * 8 + 4, res[1]); } } } @@ -213,21 +220,18 @@ EE resize_bilinear_fp32(TensorDesc inputDesc, F32* inArray, } #endif -EE resize_bilinear_arm(TensorDesc inputDesc, void* input, - TensorDesc outputDesc, void* output) +EE resize_bilinear_arm(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output) { EE ret = SUCCESS; switch (inputDesc.dt) { #ifdef _USE_FP16 case DT_F16: - ret = resize_bilinear_fp16(inputDesc, (F16*)input, - outputDesc, (F16*)output); + ret = resize_bilinear_fp16(inputDesc, (F16 *)input, outputDesc, (F16 *)output); break; #endif #ifdef _USE_FP32 case DT_F32: - ret = resize_bilinear_fp32(inputDesc, (F32*)input, - outputDesc, (F32*)output); + ret = resize_bilinear_fp32(inputDesc, (F32 *)input, outputDesc, (F32 *)output); break; #endif default: diff --git a/image/src/cpu/general/image_general.h b/compute/image/src/cpu/general/image_general.h similarity index 68% rename from image/src/cpu/general/image_general.h rename to compute/image/src/cpu/general/image_general.h index 5ff00261..ca83b318 100644 --- a/image/src/cpu/general/image_general.h +++ b/compute/image/src/cpu/general/image_general.h @@ -1,17 +1,16 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
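+// General (naive) image kernels plus the layout helpers below:
+// from_nchwc8_to_nchw and from_nchw_to_nchwc8 repack a tensor between the
+// 8-channel-interleaved NCHWC8 format and plain NCHW through a temporary
+// buffer and update the descriptor's DataFormat to match.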
- #ifndef _H_IMAGE_GENERAL #define _H_IMAGE_GENERAL @@ -20,20 +19,22 @@ #include "tensor_desc.h" #include "image.h" -EE resize_bilinear_general(TensorDesc inputDesc, void* input, - TensorDesc outputDesc, void* output); +EE resize_bilinear_general(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output); -template -inline EE from_nchwc8_to_nchw(TensorDesc *desc, T *data) { - if (desc == nullptr || data == nullptr) +template +inline EE from_nchwc8_to_nchw(TensorDesc *desc, T *data) +{ + if (desc == nullptr || data == nullptr) { CHECK_STATUS(NULL_POINTER); + } DataType idt; DataFormat idf; U32 in, ic, ih, iw; CHECK_STATUS(tensor4dGet(*desc, &idt, &idf, &in, &ic, &ih, &iw)); - if (idf != DF_NCHWC8) + if (idf != DF_NCHWC8) { CHECK_STATUS(NOT_MATCH); + } *desc = tensor4df(idt, DF_NCHW, in, ic, ih, iw); @@ -41,9 +42,10 @@ inline EE from_nchwc8_to_nchw(TensorDesc *desc, T *data) { ic /= 8; for (U32 n = 0; n < in; n++) { for (U32 c = 0; c < ic; c++) { - for (U32 hw = 0; hw < ih*iw; hw++) { + for (U32 hw = 0; hw < ih * iw; hw++) { for (U32 c8 = 0; c8 < 8; c8++) { - tmp[n*ic*8*ih*iw + (c*8 + c8)*ih*iw + hw] = data[n*ic*ih*iw*8 + c*ih*iw*8 + hw*8 + c8]; + tmp[n * ic * 8 * ih * iw + (c * 8 + c8) * ih * iw + hw] = + data[n * ic * ih * iw * 8 + c * ih * iw * 8 + hw * 8 + c8]; } } } @@ -53,17 +55,20 @@ inline EE from_nchwc8_to_nchw(TensorDesc *desc, T *data) { return SUCCESS; } -template -inline EE from_nchw_to_nchwc8(TensorDesc *desc, T *data) { - if (desc == nullptr || data == nullptr) +template +inline EE from_nchw_to_nchwc8(TensorDesc *desc, T *data) +{ + if (desc == nullptr || data == nullptr) { CHECK_STATUS(NULL_POINTER); + } DataType idt; DataFormat idf; U32 in, ic, ih, iw; CHECK_STATUS(tensor4dGet(*desc, &idt, &idf, &in, &ic, &ih, &iw)); - if (idf != DF_NCHW) + if (idf != DF_NCHW) { CHECK_STATUS(NOT_MATCH); + } *desc = tensor4df(idt, DF_NCHWC8, in, ic, ih, iw); @@ -71,9 +76,10 @@ inline EE from_nchw_to_nchwc8(TensorDesc *desc, T *data) { ic /= 8; for (U32 n = 0; n < in; n++) { for (U32 c = 0; c < ic; c++) { - for (U32 hw = 0; hw < ih*iw; hw++) { + for (U32 hw = 0; hw < ih * iw; hw++) { for (U32 c8 = 0; c8 < 8; c8++) { - tmp[n*ic*ih*iw*8 + c*ih*iw*8 + hw*8 + c8] = data[n*ic*8*ih*iw + (c*8 + c8)*ih*iw + hw]; + tmp[n * ic * ih * iw * 8 + c * ih * iw * 8 + hw * 8 + c8] = + data[n * ic * 8 * ih * iw + (c * 8 + c8) * ih * iw + hw]; } } } diff --git a/image/src/cpu/general/resize_bilinear.cpp b/compute/image/src/cpu/general/resize_bilinear.cpp similarity index 78% rename from image/src/cpu/general/resize_bilinear.cpp rename to compute/image/src/cpu/general/resize_bilinear.cpp index 7806a64a..9068d1ef 100644 --- a/image/src/cpu/general/resize_bilinear.cpp +++ b/compute/image/src/cpu/general/resize_bilinear.cpp @@ -1,28 +1,26 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#include -#include -#include "type.h" +#include +#include +#include "types.h" #include "tensor_desc.h" #include "error.h" #include "image.h" #include "cpu/general/image_general.h" -template -EE resize_bilinear(TensorDesc inputDesc, IT* inArray, - TensorDesc outputDesc, OT* outArray) +template +EE resize_bilinear(TensorDesc inputDesc, IT *inArray, TensorDesc outputDesc, OT *outArray) { DataType idt, odt; DataFormat idf, odf; @@ -44,8 +42,8 @@ EE resize_bilinear(TensorDesc inputDesc, IT* inArray, for (U32 n = 0; n < on; n++) { for (U32 c = 0; c < oc; c++) { - I32 outBase = n*oc*oh*ow + c*oh*ow; - I32 inBase = n*oc*ih*iw + c*ih*iw; + I32 outBase = n * oc * oh * ow + c * oh * ow; + I32 inBase = n * oc * ih * iw + c * ih * iw; for (U32 h = 0; h < oh; h++) { for (U32 w = 0; w < ow; w++) { if (h == 0 && w == 0) { @@ -76,9 +74,11 @@ EE resize_bilinear(TensorDesc inputDesc, IT* inArray, if (hT == hB && wL == wR) { outArray[outBase + h * ow + w] = inArray[inBase + hT * iw + wL]; } else if (hT == hB) { - outArray[outBase + h * ow + w] = inArray[inBase + hT * iw + wL] * (wR - wC) + inArray[inBase + hT * iw + wR] * (wC - wL); + outArray[outBase + h * ow + w] = inArray[inBase + hT * iw + wL] * (wR - wC) + + inArray[inBase + hT * iw + wR] * (wC - wL); } else if (wL == wR) { - outArray[outBase + h * ow + w] = inArray[inBase + hT * iw + wL] * (hB - hC) + inArray[inBase + hB * iw + wL] * (hC - hT); + outArray[outBase + h * ow + w] = inArray[inBase + hT * iw + wL] * (hB - hC) + + inArray[inBase + hB * iw + wL] * (hC - hT); } else { F32 factorTL = (hB - hC) * (wR - wC); F32 factorTR = (hB - hC) * (wC - wL); @@ -102,36 +102,31 @@ EE resize_bilinear(TensorDesc inputDesc, IT* inArray, return SUCCESS; } -EE resize_bilinear_general(TensorDesc inputDesc, void* input, - TensorDesc outputDesc, void* output) +EE 
resize_bilinear_general(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output) { EE ret = SUCCESS; switch (inputDesc.dt) { #ifdef __aarch64__ case DT_F16: { - ret = resize_bilinear(inputDesc, (F16*)input, - outputDesc, (F16*)output); + ret = resize_bilinear(inputDesc, (F16 *)input, outputDesc, (F16 *)output); break; } #endif #ifdef _USE_FP32 case DT_F32: { - ret = resize_bilinear(inputDesc, (F32*)input, - outputDesc, (F32*)output); + ret = resize_bilinear(inputDesc, (F32 *)input, outputDesc, (F32 *)output); break; } #endif case DT_U8: { #ifdef __aarch64__ if (DT_F16 == outputDesc.dt) { - ret = resize_bilinear(inputDesc, (U8*)input, - outputDesc, (F16*)output); + ret = resize_bilinear(inputDesc, (U8 *)input, outputDesc, (F16 *)output); } #endif #ifdef _USE_FP32 if (DT_F32 == outputDesc.dt) { - ret = resize_bilinear(inputDesc, (U8*)input, - outputDesc, (F32*)output); + ret = resize_bilinear(inputDesc, (U8 *)input, outputDesc, (F32 *)output); } #endif break; diff --git a/compute/image/src/gpu/mali/cl/resize_bilinear.cl b/compute/image/src/gpu/mali/cl/resize_bilinear.cl new file mode 100644 index 00000000..6aa3c70e --- /dev/null +++ b/compute/image/src/gpu/mali/cl/resize_bilinear.cl @@ -0,0 +1,72 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
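Both the CPU kernel above and the GPU kernels that follow implement the same bilinear rule, just organized differently. A minimal scalar sketch of that rule in its two-lerp form (helper names are mine, not part of the patch; the clamping mirrors the hT/hB/wL/wR handling above, and the two-lerp form equals the CPU path's four-weight form whenever the neighbors are distinct):

#include <algorithm>
#include <cmath>

// Linear interpolation from a to b by fraction t.
static float lerp(float a, float b, float t)
{
    return a + (b - a) * t;
}

// Sample a single-channel ih x iw image at continuous position (hC, wC).
float bilinear_sample(const float *in, int ih, int iw, float hC, float wC)
{
    int hT = std::max(0, (int)std::floor(hC));  // top neighbor row
    int hB = std::min(hT + 1, ih - 1);          // bottom neighbor row, clamped
    int wL = std::max(0, (int)std::floor(wC));  // left neighbor column
    int wR = std::min(wL + 1, iw - 1);          // right neighbor column, clamped
    float top = lerp(in[hT * iw + wL], in[hT * iw + wR], wC - wL);
    float bottom = lerp(in[hB * iw + wL], in[hB * iw + wR], wC - wL);
    return lerp(top, bottom, hC - hT);  // equal neighbors collapse to a copy
}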
+ +__kernel void resize_bilinear(const int ih, + const int ih_str, + const int ih_off, + const int iw, + const int iw_str, + const int iw_off, + const int oh, + const int oh_str, + const int oh_off, + const int ow, + const int ow_str, + const int ow_off, + const float ratioh, + const float ratiow, + __global T *input, + __global T *output) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + int idz = get_global_id(2); + + if (idx >= oh || idy >= ow) { + return; + } + + float2 posi; + float2 ratio; + ratio.x = ratioh; + ratio.y = ratiow; + + posi.x = (float)idx * ratio.x; + posi.y = (float)idy * ratio.y; + + int4 tblr; + tblr.x = max(0, (int)floor(posi.y)); // T + tblr.y = min(tblr.x + 1, iw - 1); // B + tblr.z = max(0, (int)floor(posi.x)); // L + tblr.w = min(tblr.z + 1, ih - 1); // R + + int out_off = (idz * ow_str + idy + ow_off) * oh_str + idx + oh_off; + int4 in_off; + in_off.x = (idz * iw_str + tblr.x + iw_off) * ih_str + tblr.z + ih_off; // TL_off + in_off.y = (idz * iw_str + tblr.x + iw_off) * ih_str + tblr.w + ih_off; // TR_off + in_off.z = (idz * iw_str + tblr.y + iw_off) * ih_str + tblr.z + ih_off; // BL_off + in_off.w = (idz * iw_str + tblr.y + iw_off) * ih_str + tblr.w + ih_off; // BR_off + + T4 val_TL, val_TR, val_BL, val_BR; + val_TL = vload4(0, input + (in_off.x << 2)); + val_TR = vload4(0, input + (in_off.y << 2)); + val_BL = vload4(0, input + (in_off.z << 2)); + val_BR = vload4(0, input + (in_off.w << 2)); + float dif1 = posi.x - (float)tblr.z; // C-L + float dif2 = posi.y - (float)tblr.x; // C-T + + T4 top = mad((val_TR - val_TL), dif1, val_TL); + T4 bottom = mad((val_BR - val_BL), dif1, val_BL); + T4 out = mad((bottom - top), dif2, top); + vstore4(out, 0, output + (out_off << 2)); +} diff --git a/compute/image/src/gpu/mali/cl/resize_bilinear_nchw.cl b/compute/image/src/gpu/mali/cl/resize_bilinear_nchw.cl new file mode 100644 index 00000000..1a452573 --- /dev/null +++ b/compute/image/src/gpu/mali/cl/resize_bilinear_nchw.cl @@ -0,0 +1,72 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
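The kernel above addresses NCWHC4-packed memory: every element is a 4-channel vector, elements run height-fastest inside padded strides, and the `<< 2` converts an element offset into a component index for vload4/vstore4. A small sketch of that addressing (the helper name is mine):

// Element (vector) offset of position (h, w) in channel-group idz of an
// NCWHC4 buffer with padded strides h_str/w_str and offsets h_off/w_off.
inline int ncwhc4_elem_off(int idz, int w, int h, int w_str, int h_str, int w_off, int h_off)
{
    return (idz * w_str + w + w_off) * h_str + h + h_off;
}
// vload4(0, input + (off << 2)) then reads the four channel components of
// the element at that offset.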
+ +__kernel void resize_bilinear_nchw(const int ih, + const int ih_str, + const int ih_off, + const int iw, + const int iw_str, + const int iw_off, + const int oh, + const int oh_str, + const int oh_off, + const int ow, + const int ow_str, + const int ow_off, + const float ratioh, + const float ratiow, + __global T *input, + __global T *output) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + int idz = get_global_id(2); + + if (idx >= ow || idy >= oh) { + return; + } + + float2 posi; + float2 ratio; + ratio.x = ratiow; + ratio.y = ratioh; + + posi.x = (float)idx * ratio.x; + posi.y = (float)idy * ratio.y; + + int4 tblr; + tblr.x = max(0, (int)floor(posi.x)); // L + tblr.y = min(tblr.x + 1, iw - 1); // R + tblr.z = max(0, (int)floor(posi.y)); // T + tblr.w = min(tblr.z + 1, ih - 1); // B + + int4 in_off; + in_off.x = (idz * ih_str + tblr.z + ih_off) * iw_str + tblr.x + iw_off; // TL_off + in_off.y = (idz * ih_str + tblr.z + ih_off) * iw_str + tblr.y + iw_off; // TR_off + in_off.z = (idz * ih_str + tblr.w + ih_off) * iw_str + tblr.x + iw_off; // BL_off + in_off.w = (idz * ih_str + tblr.w + ih_off) * iw_str + tblr.y + iw_off; // BR_off + + T val_TL, val_TR, val_BL, val_BR; + val_TL = input[in_off.x]; + val_TR = input[in_off.y]; + val_BL = input[in_off.z]; + val_BR = input[in_off.w]; + float dif1 = posi.x - (float)tblr.x; // C-L + float dif2 = posi.y - (float)tblr.z; // C-T + + float top = mad((float)(val_TR - val_TL), dif1, (float)val_TL); + float bottom = mad((float)(val_BR - val_BL), dif1, (float)val_BL); + T out = mad((bottom - top), dif2, top); + int out_off = (idz * oh_str + idy + oh_off) * ow_str + idx + ow_off; + output[out_off] = out; +} diff --git a/compute/image/src/gpu/mali/fp16/resize_bilinear_mali_fp16.cpp b/compute/image/src/gpu/mali/fp16/resize_bilinear_mali_fp16.cpp new file mode 100644 index 00000000..f098bd75 --- /dev/null +++ b/compute/image/src/gpu/mali/fp16/resize_bilinear_mali_fp16.cpp @@ -0,0 +1,113 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
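Both kernels receive their sampling ratios precomputed by the host. The host code below derives them in align-corners style, so output corner pixels land exactly on input corner pixels; a sketch, assuming the output size is greater than 1 as the (oh - 1) divisor implies:

// Align-corners ratio, as computed in resize_bilinear_mali_fp16.cpp below.
float align_corners_ratio(int in_len, int out_len)
{
    return (float)(in_len - 1) / (float)(out_len - 1);  // requires out_len > 1
}
// Kernel side: input row = idx * ratio, so idx == 0 samples row 0 and
// idx == out_len - 1 samples row in_len - 1 exactly.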
+ +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "image.h" +#include "gpu/mali/fp16/resize_bilinear_mali_fp16.h" +#include + +inline EE resize_checkpara_mali_fp16(TensorDesc inputDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != outputDesc.dt) { + return NOT_SUPPORTED; + } + if (outputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE resize_bilinear_core_mali_fp16( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + U32 iw, ih, ic, in; + U32 ow, oh, oc, on; + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + + F32 v_ratio[2] = {(F32)(ih - 1) / (F32)(oh - 1), (F32)(iw - 1) / (F32)(ow - 1)}; + + U32 iw_str, ih_str, iw_off, ih_off; + U32 ow_str, oh_str, ow_off, oh_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, NULL, &iw_off, &ih_off); + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + + cl_mem inbuf = input->mem; + cl_mem outbuf = output->mem; + + U32 gs[3] = {oh, ow, (oc + 3) / 4}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "resize_bilinear", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, ih_str, ih_off, iw, iw_str, iw_off, oh, oh_str, + oh_off, ow, ow_str, ow_off, v_ratio[0], v_ratio[1], inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "resize_bilinear"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "resize_bilinear")); + CHECK_STATUS(gcl_print_memory(handle, input, "resize_input")); + CHECK_STATUS(gcl_print_memory(handle, output, "resize_output")); +#endif + return SUCCESS; +} + +inline EE resize_bilinear_core_nchw_mali_fp16( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + U32 iw, ih, ic, in; + U32 ow, oh, oc, on; + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + + F32 v_ratio[2] = {(F32)(ih - 1) / (F32)(oh - 1), (F32)(iw - 1) / (F32)(ow - 1)}; + + U32 iw_str, ih_str, iw_off, ih_off; + U32 ow_str, oh_str, ow_off, oh_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, NULL, &iw_off, &ih_off); + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + + cl_mem inbuf = input->mem; + cl_mem outbuf = output->mem; + char kernelname[128]; + sprintf(kernelname, "resize_bilinear_nchw"); + U32 gs[3] = {ow, oh, oc}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, ih_str, ih_off, iw, iw_str, iw_off, oh, oh_str, + oh_off, ow, ow_str, ow_off, v_ratio[0], v_ratio[1], inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; +} + +EE resize_bilinear_mali_fp16( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + CHECK_STATUS(resize_checkpara_mali_fp16(inputDesc, outputDesc)); + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + if (input->desc.memFormat == DF_NCHW) { + CHECK_STATUS( + resize_bilinear_core_nchw_mali_fp16(handle, inputDesc, input, outputDesc, output)); + } else { + CHECK_STATUS(resize_bilinear_core_mali_fp16(handle, inputDesc, input, 
outputDesc, output)); + } + return SUCCESS; +} diff --git a/compute/image/src/gpu/mali/fp16/resize_bilinear_mali_fp16.h b/compute/image/src/gpu/mali/fp16/resize_bilinear_mali_fp16.h new file mode 100644 index 00000000..02bc8a63 --- /dev/null +++ b/compute/image/src/gpu/mali/fp16/resize_bilinear_mali_fp16.h @@ -0,0 +1,20 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _RESIZE_MALI_FP16 +#define _RESIZE_MALI_FP16 +#include "image.h" + +EE resize_bilinear_mali_fp16( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output); +#endif diff --git a/compute/image/src/gpu/mali/image_mali.h b/compute/image/src/gpu/mali/image_mali.h new file mode 100644 index 00000000..110e67d6 --- /dev/null +++ b/compute/image/src/gpu/mali/image_mali.h @@ -0,0 +1,30 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
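The two cores above launch one work-item per output element and differ only in the z-dimension: the packed path writes one 4-channel vector per item, the planar path one scalar. A sketch of that choice (the function name is mine):

#include <cstdint>

// Global work sizes for the two layouts handled above.
void resize_work_sizes(uint32_t oh, uint32_t ow, uint32_t oc,
    uint32_t gs_ncwhc4[3], uint32_t gs_nchw[3])
{
    gs_ncwhc4[0] = oh;
    gs_ncwhc4[1] = ow;
    gs_ncwhc4[2] = (oc + 3) / 4;  // channel groups of four, rounded up
    gs_nchw[0] = ow;
    gs_nchw[1] = oh;
    gs_nchw[2] = oc;
}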
+ +#ifndef _H_IMAGE_MALI +#define _H_IMAGE_MALI + +#include "image.h" + +EE resize_infer_output_size_mali(TensorDesc inputDesc, + ResizeDesc resizeDesc, + void *params, + TensorDesc *outputDesc, + U32 *outputBytes, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE resize_bilinear_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output); + +#endif diff --git a/compute/image/src/gpu/mali/resize_bilinear.cpp b/compute/image/src/gpu/mali/resize_bilinear.cpp new file mode 100644 index 00000000..bd5280cb --- /dev/null +++ b/compute/image/src/gpu/mali/resize_bilinear.cpp @@ -0,0 +1,93 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
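A possible call pattern for the size-inference entry point declared above, assuming this repository's headers; resizeDesc.paramDT selects how the untyped params pointer is read (two F32 scale factors versus two U32 absolute sizes):

#include "gpu/mali/image_mali.h"

void infer_example(TensorDesc inputDesc, GCLMemDesc_t inMemDesc, GCLMemDesc_t outMemDesc)
{
    TensorDesc outputDesc;
    U32 outputBytes;
    ResizeDesc resizeDesc;

    resizeDesc.paramDT = DT_F32;   // params holds {scale_h, scale_w}
    F32 scales[2] = {2.0f, 2.0f};  // double height and width
    CHECK_STATUS(resize_infer_output_size_mali(
        inputDesc, resizeDesc, scales, &outputDesc, &outputBytes, inMemDesc, outMemDesc));

    resizeDesc.paramDT = DT_U32;  // params holds {target_h, target_w}
    U32 sizes[2] = {224, 224};
    CHECK_STATUS(resize_infer_output_size_mali(
        inputDesc, resizeDesc, sizes, &outputDesc, &outputBytes, inMemDesc, outMemDesc));
}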
+ +#include "gpu/mali/image_mali.h" +#include "gpu/mali/fp16/resize_bilinear_mali_fp16.h" + +EE resize_infer_output_size_mali(TensorDesc inputDesc, + ResizeDesc resizeDesc, + void *params, + TensorDesc *outputDesc, + U32 *outputBytes, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + /*tensorDesc record cpu org data format info*/ + /*gclmemDesc record gpu trans data format info*/ + if (outputDesc == nullptr || gclmemInputDesc == nullptr || outputBytes == nullptr || + gclmemOutputDesc == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt; + DataFormat idf; + U32 in, ic, ih, iw; + U32 oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + switch (resizeDesc.paramDT) { + case DT_F32: { + F32 *scales = (F32 *)params; + oh = ih * scales[0]; + ow = iw * scales[1]; + break; + } + case DT_U32: { + U32 *len = (U32 *)params; + oh = len[0]; + ow = len[1]; + break; + } + default: { + return NOT_SUPPORTED; + } + } + *outputDesc = tensor4df(idt, DF_NCHW, in, ic, oh, ow); + *outputBytes = tensorNumBytes(*outputDesc); + if ((idf == gclmemInputDesc->byteSize == 0 || gclmemInputDesc->memFormat == DF_NCHW) && ic <= 2) { + CHECK_STATUS(infer_gclmem_desc_nchw( + iw, ih, ic, 0, 0, ow, oh, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); + } else { + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih, ic, 0, 0, ow, oh, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); + } + return SUCCESS; +} + +inline EE resize_checkpara_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + if (handle == nullptr || nullptr == input || nullptr == output) { + return NULL_POINTER; + } + return SUCCESS; +} + +EE resize_bilinear_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + EE ret = SUCCESS; + CHECK_STATUS(resize_checkpara_mali(handle, inputDesc, input, outputDesc, output)); + switch (inputDesc.dt) { + case DT_F16: { + ret = resize_bilinear_mali_fp16(handle, inputDesc, input, outputDesc, output); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/image/src/image_processing.cpp b/compute/image/src/image_processing.cpp similarity index 69% rename from image/src/image_processing.cpp rename to compute/image/src/image_processing.cpp index 60283413..d7feecdd 100644 --- a/image/src/image_processing.cpp +++ b/compute/image/src/image_processing.cpp @@ -1,48 +1,55 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include #include #include #include "image.h" #include "tensor_desc.h" -#include "type.h" +#include "tensor.hpp" +#include "types.h" #include "error.h" - -template -std::shared_ptr get_resize_image(TensorDesc rgbDesc, void* rgb, TensorDesc imageDesc, ImageFormat targetImageFormat, float scaleValue) +template +std::shared_ptr get_resize_image( + Tensor rgbTensor, TensorDesc imageDesc, ImageFormat targetImageFormat, float scaleValue) { + ArchInfo archInfo; + auto arch = CPU_GENERAL; + archInfo.arch = arch; DataType rgbDt = DT_F16, imageDt = DT_F16; DataFormat rgbDf = DF_RGB, imageDf = DF_RGB; U32 rgbNum = 0, rgbChannel = 0, rgbHeight = 0, rgbWidth = 0; U32 imageNum = 0, imageChannel = 0, imageHeight = 0, imageWidth = 0; + TensorDesc rgbDesc = rgbTensor.get_desc(); CHECK_STATUS(tensor4dGet(rgbDesc, &rgbDt, &rgbDf, &rgbNum, &rgbChannel, &rgbHeight, &rgbWidth)); CHECK_REQUIREMENT(rgbDf == DF_RGB); CHECK_REQUIREMENT(rgbChannel == 3); CHECK_REQUIREMENT(rgbNum == 1); - CHECK_STATUS(tensor4dGet(imageDesc, &imageDt, &imageDf, &imageNum, &imageChannel, &imageHeight, &imageWidth)); + CHECK_STATUS(tensor4dGet( + imageDesc, &imageDt, &imageDf, &imageNum, &imageChannel, &imageHeight, &imageWidth)); CHECK_REQUIREMENT(imageDf == DF_NCHW); CHECK_REQUIREMENT(imageNum == 1); - U32 height = rgbHeight; - U32 width = rgbWidth; - - U32 totalBytes = tensorNumBytes(imageDesc); - T *transferSpacePtr = (T *)operator new(totalBytes); - T *transferSpacePtrMov = transferSpacePtr; + U32 height = rgbHeight; + U32 width = rgbWidth; + + Tensor temp; + std::shared_ptr transferSpaceTensor(new Tensor()); + transferSpaceTensor->resize(imageDesc); + transferSpaceTensor->alloc(); + T *transferSpacePtrMov = (T *)get_ptr_from_tensor(*transferSpaceTensor, arch); // magic number float meanRGB[3] = {122.6789143406786, 116.66876761696767, 104.0069879317889}; @@ -82,11 +89,10 @@ std::shared_ptr get_resize_image(TensorDesc rgbDesc, void* rgb, TensorDesc i transform[2] = 2; break; default: - std::cerr << "[ERROR] unsupported image format" << std::endl; - exit(1); + UNI_ERROR_LOG("[ERROR] unsupported image format\n"); return nullptr; } - + // consider the dataformat if (targetImageFormat == RGB_SC) { // Specific for Birealnet18, scale short edge to 224 first F32 scale = 224.0 / UNI_MIN(height, width); @@ -97,26 +103,29 @@ std::shared_ptr get_resize_image(TensorDesc rgbDesc, void* rgb, TensorDesc i height = (U32)(scale * height + 0.5); width = 224; } + Tensor scaleTensor; TensorDesc scaledDesc = tensor4df(imageDt, imageDf, imageNum, imageChannel, height, width); - T *scaled = (T*)malloc(tensorNumBytes(scaledDesc)); - resize(rgbDesc, rgb, scaledDesc, scaled, CPU_GENERAL); + 
scaleTensor.resize(scaledDesc); + scaleTensor.alloc(); + resize(rgbTensor, temp, scaleTensor, &archInfo); U32 h0 = (U32)((height - 224) * 0.5); U32 w0 = (U32)((width - 224) * 0.5); + T *scaled = (T *)get_ptr_from_tensor(scaleTensor, arch); for (U32 c : transform) { for (U32 h = h0; h < h0 + imageHeight; h++) { for (U32 w = w0; w < w0 + imageWidth; w++) { - T value = (scaled[c*height*width + h*width + w] / 255 - meanRGBSC[c]) / stdRGBSC[c]; + T value = (scaled[c * height * width + h * width + w] / 255 - meanRGBSC[c]) / + stdRGBSC[c]; CHECK_REQUIREMENT(!UNI_ISNAN(value)); *transferSpacePtrMov = value; transferSpacePtrMov++; } } } - free(scaled); } else if (targetImageFormat == RGB_RAW) { - resize(rgbDesc, rgb, imageDesc, transferSpacePtr, CPU_GENERAL); + resize(rgbTensor, temp, *transferSpaceTensor.get(), &archInfo); } else if (targetImageFormat == RGB_SC_RAW || targetImageFormat == BGR_SC_RAW) { F32 scale = 256.0 / UNI_MIN(height, width); if (height < width) { @@ -126,39 +135,44 @@ std::shared_ptr get_resize_image(TensorDesc rgbDesc, void* rgb, TensorDesc i height = (U32)(scale * (F32)height + 0.5); width = 256; } + Tensor scaleTensor; TensorDesc scaledDesc = tensor4df(imageDt, imageDf, imageNum, imageChannel, height, width); - T *scaled = (T*)malloc(tensorNumBytes(scaledDesc)); - resize(rgbDesc, rgb, scaledDesc, scaled, CPU_GENERAL); + scaleTensor.resize(scaledDesc); + scaleTensor.alloc(); + resize(rgbTensor, temp, scaleTensor, &archInfo); U32 h0 = (U32)((height - 224) * 0.5); U32 w0 = (U32)((width - 224) * 0.5); + T *scaled = (T *)get_ptr_from_tensor(scaleTensor, arch); for (U32 c : transform) { for (U32 h = h0; h < h0 + 224; h++) { - memcpy(transferSpacePtrMov, scaled + c*height*width + h*width + w0, 224*bytesOf(imageDt)); + memcpy(transferSpacePtrMov, scaled + c * height * width + h * width + w0, + 224 * bytesOf(imageDt)); transferSpacePtrMov += 224; } } - free(scaled); } else { - T *resized = (T*)malloc(tensorNumBytes(imageDesc)); - resize(rgbDesc, rgb, imageDesc, resized, CPU_GENERAL); + Tensor scaleTensor; + scaleTensor.resize(imageDesc); + scaleTensor.alloc(); + resize(rgbTensor, temp, scaleTensor, &archInfo); + T *resized = (T *)get_ptr_from_tensor(scaleTensor, arch); for (U32 c : transform) { for (U32 h = 0; h < imageHeight; h++) { for (U32 w = 0; w < imageWidth; w++) { - T value = (resized[c*imageHeight*imageWidth + h*imageWidth + w] - 1.0*meanRGB[c]) * scaleValue; + T value = (resized[c * imageHeight * imageWidth + h * imageWidth + w] - + 1.0 * meanRGB[c]) * + scaleValue; CHECK_REQUIREMENT(!UNI_ISNAN(value)); *transferSpacePtrMov = value; transferSpacePtrMov++; } } } - free(resized); } - - std::shared_ptr val((U8*)transferSpacePtr); - return val; + return transferSpaceTensor; } // CImg load image to save in RGB format @@ -169,23 +183,25 @@ std::shared_ptr get_resize_image(TensorDesc rgbDesc, void* rgb, TensorDesc i // numpy use OpenCV to load image // Assume most networks require 224*224 inputs -std::shared_ptr load_resize_image(TensorDesc rgbDesc, void* rgb, TensorDesc imageDesc, ImageFormat targetImageFormat, float scaleValue) +std::shared_ptr load_resize_image( + Tensor rgbTensor, TensorDesc imageDesc, ImageFormat targetImageFormat, float scaleValue) { DataType imageDt = DT_F32; DataFormat imageDf; U32 imageNum, imageChannel, imageHeight, imageWidth; - CHECK_STATUS(tensor4dGet(imageDesc, &imageDt, &imageDf, &imageNum, &imageChannel, &imageHeight, &imageWidth)); + CHECK_STATUS(tensor4dGet( + imageDesc, &imageDt, &imageDf, &imageNum, &imageChannel, &imageHeight, &imageWidth)); 
switch (imageDt) { #ifdef __aarch64__ case DT_F16: { - return get_resize_image(rgbDesc, rgb, imageDesc, targetImageFormat, scaleValue); + return get_resize_image(rgbTensor, imageDesc, targetImageFormat, scaleValue); } #endif #ifdef _USE_FP32 case DT_F32: { - return get_resize_image(rgbDesc, rgb, imageDesc, targetImageFormat, scaleValue); + return get_resize_image(rgbTensor, imageDesc, targetImageFormat, scaleValue); } #endif default: { @@ -195,7 +211,7 @@ std::shared_ptr load_resize_image(TensorDesc rgbDesc, void* rgb, TensorDesc } } -template +template std::shared_ptr gen_fake_image(TensorDesc inputDesc) { DataType dt; @@ -221,7 +237,7 @@ std::shared_ptr gen_fake_image(TensorDesc inputDesc) } } - std::shared_ptr val((U8*)transferSpacePtr); + std::shared_ptr val((U8 *)transferSpacePtr); return val; } diff --git a/compute/image/src/resize.cpp b/compute/image/src/resize.cpp new file mode 100644 index 00000000..9f97b86e --- /dev/null +++ b/compute/image/src/resize.cpp @@ -0,0 +1,157 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
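For reference, the per-channel normalization that get_resize_image applies in its default branch reduces to mean subtraction followed by scaling (the helper name is mine; callers in the tests below pass scaleValue = 0.017):

// value = (pixel - meanRGB[c]) * scaleValue, as in the default branch above.
float normalize_pixel(float pixel, float channel_mean, float scale_value)
{
    return (pixel - channel_mean) * scale_value;
}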
+ +#include "image.h" +#ifdef _USE_GENERAL +#include "cpu/general/image_general.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/image_arm.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/image_mali.h" +#endif +#include + +// params is a pointer to either the target size or the resize ratios +// When resizeDesc specifies DT_U32, params should point to target sizes (height and width) +// When resizeDesc specifies DT_F32, params should point to resize ratios +EE resize_infer_output_size_cpu(TensorDesc inputDesc, + ResizeDesc resizeDesc, + void *params, + TensorDesc *outputDesc, + U32 *outputBytes) +{ + if (nullptr == outputDesc || nullptr == outputBytes) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt; + DataFormat idf; + U32 in, ic, ih, iw; + U32 oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + + switch (resizeDesc.paramDT) { + case DT_F32: { + F32 *scales = (F32 *)params; + oh = ih * scales[0]; + ow = iw * scales[1]; + break; + } + case DT_U32: { + U32 *len = (U32 *)params; + oh = len[0]; + ow = len[1]; + break; + } + default: { + return NOT_SUPPORTED; + } + } + *outputDesc = tensor4df(idt, idf, in, ic, oh, ow); + *outputBytes = tensorNumBytes(*outputDesc); + return SUCCESS; +} + +EE resize_infer_output_size(Tensor *inputTensor, + ResizeDesc resizeDesc, + void *params, + Tensor *outputTensor, + U32 *outputBytes, + ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = resize_infer_output_size_mali(inputDesc, resizeDesc, params, &outputDesc, outputBytes, + &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + ret = resize_infer_output_size_cpu(inputDesc, resizeDesc, params, &outputDesc, outputBytes); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE resize(Tensor inputTensor, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + + DataType idt, odt; + DataFormat idf, odf; + U32 in, ic, ih, iw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + CHECK_REQUIREMENT(in == on && ic == oc); + + if (ih == oh && iw == ow && archInfo->arch != MALI) { + memcpy(output, input, tensorNumBytes(inputDesc)); + return SUCCESS; + } + + TensorDesc inDescARM = inputDesc; + U8 *inputARM = (U8 *)input; + TensorDesc outDescARM = outputDesc; + U8 *outputARM = (U8 *)output; + if (DF_NCHWC8 != inputDesc.df && IS_ARM(arch)) { + U32 paddedC = (inputDesc.dims[2] + 7) / 8 * 8; + inDescARM.dims[2] = paddedC; + inDescARM.df = DF_NCHWC8; + outDescARM.dims[2] = paddedC; + outDescARM.df = DF_NCHWC8; + inputARM = (U8 *)tmp; + outputARM = inputARM + tensorNumBytes(inDescARM); + transformNCHWToNCHWC8(inputDesc, input, inDescARM, inputARM); + } + EE ret = 
NOT_SUPPORTED; + + if (IS_GENERAL(arch) || IS_X86_AVX2(arch)) { +#if defined(_USE_GENERAL) || defined(_USE_X86) + ret = resize_bilinear_general(inputDesc, input, outputDesc, output); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = resize_bilinear_arm(inDescARM, inputARM, outDescARM, outputARM); +#endif + +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = resize_bilinear_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (GCLMem_t)input, outputDesc, (GCLMem_t)output); + +#endif + } + if (DF_NCHWC8 != outputDesc.df && IS_ARM(arch)) { + transformToNCHW(outDescARM, outputARM, outputDesc, output); + } + return ret; +} diff --git a/compute/image/tests/CMakeLists.txt b/compute/image/tests/CMakeLists.txt new file mode 100644 index 00000000..60dcb75c --- /dev/null +++ b/compute/image/tests/CMakeLists.txt @@ -0,0 +1,12 @@ +function(image_test name) + add_executable(${name} ${name}.cpp) + link_image(${name}) +endfunction() + +set_test_c_cxx_flags() + +#image_test(test_image_processing) +#image_test(test_image_resize) +if (USE_MALI) + image_test(test_image_resize_ocl test_image_resize_ocl.cpp) +endif (USE_MALI) diff --git a/tests/test_image_processing.cpp b/compute/image/tests/test_image_processing.cpp similarity index 81% rename from tests/test_image_processing.cpp rename to compute/image/tests/test_image_processing.cpp index 95c833d9..0a1ff9c0 100644 --- a/tests/test_image_processing.cpp +++ b/compute/image/tests/test_image_processing.cpp @@ -1,25 +1,24 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
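Note on the ARM branch of resize() above: when the input is not already NCHWC8, channels are padded up to a multiple of eight and both the repacked input and the repacked output live in the caller's tmp buffer, so tmp must be sized for both. A sketch of the implied requirement (the helper name is mine):

#include <cstdint>

// Scratch bytes for the NCHWC8 repack path of resize(): a channel-padded
// copy of the input followed by a channel-padded output.
uint32_t resize_tmp_bytes(uint32_t n, uint32_t c, uint32_t ih, uint32_t iw,
    uint32_t oh, uint32_t ow, uint32_t bytes_per_elem)
{
    uint32_t padded_c = (c + 7) / 8 * 8;
    return n * padded_c * (ih * iw + oh * ow) * bytes_per_elem;
}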
- #include "ut_util.h" #include "tensor_desc.h" #include "image_processing.hpp" - -int main() { +int main() +{ TensorDesc rgbDesc = tensor4df(DT_U8, DF_RGB, 1, 3, 1280, 960); - U8* rgb = ut_input_v(tensorNumElements(rgbDesc), DT_U8, UT_INIT_POS); + U8 *rgb = ut_input_v(tensorNumElements(rgbDesc), DT_U8, UT_INIT_POS); TensorDesc imageDesc = tensor4df(DT_F32, DF_NCHW, 1, 3, 224, 224); load_resize_image(rgbDesc, rgb, imageDesc, RGB, 0.017); diff --git a/tests/test_image_resize.cpp b/compute/image/tests/test_image_resize.cpp similarity index 68% rename from tests/test_image_resize.cpp rename to compute/image/tests/test_image_resize.cpp index 5780aec5..e40b48d2 100644 --- a/tests/test_image_resize.cpp +++ b/compute/image/tests/test_image_resize.cpp @@ -1,23 +1,21 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
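The resize test that follows takes eight positional arguments, the input dimensions then the output dimensions (it checks argc == 9, batch 1 on both sides, and channel counts divisible by 8). A hypothetical invocation, assuming the binary keeps the source file's name:

./test_image_resize 1 8 32 32 1 8 64 64    # 32x32 -> 64x64 over 8 channels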
- #include #include "image.h" #include "ut_util.h" - -int resizeTest(int argc, char* argv[], DataType dt) +int resizeTest(int argc, char *argv[], DataType dt) { CHECK_REQUIREMENT(argc == 9); // in data @@ -30,6 +28,10 @@ int resizeTest(int argc, char* argv[], DataType dt) U32 oc = atoi(argv[6]); U32 oh = atoi(argv[7]); U32 ow = atoi(argv[8]); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; CHECK_REQUIREMENT(in == 1 && on == 1); CHECK_REQUIREMENT(ic % 8 == 0 && oc % 8 == 0); @@ -44,27 +46,24 @@ int resizeTest(int argc, char* argv[], DataType dt) scales[1] = (F32)ow / (F32)iw; // setup input, filter - U8 *input = ut_input_v(in*ic*ih*iw, dt, UT_INIT_RANDOM); - U8 *input_ref = ut_input_v(in*ic*ih*iw, dt, UT_INIT_ZERO); - memcpy(input_ref, input, bytesOf(dt)*in*ic*ih*iw); + U8 *input = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM); + U8 *input_ref = ut_input_v(in * ic * ih * iw, dt, UT_INIT_ZERO); + memcpy(input_ref, input, bytesOf(dt) * in * ic * ih * iw); // setup output U32 outputBytes; - CHECK_STATUS(resize_infer_output_size(inputDesc, resizeDesc, scales, &outputDesc, &outputBytes)); - CHECK_REQUIREMENT(tensorNumElements(outputDesc) == on*oc*oh*ow); + CHECK_STATUS(resize_infer_output_size( + inputDesc, resizeDesc, scales, &outputDesc, &outputBytes, &archInfo)); + CHECK_REQUIREMENT(tensorNumElements(outputDesc) == on * oc * oh * ow); U32 output_size = outputBytes / bytesOf(dt); - U8 *output = ut_input_v(output_size, dt, UT_INIT_ZERO); + U8 *output = ut_input_v(output_size, dt, UT_INIT_ZERO); U8 *output_ref = ut_input_v(output_size, dt, UT_INIT_ZERO); if (UT_CHECK) { - CHECK_STATUS(resize(inputDesc, input, - outputDesc, output, - UT_ARCH)); + CHECK_STATUS(resize(inputDesc, input, nullptr, outputDesc, output, &archInfo)); // naive implement - CHECK_STATUS(resize(inputDesc, input_ref, - outputDesc, output_ref, - CPU_GENERAL)); + CHECK_STATUS(resize(inputDesc, input_ref, nullptr, outputDesc, output_ref, &archInfo_org)); // check ut_check_v(output, output_ref, output_size, dt, 0.05, __FILE__, __LINE__); @@ -73,9 +72,7 @@ int resizeTest(int argc, char* argv[], DataType dt) // benchmark double time_start = ut_time_ms(); for (int iter = 0; iter < UT_LOOPS; iter++) { - CHECK_STATUS(resize(inputDesc, input_ref, - outputDesc, output_ref, - CPU_GENERAL)); + CHECK_STATUS(resize(inputDesc, input_ref, nullptr, outputDesc, output_ref, &archInfo_org)); } double time_end = ut_time_ms(); double time = (time_end - time_start) / UT_LOOPS; @@ -83,9 +80,7 @@ int resizeTest(int argc, char* argv[], DataType dt) // log performance data char buffer[150]; char params[120]; - sprintf(params, "(%u %u %u %u)=>(%u %u %u %u)", - in, ic, ih, iw, - on, oc, oh, ow); + sprintf(params, "(%u %u %u %u)=>(%u %u %u %u)", in, ic, ih, iw, on, oc, oh, ow); sprintf(buffer, "%20s, %80s", "Resize", params); double ops = 15.0 * on * oc * oh * ow; ut_log(dt, buffer, ops, time); @@ -97,7 +92,7 @@ int resizeTest(int argc, char* argv[], DataType dt) return 0; } -int main(int argc, char* argv[]) +int main(int argc, char *argv[]) { #ifdef _USE_FP16 resizeTest(argc, argv, DT_F16); diff --git a/compute/image/tests/test_image_resize_ocl.cpp b/compute/image/tests/test_image_resize_ocl.cpp new file mode 100644 index 00000000..41371b70 --- /dev/null +++ b/compute/image/tests/test_image_resize_ocl.cpp @@ -0,0 +1,164 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "types.h" +#include "image.h" +#include "ut_util.h" +#include "gcl.h" +#include "libkernelsource.h" +#include "tensor_computing.h" + +#ifdef _USE_FP16 +inline GCLMem_t alloc(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_map(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->mapped_alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_bytes(Tensor tensor, U32 size) +{ + auto mem = (OclMemory *)tensor.get_memory(); + GCLMem_t ptr = NULL; + if (size > 0) { + mem->resize(tensor1d(DT_U8, size)); + mem->alloc(); + ptr = (GCLMem_t)mem->get_ptr(); + } + return ptr; +} +int resizeTest(int argc, char *argv[], DataType dt) +{ + CHECK_REQUIREMENT(argc == 9); + // in data + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + // output + U32 on = atoi(argv[5]); + U32 oc = atoi(argv[6]); + U32 oh = atoi(argv[7]); + U32 ow = atoi(argv[8]); + + CHECK_REQUIREMENT(in == 1 && on == 1); + + ArchInfo archInfo; + archInfo.arch = MALI; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + TensorDesc inputDesc_cpu, inputDesc_gpu, outputDesc_cpu, outputDesc_gpu; + ResizeDesc resizeDesc; + inputDesc_cpu = tensor4df(dt, DF_NCHW, in, ic, ih, iw); + inputDesc_gpu = tensor4df(dt, DF_NCHW, in, ic, ih, iw); + + resizeDesc.paramDT = DT_F32; + F32 scales[2]; + scales[0] = (F32)oh / (F32)ih; + scales[1] = (F32)ow / (F32)iw; + + // setup input + U8 *input_cpu = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM); + + Tensor inputTensorCpu; + inputTensorCpu.resize(inputDesc_cpu); + inputTensorCpu.alloc(); + memcpy(get_ptr_from_tensor(inputTensorCpu, UT_ARCH), input_cpu, tensorNumBytes(inputDesc_cpu)); + Tensor outputTensorCpu; + Tensor tmpTensorCpu; + U32 outputBytes; + CHECK_STATUS(resize_infer_output_size( + &inputTensorCpu, resizeDesc, scales, &outputTensorCpu, &outputBytes, &archInfo_org)); + outputTensorCpu.alloc(); + + // naive implement + // CPU output + CHECK_STATUS(resize(inputTensorCpu, tmpTensorCpu, outputTensorCpu, &archInfo_org)); + std::shared_ptr handleSharedPtr = OCLContext::getInstance().handle; + ; + GCLHandle_t handle = handleSharedPtr.get(); + std::vector kernelVec; + handle->kernelVec = &kernelVec; + Tensor inputTensor = Tensor(OCLMem); + Tensor outputTensor = Tensor(OCLMem); + Tensor tmpTensor = Tensor(OCLMem); + inputTensor.resize(inputDesc_gpu); + U8 
*output_gpu = NULL; + + MaliPara maliPara; + maliPara.handle = handle; + archInfo.archPara = &maliPara; + + CHECK_STATUS(resize_infer_output_size( + &inputTensor, resizeDesc, scales, &outputTensor, &outputBytes, &archInfo)); + U32 maxBytes = 0; + U32 tmpBytes = 0; + + GCLMem_t output = alloc_map(outputTensor); + GCLMem_t input = alloc(inputTensor); + CHECK_STATUS(gcl_fill_memory_zero(handle, input)); + tmpBytes = tensorNumBytes(inputDesc_gpu); + maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes; + GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes); + CHECK_STATUS(ocl_set_input(handle, input, inputDesc_gpu, input_cpu, tmpbuf, true)); + + CHECK_STATUS(resize(inputTensor, tmpTensor, outputTensor, &archInfo)); + /*warp up*/ + UNI_INFO_LOG("Warp up gpu:\n") + for (U32 i = 0; i < 2; i++) { + CHECK_STATUS(gcl_run_kernelVec(handle)); + } + + UNI_INFO_LOG("Run:\n") +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size())); + double time = handle->t_execute * 0.001; +#else + CHECK_STATUS(gcl_run_kernelVec(handle)); +#endif + outputDesc_gpu = outputTensor.get_desc(); + ; + CHECK_STATUS(ocl_get_output(handle, output, outputDesc_gpu, true)); + output_gpu = output->mapPtrArray.back(); + + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)->(%u %u %u %u)", in, ic, ih, iw, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "bilinear", params); +#ifdef _DEBUG + double ops = on * oc * oh * ow * 4; // TO DO + ut_log(dt, buffer, ops, time); +#endif + ut_check_a(output_gpu, get_ptr_from_tensor(outputTensorCpu, UT_ARCH), on * oc * ow * oh, dt); + CHECK_STATUS(gcl_finish(handle)); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + free(input_cpu); + return 0; +} +#endif + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + resizeTest(argc, argv, DT_F16); +#endif + return 0; +} diff --git a/compute/tensor/CMakeLists.txt b/compute/tensor/CMakeLists.txt new file mode 100644 index 00000000..b0bd366e --- /dev/null +++ b/compute/tensor/CMakeLists.txt @@ -0,0 +1,20 @@ +cmake_minimum_required(VERSION 3.2) + +file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake) +if (BOLT_CONFIGURE_FILE) + include(${BOLT_CONFIGURE_FILE}) +else (BOLT_CONFIGURE_FILE) + message(FATAL_ERROR " +FATAL: can not find bolt.cmake in directory, + please set shell or cmake environment variable BOLT_ROOT. + ") +endif (BOLT_CONFIGURE_FILE) + +project(tensor) + +set_c_cxx_flags() + +include_tensor() + +add_subdirectory(src) +add_subdirectory(tests) diff --git a/compute/tensor/include/tensor_computing.h b/compute/tensor/include/tensor_computing.h new file mode 100644 index 00000000..42816f46 --- /dev/null +++ b/compute/tensor/include/tensor_computing.h @@ -0,0 +1,702 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_TENSOR_COMPUTING +#define _H_TENSOR_COMPUTING + +#include +#include "sys.h" +#include "types.h" +#include "tensor.hpp" +#include "tensor_computing_type.h" + +EE convolution_infer_output_size(Tensor *inputTensor, + Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + Tensor *outputTensor, + DataType targetDataType, + ArchInfo_t archInfo); + +EE convolution_infer_forward_algorithm(Tensor inputTensor, + Tensor filterTensor, + Tensor outputTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionPolicy policy, + ConvolutionForwardAlgorithm *algorithm, + DataType targetDataType, + ActivationParamSpec activationDesc, + ArchInfo_t archInfo); + +EE convolution_transform_filter_bytes(Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes, + ArchInfo_t archInfo); + +EE convolution_transform_filter(Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + Tensor tmpTensor, + Tensor *ftmTensor, + ArchInfo_t archInfo); + +EE convolution_infer_forward_tmp_bytes(Tensor inputTensor, + Tensor filterTensor, + Tensor outputTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes, + ArchInfo_t archInfo); + +EE convolution(Tensor inputTensor, + Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + void *scale, + Tensor biasTensor, + Tensor tmpTensor, + Tensor outputTensor, + ActivationParamSpec activationDesc, + ArchInfo_t archInfo); + +EE deconvolution_infer_output_size(Tensor *inputTensor, + Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + Tensor *outputTensor, + DataType targetDataType, + ArchInfo_t archInfo); + +EE deconvolution_transform_filter_bytes(Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes, + ArchInfo_t archInfo); + +EE deconvolution_transform_filter(Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + Tensor tmpTensor, + Tensor *ftmTensor, + ArchInfo_t archInfo); + +EE deconvolution_infer_forward_algorithm(Tensor inputTensor, + Tensor filterTensor, + Tensor outputTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionPolicy policy, + ConvolutionForwardAlgorithm *algorithm, + DataType targetDataType, + ActivationParamSpec activationDesc, + ArchInfo_t archInfo); + +EE deconvolution_infer_forward_tmp_bytes(Tensor inputTensor, + Tensor filterTensor, + Tensor outputTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes, + ArchInfo_t archInfo); + +EE deconvolution(Tensor inputTensor, + Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + void *scale, + Tensor biasTensor, + Tensor tmpTensor, + Tensor outputTensor, + ActivationParamSpec activationDesc, + ArchInfo_t archInfo); + +EE depthwise_pointwise_convolution_infer_output_size(Tensor *inputTensor, + Tensor dwFilterTensor, + Tensor pwFilterTensor, + 
ConvolutionParamSpec convParamSpec, + Tensor *outputTensor, + DataType targetDataType, + ArchInfo_t archInfo); + +EE depthwise_pointwise_convolution_infer_forward_algorithm(Tensor inputTensor, + Tensor dwFilterTensor, + Tensor pwFilterTensor, + Tensor outputTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionPolicy policy, + DepthwiseConvolutionForwardAlgorithm *algorithm, + DataType targetDataType, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + ArchInfo_t archInfo); + +EE depthwise_pointwise_convolution_infer_forward_tmp_bytes(Tensor inputTensor, + Tensor dwFilterTensor, + Tensor pwFilterTensor, + Tensor outputTensor, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + U32 *bytes, + ArchInfo_t archInfo); + +EE depthwise_pointwise_convolution_transform_filter_bytes(Tensor dwFilterTensor, + Tensor pwFilterTensor, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + U32 *dwBytes, + U32 *pwBytes, + ArchInfo_t archInfo); + +EE depthwise_pointwise_convolution_transform_filter(Tensor dwFilterTensor, + Tensor pwFilterTensor, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + Tensor *dwFtm, + Tensor *pwFtm, + ArchInfo_t archInfo); + +EE depthwise_pointwise_convolution(Tensor inputTensor, + Tensor dwFilterTensor, + Tensor pwFilterTensor, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + Tensor dwBiasTensor, + Tensor pwBiasTensor, + Tensor tmpTensor, + Tensor outputTensor, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + ArchInfo_t archInfo); + +EE depthwise_convolution_infer_output_size(Tensor *inputTensor, + Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + Tensor *outputTensor, + DataType targetDataType, + ArchInfo_t archInfo); + +EE depthwise_convolution_infer_forward_algorithm(Tensor inputTensor, + Tensor filterTensor, + Tensor outputTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionPolicy policy, + DepthwiseConvolutionForwardAlgorithm *algorithm, + DataType targetDataType, + ActivationParamSpec depthwiseActivationParamSpec, + ArchInfo_t archInfo); + +EE depthwise_convolution_transform_filter_bytes(Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + U32 *bytes, + ArchInfo_t archInfo); + +EE depthwise_convolution_transform_filter(Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + Tensor *ftmTensor, + ArchInfo_t archInfo); + +EE depthwise_convolution_infer_forward_tmp_bytes(Tensor inputTensor, + Tensor filterTensor, + Tensor outputTensor, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + U32 *bytes, + ArchInfo_t archInfo); + +EE depthwise_convolution(Tensor inputTensor, + Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + Tensor biasTensor, + Tensor tmpTensor, + Tensor outputTensor, + ActivationParamSpec depthwiseActivationParamSpec, + ArchInfo_t archInfo); + +EE detectionoutput_infer_output_size(std::vector<Tensor *> inputTensor, + DetectionOutputParamSpec detectionOutputParamSpec, + Tensor *outputTensor, + ArchInfo_t archInfo); + +EE detectionoutput(std::vector<Tensor> inputTensor, + DetectionOutputParamSpec detectionOutputParamSpec, + Tensor outputTensor, + ArchInfo_t archInfo); + +EE 
pooling_infer_output_size(Tensor *inputTensor, + PoolingParamSpec poolingParamSpec, + Tensor *outputTensor, + ArchInfo_t archInfo); + +EE pooling_infer_forward_tmp_bytes( + Tensor inputTensor, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo); + +EE pooling(Tensor inputTensor, + PoolingParamSpec poolingParamSpec, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo); + +EE pooling_bp( + Tensor inputTensor, PoolingParamSpec poolingParamSpec, Tensor outputTensor, ArchInfo_t archInfo); + +EE priorbox_infer_output_size(std::vector<Tensor *> inputTensor, + PriorBoxParamSpec priorBoxParamSpec, + Tensor *outputTensor, + ArchInfo_t archInfo); + +EE priorbox(std::vector<Tensor> inputTensor, + PriorBoxParamSpec priorBoxParamSpec, + Tensor outputTensor, + ArchInfo_t archInfo); + +EE activation_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo); + +EE activation( + Tensor inputTensor, ActivationParamSpec activationDesc, Tensor outputTensor, ArchInfo_t archInfo); + +EE concat_infer_output_size( + std::vector<Tensor *> inputTensor, ConcatParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo); + +EE concat_infer_forward_tmp_bytes(std::vector<Tensor> inputTensor, U32 *bytes, ArchInfo_t archInfo); + +EE concat(std::vector<Tensor> inputTensor, + ConcatParamSpec p, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo); + +EE eltwise_infer_output_size( + std::vector<Tensor *> inputTensor, Tensor *outputTensor, ArchInfo_t archInfo); + +EE eltwise_infer_forward_tmp_bytes( + std::vector<Tensor> inputTensor, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo); + +EE eltwise(std::vector<Tensor> inputTensor, + EltwiseParamSpec eltwiseDesc, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo); + +EE split_infer_output_size(Tensor *inputTensor, std::vector<Tensor *> output); + +EE split(Tensor inputTensor, std::vector<Tensor> outputTensor, ArchInfo_t archInfo); + +EE fully_connected_infer_output_size( + Tensor *inputTensor, Tensor filterTensor, Tensor *outputTensor, ArchInfo_t archInfo); + +EE fully_connected_infer_forward_algorithm( + Tensor inputTensor, Tensor filterTensor, Tensor outputTensor, ArchInfo_t archInfo); + +EE fully_connected_infer_forward_tmp_bytes( + Tensor inputTensor, Tensor filterTensor, U32 *bytes, ArchInfo_t archInfo); + +EE fully_connected_transform_filter_bytes(Tensor filterTensor, U32 *bytes, ArchInfo_t archInfo); + +EE fully_connected_transform_filter( + Tensor inputTensor, Tensor filterTensor, Tensor *ftmTensor, ArchInfo_t archInfo); + +EE fully_connected(Tensor inputTensor, + Tensor filterTensor, + Tensor biasTensor, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo); + +EE softmax_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo); + +EE softmax(Tensor inputTensor, + SoftmaxParamSpec p, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo); + +EE softmax_infer_forward_tmp_bytes(Tensor inputTensor, U32 *bytes, ArchInfo_t archInfo); + +EE rnn_infer_output_size( + Tensor *inputTensor, RNNParamSpec rnnParamSpec, Tensor *outputTensor, ArchInfo_t archInfo); + +EE rnn_infer_forward_tmp_bytes(Tensor inputTensor, + Tensor filterTensor, + Tensor outputTensor, + RNNParamSpec rnnParamSpec, + U32 *bytes, + ArchInfo_t archInfo); + +EE rnn_transform_filter_bytes( + std::vector<Tensor> filterTensor, RNNParamSpec rnnParamSpec, U32 *bytes, ArchInfo_t archInfo); + +EE rnn_transform_filter(std::vector<Tensor> filterTensor, + RNNParamSpec rnnParamSpec, + std::vector<Tensor *> ftmTensor, + ArchInfo_t archInfo); + +EE rnn(Tensor inputTensor, + std::vector<Tensor> filterTensors, + std::vector<Tensor> 
biasTensors, + RNNParamSpec rnnParamSpec, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo); + +EE rnncell_infer_output_size(std::vector<Tensor *> inputTensor, + RNNParamSpec rnnParamSpec, + Tensor *outputTensor, + ArchInfo_t archInfo); + +EE rnncell_infer_forward_tmp_bytes(Tensor inputTensor, + Tensor filterTensor, + Tensor outputTensor, + RNNParamSpec rnnParamSpec, + U32 *bytes, + ArchInfo_t archInfo); + +EE rnncell_infer_forward_algorithm(Tensor xTensor, + Tensor filterTensor, + Tensor biasTensor, + RNNParamSpec rnncellDesc, + U32 batchStrideX, + U32 batchStrideH, + Tensor hTensor, + ArchInfo_t archInfo); + +EE rnncell(Tensor xTensor, + std::vector<Tensor> filterTensors, + std::vector<Tensor> biasTensors, + Tensor stateTensor, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + U32 tmpOffset, + Tensor tmpTensor, + Tensor hTensor, + ArchInfo_t archInfo); + +EE scale_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo); + +EE scale(Tensor inputTensor, + void *alpha, + void *beta, + ScaleParamSpec p, + Tensor outputTensor, + ArchInfo_t archInfo); + +EE prelu_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo); + +EE prelu(Tensor inputTensor, + Tensor weightTensor, + PReLUParamSpec preluDesc, + Tensor outputTensor, + ArchInfo_t archInfo); + +EE normalization_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo); + +EE layer_normalization(Tensor inputTensor, + Tensor alphaTensor, + Tensor betaTensor, + Tensor outputTensor, + ArchInfo_t archInfo); + +EE slice_infer_output_size( + Tensor *inputTensor, SliceParamSpec p, std::vector<Tensor *> outputTensor, ArchInfo_t archInfo); + +EE slice( + Tensor inputTensor, SliceParamSpec p, std::vector<Tensor> outputTensor, ArchInfo_t archInfo); + +EE tfslice_infer_output_size( + Tensor *inputTensor, TfSliceParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo); + +EE tfslice(Tensor inputTensor, TfSliceParamSpec p, Tensor outputTensor, ArchInfo_t archInfo); + +EE transpose_infer_output_size( + Tensor *inputTensor, TransposeParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo); + +EE transpose_infer_forward_tmp_bytes( + Tensor inputTensor, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo); + +EE transpose(Tensor inputTensor, + TransposeParamSpec p, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo); + +EE matmul_infer_output_size(Tensor *matrixATensor, + bool transposeA, + Tensor *matrixBTensor, + bool transposeB, + Tensor *matrixCTensor, + ArchInfo_t archInfo); + +EE matmul_infer_forward_algorithm(Tensor matrixATensor, + bool transposeA, + Tensor matrixBTensor, + bool transposeB, + Tensor matrixCTensor, + ArchInfo_t archInfo); + +EE matmul_infer_forward_tmp_bytes(Tensor matrixATensor, + bool transposeA, + Tensor matrixBTensor, + bool transposeB, + U32 *bytes, + ArchInfo_t archInfo); + +EE matmul(Tensor matrixATensor, + bool transposeA, + Tensor matrixBTensor, + bool transposeB, + Tensor tmpTensor, + Tensor matrixCTensor, + ArchInfo_t archInfo); + +EE reshape_infer_output_size( + Tensor *inputTensor, ReshapeParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo); + +EE reshape_infer_forward_tmp_bytes( + Tensor inputTensor, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo); + +EE reshape(Tensor inputTensor, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo); + +EE attention_infer_output_size(Tensor *inputTensor, AttentionParamSpec p, Tensor *outputTensor); + +EE attention(Tensor inputTensor, Tensor outputTensor, ArchInfo_t archInfo); + +EE 
power_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo); + +EE power(Tensor inputTensor, PowerParamSpec p, Tensor outputTensor, ArchInfo_t archInfo); + +EE clip_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo); + +EE clip(Tensor inputTensor, ClipParamSpec p, Tensor outputTensor, ArchInfo_t archInfo); + +EE bilateral_slice_apply_infer_output_size(Tensor *inputTensor, + Tensor *guideTensor, + Tensor *gridTensor, + BilateralSliceApplyParamSpec p, + Tensor *outputTensor, + ArchInfo_t archInfo); + +EE bilateral_slice_apply_infer_forward_tmp_bytes(Tensor inputTensor, + Tensor guideTensor, + Tensor gridTensor, + BilateralSliceApplyParamSpec p, + U32 *bytes, + ArchInfo_t archInfo); + +EE bilateral_slice_apply(Tensor inputTensor, + Tensor guideTensor, + Tensor gridTensor, + BilateralSliceApplyParamSpec p, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo); + +EE argmax_infer_output_size( + Tensor *inputTensor, ArgMaxParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo); + +EE argmax_infer_forward_tmp_bytes( + Tensor inputTensor, ArgMaxParamSpec p, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo); + +EE argmax( + Tensor inputTensor, ArgMaxParamSpec p, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo); + +EE reduction_infer_output_size( + Tensor *inputTensor, Tensor maskTensor, ReductionParamSpec p, Tensor *outputTensor); + +EE reduction_infer_forward_tmp_bytes( + Tensor inputTensor, ReductionParamSpec p, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo); + +EE reduction(Tensor inputTensor, + Tensor maskTensor, + ReductionParamSpec p, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo); + +EE check_infer_output_size( + std::vector<Tensor *> inputTensor, Tensor *outputTensor, ArchInfo_t archInfo); + +EE check(Tensor inputTensorA, + Tensor inputTensorB, + CheckParamSpec p, + Tensor outputTensor, + ArchInfo_t archInfo); + +EE squeeze_infer_output_size( + Tensor *inputTensor, SqueezeParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo); + +EE squeeze(Tensor inputTensor, Tensor outputTensor, ArchInfo_t archInfo); + +EE unsqueeze_infer_output_size( + Tensor *inputTensor, UnsqueezeParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo); + +EE unsqueeze(Tensor inputTensor, Tensor outputTensor, ArchInfo_t archInfo); + +EE space2depth_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo); + +EE space2depth(Tensor inputTensor, Tensor outputTensor, ArchInfo_t archInfo); + +EE depth2space_infer_output_size( + Tensor *inputTensor, Depth2SpaceParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo); + +EE depth2space_infer_forward_tmp_bytes( + Tensor inputTensor, Depth2SpaceParamSpec p, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo); + +EE depth2space(Tensor inputTensor, + Depth2SpaceParamSpec p, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo); + +EE attention_mask( + Tensor inputTensor, AttentionMaskParamSpec p, Tensor outputTensor, ArchInfo_t archInfo); + +EE attention_mask_infer_output_size(Tensor *inputTensor, Tensor *outputTensor); + +EE padding_infer_output_size( + Tensor *inputTensor, PadParamSpec padParamSpec, Tensor *outputTensor, ArchInfo_t archInfo); + +EE padding(Tensor inputTensor, PadParamSpec padParamSpec, Tensor outputTensor, ArchInfo_t archInfo); + +EE embedding_infer_output_size(Tensor *inputTensor, + EmbedParamSpec p, + DataType outputDt, + Tensor *outputTensor, + ArchInfo_t archInfo); + +EE embedding(Tensor inputTensor, + Tensor 
weightTensor, + EmbedParamSpec p, + Tensor outputTensor, + ArchInfo_t archInfo); + +EE yolov3detectionoutput_infer_output_size(std::vector<Tensor *> inputTensor, + Yolov3DetectionOutputParamSpec yolov3DetectionOutputParamSpec, + Tensor *outputTensor, + ArchInfo_t archInfo); + +EE yolov3detectionoutput(std::vector<Tensor> inputTensor, + Yolov3DetectionOutputParamSpec yolov3DetectionOutputParamSpec, + Tensor outputTensor, + ArchInfo_t archInfo); + +EE preallocated_memory_infer_output_size(Tensor *outputTensor, ArchInfo_t archInfo); + +EE preallocated_memory(Tensor outputTensor, ArchInfo_t archInfo); + +EE copy_infer_output_size(std::vector<Tensor *> inputTensor, ArchInfo_t archInfo); + +EE copy(std::vector<Tensor> inputTensor, + U32 srcOffset, + U32 dstOffset, + U32 srcStride, + U32 dstStride, + U32 length, + ArchInfo_t archInfo); + +EE non_max_suppression_infer_output_size(std::vector<Tensor *> inputTensor, + NonMaxSuppressionParamSpec p, + Tensor *outputTensor, + ArchInfo_t archInfo); + +EE non_max_suppression(std::vector<Tensor> inputTensor, + NonMaxSuppressionParamSpec p, + Tensor outputTensor, + ArchInfo_t archInfo); + +EE roialign_infer_output_size(std::vector<Tensor *> inputTensor, + RoiAlignParamSpec p, + Tensor *outputTensor, + ArchInfo_t archInfo); + +EE roialign( + std::vector<Tensor> inputTensor, RoiAlignParamSpec p, Tensor outputTensor, ArchInfo_t archInfo); + +EE multihead_attention_infer_output_size(Tensor *inputTensor, + std::vector<Tensor> filterTensor, + Tensor *outputTensor, + U32 *firstFCSliceNum, + ArchInfo_t archInfo); + +EE multihead_attention_infer_forward_algorithm(Tensor inputTensor, + std::vector<Tensor> filterTensor, + void *multiplyAlpha, + void *multiplyBeta, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + std::vector<bool> eltwiseWithLayerNormIn, + ActivationMode activation, + Tensor outputTensor, + ArchInfo_t archInfo); + +EE multihead_attention_infer_forward_tmp_bytes(Tensor inputTensor, + std::vector<Tensor> filterTensor, + std::vector<bool> eltwiseWithLayerNormIn, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + U32 *bytes, + ArchInfo_t archInfo); + +EE multihead_attention_transform_filter_bytes( + std::vector<Tensor> filterTensor, U32 *bytes, ArchInfo_t archInfo); + +EE multihead_attention_transform_filter( + std::vector<Tensor> filterTensor, std::vector<Tensor *> ftmTensor, ArchInfo_t archInfo); + +EE multihead_attention(Tensor inputTensor, + std::vector<Tensor> filterTensor, + std::vector<Tensor> biasTensor, + std::vector<Tensor> layerNormAlphaTensor, + std::vector<Tensor> layerNormBetaTensor, + void *multiplyAlpha, + void *multiplyBeta, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + std::vector<bool> eltwiseWithLayerNormIn, + ActivationMode activation, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo); + +EE channel_resize_infer_output_size( + Tensor *inputTensor, ChannelResizeParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo); + +EE channel_resize( + Tensor inputTensor, ChannelResizeParamSpec p, Tensor outputTensor, ArchInfo_t archInfo); + +EE l2normalization_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo); + +EE l2normalization(Tensor inputTensor, Tensor outputTensor, ArchInfo_t archInfo); + +EE tile_infer_output_size( + Tensor *inputTensor, TileParamSpec tileParamSpec, Tensor *outputTensor, ArchInfo_t archInfo); + +EE tile(Tensor inputTensor, TileParamSpec tileParamSpec, Tensor outputTensor, ArchInfo_t archInfo); + +EE quantize_tensor(TensorDesc dDesc, const void *data, TensorDesc *qDesc, void *qData, void *scale); + +#if defined(_USE_NEON) && defined(_USE_INT8) +void dequantize_int8_to_fp16(U32 len, INT8 *q, F32 scale, F16 *d); + +void dequantize_int32_to_fp16( + 
U32 len, I32 *q, F32 scale, F16 *d, U32 biasLen = 0, F16 *biasPtr = nullptr); +#endif + +#ifdef _USE_FP16 +void update_histogram(U32 len, const F16 *data, int numBins, F32 interval, F32 *histo); +#endif + +std::vector<F32> compress_histogram(std::vector<F32> &histogram, F32 numPerBin, F32 last_max); + +std::vector<F32> compute_scale_with_KL(std::vector<F32> &histogram, F32 interval); +#endif diff --git a/tensor_computing/include/tensor_computing_library_algorithm_search.h b/compute/tensor/include/tensor_computing_library_algorithm_search.h similarity index 84% rename from tensor_computing/include/tensor_computing_library_algorithm_search.h rename to compute/tensor/include/tensor_computing_library_algorithm_search.h index 6aaa5b2d..c7febed1 100644 --- a/tensor_computing/include/tensor_computing_library_algorithm_search.h +++ b/compute/tensor/include/tensor_computing_library_algorithm_search.h @@ -1,17 +1,16 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _H_TENSOR_COMPUTING_LIBRARY_ALGORITHM_SEARCH #define _H_TENSOR_COMPUTING_LIBRARY_ALGORITHM_SEARCH @@ -19,8 +18,7 @@ #include <map> #include <string> -#include "type.h" -#include "tensor_computing_type.h" +#include "types.h" extern std::map libraryAlgorithmMap; extern std::map libraryAlgorithmParameters; @@ -31,7 +29,7 @@ void saveLibraryAlgorithmMapToTxt(); std::string getConvolutionAlgorithmMapNameFromInput(TensorDesc inputDesc, TensorDesc filterDesc, - ConvolutionDesc convDesc, + ConvolutionParamSpec convParamSpec, DataType targetDataType); #endif #endif diff --git a/compute/tensor/include/tensor_computing_type.h b/compute/tensor/include/tensor_computing_type.h new file mode 100644 index 00000000..959233a9 --- /dev/null +++ b/compute/tensor/include/tensor_computing_type.h @@ -0,0 +1,70 @@ +// Copyright (C) 2019. 
Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_TENSOR_COMPUTING_TYPE +#define _H_TENSOR_COMPUTING_TYPE + +#include <vector> +#include "types.h" +#include "tensor.hpp" + +#ifdef _USE_MALI +#include "gcl.h" +#include "ocl_desc_trans.h" +#define ALIGN(len, align_num) ((len + align_num - 1) / align_num * align_num) +#endif + +ConvolutionParamSpec createConvolutionParamSpec(U32 group, + U32 kernelH, + U32 kernelW, + U32 strideH, + U32 strideW, + U32 paddingT, + U32 paddingB, + U32 paddingL, + U32 paddingR, + U32 dilateH, + U32 dilateW, + U32 num_outputs, + ConvolutionMode convMode); + +FullyConnectedParamSpec createFullyConnectedParamSpec( + U32 num_outputs, U32 num_slices, I32 *slice_point); + +PoolingParamSpec createPoolingParamSpec(PoolingMode pm, + U32 ksH, + U32 ksW, + U32 strideH, + U32 strideW, + U32 paddingT, + U32 paddingB, + U32 paddingL, + U32 paddingR, + RoundMode rm); + +ReshapeParamSpec createReshapeParamSpec(I32 *shape_dims, I32 shape_size, I32 axis, I32 num_axes); + +ClipParamSpec createClipParamSpec(float min, float max); + +SqueezeParamSpec createSqueezeParamSpec(int *axes, int axes_num); + +std::vector<TensorDesc> get_desc_from_tensors(std::vector<Tensor> tensors); +std::vector<TensorDesc> get_desc_from_tensor_ptrs(std::vector<Tensor *> tensors); + +std::vector<F32> get_scale_from_tensors(std::vector<Tensor> tensors); + +template <typename T> +std::vector<T> get_data_from_tensors(std::vector<Tensor> tensors, Arch arch); +template <typename T> +std::vector<T> get_data_from_tensor_ptrs(std::vector<Tensor *> tensors, Arch arch); +#endif diff --git a/tensor_computing/src/CMakeLists.txt b/compute/tensor/src/CMakeLists.txt similarity index 54% rename from tensor_computing/src/CMakeLists.txt rename to compute/tensor/src/CMakeLists.txt index 5f5d7256..39950f23 100644 --- a/tensor_computing/src/CMakeLists.txt +++ b/compute/tensor/src/CMakeLists.txt @@ -1,5 +1,6 @@ if (USE_GENERAL) file(GLOB general_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/general/*.cpp) + file(GLOB cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/*.cpp) endif (USE_GENERAL) if (USE_NEON) @@ -15,8 +16,18 @@ if (USE_NEON) endif (USE_INT8) file(GLOB arm_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/*.cpp) set(arm_srcs "${arm_srcs};${arm_fp16_srcs};${arm_fp32_srcs};${arm_int8_srcs};${arm_bnn_srcs}") + file(GLOB cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/*.cpp) endif (USE_NEON) +if (USE_X86) + if (USE_FP32) + file(GLOB x86_fp32_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/x86/fp32/*.cpp) + endif (USE_FP32) + file(GLOB x86_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/x86/*.cpp) + set(x86_srcs 
"${x86_srcs};${x86_fp32_srcs}") + file(GLOB cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/*.cpp) +endif (USE_X86) + if (USE_MALI) file(GLOB mali_fp16_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/mali/fp16/*.cpp) file(GLOB mali_uchar_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/mali/uchar/*.cpp) @@ -25,15 +36,20 @@ if (USE_MALI) endif (USE_MALI) file(GLOB srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) -set(srcs "${srcs};${general_srcs};${arm_srcs};${mali_srcs}") +set(srcs "${srcs};${general_srcs};${arm_srcs};${cpu_srcs};${mali_srcs};${x86_srcs}") include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # shared library -ADD_LIBRARY(${PROJECT_NAME} SHARED ${srcs}) +add_library(${PROJECT_NAME} SHARED ${srcs}) +target_link_libraries(${PROJECT_NAME} LINK_PUBLIC blas_enhance uni) # static library -ADD_LIBRARY(${PROJECT_NAME}_static STATIC ${srcs}) -SET_TARGET_PROPERTIES(${PROJECT_NAME}_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}") -SET_TARGET_PROPERTIES(${PROJECT_NAME} PROPERTIES CLEAN_DIRECT_OUTPUT 1) -SET_TARGET_PROPERTIES(${PROJECT_NAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) +add_library(${PROJECT_NAME}_static STATIC ${srcs}) + +set_target_properties(${PROJECT_NAME}_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}") +set_target_properties(${PROJECT_NAME} PROPERTIES CLEAN_DIRECT_OUTPUT 1) +set_target_properties(${PROJECT_NAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) +install(TARGETS ${PROJECT_NAME} ${PROJECT_NAME}_static + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) diff --git a/compute/tensor/src/activation.cpp b/compute/tensor/src/activation.cpp new file mode 100644 index 00000000..ec24a289 --- /dev/null +++ b/compute/tensor/src/activation.cpp @@ -0,0 +1,80 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +inline EE activation_infer_output_size_cpu(TensorDesc inputDesc, TensorDesc *outputDesc) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + + *outputDesc = inputDesc; + return SUCCESS; +} + +EE activation_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = activation_infer_output_size_mali( + inputDesc, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + ret = activation_infer_output_size_cpu(inputDesc, &outputDesc); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE activation( + Tensor inputTensor, ActivationParamSpec activationDesc, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = activation_cpu(inputDesc, input, activationDesc, outputDesc, output, arch); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = activation_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (GCLMem_t)input, outputDesc, (GCLMem_t)output, activationDesc.mode); +#endif + } + return ret; +} diff --git a/compute/tensor/src/argmax.cpp b/compute/tensor/src/argmax.cpp new file mode 100644 index 00000000..7cf2d414 --- /dev/null +++ b/compute/tensor/src/argmax.cpp @@ -0,0 +1,99 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
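A minimal usage sketch for the activation API implemented above (illustration only, not part of the patch; Tensor::alloc(), ARM_A76, and ACTIVATION_RELU are assumed from the surrounding codebase):

    ArchInfo archInfo;
    archInfo.arch = ARM_A76;  // assumed arch tag; any built-in backend works
    Tensor input, output;     // input is assumed to already hold a valid desc and data
    CHECK_STATUS(activation_infer_output_size(&input, &output, &archInfo));
    output.alloc();           // assumed Tensor allocation helper
    ActivationParamSpec act;
    act.mode = ACTIVATION_RELU;  // assumed activation mode enum value
    CHECK_STATUS(activation(input, act, output, &archInfo));

The same three-step flow (infer output size, allocate, run) applies to every operator declared in tensor_computing.h.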
+ +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +EE argmax( + Tensor inputTensor, ArgMaxParamSpec p, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#if defined(_USE_CPU) + ret = argmax_cpu(inputDesc, input, p, outputDesc, output); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + ret = argmax_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, p, + (GCLMem_t)tmp, outputDesc, (GCLMem_t)output); +#endif + } + return ret; +} + +EE argmax_infer_forward_tmp_bytes( + Tensor inputTensor, ArgMaxParamSpec p, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo) +{ + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + ret = argmax_infer_forward_tmp_bytes_mali(inputDesc, p, outputDesc, bytes); +#endif + } else { + *bytes = 0; + ret = SUCCESS; + } + return ret; +} + +EE argmax_infer_output_size( + Tensor *inputTensor, ArgMaxParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = argmax_infer_output_size_mali( + inputDesc, p, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + outputDesc = inputDesc; + int axis = p.axis; + if (axis < 0) { + axis += inputDesc.nDims; + } + axis = inputDesc.nDims - 1 - axis; + for (int i = axis; i < (I32)(inputDesc.nDims) - 1; i++) { + outputDesc.dims[i] = outputDesc.dims[i + 1]; + } + outputDesc.nDims = inputDesc.nDims - 1; + outputDesc.dt = DT_U32; + outputTensor->resize(outputDesc); + ret = SUCCESS; + } + return ret; +} diff --git a/tensor_computing/src/attention.cpp b/compute/tensor/src/attention.cpp similarity index 52% rename from tensor_computing/src/attention.cpp rename to compute/tensor/src/attention.cpp index 8003c092..71d7db2c 100644 --- a/tensor_computing/src/attention.cpp +++ b/compute/tensor/src/attention.cpp @@ -1,17 +1,16 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
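A worked example of the CPU shape rule in argmax_infer_output_size above: dims[] is stored innermost-first, so a non-negative axis is flipped with nDims - 1 - axis before the reduced dimension is shifted out (illustration only; values arbitrary):

    TensorDesc inDesc = tensor4df(DT_F16, DF_NCHW, 2, 3, 4, 5);
    ArgMaxParamSpec p;
    p.axis = 1;  // reduce over the channel axis
    // axis maps to dims index 4 - 1 - 1 = 2; after the shift the output
    // desc has nDims = 3, shape (2, 4, 5), and dt = DT_U32 (indices, not values).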
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include "tensor_computing.h" #ifdef _USE_GENERAL #include "cpu/general/tensor_computing_general.h" @@ -20,34 +19,48 @@ #include "cpu/arm/tensor_computing_arm.h" #endif -EE attention(TensorDesc inputDesc, const void *input, - TensorDesc outputDesc, void *output, Arch arch) +EE attention(Tensor inputTensor, Tensor outputTensor, ArchInfo_t archInfo) { + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { + if (IS_GENERAL(arch)) { #ifdef _USE_GENERAL ret = attention_general(inputDesc, input, outputDesc, output); #endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + UNI_WARNING_LOG("The x86 attention operator is not optimized now.\n"); + ret = attention_general(inputDesc, input, outputDesc, output); +#endif #ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { + } else if (IS_ARM(arch)) { ret = attention_arm(inputDesc, input, outputDesc, output); #endif } return ret; } -EE attention_infer_output_size(TensorDesc inputDesc, - U32 numHeads, U32 fromSequenceLength, U32 toSequenceLength, - TensorDesc *outputDesc) +EE attention_infer_output_size(Tensor *inputTensor, AttentionParamSpec p, Tensor *outputTensor) { - if (nullptr == outputDesc) + if (inputTensor == nullptr) { CHECK_STATUS(NULL_POINTER); - + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); DataType dt; + DataFormat df; U32 batch, sequenceLength; - CHECK_STATUS(tensor2dGet(inputDesc, &dt, &batch, &sequenceLength)); - - 
*outputDesc = tensor4df(dt, DF_NCHW, batch, numHeads, fromSequenceLength, toSequenceLength); - + CHECK_STATUS(tensor2dGet(inputDesc, &dt, &df, &batch, &sequenceLength)); + outputDesc = + tensor4df(dt, DF_NCHW, batch, p.num_heads, p.from_sequence_length, p.to_sequence_length); + outputTensor->resize(outputDesc); return SUCCESS; } diff --git a/tensor_computing/src/attention_mask.cpp b/compute/tensor/src/attention_mask.cpp similarity index 50% rename from tensor_computing/src/attention_mask.cpp rename to compute/tensor/src/attention_mask.cpp index d31b4dc3..7c2a1609 100644 --- a/tensor_computing/src/attention_mask.cpp +++ b/compute/tensor/src/attention_mask.cpp @@ -1,17 +1,16 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
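The shape effect of attention_infer_output_size above, with concrete numbers (illustration only; values arbitrary): a 2-D input of (batch, sequenceLength) expands to a 4-D NCHW score layout driven entirely by the param spec.

    // input desc: 2-D, batch = 8, sequenceLength = 128
    // AttentionParamSpec: num_heads = 12, from_sequence_length = 128,
    //                     to_sequence_length = 128
    // resulting output desc: tensor4df(dt, DF_NCHW, 8, 12, 128, 128)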
- #include "tensor_computing.h" #ifdef _USE_GENERAL #include "cpu/general/tensor_computing_general.h" @@ -19,33 +18,52 @@ #ifdef _USE_NEON #include "cpu/arm/tensor_computing_arm.h" #endif +#ifdef _USE_X86 +#include "cpu/x86/tensor_computing_x86.h" +#endif -EE attention_mask(TensorDesc inputDesc, const void* input, - I32 attentionLength, bool sameLength, float mask, - TensorDesc outputDesc, void* output, Arch arch) +EE attention_mask( + Tensor inputTensor, AttentionMaskParamSpec p, Tensor outputTensor, ArchInfo_t archInfo) { + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + // reinit mask value to avoid overflow - if (bytesOf(inputDesc.dt) == 2 && mask > 10000) - mask = 10000; + if (bytesOf(inputDesc.dt) == 2 && p.mask > 10000) { + p.mask = 10000; + } EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { + if (IS_GENERAL(arch)) { #ifdef _USE_GENERAL - ret = attention_mask_general(inputDesc, input, attentionLength, sameLength, mask, outputDesc, output); + ret = attention_mask_general(inputDesc, input, p, outputDesc, output); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = attention_mask_x86(inputDesc, input, p, outputDesc, output); #endif #ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = attention_mask_arm(inputDesc, input, attentionLength, sameLength, mask, outputDesc, output); + } else if (IS_ARM(arch)) { + ret = attention_mask_arm(inputDesc, input, p, outputDesc, output); #endif } return ret; } -EE attention_mask_infer_output_size(TensorDesc inputDesc, TensorDesc *outputDesc) +EE attention_mask_infer_output_size(Tensor *inputTensor, Tensor *outputTensor) { - if (nullptr == outputDesc) + if (inputTensor == nullptr) { CHECK_STATUS(NULL_POINTER); - if (inputDesc.nDims < 2) + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + if (inputDesc.nDims < 2) { return NOT_MATCH; - *outputDesc = inputDesc; + } + outputTensor->resize(inputDesc); return SUCCESS; } diff --git a/compute/tensor/src/bilateral_slice_apply.cpp b/compute/tensor/src/bilateral_slice_apply.cpp new file mode 100644 index 00000000..47c20c77 --- /dev/null +++ b/compute/tensor/src/bilateral_slice_apply.cpp @@ -0,0 +1,118 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
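On the mask clamp in attention_mask above: bytesOf(dt) == 2 identifies half-precision data, whose largest finite value is 65504. Since the mask is applied so that exp(-mask) should vanish, and exp(-10000) already underflows to zero in any float format, capping p.mask at 10000 loses nothing numerically while keeping masked values far from the fp16 limit. A sketch of the clamp (mirrors the source logic above; the 30000 starting value is an arbitrary illustration):

    F32 mask = 30000;
    if (bytesOf(DT_F16) == 2 && mask > 10000) {
        mask = 10000;  // exp(-10000) is already 0, so nothing is lost
    }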
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +inline EE bilateral_slice_apply_infer_output_size_cpu() +{ + return NOT_SUPPORTED; +} + +EE bilateral_slice_apply_infer_output_size(Tensor *inputTensor, + Tensor *guideTensor, + Tensor *gridTensor, + BilateralSliceApplyParamSpec p, + Tensor *outputTensor, + ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (guideTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (gridTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + TensorDesc guideDesc = guideTensor->get_desc(); + TensorDesc gridDesc = gridTensor->get_desc(); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemGuideDesc = ocl_get_desc(*guideTensor); + GCLMemDesc gclmemGridDesc = ocl_get_desc(*gridTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = bilateral_slice_apply_infer_output_size_mali(inputDesc, guideDesc, gridDesc, p, + &outputDesc, &gclmemInputDesc, &gclmemGuideDesc, &gclmemGridDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(guideTensor, gclmemGuideDesc); + ocl_set_desc(gridTensor, gclmemGridDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } + outputTensor->resize(outputDesc); + return ret; +} + +EE bilateral_slice_apply_infer_forward_tmp_bytes(Tensor inputTensor, + Tensor guideTensor, + Tensor gridTensor, + BilateralSliceApplyParamSpec p, + U32 *bytes, + ArchInfo_t archInfo) +{ + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc guideDesc = guideTensor.get_desc(); + TensorDesc gridDesc = gridTensor.get_desc(); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + ret = bilateral_slice_apply_infer_forward_tmp_bytes_mali(inputDesc, guideDesc, gridDesc, p, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, bytes); +#endif + } + return ret; +} + +EE bilateral_slice_apply(Tensor inputTensor, + Tensor guideTensor, + Tensor gridTensor, + BilateralSliceApplyParamSpec p, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + U32 tmpBytes = tmpTensor.bytes(); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + TensorDesc guideDesc = guideTensor.get_desc(); + void *guide = get_ptr_from_tensor(guideTensor, arch); + TensorDesc gridDesc = gridTensor.get_desc(); + void *grid = get_ptr_from_tensor(gridTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(arch)) { +#ifdef _USE_MALI + ret = bilateral_slice_apply_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (GCLMem_t)input, guideDesc, (GCLMem_t)guide, gridDesc, (GCLMem_t)grid, p, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, tmpBytes, (GCLMem_t)tmp, outputDesc, + (GCLMem_t)output); 
+#endif + } + return ret; +} diff --git a/compute/tensor/src/channel_resize.cpp b/compute/tensor/src/channel_resize.cpp new file mode 100644 index 00000000..cb7c80f4 --- /dev/null +++ b/compute/tensor/src/channel_resize.cpp @@ -0,0 +1,66 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +EE channel_resize( + Tensor inputTensor, ChannelResizeParamSpec p, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(arch)) { +#ifdef _USE_MALI + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + ret = channel_resize_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (GCLMem_t)input, p, outputDesc, (GCLMem_t)output); +#endif + } + return ret; +} + +EE channel_resize_infer_output_size( + Tensor *inputTensor, ChannelResizeParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc outputDesc = outputTensor->get_desc(); + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + TensorDesc inputDesc = inputTensor->get_desc(); + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + CHECK_STATUS(channel_resize_infer_output_size_mali( + inputDesc, p, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc)) + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } + outputTensor->resize(outputDesc); + return SUCCESS; +} diff --git a/compute/tensor/src/check.cpp b/compute/tensor/src/check.cpp new file mode 100644 index 00000000..f5e4b826 --- /dev/null +++ b/compute/tensor/src/check.cpp @@ -0,0 +1,96 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
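Note that channel_resize above is a MALI-only operator: on a CPU arch both entry points fall through, the run path returns NOT_SUPPORTED, and the size-inference path resizes with an unchanged desc. The same guard structure repeats across every file in this directory; a self-contained sketch of the pattern with generic names (illustration only, not from the patch):

    enum EE_sketch { SKETCH_SUCCESS, SKETCH_NOT_SUPPORTED };

    // Each backend is compile-time gated (#ifdef) *and* runtime selected
    // (arch check); if neither gate opens, the status stays NOT_SUPPORTED.
    EE_sketch some_op(bool archIsGpu)
    {
        EE_sketch ret = SKETCH_NOT_SUPPORTED;
        if (!archIsGpu) {
    #ifdef _USE_CPU
            ret = SKETCH_SUCCESS;  // CPU kernel would be invoked here
    #endif
        } else {
    #ifdef _USE_MALI
            ret = SKETCH_SUCCESS;  // GCL/MALI kernel would be invoked here
    #endif
        }
        return ret;
    }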
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/tensor_computing_x86.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +EE check(Tensor inputTensorA, + Tensor inputTensorB, + CheckParamSpec p, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDescA = inputTensorA.get_desc(); + void *inputA = get_ptr_from_tensor(inputTensorA, arch); + TensorDesc inputDescB = inputTensorB.get_desc(); + void *inputB = get_ptr_from_tensor(inputTensorB, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = check_general(inputDescA, inputA, inputDescB, inputB, p, outputDesc, output); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = check_x86(inputDescA, inputA, inputDescB, inputB, p, outputDesc, output); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = check_arm(inputDescA, inputA, inputDescB, inputB, p, outputDesc, output); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = check_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDescA, (GCLMem_t)inputA, + inputDescB, (GCLMem_t)inputB, p, outputDesc, (GCLMem_t)output); +#endif + } + return ret; +} + +EE check_infer_output_size( + std::vector inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) +{ + EE ret = NOT_SUPPORTED; + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + for (auto p : inputTensor) { + if (p == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + } + TensorDesc inputDesc = inputTensor[0]->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDescA = ocl_get_desc(*(inputTensor[0])); + GCLMemDesc gclmemInputDescB = ocl_get_desc(*(inputTensor[1])); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = check_infer_output_size_mali( + inputDesc, &outputDesc, &gclmemInputDescA, &gclmemInputDescB, &gclmemOutputDesc); + ocl_set_desc(inputTensor[0], gclmemInputDescA); + ocl_set_desc(inputTensor[1], gclmemInputDescB); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + outputDesc.dt = DT_I32; + outputDesc.nDims = 1; + outputDesc.dims[0] = 
inputDesc.dims[inputDesc.nDims - 1]; + ret = SUCCESS; + } + outputTensor->resize(outputDesc); + return ret; +} diff --git a/compute/tensor/src/clip.cpp b/compute/tensor/src/clip.cpp new file mode 100644 index 00000000..994462d8 --- /dev/null +++ b/compute/tensor/src/clip.cpp @@ -0,0 +1,78 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +inline EE clip_infer_output_size_cpu(TensorDesc inputDesc, TensorDesc *outputDesc) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + *outputDesc = inputDesc; + return SUCCESS; +} + +EE clip_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = clip_infer_output_size_mali( + inputDesc, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + ret = clip_infer_output_size_cpu(inputDesc, &outputDesc); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE clip(Tensor inputTensor, ClipParamSpec p, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = clip_cpu(inputDesc, input, p, outputDesc, output, arch); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = clip_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, p, + outputDesc, (GCLMem_t)output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/concat.cpp b/compute/tensor/src/concat.cpp new file mode 100644 index 00000000..4ca0a88b --- /dev/null +++ b/compute/tensor/src/concat.cpp @@ -0,0 +1,184 @@ +// Copyright (C) 
2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +inline void processInputDescs(std::vector *inputDesc, I32 axis) +{ + int inputNum = inputDesc->size(); + int axisInfo = (axis > 0) ? axis : ((*inputDesc)[0].nDims + axis); + axisInfo = (*inputDesc)[0].nDims - 1 - axisInfo; + for (int i = 0; i < (int)(*inputDesc)[0].nDims; i++) { + if (i == axisInfo) { + continue; + } + U32 minDim = (*inputDesc)[0].dims[i]; + for (int j = 1; j < inputNum; j++) { + if ((*inputDesc)[j].dims[i] < minDim) { + minDim = (*inputDesc)[j].dims[i]; + } + } + if (minDim == 0) { + continue; + } + for (int j = 0; j < inputNum; j++) { + (*inputDesc)[j].dims[i] = minDim; + } + } +} + +inline EE concat_infer_output_size_cpu( + std::vector inputDesc, ConcatParamSpec p, TensorDesc *outputDesc) +{ + if (inputDesc.size() < 1) { + CHECK_STATUS(NOT_MATCH); + } + if (inputDesc.size() == 1) { + *outputDesc = inputDesc[0]; + return SUCCESS; + } + + bool hasC8 = false; + for (U32 i = 1; i < inputDesc.size(); i++) { + if (inputDesc[i].nDims != 0) { + *outputDesc = inputDesc[i]; + } + if (inputDesc[i].df == DF_NCHWC8) { + hasC8 = true; + } + } + I32 dim = outputDesc->nDims; + int axis = p.axis; + axis = (axis + dim) % dim; + axis = dim - 1 - axis; + outputDesc->dims[axis] = 0; + + for (U32 i = 0; i < inputDesc.size(); i++) { + if (inputDesc[i].nDims == 0) { + continue; + } + + if (inputDesc[i].nDims != (U32)dim) { + return NOT_MATCH; + } + + for (I32 j = 0; j < dim; j++) { + if (j == axis) { + outputDesc->dims[j] += inputDesc[i].dims[j]; + } else { + outputDesc->dims[j] = UNI_MAX(inputDesc[i].dims[j], outputDesc->dims[j]); + if (inputDesc[i].dims[j] != 0 && outputDesc->dims[j] != 0 && + outputDesc->dims[j] != inputDesc[i].dims[j]) { + return NOT_MATCH; + } + } + } + } + + if ((outputDesc->dims[3] % 8 == 0) && hasC8) { + outputDesc->df = DF_NCHWC8; + } + + if ((outputDesc->df == DF_NCHWC8) && (outputDesc->dims[2] % 8 != 0)) { + outputDesc->df = DF_NCHW; + } + + return SUCCESS; +} + +EE concat_infer_output_size( + std::vector inputTensor, ConcatParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + std::vector inputDesc = get_desc_from_tensor_ptrs(inputTensor); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if 
(IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + std::vector gclmemInputDescs; + for (auto p : inputTensor) { + gclmemInputDescs.push_back(ocl_get_desc(*p)); + } + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = concat_infer_output_size_mali( + inputDesc, p, &outputDesc, gclmemInputDescs.data(), &gclmemOutputDesc); + for (U32 i = 0; i < inputTensor.size(); i++) { + ocl_set_desc(inputTensor[i], gclmemInputDescs[i]); + } + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + processInputDescs(&inputDesc, p.axis); + ret = concat_infer_output_size_cpu(inputDesc, p, &outputDesc); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE concat_infer_forward_tmp_bytes(std::vector inputTensor, U32 *bytes, ArchInfo_t archInfo) +{ + std::vector inputDesc = get_desc_from_tensors(inputTensor); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + ret = concat_infer_forward_tmp_bytes_mali(inputDesc, bytes); +#endif + } else { + *bytes = 0; + for (auto p : inputDesc) { + *bytes += tensorNumBytes(p); + } + ret = SUCCESS; + } + return ret; +} + +EE concat(std::vector inputTensor, + ConcatParamSpec p, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + std::vector inputDesc = get_desc_from_tensors(inputTensor); + std::vector inputScale = get_scale_from_tensors(inputTensor); + std::vector input = get_data_from_tensors(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + F32 outputScale = outputTensor.get_scale(); + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + processInputDescs(&inputDesc, p.axis); + ret = concat_cpu( + inputDesc, input, inputScale.data(), p, tmp, outputDesc, output, &outputScale); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = concat_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, input, NULL, p, + (GCLMem_t)tmp, outputDesc, (GCLMem_t)output, NULL); +#endif + } + outputTensor.set_scale(outputScale); + return ret; +} diff --git a/compute/tensor/src/convolution.cpp b/compute/tensor/src/convolution.cpp new file mode 100644 index 00000000..261dc90e --- /dev/null +++ b/compute/tensor/src/convolution.cpp @@ -0,0 +1,334 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
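Tying the concat pieces above together with concrete numbers (illustration only; shapes arbitrary): the output extent along the concat axis is the sum of the input extents, and the CPU tmp-buffer estimate is simply the sum of all input bytes.

    // two NCHW fp16 inputs, concatenated on the channel axis (p.axis = 1)
    TensorDesc a = tensor4df(DT_F16, DF_NCHW, 1, 8, 16, 16);
    TensorDesc b = tensor4df(DT_F16, DF_NCHW, 1, 24, 16, 16);
    // axis 1 of a 4-D desc maps to dims index 4 - 1 - 1 = 2, so
    // concat_infer_output_size_cpu yields shape (1, 32, 16, 16);
    // concat_infer_forward_tmp_bytes (CPU branch) returns
    // tensorNumBytes(a) + tensorNumBytes(b).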
+ +#include "tensor_computing.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/tensor_computing_x86.h" +#endif + +inline EE convolution_infer_output_size_cpu(TensorDesc inputDesc, + TensorDesc filterDesc, + ConvolutionParamSpec convParamSpec, + TensorDesc *outputDesc, + DataType targetDataType) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt; + DataFormat idf, fdf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + I32 oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + if (fh < 1 || fw < 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + + I32 strideH = convParamSpec.stride_h; + I32 strideW = convParamSpec.stride_w; + I32 paddingT = convParamSpec.padding_top; + I32 paddingB = convParamSpec.padding_bottom; + I32 paddingL = convParamSpec.padding_left; + I32 paddingR = convParamSpec.padding_right; + I32 dilateH = convParamSpec.dilatedRate_h; + I32 dilateW = convParamSpec.dilatedRate_w; + + U32 fhDilated = (fh - 1) * dilateH + 1; + U32 fwDilated = (fw - 1) * dilateW + 1; + oh = (ih + paddingT + paddingB - fhDilated) / strideH + 1; + ow = (iw + paddingL + paddingR - fwDilated) / strideW + 1; + + if (fn % 8 != 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + + *outputDesc = tensor4df(targetDataType, DF_NCHWC8, in, fn, oh, ow); + return SUCCESS; +} + +EE convolution_infer_output_size(Tensor *inputTensor, + Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + Tensor *outputTensor, + DataType targetDataType, + ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc filterDesc = filterTensor.get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = convolution_infer_output_size_mali( + inputDesc, filterDesc, convParamSpec, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + ret = convolution_infer_output_size_cpu( + inputDesc, filterDesc, convParamSpec, &outputDesc, targetDataType); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE convolution_infer_forward_algorithm(Tensor inputTensor, + Tensor filterTensor, + Tensor outputTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionPolicy policy, + ConvolutionForwardAlgorithm *algorithm, + DataType targetDataType, + ActivationParamSpec activationDesc, + ArchInfo_t archInfo) +{ + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc filterDesc = filterTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + + EE ret = NOT_SUPPORTED; + auto arch = archInfo->arch; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = SUCCESS; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = convolution_infer_forward_algorithm_x86( + inputDesc, filterDesc, outputDesc, convParamSpec, policy, algorithm, targetDataType); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = 
convolution_infer_forward_algorithm_arm( + inputDesc, filterDesc, outputDesc, convParamSpec, policy, algorithm, targetDataType); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + GCLMemDesc gclmemInputDesc = ocl_get_desc(inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(outputTensor); + ret = convolution_infer_forward_algorithm_mali(((MaliPara_t)(archInfo->archPara))->handle, + inputDesc, filterDesc, convParamSpec, outputDesc, gclmemInputDesc, gclmemOutputDesc, + policy, activationDesc.mode, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } + return ret; +} + +EE convolution_transform_filter_bytes(Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes, + ArchInfo_t archInfo) +{ + TensorDesc filterDesc = filterTensor.get_desc(); + + EE ret = NOT_SUPPORTED; + auto arch = archInfo->arch; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + *bytes = tensorNumBytes(filterDesc); + ret = SUCCESS; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = convolution_transform_filter_bytes_x86(filterDesc, convParamSpec, algorithm, bytes); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = convolution_transform_filter_bytes_arm(filterDesc, convParamSpec, algorithm, bytes); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = convolution_transform_filter_bytes_mali(filterDesc, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, + ((MaliPara_t)(archInfo->archPara))->gclmemFilterDesc, bytes); +#endif + } + return ret; +} + +EE convolution_transform_filter(Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + Tensor tmpTensor, + Tensor *ftmTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc filterDesc = filterTensor.get_desc(); + void *filter = get_ptr_from_tensor(filterTensor, arch); + TensorDesc ftmDesc = ftmTensor->get_desc(); + void *filterTransformed = get_ptr_from_tensor(*ftmTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + UNI_memcpy(filterTransformed, filter, tensorNumBytes(filterDesc)); + ftmDesc = filterDesc; + ret = SUCCESS; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = convolution_transform_filter_x86( + filterDesc, filter, convParamSpec, algorithm, &ftmDesc, filterTransformed); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = convolution_transform_filter_arm( + filterDesc, filter, convParamSpec, algorithm, &ftmDesc, filterTransformed); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + ret = convolution_transform_filter_mali(((MaliPara_t)(archInfo->archPara))->handle, + filterDesc, (GCLMem_t)filter, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, + (GCLMem_t)tmp, &ftmDesc, (GCLMem_t)filterTransformed); +#endif + } + ftmTensor->resize(ftmDesc); + return ret; +} + +EE convolution_infer_forward_tmp_bytes(Tensor inputTensor, + Tensor filterTensor, + Tensor outputTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes, + ArchInfo_t archInfo) +{ + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc filterDesc = filterTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + + EE ret = NOT_SUPPORTED; + auto arch = archInfo->arch; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = SUCCESS; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = 
convolution_infer_forward_tmp_bytes_x86( + inputDesc, filterDesc, outputDesc, convParamSpec, algorithm, bytes); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = convolution_infer_forward_tmp_bytes_arm( + inputDesc, filterDesc, outputDesc, convParamSpec, algorithm, bytes); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = convolution_infer_forward_tmp_bytes_mali(inputDesc, filterDesc, outputDesc, + convParamSpec, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, bytes); +#endif + } + return ret; +} + +inline void convolution_process_bnn_scale( + U8 **bias, U8 **scale, TensorDesc *biasDesc, TensorDesc *scaleDesc) +{ + U32 vecLen = tensorNumElements(*biasDesc) / 2; + biasDesc->dims[0] = vecLen; + *scaleDesc = *biasDesc; + *scale = *bias; + *bias += vecLen * bytesOf(DT_F16); +} + +EE convolution(Tensor inputTensor, + Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + void *scale, + Tensor biasTensor, + Tensor tmpTensor, + Tensor outputTensor, + ActivationParamSpec activationDesc, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + if (3 == inputDesc.nDims) { + inputDesc = tensor4df(inputDesc.dt, DF_NCHW, inputDesc.dims[2], inputDesc.dims[1], inputDesc.dims[0], 1); + } + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc filterDesc = filterTensor.get_desc(); + void *filter = get_ptr_from_tensor(filterTensor, arch); + TensorDesc biasDesc = biasTensor.get_desc(); + void *bias = get_ptr_from_tensor(biasTensor, arch); + U32 tmpBytes = tmpTensor.bytes(); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + TensorDesc scaleDesc = filterDesc; + + EE ret = NOT_SUPPORTED; +#ifdef _USE_FP16 + if (IS_GENERAL(arch) || (IS_ARM(arch))) { + if (filterDesc.dt == DT_BIN01 || filterDesc.dt == DT_BIN11) { + U8 *biasPtr = (U8 *)get_ptr_from_tensor(biasTensor, arch); + U8 *scalePtr = nullptr; + convolution_process_bnn_scale(&biasPtr, &scalePtr, &biasDesc, &scaleDesc); + bias = biasPtr; + scale = scalePtr; + } + } +#endif + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = convolution_general(inputDesc, input, filterDesc, filter, convParamSpec, scaleDesc, + scale, biasDesc, bias, outputDesc, output, activationDesc); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = convolution_x86(inputDesc, input, filterDesc, filter, convParamSpec, algorithm, + scaleDesc, scale, biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc, + archInfo->arch); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = convolution_arm(inputDesc, input, filterDesc, filter, convParamSpec, algorithm, + scaleDesc, scale, biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc, + archInfo->arch); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = convolution_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (GCLMem_t)input, filterDesc, (GCLMem_t)filter, convParamSpec, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, scaleDesc, (GCLMem_t)scale, + biasDesc, (GCLMem_t)bias, tmpBytes, (GCLMem_t)tmp, outputDesc, (GCLMem_t)output, + activationDesc.mode); +#endif + } + return ret; +} diff --git a/compute/tensor/src/copy.cpp b/compute/tensor/src/copy.cpp new file mode 100644 index 00000000..6d79ed46 --- /dev/null +++ b/compute/tensor/src/copy.cpp @@ -0,0 +1,66 @@ +// Copyright (C) 2019. 
Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "tensor_computing.h"
+#ifdef _USE_MALI
+#include "gpu/mali/tensor_computing_mali.h"
+#endif
+
+EE copy_infer_output_size(std::vector<Tensor *> inputTensor, ArchInfo_t archInfo)
+{
+    auto arch = archInfo->arch;
+    std::vector<TensorDesc> inputDesc = get_desc_from_tensor_ptrs(inputTensor);
+    EE ret = NOT_SUPPORTED;
+    if (IS_MALI_GPU(arch)) {
+#ifdef _USE_MALI
+        std::vector<GCLMemDesc> gclmemInputDescs;
+        for (auto p : inputTensor) {
+            gclmemInputDescs.push_back(ocl_get_desc(*p));
+        }
+        ret = copy_infer_output_size_mali(inputDesc, gclmemInputDescs.data());
+        for (U32 i = 0; i < inputTensor.size(); i++) {
+            ocl_set_desc(inputTensor[i], gclmemInputDescs[i]);
+        }
+#endif
+    }
+    return ret;
+}
+
+EE copy(std::vector<Tensor> inputTensor,
+    U32 srcOffset,
+    U32 dstOffset,
+    U32 srcStride,
+    U32 dstStride,
+    U32 length,
+    ArchInfo_t archInfo)
+{
+    auto arch = archInfo->arch;
+    std::vector<TensorDesc> inputDesc = get_desc_from_tensors(inputTensor);
+    std::vector<void *> input = get_data_from_tensors(inputTensor, arch);
+
+    EE ret = NOT_SUPPORTED;
+    if (IS_MALI_GPU(arch)) {
+#ifdef _USE_MALI
+        ret = copy_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, input, srcOffset,
+            dstOffset, srcStride, dstStride, length);
+#endif
+#ifdef _USE_CPU
+    } else {
+        memcpy((U8 *)input[1] + bytesOf(inputDesc[1].dt) * dstOffset,
+            (U8 *)input[0] + bytesOf(inputDesc[0].dt) * srcOffset,
+            length * bytesOf(inputDesc[0].dt));
+        ret = SUCCESS;
+#endif
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/cpu/activation.cpp b/compute/tensor/src/cpu/activation.cpp
new file mode 100644
index 00000000..fb9cf6e4
--- /dev/null
+++ b/compute/tensor/src/cpu/activation.cpp
@@ -0,0 +1,32 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/tensor_computing_cpu.h" +#include "cpu/cpu_functions.h" + +EE activation_cpu(TensorDesc inputDesc, + void *input, + ActivationParamSpec activationDesc, + TensorDesc outputDesc, + void *output, + Arch arch) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt = inputDesc.dt; + U32 len = tensorNumElements(inputDesc); + CHECK_REQUIREMENT(len == tensorNumElements(outputDesc)); + ArrayActivationFunction activation_func = get_array_activation_function(arch); + return activation_func(idt, input, len, activationDesc, output); +} diff --git a/tensor_computing/src/cpu/general/argmax.cpp b/compute/tensor/src/cpu/argmax.cpp similarity index 62% rename from tensor_computing/src/cpu/general/argmax.cpp rename to compute/tensor/src/cpu/argmax.cpp index 8f8653a2..a5bb6ba6 100644 --- a/tensor_computing/src/cpu/general/argmax.cpp +++ b/compute/tensor/src/cpu/argmax.cpp @@ -1,80 +1,79 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
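Before the argmax hunk continues, a minimal usage sketch for the activation_cpu wrapper added above; it assumes the tensor1d helper, the ACTIVATION_RELU mode, the ActivationParamSpec field names, and the ARM_A76 arch value declared elsewhere in this tree, and the buffer size is made up:

    #include <vector>
    TensorDesc desc = tensor1d(DT_F32, 128);    // flat 128-element fp32 array
    std::vector<F32> in(128, -1.0f), out(128);
    ActivationParamSpec act;
    act.mode = ACTIVATION_RELU;
    act.value[0] = 0;                           // no leaky slope
    CHECK_STATUS(activation_cpu(desc, in.data(), act, desc, out.data(), ARM_A76));
    // out now holds max(x, 0) per element, computed by the arch-dispatched kernel.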
+#include "cpu/tensor_computing_cpu.h" -#include -#include "cpu/general/tensor_computing_general.h" - - -template -U32 array_argmax(const T* input, U32 len, U32 stride) { +template +static U32 array_argmax(const T *input, U32 len, U32 stride) +{ U32 index = 0; U32 j = stride; - for (U32 i = 1; i < len; i++, j+=stride) { - if(input[j] > input[index]) + for (U32 i = 1; i < len; i++, j += stride) { + if (input[j] > input[index]) { index = j; + } } return index / stride; } -template -EE argmax(TensorDesc inputDesc, const T* input, - I32 axis, - TensorDesc outputDesc, U32* output) +template +static EE argmax(TensorDesc inputDesc, const T *input, I32 axis, TensorDesc outputDesc, U32 *output) { UNUSED(outputDesc); - if (nullptr == input || nullptr == output) + if (nullptr == input || nullptr == output) { CHECK_STATUS(NULL_POINTER); + } - if (axis < 0) + if (axis < 0) { axis = inputDesc.nDims + axis; + } axis = inputDesc.nDims - 1 - axis; U32 loopInner = 1; for (int i = 0; i < axis; i++) { loopInner *= inputDesc.dims[i]; } U32 loopOuter = 1; - for (U32 i = axis+1; i < inputDesc.nDims; i++) { + for (U32 i = axis + 1; i < inputDesc.nDims; i++) { loopOuter *= inputDesc.dims[i]; } U32 len = inputDesc.dims[axis]; for (U32 i = 0; i < loopOuter; i++) { for (U32 j = 0; j < loopInner; j++) { - const T* array = input + i * (len * loopInner) + j; - output[i*loopInner+j] = array_argmax(array, len, loopInner); + const T *array = input + i * (len * loopInner) + j; + output[i * loopInner + j] = array_argmax(array, len, loopInner); } } return SUCCESS; } -EE argmax_general(TensorDesc inputDesc, const void* input, - I32 axis, - TensorDesc outputDesc, void* output) +EE argmax_cpu( + TensorDesc inputDesc, const void *input, ArgMaxParamSpec p, TensorDesc outputDesc, void *output) { DataType idt = inputDesc.dt; EE ret = SUCCESS; + int axis = p.axis; switch (idt) { -#ifdef _USE_FP16 - case DT_F16: { - ret = argmax(inputDesc, (const F16*)input, axis, outputDesc, (U32*)output); +#ifdef _USE_FP32 + case DT_F32: { + ret = argmax(inputDesc, (const F32 *)input, axis, outputDesc, (U32 *)output); break; } #endif -#ifdef _USE_FP32 - case DT_F32: { - ret = argmax(inputDesc, (const F32*)input, axis, outputDesc, (U32*)output); +#ifdef _USE_FP16 + case DT_F16: { + ret = argmax(inputDesc, (const F16 *)input, axis, outputDesc, (U32 *)output); break; } #endif diff --git a/compute/tensor/src/cpu/arm/arm_functions.h b/compute/tensor/src/cpu/arm/arm_functions.h new file mode 100644 index 00000000..0db1458f --- /dev/null +++ b/compute/tensor/src/cpu/arm/arm_functions.h @@ -0,0 +1,249 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_ARM_FUNCTIONS +#define _H_ARM_FUNCTIONS + +#include "cpu/cpu_functions_template.h" +#ifdef _USE_FP16 +#include "cpu/arm/fp16/arm_functions_fp16.h" +#endif +#ifdef _USE_FP32 +#include "cpu/arm/fp32/arm_functions_fp32.h" +#endif +#ifdef _USE_INT8 +#include "cpu/arm/int8/arm_functions_int8.h" +#endif + +// array sum +inline F32 array_sum_arm(DataType dt, const void *data, I32 len) +{ + F32 result = 0; + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + result = array_sum_f16((const F16 *)data, len); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + result = array_sum_f32((const F32 *)data, len); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + return result; +} + +// array mean +inline F32 array_mean_arm(DataType dt, const void *data, I32 len) +{ + F32 result = 0; + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + result = array_mean_f16((const F16 *)data, len); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + result = array_mean_f32((const F32 *)data, len); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + return result; +} + +// array var +inline F32 array_var_arm(DataType dt, const void *data, I32 len, F32 mean) +{ + F32 result = 0; + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + result = array_var_f16((const F16 *)data, len, mean); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + result = array_var_f32((const F32 *)data, len, mean); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + return result; +} + +// array max +inline F32 array_max_arm(DataType dt, const void *data, I32 len) +{ + F32 result = 0; + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + result = array_max_f16((const F16 *)data, len); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + result = array_max_f32((const F32 *)data, len); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + return result; +} + +inline F32 array_maxabs_arm(DataType dt, const void *data, I32 len) +{ + F32 result = 0; + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + result = array_maxabs_f16((const F16 *)data, len); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + return result; +} + +inline void array_scale_arm( + DataType dt, const void *input, void *output, I32 len, F32 alpha, F32 beta) +{ + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + array_scale_f16((const F16 *)input, (F16 *)output, len, alpha, beta); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + array_scale_f32((const F32 *)input, (F32 *)output, len, alpha, beta); + break; +#endif + case DT_I32: + array_scale_template((const I32 *)input, (I32 *)output, len, alpha, beta); + break; + case DT_U32: + array_scale_template((const U32 *)input, (U32 *)output, len, alpha, beta); + break; + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } +} + +inline void array_power_arm(DataType dt, void *input, void *output, I32 len, F32 power) +{ + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + array_power_f16((F16 *)input, (F16 *)output, len, power); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + array_power_f32((F32 *)input, (F32 *)output, len, power); + break; +#endif + case DT_I32: + array_power_template((I32 *)input, (I32 *)output, len, power); + break; + case DT_U32: + 
array_power_template((U32 *)input, (U32 *)output, len, power); + break; + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } +} + +inline EE array_activation_arm( + DataType dt, void *input, U32 len, ActivationParamSpec activationDesc, void *output) +{ + EE ret = SUCCESS; + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + ret = activation_fp16((F16 *)input, len, activationDesc, (F16 *)output); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + ret = activation_fp32((F32 *)input, len, activationDesc, (F32 *)output); + break; +#endif +#ifdef _USE_INT8 + case DT_I8: { + ret = activation_int8((INT8 *)input, len, activationDesc, (INT8 *)output); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +inline void array_add_arm(DataType dt, const void *inputA, const void *inputB, void *output, I32 len) +{ + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + array_add_f16((const F16 *)inputA, (const F16 *)inputB, (F16 *)output, len); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + array_add_f32((const F32 *)inputA, (const F32 *)inputB, (F32 *)output, len); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } +} + +inline void array_square_and_add_arm( + DataType dt, const void *inputA, const void *inputB, void *output, I32 len) +{ + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + array_square_and_add_f16((const F16 *)inputA, (const F16 *)inputB, (F16 *)output, len); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + array_square_and_add_f32((const F32 *)inputA, (const F32 *)inputB, (F32 *)output, len); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } +} +#endif diff --git a/tensor_computing/src/cpu/arm/attention.cpp b/compute/tensor/src/cpu/arm/attention.cpp similarity index 80% rename from tensor_computing/src/cpu/arm/attention.cpp rename to compute/tensor/src/cpu/arm/attention.cpp index 3b9ce727..c8637194 100644 --- a/tensor_computing/src/cpu/arm/attention.cpp +++ b/compute/tensor/src/cpu/arm/attention.cpp @@ -1,17 +1,16 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include "cpu/arm/tensor_computing_arm.h" #ifdef _USE_FP32 #include "cpu/arm/fp32/tensor_computing_fp32.h" @@ -27,19 +26,22 @@ EE attention_arm(TensorDesc inputDesc, const void *input, TensorDesc outputDesc, U32 batch, numHeads, fromSequenceLength, toSequenceLength; CHECK_REQUIREMENT(tensorIs2d(inputDesc)); CHECK_REQUIREMENT(tensorIs4d(outputDesc)); - CHECK_STATUS(tensor4dGet(outputDesc, &dt, &df, &batch, &numHeads, &fromSequenceLength, &toSequenceLength)); + CHECK_STATUS(tensor4dGet( + outputDesc, &dt, &df, &batch, &numHeads, &fromSequenceLength, &toSequenceLength)); EE ret = SUCCESS; switch (dt) { #ifdef _USE_FP32 case DT_F32: { - ret = attention_fp32(batch, numHeads, fromSequenceLength, toSequenceLength, (const F32*)input, (F32*)output); + ret = attention_fp32(batch, numHeads, fromSequenceLength, toSequenceLength, + (const F32 *)input, (F32 *)output); break; } #endif #ifdef _USE_FP16 case DT_F16: { - ret = attention_fp16(batch, numHeads, fromSequenceLength, toSequenceLength, (const F16*)input, (F16*)output); + ret = attention_fp16(batch, numHeads, fromSequenceLength, toSequenceLength, + (const F16 *)input, (F16 *)output); break; } #endif diff --git a/tensor_computing/src/cpu/arm/normalization.cpp b/compute/tensor/src/cpu/arm/attention_mask.cpp similarity index 72% rename from tensor_computing/src/cpu/arm/normalization.cpp rename to compute/tensor/src/cpu/arm/attention_mask.cpp index b495ffd0..e79b0936 100644 --- a/tensor_computing/src/cpu/arm/normalization.cpp +++ b/compute/tensor/src/cpu/arm/attention_mask.cpp @@ -1,17 +1,16 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include "cpu/arm/tensor_computing_arm.h" #ifdef _USE_FP32 #include "cpu/arm/fp32/tensor_computing_fp32.h" @@ -20,22 +19,24 @@ #include "cpu/arm/fp16/tensor_computing_fp16.h" #endif -EE layer_normalization_arm(void *alpha, void *beta, - TensorDesc inputDesc, void* input, - TensorDesc outputDesc, void* output) +EE attention_mask_arm(TensorDesc inputDesc, + const void *input, + AttentionMaskParamSpec p, + TensorDesc outputDesc, + void *output) { DataType idt = inputDesc.dt; EE ret = SUCCESS; switch (idt) { #ifdef _USE_FP32 case DT_F32: { - ret = layer_normalization_fp32((F32*)alpha, (F32*)beta, inputDesc, (F32*)input, outputDesc, (F32*)output); + ret = attention_mask_fp32(inputDesc, (const F32 *)input, p, outputDesc, (F32 *)output); break; } #endif #ifdef _USE_FP16 case DT_F16: { - ret = layer_normalization_fp16((F16*)alpha, (F16*)beta, inputDesc, (F16*)input, outputDesc, (F16*)output); + ret = attention_mask_fp16(inputDesc, (const F16 *)input, p, outputDesc, (F16 *)output); break; } #endif @@ -43,5 +44,6 @@ EE layer_normalization_arm(void *alpha, void *beta, ret = NOT_SUPPORTED; break; } + return ret; } diff --git a/compute/tensor/src/cpu/arm/bnn/convolution.cpp b/compute/tensor/src/cpu/arm/bnn/convolution.cpp new file mode 100644 index 00000000..bdf74d24 --- /dev/null +++ b/compute/tensor/src/cpu/arm/bnn/convolution.cpp @@ -0,0 +1,74 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
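A note for the binary-convolution files that follow: weights and activations are packed eight per BIN8 byte, with DT_BIN01 holding raw {0, 1} bits (DoReFa-style) and DT_BIN11 encoding {-1, +1} (XNOR-style). A small packing sketch that mirrors the thresholding loop in convolution_dorefa_A55.cpp further down (the helper name is ours, not the patch's):

    // Pack 8 fp16 activations into one BIN8 byte, MSB-first,
    // using the same ">= 0.5" threshold as the A55 kernel below.
    BIN8 pack_bin01(const F16 *v)
    {
        BIN8 b = 0;
        for (int j = 0; j < 8; j++) {
            if (v[j] >= 0.5) {
                b |= (1 << (7 - j));
            }
        }
        return b;
    }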
+ +#ifdef _USE_FP16 +#include "cpu/arm/bnn/tensor_computing_bnn.h" + +EE convolution_bnn(TensorDesc inputDesc, + const F16 *input, + TensorDesc filterDesc, + const BIN8 *filter, + ConvolutionParamSpec convParamSpec, + TensorDesc scaleDesc, + const F16 *scale, + TensorDesc biasDesc, + const F16 *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *output, + ActivationParamSpec activationDesc, + Arch arch) +{ + if (nullptr == input || nullptr == filter || nullptr == output || nullptr == scale || + nullptr == bias || nullptr == tmp) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (idt != DT_F16) { + CHECK_STATUS(NOT_MATCH); + } + if (odt != DT_F16) { + CHECK_STATUS(NOT_MATCH); + } + if (idf != DF_NCHWC8 || odf != DF_NCHWC8) { + CHECK_STATUS(NOT_MATCH); + } + + EE ret = SUCCESS; + switch (fdt) { + case DT_BIN01: + ret = convolution_dorefa(inputDesc, (F16 *)input, filterDesc, (BIN8 *)filter, + convParamSpec, scaleDesc, (F16 *)scale, biasDesc, (F16 *)bias, tmpBytes, tmp, + outputDesc, (F16 *)output, activationDesc, arch); + break; + case DT_BIN11: + ret = convolution_xnor(inputDesc, (F16 *)input, filterDesc, (BIN8 *)filter, + convParamSpec, scaleDesc, (F16 *)scale, biasDesc, (F16 *)bias, tmpBytes, tmp, + outputDesc, (F16 *)output, activationDesc, arch); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} +#endif diff --git a/compute/tensor/src/cpu/arm/bnn/convolution_dorefa.h b/compute/tensor/src/cpu/arm/bnn/convolution_dorefa.h new file mode 100644 index 00000000..8b586610 --- /dev/null +++ b/compute/tensor/src/cpu/arm/bnn/convolution_dorefa.h @@ -0,0 +1,86 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
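One detail worth tracing for convolution_bnn above: the scale and bias pointers it receives are prepared by convolution_process_bnn_scale in convolution.cpp earlier in this patch, which splits one combined F16 tensor in half. A sketch with a hypothetical 16-output-channel layer ("combined" is our name for the raw buffer, not the patch's):

    // biasTensor holds 32 F16 values laid out as [scale[0..15] | bias[0..15]].
    U8 *bias = (U8 *)combined;
    U8 *scale = nullptr;
    TensorDesc scaleDesc;
    convolution_process_bnn_scale(&bias, &scale, &biasDesc, &scaleDesc);
    // Now scale == combined, bias == combined + 16 * bytesOf(DT_F16),
    // and biasDesc.dims[0] == 16 (vecLen = tensorNumElements(biasDesc) / 2).

convolution_bnn itself accepts only DT_F16 NCHWC8 tensors and routes DT_BIN01 filters to convolution_dorefa and DT_BIN11 filters to convolution_xnor.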
+
+#ifndef _H_CONVOLUTION_DOREFA
+#define _H_CONVOLUTION_DOREFA
+
+#ifdef _USE_FP16
+#include <arm_neon.h>
+#include <string.h>
+#include "sys.h"
+#include "types.h"
+#include "error.h"
+
+EE convolution_dorefa_A55(TensorDesc inputDesc,
+    const F16 *input,
+    TensorDesc filterDesc,
+    const BIN8 *filterArray,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc scaleDesc,
+    const F16 *scaleArray,
+    TensorDesc biasDesc,
+    const F16 *biasArray,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    F16 *outArray,
+    ActivationParamSpec activationDesc);
+
+EE convolution_dorefa_A76(TensorDesc inputDesc,
+    const F16 *input,
+    TensorDesc filterDesc,
+    const BIN8 *filterArray,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc scaleDesc,
+    const F16 *scaleArray,
+    TensorDesc biasDesc,
+    const F16 *biasArray,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    F16 *outArray,
+    ActivationParamSpec activationDesc);
+
+inline EE convolution_dorefa(TensorDesc inputDesc,
+    const F16 *input,
+    TensorDesc filterDesc,
+    const BIN8 *filter,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc scaleDesc,
+    const F16 *scale,
+    TensorDesc biasDesc,
+    const F16 *bias,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    F16 *output,
+    ActivationParamSpec activationDesc,
+    Arch arch)
+{
+    EE ret = SUCCESS;
+    switch (arch) {
+        case ARM_A55:
+            ret = convolution_dorefa_A55(inputDesc, input, filterDesc, filter, convParamSpec,
+                scaleDesc, scale, biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc);
+            break;
+        case ARM_A76:
+            ret = convolution_dorefa_A76(inputDesc, input, filterDesc, filter, convParamSpec,
+                scaleDesc, scale, biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc);
+            break;
+        default:
+            return NOT_SUPPORTED;
+    }
+    return ret;
+}
+#endif
+#endif
diff --git a/compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A55.cpp b/compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A55.cpp
new file mode 100644
index 00000000..b6f0fef9
--- /dev/null
+++ b/compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A55.cpp
@@ -0,0 +1,779 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
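On why the A55 and A76 kernels are near-duplicates: our reading (an assumption, the patch does not say) is that Cortex-A55's narrower NEON load path favors splitting one 128-bit load into a 64-bit vector load plus a 64-bit GPR load that can dual-issue with ALU work, which is exactly the idiom the A55 body below uses:

    "ldr d30, [x0, 16]!\n"  // low 64 bits of the next filter vector
    "ldr x1, [x0, 8]\n"     // high 64 bits staged through a GPR
    "ins v30.d[1], x1\n"    // merge into the full 128-bit register
    // An A76-class core can afford the single full-width form instead:
    // "ldr q30, [x0, 16]!\n"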
+ +#ifdef _USE_FP16 +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" + +#include "cpu/arm/bnn/convolution_dorefa.h" + +EE convolution_dorefa_A55(TensorDesc inputDesc, + const F16 *input, + TensorDesc filterDesc, + const BIN8 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc scaleDesc, + const F16 *scaleArray, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(scaleDesc); + UNUSED(biasDesc); + UNUSED(tmpBytes); + UNUSED(activationDesc); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + if (fdf != DF_NCHWN16C8) { + CHECK_STATUS(NOT_MATCH); + } + if (!(ic == fc && oc == fn)) { + CHECK_STATUS(NOT_MATCH); + } + + oc /= 8; + ic /= 8; + + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + U32 ohow = oh * ow; + U32 ihiw = ih_pad * iw_pad; + + BIN8 *inArray = ((BIN8 *)tmp) + ic * ihiw + 8 * fh * fw * ic; // ic has been divided by 8 + BIN8 *inArray_pad; + for (U32 n = 0; n < in; n++) { + const F16 *in = input + n * ic * ih * iw * 8; + for (U32 i = 0; i < ic * ih * iw; i++) { + BIN8 temp = 0; + for (U32 j = 0; j < 8; j++) { + if (in[i * 8 + j] >= 0.5) { + temp |= (1 << (7 - j)); // set + } + } + inArray[i] = temp; + } + + if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { + inArray_pad = inArray + n * ic * ih * iw; // ic has been divided by 8 + } else { + // copy input into a input with padding + inArray_pad = (BIN8 *)tmp; + BIN8 *inArray_pad_mov = inArray_pad; + BIN8 *inArray_mov = inArray + n * ic * ih * iw; + for (U32 c = 0; c < ic; c++) { // All divide by 8 + for (U32 h = 0; h < paddingT; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(DT_BIN01)); + inArray_pad_mov += iw_pad; + } + for (U32 h = paddingT; h < ih_pad - paddingB; h++) { + memset(inArray_pad_mov, 0, paddingL * bytesOf(DT_BIN01)); + inArray_pad_mov += paddingL; + memcpy(inArray_pad_mov, inArray_mov, iw * bytesOf(DT_BIN01)); + inArray_pad_mov += iw; + inArray_mov += iw; + memset(inArray_pad_mov, 0, paddingR * bytesOf(DT_BIN01)); + inArray_pad_mov += paddingR; + } + for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(DT_BIN01)); + inArray_pad_mov += iw_pad; + } + } + } + for (U32 hw = 0; hw < ohow - 7; hw += 8) { + const F16 *s0 = scaleArray; + const F16 *s1 = scaleArray + 8; + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + BIN8 *in_order = ((BIN8 *)tmp) + ic * ihiw; // ic has been divided by 8 + // reorder input + // NCHWc8 => NHWChw8c8 + im2col + U32 in_h[8]; + U32 in_w[8]; + for (U32 i = 0; i < 8; i++) { + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; + } + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + BIN8 *in_hw8c8 = inArray_pad + c * ihiw + fh_idx * iw_pad + fw_idx; + // NHWChw8c8 + BIN8 
*in_order_hw8c8 = in_order + c * fh * fw * 8 + fh_idx * fw * 8 + + fw_idx * 8; // This 8 comes from hw8 + for (U32 i = 0; i < 8; i++) { + in_order_hw8c8[i] = *(in_hw8c8 + in_h[i] * iw_pad + in_w[i]); + } + } + } + } + + // compute + for (U32 o = 0; o < oc; o += + 2) { // oc should be multiple of 32. It will at least be multiple of 16 in the future. + BIN8 *in_hw0 = in_order; + const BIN8 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; // ic has been divided by 8 + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // scale and bias + const F16 *s_o0 = s0; + const F16 *s_o1 = s1; + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + /* Layout + 5 6 + 7 8 + 9 10 + 11 12 + + 13 14 + 15 16 + 17 18 + 19 20 + */ + "eor v5.16b, v5.16b, v5.16b\n" + "ldr d29, [%[in_0]]\n" // in_0 + "eor v6.16b, v6.16b, v6.16b\n" + "ldr d0, [%[f_0]]\n" // f_0 + "eor v7.16b, v7.16b, v7.16b\n" + "ldr x2, [%[f_0], #8]\n" + "eor v8.16b, v8.16b, v8.16b\n" + "ins v0.d[1], x2\n" + "eor v9.16b, v9.16b, v9.16b\n" + "dup v1.16b, v29.b[0]\n" // duplicate a full register + "eor v10.16b, v10.16b, v10.16b\n" + "dup v2.16b, v29.b[1]\n" + "eor v11.16b, v11.16b, v11.16b\n" + "eor v12.16b, v12.16b, v12.16b\n" + "eor v13.16b, v13.16b, v13.16b\n" + "eor v14.16b, v14.16b, v14.16b\n" + "eor v15.16b, v15.16b, v15.16b\n" + "eor v16.16b, v16.16b, v16.16b\n" + "eor v17.16b, v17.16b, v17.16b\n" + "eor v18.16b, v18.16b, v18.16b\n" + "eor v19.16b, v19.16b, v19.16b\n" + "eor v20.16b, v20.16b, v20.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + + "0:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "mov x9, %[fhfw]\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + "eor v25.16b, v25.16b, v25.16b\n" + "eor v26.16b, v26.16b, v26.16b\n" + "eor v27.16b, v27.16b, v27.16b\n" + "eor v28.16b, v28.16b, v28.16b\n" + + "mov x4, #4\n" + + "1:\n" + "and v3.16b, v1.16b, v0.16b\n" + "ldr d30, [x0, 16]!\n" // next filter + + "and v4.16b, v2.16b, v0.16b\n" + "ldr x1, [x0, 8]\n" + + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "add v21.16b, v21.16b, v3.16b\n" // Use add because the latency is shorter + "dup v2.16b, v29.b[3]\n" + + "add v22.16b, v22.16b, v4.16b\n" + "ins v30.d[1], x1\n" + + "and v3.16b, v1.16b, v0.16b\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[4]\n" + "add v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[5]\n" + "add v24.16b, v24.16b, v4.16b\n" + + "and v3.16b, v1.16b, v0.16b\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[6]\n" + "add v25.16b, v25.16b, v3.16b\n" + "dup v2.16b, v29.b[7]\n" + "add v26.16b, v26.16b, v4.16b\n" + + "and v3.16b, v1.16b, v0.16b\n" + "ldr d29, [x3, 8]!\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "mov v0.16b, v30.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "add v27.16b, v27.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "add v28.16b, v28.16b, v4.16b\n" + "bne 1b\n" + + "movi v3.16b, #1\n" + "umlal v5.8h, v21.8b, v3.8b\n" + "umlal v7.8h, v22.8b, v3.8b\n" + "umlal v9.8h, v23.8b, v3.8b\n" + "umlal v11.8h, v24.8b, v3.8b\n" + "umlal v13.8h, v25.8b, v3.8b\n" + "umlal v15.8h, v26.8b, v3.8b\n" + "umlal v17.8h, v27.8b, v3.8b\n" + "umlal v19.8h, v28.8b, v3.8b\n" + + "umlal2 v6.8h, v21.16b, v3.16b\n" 
+ "umlal2 v8.8h, v22.16b, v3.16b\n" + "umlal2 v10.8h, v23.16b, v3.16b\n" + "umlal2 v12.8h, v24.16b, v3.16b\n" + "umlal2 v14.8h, v25.16b, v3.16b\n" + "umlal2 v16.8h, v26.16b, v3.16b\n" + "umlal2 v18.8h, v27.16b, v3.16b\n" + "umlal2 v20.8h, v28.16b, v3.16b\n" + + "subs x9, x9, #1\n" + "beq 4f\n" // 1x1, continue with the next 32 input channels + + "2:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + "eor v25.16b, v25.16b, v25.16b\n" + "eor v26.16b, v26.16b, v26.16b\n" + "eor v27.16b, v27.16b, v27.16b\n" + "eor v28.16b, v28.16b, v28.16b\n" + + "mov x4, #32\n" // Assume 256 will not happen + "3:\n" + "and v3.16b, v1.16b, v0.16b\n" + "ldr d30, [x0, 16]!\n" // next filter + + "and v4.16b, v2.16b, v0.16b\n" + "ldr x1, [x0, 8]\n" + + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "uqadd v21.16b, v21.16b, v3.16b\n" + "dup v2.16b, v29.b[3]\n" + + "uqadd v22.16b, v22.16b, v4.16b\n" + "ins v30.d[1], x1\n" + + "and v3.16b, v1.16b, v0.16b\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[4]\n" + "uqadd v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[5]\n" + "uqadd v24.16b, v24.16b, v4.16b\n" + + "and v3.16b, v1.16b, v0.16b\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[6]\n" + "uqadd v25.16b, v25.16b, v3.16b\n" + "dup v2.16b, v29.b[7]\n" + "uqadd v26.16b, v26.16b, v4.16b\n" + + "and v3.16b, v1.16b, v0.16b\n" + "ldr d29, [x3, 8]!\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "mov v0.16b, v30.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "uqadd v27.16b, v27.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "uqadd v28.16b, v28.16b, v4.16b\n" + "bne 3b\n" + + "movi v3.16b, #1\n" + "umlal v5.8h, v21.8b, v3.8b\n" + "umlal v7.8h, v22.8b, v3.8b\n" + "umlal v9.8h, v23.8b, v3.8b\n" + "umlal v11.8h, v24.8b, v3.8b\n" + "umlal v13.8h, v25.8b, v3.8b\n" + "umlal v15.8h, v26.8b, v3.8b\n" + "umlal v17.8h, v27.8b, v3.8b\n" + "umlal v19.8h, v28.8b, v3.8b\n" + + "umlal2 v6.8h, v21.16b, v3.16b\n" + "umlal2 v8.8h, v22.16b, v3.16b\n" + "umlal2 v10.8h, v23.16b, v3.16b\n" + "umlal2 v12.8h, v24.16b, v3.16b\n" + "umlal2 v14.8h, v25.16b, v3.16b\n" + "umlal2 v16.8h, v26.16b, v3.16b\n" + "umlal2 v18.8h, v27.16b, v3.16b\n" + "umlal2 v20.8h, v28.16b, v3.16b\n" + + "subs x9, x9, #8\n" + "bne 2b\n" + + "4:\n" // Wrap up computation for 32 input channels + "subs x2, x2, #32\n" + "bne 0b\n" + + // pipelined + "ucvtf v5.8h, v5.8h\n" + "ucvtf v6.8h, v6.8h\n" + "ldr q21, [%[b_0]]\n" + "ucvtf v7.8h, v7.8h\n" + "ldr q22, [%[b_1]]\n" + "ucvtf v8.8h, v8.8h\n" + "ldr q23, [%[s_0]]\n" + "ucvtf v9.8h, v9.8h\n" + "ldr q24, [%[s_1]]\n" + "ucvtf v10.8h, v10.8h\n" + "ucvtf v11.8h, v11.8h\n" + "mov v1.16b, v21.16b\n" + "ucvtf v12.8h, v12.8h\n" + "mov v2.16b, v22.16b\n" + "ucvtf v13.8h, v13.8h\n" + "fmla v1.8h, v5.8h, v23.8h\n" + "ucvtf v14.8h, v14.8h\n" + "fmla v2.8h, v6.8h, v24.8h\n" + "ucvtf v15.8h, v15.8h\n" + "mov v3.16b, v21.16b\n" + "ucvtf v16.8h, v16.8h\n" + "mov v4.16b, v22.16b\n" + "ucvtf v17.8h, v17.8h\n" + "fmla v3.8h, v7.8h, v23.8h\n" + "ucvtf v18.8h, v18.8h\n" + "fmla v4.8h, v8.8h, v24.8h\n" + "ucvtf v19.8h, v19.8h\n" + "mov v5.16b, v21.16b\n" + "ucvtf v20.8h, v20.8h\n" + "mov v6.16b, v22.16b\n" + + "fmla v5.8h, v9.8h, v23.8h\n" + "mov v7.16b, v21.16b\n" + "fmla v6.8h, v10.8h, v24.8h\n" + "mov v8.16b, v22.16b\n" + "fmla v7.8h, v11.8h, v23.8h\n" + 
"mov v9.16b, v21.16b\n" + "fmla v8.8h, v12.8h, v24.8h\n" + "mov v10.16b, v22.16b\n" + "fmla v9.8h, v13.8h, v23.8h\n" + "mov v11.16b, v21.16b\n" + "fmla v10.8h, v14.8h, v24.8h\n" + "mov v12.16b, v22.16b\n" + "fmla v11.8h, v15.8h, v23.8h\n" + "mov v13.16b, v21.16b\n" + "fmla v12.8h, v16.8h, v24.8h\n" + "mov v14.16b, v22.16b\n" + "fmla v13.8h, v17.8h, v23.8h\n" + "mov v15.16b, v21.16b\n" + "fmla v14.8h, v18.8h, v24.8h\n" + "mov v16.16b, v22.16b\n" + "fmla v15.8h, v19.8h, v23.8h\n" + "fmla v16.8h, v20.8h, v24.8h\n" + + "str q1, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q5, [%[out_0], #32]\n" // out_o0hw2 + "str q7, [%[out_0], #48]\n" // out_o0hw3 + "str q9, [%[out_0], #64]\n" // out_o0hw4 + "str q11, [%[out_0], #80]\n" // out_o0hw5 + "str q13, [%[out_0], #96]\n" // out_o0hw6 + "str q15, [%[out_0], #112]\n" // out_o0hw7 + + "str q2, [%[out_1]]\n" // out_o1hw0 + "str q4, [%[out_1], #16]\n" // out_o1hw1 + "str q6, [%[out_1], #32]\n" // out_o1hw2 + "str q8, [%[out_1], #48]\n" // out_o1hw3 + "str q10, [%[out_1], #64]\n" // out_o1hw4 + "str q12, [%[out_1], #80]\n" // out_o1hw5 + "str q14, [%[out_1], #96]\n" // out_o1hw6 + "str q16, [%[out_1], #112]\n" // out_o1hw7 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [fhfw] "r"((I64)fh * fw), [s_0] "r"(s_o0), + [s_1] "r"(s_o1), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", + "x1", "x2", "x3", "x4", "x9"); + s0 += 16; + s1 += 16; + b0 += 16; + b1 += 16; + } + } + // ohow_remainder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + + for (U32 hw = ohow_s; hw < ohow - 3; hw += 4) { + const F16 *s0 = scaleArray; + const F16 *s1 = scaleArray + 8; + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + BIN8 *in_order = ((BIN8 *)tmp) + ic * ihiw; // ic has been divided by 8 + // reorder input + // NCHWc8 => NHWChw4c8 + im2col + U32 in_h[4]; + U32 in_w[4]; + for (U32 i = 0; i < 4; i++) { + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; + } + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + BIN8 *in_hw4c8 = inArray_pad + c * ihiw + fh_idx * iw_pad + fw_idx; + // NHWChw4c8 + BIN8 *in_order_hw4c8 = + in_order + c * fh * fw * 4 + fh_idx * fw * 4 + fw_idx * 4; + for (U32 i = 0; i < 4; i++) { + in_order_hw4c8[i] = *(in_hw4c8 + in_h[i] * iw_pad + in_w[i]); + } + } + } + } + + // compute + for (U32 o = 0; o < oc; o += + 2) { // oc should be multiple of 32. It will at least be multiple of 16 in the future. 
+ BIN8 *in_hw0 = in_order; + const BIN8 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; // ic has been divided by 8 + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // scale and bias + const F16 *s_o0 = s0; + const F16 *s_o1 = s1; + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + /* Layout + 5 6 + 7 8 + 9 10 + 11 12 + */ + "eor v5.16b, v5.16b, v5.16b\n" + "ldr s29, [%[in_0]]\n" // in_0 + "eor v6.16b, v6.16b, v6.16b\n" + "ldr d0, [%[f_0]]\n" // f_0 + "eor v7.16b, v7.16b, v7.16b\n" + "ldr x2, [%[f_0], #8]\n" + "eor v8.16b, v8.16b, v8.16b\n" + "ins v0.d[1], x2\n" + "eor v9.16b, v9.16b, v9.16b\n" + "dup v1.16b, v29.b[0]\n" // duplicate a full register + "eor v10.16b, v10.16b, v10.16b\n" + "dup v2.16b, v29.b[1]\n" + "eor v11.16b, v11.16b, v11.16b\n" + "eor v12.16b, v12.16b, v12.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + + "0:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "mov x9, %[fhfw]\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + + "mov x4, #4\n" + + "1:\n" + "and v3.16b, v1.16b, v0.16b\n" + "ldr d30, [x0, 16]!\n" // next filter + + "and v4.16b, v2.16b, v0.16b\n" + "ldr x1, [x0, 8]\n" + + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "add v21.16b, v21.16b, v3.16b\n" // Use add because the latency is shorter + "dup v2.16b, v29.b[3]\n" + + "add v22.16b, v22.16b, v4.16b\n" + "ins v30.d[1], x1\n" + + "and v3.16b, v1.16b, v0.16b\n" + "ldr s29, [x3, 4]!\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "mov v0.16b, v30.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "add v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "add v24.16b, v24.16b, v4.16b\n" + "bne 1b\n" + + "movi v3.16b, #1\n" + "umlal v5.8h, v21.8b, v3.8b\n" + "umlal v7.8h, v22.8b, v3.8b\n" + "umlal v9.8h, v23.8b, v3.8b\n" + "umlal v11.8h, v24.8b, v3.8b\n" + + "umlal2 v6.8h, v21.16b, v3.16b\n" + "umlal2 v8.8h, v22.16b, v3.16b\n" + "umlal2 v10.8h, v23.16b, v3.16b\n" + "umlal2 v12.8h, v24.16b, v3.16b\n" + + "subs x9, x9, #1\n" + "beq 4f\n" // 1x1, continue with the next 32 input channels + + "2:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + + "mov x4, #32\n" // Assume 256 will not happen + "3:\n" + "and v3.16b, v1.16b, v0.16b\n" + "ldr d30, [x0, 16]!\n" // next filter + + "and v4.16b, v2.16b, v0.16b\n" + "ldr x1, [x0, 8]\n" + + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "uqadd v21.16b, v21.16b, v3.16b\n" + "dup v2.16b, v29.b[3]\n" + + "uqadd v22.16b, v22.16b, v4.16b\n" + "ins v30.d[1], x1\n" + + "and v3.16b, v1.16b, v0.16b\n" + "ldr s29, [x3, 4]!\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "mov v0.16b, v30.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "uqadd v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "uqadd v24.16b, v24.16b, v4.16b\n" + "bne 3b\n" + + "movi v3.16b, #1\n" + "umlal v5.8h, v21.8b, v3.8b\n" + "umlal v7.8h, v22.8b, v3.8b\n" + "umlal v9.8h, v23.8b, v3.8b\n" + "umlal v11.8h, v24.8b, v3.8b\n" + + "umlal2 v6.8h, v21.16b, v3.16b\n" + "umlal2 v8.8h, v22.16b, v3.16b\n" + "umlal2 v10.8h, v23.16b, v3.16b\n" + "umlal2 v12.8h, v24.16b, v3.16b\n" + + "subs x9, x9, #8\n" + "bne 2b\n" + + "4:\n" // 
Wrap up computation for 32 input channels + "subs x2, x2, #32\n" + "bne 0b\n" + + // pipelined + "ucvtf v5.8h, v5.8h\n" + "ucvtf v6.8h, v6.8h\n" + "ldr q21, [%[b_0]]\n" + "ucvtf v7.8h, v7.8h\n" + "ldr q22, [%[b_1]]\n" + "ucvtf v8.8h, v8.8h\n" + "ldr q23, [%[s_0]]\n" + "ucvtf v9.8h, v9.8h\n" + "ldr q24, [%[s_1]]\n" + "ucvtf v10.8h, v10.8h\n" + "ucvtf v11.8h, v11.8h\n" + "mov v1.16b, v21.16b\n" + "ucvtf v12.8h, v12.8h\n" + "mov v2.16b, v22.16b\n" + "fmla v1.8h, v5.8h, v23.8h\n" + "fmla v2.8h, v6.8h, v24.8h\n" + "mov v3.16b, v21.16b\n" + "mov v4.16b, v22.16b\n" + "fmla v3.8h, v7.8h, v23.8h\n" + "fmla v4.8h, v8.8h, v24.8h\n" + "mov v5.16b, v21.16b\n" + "mov v6.16b, v22.16b\n" + + "fmla v5.8h, v9.8h, v23.8h\n" + "mov v7.16b, v21.16b\n" + "fmla v6.8h, v10.8h, v24.8h\n" + "mov v8.16b, v22.16b\n" + "fmla v7.8h, v11.8h, v23.8h\n" + "fmla v8.8h, v12.8h, v24.8h\n" + + "str q1, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q5, [%[out_0], #32]\n" // out_o0hw2 + "str q7, [%[out_0], #48]\n" // out_o0hw3 + + "str q2, [%[out_1]]\n" // out_o1hw0 + "str q4, [%[out_1], #16]\n" // out_o1hw1 + "str q6, [%[out_1], #32]\n" // out_o1hw2 + "str q8, [%[out_1], #48]\n" // out_o1hw3 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [fhfw] "r"((I64)fh * fw), [s_0] "r"(s_o0), + [s_1] "r"(s_o1), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v21", "v22", "v23", "v24", + "v29", "v30", "x0", "x1", "x2", "x3", "x4", "x9"); + s0 += 16; + s1 += 16; + b0 += 16; + b1 += 16; + } + } + // ohow_reminder % 4 + ohow_s = (ohow / 4) * 4; + for (U32 hw = ohow_s; hw < ohow; hw++) { + const F16 *s0 = scaleArray; + const F16 *s1 = scaleArray + 8; + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + BIN8 *in_order = ((BIN8 *)tmp) + ic * ih_pad * iw_pad; // ic has been divided by 8 + // reorder input + // NCHWc8 => NHWChw1c8 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + BIN8 *in_hw1c8 = inArray_pad + c * ihiw + fh_idx * iw_pad + fw_idx; + BIN8 *in_0 = in_hw1c8 + in_h_0 * iw_pad + in_w_0; + BIN8 *in_order_hw1c8 = in_order + c * fh * fw + fh_idx * fw + fw_idx; + *in_order_hw1c8 = (*in_0); + } + } + } + // compute + for (U32 o = 0; o < oc; o += 2) { + BIN8 *in_hw0 = in_order; + const BIN8 *f_o = filterArray + o * 8 * fh * fw * ic; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + + uint16x8_t sum[2] = {0}; + uint8x8_t v1 = vdup_n_u8(1); + for (U32 i = 0; i < ic * 8; i += 32) { + uint8x8_t sub0[2] = {0}; + + for (U32 j = 0; j < 4; j++) { + uint8x8_t f_0 = vld1_u8(f_o); + uint8x8_t f_1 = vld1_u8(f_o + 8); + f_o += 16; + uint8x8_t in_1 = vdup_n_u8(*in_hw0); + in_hw0++; + f_0 = vand_u8(in_1, f_0); + f_1 = vand_u8(in_1, f_1); + f_0 = vcnt_u8(f_0); + f_1 = vcnt_u8(f_1); + sub0[0] = vadd_u8(sub0[0], f_0); + sub0[1] = vadd_u8(sub0[1], f_1); + } + sum[0] = vmlal_u8(sum[0], sub0[0], v1); + sum[1] = vmlal_u8(sum[1], sub0[1], v1); + + for (U32 j = 1; j < fh * fw; j += 8) { + uint8x8_t sub1[2] = {0}; + for (U32 k = 0; k < 32; k++) { + uint8x8_t f_0 = vld1_u8(f_o); + uint8x8_t f_1 = vld1_u8(f_o + 8); + f_o += 16; + uint8x8_t in_1 = vdup_n_u8(*in_hw0); + in_hw0++; + f_0 = vand_u8(in_1, f_0); + f_1 = 
vand_u8(in_1, f_1); + f_0 = vcnt_u8(f_0); + f_1 = vcnt_u8(f_1); + sub1[0] = vadd_u8(sub1[0], f_0); + sub1[1] = vadd_u8(sub1[1], f_1); + } + sum[0] = vmlal_u8(sum[0], sub1[0], v1); + sum[1] = vmlal_u8(sum[1], sub1[1], v1); + } + } + + float16x8_t res_o0 = vcvtq_f16_u16(sum[0]); + float16x8_t res_o1 = vcvtq_f16_u16(sum[1]); + float16x8_t scale_o0 = vld1q_f16(s0); + s0 += 16; + float16x8_t scale_o1 = vld1q_f16(s1); + s1 += 16; + float16x8_t bias_o0 = vld1q_f16(b0); + b0 += 16; + float16x8_t bias_o1 = vld1q_f16(b1); + b1 += 16; + res_o0 = vmulq_f16(res_o0, scale_o0); + res_o1 = vmulq_f16(res_o1, scale_o1); + res_o0 = vaddq_f16(res_o0, bias_o0); + res_o1 = vaddq_f16(res_o1, bias_o1); + vst1q_f16(out_o0hw0, res_o0); + vst1q_f16(out_o1hw0, res_o1); + } + } + } + return SUCCESS; +} +#endif diff --git a/compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A76.cpp b/compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A76.cpp new file mode 100644 index 00000000..420ba6ef --- /dev/null +++ b/compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A76.cpp @@ -0,0 +1,759 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
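// Editorial sketch (not part of the patch): the A55 kernel above and the A76 kernel below
// evaluate a DoReFa binary convolution as a bit-serial dot product. Activations are
// thresholded at 0.5 into 0/1 bits, weights are pre-packed 8 per byte, and each AND + CNT
// pair in the assembly counts the 1*1 matches for 8 input channels at once. A minimal
// scalar model of that core (dorefa_dot is an illustrative name, and __builtin_popcount is
// a GCC/Clang builtin):
#include <cstdint>

static inline uint32_t dorefa_dot(const uint8_t *act, const uint8_t *flt, uint32_t nBytes)
{
    uint32_t acc = 0;
    for (uint32_t i = 0; i < nBytes; i++) {
        acc += (uint32_t)__builtin_popcount(act[i] & flt[i]);  // AND, then per-byte popcount
    }
    return acc;  // the epilogue converts this to F16 and applies out = acc * scale + bias
}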
+ +#ifdef _USE_FP16 +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" + +#include "cpu/arm/bnn/convolution_dorefa.h" + +EE convolution_dorefa_A76(TensorDesc inputDesc, + const F16 *input, + TensorDesc filterDesc, + const BIN8 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc scaleDesc, + const F16 *scaleArray, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(scaleDesc); + UNUSED(biasDesc); + UNUSED(tmpBytes); + UNUSED(activationDesc); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + if (fdf != DF_NCHWN16C8) { + CHECK_STATUS(NOT_MATCH); + } + if (!(ic == fc && oc == fn)) { + CHECK_STATUS(NOT_MATCH); + } + + oc /= 8; + ic /= 8; + + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + U32 ohow = oh * ow; + U32 ihiw = ih_pad * iw_pad; + + BIN8 *inArray = ((BIN8 *)tmp) + ic * ihiw + 8 * fh * fw * ic; // ic has been divided by 8 + BIN8 *inArray_pad; + for (U32 n = 0; n < in; n++) { + const F16 *in = input + n * ic * ih * iw * 8; + for (U32 i = 0; i < ic * ih * iw; i++) { + BIN8 temp = 0; + for (U32 j = 0; j < 8; j++) { + if (in[i * 8 + j] >= 0.5) { + temp |= (1 << (7 - j)); // set + } + } + inArray[i] = temp; + } + + if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { + inArray_pad = inArray + n * ic * ih * iw; // ic has been divided by 8 + } else { + // copy input into a input with padding + inArray_pad = (BIN8 *)tmp; + BIN8 *inArray_pad_mov = inArray_pad; + BIN8 *inArray_mov = inArray + n * ic * ih * iw; + for (U32 c = 0; c < ic; c++) { // All divide by 8 + for (U32 h = 0; h < paddingT; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(DT_BIN01)); + inArray_pad_mov += iw_pad; + } + for (U32 h = paddingT; h < ih_pad - paddingB; h++) { + memset(inArray_pad_mov, 0, paddingL * bytesOf(DT_BIN01)); + inArray_pad_mov += paddingL; + memcpy(inArray_pad_mov, inArray_mov, iw * bytesOf(DT_BIN01)); + inArray_pad_mov += iw; + inArray_mov += iw; + memset(inArray_pad_mov, 0, paddingR * bytesOf(DT_BIN01)); + inArray_pad_mov += paddingR; + } + for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(DT_BIN01)); + inArray_pad_mov += iw_pad; + } + } + } + for (U32 hw = 0; hw < ohow - 7; hw += 8) { + const F16 *s0 = scaleArray; + const F16 *s1 = scaleArray + 8; + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + BIN8 *in_order = ((BIN8 *)tmp) + ic * ihiw; // ic has been divided by 8 + // reorder input + // NCHWc8 => NHWChw8c8 + im2col + U32 in_h[8]; + U32 in_w[8]; + for (U32 i = 0; i < 8; i++) { + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; + } + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + BIN8 *in_hw8c8 = inArray_pad + c * ihiw + fh_idx * iw_pad + fw_idx; + // NHWChw8c8 + BIN8 
*in_order_hw8c8 = in_order + c * fh * fw * 8 + fh_idx * fw * 8 + + fw_idx * 8; // This 8 comes from hw8 + for (U32 i = 0; i < 8; i++) { + in_order_hw8c8[i] = *(in_hw8c8 + in_h[i] * iw_pad + in_w[i]); + } + } + } + } + + // compute + for (U32 o = 0; o < oc; o += + 2) { // oc should be multiple of 32. It will at least be multiple of 16 in the future. + BIN8 *in_hw0 = in_order; + const BIN8 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; // ic has been divided by 8 + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // scale and bias + const F16 *s_o0 = s0; + const F16 *s_o1 = s1; + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr d29, [%[in_0]]\n" // in_0 + "ldr q0, [%[f_0]]\n" // f_0 + /* Layout + 5 6 + 7 8 + 9 10 + 11 12 + + 13 14 + 15 16 + 17 18 + 19 20 + */ + "eor v5.16b, v5.16b, v5.16b\n" + "eor v6.16b, v6.16b, v6.16b\n" + "eor v7.16b, v7.16b, v7.16b\n" + "eor v8.16b, v8.16b, v8.16b\n" + "eor v9.16b, v9.16b, v9.16b\n" + "dup v1.16b, v29.b[0]\n" // duplicate a full register + "eor v10.16b, v10.16b, v10.16b\n" + "dup v2.16b, v29.b[1]\n" + "eor v11.16b, v11.16b, v11.16b\n" + "eor v12.16b, v12.16b, v12.16b\n" + "eor v13.16b, v13.16b, v13.16b\n" + "eor v14.16b, v14.16b, v14.16b\n" + "eor v15.16b, v15.16b, v15.16b\n" + "eor v16.16b, v16.16b, v16.16b\n" + "eor v17.16b, v17.16b, v17.16b\n" + "eor v18.16b, v18.16b, v18.16b\n" + "eor v19.16b, v19.16b, v19.16b\n" + "eor v20.16b, v20.16b, v20.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + + "0:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "mov x9, %[fhfw]\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + "eor v25.16b, v25.16b, v25.16b\n" + "eor v26.16b, v26.16b, v26.16b\n" + "eor v27.16b, v27.16b, v27.16b\n" + "eor v28.16b, v28.16b, v28.16b\n" + + "mov x4, #4\n" + + "1:\n" + "and v3.16b, v1.16b, v0.16b\n" + "and v4.16b, v2.16b, v0.16b\n" + + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "add v21.16b, v21.16b, v3.16b\n" // Use add because the latency is shorter + "dup v2.16b, v29.b[3]\n" + + "add v22.16b, v22.16b, v4.16b\n" + + "and v3.16b, v1.16b, v0.16b\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[4]\n" + "add v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[5]\n" + "add v24.16b, v24.16b, v4.16b\n" + + "and v3.16b, v1.16b, v0.16b\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[6]\n" + "add v25.16b, v25.16b, v3.16b\n" + "dup v2.16b, v29.b[7]\n" + "add v26.16b, v26.16b, v4.16b\n" + + "and v3.16b, v1.16b, v0.16b\n" + "ldr d29, [x3, 8]!\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "ldr q0, [x0, 16]!\n" // next filter + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "add v27.16b, v27.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "add v28.16b, v28.16b, v4.16b\n" + "bne 1b\n" + + "movi v3.16b, #1\n" + "umlal v5.8h, v21.8b, v3.8b\n" + "umlal v7.8h, v22.8b, v3.8b\n" + "umlal v9.8h, v23.8b, v3.8b\n" + "umlal v11.8h, v24.8b, v3.8b\n" + "umlal v13.8h, v25.8b, v3.8b\n" + "umlal v15.8h, v26.8b, v3.8b\n" + "umlal v17.8h, v27.8b, v3.8b\n" + "umlal v19.8h, v28.8b, v3.8b\n" + + "umlal2 v6.8h, v21.16b, v3.16b\n" + "umlal2 v8.8h, v22.16b, v3.16b\n" + "umlal2 v10.8h, v23.16b, v3.16b\n" + "umlal2 v12.8h, v24.16b, v3.16b\n" + "umlal2 
v14.8h, v25.16b, v3.16b\n" + "umlal2 v16.8h, v26.16b, v3.16b\n" + "umlal2 v18.8h, v27.16b, v3.16b\n" + "umlal2 v20.8h, v28.16b, v3.16b\n" + + "subs x9, x9, #1\n" + "beq 4f\n" // 1x1, continue with the next 32 input channels + + "2:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + "eor v25.16b, v25.16b, v25.16b\n" + "eor v26.16b, v26.16b, v26.16b\n" + "eor v27.16b, v27.16b, v27.16b\n" + "eor v28.16b, v28.16b, v28.16b\n" + + "mov x4, #32\n" // Assume 256 will not happen + "3:\n" + "and v3.16b, v1.16b, v0.16b\n" + "and v4.16b, v2.16b, v0.16b\n" + + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "uqadd v21.16b, v21.16b, v3.16b\n" + "dup v2.16b, v29.b[3]\n" + + "uqadd v22.16b, v22.16b, v4.16b\n" + + "and v3.16b, v1.16b, v0.16b\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[4]\n" + "uqadd v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[5]\n" + "uqadd v24.16b, v24.16b, v4.16b\n" + + "and v3.16b, v1.16b, v0.16b\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[6]\n" + "uqadd v25.16b, v25.16b, v3.16b\n" + "dup v2.16b, v29.b[7]\n" + "uqadd v26.16b, v26.16b, v4.16b\n" + + "and v3.16b, v1.16b, v0.16b\n" + "ldr d29, [x3, 8]!\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "ldr q0, [x0, 16]!\n" // next filter + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "uqadd v27.16b, v27.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "uqadd v28.16b, v28.16b, v4.16b\n" + "bne 3b\n" + + "movi v3.16b, #1\n" + "umlal v5.8h, v21.8b, v3.8b\n" + "umlal v7.8h, v22.8b, v3.8b\n" + "umlal v9.8h, v23.8b, v3.8b\n" + "umlal v11.8h, v24.8b, v3.8b\n" + "umlal v13.8h, v25.8b, v3.8b\n" + "umlal v15.8h, v26.8b, v3.8b\n" + "umlal v17.8h, v27.8b, v3.8b\n" + "umlal v19.8h, v28.8b, v3.8b\n" + + "umlal2 v6.8h, v21.16b, v3.16b\n" + "umlal2 v8.8h, v22.16b, v3.16b\n" + "umlal2 v10.8h, v23.16b, v3.16b\n" + "umlal2 v12.8h, v24.16b, v3.16b\n" + "umlal2 v14.8h, v25.16b, v3.16b\n" + "umlal2 v16.8h, v26.16b, v3.16b\n" + "umlal2 v18.8h, v27.16b, v3.16b\n" + "umlal2 v20.8h, v28.16b, v3.16b\n" + + "subs x9, x9, #8\n" + "bne 2b\n" + + "4:\n" // Wrap up computation for 32 input channels + "subs x2, x2, #32\n" + "bne 0b\n" + + // pipelined + "ucvtf v5.8h, v5.8h\n" + "ucvtf v6.8h, v6.8h\n" + "ldr q21, [%[b_0]]\n" + "ucvtf v7.8h, v7.8h\n" + "ldr q22, [%[b_1]]\n" + "ucvtf v8.8h, v8.8h\n" + "ldr q23, [%[s_0]]\n" + "ucvtf v9.8h, v9.8h\n" + "ldr q24, [%[s_1]]\n" + "ucvtf v10.8h, v10.8h\n" + "ucvtf v11.8h, v11.8h\n" + "mov v1.16b, v21.16b\n" + "ucvtf v12.8h, v12.8h\n" + "mov v2.16b, v22.16b\n" + "ucvtf v13.8h, v13.8h\n" + "fmla v1.8h, v5.8h, v23.8h\n" + "ucvtf v14.8h, v14.8h\n" + "fmla v2.8h, v6.8h, v24.8h\n" + "ucvtf v15.8h, v15.8h\n" + "mov v3.16b, v21.16b\n" + "ucvtf v16.8h, v16.8h\n" + "mov v4.16b, v22.16b\n" + "ucvtf v17.8h, v17.8h\n" + "fmla v3.8h, v7.8h, v23.8h\n" + "ucvtf v18.8h, v18.8h\n" + "fmla v4.8h, v8.8h, v24.8h\n" + "ucvtf v19.8h, v19.8h\n" + "mov v5.16b, v21.16b\n" + "ucvtf v20.8h, v20.8h\n" + "mov v6.16b, v22.16b\n" + + "fmla v5.8h, v9.8h, v23.8h\n" + "mov v7.16b, v21.16b\n" + "fmla v6.8h, v10.8h, v24.8h\n" + "mov v8.16b, v22.16b\n" + "fmla v7.8h, v11.8h, v23.8h\n" + "mov v9.16b, v21.16b\n" + "fmla v8.8h, v12.8h, v24.8h\n" + "mov v10.16b, v22.16b\n" + "fmla v9.8h, v13.8h, v23.8h\n" + "mov v11.16b, v21.16b\n" + "fmla v10.8h, v14.8h, v24.8h\n" + "mov v12.16b, 
v22.16b\n" + "fmla v11.8h, v15.8h, v23.8h\n" + "mov v13.16b, v21.16b\n" + "fmla v12.8h, v16.8h, v24.8h\n" + "mov v14.16b, v22.16b\n" + "fmla v13.8h, v17.8h, v23.8h\n" + "mov v15.16b, v21.16b\n" + "fmla v14.8h, v18.8h, v24.8h\n" + "mov v16.16b, v22.16b\n" + "fmla v15.8h, v19.8h, v23.8h\n" + "fmla v16.8h, v20.8h, v24.8h\n" + + "str q1, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q5, [%[out_0], #32]\n" // out_o0hw2 + "str q7, [%[out_0], #48]\n" // out_o0hw3 + "str q9, [%[out_0], #64]\n" // out_o0hw4 + "str q11, [%[out_0], #80]\n" // out_o0hw5 + "str q13, [%[out_0], #96]\n" // out_o0hw6 + "str q15, [%[out_0], #112]\n" // out_o0hw7 + + "str q2, [%[out_1]]\n" // out_o1hw0 + "str q4, [%[out_1], #16]\n" // out_o1hw1 + "str q6, [%[out_1], #32]\n" // out_o1hw2 + "str q8, [%[out_1], #48]\n" // out_o1hw3 + "str q10, [%[out_1], #64]\n" // out_o1hw4 + "str q12, [%[out_1], #80]\n" // out_o1hw5 + "str q14, [%[out_1], #96]\n" // out_o1hw6 + "str q16, [%[out_1], #112]\n" // out_o1hw7 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [fhfw] "r"((I64)fh * fw), [s_0] "r"(s_o0), + [s_1] "r"(s_o1), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", + "x1", "x2", "x3", "x4", "x9"); + s0 += 16; + s1 += 16; + b0 += 16; + b1 += 16; + } + } + // ohow_remainder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + + for (U32 hw = ohow_s; hw < ohow - 3; hw += 4) { + const F16 *s0 = scaleArray; + const F16 *s1 = scaleArray + 8; + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + BIN8 *in_order = ((BIN8 *)tmp) + ic * ihiw; // ic has been divided by 8 + // reorder input + // NCHWc8 => NHWChw4c8 + im2col + U32 in_h[4]; + U32 in_w[4]; + for (U32 i = 0; i < 4; i++) { + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; + } + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + BIN8 *in_hw4c8 = inArray_pad + c * ihiw + fh_idx * iw_pad + fw_idx; + // NHWChw4c8 + BIN8 *in_order_hw4c8 = + in_order + c * fh * fw * 4 + fh_idx * fw * 4 + fw_idx * 4; + for (U32 i = 0; i < 4; i++) { + in_order_hw4c8[i] = *(in_hw4c8 + in_h[i] * iw_pad + in_w[i]); + } + } + } + } + + // compute + for (U32 o = 0; o < oc; o += + 2) { // oc should be multiple of 32. It will at least be multiple of 16 in the future. 
+ BIN8 *in_hw0 = in_order; + const BIN8 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; // ic has been divided by 8 + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // scale and bias + const F16 *s_o0 = s0; + const F16 *s_o1 = s1; + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr q0, [%[f_0]]\n" // f_0 + "ldr s29, [%[in_0]]\n" // in_0 + /* Layout + 5 6 + 7 8 + 9 10 + 11 12 + */ + "eor v5.16b, v5.16b, v5.16b\n" + "eor v6.16b, v6.16b, v6.16b\n" + "eor v7.16b, v7.16b, v7.16b\n" + "eor v8.16b, v8.16b, v8.16b\n" + "eor v9.16b, v9.16b, v9.16b\n" + "dup v1.16b, v29.b[0]\n" // duplicate a full register + "eor v10.16b, v10.16b, v10.16b\n" + "dup v2.16b, v29.b[1]\n" + "eor v11.16b, v11.16b, v11.16b\n" + "eor v12.16b, v12.16b, v12.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + + "0:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "mov x9, %[fhfw]\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + + "mov x4, #4\n" + + "1:\n" + "and v3.16b, v1.16b, v0.16b\n" + "and v4.16b, v2.16b, v0.16b\n" + + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "add v21.16b, v21.16b, v3.16b\n" // Use add because the latency is shorter + "dup v2.16b, v29.b[3]\n" + + "add v22.16b, v22.16b, v4.16b\n" + + "and v3.16b, v1.16b, v0.16b\n" + "ldr s29, [x3, 4]!\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "ldr q0, [x0, 16]!\n" // next filter + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "add v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "add v24.16b, v24.16b, v4.16b\n" + "bne 1b\n" + + "movi v3.16b, #1\n" + "umlal v5.8h, v21.8b, v3.8b\n" + "umlal v7.8h, v22.8b, v3.8b\n" + "umlal v9.8h, v23.8b, v3.8b\n" + "umlal v11.8h, v24.8b, v3.8b\n" + + "umlal2 v6.8h, v21.16b, v3.16b\n" + "umlal2 v8.8h, v22.16b, v3.16b\n" + "umlal2 v10.8h, v23.16b, v3.16b\n" + "umlal2 v12.8h, v24.16b, v3.16b\n" + + "subs x9, x9, #1\n" + "beq 4f\n" // 1x1, continue with the next 32 input channels + + "2:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + + "mov x4, #32\n" // Assume 256 will not happen + "3:\n" + "and v3.16b, v1.16b, v0.16b\n" + "and v4.16b, v2.16b, v0.16b\n" + + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "uqadd v21.16b, v21.16b, v3.16b\n" + "dup v2.16b, v29.b[3]\n" + + "uqadd v22.16b, v22.16b, v4.16b\n" + + "and v3.16b, v1.16b, v0.16b\n" + "ldr s29, [x3, 4]!\n" + "and v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "ldr q0, [x0, 16]!\n" // next filter + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "uqadd v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "uqadd v24.16b, v24.16b, v4.16b\n" + "bne 3b\n" + + "movi v3.16b, #1\n" + "umlal v5.8h, v21.8b, v3.8b\n" + "umlal v7.8h, v22.8b, v3.8b\n" + "umlal v9.8h, v23.8b, v3.8b\n" + "umlal v11.8h, v24.8b, v3.8b\n" + + "umlal2 v6.8h, v21.16b, v3.16b\n" + "umlal2 v8.8h, v22.16b, v3.16b\n" + "umlal2 v10.8h, v23.16b, v3.16b\n" + "umlal2 v12.8h, v24.16b, v3.16b\n" + + "subs x9, x9, #8\n" + "bne 2b\n" + + "4:\n" // Wrap up computation for 32 input channels + "subs x2, x2, #32\n" + "bne 0b\n" + + // pipelined + "ucvtf v5.8h, v5.8h\n" + "ucvtf v6.8h, v6.8h\n" + "ldr q21, [%[b_0]]\n" + "ucvtf v7.8h, v7.8h\n" + 
"ldr q22, [%[b_1]]\n" + "ucvtf v8.8h, v8.8h\n" + "ldr q23, [%[s_0]]\n" + "ucvtf v9.8h, v9.8h\n" + "ldr q24, [%[s_1]]\n" + "ucvtf v10.8h, v10.8h\n" + "ucvtf v11.8h, v11.8h\n" + "mov v1.16b, v21.16b\n" + "ucvtf v12.8h, v12.8h\n" + "mov v2.16b, v22.16b\n" + "fmla v1.8h, v5.8h, v23.8h\n" + "fmla v2.8h, v6.8h, v24.8h\n" + "mov v3.16b, v21.16b\n" + "mov v4.16b, v22.16b\n" + "fmla v3.8h, v7.8h, v23.8h\n" + "fmla v4.8h, v8.8h, v24.8h\n" + "mov v5.16b, v21.16b\n" + "mov v6.16b, v22.16b\n" + + "fmla v5.8h, v9.8h, v23.8h\n" + "mov v7.16b, v21.16b\n" + "fmla v6.8h, v10.8h, v24.8h\n" + "mov v8.16b, v22.16b\n" + "fmla v7.8h, v11.8h, v23.8h\n" + "fmla v8.8h, v12.8h, v24.8h\n" + + "str q1, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q5, [%[out_0], #32]\n" // out_o0hw2 + "str q7, [%[out_0], #48]\n" // out_o0hw3 + + "str q2, [%[out_1]]\n" // out_o1hw0 + "str q4, [%[out_1], #16]\n" // out_o1hw1 + "str q6, [%[out_1], #32]\n" // out_o1hw2 + "str q8, [%[out_1], #48]\n" // out_o1hw3 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [fhfw] "r"((I64)fh * fw), [s_0] "r"(s_o0), + [s_1] "r"(s_o1), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v21", "v22", "v23", "v24", + "v29", "v30", "x0", "x1", "x2", "x3", "x4", "x9"); + s0 += 16; + s1 += 16; + b0 += 16; + b1 += 16; + } + } + // ohow_reminder % 4 + ohow_s = (ohow / 4) * 4; + for (U32 hw = ohow_s; hw < ohow; hw++) { + const F16 *s0 = scaleArray; + const F16 *s1 = scaleArray + 8; + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + BIN8 *in_order = ((BIN8 *)tmp) + ic * ih_pad * iw_pad; // ic has been divided by 8 + // reorder input + // NCHWc8 => NHWChw1c8 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + BIN8 *in_hw1c8 = inArray_pad + c * ihiw + fh_idx * iw_pad + fw_idx; + BIN8 *in_0 = in_hw1c8 + in_h_0 * iw_pad + in_w_0; + BIN8 *in_order_hw1c8 = in_order + c * fh * fw + fh_idx * fw + fw_idx; + *in_order_hw1c8 = (*in_0); + } + } + } + // compute + for (U32 o = 0; o < oc; o += 2) { + BIN8 *in_hw0 = in_order; + const BIN8 *f_o = filterArray + o * 8 * fh * fw * ic; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + + uint16x8_t sum[2] = {0}; + uint8x8_t v1 = vdup_n_u8(1); + for (U32 i = 0; i < ic * 8; i += 32) { + uint8x8_t sub0[2] = {0}; + + for (U32 j = 0; j < 4; j++) { + uint8x8_t f_0 = vld1_u8(f_o); + uint8x8_t f_1 = vld1_u8(f_o + 8); + f_o += 16; + uint8x8_t in_1 = vdup_n_u8(*in_hw0); + in_hw0++; + f_0 = vand_u8(in_1, f_0); + f_1 = vand_u8(in_1, f_1); + f_0 = vcnt_u8(f_0); + f_1 = vcnt_u8(f_1); + sub0[0] = vadd_u8(sub0[0], f_0); + sub0[1] = vadd_u8(sub0[1], f_1); + } + sum[0] = vmlal_u8(sum[0], sub0[0], v1); + sum[1] = vmlal_u8(sum[1], sub0[1], v1); + + for (U32 j = 1; j < fh * fw; j += 8) { + uint8x8_t sub1[2] = {0}; + for (U32 k = 0; k < 32; k++) { + uint8x8_t f_0 = vld1_u8(f_o); + uint8x8_t f_1 = vld1_u8(f_o + 8); + f_o += 16; + uint8x8_t in_1 = vdup_n_u8(*in_hw0); + in_hw0++; + f_0 = vand_u8(in_1, f_0); + f_1 = vand_u8(in_1, f_1); + f_0 = vcnt_u8(f_0); + f_1 = vcnt_u8(f_1); + sub1[0] = vadd_u8(sub1[0], f_0); + sub1[1] = vadd_u8(sub1[1], f_1); + } + sum[0] = vmlal_u8(sum[0], sub1[0], v1); + sum[1] = 
vmlal_u8(sum[1], sub1[1], v1); + } + } + + float16x8_t res_o0 = vcvtq_f16_u16(sum[0]); + float16x8_t res_o1 = vcvtq_f16_u16(sum[1]); + float16x8_t scale_o0 = vld1q_f16(s0); + s0 += 16; + float16x8_t scale_o1 = vld1q_f16(s1); + s1 += 16; + float16x8_t bias_o0 = vld1q_f16(b0); + b0 += 16; + float16x8_t bias_o1 = vld1q_f16(b1); + b1 += 16; + res_o0 = vmulq_f16(res_o0, scale_o0); + res_o1 = vmulq_f16(res_o1, scale_o1); + res_o0 = vaddq_f16(res_o0, bias_o0); + res_o1 = vaddq_f16(res_o1, bias_o1); + vst1q_f16(out_o0hw0, res_o0); + vst1q_f16(out_o1hw0, res_o1); + } + } + } + return SUCCESS; +} +#endif diff --git a/tensor_computing/src/cpu/arm/bnn/convolution_transform_bnn.h b/compute/tensor/src/cpu/arm/bnn/convolution_transform_bnn.h similarity index 69% rename from tensor_computing/src/cpu/arm/bnn/convolution_transform_bnn.h rename to compute/tensor/src/cpu/arm/bnn/convolution_transform_bnn.h index 14e39a71..c30d5ffd 100644 --- a/tensor_computing/src/cpu/arm/bnn/convolution_transform_bnn.h +++ b/compute/tensor/src/cpu/arm/bnn/convolution_transform_bnn.h @@ -1,17 +1,16 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
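// The header below packs floating-point filters into the bit-level DF_NCHWN16C8 layout one
// bit at a time through bitwise_copy. The hunk text is truncated in places ("*dest |= (1<"),
// so the following body is a hedged reconstruction consistent with the MSB-first bit
// numbering (bitNo = 7 - pos % 8) used by its caller, not a verbatim quote of the patch:
#include <bitset>
#include <cstdint>

inline void bitwise_copy_sketch(uint8_t srcVal, uint32_t srcBit, uint8_t *dest, uint32_t destBit)
{
    std::bitset<8> src(srcVal);
    if (src.test(srcBit)) {
        *dest |= (uint8_t)(1u << destBit);   // mirror a set source bit into the destination
    } else {
        *dest &= (uint8_t)~(1u << destBit);  // clear it otherwise
    }
}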
- #ifndef _H_CONVOLUTION_TRANSFORM_BNN #define _H_CONVOLUTION_TRANSFORM_BNN @@ -19,29 +18,30 @@ #include #include -#include "type.h" +#include "types.h" #include "tensor_desc.h" #include "error.h" #include "tensor_computing.h" - -inline void bitwise_copy(BIN8 srcVal, U32 srcBit, BIN8* dest, U32 destBit) { +inline void bitwise_copy(BIN8 srcVal, U32 srcBit, BIN8 *dest, U32 destBit) +{ std::bitset<8> Src(srcVal); if (Src.test(srcBit)) { - *dest |= (1< (N/16)*(C/8)*(H*W)*n16*c8 */ - if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) + if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) { CHECK_STATUS(NULL_POINTER); + } DataType fdt; DataFormat fdf; @@ -50,28 +50,31 @@ inline EE convolution_transform_filter_bnn(TensorDesc filterDesc, const BIN8* fi switch (fdf) { case DF_NCHWN16C8: // Everything is ready - memcpy(ftmArray, filterArray, fn*fc*fh*fw/8*bytesOf(fdt)); + memcpy(ftmArray, filterArray, fn * fc * fh * fw / 8 * bytesOf(fdt)); break; case DF_NCHW: { /* - * NCHW => NCHWN16C8 - * Now assume fn is divisible by 32 - */ + * NCHW => NCHWN16C8 + * Now assume fn is divisible by 32 + */ U32 oc = fn / 16; U32 ic = fc / 8; for (U32 o = 0; o < oc; o++) { for (U32 c = 0; c < ic; c++) { - for (U32 hw = 0; hw < fh*fw; hw++) { + for (U32 hw = 0; hw < fh * fw; hw++) { for (U32 o16 = 0; o16 < 16; o16++) { for (U32 c8 = 0; c8 < 8; c8++) { - U32 ftmBitPos = o*fh*fw*ic*128 + c*fh*fw*128 + hw*128 + o16*8 + c8; + U32 ftmBitPos = o * fh * fw * ic * 128 + c * fh * fw * 128 + + hw * 128 + o16 * 8 + c8; U32 ftmSlot = ftmBitPos / 8; U32 ftmBitNo = 7 - (ftmBitPos % 8); - U32 filterBitPos = (o*16+o16)*ic*8*fh*fw + (c*8+c8)*fh*fw + hw; + U32 filterBitPos = + (o * 16 + o16) * ic * 8 * fh * fw + (c * 8 + c8) * fh * fw + hw; U32 filterSlot = filterBitPos / 8; U32 filterBitNo = 7 - (filterBitPos % 8); - bitwise_copy(filterArray[filterSlot], filterBitNo, ftmArray+ftmSlot, ftmBitNo); + bitwise_copy(filterArray[filterSlot], filterBitNo, + ftmArray + ftmSlot, ftmBitNo); } } } diff --git a/compute/tensor/src/cpu/arm/bnn/convolution_xnor.h b/compute/tensor/src/cpu/arm/bnn/convolution_xnor.h new file mode 100644 index 00000000..a78684cd --- /dev/null +++ b/compute/tensor/src/cpu/arm/bnn/convolution_xnor.h @@ -0,0 +1,86 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
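// Worked example of the bit-index arithmetic in convolution_transform_filter_bnn above
// (values chosen for illustration): for a 3x3 filter with fc = 8 (so ic = 1 block), a bit
// at output n = 17, channel c = 3, spatial hw = 0 has oBlk = 1, o16 = 1, cBlk = 0, c8 = 3,
// giving ftmBitPos = 1*9*1*128 + 0 + 0 + 1*8 + 3 = 1163, ftmSlot = 145, and
// ftmBitNo = 7 - (1163 % 8) = 4; 128 is the 16-output x 8-channel bit tile per (o, c, hw).
// A small checker for the mapping (hypothetical helper, not in the patch):
#include <cstdint>

static inline void ftm_bit_location(uint32_t o, uint32_t c, uint32_t hw, uint32_t fhfw,
    uint32_t icBlk, uint32_t *slot, uint32_t *bitNo)
{
    uint32_t oBlk = o / 16, o16 = o % 16, cBlk = c / 8, c8 = c % 8;
    uint32_t pos = oBlk * fhfw * icBlk * 128 + cBlk * fhfw * 128 + hw * 128 + o16 * 8 + c8;
    *slot = pos / 8;         // which BIN8 byte of the transformed filter
    *bitNo = 7 - (pos % 8);  // MSB-first bit within that byte
}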
+ +#ifndef _H_CONVOLUTION_XNOR +#define _H_CONVOLUTION_XNOR + +#ifdef _USE_FP16 +#include +#include +#include "sys.h" +#include "types.h" +#include "error.h" + +EE convolution_xnor_A55(TensorDesc inputDesc, + const F16 *input, + TensorDesc filterDesc, + const BIN8 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc scaleDesc, + const F16 *scaleArray, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc); + +EE convolution_xnor_A76(TensorDesc inputDesc, + const F16 *input, + TensorDesc filterDesc, + const BIN8 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc scaleDesc, + const F16 *scaleArray, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc); + +inline EE convolution_xnor(TensorDesc inputDesc, + const F16 *input, + TensorDesc filterDesc, + const BIN8 *filter, + ConvolutionParamSpec convParamSpec, + TensorDesc scaleDesc, + const F16 *scale, + TensorDesc biasDesc, + const F16 *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *output, + ActivationParamSpec activationDesc, + Arch arch) +{ + EE ret = SUCCESS; + switch (arch) { + case ARM_A55: + ret = convolution_xnor_A55(inputDesc, input, filterDesc, filter, convParamSpec, + scaleDesc, scale, biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc); + break; + case ARM_A76: + ret = convolution_xnor_A76(inputDesc, input, filterDesc, filter, convParamSpec, + scaleDesc, scale, biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc); + break; + default: + return NOT_SUPPORTED; + } + return ret; +} +#endif +#endif diff --git a/compute/tensor/src/cpu/arm/bnn/convolution_xnor_A55.cpp b/compute/tensor/src/cpu/arm/bnn/convolution_xnor_A55.cpp new file mode 100644 index 00000000..fe5d6395 --- /dev/null +++ b/compute/tensor/src/cpu/arm/bnn/convolution_xnor_A55.cpp @@ -0,0 +1,786 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
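// The xnor kernels below encode +1 as bit 1 and -1 as bit 0, so a +/-1 dot product over K
// bits reduces to matches minus mismatches:
//     dot = (K - popcount(a ^ b)) - popcount(a ^ b) = K - 2 * popcount(a ^ b)
// which is exactly the base_s - 2 * noOf1sFromXOR scheme in the code: accumulators are
// seeded with base_v and umlsl subtracts twice each popcount. Scalar sketch (an editorial
// illustration, not patch code):
#include <cstdint>

static inline int32_t xnor_dot(const uint8_t *a, const uint8_t *b, uint32_t nBytes)
{
    int32_t mismatches = 0;
    for (uint32_t i = 0; i < nBytes; i++) {
        mismatches += __builtin_popcount(a[i] ^ b[i]);  // eor, then per-byte cnt
    }
    return (int32_t)(nBytes * 8) - 2 * mismatches;      // K - 2 * mismatches
}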
+ +#ifdef _USE_FP16 +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" + +#include "cpu/arm/bnn/convolution_xnor.h" + +EE convolution_xnor_A55(TensorDesc inputDesc, + const F16 *input, + TensorDesc filterDesc, + const BIN8 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc scaleDesc, + const F16 *scaleArray, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(scaleDesc); + UNUSED(biasDesc); + UNUSED(tmpBytes); + UNUSED(activationDesc); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + if (fdf != DF_NCHWN16C8) { + CHECK_STATUS(NOT_MATCH); + } + if (!(ic == fc && oc == fn)) { + CHECK_STATUS(NOT_MATCH); + } + + oc /= 8; + ic /= 8; + + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + U32 ohow = oh * ow; + U32 ihiw = ih_pad * iw_pad; + + BIN8 *inArray = ((BIN8 *)tmp) + ic * ihiw + 8 * fh * fw * ic; // ic has been divided by 8 + BIN8 *inArray_pad; + + for (U32 n = 0; n < in; n++) { + const F16 *in = input + n * ic * ih * iw * 8; + for (U32 i = 0; i < ic * ih * iw; i++) { + BIN8 temp = 0; + for (U32 j = 0; j < 8; j++) { + if (in[i * 8 + j] >= 0) { + temp |= (1 << (7 - j)); // set + } + } + inArray[i] = temp; + } + + if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { + inArray_pad = inArray + n * ic * ih * iw; // ic has been divided by 8 + } else { + // copy input into a input with padding + inArray_pad = (BIN8 *)tmp; + BIN8 *inArray_pad_mov = inArray_pad; + BIN8 *inArray_mov = inArray + n * ic * ih * iw; + for (U32 c = 0; c < ic; c++) { // All divide by 8 + for (U32 h = 0; h < paddingT; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(DT_BIN11)); + inArray_pad_mov += iw_pad; + } + for (U32 h = paddingT; h < ih_pad - paddingB; h++) { + memset(inArray_pad_mov, 0, paddingL * bytesOf(DT_BIN11)); + inArray_pad_mov += paddingL; + memcpy(inArray_pad_mov, inArray_mov, iw * bytesOf(DT_BIN11)); + inArray_pad_mov += iw; + inArray_mov += iw; + memset(inArray_pad_mov, 0, paddingR * bytesOf(DT_BIN11)); + inArray_pad_mov += paddingR; + } + for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(DT_BIN11)); + inArray_pad_mov += iw_pad; + } + } + } + // ohow / 8 + short base_s = fh * fw * ic * 8; // For xnorNet, actual_sum = base_s - 2 * noOf1sFromXOR + short base_v[8]; // Assume the base can be represented as int16 + for (U32 i = 0; i < 8; i++) { + base_v[i] = base_s; + } + for (U32 hw = 0; hw < ohow - 7; hw += 8) { + const F16 *s0 = scaleArray; + const F16 *s1 = scaleArray + 8; + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + BIN8 *in_order = ((BIN8 *)tmp) + ic * ihiw; // ic has been divided by 8 + // reorder input + // NCHWc8 => NHWChw8c8 + im2col + U32 in_h[8]; + U32 in_w[8]; + for (U32 i = 0; i < 8; i++) { + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; + } + 
for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + BIN8 *in_hw8c8 = inArray_pad + c * ihiw + fh_idx * iw_pad + fw_idx; + // NHWChw8c8 + BIN8 *in_order_hw8c8 = in_order + c * fh * fw * 8 + fh_idx * fw * 8 + + fw_idx * 8; // This 8 comes from hw8 + for (U32 i = 0; i < 8; i++) { + in_order_hw8c8[i] = *(in_hw8c8 + in_h[i] * iw_pad + in_w[i]); + } + } + } + } + + // compute + for (U32 o = 0; o < oc; o += + 2) { // oc should be multiple of 32. It will at least be multiple of 16 in the future. + BIN8 *in_hw0 = in_order; + const BIN8 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; // ic has been divided by 8 + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // scale and bias + const F16 *s_o0 = s0; + const F16 *s_o1 = s1; + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr q4, [%[base]]\n" + "mov v5.16b, v4.16b\n" + "ldr d29, [%[in_0]]\n" // in_0 + "mov v6.16b, v4.16b\n" + "ldr d0, [%[f_0]]\n" // f_0 + "mov v7.16b, v4.16b\n" + "ldr x2, [%[f_0], #8]\n" + "mov v8.16b, v4.16b\n" + "ins v0.d[1], x2\n" + "mov v9.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" // duplicate a full register + "mov v10.16b, v4.16b\n" + "dup v2.16b, v29.b[1]\n" + "mov v11.16b, v4.16b\n" + "mov v12.16b, v4.16b\n" + "mov v13.16b, v4.16b\n" + "mov v14.16b, v4.16b\n" + "mov v15.16b, v4.16b\n" + "mov v16.16b, v4.16b\n" + "mov v17.16b, v4.16b\n" + "mov v18.16b, v4.16b\n" + "mov v19.16b, v4.16b\n" + "mov v20.16b, v4.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + + "0:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "mov x9, %[fhfw]\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + "eor v25.16b, v25.16b, v25.16b\n" + "eor v26.16b, v26.16b, v26.16b\n" + "eor v27.16b, v27.16b, v27.16b\n" + "eor v28.16b, v28.16b, v28.16b\n" + + "mov x4, #4\n" + + "1:\n" + "eor v3.16b, v1.16b, v0.16b\n" + "ldr d30, [x0, 16]!\n" // next filter + + "eor v4.16b, v2.16b, v0.16b\n" + "ldr x1, [x0, 8]\n" + + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "add v21.16b, v21.16b, v3.16b\n" // Use add because the latency is shorter + "dup v2.16b, v29.b[3]\n" + + "add v22.16b, v22.16b, v4.16b\n" + "ins v30.d[1], x1\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[4]\n" + "add v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[5]\n" + "add v24.16b, v24.16b, v4.16b\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[6]\n" + "add v25.16b, v25.16b, v3.16b\n" + "dup v2.16b, v29.b[7]\n" + "add v26.16b, v26.16b, v4.16b\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "ldr d29, [x3, 8]!\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "mov v0.16b, v30.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "add v27.16b, v27.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "add v28.16b, v28.16b, v4.16b\n" + "bne 1b\n" + + "movi v3.16b, #2\n" + "umlsl v5.8h, v21.8b, v3.8b\n" + "umlsl v7.8h, v22.8b, v3.8b\n" + "umlsl v9.8h, v23.8b, v3.8b\n" + "umlsl v11.8h, v24.8b, v3.8b\n" + "umlsl v13.8h, v25.8b, v3.8b\n" + "umlsl v15.8h, v26.8b, v3.8b\n" + "umlsl v17.8h, v27.8b, v3.8b\n" + "umlsl v19.8h, v28.8b, v3.8b\n" + + "umlsl2 v6.8h, 
v21.16b, v3.16b\n" + "umlsl2 v8.8h, v22.16b, v3.16b\n" + "umlsl2 v10.8h, v23.16b, v3.16b\n" + "umlsl2 v12.8h, v24.16b, v3.16b\n" + "umlsl2 v14.8h, v25.16b, v3.16b\n" + "umlsl2 v16.8h, v26.16b, v3.16b\n" + "umlsl2 v18.8h, v27.16b, v3.16b\n" + "umlsl2 v20.8h, v28.16b, v3.16b\n" + + "subs x9, x9, #1\n" + "beq 4f\n" // 1x1, continue with the next 32 input channels + + "2:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + "eor v25.16b, v25.16b, v25.16b\n" + "eor v26.16b, v26.16b, v26.16b\n" + "eor v27.16b, v27.16b, v27.16b\n" + "eor v28.16b, v28.16b, v28.16b\n" + + "mov x4, #32\n" // Assume 256 will not happen + "3:\n" + "eor v3.16b, v1.16b, v0.16b\n" + "ldr d30, [x0, 16]!\n" // next filter + + "eor v4.16b, v2.16b, v0.16b\n" + "ldr x1, [x0, 8]\n" + + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "uqadd v21.16b, v21.16b, v3.16b\n" + "dup v2.16b, v29.b[3]\n" + + "uqadd v22.16b, v22.16b, v4.16b\n" + "ins v30.d[1], x1\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[4]\n" + "uqadd v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[5]\n" + "uqadd v24.16b, v24.16b, v4.16b\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[6]\n" + "uqadd v25.16b, v25.16b, v3.16b\n" + "dup v2.16b, v29.b[7]\n" + "uqadd v26.16b, v26.16b, v4.16b\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "ldr d29, [x3, 8]!\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "mov v0.16b, v30.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "uqadd v27.16b, v27.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "uqadd v28.16b, v28.16b, v4.16b\n" + "bne 3b\n" + + "movi v3.16b, #2\n" // actual sum = base - 2 * noOf1s + "umlsl v5.8h, v21.8b, v3.8b\n" + "umlsl v7.8h, v22.8b, v3.8b\n" + "umlsl v9.8h, v23.8b, v3.8b\n" + "umlsl v11.8h, v24.8b, v3.8b\n" + "umlsl v13.8h, v25.8b, v3.8b\n" + "umlsl v15.8h, v26.8b, v3.8b\n" + "umlsl v17.8h, v27.8b, v3.8b\n" + "umlsl v19.8h, v28.8b, v3.8b\n" + + "umlsl2 v6.8h, v21.16b, v3.16b\n" + "umlsl2 v8.8h, v22.16b, v3.16b\n" + "umlsl2 v10.8h, v23.16b, v3.16b\n" + "umlsl2 v12.8h, v24.16b, v3.16b\n" + "umlsl2 v14.8h, v25.16b, v3.16b\n" + "umlsl2 v16.8h, v26.16b, v3.16b\n" + "umlsl2 v18.8h, v27.16b, v3.16b\n" + "umlsl2 v20.8h, v28.16b, v3.16b\n" + + "subs x9, x9, #8\n" + "bne 2b\n" + + "4:\n" // Wrap up computation for 32 input channels + "subs x2, x2, #32\n" + "bne 0b\n" + + // pipelined + "scvtf v5.8h, v5.8h\n" + "scvtf v6.8h, v6.8h\n" + "ldr q21, [%[b_0]]\n" + "scvtf v7.8h, v7.8h\n" + "ldr q22, [%[b_1]]\n" + "scvtf v8.8h, v8.8h\n" + "ldr q23, [%[s_0]]\n" + "scvtf v9.8h, v9.8h\n" + "ldr q24, [%[s_1]]\n" + "scvtf v10.8h, v10.8h\n" + "scvtf v11.8h, v11.8h\n" + "mov v1.16b, v21.16b\n" + "scvtf v12.8h, v12.8h\n" + "mov v2.16b, v22.16b\n" + "scvtf v13.8h, v13.8h\n" + "fmla v1.8h, v5.8h, v23.8h\n" + "scvtf v14.8h, v14.8h\n" + "fmla v2.8h, v6.8h, v24.8h\n" + "scvtf v15.8h, v15.8h\n" + "mov v3.16b, v21.16b\n" + "scvtf v16.8h, v16.8h\n" + "mov v4.16b, v22.16b\n" + "scvtf v17.8h, v17.8h\n" + "fmla v3.8h, v7.8h, v23.8h\n" + "scvtf v18.8h, v18.8h\n" + "fmla v4.8h, v8.8h, v24.8h\n" + "scvtf v19.8h, v19.8h\n" + "mov v5.16b, v21.16b\n" + "scvtf v20.8h, v20.8h\n" + "mov v6.16b, v22.16b\n" + + "fmla v5.8h, v9.8h, v23.8h\n" + "mov v7.16b, v21.16b\n" + "fmla v6.8h, v10.8h, v24.8h\n" + "mov 
v8.16b, v22.16b\n" + "fmla v7.8h, v11.8h, v23.8h\n" + "mov v9.16b, v21.16b\n" + "fmla v8.8h, v12.8h, v24.8h\n" + "mov v10.16b, v22.16b\n" + "fmla v9.8h, v13.8h, v23.8h\n" + "mov v11.16b, v21.16b\n" + "fmla v10.8h, v14.8h, v24.8h\n" + "mov v12.16b, v22.16b\n" + "fmla v11.8h, v15.8h, v23.8h\n" + "mov v13.16b, v21.16b\n" + "fmla v12.8h, v16.8h, v24.8h\n" + "mov v14.16b, v22.16b\n" + "fmla v13.8h, v17.8h, v23.8h\n" + "mov v15.16b, v21.16b\n" + "fmla v14.8h, v18.8h, v24.8h\n" + "mov v16.16b, v22.16b\n" + "fmla v15.8h, v19.8h, v23.8h\n" + "fmla v16.8h, v20.8h, v24.8h\n" + + "str q1, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q5, [%[out_0], #32]\n" // out_o0hw2 + "str q7, [%[out_0], #48]\n" // out_o0hw3 + "str q9, [%[out_0], #64]\n" // out_o0hw4 + "str q11, [%[out_0], #80]\n" // out_o0hw5 + "str q13, [%[out_0], #96]\n" // out_o0hw6 + "str q15, [%[out_0], #112]\n" // out_o0hw7 + + "str q2, [%[out_1]]\n" // out_o1hw0 + "str q4, [%[out_1], #16]\n" // out_o1hw1 + "str q6, [%[out_1], #32]\n" // out_o1hw2 + "str q8, [%[out_1], #48]\n" // out_o1hw3 + "str q10, [%[out_1], #64]\n" // out_o1hw4 + "str q12, [%[out_1], #80]\n" // out_o1hw5 + "str q14, [%[out_1], #96]\n" // out_o1hw6 + "str q16, [%[out_1], #112]\n" // out_o1hw7 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [fhfw] "r"((I64)fh * fw), [base] "r"(base_v), + [s_0] "r"(s_o0), [s_1] "r"(s_o1), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", + "x1", "x2", "x3", "x4", "x9"); + s0 += 16; + s1 += 16; + b0 += 16; + b1 += 16; + } + } + // ohow_remainder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + + for (U32 hw = ohow_s; hw < ohow - 3; hw += 4) { + const F16 *s0 = scaleArray; + const F16 *s1 = scaleArray + 8; + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + BIN8 *in_order = ((BIN8 *)tmp) + ic * ihiw; // ic has been divided by 8 + // reorder input + // NCHWc8 => NHWChw4c8 + im2col + U32 in_h[4]; + U32 in_w[4]; + for (U32 i = 0; i < 4; i++) { + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; + } + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + BIN8 *in_hw4c8 = inArray_pad + c * ihiw + fh_idx * iw_pad + fw_idx; + // NHWChw8c8 + BIN8 *in_order_hw4c8 = + in_order + c * fh * fw * 4 + fh_idx * fw * 4 + fw_idx * 4; + for (U32 i = 0; i < 4; i++) { + in_order_hw4c8[i] = *(in_hw4c8 + in_h[i] * iw_pad + in_w[i]); + } + } + } + } + + // compute + for (U32 o = 0; o < oc; o += + 2) { // oc should be multiple of 32. It will at least be multiple of 16 in the future. 
+ BIN8 *in_hw0 = in_order; + const BIN8 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; // ic has been divided by 8 + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // scale and bias + const F16 *s_o0 = s0; + const F16 *s_o1 = s1; + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr q4, [%[base]]\n" + /* Layout + 5 6 + 7 8 + 9 10 + 11 12 + */ + "mov v5.16b, v4.16b\n" + "ldr s29, [%[in_0]]\n" // in_0 + "mov v6.16b, v4.16b\n" + "ldr d0, [%[f_0]]\n" // f_0 + "mov v7.16b, v4.16b\n" + "ldr x2, [%[f_0], #8]\n" + "mov v8.16b, v4.16b\n" + "ins v0.d[1], x2\n" + "mov v9.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" // duplicate a full register + "mov v10.16b, v4.16b\n" + "dup v2.16b, v29.b[1]\n" + "mov v11.16b, v4.16b\n" + "mov v12.16b, v4.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + + "0:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "mov x9, %[fhfw]\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + + "mov x4, #4\n" + + "1:\n" + "eor v3.16b, v1.16b, v0.16b\n" + "ldr d30, [x0, 16]!\n" // next filter + + "eor v4.16b, v2.16b, v0.16b\n" + "ldr x1, [x0, 8]\n" + + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "add v21.16b, v21.16b, v3.16b\n" // Use add because the latency is shorter + "dup v2.16b, v29.b[3]\n" + + "add v22.16b, v22.16b, v4.16b\n" + "ins v30.d[1], x1\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "ldr s29, [x3, 4]!\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "mov v0.16b, v30.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "add v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "add v24.16b, v24.16b, v4.16b\n" + "bne 1b\n" + + "movi v3.16b, #2\n" + "umlsl v5.8h, v21.8b, v3.8b\n" + "umlsl v7.8h, v22.8b, v3.8b\n" + "umlsl v9.8h, v23.8b, v3.8b\n" + "umlsl v11.8h, v24.8b, v3.8b\n" + + "umlsl2 v6.8h, v21.16b, v3.16b\n" + "umlsl2 v8.8h, v22.16b, v3.16b\n" + "umlsl2 v10.8h, v23.16b, v3.16b\n" + "umlsl2 v12.8h, v24.16b, v3.16b\n" + + "subs x9, x9, #1\n" + "beq 4f\n" // 1x1, continue with the next 32 input channels + + "2:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + + "mov x4, #32\n" // Assume 256 will not happen + "3:\n" + "eor v3.16b, v1.16b, v0.16b\n" + "ldr d30, [x0, 16]!\n" // next filter + + "eor v4.16b, v2.16b, v0.16b\n" + "ldr x1, [x0, 8]\n" + + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "uqadd v21.16b, v21.16b, v3.16b\n" + "dup v2.16b, v29.b[3]\n" + + "uqadd v22.16b, v22.16b, v4.16b\n" + "ins v30.d[1], x1\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "ldr s29, [x3, 4]!\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "mov v0.16b, v30.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "uqadd v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "uqadd v24.16b, v24.16b, v4.16b\n" + "bne 3b\n" + + "movi v3.16b, #2\n" + "umlsl v5.8h, v21.8b, v3.8b\n" + "umlsl v7.8h, v22.8b, v3.8b\n" + "umlsl v9.8h, v23.8b, v3.8b\n" + "umlsl v11.8h, v24.8b, v3.8b\n" + + "umlsl2 v6.8h, v21.16b, v3.16b\n" + "umlsl2 v8.8h, v22.16b, v3.16b\n" + "umlsl2 v10.8h, v23.16b, v3.16b\n" + "umlsl2 v12.8h, v24.16b, v3.16b\n" + + "subs x9, x9, #8\n" + "bne 2b\n" + + "4:\n" // Wrap up computation for 32 input channels + 
"subs x2, x2, #32\n" + "bne 0b\n" + + // pipelined + "scvtf v5.8h, v5.8h\n" + "scvtf v6.8h, v6.8h\n" + "ldr q21, [%[b_0]]\n" + "scvtf v7.8h, v7.8h\n" + "ldr q22, [%[b_1]]\n" + "scvtf v8.8h, v8.8h\n" + "ldr q23, [%[s_0]]\n" + "scvtf v9.8h, v9.8h\n" + "ldr q24, [%[s_1]]\n" + "scvtf v10.8h, v10.8h\n" + "scvtf v11.8h, v11.8h\n" + "mov v1.16b, v21.16b\n" + "scvtf v12.8h, v12.8h\n" + "mov v2.16b, v22.16b\n" + "fmla v1.8h, v5.8h, v23.8h\n" + "fmla v2.8h, v6.8h, v24.8h\n" + "mov v3.16b, v21.16b\n" + "mov v4.16b, v22.16b\n" + "fmla v3.8h, v7.8h, v23.8h\n" + "fmla v4.8h, v8.8h, v24.8h\n" + "mov v5.16b, v21.16b\n" + "mov v6.16b, v22.16b\n" + + "fmla v5.8h, v9.8h, v23.8h\n" + "mov v7.16b, v21.16b\n" + "fmla v6.8h, v10.8h, v24.8h\n" + "mov v8.16b, v22.16b\n" + "fmla v7.8h, v11.8h, v23.8h\n" + "fmla v8.8h, v12.8h, v24.8h\n" + + "str q1, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q5, [%[out_0], #32]\n" // out_o0hw2 + "str q7, [%[out_0], #48]\n" // out_o0hw3 + + "str q2, [%[out_1]]\n" // out_o1hw0 + "str q4, [%[out_1], #16]\n" // out_o1hw1 + "str q6, [%[out_1], #32]\n" // out_o1hw2 + "str q8, [%[out_1], #48]\n" // out_o1hw3 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [fhfw] "r"((I64)fh * fw), [base] "r"(base_v), + [s_0] "r"(s_o0), [s_1] "r"(s_o1), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v21", "v22", "v23", "v24", + "v29", "v30", "x0", "x1", "x2", "x3", "x4", "x9"); + s0 += 16; + s1 += 16; + b0 += 16; + b1 += 16; + } + } + // ohow_reminder % 4 + ohow_s = (ohow / 4) * 4; + for (U32 hw = ohow_s; hw < ohow; hw++) { + const F16 *s0 = scaleArray; + const F16 *s1 = scaleArray + 8; + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + BIN8 *in_order = ((BIN8 *)tmp) + ic * ihiw; // ic has been divided by 8 + // reorder input + // NCHWc8 => NHWChw1c8 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + BIN8 *in_hw1c8 = inArray_pad + c * ihiw + fh_idx * iw_pad + fw_idx; + BIN8 *in_0 = in_hw1c8 + in_h_0 * iw_pad + in_w_0; + BIN8 *in_order_hw1c8 = in_order + c * fh * fw + fh_idx * fw + fw_idx; + *in_order_hw1c8 = (*in_0); + } + } + } + // compute + for (U32 o = 0; o < oc; o += 2) { + BIN8 *in_hw0 = in_order; + const BIN8 *f_o = filterArray + o * 8 * fh * fw * ic; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + + uint16x8_t sum[2] = {0}; + uint8x8_t v2 = vdup_n_u8(2); + for (U32 i = 0; i < ic * 8; i += 32) { + uint8x8_t sub0[2] = {0}; + + for (U32 j = 0; j < 4; j++) { + uint8x8_t f_0 = vld1_u8(f_o); + uint8x8_t f_1 = vld1_u8(f_o + 8); + f_o += 16; + uint8x8_t in_1 = vdup_n_u8(*in_hw0); + in_hw0++; + f_0 = veor_u8(in_1, f_0); + f_1 = veor_u8(in_1, f_1); + f_0 = vcnt_u8(f_0); + f_1 = vcnt_u8(f_1); + sub0[0] = vadd_u8(sub0[0], f_0); + sub0[1] = vadd_u8(sub0[1], f_1); + } + sum[0] = vmlal_u8(sum[0], sub0[0], v2); + sum[1] = vmlal_u8(sum[1], sub0[1], v2); + + for (U32 j = 1; j < fh * fw; j += 8) { + uint8x8_t sub1[2] = {0}; + for (U32 k = 0; k < 32; k++) { + uint8x8_t f_0 = vld1_u8(f_o); + uint8x8_t f_1 = vld1_u8(f_o + 8); + f_o += 16; + uint8x8_t in_1 = vdup_n_u8(*in_hw0); + in_hw0++; + f_0 = veor_u8(in_1, f_0); + f_1 = veor_u8(in_1, f_1); + f_0 = 
vcnt_u8(f_0); + f_1 = vcnt_u8(f_1); + sub1[0] = vadd_u8(sub1[0], f_0); + sub1[1] = vadd_u8(sub1[1], f_1); + } + sum[0] = vmlal_u8(sum[0], sub1[0], v2); + sum[1] = vmlal_u8(sum[1], sub1[1], v2); + } + } + short temp[16]; + vst1q_u16((uint16_t *)temp, sum[0]); + vst1q_u16((uint16_t *)(temp + 8), sum[1]); + int16x8_t base_abs = vdupq_n_s16(base_s); + int16x8_t ssum[2]; + ssum[0] = vld1q_s16(temp); + ssum[1] = vld1q_s16(temp + 8); + ssum[0] = vsubq_s16(base_abs, ssum[0]); + ssum[1] = vsubq_s16(base_abs, ssum[1]); + + float16x8_t res_o0 = vcvtq_f16_s16(ssum[0]); + float16x8_t res_o1 = vcvtq_f16_s16(ssum[1]); + float16x8_t scale_o0 = vld1q_f16(s0); + s0 += 16; + float16x8_t scale_o1 = vld1q_f16(s1); + s1 += 16; + float16x8_t bias_o0 = vld1q_f16(b0); + b0 += 16; + float16x8_t bias_o1 = vld1q_f16(b1); + b1 += 16; + res_o0 = vmulq_f16(res_o0, scale_o0); + res_o1 = vmulq_f16(res_o1, scale_o1); + res_o0 = vaddq_f16(res_o0, bias_o0); + res_o1 = vaddq_f16(res_o1, bias_o1); + vst1q_f16(out_o0hw0, res_o0); + vst1q_f16(out_o1hw0, res_o1); + } + } + } + return SUCCESS; +} +#endif diff --git a/compute/tensor/src/cpu/arm/bnn/convolution_xnor_A76.cpp b/compute/tensor/src/cpu/arm/bnn/convolution_xnor_A76.cpp new file mode 100644 index 00000000..163444a4 --- /dev/null +++ b/compute/tensor/src/cpu/arm/bnn/convolution_xnor_A76.cpp @@ -0,0 +1,774 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
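// All four kernels share the same binary im2col front end: the padded NCHWc8 input (8
// channels per BIN8 byte) is gathered into NHWChw8c8 (or hw4/hw1) order so the assembly can
// stream one byte per channel tile per output position. A standalone restatement of the
// 8-wide reorder loop (same logic as the patch; names and uint8_t/uint32_t types are
// illustrative stand-ins for BIN8/U32):
#include <cstdint>

static void im2col_bin_hw8(const uint8_t *in_pad, uint8_t *in_order, uint32_t icBlk,
    uint32_t fh, uint32_t fw, uint32_t iw_pad, uint32_t ihiw,
    const uint32_t in_h[8], const uint32_t in_w[8])
{
    for (uint32_t c = 0; c < icBlk; c++) {
        for (uint32_t fh_idx = 0; fh_idx < fh; fh_idx++) {
            for (uint32_t fw_idx = 0; fw_idx < fw; fw_idx++) {
                const uint8_t *src = in_pad + c * ihiw + fh_idx * iw_pad + fw_idx;
                uint8_t *dst = in_order + ((c * fh + fh_idx) * fw + fw_idx) * 8;
                for (uint32_t i = 0; i < 8; i++) {
                    dst[i] = src[in_h[i] * iw_pad + in_w[i]];  // gather 8 output positions
                }
            }
        }
    }
}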
+ +#ifdef _USE_FP16 +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" + +#include "cpu/arm/bnn/convolution_xnor.h" + +EE convolution_xnor_A76(TensorDesc inputDesc, + const F16 *input, + TensorDesc filterDesc, + const BIN8 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc scaleDesc, + const F16 *scaleArray, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(scaleDesc); + UNUSED(biasDesc); + UNUSED(tmpBytes); + UNUSED(activationDesc); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + if (fdf != DF_NCHWN16C8) { + CHECK_STATUS(NOT_MATCH); + } + if (!(ic == fc && oc == fn)) { + CHECK_STATUS(NOT_MATCH); + } + + oc /= 8; + ic /= 8; + + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + U32 ohow = oh * ow; + U32 ihiw = ih_pad * iw_pad; + + BIN8 *inArray = ((BIN8 *)tmp) + ic * ihiw + 8 * fh * fw * ic; // ic has been divided by 8 + BIN8 *inArray_pad; + + for (U32 n = 0; n < in; n++) { + const F16 *in = input + n * ic * ih * iw * 8; + for (U32 i = 0; i < ic * ih * iw; i++) { + BIN8 temp = 0; + for (U32 j = 0; j < 8; j++) { + if (in[i * 8 + j] >= 0) { + temp |= (1 << (7 - j)); // set + } + } + inArray[i] = temp; + } + + if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { + inArray_pad = inArray + n * ic * ih * iw; // ic has been divided by 8 + } else { + // copy input into a input with padding + inArray_pad = (BIN8 *)tmp; + BIN8 *inArray_pad_mov = inArray_pad; + BIN8 *inArray_mov = inArray + n * ic * ih * iw; + for (U32 c = 0; c < ic; c++) { // All divide by 8 + for (U32 h = 0; h < paddingT; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(DT_BIN11)); + inArray_pad_mov += iw_pad; + } + for (U32 h = paddingT; h < ih_pad - paddingB; h++) { + memset(inArray_pad_mov, 0, paddingL * bytesOf(DT_BIN11)); + inArray_pad_mov += paddingL; + memcpy(inArray_pad_mov, inArray_mov, iw * bytesOf(DT_BIN11)); + inArray_pad_mov += iw; + inArray_mov += iw; + memset(inArray_pad_mov, 0, paddingR * bytesOf(DT_BIN11)); + inArray_pad_mov += paddingR; + } + for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(DT_BIN11)); + inArray_pad_mov += iw_pad; + } + } + } + // ohow / 8 + short base_s = fh * fw * ic * 8; // For xnorNet, actual_sum = base_s - 2 * noOf1sFromXOR + short base_v[8]; // Assume the base can be represented as int16 + for (U32 i = 0; i < 8; i++) { + base_v[i] = base_s; + } + for (U32 hw = 0; hw < ohow - 7; hw += 8) { + const F16 *s0 = scaleArray; + const F16 *s1 = scaleArray + 8; + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + BIN8 *in_order = ((BIN8 *)tmp) + ic * ihiw; // ic has been divided by 8 + // reorder input + // NCHWc8 => NHWChw8c8 + im2col + U32 in_h[8]; + U32 in_w[8]; + for (U32 i = 0; i < 8; i++) { + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; + } + 
for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + BIN8 *in_hw8c8 = inArray_pad + c * ihiw + fh_idx * iw_pad + fw_idx; + // NHWChw8c8 + BIN8 *in_order_hw8c8 = in_order + c * fh * fw * 8 + fh_idx * fw * 8 + + fw_idx * 8; // This 8 comes from hw8 + for (U32 i = 0; i < 8; i++) { + in_order_hw8c8[i] = *(in_hw8c8 + in_h[i] * iw_pad + in_w[i]); + } + } + } + } + + // compute + for (U32 o = 0; o < oc; o += + 2) { // oc should be multiple of 32. It will at least be multiple of 16 in the future. + BIN8 *in_hw0 = in_order; + const BIN8 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; // ic has been divided by 8 + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // scale and bias + const F16 *s_o0 = s0; + const F16 *s_o1 = s1; + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr q4, [%[base]]\n" + "ldr q0, [%[f_0]]\n" // f_0 + "ldr d29, [%[in_0]]\n" // in_0 + /* Layout + 5 6 + 7 8 + 9 10 + 11 12 + + 13 14 + 15 16 + 17 18 + 19 20 + */ + "mov v5.16b, v4.16b\n" + "mov v6.16b, v4.16b\n" + "mov v7.16b, v4.16b\n" + "mov v8.16b, v4.16b\n" + "mov v9.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" // duplicate a full register + "mov v10.16b, v4.16b\n" + "dup v2.16b, v29.b[1]\n" + "mov v11.16b, v4.16b\n" + "mov v12.16b, v4.16b\n" + "mov v13.16b, v4.16b\n" + "mov v14.16b, v4.16b\n" + "mov v15.16b, v4.16b\n" + "mov v16.16b, v4.16b\n" + "mov v17.16b, v4.16b\n" + "mov v18.16b, v4.16b\n" + "mov v19.16b, v4.16b\n" + "mov v20.16b, v4.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + + "0:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "mov x9, %[fhfw]\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + "eor v25.16b, v25.16b, v25.16b\n" + "eor v26.16b, v26.16b, v26.16b\n" + "eor v27.16b, v27.16b, v27.16b\n" + "eor v28.16b, v28.16b, v28.16b\n" + + "mov x4, #4\n" + + "1:\n" + "eor v3.16b, v1.16b, v0.16b\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "add v21.16b, v21.16b, v3.16b\n" // Use add because the latency is shorter + "dup v2.16b, v29.b[3]\n" + + "add v22.16b, v22.16b, v4.16b\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[4]\n" + "add v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[5]\n" + "add v24.16b, v24.16b, v4.16b\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[6]\n" + "add v25.16b, v25.16b, v3.16b\n" + "dup v2.16b, v29.b[7]\n" + "add v26.16b, v26.16b, v4.16b\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "ldr d29, [x3, 8]!\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "ldr q0, [x0, 16]!\n" // next filter + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "add v27.16b, v27.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "add v28.16b, v28.16b, v4.16b\n" + "bne 1b\n" + + "movi v3.16b, #2\n" + "umlsl v5.8h, v21.8b, v3.8b\n" + "umlsl v7.8h, v22.8b, v3.8b\n" + "umlsl v9.8h, v23.8b, v3.8b\n" + "umlsl v11.8h, v24.8b, v3.8b\n" + "umlsl v13.8h, v25.8b, v3.8b\n" + "umlsl v15.8h, v26.8b, v3.8b\n" + "umlsl v17.8h, v27.8b, v3.8b\n" + "umlsl v19.8h, v28.8b, v3.8b\n" + + "umlsl2 v6.8h, v21.16b, v3.16b\n" + "umlsl2 v8.8h, v22.16b, 
v3.16b\n" + "umlsl2 v10.8h, v23.16b, v3.16b\n" + "umlsl2 v12.8h, v24.16b, v3.16b\n" + "umlsl2 v14.8h, v25.16b, v3.16b\n" + "umlsl2 v16.8h, v26.16b, v3.16b\n" + "umlsl2 v18.8h, v27.16b, v3.16b\n" + "umlsl2 v20.8h, v28.16b, v3.16b\n" + + "subs x9, x9, #1\n" + "beq 4f\n" // 1x1, continue with the next 32 input channels + + "2:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + "eor v25.16b, v25.16b, v25.16b\n" + "eor v26.16b, v26.16b, v26.16b\n" + "eor v27.16b, v27.16b, v27.16b\n" + "eor v28.16b, v28.16b, v28.16b\n" + + "mov x4, #32\n" // Assume 256 will not happen + "3:\n" + "eor v3.16b, v1.16b, v0.16b\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "uqadd v21.16b, v21.16b, v3.16b\n" + "dup v2.16b, v29.b[3]\n" + + "uqadd v22.16b, v22.16b, v4.16b\n" + "subs x4, x4, #1\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[4]\n" + "uqadd v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[5]\n" + "uqadd v24.16b, v24.16b, v4.16b\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[6]\n" + "uqadd v25.16b, v25.16b, v3.16b\n" + "dup v2.16b, v29.b[7]\n" + "uqadd v26.16b, v26.16b, v4.16b\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "ldr d29, [x3, 8]!\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "ldr q0, [x0, 16]!\n" // next filter + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "uqadd v27.16b, v27.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "uqadd v28.16b, v28.16b, v4.16b\n" + "bne 3b\n" + + "movi v3.16b, #2\n" // actual sum = base - 2 * noOf1s + "umlsl v5.8h, v21.8b, v3.8b\n" + "umlsl v7.8h, v22.8b, v3.8b\n" + "umlsl v9.8h, v23.8b, v3.8b\n" + "umlsl v11.8h, v24.8b, v3.8b\n" + "umlsl v13.8h, v25.8b, v3.8b\n" + "umlsl v15.8h, v26.8b, v3.8b\n" + "umlsl v17.8h, v27.8b, v3.8b\n" + "umlsl v19.8h, v28.8b, v3.8b\n" + + "umlsl2 v6.8h, v21.16b, v3.16b\n" + "umlsl2 v8.8h, v22.16b, v3.16b\n" + "umlsl2 v10.8h, v23.16b, v3.16b\n" + "umlsl2 v12.8h, v24.16b, v3.16b\n" + "umlsl2 v14.8h, v25.16b, v3.16b\n" + "umlsl2 v16.8h, v26.16b, v3.16b\n" + "umlsl2 v18.8h, v27.16b, v3.16b\n" + "umlsl2 v20.8h, v28.16b, v3.16b\n" + + "subs x9, x9, #8\n" + "bne 2b\n" + + "4:\n" // Wrap up computation for 32 input channels + "subs x2, x2, #32\n" + "bne 0b\n" + + // pipelined + "scvtf v5.8h, v5.8h\n" + "scvtf v6.8h, v6.8h\n" + "ldr q21, [%[b_0]]\n" + "ldr q22, [%[b_1]]\n" + "scvtf v7.8h, v7.8h\n" + "scvtf v8.8h, v8.8h\n" + "ldr q23, [%[s_0]]\n" + "ldr q24, [%[s_1]]\n" + "scvtf v9.8h, v9.8h\n" + "scvtf v10.8h, v10.8h\n" + "scvtf v11.8h, v11.8h\n" + "scvtf v12.8h, v12.8h\n" + "mov v1.16b, v21.16b\n" + "mov v2.16b, v22.16b\n" + "scvtf v13.8h, v13.8h\n" + "scvtf v14.8h, v14.8h\n" + "fmla v1.8h, v5.8h, v23.8h\n" + "fmla v2.8h, v6.8h, v24.8h\n" + "scvtf v15.8h, v15.8h\n" + "scvtf v16.8h, v16.8h\n" + "mov v3.16b, v21.16b\n" + "mov v4.16b, v22.16b\n" + "scvtf v17.8h, v17.8h\n" + "scvtf v18.8h, v18.8h\n" + "fmla v3.8h, v7.8h, v23.8h\n" + "fmla v4.8h, v8.8h, v24.8h\n" + "scvtf v19.8h, v19.8h\n" + "scvtf v20.8h, v20.8h\n" + "mov v5.16b, v21.16b\n" + "mov v6.16b, v22.16b\n" + + "fmla v5.8h, v9.8h, v23.8h\n" + "mov v7.16b, v21.16b\n" + "fmla v6.8h, v10.8h, v24.8h\n" + "mov v8.16b, v22.16b\n" + "fmla v7.8h, v11.8h, v23.8h\n" + "mov v9.16b, v21.16b\n" + "fmla v8.8h, v12.8h, v24.8h\n" + "mov 
v10.16b, v22.16b\n" + "fmla v9.8h, v13.8h, v23.8h\n" + "mov v11.16b, v21.16b\n" + "fmla v10.8h, v14.8h, v24.8h\n" + "mov v12.16b, v22.16b\n" + "fmla v11.8h, v15.8h, v23.8h\n" + "mov v13.16b, v21.16b\n" + "fmla v12.8h, v16.8h, v24.8h\n" + "mov v14.16b, v22.16b\n" + "fmla v13.8h, v17.8h, v23.8h\n" + "mov v15.16b, v21.16b\n" + "fmla v14.8h, v18.8h, v24.8h\n" + "mov v16.16b, v22.16b\n" + "fmla v15.8h, v19.8h, v23.8h\n" + "fmla v16.8h, v20.8h, v24.8h\n" + + "str q1, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q5, [%[out_0], #32]\n" // out_o0hw2 + "str q7, [%[out_0], #48]\n" // out_o0hw3 + "str q9, [%[out_0], #64]\n" // out_o0hw4 + "str q11, [%[out_0], #80]\n" // out_o0hw5 + "str q13, [%[out_0], #96]\n" // out_o0hw6 + "str q15, [%[out_0], #112]\n" // out_o0hw7 + + "str q2, [%[out_1]]\n" // out_o1hw0 + "str q4, [%[out_1], #16]\n" // out_o1hw1 + "str q6, [%[out_1], #32]\n" // out_o1hw2 + "str q8, [%[out_1], #48]\n" // out_o1hw3 + "str q10, [%[out_1], #64]\n" // out_o1hw4 + "str q12, [%[out_1], #80]\n" // out_o1hw5 + "str q14, [%[out_1], #96]\n" // out_o1hw6 + "str q16, [%[out_1], #112]\n" // out_o1hw7 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [fhfw] "r"((I64)fh * fw), [base] "r"(base_v), + [s_0] "r"(s_o0), [s_1] "r"(s_o1), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x0", "x2", "x3", + "x4", "x9"); + s0 += 16; + s1 += 16; + b0 += 16; + b1 += 16; + } + } + // ohow_remainder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + + for (U32 hw = ohow_s; hw < ohow - 3; hw += 4) { + const F16 *s0 = scaleArray; + const F16 *s1 = scaleArray + 8; + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + BIN8 *in_order = ((BIN8 *)tmp) + ic * ihiw; // ic has been divided by 8 + // reorder input + // NCHWc8 => NHWChw4c8 + im2col + U32 in_h[4]; + U32 in_w[4]; + for (U32 i = 0; i < 4; i++) { + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; + } + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + BIN8 *in_hw4c8 = inArray_pad + c * ihiw + fh_idx * iw_pad + fw_idx; + // NHWChw8c8 + BIN8 *in_order_hw4c8 = + in_order + c * fh * fw * 4 + fh_idx * fw * 4 + fw_idx * 4; + for (U32 i = 0; i < 4; i++) { + in_order_hw4c8[i] = *(in_hw4c8 + in_h[i] * iw_pad + in_w[i]); + } + } + } + } + + // compute + for (U32 o = 0; o < oc; o += + 2) { // oc should be multiple of 32. It will at least be multiple of 16 in the future. 
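+ // Annotation: each pass of the loop below computes a 4 x 16 output tile (4 spatial positions by two blocks of 8 output channels); the eight F16x8 accumulators v5..v12 start at base_v and are reduced by 2 * popcount through umlsl, mirroring the hw8 kernel above.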
+ BIN8 *in_hw0 = in_order; + const BIN8 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; // ic has been divided by 8 + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // scale and bias + const F16 *s_o0 = s0; + const F16 *s_o1 = s1; + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr q4, [%[base]]\n" + "ldr q0, [%[f_0]]\n" // f_0 + "ldr s29, [%[in_0]]\n" // in_0 + /* Layout + 5 6 + 7 8 + 9 10 + 11 12 + */ + "mov v5.16b, v4.16b\n" + "mov v6.16b, v4.16b\n" + "mov v7.16b, v4.16b\n" + "mov v8.16b, v4.16b\n" + "mov v9.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" // duplicate a full register + "mov v10.16b, v4.16b\n" + "dup v2.16b, v29.b[1]\n" + "mov v11.16b, v4.16b\n" + "mov v12.16b, v4.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + + "0:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "mov x9, %[fhfw]\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + + "mov x4, #4\n" + + "1:\n" + "eor v3.16b, v1.16b, v0.16b\n" + "eor v4.16b, v2.16b, v0.16b\n" + + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "add v21.16b, v21.16b, v3.16b\n" // Use add because the latency is shorter + "dup v2.16b, v29.b[3]\n" + + "add v22.16b, v22.16b, v4.16b\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "ldr s29, [x3, 4]!\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "ldr q0, [x0, 16]!\n" // next filter + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "add v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "add v24.16b, v24.16b, v4.16b\n" + "bne 1b\n" + + "movi v3.16b, #2\n" + "umlsl v5.8h, v21.8b, v3.8b\n" + "umlsl v7.8h, v22.8b, v3.8b\n" + "umlsl v9.8h, v23.8b, v3.8b\n" + "umlsl v11.8h, v24.8b, v3.8b\n" + + "umlsl2 v6.8h, v21.16b, v3.16b\n" + "umlsl2 v8.8h, v22.16b, v3.16b\n" + "umlsl2 v10.8h, v23.16b, v3.16b\n" + "umlsl2 v12.8h, v24.16b, v3.16b\n" + + "subs x9, x9, #1\n" + "beq 4f\n" // 1x1, continue with the next 32 input channels + + "2:\n" + "eor v21.16b, v21.16b, v21.16b\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + + "mov x4, #32\n" // Assume 256 will not happen + "3:\n" + "eor v3.16b, v1.16b, v0.16b\n" + "eor v4.16b, v2.16b, v0.16b\n" + + "cnt v3.16b, v3.16b\n" + "subs x4, x4, #1\n" + + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[2]\n" + + "uqadd v21.16b, v21.16b, v3.16b\n" + "dup v2.16b, v29.b[3]\n" + + "uqadd v22.16b, v22.16b, v4.16b\n" + + "eor v3.16b, v1.16b, v0.16b\n" + "ldr s29, [x3, 4]!\n" + "eor v4.16b, v2.16b, v0.16b\n" + "cnt v3.16b, v3.16b\n" + "ldr q0, [x0, 16]!\n" // next filter + "cnt v4.16b, v4.16b\n" + "dup v1.16b, v29.b[0]\n" + "uqadd v23.16b, v23.16b, v3.16b\n" + "dup v2.16b, v29.b[1]\n" + "uqadd v24.16b, v24.16b, v4.16b\n" + "bne 3b\n" + + "movi v3.16b, #2\n" + "umlsl v5.8h, v21.8b, v3.8b\n" + "umlsl v7.8h, v22.8b, v3.8b\n" + "umlsl v9.8h, v23.8b, v3.8b\n" + "umlsl v11.8h, v24.8b, v3.8b\n" + + "umlsl2 v6.8h, v21.16b, v3.16b\n" + "umlsl2 v8.8h, v22.16b, v3.16b\n" + "umlsl2 v10.8h, v23.16b, v3.16b\n" + "umlsl2 v12.8h, v24.16b, v3.16b\n" + + "subs x9, x9, #8\n" + "bne 2b\n" + + "4:\n" // Wrap up computation for 32 input channels + "subs x2, x2, #32\n" + "bne 0b\n" + + // pipelined + "scvtf v5.8h, v5.8h\n" + "scvtf v6.8h, v6.8h\n" + "ldr q21, [%[b_0]]\n" + "scvtf v7.8h, v7.8h\n" + "ldr q22, [%[b_1]]\n" + "scvtf v8.8h, v8.8h\n" 
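+ // Annotation: this epilogue interleaves the scale/bias loads with the int16-to-fp16 conversions so the fmla chain below does not stall on memory.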
+ "ldr q23, [%[s_0]]\n" + "scvtf v9.8h, v9.8h\n" + "ldr q24, [%[s_1]]\n" + "scvtf v10.8h, v10.8h\n" + "scvtf v11.8h, v11.8h\n" + "mov v1.16b, v21.16b\n" + "scvtf v12.8h, v12.8h\n" + "mov v2.16b, v22.16b\n" + "fmla v1.8h, v5.8h, v23.8h\n" + "fmla v2.8h, v6.8h, v24.8h\n" + "mov v3.16b, v21.16b\n" + "mov v4.16b, v22.16b\n" + "fmla v3.8h, v7.8h, v23.8h\n" + "fmla v4.8h, v8.8h, v24.8h\n" + "mov v5.16b, v21.16b\n" + "mov v6.16b, v22.16b\n" + + "fmla v5.8h, v9.8h, v23.8h\n" + "mov v7.16b, v21.16b\n" + "fmla v6.8h, v10.8h, v24.8h\n" + "mov v8.16b, v22.16b\n" + "fmla v7.8h, v11.8h, v23.8h\n" + "fmla v8.8h, v12.8h, v24.8h\n" + + "str q1, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q5, [%[out_0], #32]\n" // out_o0hw2 + "str q7, [%[out_0], #48]\n" // out_o0hw3 + + "str q2, [%[out_1]]\n" // out_o1hw0 + "str q4, [%[out_1], #16]\n" // out_o1hw1 + "str q6, [%[out_1], #32]\n" // out_o1hw2 + "str q8, [%[out_1], #48]\n" // out_o1hw3 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [fhfw] "r"((I64)fh * fw), [base] "r"(base_v), + [s_0] "r"(s_o0), [s_1] "r"(s_o1), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v21", "v22", "v23", "v24", + "v29", "v30", "x0", "x1", "x2", "x3", "x4", "x9"); + s0 += 16; + s1 += 16; + b0 += 16; + b1 += 16; + } + } + // ohow_reminder % 4 + ohow_s = (ohow / 4) * 4; + for (U32 hw = ohow_s; hw < ohow; hw++) { + const F16 *s0 = scaleArray; + const F16 *s1 = scaleArray + 8; + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + BIN8 *in_order = ((BIN8 *)tmp) + ic * ihiw; // ic has been divided by 8 + // reorder input + // NCHWc8 => NHWChw1c8 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + BIN8 *in_hw1c8 = inArray_pad + c * ihiw + fh_idx * iw_pad + fw_idx; + BIN8 *in_0 = in_hw1c8 + in_h_0 * iw_pad + in_w_0; + BIN8 *in_order_hw1c8 = in_order + c * fh * fw + fh_idx * fw + fw_idx; + *in_order_hw1c8 = (*in_0); + } + } + } + // compute + for (U32 o = 0; o < oc; o += 2) { + BIN8 *in_hw0 = in_order; + const BIN8 *f_o = filterArray + o * 8 * fh * fw * ic; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + + uint16x8_t sum[2] = {0}; + uint8x8_t v2 = vdup_n_u8(2); + for (U32 i = 0; i < ic * 8; i += 32) { + uint8x8_t sub0[2] = {0}; + + for (U32 j = 0; j < 4; j++) { + uint8x8_t f_0 = vld1_u8(f_o); + uint8x8_t f_1 = vld1_u8(f_o + 8); + f_o += 16; + uint8x8_t in_1 = vdup_n_u8(*in_hw0); + in_hw0++; + f_0 = veor_u8(in_1, f_0); + f_1 = veor_u8(in_1, f_1); + f_0 = vcnt_u8(f_0); + f_1 = vcnt_u8(f_1); + sub0[0] = vadd_u8(sub0[0], f_0); + sub0[1] = vadd_u8(sub0[1], f_1); + } + sum[0] = vmlal_u8(sum[0], sub0[0], v2); + sum[1] = vmlal_u8(sum[1], sub0[1], v2); + + for (U32 j = 1; j < fh * fw; j += 8) { + uint8x8_t sub1[2] = {0}; + for (U32 k = 0; k < 32; k++) { + uint8x8_t f_0 = vld1_u8(f_o); + uint8x8_t f_1 = vld1_u8(f_o + 8); + f_o += 16; + uint8x8_t in_1 = vdup_n_u8(*in_hw0); + in_hw0++; + f_0 = veor_u8(in_1, f_0); + f_1 = veor_u8(in_1, f_1); + f_0 = vcnt_u8(f_0); + f_1 = vcnt_u8(f_1); + sub1[0] = vadd_u8(sub1[0], f_0); + sub1[1] = vadd_u8(sub1[1], f_1); + } + sum[0] = vmlal_u8(sum[0], sub1[0], v2); + sum[1] = vmlal_u8(sum[1], sub1[1], v2); + } + } + 
short temp[16]; + vst1q_u16((uint16_t *)temp, sum[0]); + vst1q_u16((uint16_t *)(temp + 8), sum[1]); + int16x8_t base_abs = vdupq_n_s16(base_s); + int16x8_t ssum[2]; + ssum[0] = vld1q_s16(temp); + ssum[1] = vld1q_s16(temp + 8); + ssum[0] = vsubq_s16(base_abs, ssum[0]); + ssum[1] = vsubq_s16(base_abs, ssum[1]); + + float16x8_t res_o0 = vcvtq_f16_s16(ssum[0]); + float16x8_t res_o1 = vcvtq_f16_s16(ssum[1]); + float16x8_t scale_o0 = vld1q_f16(s0); + s0 += 16; + float16x8_t scale_o1 = vld1q_f16(s1); + s1 += 16; + float16x8_t bias_o0 = vld1q_f16(b0); + b0 += 16; + float16x8_t bias_o1 = vld1q_f16(b1); + b1 += 16; + res_o0 = vmulq_f16(res_o0, scale_o0); + res_o1 = vmulq_f16(res_o1, scale_o1); + res_o0 = vaddq_f16(res_o0, bias_o0); + res_o1 = vaddq_f16(res_o1, bias_o1); + vst1q_f16(out_o0hw0, res_o0); + vst1q_f16(out_o1hw0, res_o1); + } + } + } + return SUCCESS; +} +#endif diff --git a/tensor_computing/src/cpu/arm/bnn/tensor_computing_bnn.h b/compute/tensor/src/cpu/arm/bnn/tensor_computing_bnn.h similarity index 63% rename from tensor_computing/src/cpu/arm/bnn/tensor_computing_bnn.h rename to compute/tensor/src/cpu/arm/bnn/tensor_computing_bnn.h index 3dd951d3..38f4f28e 100644 --- a/tensor_computing/src/cpu/arm/bnn/tensor_computing_bnn.h +++ b/compute/tensor/src/cpu/arm/bnn/tensor_computing_bnn.h @@ -1,17 +1,16 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- #ifndef _H_TENSOR_COMPUTING_BNN #define _H_TENSOR_COMPUTING_BNN @@ -20,17 +19,20 @@ #include "cpu/arm/bnn/convolution_dorefa.h" #include "cpu/arm/bnn/convolution_xnor.h" -EE convolution_infer_forward_tmp_bytes_bnn(TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc, - ConvolutionDesc convDesc, ConvolutionForwardAlgorithm algorithm, U32 *bytes); - -EE convolution_bnn(TensorDesc inputDesc, const F16* input, - TensorDesc filterDesc, const BIN8* filter, - ConvolutionDesc convDesc, - TensorDesc scaleDesc, const F16* scale, - TensorDesc biasDesc, const F16* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* output, - ActivationDesc activationDesc, +EE convolution_bnn(TensorDesc inputDesc, + const F16 *input, + TensorDesc filterDesc, + const BIN8 *filter, + ConvolutionParamSpec convParamSpec, + TensorDesc scaleDesc, + const F16 *scale, + TensorDesc biasDesc, + const F16 *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *output, + ActivationParamSpec activationDesc, Arch arch); #endif #endif diff --git a/tensor_computing/src/cpu/arm/check.cpp b/compute/tensor/src/cpu/arm/check.cpp similarity index 62% rename from tensor_computing/src/cpu/arm/check.cpp rename to compute/tensor/src/cpu/arm/check.cpp index ac18a698..e4e1ac81 100644 --- a/tensor_computing/src/cpu/arm/check.cpp +++ b/compute/tensor/src/cpu/arm/check.cpp @@ -1,14 +1,14 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#include "cpu/arm/tensor_computing_arm.h" @@ -20,21 +20,27 @@ #include "cpu/arm/fp16/tensor_computing_fp16.h" #endif -EE check_u32(TensorDesc inputDescA, const U32* inputA, - TensorDesc inputDescB, const U32* inputB, +static EE check_u32(TensorDesc inputDescA, + const U32 *inputA, + TensorDesc inputDescB, + const U32 *inputB, CheckMode checkMode, - TensorDesc outputDesc, I32* output) + TensorDesc outputDesc, + I32 *output) { - if (nullptr == inputA || nullptr == inputB || nullptr == output) + if (nullptr == inputA || nullptr == inputB || nullptr == output) { CHECK_STATUS(NULL_POINTER); + } - if (tensorNumElements(inputDescA) != tensorNumElements(inputDescB)) + if (tensorNumElements(inputDescA) != tensorNumElements(inputDescB)) { CHECK_STATUS(NOT_MATCH); + } U32 size = tensorNumElements(inputDescA); - U32 loopOuter = inputDescA.dims[inputDescA.nDims-1]; - if (tensorNumElements(outputDesc) != loopOuter) + U32 loopOuter = inputDescA.dims[inputDescA.nDims - 1]; + if (tensorNumElements(outputDesc) != loopOuter) { CHECK_STATUS(NOT_MATCH); + } I32 length = size / loopOuter; for (U32 j = 0; j < loopOuter; j++) { const U32 *arrayA = inputA + j * length; @@ -43,15 +49,17 @@ EE check_u32(TensorDesc inputDescA, const U32* inputA, case CHECK_EQUAL: { uint32x4_t count_v = vdupq_n_u32(0); I32 i = 0; - for (; i < length-3; i+=4) { + for (; i < length - 3; i += 4) { uint32x4_t a = vld1q_u32(arrayA + i); uint32x4_t b = vld1q_u32(arrayA + i); count_v = vaddq_u32(count_v, vceqq_u32(a, b)); } I32 count = vaddvq_u32(count_v); - for (; i < length; i++) - if (arrayA[i] == arrayB[i]) - count ++; + for (; i < length; i++) { + if (arrayA[i] == arrayB[i]) { + count++; + } + } output[j] = (count == length); break; } @@ -63,44 +71,39 @@ EE check_u32(TensorDesc inputDescA, const U32* inputA, return SUCCESS; } -EE check_arm(TensorDesc inputDescA, const void* inputA, - TensorDesc inputDescB, const void* inputB, - CheckMode checkMode, - TensorDesc outputDesc, void* output) +EE check_arm(TensorDesc inputDescA, + const void *inputA, + TensorDesc inputDescB, + const void *inputB, + CheckParamSpec p, + TensorDesc outputDesc, + void *output) { DataType idt = inputDescA.dt; EE ret = SUCCESS; switch (idt) { #ifdef _USE_FP32 case DT_F32: { - ret = check_fp32(inputDescA, (const F32*)inputA, - inputDescB, (const F32*)inputB, - checkMode, - outputDesc, (I32*)output); + ret = check_fp32(inputDescA, (const F32 *)inputA, inputDescB, (const F32 *)inputB, + p.check_mode, outputDesc, (I32 *)output); break; } #endif #ifdef _USE_FP16 case DT_F16: { - ret = check_fp16(inputDescA, (const F16*)inputA, - inputDescB, (const F16*)inputB, - checkMode, - outputDesc, (I32*)output); + ret = check_fp16(inputDescA, (const F16 *)inputA, inputDescB, (const F16 *)inputB, + p.check_mode, outputDesc, (I32 *)output); break; } #endif case DT_U32: { - ret = check_u32(inputDescA, (const U32*)inputA, - inputDescB, (const U32*)inputB, - checkMode, - outputDesc, (I32*)output); + ret = check_u32(inputDescA, (const U32 *)inputA, inputDescB, (const U32 *)inputB, + p.check_mode, outputDesc, (I32 *)output); break; } case DT_I32: { - ret = check_u32(inputDescA, (const U32*)inputA, - inputDescB, (const U32*)inputB, - checkMode, - outputDesc, (I32*)output); + ret = check_u32(inputDescA, (const U32 *)inputA, inputDescB, (const U32 *)inputB, + p.check_mode, outputDesc, (I32 *)output); break; } default: diff --git a/tensor_computing/src/cpu/arm/clip.cpp b/compute/tensor/src/cpu/arm/clip.cpp similarity index 75% rename from tensor_computing/src/cpu/arm/clip.cpp rename to 
compute/tensor/src/cpu/arm/clip.cpp index 1e061223..e5de09c7 100644 --- a/tensor_computing/src/cpu/arm/clip.cpp +++ b/compute/tensor/src/cpu/arm/clip.cpp @@ -1,17 +1,16 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include "cpu/arm/tensor_computing_arm.h" #ifdef _USE_FP32 #include "cpu/arm/fp32/tensor_computing_fp32.h" @@ -20,25 +19,20 @@ #include "cpu/arm/fp16/tensor_computing_fp16.h" #endif -EE clip_arm(void *minValue, void *maxValue, TensorDesc inputDesc, void* input, TensorDesc outputDesc, void *output) +EE clip_arm(TensorDesc inputDesc, void *input, ClipParamSpec p, TensorDesc outputDesc, void *output) { UNUSED(outputDesc); - - if (nullptr == minValue - || nullptr == maxValue) - CHECK_STATUS(NULL_POINTER); - EE ret = SUCCESS; switch (inputDesc.dt) { #ifdef _USE_FP32 case DT_F32: { - ret = clip_fp32((F32 *)input, (F32 *)output, tensorNumElements(inputDesc), *((F32 *)minValue), *((F32 *)maxValue)); + ret = clip_fp32((F32 *)input, (F32 *)output, tensorNumElements(inputDesc), p.min, p.max); break; } #endif #ifdef _USE_FP16 case DT_F16: { - ret = clip_fp16((F16 *)input, (F16 *)output, tensorNumElements(inputDesc), *((F32 *)minValue), *((F32 *)maxValue)); + ret = clip_fp16((F16 *)input, (F16 *)output, tensorNumElements(inputDesc), p.min, p.max); break; } #endif diff --git a/compute/tensor/src/cpu/arm/convolution.cpp b/compute/tensor/src/cpu/arm/convolution.cpp new file mode 100644 index 00000000..a14c003e --- /dev/null +++ b/compute/tensor/src/cpu/arm/convolution.cpp @@ -0,0 +1,492 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include <vector> +#include <float.h> +#include "cpu/arm/tensor_computing_arm.h" +#ifdef _USE_FP32 +#include "cpu/arm/fp32/tensor_computing_fp32.h" +#endif +#ifdef _USE_FP16 +#include "cpu/arm/fp16/tensor_computing_fp16.h" +#endif +#ifdef _USE_INT8 +#include "cpu/arm/int8/tensor_computing_int8.h" +#endif +#ifdef _USE_FP16 +#include "cpu/arm/bnn/tensor_computing_bnn.h" +#endif +#include "ut_util.h" + +EE convolution_infer_forward_algorithm_arm(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionPolicy policy, + ConvolutionForwardAlgorithm *algorithm, + DataType targetDataType) +{ + UNUSED(outputDesc); + if (nullptr == algorithm) { + CHECK_STATUS(NULL_POINTER); + } + if (*algorithm != CONVOLUTION_ALGORITHM_NULL) { + return SUCCESS; + } + + EE ret = SUCCESS; + if (policy == CONVOLUTION_FASTEST) { + DataType idt, fdt; + DataFormat idf, fdf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + U32 group = convParamSpec.group; + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + if (dilateH > 1 || dilateW > 1) { + *algorithm = CONVOLUTION_ALGORITHM_GEMM; + return SUCCESS; + } + + if ((idf != DF_NCHWC8 || ic / group % 8 != 0) && DT_I8 != idt) { + *algorithm = CONVOLUTION_ALGORITHM_GEMM_ICNCHW; + } else if (fh == 3 && fw == 3 && strideH == 1 && strideW == 1 && paddingT == 1 && + paddingB == 1 && paddingL == 1 && paddingR == 1) { + *algorithm = CONVOLUTION_ALGORITHM_WINOGRAD; + } else { + *algorithm = CONVOLUTION_ALGORITHM_GEMM; + } + + switch (targetDataType) { + case DT_BIN01: { + *algorithm = CONVOLUTION_ALGORITHM_BNN; + break; + } + case DT_BIN11: { + *algorithm = CONVOLUTION_ALGORITHM_BNN; + break; + } + case DT_I8: { + if (*algorithm == CONVOLUTION_ALGORITHM_WINOGRAD) { + *algorithm = CONVOLUTION_ALGORITHM_GEMM; + } + break; + } + default: + break; + } + +#ifndef __aarch64__ + if (CONVOLUTION_ALGORITHM_GEMM_ICNCHW != *algorithm) { + *algorithm = CONVOLUTION_ALGORITHM_GEMM; + } + return SUCCESS; +#endif + } else if (policy == CONVOLUTION_TUNNING) { + std::vector<ConvolutionForwardAlgorithm> convolutionAlgorithms; + U32 filterBytes = 0; + U32 tmpBytes = 0; + for (U32 i = 0; i < convolutionAlgorithms.size(); i++) { + U32 bytes = 0; + CHECK_STATUS(convolution_transform_filter_bytes_arm( + filterDesc, convParamSpec, convolutionAlgorithms[i], &bytes)); + filterBytes = (bytes > filterBytes) ? bytes : filterBytes; + CHECK_STATUS(convolution_infer_forward_tmp_bytes_arm(inputDesc, filterDesc, outputDesc, + convParamSpec, convolutionAlgorithms[i], &bytes)); + tmpBytes = (bytes > tmpBytes) ? bytes : tmpBytes; + } + TensorDesc biasDesc = tensor1d(filterDesc.dt, outputDesc.dims[3]); + TensorDesc scaleDesc = tensor1d(DT_F32, outputDesc.dims[2]); + U8 *input = ut_input_v(tensorNumElements(inputDesc), inputDesc.dt, UT_INIT_RANDOM); + U8 *filter = ut_input_v(tensorNumElements(filterDesc), filterDesc.dt, UT_INIT_RANDOM); + U8 *filterTransformed = + ut_input_v(filterBytes / bytesOf(filterDesc.dt), filterDesc.dt, UT_INIT_RANDOM); + U8 *bias = ut_input_v(tensorNumElements(biasDesc), biasDesc.dt, UT_INIT_RANDOM); + U8 *scale = ut_input_v(tensorNumElements(scaleDesc), scaleDesc.dt, UT_INIT_RANDOM); + U8 *tmp = ut_input_v(tmpBytes / bytesOf(inputDesc.dt), inputDesc.dt, UT_INIT_ZERO); + U8 *output = ut_input_v(tensorNumElements(outputDesc), outputDesc.dt, UT_INIT_ZERO); + U32 algorithmIndex = 0; + double timeMin = FLT_MAX; // track the best time across all candidate algorithms + ActivationParamSpec activationDesc; + activationDesc.mode = ACTIVATION_RELU; + activationDesc.value[0] = 0; + for (U32 i = 0; i < convolutionAlgorithms.size(); i++) { + TensorDesc ftmDesc; + CHECK_STATUS(convolution_transform_filter_arm(filterDesc, filter, convParamSpec, + convolutionAlgorithms[i], &ftmDesc, filterTransformed)); + + memset(tmp, 0, tmpBytes); + double timeStart = ut_time_ms(); + CHECK_STATUS(convolution_arm(inputDesc, input, ftmDesc, filterTransformed, + convParamSpec, convolutionAlgorithms[i], scaleDesc, scale, biasDesc, bias, tmpBytes, + tmp, outputDesc, output, activationDesc, ARM_A76)); + double timeEnd = ut_time_ms(); + if (timeMin > timeEnd - timeStart) { + timeMin = timeEnd - timeStart; + algorithmIndex = i; + } + } + free(input); + free(filter); + free(filterTransformed); + free(bias); + free(scale); + free(tmp); + free(output); + *algorithm = convolutionAlgorithms[algorithmIndex]; + ret = SUCCESS; + } else { + ret = NOT_SUPPORTED; + } + return ret; +} + +EE convolution_transform_filter_bytes_arm(TensorDesc filterDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes) +{ + if (nullptr == bytes) { + CHECK_STATUS(NULL_POINTER); + } + EE ret = SUCCESS; + + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + U32 fnAlignSize = 8; + if (filterDesc.dt == DT_F16) { + fnAlignSize = 16; + } + U32 fnGroupSize = fn / convParamSpec.group; + U32 fnPadding = (fnGroupSize / fnAlignSize + ((fnGroupSize % fnAlignSize) == 0 ? 
0 : 1)) * + fnAlignSize * convParamSpec.group; + switch (algorithm) { + case CONVOLUTION_ALGORITHM_WINOGRAD: + *bytes = fnPadding * fc * 6 * 6; + break; + case CONVOLUTION_ALGORITHM_DIRECT: + *bytes = fnPadding * fc * fh * fw; + break; + case CONVOLUTION_ALGORITHM_GEMM: + *bytes = fnPadding * fc * fh * fw; + break; + case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: + *bytes = fnPadding * fc * fh * fw; + break; + case CONVOLUTION_ALGORITHM_BNN: + *bytes = fnPadding * fc * fh * fw; + break; + default: + return NOT_SUPPORTED; + } + *bytes *= bytesOf(fdt); + + switch (filterDesc.dt) { + case DT_BIN01: { + *bytes /= 8; + break; + } + case DT_BIN11: { + *bytes /= 8; + break; + } + default: + break; + } + *bytes += 32; + return ret; +} + +EE convolution_transform_filter_arm(TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + void *filterTransformed) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = convolution_transform_filter_fp32(filterDesc, (F32 *)filter, convParamSpec, + algorithm, ftmDesc, (F32 *)filterTransformed); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = convolution_transform_filter_fp16(filterDesc, (F16 *)filter, convParamSpec, + algorithm, ftmDesc, (F16 *)filterTransformed); + break; + } +#endif +#ifdef _USE_INT8 + case DT_I8: { + ret = convolution_transform_filter_int8( + filterDesc, filter, convParamSpec, algorithm, ftmDesc, filterTransformed); + break; + } + case DT_F16_8Q: { + ret = convolution_transform_filter_int8( + filterDesc, filter, convParamSpec, algorithm, ftmDesc, filterTransformed); + break; + } +#endif +#ifdef _USE_FP16 + case DT_BIN01: { + ret = convolution_transform_filter_bnn( + filterDesc, (BIN8 *)filter, ftmDesc, (BIN8 *)filterTransformed); + break; + } + case DT_BIN11: { + ret = convolution_transform_filter_bnn( + filterDesc, (BIN8 *)filter, ftmDesc, (BIN8 *)filterTransformed); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE convolution_infer_forward_tmp_bytes_arm(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes) +{ + if (nullptr == bytes) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + U32 tile_size = 0; + switch (fdt) { + case DT_F32: +#ifdef __aarch64__ + tile_size = 12; +#else + tile_size = 6; +#endif + break; + case DT_F16: + tile_size = 8; + break; + case DT_I8: + tile_size = 12; + break; + case DT_BIN01: + tile_size = 0; + break; + case DT_BIN11: + tile_size = 0; + break; + default: + return NOT_SUPPORTED; + } + EE ret = SUCCESS; + U32 element_size = bytesOf(idt); + *bytes = (ic * ih_pad * iw_pad) * element_size; + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: + break; + case CONVOLUTION_ALGORITHM_GEMM: + *bytes += tile_size * fh * fw * ic * OMP_NUM_THREADS * 
element_size; + if (fdt == DT_I8) { + *bytes += ic * ih * iw; + } + if (odt == DT_I8) { + // scaled bias + results before quantization + *bytes += (oc + on * oc * oh * ow) * bytesOf(DT_I32); + } + break; + case CONVOLUTION_ALGORITHM_WINOGRAD: { + U32 tile_h = (oh + 3) / 4; + U32 tile_w = (ow + 3) / 4; + U32 pad_left = paddingL; + U32 pad_right = paddingR + (tile_w * 4 - ow); + U32 pad_top = paddingT; + U32 pad_bottom = paddingB + (tile_h * 4 - oh); + ih_pad = ih + pad_top + pad_bottom; + iw_pad = iw + pad_left + pad_right; + *bytes = ic * ih_pad * iw_pad * element_size; + if (fdt == DT_F32) { + *bytes += (ic + 8) * 6 * 6 * 12 * element_size; + } else if (fdt == DT_F16) { + *bytes += (ic + oc) * 6 * 6 * 8 * element_size; + } else if (fdt == DT_I8) { + // itm (int16 for int8 inputs) and otm (otm just contains o8 each time) + *bytes += (ic + 8) * 6 * 6 * 12 * bytesOf(DT_F16); + // quantized transformed input + *bytes += ic * 6 * 6 * 12; + if (odt == DT_I8) { + // Output before quantization + *bytes += on * oc * oh * ow * bytesOf(DT_F16); + } + } else { + ret = NOT_SUPPORTED; + } + break; + } + case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: + *bytes += tile_size * fh * fw * ic * element_size; + break; + case CONVOLUTION_ALGORITHM_BNN: + *bytes += (8 * fh * fw * ic + ic * ih * iw) * element_size; + *bytes /= 8; + break; + default: + ret = NOT_MATCH; + break; + } + if (DT_I8 == fdt && DF_NCHW == idf) { + CHECK_REQUIREMENT(ic % 8 == 0); + *bytes += tensorNumBytes(inputDesc); + } + *bytes += 32; + + // pre data processing space for not complete NCHWC8 group convolution input + U32 icGroupSize = ic / convParamSpec.group; + if (idf == DF_NCHWC8 && icGroupSize % 8 != 0) { + *bytes += tensorNumBytes(inputDesc); + } + return ret; +} + +EE convolution_arm(TensorDesc inputDesc, + void *input, + TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc scaleDesc, + const void *scale, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec activationDesc, + Arch arch) +{ + UNUSED(scaleDesc); + UNUSED(scale); + U32 group = convParamSpec.group; + U32 batchAxis = inputDesc.nDims - 1; + U32 dataChannelAxis = inputDesc.nDims - 2; + U32 filterChannelAxis = filterDesc.nDims - 1; + U32 biasChannelAxis = 0; + CHECK_REQUIREMENT(inputDesc.dims[batchAxis] == 1); + U32 icGroupSize = inputDesc.dims[dataChannelAxis] / group; + // pre data processing space for not complete NCHWC8 group convolution input + void *inputTransform; + if (inputDesc.df == DF_NCHWC8 && icGroupSize % 8 != 0) { + TensorDesc tmpInputDesc = inputDesc; + tmpInputDesc.df = DF_NCHW; + transformToNCHW(inputDesc, input, tmpInputDesc, tmp); + inputTransform = tmp; + tmp = (U8 *)tmp + tensorNumBytes(tmpInputDesc); + tmpBytes -= tensorNumBytes(tmpInputDesc); + inputDesc.df = DF_NCHW; + } else { + inputTransform = input; + } + TensorDesc tmpInputDesc = inputDesc; + tmpInputDesc.dims[dataChannelAxis] /= group; + TensorDesc tmpOutputDesc = outputDesc; + tmpOutputDesc.dims[dataChannelAxis] /= group; + TensorDesc tmpFilterDesc = filterDesc; + tmpFilterDesc.dims[filterChannelAxis] /= group; + TensorDesc tmpBiasDesc = biasDesc; + tmpBiasDesc.dims[biasChannelAxis] /= group; + EE ret = SUCCESS; + for (U32 g = 0; g < group; g++) { + void *tmpInput = (U8 *)inputTransform + g * tensorNumBytes(tmpInputDesc); + const void *tmpFilter = (U8 *)filter + g * tensorNumBytes(tmpFilterDesc); + const void *tmpBias = (U8 *)bias + g * 
tensorNumBytes(tmpBiasDesc); + void *tmpOutput = (U8 *)output + g * tensorNumBytes(tmpOutputDesc); + switch (filterDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = convolution_fp32(tmpInputDesc, (F32 *)tmpInput, tmpFilterDesc, + (F32 *)tmpFilter, convParamSpec, algorithm, tmpBiasDesc, (F32 *)tmpBias, + tmpBytes, tmp, tmpOutputDesc, (F32 *)tmpOutput, activationDesc, arch); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = convolution_fp16(tmpInputDesc, (F16 *)tmpInput, tmpFilterDesc, + (F16 *)tmpFilter, convParamSpec, algorithm, tmpBiasDesc, (F16 *)tmpBias, + tmpBytes, tmp, tmpOutputDesc, (F16 *)tmpOutput, activationDesc, arch); + break; + } +#endif +#ifdef _USE_INT8 + case DT_I8: { + ret = convolution_int8(tmpInputDesc, (INT8 *)tmpInput, tmpFilterDesc, + (INT8 *)tmpFilter, (F16 *)scale, convParamSpec, algorithm, tmpBiasDesc, + (F16 *)tmpBias, tmpBytes, tmp, tmpOutputDesc, tmpOutput, activationDesc, arch); + break; + } +#endif +#ifdef _USE_FP16 + case DT_BIN01: { + ret = convolution_bnn(tmpInputDesc, (F16 *)tmpInput, tmpFilterDesc, + (BIN8 *)tmpFilter, convParamSpec, scaleDesc, (F16 *)scale, tmpBiasDesc, + (F16 *)tmpBias, tmpBytes, tmp, tmpOutputDesc, (F16 *)tmpOutput, activationDesc, + arch); + break; + } + case DT_BIN11: { + ret = convolution_bnn(tmpInputDesc, (F16 *)tmpInput, tmpFilterDesc, + (BIN8 *)tmpFilter, convParamSpec, scaleDesc, (F16 *)scale, tmpBiasDesc, + (F16 *)tmpBias, tmpBytes, tmp, tmpOutputDesc, (F16 *)tmpOutput, activationDesc, + arch); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/deconvolution.cpp b/compute/tensor/src/cpu/arm/deconvolution.cpp new file mode 100644 index 00000000..6c13bc8f --- /dev/null +++ b/compute/tensor/src/cpu/arm/deconvolution.cpp @@ -0,0 +1,49 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
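+ // Annotation: like the other entry points in this directory, deconvolution_transform_filter_arm below only dispatches on filterDesc.dt; each precision branch is compiled in only when its _USE_FP32 / _USE_FP16 macro is defined, so unused precisions add no code. A hypothetical call site (names assumed for illustration): + //     TensorDesc ftmDesc; + //     CHECK_STATUS(deconvolution_transform_filter_arm(filterDesc, filterPtr, CONVOLUTION_ALGORITHM_GEMM, &ftmDesc, transformedBuf));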
+ +#include "cpu/arm/tensor_computing_arm.h" +#ifdef _USE_FP32 +#include "cpu/arm/fp32/tensor_computing_fp32.h" +#endif +#ifdef _USE_FP16 +#include "cpu/arm/fp16/tensor_computing_fp16.h" +#endif + +EE deconvolution_transform_filter_arm(TensorDesc filterDesc, + const void *filter, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + void *filterTransformed) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = deconvolution_transform_filter_fp32( + filterDesc, (F32 *)filter, algorithm, ftmDesc, (F32 *)filterTransformed); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = deconvolution_transform_filter_fp16( + filterDesc, (F16 *)filter, algorithm, ftmDesc, (F16 *)filterTransformed); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/depthwise_convolution.cpp b/compute/tensor/src/cpu/arm/depthwise_convolution.cpp new file mode 100644 index 00000000..c19516fa --- /dev/null +++ b/compute/tensor/src/cpu/arm/depthwise_convolution.cpp @@ -0,0 +1,125 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
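+ // Annotation: depthwise convolution applies one fh x fw filter per channel, so output channel c depends only on input channel c: + //     out[c][y][x] = bias[c] + sum over (i, j) of in[c][y * strideH + i][x * strideW + j] * flt[c][i][j] + // depthwise_convolution_arm at the end of this file therefore forwards to the fused depthwise + pointwise implementation with a null pointwise filter, so both cases share one code path.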
+ +#include "cpu/arm/tensor_computing_arm.h" +#ifdef _USE_FP32 +#include "cpu/arm/fp32/tensor_computing_fp32.h" +#endif +#ifdef _USE_FP16 +#include "cpu/arm/fp16/tensor_computing_fp16.h" +#endif +#ifdef _USE_INT8 +#include "cpu/arm/int8/tensor_computing_int8.h" +#endif + +EE depthwise_convolution_transform_filter_arm(TensorDesc filterDesc, + const void *filter, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + void *filterTransformed) +{ + DataFormat ftmDataFormat; + switch (algorithm) { + case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: + ftmDataFormat = DF_NCHWC8; + break; + default: + return NOT_MATCH; + } + *ftmDesc = filterDesc; + ftmDesc->df = ftmDataFormat; + EE ret = NOT_SUPPORTED; + if (filterDesc.df == ftmDataFormat) { + memcpy(filterTransformed, filter, tensorNumBytes(filterDesc)); + ret = SUCCESS; + } else if (filterDesc.df == DF_NCHW) { + if (ftmDataFormat == DF_NCHWC8) { + ret = transformNCHWToNCHWC8(filterDesc, filter, *ftmDesc, filterTransformed); + } + } + return ret; +} + +EE depthwise_convolution_infer_forward_tmp_bytes_arm(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + U32 *bytes) +{ + if (nullptr == bytes) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + EE ret = SUCCESS; + switch (algorithm) { + case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: + *bytes = ic * ih_pad * iw_pad; + break; + default: { + ret = NOT_MATCH; + *bytes = 0; + break; + } + } + *bytes *= bytesOf(idt); + + switch (filterDesc.dt) { +#ifdef _USE_INT8 + case DT_I8: { + *bytes += ic * oh * ow * sizeof(I32); + break; + } +#endif + default: + break; + } + *bytes += 32; + return ret; +} + +EE depthwise_convolution_arm(TensorDesc inputDesc, + void *input, + TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec depthwiseActivationParamSpec, + Arch arch) +{ + TensorDesc blankTensorDesc; + ActivationParamSpec blankActivationParamSpec; + return depthwise_pointwise_convolution_arm(inputDesc, input, filterDesc, filter, blankTensorDesc, + nullptr, convParamSpec, algorithm, blankTensorDesc, bias, biasDesc, nullptr, tmpBytes, tmp, + outputDesc, output, depthwiseActivationParamSpec, blankActivationParamSpec, arch); +} diff --git a/compute/tensor/src/cpu/arm/depthwise_pointwise_convolution.cpp b/compute/tensor/src/cpu/arm/depthwise_pointwise_convolution.cpp new file mode 100644 index 00000000..34058bdf --- /dev/null +++ b/compute/tensor/src/cpu/arm/depthwise_pointwise_convolution.cpp @@ -0,0 +1,211 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/tensor_computing_arm.h" +#ifdef _USE_FP32 +#include "cpu/arm/fp32/tensor_computing_fp32.h" +#endif +#ifdef _USE_FP16 +#include "cpu/arm/fp16/tensor_computing_fp16.h" +#endif +#ifdef _USE_INT8 +#include "cpu/arm/int8/tensor_computing_int8.h" +#endif + +EE depthwise_pointwise_convolution_infer_forward_algorithm_arm(TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionPolicy policy, + DepthwiseConvolutionForwardAlgorithm *algorithm, + DataType targetDataType) +{ + UNUSED(policy); + if (nullptr == algorithm) { + CHECK_STATUS(NULL_POINTER); + } + EE ret = SUCCESS; + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + *algorithm = DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT; + if (convParamSpec.dilatedRate_h != 1 || convParamSpec.dilatedRate_w != 1) { + return ret; + } + + switch (targetDataType) { + case DT_F16: { + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + if (fh == 3 && fw == 3 && strideH == 1 && strideW == 1 && paddingT == 1 && + paddingB == 1 && paddingL == 1 && paddingR == 1 && ow % 4 == 0 && ow >= 12) { + *algorithm = DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_3X3S1P1; + } + break; + } + default: { + break; + } + } + return ret; +} + +EE depthwise_pointwise_convolution_transform_filter_arm(TensorDesc dwFilterDesc, + const void *dwFilter, + TensorDesc pwFilterDesc, + const void *pwFilter, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc *dwFtmDesc, + void *dwFilterTransformed, + TensorDesc *pwFtmDesc, + void *pwFilterTransformed) +{ + EE ret = depthwise_convolution_transform_filter_arm(dwFilterDesc, dwFilter, + DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT, dwFtmDesc, dwFilterTransformed); + if (ret == SUCCESS) { + convParamSpec.group = 1; + ret = convolution_transform_filter_arm(pwFilterDesc, pwFilter, convParamSpec, + CONVOLUTION_ALGORITHM_GEMM, pwFtmDesc, pwFilterTransformed); + 
} + return ret; +} + +EE depthwise_pointwise_convolution_infer_forward_tmp_bytes_arm(TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + U32 *bytes) +{ + if (nullptr == bytes) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, odt; + DataFormat idf, odf; + U32 in, ic, ih, iw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + EE ret = SUCCESS; + switch (algorithm) { + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: + *bytes = ic * ih_pad * iw_pad + ic * oh * ow; + break; + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT_NO_PADDING: + *bytes = ic * oh * ow; + break; + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_3X3S1P1: + *bytes = ic * oh * ow + ic * 8; + break; + default: { + ret = NOT_MATCH; + *bytes = 0; + break; + } + } + *bytes *= bytesOf(idt); + + switch (dwFilterDesc.dt) { +#ifdef _USE_INT8 + case DT_I8: { + *bytes += ic * oh * ow * sizeof(I32); + break; + } +#endif + default: + break; + } + *bytes += 32; + return ret; +} + +EE depthwise_pointwise_convolution_arm(TensorDesc inputDesc, + void *input, + TensorDesc dwFilterDesc, + const void *dwFilter, + TensorDesc pwFilterDesc, + const void *pwFilter, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc dwBiasDesc, + const void *dwBias, + TensorDesc pwBiasDesc, + const void *pwBias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch) +{ + EE ret = SUCCESS; + switch (dwFilterDesc.dt) { +#ifdef _USE_FP16 + case DT_F16: { + ret = depthwise_pointwise_convolution_fp16(inputDesc, (F16 *)input, dwFilterDesc, + (const F16 *)dwFilter, pwFilterDesc, (const F16 *)pwFilter, convParamSpec, + algorithm, dwBiasDesc, (const F16 *)dwBias, pwBiasDesc, (const F16 *)pwBias, + tmpBytes, tmp, outputDesc, (F16 *)output, depthwiseActivationParamSpec, + pointwiseActivationParamSpec, arch); + break; + } +#endif +#ifdef _USE_FP32 + case DT_F32: { + ret = depthwise_pointwise_convolution_fp32(inputDesc, (F32 *)input, dwFilterDesc, + (const F32 *)dwFilter, pwFilterDesc, (const F32 *)pwFilter, convParamSpec, + algorithm, dwBiasDesc, (const F32 *)dwBias, pwBiasDesc, (const F32 *)pwBias, + tmpBytes, tmp, outputDesc, (F32 *)output, depthwiseActivationParamSpec, + pointwiseActivationParamSpec, arch); + break; + } +#endif +#ifdef _USE_INT8 + case DT_I8: { + ret = depthwise_pointwise_convolution_int8(inputDesc, (INT8 *)input, dwFilterDesc, + (const INT8 *)dwFilter, pwFilterDesc, (const INT8 *)pwFilter, convParamSpec, + algorithm, dwBiasDesc, (const I32 *)dwBias, pwBiasDesc, (const I32 *)pwBias, + tmpBytes, tmp, outputDesc, (I32 *)output, depthwiseActivationParamSpec, + pointwiseActivationParamSpec, arch); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/tensor_computing/src/cpu/arm/priorbox.cpp b/compute/tensor/src/cpu/arm/eltwise.cpp similarity index 72% rename from tensor_computing/src/cpu/arm/priorbox.cpp rename 
to compute/tensor/src/cpu/arm/eltwise.cpp index 4565f956..fcc8db99 100644 --- a/tensor_computing/src/cpu/arm/priorbox.cpp +++ b/compute/tensor/src/cpu/arm/eltwise.cpp @@ -1,43 +1,50 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#include "cpu/arm/tensor_computing_arm.h" -#ifdef _USE_FP32 -#include "cpu/arm/fp32/tensor_computing_fp32.h" -#endif -#ifdef _USE_FP16 -#include "cpu/arm/fp16/tensor_computing_fp16.h" -#endif - -EE priorbox_arm(std::vector inputDesc, PriorBoxDesc priorboxDesc, TensorDesc outputDesc, void* output) -{ - EE ret = SUCCESS; - switch (inputDesc[0].dt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = priorbox_fp32(inputDesc, priorboxDesc, outputDesc, (F32*)output); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = priorbox_fp16(inputDesc, priorboxDesc, outputDesc, (F16*)output); - break; - } -#endif - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
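// Reference sketch (illustrative, not part of the patch): the scalar contract
// that eltwise_arm dispatches below, shown for the simple case where all `num`
// inputs share the same length `len`. The real kernels also take per-input
// sizes (inputSize) so that shorter inputs can be broadcast; that detail is
// omitted here and all names in this sketch are ours.
#include <cstddef>
#include <vector>

enum class EltMode { Sum, Max, Prod };

inline void eltwise_ref(
    const std::vector<const float *> &in, int len, float *out, EltMode mode)
{
    for (int i = 0; i < len; i++) {
        float v = in[0][i];  // first operand seeds the accumulator
        for (std::size_t j = 1; j < in.size(); j++) {
            float x = in[j][i];
            if (mode == EltMode::Sum) {
                v += x;
            } else if (mode == EltMode::Max) {
                v = (v > x) ? v : x;
            } else {
                v *= x;
            }
        }
        out[i] = v;
    }
}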
+ +#include +#include "cpu/arm/tensor_computing_arm.h" +#ifdef _USE_FP32 +#include "cpu/arm/fp32/tensor_computing_fp32.h" +#endif +#ifdef _USE_FP16 +#include "cpu/arm/fp16/tensor_computing_fp16.h" +#endif + +EE eltwise_arm(DataType dataType, + std::vector input, + std::vector inputSize, + U32 num, + U32 len, + void *output, + EltwiseMode eltwiseMode) +{ + EE ret = SUCCESS; + switch (dataType) { +#ifdef _USE_FP32 + case DT_F32: { + ret = eltwise_fp32(input, inputSize, num, len, output, eltwiseMode); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = eltwise_fp16(input, inputSize, num, len, output, eltwiseMode); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/tensor_computing/src/cpu/arm/fp16/arm_functions_fp16.h b/compute/tensor/src/cpu/arm/fp16/arm_functions_fp16.h similarity index 62% rename from tensor_computing/src/cpu/arm/fp16/arm_functions_fp16.h rename to compute/tensor/src/cpu/arm/fp16/arm_functions_fp16.h index b8c37cd2..8cca0af4 100644 --- a/tensor_computing/src/cpu/arm/fp16/arm_functions_fp16.h +++ b/compute/tensor/src/cpu/arm/fp16/arm_functions_fp16.h @@ -1,64 +1,71 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- #ifndef _H_ARM_FUNCTIONS_FP16 #define _H_ARM_FUNCTIONS_FP16 -#ifdef _USE_FP16 +#include #include "arm_neon_expand.h" -#include -#include "tensor_computing_type.h" +#include "types.h" // array sum -inline F32 array_sum_f16(const F16 *data, I32 len) { - if(len <= 0) return 0; +inline F32 array_sum_f16(const F16 *data, I32 len) +{ + if (len <= 0) { + return 0; + } I32 i = 0; F32 sum_s = 0; float16x8_t sum_v = vdupq_n_f16(0); - for(i = 0; i < len - 7; i+=8){ + for (i = 0; i < len - 7; i += 8) { float16x8_t in = vld1q_f16(data + i); sum_v = vaddq_f16(sum_v, in); } sum_s += vaddvq_f16(sum_v); - for(; i < len; i++){ + for (; i < len; i++) { sum_s += data[i]; } return sum_s; } // array mean -inline F32 array_mean_f16(const F16 *data, I32 len) { - if(len <= 0) return 0; +inline F32 array_mean_f16(const F16 *data, I32 len) +{ + if (len <= 0) { + return 0; + } return array_sum_f16(data, len) / len; } // array var -inline F32 array_var_f16(const F16 *data, I32 len, F32 mean) { - if(len <= 0) return 0; +inline F32 array_var_f16(const F16 *data, I32 len, F32 mean) +{ + if (len <= 0) { + return 0; + } I32 i = 0; F32 sum_s = 0; float32x4_t mean_v = vdupq_n_f32(mean); - for(i = 0; i < len - 3; i+=4){ + for (i = 0; i < len - 3; i += 4) { float16x4_t in = vld1_f16(data + i); float32x4_t in_f32 = vcvt_f32_f16(in); float32x4_t tmp_v = vsubq_f32(in_f32, mean_v); float32x4_t sum_v = vmulq_f32(tmp_v, tmp_v); sum_s += vaddvq_f32(sum_v); } - for(; i < len; i++){ + for (; i < len; i++) { F16 in = data[i]; F32 tmp = in - mean; sum_s += tmp * tmp; @@ -67,36 +74,38 @@ inline F32 array_var_f16(const F16 *data, I32 len, F32 mean) { } // array max -inline F16 array_max_f16(const F16* data, I32 len) { +inline F16 array_max_f16(const F16 *data, I32 len) +{ F16 max_s = data[0]; I32 i = 0; - if(len >= 8){ + if (len >= 8) { float16x8_t max_v, tmp_v; max_v = vld1q_f16(data); - for(i = 8; i < len - 7; i+=8){ + for (i = 8; i < len - 7; i += 8) { tmp_v = vld1q_f16(data + i); max_v = vmaxq_f16(tmp_v, max_v); } max_s = vmaxvq_f16(max_v); } - for(; i < len; i++){ - if(data[i] > max_s) + for (; i < len; i++) { + if (data[i] > max_s) { max_s = data[i]; + } } return max_s; } -inline F16 array_maxabs_f16(const F16* data, I32 len) +inline F16 array_maxabs_f16(const F16 *data, I32 len) { - F16 max_s = std::abs(data[0]); + F16 max_s = abs(data[0]); I32 i = 0; if (len >= 8) { float16x8_t max_v, tmp_v; max_v = vld1q_f16(data); max_v = vabsq_f16(max_v); - for(i = 8; i < len - 7; i+=8){ + for (i = 8; i < len - 7; i += 8) { tmp_v = vld1q_f16(data + i); tmp_v = vabsq_f16(tmp_v); max_v = vmaxq_f16(tmp_v, max_v); @@ -104,20 +113,22 @@ inline F16 array_maxabs_f16(const F16* data, I32 len) max_s = vmaxvq_f16(max_v); } - for ( ; i < len; i++) { - if(std::abs(data[i]) > max_s) - max_s = std::abs(data[i]); + for (; i < len; i++) { + if (abs(data[i]) > max_s) { + max_s = abs(data[i]); + } } return max_s; } -inline void array_scale_f16(F16 *input, F16 *output, I32 len, F32 alpha, F32 beta) { +inline void array_scale_f16(const F16 *input, F16 *output, I32 len, F32 alpha, F32 beta) +{ I32 i = 0; #ifdef _USE_F16_MIX_PRECISION float32x4_t alpha_v = vdupq_n_f32(alpha); - float32x4_t beta_v = vdupq_n_f32(beta); - for(i = 0; i < len - 3; i+=4){ + float32x4_t beta_v = vdupq_n_f32(beta); + for (i = 0; i < len - 3; i += 4) { float16x4_t in = vld1_f16(input + i); float32x4_t in_f32 = vcvt_f32_f16(in); float32x4_t result = vfmaq_f32(beta_v, alpha_v, in_f32); @@ -125,11 +136,11 @@ inline void array_scale_f16(F16 *input, F16 *output, I32 len, F32 alpha, F32 bet } 
#else float16x8_t alpha_v = vdupq_n_f16(alpha); - float16x8_t beta_v = vdupq_n_f16(beta); + float16x8_t beta_v = vdupq_n_f16(beta); for (i = 0; i < len - 7; i += 8) { float16x8_t in = vld1q_f16(input + i); float16x8_t tmp_v = vfmaq_f16(beta_v, alpha_v, in); - vst1q_f16(output+i, tmp_v); + vst1q_f16(output + i, tmp_v); } #endif for (; i < len; i++) { @@ -137,18 +148,79 @@ inline void array_scale_f16(F16 *input, F16 *output, I32 len, F32 alpha, F32 bet } } -inline EE activation_fp16(F16* input, U32 len, ActivationDesc activationDesc, F16* output) +inline void array_power_f16(F16 *input, F16 *output, I32 len, F32 power) +{ + I32 i = 0; + if (power == -1) { +#ifdef _USE_F16_MIX_PRECISION + float32x4_t one_v = vdupq_n_f32(1); + for (i = 0; i < len - 3; i += 4) { + float16x4_t in = vld1_f16(input + i); + float32x4_t in_f32 = vcvt_f32_f16(in); + float32x4_t result = vdivq_f32(one_v, in_f32); + vst1_f16(output + i, vcvt_f16_f32(result)); + } +#else + float16x8_t one_v = vdupq_n_f16(1); + for (i = 0; i < len - 7; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t tmp_v = vdivq_f16(one_v, in); + vst1q_f16(output + i, tmp_v); + } +#endif + } else if (power == 0.5) { +#ifdef _USE_F16_MIX_PRECISION + for (i = 0; i < len - 3; i += 4) { + float16x4_t in = vld1_f16(input + i); + float32x4_t in_f32 = vcvt_f32_f16(in); + float32x4_t result = vsqrtq_f32(in_f32); + vst1_f16(output + i, vcvt_f16_f32(result)); + } +#else + for (i = 0; i < len - 7; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t tmp_v = vsqrtq_f16(in); + vst1q_f16(output + i, tmp_v); + } +#endif + } else if (power == 1) { + if (input != output) { + memcpy(output, input, len * sizeof(F16)); + } + i = len; + } else if (power == 2) { +#ifdef _USE_F16_MIX_PRECISION + for (i = 0; i < len - 3; i += 4) { + float16x4_t in = vld1_f16(input + i); + float32x4_t in_f32 = vcvt_f32_f16(in); + float32x4_t result = vmulq_f32(in_f32, in_f32); + vst1_f16(output + i, vcvt_f16_f32(result)); + } +#else + for (i = 0; i < len - 7; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t tmp_v = vmulq_f16(in, in); + vst1q_f16(output + i, tmp_v); + } +#endif + } + for (; i < len; i++) { + output[i] = powf(input[i], power); + } +} + +inline EE activation_fp16(F16 *input, U32 len, ActivationParamSpec activationDesc, F16 *output) { float16x8_t in, out; - float16x8_t zero = vdupq_n_f16(float16_t(0.)); - float16x8_t one = vdupq_n_f16(float16_t(1.)); + float16x8_t zero = vdupq_n_f16(float16_t(0.)); + float16x8_t one = vdupq_n_f16(float16_t(1.)); float16x8_t three = vdupq_n_f16(float16_t(3.)); - float16x8_t six = vdupq_n_f16(float16_t(6.)); + float16x8_t six = vdupq_n_f16(float16_t(6.)); U32 len_main = len / 8; U32 len_tail = len % 8; F16 value; - switch (activationDesc.mode){ + switch (activationDesc.mode) { case ACTIVATION_NULL: { break; } @@ -243,9 +315,9 @@ inline EE activation_fp16(F16* input, U32 len, ActivationDesc activationDesc, F1 } case ACTIVATION_GELU: { F16 two_div_PI_sqrt = sqrt(2 / 3.14159265358979323846); - float16x8_t vec0 = vdupq_n_f16(two_div_PI_sqrt); - float16x8_t vec1 = vdupq_n_f16(float16_t(0.044715)); - float16x8_t vec2 = vdupq_n_f16(float16_t(0.5)); + float16x8_t vec0 = vdupq_n_f16(two_div_PI_sqrt); + float16x8_t vec1 = vdupq_n_f16(float16_t(0.044715)); + float16x8_t vec2 = vdupq_n_f16(float16_t(0.5)); for (U32 i = 0; i < len_main; i++) { in = vld1q_f16(input); out = vmulq_f16(in, in); @@ -262,7 +334,7 @@ inline EE activation_fp16(F16* input, U32 len, ActivationDesc activationDesc, F1 } for (U32 i = 0; i < 
len_tail; i++) { value = input[i]; - value = two_div_PI_sqrt * (value + 0.044715 * pow(value, 3)); + value = two_div_PI_sqrt * (value + 0.044715 * powf(value, 3)); value = 1.0 - 2.0 / (exp(2.0 * value) + 1.0); value = 0.5 * (1.0 + value); value = input[i] * value; @@ -298,6 +370,27 @@ inline EE activation_fp16(F16* input, U32 len, ActivationDesc activationDesc, F1 } break; } + case ACTIVATION_MISH: { + for (U32 i = 0; i < len_main; i++) { + in = vld1q_f16(input); + out = vmulq_f16( + in, vtanhq_f16(vlogq_f16(vaddq_f16(vexpq_f16_03_percent_error(in), one)))); + vst1q_f16(output, out); + input += 8; + output += 8; + } + for (U32 i = 0; i < len_tail; i++) { + value = input[i] * tanh(log(exp(input[i]) + 1.0)); + output[i] = value; + } + break; + } + case ACTIVATION_GREATER: { + for (U32 i = 0; i < len; i++) { + output[i] = input[i] > 1 ? 1 : 0; + } + break; + } default: return NOT_SUPPORTED; } @@ -305,20 +398,35 @@ inline EE activation_fp16(F16* input, U32 len, ActivationDesc activationDesc, F1 return SUCCESS; } -inline void array_add_f16(const F16* inputA, const F16* inputB, F16* output, I32 len) +inline void array_add_f16(const F16 *inputA, const F16 *inputB, F16 *output, I32 len) { I32 i = 0; - for(i = 0; i < len - 7; i+=8){ + for (i = 0; i < len - 7; i += 8) { float16x8_t a = vld1q_f16(inputA + i); float16x8_t b = vld1q_f16(inputB + i); float16x8_t c = vaddq_f16(a, b); - vst1q_f16(output+i, c); + vst1q_f16(output + i, c); } - for ( ; i < len; i++) { + for (; i < len; i++) { output[i] = inputA[i] + inputB[i]; } } -#endif +inline void array_square_and_add_f16(const F16 *inputA, const F16 *inputB, F16 *output, I32 len) +{ + I32 i = 0; + for (i = 0; i < len - 7; i += 8) { + float16x8_t a = vld1q_f16(inputA + i); + float16x8_t b = vld1q_f16(inputB + i); + b = vmulq_f16(b, b); + float16x8_t c = vaddq_f16(a, b); + vst1q_f16(output + i, c); + } + + for (; i < len; i++) { + output[i] = inputA[i] + inputB[i] * inputB[i]; + } +} + #endif diff --git a/tensor_computing/src/cpu/arm/fp16/attention.cpp b/compute/tensor/src/cpu/arm/fp16/attention.cpp similarity index 66% rename from tensor_computing/src/cpu/arm/fp16/attention.cpp rename to compute/tensor/src/cpu/arm/fp16/attention.cpp index 46a2041f..050203ab 100644 --- a/tensor_computing/src/cpu/arm/fp16/attention.cpp +++ b/compute/tensor/src/cpu/arm/fp16/attention.cpp @@ -1,68 +1,74 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include #include "cpu/arm/fp16/tensor_computing_fp16.h" -EE attention_fp16(U32 batch, U32 numHeads, I32 fromSequenceLength, I32 toSequenceLength, const F16 *input, F16 *output) +EE attention_fp16(U32 batch, + U32 numHeads, + I32 fromSequenceLength, + I32 toSequenceLength, + const F16 *input, + F16 *output) { - if (nullptr == input || nullptr == output) + if (nullptr == input || nullptr == output) { CHECK_STATUS(NULL_POINTER); + } F16 mask_s = -10000.0; I32 count = array_sum_f16(input, toSequenceLength); I32 valid = UNI_MIN(count, fromSequenceLength); float16x8_t mask_v = vdupq_n_f16(float16_t(mask_s)); float16x8_t one_v = vdupq_n_f16(float16_t(1.0)); - for(U32 n = 0; n < batch; n++){ + for (U32 n = 0; n < batch; n++) { for (U32 i = 0; i < numHeads; i++) { if (i == 0) { for (I32 j = 0; j < valid; j++) { if (j == 0) { I32 k = 0; - for (; k < toSequenceLength-7; k+=8) { + for (; k < toSequenceLength - 7; k += 8) { float16x8_t in_v = vld1q_f16(input + k); float16x8_t tmp_v = vsubq_f16(one_v, in_v); tmp_v = vmulq_f16(tmp_v, mask_v); - vst1q_f16(output+k, tmp_v); + vst1q_f16(output + k, tmp_v); } for (; k < toSequenceLength; k++) { F16 value = (1 - input[k]) * mask_s; output[k] = value; } - } - else { - memcpy(output+j*toSequenceLength, output, toSequenceLength*sizeof(F16)); + } else { + memcpy( + output + j * toSequenceLength, output, toSequenceLength * sizeof(F16)); } } for (I32 j = valid; j < fromSequenceLength; j++) { if (j == valid) { I32 k = 0; - for (; k < toSequenceLength-7; k+=8) { - vst1q_f16(output+j*toSequenceLength+k, mask_v); + for (; k < toSequenceLength - 7; k += 8) { + vst1q_f16(output + j * toSequenceLength + k, mask_v); } for (; k < toSequenceLength; k++) { - output[j*toSequenceLength+k] = mask_s; + output[j * toSequenceLength + k] = mask_s; } - } - else { - memcpy(output+j*toSequenceLength, output+valid*toSequenceLength, toSequenceLength*sizeof(F16)); + } else { + memcpy(output + j * toSequenceLength, output + valid * toSequenceLength, + toSequenceLength * sizeof(F16)); } } } else { - memcpy(output+i*fromSequenceLength*toSequenceLength, output, fromSequenceLength*toSequenceLength*sizeof(F16)); + memcpy(output + i * fromSequenceLength * toSequenceLength, output, + fromSequenceLength * toSequenceLength * sizeof(F16)); } } diff --git a/tensor_computing/src/cpu/arm/fp16/attention_mask.cpp b/compute/tensor/src/cpu/arm/fp16/attention_mask.cpp similarity index 78% rename from tensor_computing/src/cpu/arm/fp16/attention_mask.cpp rename to compute/tensor/src/cpu/arm/fp16/attention_mask.cpp index befcb250..afad68e5 100644 --- a/tensor_computing/src/cpu/arm/fp16/attention_mask.cpp +++ b/compute/tensor/src/cpu/arm/fp16/attention_mask.cpp @@ -1,27 +1,32 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include #include "cpu/arm/fp16/tensor_computing_fp16.h" -EE attention_mask_fp16(TensorDesc inputDesc, const F16* input, - I32 attentionLength, bool sameLength, float maskValue, - TensorDesc outputDesc, F16* output) +EE attention_mask_fp16(TensorDesc inputDesc, + const F16 *input, + AttentionMaskParamSpec p, + TensorDesc outputDesc, + F16 *output) { UNUSED(outputDesc); - if (nullptr == input || nullptr == output) + if (nullptr == input || nullptr == output) { CHECK_STATUS(NULL_POINTER); + } + I32 attentionLength = p.attention_length; + bool sameLength = p.same_length; + float maskValue = p.mask; int qlen = inputDesc.dims[1]; int klen = inputDesc.dims[0]; int mlen = klen - qlen; @@ -48,9 +53,10 @@ EE attention_mask_fp16(TensorDesc inputDesc, const F16* input, } loops = UNI_MAX(loops, 0); start = UNI_MIN(start, klen); - if (start + loops > klen) + if (start + loops > klen) { loops = UNI_MAX(klen - start, 0); - memset(&mask[i*klen+start], 0, sizeof(F16)*loops); + } + memset(&mask[i * klen + start], 0, sizeof(F16) * loops); } } I32 loops = tensorNumElements(inputDesc) / length; @@ -58,13 +64,13 @@ EE attention_mask_fp16(TensorDesc inputDesc, const F16* input, float16x8_t mask_value_v = vdupq_n_f16(maskValue); for (int i = 0, index = 0; i < loops; i++) { int j = 0; - for (; j < length-7; j+=8) { - float16x8_t in = vld1q_f16(input+index); + for (; j < length - 7; j += 8) { + float16x8_t in = vld1q_f16(input + index); float16x8_t mask_v = vld1q_f16(&mask[j]); float16x8_t tmp_v = vsubq_f16(one_v, mask_v); tmp_v = vmulq_f16(in, tmp_v); tmp_v = vfmsq_f16(tmp_v, mask_value_v, mask_v); - vst1q_f16(output+index, tmp_v); + vst1q_f16(output + index, tmp_v); index += 8; } for (; j < length; j++) { diff --git a/tensor_computing/src/cpu/arm/fp16/check.cpp b/compute/tensor/src/cpu/arm/fp16/check.cpp 
similarity index 72% rename from tensor_computing/src/cpu/arm/fp16/check.cpp rename to compute/tensor/src/cpu/arm/fp16/check.cpp index 1c31c74e..139677cd 100644 --- a/tensor_computing/src/cpu/arm/fp16/check.cpp +++ b/compute/tensor/src/cpu/arm/fp16/check.cpp @@ -1,35 +1,40 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
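// Reference sketch (illustrative, not part of the patch): check_fp16 below
// views both tensors as loopOuter rows of `length` elements and sets
// output[j] to 1 only when the predicate holds for every element of row j.
// Scalar equivalent for the CHECK_EQUAL case:
#include <cstdint>

inline void check_equal_ref(
    const float *a, const float *b, int32_t loopOuter, int32_t length, int32_t *out)
{
    for (int32_t j = 0; j < loopOuter; j++) {
        int32_t count = 0;
        for (int32_t i = 0; i < length; i++) {
            count += (a[j * length + i] == b[j * length + i]);
        }
        out[j] = (count == length);  // all-elements-true reduction
    }
}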
- #include "cpu/arm/fp16/tensor_computing_fp16.h" -EE check_fp16(TensorDesc inputDescA, const F16* inputA, - TensorDesc inputDescB, const F16* inputB, +EE check_fp16(TensorDesc inputDescA, + const F16 *inputA, + TensorDesc inputDescB, + const F16 *inputB, CheckMode checkMode, - TensorDesc outputDesc, I32* output) + TensorDesc outputDesc, + I32 *output) { - if (nullptr == inputA || nullptr == inputB || nullptr == output) + if (nullptr == inputA || nullptr == inputB || nullptr == output) { CHECK_STATUS(NULL_POINTER); + } - if (tensorNumElements(inputDescA) != tensorNumElements(inputDescB)) + if (tensorNumElements(inputDescA) != tensorNumElements(inputDescB)) { CHECK_STATUS(NOT_MATCH); + } U32 size = tensorNumElements(inputDescA); - U32 loopOuter = inputDescA.dims[inputDescA.nDims-1]; + U32 loopOuter = inputDescA.dims[inputDescA.nDims - 1]; I32 length = size / loopOuter; - if (tensorNumElements(outputDesc) != loopOuter) + if (tensorNumElements(outputDesc) != loopOuter) { CHECK_STATUS(NOT_MATCH); + } for (U32 j = 0; j < loopOuter; j++) { const F16 *arrayA = inputA + j * length; const F16 *arrayB = inputB + j * length; @@ -37,45 +42,51 @@ EE check_fp16(TensorDesc inputDescA, const F16* inputA, case CHECK_GREAT: { uint16x8_t count_v = vdupq_n_u16(0); I32 i = 0; - for (; i < length-7; i+=8) { + for (; i < length - 7; i += 8) { float16x8_t a = vld1q_f16(arrayA + i); float16x8_t b = vld1q_f16(arrayA + i); count_v = vaddq_u16(count_v, vcgtq_f16(a, b)); } I32 count = vaddvq_u16(count_v); - for (; i < length; i++) - if (arrayA[i] > arrayB[i]) - count ++; + for (; i < length; i++) { + if (arrayA[i] > arrayB[i]) { + count++; + } + } output[j] = (count == length); break; } case CHECK_GREATEQUAL: { uint16x8_t count_v = vdupq_n_u16(0); I32 i = 0; - for (; i < length-7; i+=8) { + for (; i < length - 7; i += 8) { float16x8_t a = vld1q_f16(arrayA + i); float16x8_t b = vld1q_f16(arrayA + i); count_v = vaddq_u16(count_v, vcgeq_f16(a, b)); } I32 count = vaddvq_u16(count_v); - for (; i < length; i++) - if (arrayA[i] >= arrayB[i]) - count ++; + for (; i < length; i++) { + if (arrayA[i] >= arrayB[i]) { + count++; + } + } output[j] = (count == length); break; } case CHECK_EQUAL: { uint16x8_t count_v = vdupq_n_u16(0); I32 i = 0; - for (; i < length-7; i+=8) { + for (; i < length - 7; i += 8) { float16x8_t a = vld1q_f16(arrayA + i); float16x8_t b = vld1q_f16(arrayA + i); count_v = vaddq_u16(count_v, vceqq_f16(a, b)); } I32 count = vaddvq_u16(count_v); - for (; i < length; i++) - if (arrayA[i] == arrayB[i]) - count ++; + for (; i < length; i++) { + if (arrayA[i] == arrayB[i]) { + count++; + } + } output[j] = (count == length); break; } diff --git a/tensor_computing/src/cpu/arm/fp16/clip.cpp b/compute/tensor/src/cpu/arm/fp16/clip.cpp similarity index 82% rename from tensor_computing/src/cpu/arm/fp16/clip.cpp rename to compute/tensor/src/cpu/arm/fp16/clip.cpp index ce451034..3f19ae9e 100644 --- a/tensor_computing/src/cpu/arm/fp16/clip.cpp +++ b/compute/tensor/src/cpu/arm/fp16/clip.cpp @@ -1,32 +1,32 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include "cpu/arm/fp16/tensor_computing_fp16.h" -EE clip_fp16(F16 *input, F16 *output, I32 len, F32 minValue, F32 maxValue) { - if (nullptr == input - || nullptr == output) +EE clip_fp16(F16 *input, F16 *output, I32 len, F32 minValue, F32 maxValue) +{ + if (nullptr == input || nullptr == output) { CHECK_STATUS(NULL_POINTER); + } float16x8_t min_v = vdupq_n_f16(minValue); - float16x8_t max_v = vdupq_n_f16(maxValue); + float16x8_t max_v = vdupq_n_f16(maxValue); I32 i = 0; for (i = 0; i < len - 7; i += 8) { float16x8_t in = vld1q_f16(input + i); float16x8_t tmp_v = vminq_f16(max_v, vmaxq_f16(min_v, in)); - vst1q_f16(output+i, tmp_v); + vst1q_f16(output + i, tmp_v); } for (; i < len; i++) { F16 value = input[i]; diff --git a/compute/tensor/src/cpu/arm/fp16/convolution.cpp b/compute/tensor/src/cpu/arm/fp16/convolution.cpp new file mode 100644 index 00000000..8349c7ca --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/convolution.cpp @@ -0,0 +1,87 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/fp16/tensor_computing_fp16.h" +#include "cpu/arm/fp16/convolution_winograd.h" +#include "cpu/arm/fp16/convolution_gemm.h" +#include "cpu/arm/fp16/convolution_gemm_icnchw.h" +#include "cpu/arm/fp16/convolution_direct.h" + +EE convolution_fp16(TensorDesc inputDesc, + F16 *input, + TensorDesc filterDesc, + const F16 *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc biasDesc, + const F16 *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *output, + ActivationParamSpec activationDesc, + Arch arch) +{ + if (nullptr == input || nullptr == filter || nullptr == output || nullptr == bias || + nullptr == tmp) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (!(idt == DT_F16 && fdt == DT_F16 && odt == DT_F16)) { + CHECK_STATUS(NOT_MATCH); + } + if (!(odf == DF_NCHWC8)) { + CHECK_STATUS(NOT_MATCH); + } + if (!(ic == fc && oc == fn)) { + CHECK_STATUS(NOT_MATCH); + } + + // In some cases when we adjust the model input, the input tensor of conv can change from NCHW to NCHWc8 + // In this case we can simply change the algo, because they both require the same filter transform + if (CONVOLUTION_ALGORITHM_GEMM_ICNCHW == algorithm && DF_NCHWC8 == idf) { + algorithm = CONVOLUTION_ALGORITHM_GEMM; + } + + EE ret = SUCCESS; + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: + ret = convolution_direct(inputDesc, input, filterDesc, filter, convParamSpec, biasDesc, + bias, tmpBytes, tmp, outputDesc, output, activationDesc, arch); + break; + case CONVOLUTION_ALGORITHM_GEMM: + ret = convolution_gemm(inputDesc, input, filterDesc, filter, convParamSpec, biasDesc, + bias, tmpBytes, tmp, outputDesc, output, activationDesc, arch); + break; + case CONVOLUTION_ALGORITHM_WINOGRAD: + ret = convolution_winograd(inputDesc, input, filterDesc, filter, convParamSpec, + biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc, arch); + break; + case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: + ret = convolution_gemm_icnchw(inputDesc, input, filterDesc, filter, convParamSpec, + biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc, arch); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/tensor_computing/src/cpu/arm/fp16/convolution_direct.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_direct.cpp similarity index 77% rename from tensor_computing/src/cpu/arm/fp16/convolution_direct.cpp rename to compute/tensor/src/cpu/arm/fp16/convolution_direct.cpp index 9cb37781..3782db73 100644 --- a/tensor_computing/src/cpu/arm/fp16/convolution_direct.cpp +++ b/compute/tensor/src/cpu/arm/fp16/convolution_direct.cpp @@ -1,28 +1,32 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include #include "cpu/arm/fp16/convolution_direct.h" -EE convolution_direct(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc activationDesc, +EE convolution_direct(TensorDesc inputDesc, + F16 *inArray, + TensorDesc filterDesc, + const F16 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc, Arch arch) { UNUSED(biasDesc); @@ -37,15 +41,16 @@ EE convolution_direct(TensorDesc inputDesc, F16* inArray, CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; - if (fdf != DF_NCHWN16) + if (fdf != DF_NCHWN16) { CHECK_STATUS(NOT_MATCH); + } oc /= 8; ic /= 8; @@ -57,61 +62,61 @@ EE convolution_direct(TensorDesc inputDesc, F16* inArray, EE ret = SUCCESS; for (U32 n = 0; n < in; n++) { // copy input into a input with padding - F16 *inArray_pad = (F16*)tmp; + F16 *inArray_pad = (F16 *)tmp; F16 *inArray_pad_mov = 
inArray_pad; - F16 *inArray_mov = inArray + n*ic*ih*iw*8; + F16 *inArray_mov = inArray + n * ic * ih * iw * 8; for (U32 c = 0; c < ic; c++) { for (U32 h = 0; h < paddingT; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*bytesOf(idt)); - inArray_pad_mov += iw_pad*8; + memset(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += iw_pad * 8; } for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL*8*bytesOf(idt)); - inArray_pad_mov += paddingL*8; - memcpy(inArray_pad_mov, inArray_mov, iw*8*bytesOf(idt)); - inArray_pad_mov += iw*8; - inArray_mov += iw*8; - memset(inArray_pad_mov, 0, paddingR*8*bytesOf(idt)); - inArray_pad_mov += paddingR*8; + memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(idt)); + inArray_pad_mov += paddingL * 8; + memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); + inArray_pad_mov += iw * 8; + inArray_mov += iw * 8; + memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(idt)); + inArray_pad_mov += paddingR * 8; } for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { - memset(inArray_pad_mov, 0, iw_pad*8*bytesOf(idt)); - inArray_pad_mov += iw_pad*8; + memset(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += iw_pad * 8; } } // compute const F16 *f0 = filterArray; - const F16 *f1 = f0 + fh*fw*16; - const F16 *f2 = f0 + fh*fw*16*2; - const F16 *f3 = f0 + fh*fw*16*3; - const F16 *f4 = f0 + fh*fw*16*4; - const F16 *f5 = f0 + fh*fw*16*5; - const F16 *f6 = f0 + fh*fw*16*6; - const F16 *f7 = f0 + fh*fw*16*7; + const F16 *f1 = f0 + fh * fw * 16; + const F16 *f2 = f0 + fh * fw * 16 * 2; + const F16 *f3 = f0 + fh * fw * 16 * 3; + const F16 *f4 = f0 + fh * fw * 16 * 4; + const F16 *f5 = f0 + fh * fw * 16 * 5; + const F16 *f6 = f0 + fh * fw * 16 * 6; + const F16 *f7 = f0 + fh * fw * 16 * 7; - F16 *outo0h0 = outArray + n*oc*oh*ow*8; - F16 *outo1h0 = outo0h0 + oh*ow*8; - F16 *outo0h1 = outo0h0 + ow*8; - F16 *outo1h1 = outo1h0 + ow*8; - for (U32 o = 0; o < oc; o+=2) { + F16 *outo0h0 = outArray + n * oc * oh * ow * 8; + F16 *outo1h0 = outo0h0 + oh * ow * 8; + F16 *outo0h1 = outo0h0 + ow * 8; + F16 *outo1h1 = outo1h0 + ow * 8; + for (U32 o = 0; o < oc; o += 2) { for (U32 c = 0; c < ic; c++) { F16 *out_o0h0 = outo0h0; F16 *out_o1h0 = outo1h0; F16 *out_o0h1 = outo0h1; F16 *out_o1h1 = outo1h1; - F16 *in_h0w0 = inArray_pad + n*ic*ih_pad*iw_pad*8 + c*ih_pad*iw_pad*8; - F16 *in_h0w1 = in_h0w0 + strideW*8; - F16 *in_h0w2 = in_h0w0 + strideW*8*2; - F16 *in_h0w3 = in_h0w0 + strideW*8*3; - F16 *in_h1w0 = in_h0w0 + strideH*iw_pad*8; - F16 *in_h1w1 = in_h1w0 + strideW*8; - F16 *in_h1w2 = in_h1w0 + strideW*8*2; - F16 *in_h1w3 = in_h1w0 + strideW*8*3; + F16 *in_h0w0 = inArray_pad + n * ic * ih_pad * iw_pad * 8 + c * ih_pad * iw_pad * 8; + F16 *in_h0w1 = in_h0w0 + strideW * 8; + F16 *in_h0w2 = in_h0w0 + strideW * 8 * 2; + F16 *in_h0w3 = in_h0w0 + strideW * 8 * 3; + F16 *in_h1w0 = in_h0w0 + strideH * iw_pad * 8; + F16 *in_h1w1 = in_h1w0 + strideW * 8; + F16 *in_h1w2 = in_h1w0 + strideW * 8 * 2; + F16 *in_h1w3 = in_h1w0 + strideW * 8 * 3; - for (U32 h = 0; h < oh; h+=2) { - for (U32 w = 0; w < ow; w+=4) { + for (U32 h = 0; h < oh; h += 2) { + for (U32 w = 0; w < ow; w += 4) { const F16 *f_c0 = f0; const F16 *f_c1 = f1; const F16 *f_c2 = f2; @@ -390,27 +395,19 @@ EE convolution_direct(TensorDesc inputDesc, F16* inArray, "str q24, [%[out_o1h1], #32]\n" "str q25, [%[out_o1h1], #48]\n" - :[out_o0h0]"+r"(out_o0h0), - [out_o0h1]"+r"(out_o0h1), - [out_o1h0]"+r"(out_o1h0), - [out_o1h1]"+r"(out_o1h1) - :[in_h0w0]"r"(in_h0w0), - 
[in_h0w1]"r"(in_h0w1), - [in_h0w2]"r"(in_h0w2), - [in_h0w3]"r"(in_h0w3), - [in_h1w0]"r"(in_h1w0), - [in_h1w1]"r"(in_h1w1), - [in_h1w2]"r"(in_h1w2), - [in_h1w3]"r"(in_h1w3), - [f_c0]"r"(f_c0), - [f_c1]"r"(f_c1), - [f_c2]"r"(f_c2), - [f_c3]"r"(f_c3), - [f_c4]"r"(f_c4), - [f_c5]"r"(f_c5), - [f_c6]"r"(f_c6), - [f_c7]"r"(f_c7) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "x0", "x1", "x2", "x3", "x4", "x5" ); + : [out_o0h0] "+r"(out_o0h0), [out_o0h1] "+r"(out_o0h1), + [out_o1h0] "+r"(out_o1h0), [out_o1h1] "+r"(out_o1h1) + : [in_h0w0] "r"(in_h0w0), [in_h0w1] "r"(in_h0w1), + [in_h0w2] "r"(in_h0w2), [in_h0w3] "r"(in_h0w3), + [in_h1w0] "r"(in_h1w0), [in_h1w1] "r"(in_h1w1), + [in_h1w2] "r"(in_h1w2), [in_h1w3] "r"(in_h1w3), + [f_c0] "r"(f_c0), [f_c1] "r"(f_c1), [f_c2] "r"(f_c2), + [f_c3] "r"(f_c3), [f_c4] "r"(f_c4), [f_c5] "r"(f_c5), + [f_c6] "r"(f_c6), [f_c7] "r"(f_c7) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "x0", "x1", "x2", "x3", "x4", "x5"); f_c0 += 16; f_c1 += 16; f_c2 += 16; @@ -428,62 +425,62 @@ EE convolution_direct(TensorDesc inputDesc, F16* inArray, in_h1w2 += 8; in_h1w3 += 8; } - in_h0w0 += iw_pad*8 - fw*8; - in_h0w1 += iw_pad*8 - fw*8; - in_h0w2 += iw_pad*8 - fw*8; - in_h0w3 += iw_pad*8 - fw*8; - in_h1w0 += iw_pad*8 - fw*8; - in_h1w1 += iw_pad*8 - fw*8; - in_h1w2 += iw_pad*8 - fw*8; - in_h1w3 += iw_pad*8 - fw*8; + in_h0w0 += iw_pad * 8 - fw * 8; + in_h0w1 += iw_pad * 8 - fw * 8; + in_h0w2 += iw_pad * 8 - fw * 8; + in_h0w3 += iw_pad * 8 - fw * 8; + in_h1w0 += iw_pad * 8 - fw * 8; + in_h1w1 += iw_pad * 8 - fw * 8; + in_h1w2 += iw_pad * 8 - fw * 8; + in_h1w3 += iw_pad * 8 - fw * 8; } - in_h0w0 = in_h0w0 + 4*strideW*8 - fh*iw_pad*8; - in_h0w1 = in_h0w1 + 4*strideW*8 - fh*iw_pad*8; - in_h0w2 = in_h0w2 + 4*strideW*8 - fh*iw_pad*8; - in_h0w3 = in_h0w3 + 4*strideW*8 - fh*iw_pad*8; - in_h1w0 = in_h1w0 + 4*strideW*8 - fh*iw_pad*8; - in_h1w1 = in_h1w1 + 4*strideW*8 - fh*iw_pad*8; - in_h1w2 = in_h1w2 + 4*strideW*8 - fh*iw_pad*8; - in_h1w3 = in_h1w3 + 4*strideW*8 - fh*iw_pad*8; + in_h0w0 = in_h0w0 + 4 * strideW * 8 - fh * iw_pad * 8; + in_h0w1 = in_h0w1 + 4 * strideW * 8 - fh * iw_pad * 8; + in_h0w2 = in_h0w2 + 4 * strideW * 8 - fh * iw_pad * 8; + in_h0w3 = in_h0w3 + 4 * strideW * 8 - fh * iw_pad * 8; + in_h1w0 = in_h1w0 + 4 * strideW * 8 - fh * iw_pad * 8; + in_h1w1 = in_h1w1 + 4 * strideW * 8 - fh * iw_pad * 8; + in_h1w2 = in_h1w2 + 4 * strideW * 8 - fh * iw_pad * 8; + in_h1w3 = in_h1w3 + 4 * strideW * 8 - fh * iw_pad * 8; out_o0h0 += 32; out_o1h0 += 32; out_o0h1 += 32; out_o1h1 += 32; } - in_h0w0 += 2*strideH*iw_pad*8 - ow*strideW*8; - in_h0w1 += 2*strideH*iw_pad*8 - ow*strideW*8; - in_h0w2 += 2*strideH*iw_pad*8 - ow*strideW*8; - in_h0w3 += 2*strideH*iw_pad*8 - ow*strideW*8; - in_h1w0 += 2*strideH*iw_pad*8 - ow*strideW*8; - in_h1w1 += 2*strideH*iw_pad*8 - ow*strideW*8; - in_h1w2 += 2*strideH*iw_pad*8 - ow*strideW*8; - in_h1w3 += 2*strideH*iw_pad*8 - ow*strideW*8; - out_o0h0 += ow*8; - out_o1h0 += ow*8; - out_o0h1 += ow*8; - out_o1h1 += ow*8; + in_h0w0 += 2 * strideH * iw_pad * 8 - ow * strideW * 8; + in_h0w1 += 2 * strideH * iw_pad * 8 - ow * strideW * 8; + in_h0w2 += 2 * strideH * iw_pad * 8 - ow * strideW * 8; + in_h0w3 += 2 * strideH * iw_pad * 8 - ow * strideW * 8; + in_h1w0 += 2 * strideH * 
iw_pad * 8 - ow * strideW * 8; + in_h1w1 += 2 * strideH * iw_pad * 8 - ow * strideW * 8; + in_h1w2 += 2 * strideH * iw_pad * 8 - ow * strideW * 8; + in_h1w3 += 2 * strideH * iw_pad * 8 - ow * strideW * 8; + out_o0h0 += ow * 8; + out_o1h0 += ow * 8; + out_o0h1 += ow * 8; + out_o1h1 += ow * 8; } - f0 += 8*fh*fw*16; - f1 += 8*fh*fw*16; - f2 += 8*fh*fw*16; - f3 += 8*fh*fw*16; - f4 += 8*fh*fw*16; - f5 += 8*fh*fw*16; - f6 += 8*fh*fw*16; - f7 += 8*fh*fw*16; + f0 += 8 * fh * fw * 16; + f1 += 8 * fh * fw * 16; + f2 += 8 * fh * fw * 16; + f3 += 8 * fh * fw * 16; + f4 += 8 * fh * fw * 16; + f5 += 8 * fh * fw * 16; + f6 += 8 * fh * fw * 16; + f7 += 8 * fh * fw * 16; } - outo0h0 += 2*oh*ow*8; - outo1h0 += 2*oh*ow*8; - outo0h1 += 2*oh*ow*8; - outo1h1 += 2*oh*ow*8; + outo0h0 += 2 * oh * ow * 8; + outo1h0 += 2 * oh * ow * 8; + outo0h1 += 2 * oh * ow * 8; + outo1h1 += 2 * oh * ow * 8; } // bias F16 *out = outArray; float16x8_t v_0 = vmovq_n_f16(0); for (U32 o = 0; o < oc; o++) { - float16x8_t v_b = vld1q_f16(biasArray + o*8); - for (U32 hw = 0; hw < oh*ow; hw++) { + float16x8_t v_b = vld1q_f16(biasArray + o * 8); + for (U32 hw = 0; hw < oh * ow; hw++) { float16x8_t v = vld1q_f16(out); switch (activationDesc.mode) { case ACTIVATION_NULL: diff --git a/tensor_computing/src/cpu/arm/fp16/convolution_direct.h b/compute/tensor/src/cpu/arm/fp16/convolution_direct.h similarity index 66% rename from tensor_computing/src/cpu/arm/fp16/convolution_direct.h rename to compute/tensor/src/cpu/arm/fp16/convolution_direct.h index 87e98f25..ffec13ac 100644 --- a/tensor_computing/src/cpu/arm/fp16/convolution_direct.h +++ b/compute/tensor/src/cpu/arm/fp16/convolution_direct.h @@ -1,31 +1,33 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
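// Reference sketch (illustrative, not part of the patch): the direct kernel
// above walks tensors in the NCHWc8 layout, where channels are grouped in
// blocks of 8 and the block lane is the innermost dimension. This helper
// shows the address arithmetic that all of the kernel's "* 8" pointer bumps
// are unrolling; the tiling then covers two output-channel blocks, two rows
// and four columns per inner iteration.
#include <cstddef>

inline std::size_t nchwc8_offset(std::size_t cTotal, std::size_t hTotal,
    std::size_t wTotal, std::size_t n, std::size_t c, std::size_t h, std::size_t w)
{
    std::size_t cBlock = c / 8;  // which group of 8 channels
    std::size_t cLane = c % 8;   // position inside the group
    return (((n * (cTotal / 8) + cBlock) * hTotal + h) * wTotal + w) * 8 + cLane;
}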
- #ifndef _H_CONVOLUTION_DIRECT #define _H_CONVOLUTION_DIRECT #include "sys.h" -#include "type.h" +#include "types.h" #include "error.h" -#include "tensor_desc.h" -#include "tensor_computing_type.h" -EE convolution_direct(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc activationDesc, +EE convolution_direct(TensorDesc inputDesc, + F16 *inArray, + TensorDesc filterDesc, + const F16 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc, Arch arch); #endif diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_gemm.h b/compute/tensor/src/cpu/arm/fp16/convolution_gemm.h new file mode 100644 index 00000000..ca11c77c --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/convolution_gemm.h @@ -0,0 +1,76 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
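// Reference sketch (illustrative, not part of the patch): the header below
// keeps separate hand-scheduled GEMM bodies for Cortex-A55 and Cortex-A76,
// presumably because the in-order A55 favors the paired "ldr d / ldr x / ins"
// load idiom seen in the A55 kernel while the wider A76 can issue full
// q-register loads; the dispatch itself is just a switch on the Arch tag.
// The same pattern with illustrative names:
using KernelFn = void (*)(const float *in, float *out, int n);

void kernel_a55(const float *, float *, int) { /* in-order-friendly schedule */ }
void kernel_a76(const float *, float *, int) { /* out-of-order-friendly schedule */ }

inline KernelFn select_kernel(int arch)  // e.g. 55 or 76
{
    switch (arch) {
        case 55:
            return kernel_a55;
        case 76:
            return kernel_a76;
        default:
            return nullptr;  // caller maps this to NOT_SUPPORTED
    }
}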
+ +#ifndef _H_CONVOLUTION_GEMM +#define _H_CONVOLUTION_GEMM + +#include "sys.h" +#include "types.h" +#include "error.h" + +EE convolution_gemm_A55(TensorDesc inputDesc, + F16 *inArray, + TensorDesc filterDesc, + const F16 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc); + +EE convolution_gemm_A76(TensorDesc inputDesc, + F16 *inArray, + TensorDesc filterDesc, + const F16 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc); + +inline EE convolution_gemm(TensorDesc inputDesc, + F16 *inArray, + TensorDesc filterDesc, + const F16 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc, + Arch arch) +{ + EE ret = SUCCESS; + switch (arch) { + case ARM_A55: + ret = convolution_gemm_A55(inputDesc, inArray, filterDesc, filterArray, convParamSpec, + biasDesc, biasArray, tmpBytes, tmp, outputDesc, outArray, activationDesc); + break; + case ARM_A76: + ret = convolution_gemm_A76(inputDesc, inArray, filterDesc, filterArray, convParamSpec, + biasDesc, biasArray, tmpBytes, tmp, outputDesc, outArray, activationDesc); + break; + default: + return NOT_SUPPORTED; + } + return ret; +} +#endif diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_gemm_A55.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_A55.cpp new file mode 100644 index 00000000..ed6cee0d --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_A55.cpp @@ -0,0 +1,975 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
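// Reference sketch (illustrative, not part of the patch): the A55 kernel
// below flattens the output plane into a single index hw in [0, oh*ow) and,
// for each tile of 8 outputs, recovers the top-left input coordinate of the
// receptive field before the im2col-style packing. That mapping is:
#include <cstdint>

struct InCoord {
    uint32_t h;
    uint32_t w;
};

inline InCoord input_origin(uint32_t hw, uint32_t ow, uint32_t strideH, uint32_t strideW)
{
    // hw / ow is the output row and hw % ow the output column; scaling by the
    // strides gives the first input pixel of the window (the filter taps then
    // add fh_idx * dilateH rows and fw_idx * dilateW columns on top of this).
    return InCoord{(hw / ow) * strideH, (hw % ow) * strideW};
}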
+
+#include <arm_neon.h>
+#include "cpu/arm/fp16/convolution_gemm.h"
+
+EE convolution_gemm_A55(TensorDesc inputDesc,
+    F16 *inArray,
+    TensorDesc filterDesc,
+    const F16 *filterArray,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc biasDesc,
+    const F16 *biasArray,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    F16 *outArray,
+    ActivationParamSpec activationDesc)
+{
+    UNUSED(biasDesc);
+    UNUSED(tmpBytes);
+
+    DataType idt, fdt, odt;
+    DataFormat idf, fdf, odf;
+    U32 in, ic, ih, iw;
+    U32 fn, fc, fh, fw;
+    U32 on, oc, oh, ow;
+    CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
+    CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw));
+    CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow));
+    U32 strideH = convParamSpec.stride_h;
+    U32 strideW = convParamSpec.stride_w;
+    U32 paddingT = convParamSpec.padding_top;
+    U32 paddingB = convParamSpec.padding_bottom;
+    U32 paddingL = convParamSpec.padding_left;
+    U32 paddingR = convParamSpec.padding_right;
+    U32 dilateH = convParamSpec.dilatedRate_h;
+    U32 dilateW = convParamSpec.dilatedRate_w;
+
+    if (fdf != DF_NHWCN16) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+
+    oc /= 8;
+    ic /= 8;
+    U32 ih_pad = ih + paddingT + paddingB;
+    U32 iw_pad = iw + paddingL + paddingR;
+    I32 ohow = oh * ow;
+    U32 ihiw = ih_pad * iw_pad;
+    F16 *inArray_pad;
+    EE ret = SUCCESS;
+    for (U32 n = 0; n < in; n++) {
+        if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) {
+            inArray_pad = inArray + n * ic * ih * iw * 8;
+        } else {
+            // copy the input into a zero-padded buffer
+            inArray_pad = (F16 *)tmp;
+            F16 *inArray_pad_mov = inArray_pad;
+            F16 *inArray_mov = inArray + n * ic * ih * iw * 8;
+            for (U32 c = 0; c < ic; c++) {
+                for (U32 h = 0; h < paddingT; h++) {
+                    memset(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(idt));
+                    inArray_pad_mov += iw_pad * 8;
+                }
+                for (U32 h = paddingT; h < ih_pad - paddingB; h++) {
+                    memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(idt));
+                    inArray_pad_mov += paddingL * 8;
+                    memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt));
+                    inArray_pad_mov += iw * 8;
+                    inArray_mov += iw * 8;
+                    memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(idt));
+                    inArray_pad_mov += paddingR * 8;
+                }
+                for (U32 h = ih_pad - paddingB; h < ih_pad; h++) {
+                    memset(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(idt));
+                    inArray_pad_mov += iw_pad * 8;
+                }
+            }
+        }
+        // ohow / 8
+        for (I32 hw = 0; hw < ohow - 7; hw += 8) {
+            const F16 *b0 = biasArray;
+            const F16 *b1 = biasArray + 8;
+            const F16 *f_o0c0 = filterArray;
+            F16 *in_pack = ((F16 *)tmp) + ic * ih_pad * iw_pad * 8;
+            // pack input
+            // NCHWc8 => NHWChw8 + im2col
+            U32 in_h_0 = (hw / ow) * strideH;
+            U32 in_w_0 = (hw % ow) * strideW;
+            U32 in_h_1 = ((hw + 1) / ow) * strideH;
+            U32 in_w_1 = ((hw + 1) % ow) * strideW;
+            U32 in_h_2 = ((hw + 2) / ow) * strideH;
+            U32 in_w_2 = ((hw + 2) % ow) * strideW;
+            U32 in_h_3 = ((hw + 3) / ow) * strideH;
+            U32 in_w_3 = ((hw + 3) % ow) * strideW;
+            U32 in_h_4 = ((hw + 4) / ow) * strideH;
+            U32 in_w_4 = ((hw + 4) % ow) * strideW;
+            U32 in_h_5 = ((hw + 5) / ow) * strideH;
+            U32 in_w_5 = ((hw + 5) % ow) * strideW;
+            U32 in_h_6 = ((hw + 6) / ow) * strideH;
+            U32 in_w_6 = ((hw + 6) % ow) * strideW;
+            U32 in_h_7 = ((hw + 7) / ow) * strideH;
+            U32 in_w_7 = ((hw + 7) % ow) * strideW;
+            for (U32 c = 0; c < ic; c++) {
+                for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) {
+                    for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) {
+                        F16 *in_hw8c8 = inArray_pad + c * ihiw * 8 + fh_idx * dilateH * iw_pad * 8 +
+                            fw_idx * dilateW * 8;
+                        F16 *in_0 = in_hw8c8 +
in_h_0 * iw_pad * 8 + in_w_0 * 8; + F16 *in_1 = in_hw8c8 + in_h_1 * iw_pad * 8 + in_w_1 * 8; + F16 *in_2 = in_hw8c8 + in_h_2 * iw_pad * 8 + in_w_2 * 8; + F16 *in_3 = in_hw8c8 + in_h_3 * iw_pad * 8 + in_w_3 * 8; + F16 *in_4 = in_hw8c8 + in_h_4 * iw_pad * 8 + in_w_4 * 8; + F16 *in_5 = in_hw8c8 + in_h_5 * iw_pad * 8 + in_w_5 * 8; + F16 *in_6 = in_hw8c8 + in_h_6 * iw_pad * 8 + in_w_6 * 8; + F16 *in_7 = in_hw8c8 + in_h_7 * iw_pad * 8 + in_w_7 * 8; + + // NHWChw8 + F16 *in_pack_c8hw8 = + in_pack + fh_idx * fw * ic * 8 * 8 + fw_idx * ic * 8 * 8 + c * 8 * 8; + /* + * for (U32 c8 = 0; c8 < 8; c8++) { + * for (U32 hw8 = 0; hw8 < 8; hw8++) { + * in_pack_c8hw8[c8*8 + hw8] = in_hw8c8[hw8*8 + c8]; + * } + * } + */ + float16x8_t v0 = vld1q_f16(in_0); + float16x8_t v1 = vld1q_f16(in_1); + float16x8_t v2 = vld1q_f16(in_2); + float16x8_t v3 = vld1q_f16(in_3); + float16x8_t v4 = vld1q_f16(in_4); + float16x8_t v5 = vld1q_f16(in_5); + float16x8_t v6 = vld1q_f16(in_6); + float16x8_t v7 = vld1q_f16(in_7); + vst1q_f16(in_pack_c8hw8, + vzip1q_f16(vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8, + vzip2q_f16(vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 2, + vzip1q_f16(vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 3, + vzip2q_f16(vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 4, + vzip1q_f16(vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 5, + vzip2q_f16(vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 6, + vzip1q_f16(vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 7, + vzip2q_f16(vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + } + } + } + + // compute + for (I32 o = 0; o < I32(oc) - 1; o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr d22, [%[b_0]]\n" // b_o0 + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "ldr d23, [%[b_1]]\n" // b_o1 + "ldr x2, [%[b_1], #8]\n" + "ins v23.d[1], x2\n" + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr x1, [%[in_0], #8]\n" + "mov v4.16b, v22.16b\n" // out_o0hw2 + "ins v0.d[1], x1\n" + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v6.16b, v22.16b\n" // out_o0hw4 + "ldr x2, [%[f_0], #8]\n" + "mov v7.16b, v22.16b\n" // out_o0hw5 + "ins v18.d[1], x2\n" + "mov v8.16b, v22.16b\n" // out_o0hw6 + "ldr d19, [%[f_0], #16]\n" // f_o1c0 + "mov v9.16b, v22.16b\n" // out_o0hw7 + "ldr x3, [%[f_0], #24]\n" + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ins v19.d[1], x3\n" + "mov v11.16b, v23.16b\n" // out_o1hw1 + "mov v12.16b, v23.16b\n" // out_o1hw2 + "mov v13.16b, v23.16b\n" // out_o1hw3 + "mov v14.16b, v23.16b\n" // out_o1hw4 + "mov v15.16b, v23.16b\n" // out_o1hw5 + "mov v16.16b, 
v23.16b\n" // out_o1hw6 + "mov v17.16b, v23.16b\n" // out_o1hw7 + "0:\n" + "ldr d1, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr x1, [%[in_0], #24]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ins v1.d[1], x1\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ldr d20, [%[f_0], #32]\n" // f_o0c0 + "fmla v5.8h, v18.8h, v0.h[3]\n" + "ldr x2, [%[f_0], #40]\n" + "fmla v6.8h, v18.8h, v0.h[4]\n" + "ins v20.d[1], x2\n" + "fmla v7.8h, v18.8h, v0.h[5]\n" + "ldr d21, [%[f_0], #48]\n" // f_o1c0 + "fmla v8.8h, v18.8h, v0.h[6]\n" + "ldr x3, [%[f_0], #56]\n" + "fmla v9.8h, v18.8h, v0.h[7]\n" + "ins v21.d[1], x3\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + "fmla v14.8h, v19.8h, v0.h[4]\n" + "fmla v15.8h, v19.8h, v0.h[5]\n" + "fmla v16.8h, v19.8h, v0.h[6]\n" + "fmla v17.8h, v19.8h, v0.h[7]\n" + + "ldr d0, [%[in_0], #32]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr x1, [%[in_0], #40]\n" + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ins v0.d[1], x1\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ldr d18, [%[f_0], #64]\n" // f_o0c0 + "fmla v5.8h, v20.8h, v1.h[3]\n" + "ldr x2, [%[f_0], #72]\n" + "fmla v6.8h, v20.8h, v1.h[4]\n" + "ins v18.d[1], x2\n" + "fmla v7.8h, v20.8h, v1.h[5]\n" + "ldr d19, [%[f_0], #80]\n" // f_o1c0 + "fmla v8.8h, v20.8h, v1.h[6]\n" + "ldr x3, [%[f_0], #88]\n" + "fmla v9.8h, v20.8h, v1.h[7]\n" + "ins v19.d[1], x3\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "add %[in_0], %[in_0], #32\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "add %[f_0], %[f_0], #64\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "subs x0, x0, #2\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "fmla v14.8h, v21.8h, v1.h[4]\n" + "fmla v15.8h, v21.8h, v1.h[5]\n" + "fmla v16.8h, v21.8h, v1.h[6]\n" + "fmla v17.8h, v21.8h, v1.h[7]\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "x0", "x1", "x2", "x3"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v6.8h, v6.8h, v1.8h\n" + "fmax v7.8h, v7.8h, v1.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + "fmax v14.8h, v14.8h, v1.8h\n" + "fmax v15.8h, v15.8h, v1.8h\n" + "fmax v16.8h, v16.8h, v1.8h\n" + "fmax v17.8h, v17.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v6.8h, v6.8h, v31.8h\n" + "fmax v7.8h, v7.8h, v31.8h\n" + "fmax v8.8h, v8.8h, v31.8h\n" + "fmax v9.8h, v9.8h, v31.8h\n" + "fmax v10.8h, v10.8h, v31.8h\n" + "fmax v11.8h, v11.8h, v31.8h\n" + "fmax v12.8h, v12.8h, v31.8h\n" + "fmax v13.8h, v13.8h, v31.8h\n" + "fmax v14.8h, v14.8h, v31.8h\n" + "fmax v15.8h, 
v15.8h, v31.8h\n" + "fmax v16.8h, v16.8h, v31.8h\n" + "fmax v17.8h, v17.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + "fmin v6.8h, v6.8h, v30.8h\n" + "fmin v7.8h, v7.8h, v30.8h\n" + "fmin v8.8h, v8.8h, v30.8h\n" + "fmin v9.8h, v9.8h, v30.8h\n" + "fmin v10.8h, v10.8h, v30.8h\n" + "fmin v11.8h, v11.8h, v30.8h\n" + "fmin v12.8h, v12.8h, v30.8h\n" + "fmin v13.8h, v13.8h, v30.8h\n" + "fmin v14.8h, v14.8h, v30.8h\n" + "fmin v15.8h, v15.8h, v30.8h\n" + "fmin v16.8h, v16.8h, v30.8h\n" + "fmin v17.8h, v17.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q6, [%[out_0], #64]\n" // out_o0hw4 + "str q7, [%[out_0], #80]\n" // out_o0hw5 + "str q8, [%[out_0], #96]\n" // out_o0hw6 + "str q9, [%[out_0], #112]\n" // out_o0hw7 + "str q10, [%[out_1]]\n" // out_o1hw0 + "str q11, [%[out_1], #16]\n" // out_o1hw1 + "str q12, [%[out_1], #32]\n" // out_o1hw2 + "str q13, [%[out_1], #48]\n" // out_o1hw3 + "str q14, [%[out_1], #64]\n" // out_o1hw4 + "str q15, [%[out_1], #80]\n" // out_o1hw5 + "str q16, [%[out_1], #96]\n" // out_o1hw6 + "str q17, [%[out_1], #112]\n" // out_o1hw7 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = filterArray + (oc - 1) * 8 * fh * fw * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = biasArray + (oc - 1) * 8; + __asm__ __volatile__("ldr q12, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v12.16b\n" // out_o0hw0 + "ldr x1, [%[in_0], #8]\n" + "mov v3.16b, v12.16b\n" // out_o0hw1 + "ins v0.d[1], x1\n" + "mov v4.16b, v12.16b\n" // out_o0hw2 + "ldr d10, [%[f_0]]\n" // f_o0c0 + "mov v5.16b, v12.16b\n" // out_o0hw3 + "ldr x2, [%[f_0], #8]\n" + "mov v6.16b, v12.16b\n" // out_o0hw4 + "ins v10.d[1], x2\n" + "mov v7.16b, v12.16b\n" // out_o0hw5 + "mov v8.16b, v12.16b\n" // out_o0hw6 + "mov v9.16b, v12.16b\n" // out_o0hw7 + "0:\n" + "ldr d1, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v10.8h, v0.h[0]\n" + "ldr x1, [%[in_0], #24]\n" + "fmla v3.8h, v10.8h, v0.h[1]\n" + "ins v1.d[1], x1\n" + "fmla v4.8h, v10.8h, v0.h[2]\n" + "ldr d11, [%[f_0], #16]\n" // f_o0c0 + "fmla v5.8h, v10.8h, v0.h[3]\n" + "ldr x2, [%[f_0], #24]\n" + "fmla v6.8h, v10.8h, v0.h[4]\n" + "ins v11.d[1], x2\n" + "fmla v7.8h, v10.8h, v0.h[5]\n" + "subs x0, x0, #2\n" + "fmla v8.8h, v10.8h, v0.h[6]\n" + "fmla v9.8h, v10.8h, v0.h[7]\n" + + "ldr d0, [%[in_0], #32]\n" // in_hw0 + "fmla v2.8h, v11.8h, v1.h[0]\n" + "ldr x1, [%[in_0], #40]\n" + "fmla v3.8h, v11.8h, v1.h[1]\n" + "ins v0.d[1], x1\n" + "fmla v4.8h, v11.8h, v1.h[2]\n" + "ldr d10, [%[f_0], #32]\n" // f_o0c0 + "fmla v5.8h, v11.8h, v1.h[3]\n" + "ldr x2, [%[f_0], #40]\n" + "fmla v6.8h, v11.8h, v1.h[4]\n" + "ins v10.d[1], x2\n" + "fmla v7.8h, v11.8h, v1.h[5]\n" + "add %[in_0], %[in_0], #32\n" + "fmla v8.8h, v11.8h, v1.h[6]\n" + "add %[f_0], %[f_0], #32\n" + "fmla 
v9.8h, v11.8h, v1.h[7]\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "x0", "x1", "x2"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__( + "eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v6.8h, v6.8h, v1.8h\n" + "fmax v7.8h, v7.8h, v1.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v6.8h, v6.8h, v31.8h\n" + "fmax v7.8h, v7.8h, v31.8h\n" + "fmax v8.8h, v8.8h, v31.8h\n" + "fmax v9.8h, v9.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + "fmin v6.8h, v6.8h, v30.8h\n" + "fmin v7.8h, v7.8h, v30.8h\n" + "fmin v8.8h, v8.8h, v30.8h\n" + "fmin v9.8h, v9.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__( + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw0 + "str q4, [%[out_0], #32]\n" // out_o0hw0 + "str q5, [%[out_0], #48]\n" // out_o0hw0 + "str q6, [%[out_0], #64]\n" // out_o0hw0 + "str q7, [%[out_0], #80]\n" // out_o0hw0 + "str q8, [%[out_0], #96]\n" // out_o0hw0 + "str q9, [%[out_0], #112]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9"); + } + } + + // ohow_reminder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + // U32 ohow_s = (ohow/8)*8; + for (I32 hw = ohow_s; hw < ohow - 3; hw += 4) { + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + const F16 *f_o0c0 = filterArray; + F16 *in_pack = ((F16 *)tmp) + ic * ih_pad * iw_pad * 8; + // pack input + // NCHWc8 => NHWChw4 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + U32 in_h_1 = ((hw + 1) / ow) * strideH; + U32 in_w_1 = ((hw + 1) % ow) * strideW; + U32 in_h_2 = ((hw + 2) / ow) * strideH; + U32 in_w_2 = ((hw + 2) % ow) * strideW; + U32 in_h_3 = ((hw + 3) / ow) * strideH; + U32 in_w_3 = ((hw + 3) % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F16 *in_hw4c8 = inArray_pad + c * ihiw * 8 + fh_idx * dilateH * iw_pad * 8 + + fw_idx * dilateW * 8; + F16 *in_0 = in_hw4c8 + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F16 *in_1 = in_hw4c8 + in_h_1 * iw_pad * 8 + in_w_1 * 8; + F16 *in_2 = in_hw4c8 + in_h_2 * iw_pad * 8 + in_w_2 * 8; + F16 *in_3 = in_hw4c8 + in_h_3 * iw_pad * 8 + in_w_3 * 8; + F16 *in_pack_c8hw4 = + in_pack + fh_idx * fw * ic * 8 * 4 + fw_idx * ic * 8 * 4 + c * 8 * 4; + + /* + * for (U32 c8 = 0; c8 < 8; c8++) { + * for (U32 hw4 = 0; hw4 < 4; hw4++) { + * in_pack_c8hw4[c8*4 + hw4] = in_hw4c8[hw4*8 + c8]; + * } + * } + */ + + __asm__ __volatile__( + "ldr q0, [%[in_0]]\n" + "ldr q1, [%[in_1]]\n" + "ldr q2, [%[in_2]]\n" + "ldr q3, 
[%[in_3]]\n" + "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[in_pack_0]]\n" + : [in_pack_0] "+r"(in_pack_c8hw4) + : [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3) + : "memory", "cc", "v0", "v1", "v2", "v3"); + } + } + } + + // compute + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr d22, [%[b_0]]\n" // b_o0 + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "ldr d23, [%[b_1]]\n" // b_o1 + "ldr x2, [%[b_1], #8]\n" + "ins v23.d[1], x2\n" + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "ldr x2, [%[f_0], #8]\n" + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ins v18.d[1], x2\n" + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ldr d19, [%[f_0], #16]\n" // f_o1c0 + "mov v11.16b, v23.16b\n" // out_o1hw1 + "ldr x3, [%[f_0], #24]\n" + "mov v12.16b, v23.16b\n" // out_o1hw2 + "ins v19.d[1], x3\n" + "mov v13.16b, v23.16b\n" // out_o1hw3 + "0:\n" + "ldr d1, [%[in_0], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f_0], #32]\n" // f_o0c0 + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ldr x2, [%[f_0], #40]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ins v20.d[1], x2\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "ldr d21, [%[f_0], #48]\n" // f_o1c0 + "fmla v10.8h, v19.8h, v0.h[0]\n" + "ldr x3, [%[f_0], #56]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "ins v21.d[1], x3\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "subs x0, x0, #2\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + + "ldr d0, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f_0], #64]\n" // f_o0c0 + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ldr x2, [%[f_0], #72]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ldr d19, [%[f_0], #80]\n" // f_o1c0 + "fmla v5.8h, v20.8h, v1.h[3]\n" + "ins v18.d[1], x2\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "ldr x3, [%[f_0], #88]\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "ins v19.d[1], x3\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "add %[in_0], %[in_0], #16\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "add %[f_0], %[f_0], #64\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v10", "v11", "v12", + "v13", "v18", "v19", "v20", "v21", "v22", "v23", "x0", "x1", "x2", "x3"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5", "v10", + "v11", "v12", "v13"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v10.8h, v10.8h, v31.8h\n" + "fmax v11.8h, v11.8h, v31.8h\n" + "fmax v12.8h, v12.8h, v31.8h\n" + "fmax v13.8h, v13.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, 
v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + "fmin v10.8h, v10.8h, v30.8h\n" + "fmin v11.8h, v11.8h, v30.8h\n" + "fmin v12.8h, v12.8h, v30.8h\n" + "fmin v13.8h, v13.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v10", "v11", + "v12", "v13", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__( + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q10, [%[out_1]]\n" // out_o1hw0 + "str q11, [%[out_1], #16]\n" // out_o1hw1 + "str q12, [%[out_1], #32]\n" // out_o1hw2 + "str q13, [%[out_1], #48]\n" // out_o1hw3 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v10", "v11", "v12", "v13"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = filterArray + (oc - 1) * 8 * fh * fw * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = biasArray + (oc - 1) * 8; + __asm__ __volatile__("ldr d22, [%[b_0]]\n" // b_o0 + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "ldr x2, [%[f_0], #8]\n" + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ins v18.d[1], x2\n" + "0:\n" + "ldr d1, [%[in_0], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f_0], #16]\n" // f_o0c0 + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ldr x2, [%[f_0], #24]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ins v20.d[1], x2\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "subs x0, x0, #2\n" + + "ldr d0, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f_0], #32]\n" // f_o0c0 + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ldr x2, [%[f_0], #40]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ins v18.d[1], x2\n" + "fmla v5.8h, v20.8h, v1.h[3]\n" + "add %[in_0], %[in_0], #16\n" + "add %[f_0], %[f_0], #32\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v18", + "v20", "v22", "x0", "x1", "x2"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", 
"v2", "v3", "v4", "v5"); + } + } + + // ohow_reminder % 4 + ohow_s = (ohow / 4) * 4; + for (I32 hw = ohow_s; hw < ohow; hw++) { + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + const F16 *f_o0c0 = filterArray; + F16 *in_pack = ((F16 *)tmp) + ic * ih_pad * iw_pad * 8; + // pack input + // NCHWc8 => NHWChw4 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F16 *in_hw1c8 = inArray_pad + c * ihiw * 8 + fh_idx * dilateH * iw_pad * 8 + + fw_idx * dilateW * 8; + F16 *in_0 = in_hw1c8 + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F16 *in_pack_c8hw1 = + in_pack + fh_idx * fw * ic * 8 + fw_idx * ic * 8 + c * 8; + /* + * for (U32 c8 = 0; c8 < 8; c8++) { + * in_pack_c8hw1[c8] = in_0[c8]; + * } + */ + memcpy(in_pack_c8hw1, in_0, 8 * bytesOf(idt)); + } + } + } + + // compute + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr d22, [%[b_0]]\n" // b_o0 + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "ldr d23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "ldr x2, [%[b_1], #8]\n" + "ins v23.d[1], x2\n" + "ldr h0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ldr x2, [%[f_0], #8]\n" + "ins v18.d[1], x2\n" + "ldr d19, [%[f_0], #16]\n" // f_o1c0 + "ldr x3, [%[f_0], #24]\n" + "ins v19.d[1], x3\n" + "0:\n" + "ldr h1, [%[in_0], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f_0], #32]\n" // f_o0c0 + "fmla v10.8h, v19.8h, v0.h[0]\n" + "ldr x2, [%[f_0], #40]\n" + "ins v20.d[1], x2\n" + "ldr d21, [%[f_0], #48]\n" // f_o1c0 + "subs x0, x0, #2\n" + "ldr x3, [%[f_0], #56]\n" + "ins v21.d[1], x3\n" + + "ldr h0, [%[in_0], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f_0], #64]\n" // f_o0c0 + "fmla v10.8h, v21.8h, v1.h[0]\n" + "ldr x2, [%[f_0], #72]\n" + "ins v18.d[1], x2\n" + "ldr d19, [%[f_0], #80]\n" // f_o1c0 + "add %[in_0], %[in_0], #4\n" + "ldr x3, [%[f_0], #88]\n" + "ins v19.d[1], x3\n" + "add %[f_0], %[f_0], #64\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v19", "v20", "v21", "v22", + "v23", "x0", "x1", "x2", "x3"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v10.8h, v10.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v10"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v10.8h, v10.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v10.8h, v10.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v10", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + "str q10, [%[out_1]]\n" // out_o1hw0 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0) + : + : "memory", "cc", "v2", "v10"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = 
filterArray + (oc - 1) * 8 * fh * fw * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = biasArray + (oc - 1) * 8; + __asm__ __volatile__("ldr d22, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "ldr h0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "ldr x2, [%[f_0], #8]\n" + "ins v18.d[1], x2\n" + "0:\n" + "ldr h1, [%[in_0], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f_0], #16]\n" // f_o0c0 + "subs x0, x0, #2\n" + "ldr x2, [%[f_0], #24]\n" + "ins v20.d[1], x2\n" + + "ldr h0, [%[in_0], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f_0], #32]\n" // f_o0c0 + "ldr x2, [%[f_0], #40]\n" + "ins v18.d[1], x2\n" + "add %[in_0], %[in_0], #4\n" + "add %[f_0], %[f_0], #32\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v20", "v22", + "x0", "x1", "x2"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + : + : + : "memory", "cc", "v1", "v2"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmin v2.8h, v2.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v2"); + } + } + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_gemm_A76.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_A76.cpp new file mode 100644 index 00000000..028e32b0 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_A76.cpp @@ -0,0 +1,893 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
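[Annotation] Before the A76 kernel repeats the same structure, one constant from the A55 code above is worth decoding: every ReLU6 branch builds its clamp value with "movi v30.8h, #0x46, lsl #8", placing the bit pattern 0x4600 in each 16-bit lane. The standalone check below (a decoder written for this note, not library code) confirms that 0x4600 is exactly 6.0 in IEEE 754 binary16.

// Verifies that 0x4600 decodes to 6.0 in binary16: sign 0, exponent
// 10001b = 17 (bias 15, so 2^2), fraction 0x200/0x400 = 0.5, giving
// (1 + 0.5) * 2^2 = 6.0. Handles normal numbers only, which is all
// this constant needs.
#include <cmath>
#include <cstdint>
#include <cstdio>

static float halfBitsToFloat(uint16_t h)
{
    int signBit = (h >> 15) & 0x1;
    int expBits = (h >> 10) & 0x1F;  // biased exponent, normals only
    int fracBits = h & 0x3FF;
    float value = std::ldexp(1.0f + fracBits / 1024.0f, expBits - 15);
    return signBit ? -value : value;
}

int main()
{
    std::printf("0x4600 as fp16 = %f\n", halfBitsToFloat(0x4600));  // prints 6.000000
    return 0;
}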
+
+#include <arm_neon.h>
+#include "cpu/arm/fp16/convolution_gemm.h"
+
+EE convolution_gemm_A76(TensorDesc inputDesc,
+    F16 *inArray,
+    TensorDesc filterDesc,
+    const F16 *filterArray,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc biasDesc,
+    const F16 *biasArray,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    F16 *outArray,
+    ActivationParamSpec activationDesc)
+{
+    UNUSED(biasDesc);
+    UNUSED(tmpBytes);
+
+    DataType idt, fdt, odt;
+    DataFormat idf, fdf, odf;
+    U32 in, ic, ih, iw;
+    U32 fn, fc, fh, fw;
+    U32 on, oc, oh, ow;
+    CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
+    CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw));
+    CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow));
+    U32 strideH = convParamSpec.stride_h;
+    U32 strideW = convParamSpec.stride_w;
+    U32 paddingT = convParamSpec.padding_top;
+    U32 paddingB = convParamSpec.padding_bottom;
+    U32 paddingL = convParamSpec.padding_left;
+    U32 paddingR = convParamSpec.padding_right;
+    U32 dilateH = convParamSpec.dilatedRate_h;
+    U32 dilateW = convParamSpec.dilatedRate_w;
+
+    if (fdf != DF_NHWCN16) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+
+    oc /= 8;
+    ic /= 8;
+    U32 ih_pad = ih + paddingT + paddingB;
+    U32 iw_pad = iw + paddingL + paddingR;
+    I32 ohow = oh * ow;
+    U32 ihiw = ih_pad * iw_pad;
+    F16 *inArray_pad;
+    EE ret = SUCCESS;
+    for (U32 n = 0; n < in; n++) {
+        if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) {
+            inArray_pad = inArray + n * ic * ih * iw * 8;
+        } else {
+            // copy the input into a zero-padded buffer
+            inArray_pad = (F16 *)tmp;
+            F16 *inArray_pad_mov = inArray_pad;
+            F16 *inArray_mov = inArray + n * ic * ih * iw * 8;
+            for (U32 c = 0; c < ic; c++) {
+                for (U32 h = 0; h < paddingT; h++) {
+                    memset(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(idt));
+                    inArray_pad_mov += iw_pad * 8;
+                }
+                for (U32 h = paddingT; h < ih_pad - paddingB; h++) {
+                    memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(idt));
+                    inArray_pad_mov += paddingL * 8;
+                    memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt));
+                    inArray_pad_mov += iw * 8;
+                    inArray_mov += iw * 8;
+                    memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(idt));
+                    inArray_pad_mov += paddingR * 8;
+                }
+                for (U32 h = ih_pad - paddingB; h < ih_pad; h++) {
+                    memset(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(idt));
+                    inArray_pad_mov += iw_pad * 8;
+                }
+            }
+        }
+        // ohow / 8
+        for (I32 hw = 0; hw < ohow - 7; hw += 8) {
+            const F16 *b0 = biasArray;
+            const F16 *b1 = biasArray + 8;
+            const F16 *f_o0c0 = filterArray;
+            F16 *in_pack = ((F16 *)tmp) + ic * ih_pad * iw_pad * 8;
+            // pack input
+            // NCHWc8 => NHWChw8 + im2col
+            U32 in_h_0 = (hw / ow) * strideH;
+            U32 in_w_0 = (hw % ow) * strideW;
+            U32 in_h_1 = ((hw + 1) / ow) * strideH;
+            U32 in_w_1 = ((hw + 1) % ow) * strideW;
+            U32 in_h_2 = ((hw + 2) / ow) * strideH;
+            U32 in_w_2 = ((hw + 2) % ow) * strideW;
+            U32 in_h_3 = ((hw + 3) / ow) * strideH;
+            U32 in_w_3 = ((hw + 3) % ow) * strideW;
+            U32 in_h_4 = ((hw + 4) / ow) * strideH;
+            U32 in_w_4 = ((hw + 4) % ow) * strideW;
+            U32 in_h_5 = ((hw + 5) / ow) * strideH;
+            U32 in_w_5 = ((hw + 5) % ow) * strideW;
+            U32 in_h_6 = ((hw + 6) / ow) * strideH;
+            U32 in_w_6 = ((hw + 6) % ow) * strideW;
+            U32 in_h_7 = ((hw + 7) / ow) * strideH;
+            U32 in_w_7 = ((hw + 7) % ow) * strideW;
+            for (U32 c = 0; c < ic; c++) {
+                for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) {
+                    for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) {
+                        F16 *in_hw8c8 = inArray_pad + c * ihiw * 8 + fh_idx * dilateH * iw_pad * 8 +
+                            fw_idx * dilateW * 8;
+                        F16 *in_0 = in_hw8c8 +
in_h_0 * iw_pad * 8 + in_w_0 * 8; + F16 *in_1 = in_hw8c8 + in_h_1 * iw_pad * 8 + in_w_1 * 8; + F16 *in_2 = in_hw8c8 + in_h_2 * iw_pad * 8 + in_w_2 * 8; + F16 *in_3 = in_hw8c8 + in_h_3 * iw_pad * 8 + in_w_3 * 8; + F16 *in_4 = in_hw8c8 + in_h_4 * iw_pad * 8 + in_w_4 * 8; + F16 *in_5 = in_hw8c8 + in_h_5 * iw_pad * 8 + in_w_5 * 8; + F16 *in_6 = in_hw8c8 + in_h_6 * iw_pad * 8 + in_w_6 * 8; + F16 *in_7 = in_hw8c8 + in_h_7 * iw_pad * 8 + in_w_7 * 8; + + // NHWChw8 + F16 *in_pack_c8hw8 = + in_pack + fh_idx * fw * ic * 8 * 8 + fw_idx * ic * 8 * 8 + c * 8 * 8; + /* + * for (U32 c8 = 0; c8 < 8; c8++) { + * for (U32 hw8 = 0; hw8 < 8; hw8++) { + * in_pack_c8hw8[c8*8 + hw8] = in_hw8c8[hw8*8 + c8]; + * } + * } + */ + float16x8_t v0 = vld1q_f16(in_0); + float16x8_t v1 = vld1q_f16(in_1); + float16x8_t v2 = vld1q_f16(in_2); + float16x8_t v3 = vld1q_f16(in_3); + float16x8_t v4 = vld1q_f16(in_4); + float16x8_t v5 = vld1q_f16(in_5); + float16x8_t v6 = vld1q_f16(in_6); + float16x8_t v7 = vld1q_f16(in_7); + vst1q_f16(in_pack_c8hw8, + vzip1q_f16(vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8, + vzip2q_f16(vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 2, + vzip1q_f16(vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 3, + vzip2q_f16(vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 4, + vzip1q_f16(vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 5, + vzip2q_f16(vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 6, + vzip1q_f16(vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(in_pack_c8hw8 + 8 * 7, + vzip2q_f16(vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + } + } + } + + // compute + for (I32 o = 0; o < I32(oc) - 1; o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr q22, [%[b_0]]\n" // b_o0 + "ldr q23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr q0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov v6.16b, v22.16b\n" // out_o0hw4 + "mov v7.16b, v22.16b\n" // out_o0hw5 + "mov v8.16b, v22.16b\n" // out_o0hw6 + "ldr q19, [%[f_0], #16]\n" // f_o1c0 + "mov v9.16b, v22.16b\n" // out_o0hw7 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "mov v11.16b, v23.16b\n" // out_o1hw1 + "mov v12.16b, v23.16b\n" // out_o1hw2 + "mov v13.16b, v23.16b\n" // out_o1hw3 + "mov v14.16b, v23.16b\n" // out_o1hw4 + "mov v15.16b, v23.16b\n" // out_o1hw5 + "mov v16.16b, v23.16b\n" // out_o1hw6 + "mov v17.16b, v23.16b\n" // out_o1hw7 + "0:\n" + "ldr q1, [%[in_0], #16]\n" // in_hw0 + "ldr q20, [%[f_0], #32]\n" // f_o0c0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ldr q21, [%[f_0], #48]\n" // f_o1c0 
+ "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "fmla v6.8h, v18.8h, v0.h[4]\n" + "fmla v7.8h, v18.8h, v0.h[5]\n" + "fmla v8.8h, v18.8h, v0.h[6]\n" + "fmla v9.8h, v18.8h, v0.h[7]\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + "fmla v14.8h, v19.8h, v0.h[4]\n" + "fmla v15.8h, v19.8h, v0.h[5]\n" + "fmla v16.8h, v19.8h, v0.h[6]\n" + "fmla v17.8h, v19.8h, v0.h[7]\n" + "subs x0, x0, #2\n" + + "ldr q0, [%[in_0], #32]\n" // in_hw0 + "ldr q18, [%[f_0], #64]\n" // f_o0c0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ldr q19, [%[f_0], #80]\n" // f_o1c0 + "fmla v4.8h, v20.8h, v1.h[2]\n" + "fmla v5.8h, v20.8h, v1.h[3]\n" + "fmla v6.8h, v20.8h, v1.h[4]\n" + "fmla v7.8h, v20.8h, v1.h[5]\n" + "fmla v8.8h, v20.8h, v1.h[6]\n" + "fmla v9.8h, v20.8h, v1.h[7]\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "add %[in_0], %[in_0], #32\n" + "add %[f_0], %[f_0], #64\n" + "fmla v14.8h, v21.8h, v1.h[4]\n" + "fmla v15.8h, v21.8h, v1.h[5]\n" + "fmla v16.8h, v21.8h, v1.h[6]\n" + "fmla v17.8h, v21.8h, v1.h[7]\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "x0"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v6.8h, v6.8h, v1.8h\n" + "fmax v7.8h, v7.8h, v1.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + "fmax v14.8h, v14.8h, v1.8h\n" + "fmax v15.8h, v15.8h, v1.8h\n" + "fmax v16.8h, v16.8h, v1.8h\n" + "fmax v17.8h, v17.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v6.8h, v6.8h, v31.8h\n" + "fmax v7.8h, v7.8h, v31.8h\n" + "fmax v8.8h, v8.8h, v31.8h\n" + "fmax v9.8h, v9.8h, v31.8h\n" + "fmax v10.8h, v10.8h, v31.8h\n" + "fmax v11.8h, v11.8h, v31.8h\n" + "fmax v12.8h, v12.8h, v31.8h\n" + "fmax v13.8h, v13.8h, v31.8h\n" + "fmax v14.8h, v14.8h, v31.8h\n" + "fmax v15.8h, v15.8h, v31.8h\n" + "fmax v16.8h, v16.8h, v31.8h\n" + "fmax v17.8h, v17.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + "fmin v6.8h, v6.8h, v30.8h\n" + "fmin v7.8h, v7.8h, v30.8h\n" + "fmin v8.8h, v8.8h, v30.8h\n" + "fmin v9.8h, v9.8h, v30.8h\n" + "fmin v10.8h, v10.8h, v30.8h\n" + "fmin v11.8h, v11.8h, v30.8h\n" + "fmin v12.8h, v12.8h, v30.8h\n" + "fmin v13.8h, v13.8h, v30.8h\n" + "fmin v14.8h, v14.8h, v30.8h\n" + "fmin v15.8h, v15.8h, v30.8h\n" + "fmin v16.8h, v16.8h, v30.8h\n" + 
"fmin v17.8h, v17.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q6, [%[out_0], #64]\n" // out_o0hw4 + "str q7, [%[out_0], #80]\n" // out_o0hw5 + "str q8, [%[out_0], #96]\n" // out_o0hw6 + "str q9, [%[out_0], #112]\n" // out_o0hw7 + "str q10, [%[out_1]]\n" // out_o1hw0 + "str q11, [%[out_1], #16]\n" // out_o1hw1 + "str q12, [%[out_1], #32]\n" // out_o1hw2 + "str q13, [%[out_1], #48]\n" // out_o1hw3 + "str q14, [%[out_1], #64]\n" // out_o1hw4 + "str q15, [%[out_1], #80]\n" // out_o1hw5 + "str q16, [%[out_1], #96]\n" // out_o1hw6 + "str q17, [%[out_1], #112]\n" // out_o1hw7 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = filterArray + (oc - 1) * 8 * fh * fw * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = biasArray + (oc - 1) * 8; + __asm__ __volatile__("ldr q12, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr q0, [%[in_0]]\n" // in_hw0 + "ldr q10, [%[f_0]]\n" // f_o0c0 + "mov v2.16b, v12.16b\n" // out_o0hw0 + "mov v3.16b, v12.16b\n" // out_o0hw1 + "mov v4.16b, v12.16b\n" // out_o0hw2 + "mov v5.16b, v12.16b\n" // out_o0hw3 + "mov v6.16b, v12.16b\n" // out_o0hw4 + "mov v7.16b, v12.16b\n" // out_o0hw5 + "mov v8.16b, v12.16b\n" // out_o0hw6 + "mov v9.16b, v12.16b\n" // out_o0hw7 + "0:\n" + "ldr q1, [%[in_0], #16]\n" // in_hw0 + "ldr q11, [%[f_0], #16]\n" // f_o0c0 + "fmla v2.8h, v10.8h, v0.h[0]\n" + "fmla v3.8h, v10.8h, v0.h[1]\n" + "fmla v4.8h, v10.8h, v0.h[2]\n" + "fmla v5.8h, v10.8h, v0.h[3]\n" + "fmla v6.8h, v10.8h, v0.h[4]\n" + "fmla v7.8h, v10.8h, v0.h[5]\n" + "fmla v8.8h, v10.8h, v0.h[6]\n" + "fmla v9.8h, v10.8h, v0.h[7]\n" + "subs x0, x0, #2\n" + + "ldr q0, [%[in_0], #32]\n" // in_hw0 + "ldr q10, [%[f_0], #32]\n" // f_o0c0 + "fmla v2.8h, v11.8h, v1.h[0]\n" + "fmla v3.8h, v11.8h, v1.h[1]\n" + "fmla v4.8h, v11.8h, v1.h[2]\n" + "fmla v5.8h, v11.8h, v1.h[3]\n" + "add %[in_0], %[in_0], #32\n" + "add %[f_0], %[f_0], #32\n" + "fmla v6.8h, v11.8h, v1.h[4]\n" + "fmla v7.8h, v11.8h, v1.h[5]\n" + "fmla v8.8h, v11.8h, v1.h[6]\n" + "fmla v9.8h, v11.8h, v1.h[7]\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "x0"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__( + "eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v6.8h, v6.8h, v1.8h\n" + "fmax v7.8h, v7.8h, v1.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + 
"fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v6.8h, v6.8h, v31.8h\n" + "fmax v7.8h, v7.8h, v31.8h\n" + "fmax v8.8h, v8.8h, v31.8h\n" + "fmax v9.8h, v9.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + "fmin v6.8h, v6.8h, v30.8h\n" + "fmin v7.8h, v7.8h, v30.8h\n" + "fmin v8.8h, v8.8h, v30.8h\n" + "fmin v9.8h, v9.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__( + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw0 + "str q4, [%[out_0], #32]\n" // out_o0hw0 + "str q5, [%[out_0], #48]\n" // out_o0hw0 + "str q6, [%[out_0], #64]\n" // out_o0hw0 + "str q7, [%[out_0], #80]\n" // out_o0hw0 + "str q8, [%[out_0], #96]\n" // out_o0hw0 + "str q9, [%[out_0], #112]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9"); + } + } + + // ohow_reminder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + + for (I32 hw = ohow_s; hw < ohow - 3; hw += 4) { + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + const F16 *f_o0c0 = filterArray; + F16 *in_pack = ((F16 *)tmp) + ic * ih_pad * iw_pad * 8; + // pack input + // NCHWc8 => NHWChw4 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + U32 in_h_1 = ((hw + 1) / ow) * strideH; + U32 in_w_1 = ((hw + 1) % ow) * strideW; + U32 in_h_2 = ((hw + 2) / ow) * strideH; + U32 in_w_2 = ((hw + 2) % ow) * strideW; + U32 in_h_3 = ((hw + 3) / ow) * strideH; + U32 in_w_3 = ((hw + 3) % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F16 *in_hw4c8 = inArray_pad + c * ihiw * 8 + fh_idx * dilateH * iw_pad * 8 + + fw_idx * dilateW * 8; + F16 *in_0 = in_hw4c8 + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F16 *in_1 = in_hw4c8 + in_h_1 * iw_pad * 8 + in_w_1 * 8; + F16 *in_2 = in_hw4c8 + in_h_2 * iw_pad * 8 + in_w_2 * 8; + F16 *in_3 = in_hw4c8 + in_h_3 * iw_pad * 8 + in_w_3 * 8; + F16 *in_pack_c8hw4 = + in_pack + fh_idx * fw * ic * 8 * 4 + fw_idx * ic * 8 * 4 + c * 8 * 4; + + /* + * for (U32 c8 = 0; c8 < 8; c8++) { + * for (U32 hw4 = 0; hw4 < 4; hw4++) { + * in_pack_c8hw4[c8*4 + hw4] = in_hw4c8[hw4*8 + c8]; + * } + * } + */ + + __asm__ __volatile__( + "ldr q0, [%[in_0]]\n" + "ldr q1, [%[in_1]]\n" + "ldr q2, [%[in_2]]\n" + "ldr q3, [%[in_3]]\n" + "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[in_pack_0]]\n" + : [in_pack_0] "+r"(in_pack_c8hw4) + : [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3) + : "memory", "cc", "v0", "v1", "v2", "v3"); + } + } + } + + // compute + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr q22, [%[b_0]]\n" // b_o0 + "ldr q23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "ldr d0, [%[in_0]]\n" // in_hw0 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "ldr q19, [%[f_0], #16]\n" // f_o1c0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "mov v5.16b, v22.16b\n" // out_o0hw3 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "mov v11.16b, v23.16b\n" // 
out_o1hw1 + "mov v12.16b, v23.16b\n" // out_o1hw2 + "mov v13.16b, v23.16b\n" // out_o1hw3 + "0:\n" + "ldr d1, [%[in_0], #8]\n" // in_hw0 + "ldr q20, [%[f_0], #32]\n" // f_o0c0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ldr q21, [%[f_0], #48]\n" // f_o1c0 + "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + "subs x0, x0, #2\n" + + "ldr d0, [%[in_0], #16]\n" // in_hw0 + "ldr q18, [%[f_0], #64]\n" // f_o0c0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ldr q19, [%[f_0], #80]\n" // f_o1c0 + "fmla v4.8h, v20.8h, v1.h[2]\n" + "fmla v5.8h, v20.8h, v1.h[3]\n" + "add %[in_0], %[in_0], #16\n" + "add %[f_0], %[f_0], #64\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v10", "v11", "v12", + "v13", "v18", "v19", "v20", "v21", "v22", "v23", "x0"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5", "v10", + "v11", "v12", "v13"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v10.8h, v10.8h, v31.8h\n" + "fmax v11.8h, v11.8h, v31.8h\n" + "fmax v12.8h, v12.8h, v31.8h\n" + "fmax v13.8h, v13.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + "fmin v10.8h, v10.8h, v30.8h\n" + "fmin v11.8h, v11.8h, v30.8h\n" + "fmin v12.8h, v12.8h, v30.8h\n" + "fmin v13.8h, v13.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v10", "v11", + "v12", "v13", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__( + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q10, [%[out_1]]\n" // out_o1hw0 + "str q11, [%[out_1], #16]\n" // out_o1hw1 + "str q12, [%[out_1], #32]\n" // out_o1hw2 + "str q13, [%[out_1], #48]\n" // out_o1hw3 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v10", "v11", "v12", "v13"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = filterArray + (oc - 1) * 8 * fh * fw * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = biasArray + (oc - 1) * 8; + __asm__ __volatile__("ldr q22, [%[b_0]]\n" // b_o0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 
+ "mov v3.16b, v22.16b\n" // out_o0hw1 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "mov v5.16b, v22.16b\n" // out_o0hw3 + "0:\n" + "ldr d1, [%[in_0], #8]\n" // in_hw0 + "ldr q20, [%[f_0], #16]\n" // f_o0c0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "subs x0, x0, #2\n" + + "ldr d0, [%[in_0], #16]\n" // in_hw0 + "ldr q18, [%[f_0], #32]\n" // f_o0c0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "fmla v3.8h, v20.8h, v1.h[1]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "fmla v5.8h, v20.8h, v1.h[3]\n" + "add %[in_0], %[in_0], #16\n" + "add %[f_0], %[f_0], #32\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v18", + "v20", "v22", "x0"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5"); + } + } + + // ohow_reminder % 4 + ohow_s = (ohow / 4) * 4; + for (I32 hw = ohow_s; hw < ohow; hw++) { + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + const F16 *f_o0c0 = filterArray; + F16 *in_pack = ((F16 *)tmp) + ic * ih_pad * iw_pad * 8; + // pack input + // NCHWc8 => NHWChw4 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F16 *in_hw1c8 = inArray_pad + c * ihiw * 8 + fh_idx * dilateH * iw_pad * 8 + + fw_idx * dilateW * 8; + F16 *in_0 = in_hw1c8 + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F16 *in_pack_c8hw1 = + in_pack + fh_idx * fw * ic * 8 + fw_idx * ic * 8 + c * 8; + /* + * for (U32 c8 = 0; c8 < 8; c8++) { + * in_pack_c8hw1[c8] = in_0[c8]; + * } + */ + memcpy(in_pack_c8hw1, in_0, 8 * bytesOf(idt)); + } + } + } + + // compute + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr q22, [%[b_0]]\n" // b_o0 + "ldr q23, [%[b_1]]\n" // b_o1 + "ldr h0, [%[in_0]]\n" // in_hw0 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "ldr q19, [%[f_0], #16]\n" // f_o1c0 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "0:\n" + "ldr h1, [%[in_0], #2]\n" // in_hw0 + "ldr q20, [%[f_0], #32]\n" // f_o0c0 + "ldr q21, [%[f_0], 
#48]\n" // f_o1c0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "subs x0, x0, #2\n" + + "ldr h0, [%[in_0], #4]\n" // in_hw0 + "ldr q18, [%[f_0], #64]\n" // f_o0c0 + "ldr q19, [%[f_0], #80]\n" // f_o1c0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "add %[in_0], %[in_0], #4\n" + "add %[f_0], %[f_0], #64\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v19", "v20", "v21", "v22", + "v23", "x0"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v10.8h, v10.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v10"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v10.8h, v10.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v10.8h, v10.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v10", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + "str q10, [%[out_1]]\n" // out_o1hw0 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0) + : + : "memory", "cc", "v2", "v10"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = filterArray + (oc - 1) * 8 * fh * fw * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = biasArray + (oc - 1) * 8; + __asm__ __volatile__( + "ldr q22, [%[b_0]]\n" // b_o0 + "ldr h0, [%[in_0]]\n" // in_hw0 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "0:\n" + "ldr h1, [%[in_0], #2]\n" // in_hw0 + "ldr q20, [%[f_0], #16]\n" // f_o0c0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "subs x0, x0, #2\n" + + "ldr h0, [%[in_0], #4]\n" // in_hw0 + "ldr q18, [%[f_0], #32]\n" // f_o0c0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "add %[in_0], %[in_0], #4\n" + "add %[f_0], %[f_0], #32\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v20", "v22", "x0"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + : + : + : "memory", "cc", "v1", "v2"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmin v2.8h, v2.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v2"); + } + } + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw.h b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw.h new file mode 100644 index 00000000..17778b77 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw.h @@ -0,0 +1,79 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _H_CONVOLUTION_GEMM_ICNCHW
+#define _H_CONVOLUTION_GEMM_ICNCHW
+
+#include <arm_neon.h>  // assumed: the bracketed include target was lost in extraction
+#include "sys.h"
+#include "types.h"
+#include "error.h"
+
+EE convolution_gemm_icnchw_A55(TensorDesc inputDesc,
+    F16 *inArray,
+    TensorDesc filterDesc,
+    const F16 *filterArray,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc biasDesc,
+    const F16 *biasArray,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    F16 *outArray,
+    ActivationParamSpec activationDesc);
+
+EE convolution_gemm_icnchw_A76(TensorDesc inputDesc,
+    F16 *inArray,
+    TensorDesc filterDesc,
+    const F16 *filterArray,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc biasDesc,
+    const F16 *biasArray,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    F16 *outArray,
+    ActivationParamSpec activationDesc);
+
+inline EE convolution_gemm_icnchw(TensorDesc inputDesc,
+    F16 *inArray,
+    TensorDesc filterDesc,
+    const F16 *filterArray,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc biasDesc,
+    const F16 *biasArray,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    F16 *outArray,
+    ActivationParamSpec activationDesc,
+    Arch arch)
+{
+    EE ret = SUCCESS;
+    switch (arch) {
+        case ARM_A55:
+            ret = convolution_gemm_icnchw_A55(inputDesc, inArray, filterDesc, filterArray,
+                convParamSpec, biasDesc, biasArray, tmpBytes, tmp, outputDesc, outArray,
+                activationDesc);
+            break;
+        case ARM_A76:
+            ret = convolution_gemm_icnchw_A76(inputDesc, inArray, filterDesc, filterArray,
+                convParamSpec, biasDesc, biasArray, tmpBytes, tmp, outputDesc, outArray,
+                activationDesc);
+            break;
+        default:
+            return NOT_SUPPORTED;
+    }
+    return ret;
+}
+#endif
diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw_A55.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw_A55.cpp
new file mode 100644
index 00000000..e51733cf
--- /dev/null
+++ b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw_A55.cpp
@@ -0,0 +1,1003 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
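
// A hedged usage sketch for the convolution_gemm_icnchw dispatcher declared
// in the header above: the caller resolves the target micro-architecture once
// and passes it down, so the A55/A76 branch is taken per layer rather than
// per output tile. Variable names below are illustrative placeholders.
//
//   Arch arch = ARM_A76;  // e.g. chosen from CPU detection at startup
//   CHECK_STATUS(convolution_gemm_icnchw(inputDesc, input, filterDesc, filter,
//       convParamSpec, biasDesc, bias, tmpBytes, tmp, outputDesc, output,
//       activationDesc, arch));
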
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/fp16/convolution_gemm_icnchw.h" + +EE convolution_gemm_icnchw_A55(TensorDesc inputDesc, + F16 *inArray, + TensorDesc filterDesc, + const F16 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + + if (fdf != DF_NHWCN16) { + CHECK_STATUS(NOT_MATCH); + } + + oc /= 8; + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + I32 ohow = oh * ow; + U32 ihiw = ih_pad * iw_pad; + F16 *inArray_pad; + EE ret = SUCCESS; + for (U32 n = 0; n < in; n++) { + if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { + inArray_pad = inArray + n * ic * ih * iw; + } else { + // copy input into a input with padding + inArray_pad = (F16 *)tmp; + F16 *inArray_pad_mov = inArray_pad; + F16 *inArray_mov = inArray + n * ic * ih * iw; + for (U32 c = 0; c < ic; c++) { + for (U32 h = 0; h < paddingT; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(idt)); + inArray_pad_mov += iw_pad; + } + for (U32 h = paddingT; h < ih_pad - paddingB; h++) { + memset(inArray_pad_mov, 0, paddingL * bytesOf(idt)); + inArray_pad_mov += paddingL; + memcpy(inArray_pad_mov, inArray_mov, iw * bytesOf(idt)); + inArray_pad_mov += iw; + inArray_mov += iw; + memset(inArray_pad_mov, 0, paddingR * bytesOf(idt)); + inArray_pad_mov += paddingR; + } + for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(idt)); + inArray_pad_mov += iw_pad; + } + } + } + + // ohow / 8 + for (I32 hw = 0; hw < ohow - 7; hw += 8) { + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + const F16 *f_o0c0 = filterArray; + F16 *in_pack = ((F16 
*)tmp) + ic * ih_pad * iw_pad; + // pack input + // NCHW => NHWChw8 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + U32 in_h_1 = ((hw + 1) / ow) * strideH; + U32 in_w_1 = ((hw + 1) % ow) * strideW; + U32 in_h_2 = ((hw + 2) / ow) * strideH; + U32 in_w_2 = ((hw + 2) % ow) * strideW; + U32 in_h_3 = ((hw + 3) / ow) * strideH; + U32 in_w_3 = ((hw + 3) % ow) * strideW; + U32 in_h_4 = ((hw + 4) / ow) * strideH; + U32 in_w_4 = ((hw + 4) % ow) * strideW; + U32 in_h_5 = ((hw + 5) / ow) * strideH; + U32 in_w_5 = ((hw + 5) % ow) * strideW; + U32 in_h_6 = ((hw + 6) / ow) * strideH; + U32 in_w_6 = ((hw + 6) % ow) * strideW; + U32 in_h_7 = ((hw + 7) / ow) * strideH; + U32 in_w_7 = ((hw + 7) % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F16 *in_hw = + inArray_pad + c * ihiw + fh_idx * dilateH * iw_pad + dilateW * fw_idx; + F16 *in_0 = in_hw + in_h_0 * iw_pad + in_w_0; + F16 *in_1 = in_hw + in_h_1 * iw_pad + in_w_1; + F16 *in_2 = in_hw + in_h_2 * iw_pad + in_w_2; + F16 *in_3 = in_hw + in_h_3 * iw_pad + in_w_3; + F16 *in_4 = in_hw + in_h_4 * iw_pad + in_w_4; + F16 *in_5 = in_hw + in_h_5 * iw_pad + in_w_5; + F16 *in_6 = in_hw + in_h_6 * iw_pad + in_w_6; + F16 *in_7 = in_hw + in_h_7 * iw_pad + in_w_7; + F16 *in_pack_hw8 = in_pack + fh_idx * fw * ic * 8 + fw_idx * ic * 8 + c * 8; + *in_pack_hw8 = *in_0; + *(in_pack_hw8 + 1) = *in_1; + *(in_pack_hw8 + 2) = *in_2; + *(in_pack_hw8 + 3) = *in_3; + *(in_pack_hw8 + 4) = *in_4; + *(in_pack_hw8 + 5) = *in_5; + *(in_pack_hw8 + 6) = *in_6; + *(in_pack_hw8 + 7) = *in_7; + } + } + } + + // compute + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__("ldr d22, [%[b_0]]\n" // b_o0 + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "ldr d23, [%[b_1]]\n" // b_o1 + "ldr x2, [%[b_1], #8]\n" + "ins v23.d[1], x2\n" + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr x1, [%[in_0], #8]\n" + "mov v4.16b, v22.16b\n" // out_o0hw2 + "ins v0.d[1], x1\n" + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v6.16b, v22.16b\n" // out_o0hw4 + "ldr x2, [%[f_0], #8]\n" + "mov v7.16b, v22.16b\n" // out_o0hw5 + "ins v18.d[1], x2\n" + "mov v8.16b, v22.16b\n" // out_o0hw6 + "ldr d19, [%[f_0], #16]\n" // f_o1c0 + "mov v9.16b, v22.16b\n" // out_o0hw7 + "ldr x3, [%[f_0], #24]\n" + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ins v19.d[1], x3\n" + "mov v11.16b, v23.16b\n" // out_o1hw1 + "mov v12.16b, v23.16b\n" // out_o1hw2 + "mov v13.16b, v23.16b\n" // out_o1hw3 + "mov v14.16b, v23.16b\n" // out_o1hw4 + "mov v15.16b, v23.16b\n" // out_o1hw5 + "mov v16.16b, v23.16b\n" // out_o1hw6 + "mov v17.16b, v23.16b\n" // out_o1hw7 + + "0:\n" + "cmp x0, #1\n" + "ble 1f\n" + "ldr d1, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr x1, [%[in_0], #24]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ins v1.d[1], x1\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ldr d20, [%[f_0], #32]\n" // f_o0c0 + "fmla v5.8h, v18.8h, v0.h[3]\n" + "ldr x2, [%[f_0], #40]\n" + "fmla v6.8h, v18.8h, v0.h[4]\n" + "ins v20.d[1], x2\n" + "fmla v7.8h, v18.8h, v0.h[5]\n" + "ldr d21, [%[f_0], #48]\n" // f_o1c0 + "fmla v8.8h, v18.8h, v0.h[6]\n" + "ldr x3, [%[f_0], #56]\n" + 
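
// Scheduling note for the asm block in progress (it applies to every A55
// kernel in this file): the "ldr d / ldr x / ins" triples are the usual
// Cortex-A55 tuning for 128-bit loads. A55 is a dual-issue in-order core
// whose NEON load path is 64 bits wide, so a plain "ldr q" would stall the
// fmla stream, while a 64-bit vector load plus a 64-bit GPR load, later
// merged into the high half with "ins", can be interleaved with the
// multiply-adds.
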
"fmla v9.8h, v18.8h, v0.h[7]\n" + "ins v21.d[1], x3\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + "fmla v14.8h, v19.8h, v0.h[4]\n" + "fmla v15.8h, v19.8h, v0.h[5]\n" + "fmla v16.8h, v19.8h, v0.h[6]\n" + "fmla v17.8h, v19.8h, v0.h[7]\n" + + "ldr d0, [%[in_0], #32]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr x1, [%[in_0], #40]\n" + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ins v0.d[1], x1\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ldr d18, [%[f_0], #64]\n" // f_o0c0 + "fmla v5.8h, v20.8h, v1.h[3]\n" + "ldr x2, [%[f_0], #72]\n" + "fmla v6.8h, v20.8h, v1.h[4]\n" + "ins v18.d[1], x2\n" + "fmla v7.8h, v20.8h, v1.h[5]\n" + "ldr d19, [%[f_0], #80]\n" // f_o1c0 + "fmla v8.8h, v20.8h, v1.h[6]\n" + "ldr x3, [%[f_0], #88]\n" + "fmla v9.8h, v20.8h, v1.h[7]\n" + "ins v19.d[1], x3\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "add %[in_0], %[in_0], #32\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "add %[f_0], %[f_0], #64\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "sub x0, x0, #2\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "fmla v14.8h, v21.8h, v1.h[4]\n" + "fmla v15.8h, v21.8h, v1.h[5]\n" + "fmla v16.8h, v21.8h, v1.h[6]\n" + "fmla v17.8h, v21.8h, v1.h[7]\n" + "b 0b\n" + + "1:\n" + "blt 2f\n" + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "fmla v6.8h, v18.8h, v0.h[4]\n" + "fmla v7.8h, v18.8h, v0.h[5]\n" + "fmla v8.8h, v18.8h, v0.h[6]\n" + "fmla v9.8h, v18.8h, v0.h[7]\n" + "add %[f_0], %[f_0], #32\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + "fmla v14.8h, v19.8h, v0.h[4]\n" + "fmla v15.8h, v19.8h, v0.h[5]\n" + "fmla v16.8h, v19.8h, v0.h[6]\n" + "fmla v17.8h, v19.8h, v0.h[7]\n" + "2:\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "x0", + "x1", "x2", "x3"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v6.8h, v6.8h, v1.8h\n" + "fmax v7.8h, v7.8h, v1.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + "fmax v14.8h, v14.8h, v1.8h\n" + "fmax v15.8h, v15.8h, v1.8h\n" + "fmax v16.8h, v16.8h, v1.8h\n" + "fmax v17.8h, v17.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v6.8h, v6.8h, v31.8h\n" + "fmax v7.8h, v7.8h, v31.8h\n" + "fmax v8.8h, v8.8h, v31.8h\n" + "fmax v9.8h, v9.8h, v31.8h\n" + "fmax v10.8h, v10.8h, v31.8h\n" + "fmax v11.8h, v11.8h, v31.8h\n" + "fmax v12.8h, v12.8h, v31.8h\n" + "fmax v13.8h, v13.8h, v31.8h\n" + "fmax 
v14.8h, v14.8h, v31.8h\n" + "fmax v15.8h, v15.8h, v31.8h\n" + "fmax v16.8h, v16.8h, v31.8h\n" + "fmax v17.8h, v17.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + "fmin v6.8h, v6.8h, v30.8h\n" + "fmin v7.8h, v7.8h, v30.8h\n" + "fmin v8.8h, v8.8h, v30.8h\n" + "fmin v9.8h, v9.8h, v30.8h\n" + "fmin v10.8h, v10.8h, v30.8h\n" + "fmin v11.8h, v11.8h, v30.8h\n" + "fmin v12.8h, v12.8h, v30.8h\n" + "fmin v13.8h, v13.8h, v30.8h\n" + "fmin v14.8h, v14.8h, v30.8h\n" + "fmin v15.8h, v15.8h, v30.8h\n" + "fmin v16.8h, v16.8h, v30.8h\n" + "fmin v17.8h, v17.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q6, [%[out_0], #64]\n" // out_o0hw4 + "str q7, [%[out_0], #80]\n" // out_o0hw5 + "str q8, [%[out_0], #96]\n" // out_o0hw6 + "str q9, [%[out_0], #112]\n" // out_o0hw7 + "str q10, [%[out_1]]\n" // out_o1hw0 + "str q11, [%[out_1], #16]\n" // out_o1hw1 + "str q12, [%[out_1], #32]\n" // out_o1hw2 + "str q13, [%[out_1], #48]\n" // out_o1hw3 + "str q14, [%[out_1], #64]\n" // out_o1hw4 + "str q15, [%[out_1], #80]\n" // out_o1hw5 + "str q16, [%[out_1], #96]\n" // out_o1hw6 + "str q17, [%[out_1], #112]\n" // out_o1hw7 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = filterArray + (oc - 1) * 8 * fh * fw * ic; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = biasArray + (oc - 1) * 8; + __asm__ __volatile__("ldr q12, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v12.16b\n" // out_o0hw0 + "ldr x1, [%[in_0], #8]\n" + "mov v3.16b, v12.16b\n" // out_o0hw1 + "ins v0.d[1], x1\n" + "mov v4.16b, v12.16b\n" // out_o0hw2 + "ldr d10, [%[f_0]]\n" // f_o0c0 + "mov v5.16b, v12.16b\n" // out_o0hw3 + "ldr x2, [%[f_0], #8]\n" + "mov v6.16b, v12.16b\n" // out_o0hw4 + "ins v10.d[1], x2\n" + "mov v7.16b, v12.16b\n" // out_o0hw5 + "mov v8.16b, v12.16b\n" // out_o0hw6 + "mov v9.16b, v12.16b\n" // out_o0hw7 + + "0:\n" + "cmp x0, #1\n" + "ble 1f\n" + "ldr d1, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v10.8h, v0.h[0]\n" + "ldr x1, [%[in_0], #24]\n" + "fmla v3.8h, v10.8h, v0.h[1]\n" + "ins v1.d[1], x1\n" + "fmla v4.8h, v10.8h, v0.h[2]\n" + "ldr d11, [%[f_0], #16]\n" // f_o0c0 + "fmla v5.8h, v10.8h, v0.h[3]\n" + "ldr x2, [%[f_0], #24]\n" + "fmla v6.8h, v10.8h, v0.h[4]\n" + "ins v11.d[1], x2\n" + "fmla v7.8h, v10.8h, v0.h[5]\n" + "sub x0, x0, #2\n" + "fmla v8.8h, v10.8h, v0.h[6]\n" + "fmla v9.8h, v10.8h, v0.h[7]\n" + + "ldr d0, [%[in_0], #32]\n" // in_hw0 + "fmla v2.8h, v11.8h, v1.h[0]\n" + "ldr x1, [%[in_0], #40]\n" + "fmla v3.8h, v11.8h, v1.h[1]\n" + "ins v0.d[1], x1\n" + "fmla v4.8h, v11.8h, v1.h[2]\n" + "ldr d10, [%[f_0], #32]\n" // f_o0c0 + "fmla v5.8h, v11.8h, v1.h[3]\n" + "ldr x2, [%[f_0], #40]\n" + "fmla v6.8h, v11.8h, v1.h[4]\n" + "ins v10.d[1], x2\n" + "fmla v7.8h, v11.8h, v1.h[5]\n" + "add %[in_0], %[in_0], #32\n" + "fmla 
v8.8h, v11.8h, v1.h[6]\n" + "add %[f_0], %[f_0], #32\n" + "fmla v9.8h, v11.8h, v1.h[7]\n" + "b 0b\n" + + "1:\n" + "blt 2f\n" + "fmla v2.8h, v10.8h, v0.h[0]\n" + "fmla v3.8h, v10.8h, v0.h[1]\n" + "fmla v4.8h, v10.8h, v0.h[2]\n" + "fmla v5.8h, v10.8h, v0.h[3]\n" + "add %[f_0], %[f_0], #16\n" + "fmla v6.8h, v10.8h, v0.h[4]\n" + "fmla v7.8h, v10.8h, v0.h[5]\n" + "fmla v8.8h, v10.8h, v0.h[6]\n" + "fmla v9.8h, v10.8h, v0.h[7]\n" + "2:\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "x0", "x1", "x2"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__( + "eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v6.8h, v6.8h, v1.8h\n" + "fmax v7.8h, v7.8h, v1.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v6.8h, v6.8h, v31.8h\n" + "fmax v7.8h, v7.8h, v31.8h\n" + "fmax v8.8h, v8.8h, v31.8h\n" + "fmax v9.8h, v9.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + "fmin v6.8h, v6.8h, v30.8h\n" + "fmin v7.8h, v7.8h, v30.8h\n" + "fmin v8.8h, v8.8h, v30.8h\n" + "fmin v9.8h, v9.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__( + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw0 + "str q4, [%[out_0], #32]\n" // out_o0hw0 + "str q5, [%[out_0], #48]\n" // out_o0hw0 + "str q6, [%[out_0], #64]\n" // out_o0hw0 + "str q7, [%[out_0], #80]\n" // out_o0hw0 + "str q8, [%[out_0], #96]\n" // out_o0hw0 + "str q9, [%[out_0], #112]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9"); + } + } + // ohow_reminder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + // U32 ohow_s = (ohow/8)*8; + for (I32 hw = ohow_s; hw < ohow - 3; hw += 4) { + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + const F16 *f_o0c0 = filterArray; + F16 *in_pack = ((F16 *)tmp) + ic * ih_pad * iw_pad; + + // pack input + // NCHWc8 => NHWChw4 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + U32 in_h_1 = ((hw + 1) / ow) * strideH; + U32 in_w_1 = ((hw + 1) % ow) * strideW; + U32 in_h_2 = ((hw + 2) / ow) * strideH; + U32 in_w_2 = ((hw + 2) % ow) * strideW; + U32 in_h_3 = ((hw + 3) / ow) * strideH; + U32 in_w_3 = ((hw + 3) % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F16 *in_hw = + inArray_pad + c * ihiw + fh_idx * dilateH * iw_pad + dilateW * fw_idx; + F16 *in_0 = in_hw + in_h_0 * iw_pad + in_w_0; + F16 *in_1 = in_hw + in_h_1 * iw_pad + in_w_1; + F16 *in_2 = in_hw + in_h_2 * iw_pad + in_w_2; + F16 *in_3 = in_hw + in_h_3 * iw_pad + in_w_3; + F16 *in_pack_hw4 = in_pack + fh_idx * 
fw * ic * 4 + fw_idx * ic * 4 + c * 4; + *in_pack_hw4 = *in_0; + *(in_pack_hw4 + 1) = *in_1; + *(in_pack_hw4 + 2) = *in_2; + *(in_pack_hw4 + 3) = *in_3; + } + } + } + + // compute + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__("ldr d22, [%[b_0]]\n" // b_o0 + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "ldr d23, [%[b_1]]\n" // b_o1 + "ldr x2, [%[b_1], #8]\n" + "ins v23.d[1], x2\n" + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "ldr x2, [%[f_0], #8]\n" + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ins v18.d[1], x2\n" + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ldr d19, [%[f_0], #16]\n" // f_o1c0 + "mov v11.16b, v23.16b\n" // out_o1hw1 + "ldr x3, [%[f_0], #24]\n" + "mov v12.16b, v23.16b\n" // out_o1hw2 + "ins v19.d[1], x3\n" + "mov v13.16b, v23.16b\n" // out_o1hw3 + + "0:\n" + "cmp x0, #1\n" + "ble 1f\n" + "ldr d1, [%[in_0], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f_0], #32]\n" // f_o0c0 + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ldr x2, [%[f_0], #40]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ins v20.d[1], x2\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "ldr d21, [%[f_0], #48]\n" // f_o1c0 + "fmla v10.8h, v19.8h, v0.h[0]\n" + "ldr x3, [%[f_0], #56]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "ins v21.d[1], x3\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "sub x0, x0, #2\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + + "ldr d0, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f_0], #64]\n" // f_o0c0 + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ldr x2, [%[f_0], #72]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ldr d19, [%[f_0], #80]\n" // f_o1c0 + "fmla v5.8h, v20.8h, v1.h[3]\n" + "ins v18.d[1], x2\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "ldr x3, [%[f_0], #88]\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "ins v19.d[1], x3\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "add %[in_0], %[in_0], #16\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "add %[f_0], %[f_0], #64\n" + "b 0b\n" + + "1:\n" + "blt 2f\n" + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "add %[f_0], %[f_0], #32\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + "2:\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v10", + "v11", "v12", "v13", "v18", "v19", "v20", "v21", "v22", "v23", + "x0", "x1", "x2", "x3"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5", "v10", + "v11", "v12", "v13"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + 
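
// Constant check for the RELU6 blocks: "movi v30.8h, #0x46, lsl #8" writes
// 0x4600 into each half-precision lane. Decoding IEEE fp16 0x4600: sign 0,
// exponent 0b10001 = 17 (bias 15, so 2^2), mantissa 0b1000000000 = 0.5,
// giving 1.5 * 2^2 = 6.0, the upper clamp of ReLU6.
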
"fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v10.8h, v10.8h, v31.8h\n" + "fmax v11.8h, v11.8h, v31.8h\n" + "fmax v12.8h, v12.8h, v31.8h\n" + "fmax v13.8h, v13.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + "fmin v10.8h, v10.8h, v30.8h\n" + "fmin v11.8h, v11.8h, v30.8h\n" + "fmin v12.8h, v12.8h, v30.8h\n" + "fmin v13.8h, v13.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v10", "v11", + "v12", "v13", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__( + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q10, [%[out_1]]\n" // out_o1hw0 + "str q11, [%[out_1], #16]\n" // out_o1hw1 + "str q12, [%[out_1], #32]\n" // out_o1hw2 + "str q13, [%[out_1], #48]\n" // out_o1hw3 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v10", "v11", "v12", "v13"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = filterArray + (oc - 1) * 8 * fh * fw * ic; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = biasArray + (oc - 1) * 8; + __asm__ __volatile__("ldr d22, [%[b_0]]\n" // b_o0 + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "ldr x2, [%[f_0], #8]\n" + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ins v18.d[1], x2\n" + + "0:\n" + "cmp x0, #1\n" + "ble 1f\n" + "ldr d1, [%[in_0], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f_0], #16]\n" // f_o0c0 + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ldr x2, [%[f_0], #24]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ins v20.d[1], x2\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "sub x0, x0, #2\n" + + "ldr d0, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f_0], #32]\n" // f_o0c0 + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ldr x2, [%[f_0], #40]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ins v18.d[1], x2\n" + "fmla v5.8h, v20.8h, v1.h[3]\n" + "add %[in_0], %[in_0], #16\n" + "add %[f_0], %[f_0], #32\n" + "b 0b\n" + + "1:\n" + "blt 2f\n" + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "add %[f_0], %[f_0], #16\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "2:\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v18", + "v20", "v22", "x0", "x1", "x2"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + + 
"fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5"); + } + } + // ohow_reminder % 4 + ohow_s = (ohow / 4) * 4; + for (I32 hw = ohow_s; hw < ohow; hw++) { + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + const F16 *f_o0c0 = filterArray; + F16 *in_pack = ((F16 *)tmp) + ic * ih_pad * iw_pad; + // pack input + // NCHWc8 => NHWChw4 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F16 *in_hw = + inArray_pad + c * ihiw + fh_idx * dilateH * iw_pad + dilateW * fw_idx; + F16 *in_0 = in_hw + in_h_0 * iw_pad + in_w_0; + F16 *in_pack_hw1 = in_pack + fh_idx * fw * ic + fw_idx * ic + c; + /* + * for (U32 c8 = 0; c8 < 8; c8++) { + * in_pack_c8hw1[c8] = in_0[c8]; + * } + */ + *in_pack_hw1 = *in_0; + } + } + } + + // compute + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__("ldr d22, [%[b_0]]\n" // b_o0 + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "ldr d23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "ldr x2, [%[b_1], #8]\n" + "ins v23.d[1], x2\n" + "ldr h0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ldr x2, [%[f_0], #8]\n" + "ins v18.d[1], x2\n" + "ldr d19, [%[f_0], #16]\n" // f_o1c0 + "ldr x3, [%[f_0], #24]\n" + "ins v19.d[1], x3\n" + + "0:\n" + "cmp x0, #1\n" + "ble 1f\n" + "ldr h1, [%[in_0], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f_0], #32]\n" // f_o0c0 + "fmla v10.8h, v19.8h, v0.h[0]\n" + "ldr x2, [%[f_0], #40]\n" + "ins v20.d[1], x2\n" + "ldr d21, [%[f_0], #48]\n" // f_o1c0 + "sub x0, x0, #2\n" + "ldr x3, [%[f_0], #56]\n" + "ins v21.d[1], x3\n" + + "ldr h0, [%[in_0], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f_0], #64]\n" // f_o0c0 + "fmla v10.8h, v21.8h, v1.h[0]\n" + "ldr x2, [%[f_0], #72]\n" + "ins v18.d[1], x2\n" + "ldr d19, [%[f_0], #80]\n" // f_o1c0 + "add %[in_0], %[in_0], #4\n" + "ldr x3, [%[f_0], #88]\n" + "ins v19.d[1], x3\n" + "add %[f_0], %[f_0], #64\n" + "b 0b\n" + + "1:\n" + "blt 2f\n" + "fmla v2.8h, v18.8h, v0.h[0]\n" + "add %[f_0], %[f_0], #32\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "2:\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v19", "v20", + "v21", "v22", "v23", "x0", "x1", "x2", "x3"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v10.8h, v10.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v10"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" 
// zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v10.8h, v10.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v10.8h, v10.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v10", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + "str q10, [%[out_1]]\n" // out_o1hw0 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0) + : + : "memory", "cc", "v2", "v10"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = filterArray + (oc - 1) * 8 * fh * fw * ic; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = biasArray + (oc - 1) * 8; + __asm__ __volatile__("ldr d22, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "ldr h0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "ldr x2, [%[f_0], #8]\n" + "ins v18.d[1], x2\n" + + "0:\n" + "cmp x0, #1\n" + "ble 1f\n" + "ldr h1, [%[in_0], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f_0], #16]\n" // f_o0c0 + "sub x0, x0, #2\n" + "ldr x2, [%[f_0], #24]\n" + "ins v20.d[1], x2\n" + + "ldr h0, [%[in_0], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f_0], #32]\n" // f_o0c0 + "ldr x2, [%[f_0], #40]\n" + "ins v18.d[1], x2\n" + "add %[in_0], %[in_0], #4\n" + "add %[f_0], %[f_0], #32\n" + "b 0b\n" + + "1:\n" + "blt 2f\n" + "fmla v2.8h, v18.8h, v0.h[0]\n" + "add %[f_0], %[f_0], #16\n" + "2:\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v20", "v22", + "x0", "x1", "x2"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + : + : + : "memory", "cc", "v1", "v2"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmin v2.8h, v2.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v2"); + } + } + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw_A76.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw_A76.cpp new file mode 100644 index 00000000..c6a6aa60 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw_A76.cpp @@ -0,0 +1,920 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
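
// Note on the A76 file that starts here: it implements the same packing,
// micro-kernels, and activation tails as the A55 version above, but issues
// plain 128-bit "ldr q" loads. Cortex-A76 is out-of-order with wider load
// bandwidth, so the A55 split-load idiom (ldr d / ldr x / ins) would only
// add instructions there.
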
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/fp16/convolution_gemm_icnchw.h" + +EE convolution_gemm_icnchw_A76(TensorDesc inputDesc, + F16 *inArray, + TensorDesc filterDesc, + const F16 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + + if (fdf != DF_NHWCN16) { + CHECK_STATUS(NOT_MATCH); + } + + oc /= 8; + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + I32 ohow = oh * ow; + U32 ihiw = ih_pad * iw_pad; + F16 *inArray_pad; + EE ret = SUCCESS; + for (U32 n = 0; n < in; n++) { + if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { + inArray_pad = inArray + n * ic * ih * iw; + } else { + // copy input into a input with padding + inArray_pad = (F16 *)tmp; + F16 *inArray_pad_mov = inArray_pad; + F16 *inArray_mov = inArray + n * ic * ih * iw; + for (U32 c = 0; c < ic; c++) { + for (U32 h = 0; h < paddingT; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(idt)); + inArray_pad_mov += iw_pad; + } + for (U32 h = paddingT; h < ih_pad - paddingB; h++) { + memset(inArray_pad_mov, 0, paddingL * bytesOf(idt)); + inArray_pad_mov += paddingL; + memcpy(inArray_pad_mov, inArray_mov, iw * bytesOf(idt)); + inArray_pad_mov += iw; + inArray_mov += iw; + memset(inArray_pad_mov, 0, paddingR * bytesOf(idt)); + inArray_pad_mov += paddingR; + } + for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(idt)); + inArray_pad_mov += iw_pad; + } + } + } + + // ohow / 8 + for (I32 hw = 0; hw < ohow - 7; hw += 8) { + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + const F16 *f_o0c0 = filterArray; + F16 *in_pack = ((F16 *)tmp) + ic * ih_pad * iw_pad; + // pack input + // NCHW => NHWChw8 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + U32 in_h_1 = ((hw + 1) / ow) * strideH; + U32 in_w_1 = ((hw + 1) % ow) * strideW; + U32 in_h_2 = ((hw + 2) / ow) * strideH; + U32 in_w_2 = ((hw + 2) % ow) * strideW; + U32 in_h_3 = ((hw + 3) / ow) * strideH; + U32 in_w_3 = ((hw + 3) % ow) * strideW; + U32 in_h_4 = ((hw + 4) / ow) * strideH; + U32 in_w_4 = ((hw + 4) % ow) * strideW; + U32 in_h_5 = ((hw + 5) / ow) * strideH; + U32 in_w_5 = ((hw + 5) % ow) * strideW; + U32 in_h_6 
= ((hw + 6) / ow) * strideH; + U32 in_w_6 = ((hw + 6) % ow) * strideW; + U32 in_h_7 = ((hw + 7) / ow) * strideH; + U32 in_w_7 = ((hw + 7) % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F16 *in_hw = + inArray_pad + c * ihiw + fh_idx * dilateH * iw_pad + dilateW * fw_idx; + F16 *in_0 = in_hw + in_h_0 * iw_pad + in_w_0; + F16 *in_1 = in_hw + in_h_1 * iw_pad + in_w_1; + F16 *in_2 = in_hw + in_h_2 * iw_pad + in_w_2; + F16 *in_3 = in_hw + in_h_3 * iw_pad + in_w_3; + F16 *in_4 = in_hw + in_h_4 * iw_pad + in_w_4; + F16 *in_5 = in_hw + in_h_5 * iw_pad + in_w_5; + F16 *in_6 = in_hw + in_h_6 * iw_pad + in_w_6; + F16 *in_7 = in_hw + in_h_7 * iw_pad + in_w_7; + F16 *in_pack_hw8 = in_pack + fh_idx * fw * ic * 8 + fw_idx * ic * 8 + c * 8; + *in_pack_hw8 = *in_0; + *(in_pack_hw8 + 1) = *in_1; + *(in_pack_hw8 + 2) = *in_2; + *(in_pack_hw8 + 3) = *in_3; + *(in_pack_hw8 + 4) = *in_4; + *(in_pack_hw8 + 5) = *in_5; + *(in_pack_hw8 + 6) = *in_6; + *(in_pack_hw8 + 7) = *in_7; + } + } + } + + // compute + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__("ldr q22, [%[b_0]]\n" // b_o0 + "ldr q23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr q0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov v6.16b, v22.16b\n" // out_o0hw4 + "mov v7.16b, v22.16b\n" // out_o0hw5 + "mov v8.16b, v22.16b\n" // out_o0hw6 + "ldr q19, [%[f_0], #16]\n" // f_o1c0 + "mov v9.16b, v22.16b\n" // out_o0hw7 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "mov v11.16b, v23.16b\n" // out_o1hw1 + "mov v12.16b, v23.16b\n" // out_o1hw2 + "mov v13.16b, v23.16b\n" // out_o1hw3 + "mov v14.16b, v23.16b\n" // out_o1hw4 + "mov v15.16b, v23.16b\n" // out_o1hw5 + "mov v16.16b, v23.16b\n" // out_o1hw6 + "mov v17.16b, v23.16b\n" // out_o1hw7 + + "0:\n" + "cmp x0, #1\n" + "ble 1f\n" + "ldr q1, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ldr q20, [%[f_0], #32]\n" // f_o0c0 + "fmla v5.8h, v18.8h, v0.h[3]\n" + "fmla v6.8h, v18.8h, v0.h[4]\n" + "fmla v7.8h, v18.8h, v0.h[5]\n" + "ldr q21, [%[f_0], #48]\n" // f_o1c0 + "fmla v8.8h, v18.8h, v0.h[6]\n" + "fmla v9.8h, v18.8h, v0.h[7]\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + "fmla v14.8h, v19.8h, v0.h[4]\n" + "fmla v15.8h, v19.8h, v0.h[5]\n" + "fmla v16.8h, v19.8h, v0.h[6]\n" + "fmla v17.8h, v19.8h, v0.h[7]\n" + + "ldr q0, [%[in_0], #32]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "fmla v3.8h, v20.8h, v1.h[1]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ldr q18, [%[f_0], #64]\n" // f_o0c0 + "fmla v5.8h, v20.8h, v1.h[3]\n" + "fmla v6.8h, v20.8h, v1.h[4]\n" + "fmla v7.8h, v20.8h, v1.h[5]\n" + "ldr q19, [%[f_0], #80]\n" // f_o1c0 + "fmla v8.8h, v20.8h, v1.h[6]\n" + "fmla v9.8h, v20.8h, v1.h[7]\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "add %[in_0], %[in_0], #32\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "add %[f_0], %[f_0], #64\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "sub x0, x0, #2\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "fmla v14.8h, v21.8h, v1.h[4]\n" + 
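
// Reference semantics of the unrolled asm loop in progress, as a hedged C++
// sketch (the kernel keeps the 16 accumulators in v2-v17 and double-buffers
// inputs and filters through v0/v1 and v18-v21; names below are illustrative):
//
//   for (I64 k = 0; k < (I64)ic * fh * fw; k++) {  // packed input rows
//       for (int hw = 0; hw < 8; hw++) {           // broadcast via v0.h[hw]
//           for (int c = 0; c < 8; c++) {          // one 8h vector, lane c
//               out0[hw][c] += f0[k][c] * in[k][hw];  // fmla into v2-v9
//               out1[hw][c] += f1[k][c] * in[k][hw];  // fmla into v10-v17
//           }
//       }
//   }
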
"fmla v15.8h, v21.8h, v1.h[5]\n" + "fmla v16.8h, v21.8h, v1.h[6]\n" + "fmla v17.8h, v21.8h, v1.h[7]\n" + "b 0b\n" + + "1:\n" + "blt 2f\n" + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "fmla v6.8h, v18.8h, v0.h[4]\n" + "fmla v7.8h, v18.8h, v0.h[5]\n" + "fmla v8.8h, v18.8h, v0.h[6]\n" + "fmla v9.8h, v18.8h, v0.h[7]\n" + "add %[f_0], %[f_0], #32\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + "fmla v14.8h, v19.8h, v0.h[4]\n" + "fmla v15.8h, v19.8h, v0.h[5]\n" + "fmla v16.8h, v19.8h, v0.h[6]\n" + "fmla v17.8h, v19.8h, v0.h[7]\n" + "2:\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "x0"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v6.8h, v6.8h, v1.8h\n" + "fmax v7.8h, v7.8h, v1.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + "fmax v14.8h, v14.8h, v1.8h\n" + "fmax v15.8h, v15.8h, v1.8h\n" + "fmax v16.8h, v16.8h, v1.8h\n" + "fmax v17.8h, v17.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v6.8h, v6.8h, v31.8h\n" + "fmax v7.8h, v7.8h, v31.8h\n" + "fmax v8.8h, v8.8h, v31.8h\n" + "fmax v9.8h, v9.8h, v31.8h\n" + "fmax v10.8h, v10.8h, v31.8h\n" + "fmax v11.8h, v11.8h, v31.8h\n" + "fmax v12.8h, v12.8h, v31.8h\n" + "fmax v13.8h, v13.8h, v31.8h\n" + "fmax v14.8h, v14.8h, v31.8h\n" + "fmax v15.8h, v15.8h, v31.8h\n" + "fmax v16.8h, v16.8h, v31.8h\n" + "fmax v17.8h, v17.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + "fmin v6.8h, v6.8h, v30.8h\n" + "fmin v7.8h, v7.8h, v30.8h\n" + "fmin v8.8h, v8.8h, v30.8h\n" + "fmin v9.8h, v9.8h, v30.8h\n" + "fmin v10.8h, v10.8h, v30.8h\n" + "fmin v11.8h, v11.8h, v30.8h\n" + "fmin v12.8h, v12.8h, v30.8h\n" + "fmin v13.8h, v13.8h, v30.8h\n" + "fmin v14.8h, v14.8h, v30.8h\n" + "fmin v15.8h, v15.8h, v30.8h\n" + "fmin v16.8h, v16.8h, v30.8h\n" + "fmin v17.8h, v17.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q6, [%[out_0], #64]\n" // out_o0hw4 + "str q7, [%[out_0], #80]\n" // out_o0hw5 + "str 
q8, [%[out_0], #96]\n" // out_o0hw6 + "str q9, [%[out_0], #112]\n" // out_o0hw7 + "str q10, [%[out_1]]\n" // out_o1hw0 + "str q11, [%[out_1], #16]\n" // out_o1hw1 + "str q12, [%[out_1], #32]\n" // out_o1hw2 + "str q13, [%[out_1], #48]\n" // out_o1hw3 + "str q14, [%[out_1], #64]\n" // out_o1hw4 + "str q15, [%[out_1], #80]\n" // out_o1hw5 + "str q16, [%[out_1], #96]\n" // out_o1hw6 + "str q17, [%[out_1], #112]\n" // out_o1hw7 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = filterArray + (oc - 1) * 8 * fh * fw * ic; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = biasArray + (oc - 1) * 8; + __asm__ __volatile__("ldr q12, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr q0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v12.16b\n" // out_o0hw0 + "mov v3.16b, v12.16b\n" // out_o0hw1 + "mov v4.16b, v12.16b\n" // out_o0hw2 + "ldr q10, [%[f_0]]\n" // f_o0c0 + "mov v5.16b, v12.16b\n" // out_o0hw3 + "mov v6.16b, v12.16b\n" // out_o0hw4 + "mov v7.16b, v12.16b\n" // out_o0hw5 + "mov v8.16b, v12.16b\n" // out_o0hw6 + "mov v9.16b, v12.16b\n" // out_o0hw7 + + "0:\n" + "cmp x0, #1\n" + "ble 1f\n" + "ldr q1, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v10.8h, v0.h[0]\n" + "fmla v3.8h, v10.8h, v0.h[1]\n" + "fmla v4.8h, v10.8h, v0.h[2]\n" + "ldr q11, [%[f_0], #16]\n" // f_o0c0 + "fmla v5.8h, v10.8h, v0.h[3]\n" + "fmla v6.8h, v10.8h, v0.h[4]\n" + "fmla v7.8h, v10.8h, v0.h[5]\n" + "sub x0, x0, #2\n" + "fmla v8.8h, v10.8h, v0.h[6]\n" + "fmla v9.8h, v10.8h, v0.h[7]\n" + + "ldr q0, [%[in_0], #32]\n" // in_hw0 + "fmla v2.8h, v11.8h, v1.h[0]\n" + "fmla v3.8h, v11.8h, v1.h[1]\n" + "fmla v4.8h, v11.8h, v1.h[2]\n" + "ldr q10, [%[f_0], #32]\n" // f_o0c0 + "fmla v5.8h, v11.8h, v1.h[3]\n" + "fmla v6.8h, v11.8h, v1.h[4]\n" + "fmla v7.8h, v11.8h, v1.h[5]\n" + "add %[in_0], %[in_0], #32\n" + "fmla v8.8h, v11.8h, v1.h[6]\n" + "add %[f_0], %[f_0], #32\n" + "fmla v9.8h, v11.8h, v1.h[7]\n" + "b 0b\n" + + "1:\n" + "blt 2f\n" + "fmla v2.8h, v10.8h, v0.h[0]\n" + "fmla v3.8h, v10.8h, v0.h[1]\n" + "fmla v4.8h, v10.8h, v0.h[2]\n" + "fmla v5.8h, v10.8h, v0.h[3]\n" + "add %[f_0], %[f_0], #16\n" + "fmla v6.8h, v10.8h, v0.h[4]\n" + "fmla v7.8h, v10.8h, v0.h[5]\n" + "fmla v8.8h, v10.8h, v0.h[6]\n" + "fmla v9.8h, v10.8h, v0.h[7]\n" + "2:\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "x0"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__( + "eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v6.8h, v6.8h, v1.8h\n" + "fmax v7.8h, v7.8h, v1.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v6.8h, v6.8h, v31.8h\n" + "fmax v7.8h, 
v7.8h, v31.8h\n" + "fmax v8.8h, v8.8h, v31.8h\n" + "fmax v9.8h, v9.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + "fmin v6.8h, v6.8h, v30.8h\n" + "fmin v7.8h, v7.8h, v30.8h\n" + "fmin v8.8h, v8.8h, v30.8h\n" + "fmin v9.8h, v9.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__( + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw0 + "str q4, [%[out_0], #32]\n" // out_o0hw0 + "str q5, [%[out_0], #48]\n" // out_o0hw0 + "str q6, [%[out_0], #64]\n" // out_o0hw0 + "str q7, [%[out_0], #80]\n" // out_o0hw0 + "str q8, [%[out_0], #96]\n" // out_o0hw0 + "str q9, [%[out_0], #112]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9"); + } + } + // ohow_reminder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + + for (I32 hw = ohow_s; hw < ohow - 3; hw += 4) { + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + const F16 *f_o0c0 = filterArray; + F16 *in_pack = ((F16 *)tmp) + ic * ih_pad * iw_pad; + + // pack input + // NCHWc8 => NHWChw4 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + U32 in_h_1 = ((hw + 1) / ow) * strideH; + U32 in_w_1 = ((hw + 1) % ow) * strideW; + U32 in_h_2 = ((hw + 2) / ow) * strideH; + U32 in_w_2 = ((hw + 2) % ow) * strideW; + U32 in_h_3 = ((hw + 3) / ow) * strideH; + U32 in_w_3 = ((hw + 3) % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F16 *in_hw = + inArray_pad + c * ihiw + fh_idx * dilateH * iw_pad + dilateW * fw_idx; + F16 *in_0 = in_hw + in_h_0 * iw_pad + in_w_0; + F16 *in_1 = in_hw + in_h_1 * iw_pad + in_w_1; + F16 *in_2 = in_hw + in_h_2 * iw_pad + in_w_2; + F16 *in_3 = in_hw + in_h_3 * iw_pad + in_w_3; + F16 *in_pack_hw4 = in_pack + fh_idx * fw * ic * 4 + fw_idx * ic * 4 + c * 4; + *in_pack_hw4 = *in_0; + *(in_pack_hw4 + 1) = *in_1; + *(in_pack_hw4 + 2) = *in_2; + *(in_pack_hw4 + 3) = *in_3; + } + } + } + + // compute + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__("ldr q22, [%[b_0]]\n" // b_o0 + "ldr q23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "mov v5.16b, v22.16b\n" // out_o0hw3 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ldr q19, [%[f_0], #16]\n" // f_o1c0 + "mov v11.16b, v23.16b\n" // out_o1hw1 + "mov v12.16b, v23.16b\n" // out_o1hw2 + "mov v13.16b, v23.16b\n" // out_o1hw3 + + "0:\n" + "cmp x0, #1\n" + "ble 1f\n" + "ldr d1, [%[in_0], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr q20, [%[f_0], #32]\n" // f_o0c0 + "fmla v3.8h, v18.8h, v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "ldr q21, [%[f_0], #48]\n" // f_o1c0 + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "sub x0, x0, #2\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + + "ldr d0, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr 
q18, [%[f_0], #64]\n" // f_o0c0 + "fmla v3.8h, v20.8h, v1.h[1]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ldr q19, [%[f_0], #80]\n" // f_o1c0 + "fmla v5.8h, v20.8h, v1.h[3]\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "add %[in_0], %[in_0], #16\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "add %[f_0], %[f_0], #64\n" + "b 0b\n" + + "1:\n" + "blt 2f\n" + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "add %[f_0], %[f_0], #32\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + "2:\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v10", + "v11", "v12", "v13", "v18", "v19", "v20", "v21", "v22", "v23", + "x0"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5", "v10", + "v11", "v12", "v13"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v10.8h, v10.8h, v31.8h\n" + "fmax v11.8h, v11.8h, v31.8h\n" + "fmax v12.8h, v12.8h, v31.8h\n" + "fmax v13.8h, v13.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + "fmin v10.8h, v10.8h, v30.8h\n" + "fmin v11.8h, v11.8h, v30.8h\n" + "fmin v12.8h, v12.8h, v30.8h\n" + "fmin v13.8h, v13.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v10", "v11", + "v12", "v13", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__( + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q10, [%[out_1]]\n" // out_o1hw0 + "str q11, [%[out_1], #16]\n" // out_o1hw1 + "str q12, [%[out_1], #32]\n" // out_o1hw2 + "str q13, [%[out_1], #48]\n" // out_o1hw3 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v10", "v11", "v12", "v13"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = filterArray + (oc - 1) * 8 * fh * fw * ic; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = biasArray + (oc - 1) * 8; + __asm__ __volatile__("ldr q22, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "mov v5.16b, v22.16b\n" // out_o0hw3 + + "0:\n" + "cmp x0, #1\n" + "ble 1f\n" + "ldr d1, [%[in_0], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr q20, [%[f_0], #16]\n" // 
f_o0c0 + "fmla v3.8h, v18.8h, v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "sub x0, x0, #2\n" + + "ldr d0, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr q18, [%[f_0], #32]\n" // f_o0c0 + "fmla v3.8h, v20.8h, v1.h[1]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "fmla v5.8h, v20.8h, v1.h[3]\n" + "add %[in_0], %[in_0], #16\n" + "add %[f_0], %[f_0], #32\n" + "b 0b\n" + + "1:\n" + "blt 2f\n" + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "add %[f_0], %[f_0], #16\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "2:\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v18", + "v20", "v22", "x0"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v3.8h, v3.8h, v1.8h\n" + "fmax v4.8h, v4.8h, v1.8h\n" + "fmax v5.8h, v5.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v3", "v4", "v5"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v3", "v4", "v5", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v2", "v3", "v4", "v5"); + } + } + // ohow_reminder % 4 + ohow_s = (ohow / 4) * 4; + for (I32 hw = ohow_s; hw < ohow; hw++) { + const F16 *b0 = biasArray; + const F16 *b1 = biasArray + 8; + const F16 *f_o0c0 = filterArray; + F16 *in_pack = ((F16 *)tmp) + ic * ih_pad * iw_pad; + // pack input + // NCHWc8 => NHWChw4 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F16 *in_hw = + inArray_pad + c * ihiw + fh_idx * dilateH * iw_pad + dilateW * fw_idx; + F16 *in_0 = in_hw + in_h_0 * iw_pad + in_w_0; + F16 *in_pack_hw1 = in_pack + fh_idx * fw * ic + fw_idx * ic + c; + /* + * for (U32 c8 = 0; c8 < 8; c8++) { + * in_pack_c8hw1[c8] = in_0[c8]; + * } + */ + *in_pack_hw1 = *in_0; + } + } + } + + // compute + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__("ldr q22, [%[b_0]]\n" // b_o0 + "ldr q23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "ldr h0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ldr q19, [%[f_0], #16]\n" // f_o1c0 + + "0:\n" + "cmp x0, #1\n" + "ble 1f\n" + "ldr h1, [%[in_0], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr q20, [%[f_0], #32]\n" // f_o0c0 + "fmla v10.8h, v19.8h, v0.h[0]\n" + "ldr q21, [%[f_0], #48]\n" // 
f_o1c0 + "sub x0, x0, #2\n" + + "ldr h0, [%[in_0], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr q18, [%[f_0], #64]\n" // f_o0c0 + "fmla v10.8h, v21.8h, v1.h[0]\n" + "ldr q19, [%[f_0], #80]\n" // f_o1c0 + "add %[in_0], %[in_0], #4\n" + "add %[f_0], %[f_0], #64\n" + "b 0b\n" + + "1:\n" + "blt 2f\n" + "fmla v2.8h, v18.8h, v0.h[0]\n" + "add %[f_0], %[f_0], #32\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "2:\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v19", "v20", + "v21", "v22", "v23", "x0"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + "fmax v10.8h, v10.8h, v1.8h\n" + : + : + : "memory", "cc", "v1", "v2", "v10"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v10.8h, v10.8h, v31.8h\n" + + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v10.8h, v10.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v10", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + "str q10, [%[out_1]]\n" // out_o1hw0 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0) + : + : "memory", "cc", "v2", "v10"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = filterArray + (oc - 1) * 8 * fh * fw * ic; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = biasArray + (oc - 1) * 8; + __asm__ __volatile__( + "ldr q22, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr h0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr q18, [%[f_0]]\n" // f_o0c0 + + "0:\n" + "cmp x0, #1\n" + "ble 1f\n" + "ldr h1, [%[in_0], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr q20, [%[f_0], #16]\n" // f_o0c0 + "sub x0, x0, #2\n" + + "ldr h0, [%[in_0], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr q18, [%[f_0], #32]\n" // f_o0c0 + "add %[in_0], %[in_0], #4\n" + "add %[f_0], %[f_0], #32\n" + "b 0b\n" + + "1:\n" + "blt 2f\n" + "fmla v2.8h, v18.8h, v0.h[0]\n" + "add %[f_0], %[f_0], #16\n" + "2:\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v20", "v22", "x0"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v2.8h, v2.8h, v1.8h\n" // max(v2, 0) + : + : + : "memory", "cc", "v1", "v2"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v31.8h\n" + "fmin v2.8h, v2.8h, v30.8h\n" + : + : + : "memory", "cc", "v2", "v30", "v31"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q2, [%[out_0]]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v2"); + } + } + } + return ret; +} diff --git a/tensor_computing/src/cpu/arm/fp16/convolution_transform.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_transform.cpp similarity index 61% rename from tensor_computing/src/cpu/arm/fp16/convolution_transform.cpp rename to 
compute/tensor/src/cpu/arm/fp16/convolution_transform.cpp index 13a9dd6a..8bd7b6bb 100644 --- a/tensor_computing/src/cpu/arm/fp16/convolution_transform.cpp +++ b/compute/tensor/src/cpu/arm/fp16/convolution_transform.cpp @@ -1,59 +1,64 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
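The RELU6 branches in the direct-convolution kernels above clamp each fp16 accumulator to [0, 6]. The six constant comes from "movi v30.8h, #0x46, lsl #8", which fills every lane with 0x4600, the IEEE half-precision encoding of 6.0 (sign 0, biased exponent 17, zero mantissa). A minimal intrinsics sketch of the same clamp, assuming an ARMv8.2-A toolchain with native float16x8_t support:

    #include <arm_neon.h>

    // Clamp eight fp16 lanes to [0, 6], mirroring the fmax/fmin pairs in the
    // ACTIVATION_RELU6 branches of the kernels above.
    static inline float16x8_t relu6_f16x8(float16x8_t x)
    {
        float16x8_t zero = vdupq_n_f16(0.0);                          // eor vN.16b, vN.16b, vN.16b
        float16x8_t six = vreinterpretq_f16_u16(vdupq_n_u16(0x4600)); // movi vN.8h, #0x46, lsl #8
        return vminq_f16(vmaxq_f16(x, zero), six);
    }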
- #include "cpu/arm/fp16/tensor_computing_fp16.h" #include "cpu/arm/fp16/convolution_winograd_transform.h" -inline EE convolution_transform_filter_kernel_fp16(TensorDesc filterDesc, const F16* filterArray, - TensorDesc *ftmDesc, F16* ftmArray, +static EE convolution_transform_filter_kernel_fp16(TensorDesc filterDesc, + const F16 *filterArray, + TensorDesc *ftmDesc, + F16 *ftmArray, DataFormat ftmDataFormat) { - if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) + if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) { CHECK_STATUS(NULL_POINTER); + } DataType fdt; DataFormat fdf; U32 fn, fc, fh, fw; CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); if (fdf == ftmDataFormat) { *ftmDesc = filterDesc; - memcpy(ftmArray, filterArray, fn*fc*fh*fw*bytesOf(fdt)); + memcpy(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); return SUCCESS; } - if (fdf != DF_NCHW) + if (fdf != DF_NCHW) { CHECK_STATUS(NOT_SUPPORTED); + } EE ret = SUCCESS; switch (ftmDataFormat) { case DF_NHWCN16: { /* - * NCHW => NHWCN16 - * if there is remainder, it should be NHWCN8 - */ + * NCHW => NHWCN16 + * if there is remainder, it should be NHWCN8 + */ U32 oc = fn / 16; for (U32 o = 0; o < oc; o++) { - for (U32 hw = 0; hw < fh*fw; hw++) { + for (U32 hw = 0; hw < fh * fw; hw++) { for (U32 c = 0; c < fc; c++) { for (U32 o16 = 0; o16 < 16; o16++) { - ftmArray[o*fh*fw*fc*16 + hw*fc*16 + c*16 + o16] = filterArray[(o*16+o16)*fc*fh*fw + c*fh*fw + hw]; + ftmArray[o * fh * fw * fc * 16 + hw * fc * 16 + c * 16 + o16] = + filterArray[(o * 16 + o16) * fc * fh * fw + c * fh * fw + hw]; } } } } - if (fn != oc*16) { - for (U32 hw = 0; hw < fh*fw; hw++) { + if (fn != oc * 16) { + for (U32 hw = 0; hw < fh * fw; hw++) { for (U32 c = 0; c < fc; c++) { for (U32 o8 = 0; o8 < 8; o8++) { - ftmArray[(oc*16)*fh*fw*fc + hw*fc*8 + c*8 + o8] = filterArray[(oc*16+o8)*fc*fh*fw + c*fh*fw + hw]; + ftmArray[(oc * 16) * fh * fw * fc + hw * fc * 8 + c * 8 + o8] = + filterArray[(oc * 16 + o8) * fc * fh * fw + c * fh * fw + hw]; } } } @@ -63,13 +68,14 @@ inline EE convolution_transform_filter_kernel_fp16(TensorDesc filterDesc, const } case DF_NCHWN16: { /* - * NCHW => NCHWN16 - */ + * NCHW => NCHWN16 + */ U32 oc = fn / 16; for (U32 o = 0; o < oc; o++) { - for (U32 chw = 0; chw < fc*fh*fw; chw++) { + for (U32 chw = 0; chw < fc * fh * fw; chw++) { for (U32 o16 = 0; o16 < 16; o16++) { - ftmArray[o*fc*fh*fw*16 + chw*16 + o16] = filterArray[(o*16+o16)*fc*fh*fw + chw]; + ftmArray[o * fc * fh * fw * 16 + chw * 16 + o16] = + filterArray[(o * 16 + o16) * fc * fh * fw + chw]; } } } @@ -77,33 +83,33 @@ inline EE convolution_transform_filter_kernel_fp16(TensorDesc filterDesc, const break; } case DF_HWNCN16: { - for (U32 o = 0; o < fn/16; o++) { + for (U32 o = 0; o < fn / 16; o++) { for (U32 c = 0; c < fc; c++) { - U32 f_off_0 = (o*16)*fc*fh*fw + c*fh*fw; - U32 f_off_1 = (o*16+8)*fc*fh*fw + c*fh*fw; - U32 ftm_off_0 = o*36*fc*16 + c*16; - U32 ftm_off_1 = o*36*fc*16 + c*16 + 8; + U32 f_off_0 = (o * 16) * fc * fh * fw + c * fh * fw; + U32 f_off_1 = (o * 16 + 8) * fc * fh * fw + c * fh * fw; + U32 ftm_off_0 = o * 36 * fc * 16 + c * 16; + U32 ftm_off_1 = o * 36 * fc * 16 + c * 16 + 8; F16 F[9][8]; F16 *F_ptr[9]; F16 *Fw[36]; for (U32 hw = 0; hw < 9; hw++) { for (U32 oo = 0; oo < 8; oo++) { - F[hw][oo] = filterArray[f_off_0 + hw + oo*fc*fh*fw]; + F[hw][oo] = filterArray[f_off_0 + hw + oo * fc * fh * fw]; } F_ptr[hw] = F[hw]; } for (U32 hw = 0; hw < 36; hw++) { - Fw[hw] = ftmArray + ftm_off_0 + hw*fc*16; + Fw[hw] = 
ftmArray + ftm_off_0 + hw * fc * 16; } trans_W_4x4_3x3(Fw, F_ptr); for (U32 hw = 0; hw < 9; hw++) { for (U32 oo = 0; oo < 8; oo++) { - F[hw][oo] = filterArray[f_off_1 + hw + oo*fc*fh*fw]; + F[hw][oo] = filterArray[f_off_1 + hw + oo * fc * fh * fw]; } F_ptr[hw] = F[hw]; } for (U32 hw = 0; hw < 36; hw++) { - Fw[hw] = ftmArray + ftm_off_1 + hw*fc*16; + Fw[hw] = ftmArray + ftm_off_1 + hw * fc * 16; } trans_W_4x4_3x3(Fw, F_ptr); } @@ -111,19 +117,19 @@ inline EE convolution_transform_filter_kernel_fp16(TensorDesc filterDesc, const U32 oc = (fn / 16) * 16; if (oc != fn) { for (U32 c = 0; c < fc; c++) { - U32 f_off_0 = oc*fc*fh*fw + c*fh*fw; - U32 ftm_off_0 = oc*36*fc + c*8; + U32 f_off_0 = oc * fc * fh * fw + c * fh * fw; + U32 ftm_off_0 = oc * 36 * fc + c * 8; F16 F[9][8]; F16 *F_ptr[9]; F16 *Fw[36]; for (U32 hw = 0; hw < 9; hw++) { for (U32 oo = 0; oo < 8; oo++) { - F[hw][oo] = filterArray[f_off_0 + hw + oo*fc*fh*fw]; + F[hw][oo] = filterArray[f_off_0 + hw + oo * fc * fh * fw]; } F_ptr[hw] = F[hw]; } for (U32 hw = 0; hw < 36; hw++) { - Fw[hw] = ftmArray + ftm_off_0 + hw*fc*8; + Fw[hw] = ftmArray + ftm_off_0 + hw * fc * 8; } trans_W_4x4_3x3(Fw, F_ptr); } @@ -138,9 +144,12 @@ inline EE convolution_transform_filter_kernel_fp16(TensorDesc filterDesc, const return ret; } -EE convolution_transform_filter_fp16(TensorDesc filterDesc, const F16* filter, +EE convolution_transform_filter_fp16(TensorDesc filterDesc, + const F16 *filter, + ConvolutionParamSpec convParamSpec, ConvolutionForwardAlgorithm algorithm, - TensorDesc *ftmDesc, F16* filterTransformed) + TensorDesc *ftmDesc, + F16 *filterTransformed) { DataFormat ftmDataFormat; switch (algorithm) { @@ -159,7 +168,22 @@ EE convolution_transform_filter_fp16(TensorDesc filterDesc, const F16* filter, default: return NOT_MATCH; } - EE ret = convolution_transform_filter_kernel_fp16(filterDesc, filter, ftmDesc, filterTransformed, ftmDataFormat); - CHECK_STATUS(ret); - return ret; + + U32 channelAxis = filterDesc.nDims - 1; + TensorDesc tmpFilterDesc = filterDesc; + tmpFilterDesc.dims[channelAxis] /= convParamSpec.group; + U32 fnPadding = tmpFilterDesc.dims[channelAxis]; + if (fnPadding % 8 != 0) { + fnPadding = (fnPadding / 8 + 1) * 8; + } + U32 originalTileSize = tensorNumElements(tmpFilterDesc); + for (U32 g = 0; g < convParamSpec.group; g++) { + CHECK_STATUS(convolution_transform_filter_kernel_fp16( + tmpFilterDesc, filter, ftmDesc, filterTransformed, ftmDataFormat)); + U32 newTileSize = tensorNumElements(*ftmDesc) / tmpFilterDesc.dims[channelAxis] * fnPadding; + filter += originalTileSize; + filterTransformed += newTileSize; + } + ftmDesc->dims[channelAxis] = filterDesc.dims[channelAxis]; + return SUCCESS; } diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_winograd.h b/compute/tensor/src/cpu/arm/fp16/convolution_winograd.h new file mode 100644 index 00000000..898338b8 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/convolution_winograd.h @@ -0,0 +1,78 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
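The grouped path added to convolution_transform_filter_fp16 above rounds each group's output-channel count up to a multiple of 8, so every group starts on a whole NHWCN8/NHWCN16 tile in the transformed buffer. A small sketch of that rounding rule, with hypothetical example values:

    #include <cstdint>

    // Same rule as fnPadding above: e.g. group = 4, fn = 100 gives a per-group
    // count of 25, padded up to 32, and the transformed filter is then strided
    // as if each group had 32 output channels.
    static uint32_t pad_fn_to_8(uint32_t fnPerGroup)
    {
        if (fnPerGroup % 8 != 0) {
            fnPerGroup = (fnPerGroup / 8 + 1) * 8;
        }
        return fnPerGroup;
    }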
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_CONVOLUTION_WINOGRAD +#define _H_CONVOLUTION_WINOGRAD + +#include "sys.h" +#include "types.h" +#include "error.h" + +EE convolution_winograd_A55(TensorDesc inputDesc, + F16 *inArray, + TensorDesc filterDesc, + const F16 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc); + +EE convolution_winograd_A76(TensorDesc inputDesc, + F16 *inArray, + TensorDesc filterDesc, + const F16 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc); + +inline EE convolution_winograd(TensorDesc inputDesc, + F16 *inArray, + TensorDesc filterDesc, + const F16 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc, + Arch arch) +{ + EE ret = SUCCESS; + switch (arch) { + case ARM_A55: + ret = convolution_winograd_A55(inputDesc, inArray, filterDesc, filterArray, + convParamSpec, biasDesc, biasArray, tmpBytes, tmp, outputDesc, outArray, + activationDesc); + break; + case ARM_A76: + ret = convolution_winograd_A76(inputDesc, inArray, filterDesc, filterArray, + convParamSpec, biasDesc, biasArray, tmpBytes, tmp, outputDesc, outArray, + activationDesc); + break; + default: + return NOT_SUPPORTED; + } + return ret; +} +#endif diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_winograd_A55.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_winograd_A55.cpp new file mode 100644 index 00000000..dc206e87 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/convolution_winograd_A55.cpp @@ -0,0 +1,859 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
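The A55 and A76 kernels below both implement Winograd F(4x4, 3x3): each output tile is 4x4, each input tile is 6x6, and every tile passes through 36 transformed positions. A short arithmetic sketch of the tiling quantities they compute, with hypothetical sizes:

    #include <cstdint>

    struct WinoTiling {
        uint32_t tile_h, tile_w, pad_h_mod_4, pad_w_mod_4;
    };

    // e.g. oh = ow = 14: tile_h = tile_w = (14 + 3) / 4 = 4, tiles = 16,
    // with two rows/columns of implicit padding on the bottom/right edges.
    static WinoTiling wino_f4x4_tiling(uint32_t oh, uint32_t ow)
    {
        WinoTiling t;
        t.tile_h = (oh + 3) / 4;           // ceil(oh / 4)
        t.tile_w = (ow + 3) / 4;           // ceil(ow / 4)
        t.pad_h_mod_4 = t.tile_h * 4 - oh; // rows appended below the output
        t.pad_w_mod_4 = t.tile_w * 4 - ow; // columns appended to the right
        return t;
    }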
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/fp16/convolution_winograd_transform.h" +#include "cpu/arm/fp16/convolution_winograd.h" + +EE convolution_winograd_A55(TensorDesc inputDesc, + F16 *inArray, + TensorDesc filterDesc, + const F16 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + if (fdf != DF_HWNCN16) { + CHECK_STATUS(NOT_MATCH); + } + if (!(fh == 6 && fw == 6)) { + CHECK_STATUS(NOT_SUPPORTED); + } + + oc /= 8; + ic /= 8; + + U32 tile_h = (oh + 3) / 4; + U32 tile_w = (ow + 3) / 4; + I32 tiles = tile_h * tile_w; // num of 6x6 tiles + U32 pad_left = paddingL; + U32 pad_right = paddingR + (tile_w * 4 - ow); + U32 pad_w_mod_4 = tile_w * 4 - ow; + U32 pad_top = paddingT; + U32 pad_bottom = paddingB + (tile_h * 4 - oh); + U32 pad_h_mod_4 = tile_h * 4 - oh; + U32 ih_pad = ih + pad_top + pad_bottom; + U32 iw_pad = iw + pad_left + pad_right; + // tmp = in_pad + itm + otm + // in_pad: ic*ih_pad*iw_pad*8 + // itm: 6*6*ic*8*8 + // otm: oc*6*6*8*8 + F16 *inArray_pad = (F16 *)tmp; + F16 *itmArray = inArray_pad + ic * ih_pad * iw_pad * 8; + F16 *otmArray = itmArray + 6 * 6 * ic * 8 * 8; + + EE ret = SUCCESS; + // copy input into a input with padding + for (U32 n = 0; n < in; n++) { + F16 *inArray_pad_mov = inArray_pad; + F16 *inArray_mov = inArray + n * ic * ih * iw * 8; + for (U32 c = 0; c < ic; c++) { + memset(inArray_pad_mov, 0, pad_top * iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += pad_top * iw_pad * 8; + for (U32 h = pad_top; h < ih_pad - pad_bottom; h++) { + memset(inArray_pad_mov, 0, pad_left * 8 * bytesOf(idt)); + inArray_pad_mov += pad_left * 8; + memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); + inArray_pad_mov += iw * 8; + inArray_mov += iw * 8; + memset(inArray_pad_mov, 0, pad_right * 8 * bytesOf(idt)); + inArray_pad_mov += pad_right * 8; + 
} + memset(inArray_pad_mov, 0, pad_bottom * iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += pad_bottom * iw_pad * 8; + } + + // tiles / 8 + for (I32 hw = 0; hw < tiles - 7; hw += 8) { + const F16 *ftm_0 = filterArray; + F16 *otm_0 = otmArray; + // in trans + // NCHWc8 => (6*6)*C*c8*hw8 + for (U32 c = 0; c < ic; c++) { + F16 *inArray_pad_mov = inArray_pad + c * ih_pad * iw_pad * 8; + F16 *Iw_ptr[36]; + F16 Iw0[36][8]; + F16 *I0[36]; + F16 Iw1[36][8]; + F16 *I1[36]; + F16 Iw2[36][8]; + F16 *I2[36]; + F16 Iw3[36][8]; + F16 *I3[36]; + F16 Iw4[36][8]; + F16 *I4[36]; + F16 Iw5[36][8]; + F16 *I5[36]; + F16 Iw6[36][8]; + F16 *I6[36]; + F16 Iw7[36][8]; + F16 *I7[36]; + F16 *itmArray_mov = itmArray + c * 8 * 8; + U32 h0 = (hw / tile_w) * 4; + U32 w0 = (hw % tile_w) * 4; + U32 h1 = ((hw + 1) / tile_w) * 4; + U32 w1 = ((hw + 1) % tile_w) * 4; + U32 h2 = ((hw + 2) / tile_w) * 4; + U32 w2 = ((hw + 2) % tile_w) * 4; + U32 h3 = ((hw + 3) / tile_w) * 4; + U32 w3 = ((hw + 3) % tile_w) * 4; + U32 h4 = ((hw + 4) / tile_w) * 4; + U32 w4 = ((hw + 4) % tile_w) * 4; + U32 h5 = ((hw + 5) / tile_w) * 4; + U32 w5 = ((hw + 5) % tile_w) * 4; + U32 h6 = ((hw + 6) / tile_w) * 4; + U32 w6 = ((hw + 6) % tile_w) * 4; + U32 h7 = ((hw + 7) / tile_w) * 4; + U32 w7 = ((hw + 7) % tile_w) * 4; + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + I0[i * 6 + j] = inArray_pad_mov + (h0 + i) * iw_pad * 8 + (w0 + j) * 8; + I1[i * 6 + j] = inArray_pad_mov + (h1 + i) * iw_pad * 8 + (w1 + j) * 8; + I2[i * 6 + j] = inArray_pad_mov + (h2 + i) * iw_pad * 8 + (w2 + j) * 8; + I3[i * 6 + j] = inArray_pad_mov + (h3 + i) * iw_pad * 8 + (w3 + j) * 8; + I4[i * 6 + j] = inArray_pad_mov + (h4 + i) * iw_pad * 8 + (w4 + j) * 8; + I5[i * 6 + j] = inArray_pad_mov + (h5 + i) * iw_pad * 8 + (w5 + j) * 8; + I6[i * 6 + j] = inArray_pad_mov + (h6 + i) * iw_pad * 8 + (w6 + j) * 8; + I7[i * 6 + j] = inArray_pad_mov + (h7 + i) * iw_pad * 8 + (w7 + j) * 8; + } + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw0[i]; + } + trans_I_4x4_3x3(Iw_ptr, I0); + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw1[i]; + } + trans_I_4x4_3x3(Iw_ptr, I1); + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw2[i]; + } + trans_I_4x4_3x3(Iw_ptr, I2); + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw3[i]; + } + trans_I_4x4_3x3(Iw_ptr, I3); + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw4[i]; + } + trans_I_4x4_3x3(Iw_ptr, I4); + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw5[i]; + } + trans_I_4x4_3x3(Iw_ptr, I5); + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw6[i]; + } + trans_I_4x4_3x3(Iw_ptr, I6); + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw7[i]; + } + trans_I_4x4_3x3(Iw_ptr, I7); + for (U32 i = 0; i < 36; i++) { + F16 *itm = itmArray_mov + i * ic * 8 * 8; + + // for (U32 c8 = 0; c8 < 8; c8++) { + // itm[c8*8] = Iw0[i][c8]; + // itm[c8*8 + 1] = Iw1[i][c8]; + // itm[c8*8 + 2] = Iw2[i][c8]; + // itm[c8*8 + 3] = Iw3[i][c8]; + // itm[c8*8 + 4] = Iw4[i][c8]; + // itm[c8*8 + 5] = Iw5[i][c8]; + // itm[c8*8 + 6] = Iw6[i][c8]; + // itm[c8*8 + 7] = Iw7[i][c8]; + // } + + float16x8_t v0 = vld1q_f16(Iw0[i]); + float16x8_t v1 = vld1q_f16(Iw1[i]); + float16x8_t v2 = vld1q_f16(Iw2[i]); + float16x8_t v3 = vld1q_f16(Iw3[i]); + float16x8_t v4 = vld1q_f16(Iw4[i]); + float16x8_t v5 = vld1q_f16(Iw5[i]); + float16x8_t v6 = vld1q_f16(Iw6[i]); + float16x8_t v7 = vld1q_f16(Iw7[i]); + vst1q_f16(itm, + vzip1q_f16(vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(itm + 8, + vzip2q_f16(vzip1q_f16(vzip1q_f16(v0, v4), 
vzip1q_f16(v2, v6)), + vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(itm + 8 * 2, + vzip1q_f16(vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(itm + 8 * 3, + vzip2q_f16(vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(itm + 8 * 4, + vzip1q_f16(vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(itm + 8 * 5, + vzip2q_f16(vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(itm + 8 * 6, + vzip1q_f16(vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(itm + 8 * 7, + vzip2q_f16(vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + } + } + for (I32 o = 0; o < I32(oc - 1); o += 2) { + const F16 *b_0 = biasArray + o * 8; + const F16 *b_1 = b_0 + 8; + F16 *itm_0 = itmArray; + // dot prod + // (6*6)*C*c8*hw8 times O*(6*6)*C*c8*o16 = O*(6*6)*hw8*o16 + for (U32 idx = 0; idx < 36; idx++) { + __asm__ __volatile__("mov x0, %[ic]\n" // ic_blk + "eor v2.16b, v2.16b, v2.16b\n" // out_o0hw0 + "ldr d0, [%[in]]\n" // in_hw0 + "eor v4.16b, v4.16b, v4.16b\n" // out_o0hw1 + "ldr x1, [%[in], #8]\n" + "eor v6.16b, v6.16b, v6.16b\n" // out_o0hw2 + "ins v0.d[1], x1\n" + "eor v8.16b, v8.16b, v8.16b\n" // out_o0hw3 + "ldr d18, [%[f]]\n" // f_o0c0 + "eor v10.16b, v10.16b, v10.16b\n" // out_o0hw4 + "ldr x2, [%[f], #8]\n" + "eor v12.16b, v12.16b, v12.16b\n" // out_o0hw5 + "ins v18.d[1], x2\n" + "eor v14.16b, v14.16b, v14.16b\n" // out_o0hw6 + "ldr d19, [%[f], #16]\n" // f_o1c0 + "eor v16.16b, v16.16b, v16.16b\n" // out_o0hw7 + "ldr x3, [%[f], #24]\n" + "eor v3.16b, v3.16b, v3.16b\n" // out_o1hw0 + "ins v19.d[1], x3\n" + "eor v5.16b, v5.16b, v5.16b\n" // out_o1hw1 + "eor v7.16b, v7.16b, v7.16b\n" // out_o1hw2 + "eor v9.16b, v9.16b, v9.16b\n" // out_o1hw3 + "eor v11.16b, v11.16b, v11.16b\n" // out_o1hw4 + "eor v13.16b, v13.16b, v13.16b\n" // out_o1hw5 + "eor v15.16b, v15.16b, v15.16b\n" // out_o1hw6 + "eor v17.16b, v17.16b, v17.16b\n" // out_o1hw7 + "0:\n" + "ldr d1, [%[in], #16]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr x1, [%[in], #24]\n" + "fmla v4.8h, v18.8h, v0.h[1]\n" + "ins v1.d[1], x1\n" + "fmla v6.8h, v18.8h, v0.h[2]\n" + "ldr d20, [%[f], #32]\n" // f_o0c0 + "fmla v8.8h, v18.8h, v0.h[3]\n" + "ldr x2, [%[f], #40]\n" + "fmla v10.8h, v18.8h, v0.h[4]\n" + "ins v20.d[1], x2\n" + "fmla v12.8h, v18.8h, v0.h[5]\n" + "ldr d21, [%[f], #48]\n" // f_o1c0 + "fmla v14.8h, v18.8h, v0.h[6]\n" + "ldr x3, [%[f], #56]\n" + "fmla v16.8h, v18.8h, v0.h[7]\n" + "ins v21.d[1], x3\n" + "fmla v3.8h, v19.8h, v0.h[0]\n" + "fmla v5.8h, v19.8h, v0.h[1]\n" + "fmla v7.8h, v19.8h, v0.h[2]\n" + "fmla v9.8h, v19.8h, v0.h[3]\n" + "fmla v11.8h, v19.8h, v0.h[4]\n" + "fmla v13.8h, v19.8h, v0.h[5]\n" + "fmla v15.8h, v19.8h, v0.h[6]\n" + "fmla v17.8h, v19.8h, v0.h[7]\n" + + "ldr d0, [%[in], #32]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr x1, [%[in], #40]\n" + "fmla v4.8h, v20.8h, v1.h[1]\n" + "ins v0.d[1], x1\n" + "fmla v6.8h, v20.8h, v1.h[2]\n" + "ldr d18, [%[f], #64]\n" // f_o0c0 + "fmla v8.8h, v20.8h, v1.h[3]\n" + "ldr x2, [%[f], #72]\n" + "fmla v10.8h, v20.8h, v1.h[4]\n" + "ins v18.d[1], x2\n" + "fmla v12.8h, v20.8h, v1.h[5]\n" + "ldr d19, [%[f], #80]\n" // f_o1c0 + "fmla v14.8h, v20.8h, v1.h[6]\n" + "ldr x3, 
[%[f], #88]\n" + "fmla v16.8h, v20.8h, v1.h[7]\n" + "ins v19.d[1], x3\n" + "fmla v3.8h, v21.8h, v1.h[0]\n" + "add %[in], %[in], #32\n" + "fmla v5.8h, v21.8h, v1.h[1]\n" + "add %[f], %[f], #64\n" + "fmla v7.8h, v21.8h, v1.h[2]\n" + "subs x0, x0, #2\n" + "fmla v9.8h, v21.8h, v1.h[3]\n" + "fmla v11.8h, v21.8h, v1.h[4]\n" + "fmla v13.8h, v21.8h, v1.h[5]\n" + "fmla v15.8h, v21.8h, v1.h[6]\n" + "fmla v17.8h, v21.8h, v1.h[7]\n" + "bne 0b\n" + "st1 { v2.8h, v3.8h, v4.8h, v5.8h}, [%[out]], #64\n" + "st1 { v6.8h, v7.8h, v8.8h, v9.8h}, [%[out]], #64\n" + "st1 {v10.8h, v11.8h, v12.8h, v13.8h}, [%[out]], #64\n" + "st1 {v14.8h, v15.8h, v16.8h, v17.8h}, [%[out]], #64\n" + : [out] "+r"(otm_0), [in] "+r"(itm_0), [f] "+r"(ftm_0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "x0", "x1", "x2", + "x3"); + } + // out trans + // O*(6*6)*hw8*o16 => NOHWo8 + for (U32 hw8 = 0; hw8 < 8; hw8++) { + U32 h = (hw + hw8) / tile_w; + U32 w = (hw + hw8) % tile_w; + F16 *out_0 = outArray + n * oc * oh * ow * 8 + o * oh * ow * 8 + + h * 4 * ow * 8 + w * 4 * 8; + F16 *out_1 = out_0 + oh * ow * 8; + U32 otm_off_0 = o * 8 * 36 * 8 + hw8 * 16; + U32 otm_off_1 = otm_off_0 + 8; + + F16 *Ow_0[36]; + F16 *Ow_1[36]; + F16 *O_0[16]; + F16 *O_1[16]; + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + otm_off_0 + idx * 8 * 16; + Ow_1[idx] = otmArray + otm_off_1 + idx * 8 * 16; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + O_1[i * 4 + j] = out_1 + i * ow * 8 + j * 8; + } + } + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + } + } + if (oc & 1) { + F16 *itm_0 = itmArray; + const F16 *ftm_0 = filterArray + (oc - 1) * 36 * ic * 8 * 8; + F16 *otm_0 = otmArray + (oc - 1) * 36 * 8 * 8; + const F16 *b_0 = biasArray + (oc - 1) * 8; + // dot prod + // (6*6)*C*c8*hw8 times O*(6*6)*C*c8*o8 = O*(6*6)*hw8*o8 + for (U32 idx = 0; idx < 36; idx++) { + __asm__ __volatile__("mov x0, %[ic]\n" // ic_blk + "eor v2.16b, v2.16b, v2.16b\n" // out_o0hw0 + "ldr d0, [%[in]]\n" // in_hw0 + "eor v3.16b, v3.16b, v3.16b\n" // out_o0hw1 + "ldr x1, [%[in], #8]\n" + "eor v4.16b, v4.16b, v4.16b\n" // out_o0hw2 + "ins v0.d[1], x1\n" + "eor v5.16b, v5.16b, v5.16b\n" // out_o0hw3 + "ldr d18, [%[f]]\n" // f_o0c0 + "eor v6.16b, v6.16b, v6.16b\n" // out_o0hw4 + "ldr x2, [%[f], #8]\n" + "eor v7.16b, v7.16b, v7.16b\n" // out_o0hw5 + "ins v18.d[1], x2\n" + "eor v8.16b, v8.16b, v8.16b\n" // out_o0hw6 + "eor v9.16b, v9.16b, v9.16b\n" // out_o0hw7 + "0:\n" + "ldr d1, [%[in], #16]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr x1, [%[in], #24]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ins v1.d[1], x1\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ldr d20, [%[f], #16]\n" // f_o0c0 + "fmla v5.8h, v18.8h, v0.h[3]\n" + "ldr x2, [%[f], #24]\n" + "fmla v6.8h, v18.8h, v0.h[4]\n" + "ins v20.d[1], x2\n" + "fmla v7.8h, v18.8h, v0.h[5]\n" + "fmla v8.8h, v18.8h, v0.h[6]\n" + "subs x0, x0, #2\n" + "fmla v9.8h, v18.8h, v0.h[7]\n" + + "ldr d0, [%[in], #32]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr x1, [%[in], #40]\n" + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ins v0.d[1], x1\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ldr d18, [%[f], #32]\n" // f_o0c0 + "fmla v5.8h, v20.8h, 
v1.h[3]\n" + "ldr x2, [%[f], #40]\n" + "fmla v6.8h, v20.8h, v1.h[4]\n" + "ins v18.d[1], x2\n" + "fmla v7.8h, v20.8h, v1.h[5]\n" + "add %[in], %[in], #32\n" + "fmla v8.8h, v20.8h, v1.h[6]\n" + "add %[f], %[f], #32\n" + "fmla v9.8h, v20.8h, v1.h[7]\n" + "bne 0b\n" + "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%[out]], #64\n" + "st1 {v6.8h, v7.8h, v8.8h, v9.8h}, [%[out]], #64\n" + : [out] "+r"(otm_0), [in] "+r"(itm_0), [f] "+r"(ftm_0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v18", "v20", "x0", "x1", "x2"); + } + // out trans + // O*(6*6)*hw8*o8 => NOWHo8 + for (U32 hw8 = 0; hw8 < 8; hw8++) { + U32 h = (hw + hw8) / tile_w; + U32 w = (hw + hw8) % tile_w; + F16 *out_0 = outArray + n * oc * oh * ow * 8 + (oc - 1) * oh * ow * 8 + + h * 4 * ow * 8 + w * 4 * 8; + U32 otm_off_0 = (oc - 1) * 8 * 36 * 8 + hw8 * 8; + + F16 *Ow_0[36]; + F16 *O_0[16]; + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + otm_off_0 + idx * 8 * 8; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + } + } + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + } + } + } + + // tiles_reminder % 8 / 4 + I32 tiles_s = (tiles / 8) * 8; + for (I32 hw = tiles_s; hw < tiles - 3; hw += 4) { + const F16 *ftm_0 = filterArray; + F16 *otm_0 = otmArray; + // in trans + // NCHWc8 => (6*6)*C*c8*hw4 + for (U32 c = 0; c < ic; c++) { + F16 *inArray_pad_mov = inArray_pad + c * ih_pad * iw_pad * 8; + F16 *Iw_ptr[36]; + F16 Iw0[36][8]; + F16 *I0[36]; + F16 Iw1[36][8]; + F16 *I1[36]; + F16 Iw2[36][8]; + F16 *I2[36]; + F16 Iw3[36][8]; + F16 *I3[36]; + F16 *itmArray_mov = itmArray + c * 8 * 4; + U32 h0 = (hw / tile_w) * 4; + U32 w0 = (hw % tile_w) * 4; + U32 h1 = ((hw + 1) / tile_w) * 4; + U32 w1 = ((hw + 1) % tile_w) * 4; + U32 h2 = ((hw + 2) / tile_w) * 4; + U32 w2 = ((hw + 2) % tile_w) * 4; + U32 h3 = ((hw + 3) / tile_w) * 4; + U32 w3 = ((hw + 3) % tile_w) * 4; + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + I0[i * 6 + j] = inArray_pad_mov + (h0 + i) * iw_pad * 8 + (w0 + j) * 8; + I1[i * 6 + j] = inArray_pad_mov + (h1 + i) * iw_pad * 8 + (w1 + j) * 8; + I2[i * 6 + j] = inArray_pad_mov + (h2 + i) * iw_pad * 8 + (w2 + j) * 8; + I3[i * 6 + j] = inArray_pad_mov + (h3 + i) * iw_pad * 8 + (w3 + j) * 8; + } + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw0[i]; + } + trans_I_4x4_3x3(Iw_ptr, I0); + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw1[i]; + } + trans_I_4x4_3x3(Iw_ptr, I1); + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw2[i]; + } + trans_I_4x4_3x3(Iw_ptr, I2); + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw3[i]; + } + trans_I_4x4_3x3(Iw_ptr, I3); + for (U32 i = 0; i < 36; i++) { + F16 *itm = itmArray_mov + i * ic * 8 * 4; + + // for (U32 c8 = 0; c8 < 8; c8++) { + // itm[c8*4] = Iw0[i][c8]; + // itm[c8*4 + 1] = Iw1[i][c8]; + // itm[c8*4 + 2] = Iw2[i][c8]; + // itm[c8*4 + 3] = Iw3[i][c8]; + // } + + __asm__ __volatile__("ldr q0, [%[in_0]]\n" + "ldr q1, [%[in_1]]\n" + "ldr q2, [%[in_2]]\n" + "ldr q3, [%[in_3]]\n" + "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[itm]]\n" + : [itm] "+r"(itm) + : [in_0] "r"(Iw0[i]), [in_1] "r"(Iw1[i]), + [in_2] "r"(Iw2[i]), [in_3] "r"(Iw3[i]) + : "memory", "cc", "v0", "v1", "v2", "v3"); + } + } + for (I32 o = 0; o < I32(oc - 1); o += 2) { + const F16 *b_0 = biasArray + o * 8; + const F16 *b_1 = b_0 + 8; + F16 *itm_0 = itmArray; + // dot prod + // (6*6)*C*c8*hw4 times O*(6*6)*C*c8*o16 = 
O*(6*6)*hw4*o16 + for (U32 idx = 0; idx < 36; idx++) { + __asm__ __volatile__("mov x0, %[ic]\n" // ic_blk + "eor v2.16b, v2.16b, v2.16b\n" // out_o0hw0 + "ldr d0, [%[in]]\n" // in_hw0 + "eor v4.16b, v4.16b, v4.16b\n" // out_o0hw1 + "ldr d18, [%[f]]\n" // f_o0c0 + "eor v6.16b, v6.16b, v6.16b\n" // out_o0hw2 + "ldr x2, [%[f], #8]\n" // f_o0c0 + "eor v8.16b, v8.16b, v8.16b\n" // out_o0hw3 + "ins v18.d[1], x2\n" // f_o0c0 + "ldr d19, [%[f], #16]\n" // f_o1c0 + "eor v3.16b, v3.16b, v3.16b\n" // out_o1hw0 + "ldr x3, [%[f], #24]\n" // f_o1c0 + "eor v5.16b, v5.16b, v5.16b\n" // out_o1hw1 + "ins v19.d[1], x3\n" // f_o1c0 + "eor v7.16b, v7.16b, v7.16b\n" // out_o1hw2 + "eor v9.16b, v9.16b, v9.16b\n" // out_o1hw3 + "0:\n" + "ldr d1, [%[in], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f], #32]\n" // f_o0c0 + "fmla v4.8h, v18.8h, v0.h[1]\n" + "ldr x2, [%[f], #40]\n" // f_o0c0 + "fmla v6.8h, v18.8h, v0.h[2]\n" + "ins v20.d[1], x2\n" // f_o0c0 + "fmla v8.8h, v18.8h, v0.h[3]\n" + "ldr d21, [%[f], #48]\n" // f_o1c0 + "fmla v3.8h, v19.8h, v0.h[0]\n" + "ldr x3, [%[f], #56]\n" // f_o1c0 + "fmla v5.8h, v19.8h, v0.h[1]\n" + "ins v21.d[1], x3\n" // f_o1c0 + "fmla v7.8h, v19.8h, v0.h[2]\n" + "subs x0, x0, #2\n" + "fmla v9.8h, v19.8h, v0.h[3]\n" + + "ldr d0, [%[in], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f], #64]\n" // f_o0c0 + "fmla v4.8h, v20.8h, v1.h[1]\n" + "ldr x2, [%[f], #72]\n" // f_o0c0 + "fmla v6.8h, v20.8h, v1.h[2]\n" + "ins v18.d[1], x2\n" // f_o0c0 + "fmla v8.8h, v20.8h, v1.h[3]\n" + "ldr d19, [%[f], #80]\n" // f_o1c0 + "fmla v3.8h, v21.8h, v1.h[0]\n" + "ldr x3, [%[f], #88]\n" // f_o1c0 + "fmla v5.8h, v21.8h, v1.h[1]\n" + "ins v19.d[1], x3\n" // f_o1c0 + "fmla v7.8h, v21.8h, v1.h[2]\n" + "add %[in], %[in], #16\n" + "fmla v9.8h, v21.8h, v1.h[3]\n" + "add %[f], %[f], #64\n" + "bne 0b\n" + "st1 { v2.8h, v3.8h, v4.8h, v5.8h}, [%[out]], #64\n" + "st1 { v6.8h, v7.8h, v8.8h, v9.8h}, [%[out]], #64\n" + : [out] "+r"(otm_0), [in] "+r"(itm_0), [f] "+r"(ftm_0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v18", "v19", "v20", "v21", "x0", "x1", + "x2", "x3"); + } + // out trans + // O*(6*6)*hw4*o16 => NOWHo8 + for (U32 hw4 = 0; hw4 < 4; hw4++) { + U32 h = (hw + hw4) / tile_w; + U32 w = (hw + hw4) % tile_w; + F16 *out_0 = outArray + n * oc * oh * ow * 8 + o * oh * ow * 8 + + h * 4 * ow * 8 + w * 4 * 8; + F16 *out_1 = out_0 + oh * ow * 8; + U32 otm_off_0 = o * 8 * 36 * 4 + hw4 * 16; + U32 otm_off_1 = otm_off_0 + 8; + + F16 *Ow_0[36]; + F16 *Ow_1[36]; + F16 *O_0[16]; + F16 *O_1[16]; + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + otm_off_0 + idx * 4 * 16; + Ow_1[idx] = otmArray + otm_off_1 + idx * 4 * 16; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + O_1[i * 4 + j] = out_1 + i * ow * 8 + j * 8; + } + } + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + } + } + if (oc & 1) { + F16 *itm_0 = itmArray; + const F16 *ftm_0 = filterArray + (oc - 1) * 8 * 36 * ic * 8; + F16 *otm_0 = otmArray + (oc - 1) * 8 * 36 * 4; + const F16 *b_0 = biasArray + (oc - 1) * 8; + // dot prod + // (6*6)*C*c8*hw4 times O*(6*6)*C*c8*o8 = O*(6*6)*hw4*o8 + for (U32 idx = 0; idx < 36; idx++) { + __asm__ __volatile__("mov x0, %[ic]\n" // ic_blk + "eor v2.16b, 
v2.16b, v2.16b\n" // out_o0hw0 + "ldr d0, [%[in]]\n" // in_hw0 + "eor v3.16b, v3.16b, v3.16b\n" // out_o0hw1 + "ldr d18, [%[f]]\n" // f_o0c0 + "eor v4.16b, v4.16b, v4.16b\n" // out_o0hw2 + "ldr x2, [%[f], #8]\n" + "eor v5.16b, v5.16b, v5.16b\n" // out_o0hw3 + "ins v18.d[1], x2\n" + "0:\n" + "ldr d1, [%[in], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f], #16]\n" // f_o0c0 + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ldr x2, [%[f], #24]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ins v20.d[1], x2\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "subs x0, x0, #2\n" + + "ldr d0, [%[in], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f], #32]\n" // f_o0c0 + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ldr x2, [%[f], #40]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ins v18.d[1], x2\n" + "fmla v5.8h, v20.8h, v1.h[3]\n" + "add %[in], %[in], #16\n" + "add %[f], %[f], #32\n" + "bne 0b\n" + "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%[out]], #64\n" + : [out] "+r"(otm_0), [in] "+r"(itm_0), [f] "+r"(ftm_0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v18", "v20", "x0", "x2"); + } + // out trans + // O*(6*6)*hw4*o8 => NOWHo8 + for (U32 hw4 = 0; hw4 < 4; hw4++) { + U32 h = (hw + hw4) / tile_w; + U32 w = (hw + hw4) % tile_w; + F16 *out_0 = outArray + n * oc * oh * ow * 8 + (oc - 1) * oh * ow * 8 + + h * 4 * ow * 8 + w * 4 * 8; + U32 otm_off_0 = (oc - 1) * 8 * 36 * 4 + hw4 * 8; + + F16 *Ow_0[36]; + F16 *O_0[16]; + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + otm_off_0 + idx * 4 * 8; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + } + } + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + } + } + } + + // tiles_reminder % 4 + tiles_s = (tiles / 4) * 4; + for (I32 hw = tiles_s; hw < tiles; hw++) { + const F16 *ftm_0 = filterArray; + F16 *otm_0 = otmArray; + // in trans + // NCHWc8 => (6*6)*C*c8*hw1 + for (U32 c = 0; c < ic; c++) { + F16 *inArray_pad_mov = inArray_pad + c * ih_pad * iw_pad * 8; + F16 *Iw_ptr[36]; + F16 Iw0[36][8]; + F16 *I0[36]; + F16 *itmArray_mov = itmArray + c * 8; + U32 h0 = (hw / tile_w) * 4; + U32 w0 = (hw % tile_w) * 4; + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + I0[i * 6 + j] = inArray_pad_mov + (h0 + i) * iw_pad * 8 + (w0 + j) * 8; + } + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw0[i]; + } + trans_I_4x4_3x3(Iw_ptr, I0); + for (U32 i = 0; i < 36; i++) { + F16 *itm = itmArray_mov + i * ic * 8; + + // for (U32 c8 = 0; c8 < 8; c8++) { + // itm[c8] = Iw0[i][c8]; + // } + memcpy(itm, Iw0[i], 8 * bytesOf(idt)); + } + } + for (I32 o = 0; o < I32(oc - 1); o += 2) { + const F16 *b_0 = biasArray + o * 8; + const F16 *b_1 = b_0 + 8; + F16 *itm_0 = itmArray; + // dot prod + // (6*6)*C*c8*hw1 times O*(6*6)*C*c8*o16 = O*(6*6)*hw1*o16 + for (U32 idx = 0; idx < 36; idx++) { + __asm__ __volatile__("mov x0, %[ic]\n" // ic_blk + "eor v2.16b, v2.16b, v2.16b\n" // out_o0hw0 + "ldr h0, [%[in]]\n" // in_hw0 + "eor v3.16b, v3.16b, v3.16b\n" // out_o1hw0 + "ldr d18, [%[f]]\n" // f_o0c0 + "ldr x2, [%[f], #8]\n" // f_o0c0 + "ins v18.d[1], x2\n" // f_o0c0 + "ldr d19, [%[f], #16]\n" // f_o1c0 + "ldr x3, [%[f], #24]\n" // f_o1c0 + "ins v19.d[1], x3\n" // f_o1c0 + "0:\n" + "ldr h1, [%[in], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f], #32]\n" // f_o0c0 + "fmla v3.8h, v19.8h, v0.h[0]\n" + "ldr x2, [%[f], #40]\n" // f_o0c0 + "ins v20.d[1], x2\n" // f_o0c0 + "ldr d21, 
[%[f], #48]\n" // f_o1c0 + "subs x0, x0, #2\n" + "ldr x3, [%[f], #56]\n" // f_o1c0 + "ins v21.d[1], x3\n" // f_o1c0 + + "ldr h0, [%[in], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f], #64]\n" // f_o0c0 + "fmla v3.8h, v21.8h, v1.h[0]\n" + "ldr x2, [%[f], #72]\n" // f_o0c0 + "ins v18.d[1], x2\n" // f_o0c0 + "ldr d19, [%[f], #80]\n" // f_o1c0 + "add %[in], %[in], #4\n" + "ldr x3, [%[f], #88]\n" // f_o1c0 + "ins v19.d[1], x3\n" // f_o1c0 + "add %[f], %[f], #64\n" + "bne 0b\n" + "st1 {v2.8h, v3.8h}, [%[out]], #32\n" + : [out] "+r"(otm_0), [in] "+r"(itm_0), [f] "+r"(ftm_0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v2", "v3", "v18", "v19", + "v20", "v21", "x0", "x2", "x3"); + } + // out trans + // O*(6*6)*hw1*o16 => NOWHo8 + U32 h = hw / tile_w; + U32 w = hw % tile_w; + F16 *out_0 = + outArray + n * oc * oh * ow * 8 + o * oh * ow * 8 + h * 4 * ow * 8 + w * 4 * 8; + F16 *out_1 = out_0 + oh * ow * 8; + U32 otm_off_0 = o * 8 * 36; + U32 otm_off_1 = otm_off_0 + 8; + + F16 *Ow_0[36]; + F16 *Ow_1[36]; + F16 *O_0[16]; + F16 *O_1[16]; + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + otm_off_0 + idx * 16; + Ow_1[idx] = otmArray + otm_off_1 + idx * 16; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + O_1[i * 4 + j] = out_1 + i * ow * 8 + j * 8; + } + } + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + } + if (oc & 1) { + F16 *itm_0 = itmArray; + const F16 *ftm_0 = filterArray + (oc - 1) * 8 * 36 * ic * 8; + F16 *otm_0 = otmArray + (oc - 1) * 8 * 36; + const F16 *b_0 = biasArray + (oc - 1) * 8; + // dot prod + // (6*6)*C*c8*hw1 times O*(6*6)*C*c8*o8 = O*(6*6)*hw1*o8 + for (U32 idx = 0; idx < 36; idx++) { + __asm__ __volatile__( + "mov x0, %[ic]\n" // ic_blk + "eor v2.16b, v2.16b, v2.16b\n" // out_o0hw0 + "ldr s0, [%[in]]\n" // in_hw0 + "ldr d18, [%[f]]\n" // f_o0c0 + "ldr x2, [%[f], #8]\n" + "ins v18.d[1], x2\n" + "0:\n" + "ldr h1, [%[in], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f], #16]\n" // f_o0c0 + "ldr x2, [%[f], #24]\n" + "ins v20.d[1], x2\n" + "subs x0, x0, #2\n" + + "ldr h0, [%[in], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f], #32]\n" // f_o0c0 + "ldr x2, [%[f], #40]\n" + "ins v18.d[1], x2\n" + "add %[in], %[in], #4\n" + "add %[f], %[f], #32\n" + "bne 0b\n" + "st1 {v2.8h}, [%[out]], #16\n" + : [out] "+r"(otm_0), [in] "+r"(itm_0), [f] "+r"(ftm_0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v2", "v18", "v20", "x0", "x2"); + } + // out trans + // O*(6*6)*hw1*o8 => NOWHo8 + U32 h = hw / tile_w; + U32 w = hw % tile_w; + F16 *out_0 = outArray + n * oc * oh * ow * 8 + (oc - 1) * oh * ow * 8 + + h * 4 * ow * 8 + w * 4 * 8; + U32 otm_off_0 = (oc - 1) * 8 * 36; + + F16 *Ow_0[36]; + F16 *O_0[16]; + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + otm_off_0 + idx * 8; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + } + } + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + } + } + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_winograd_A76.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_winograd_A76.cpp new file mode 100644 index 00000000..a47a5952 
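The A76 kernel introduced below mirrors the A55 one almost line for line; the main difference is load scheduling. The A55 file splits every 128-bit filter load into two 64-bit halves interleaved with the fmla stream (the usual idiom for keeping the in-order Cortex-A55 dual-issuing), while the A76 file issues full-width loads and lets the out-of-order core schedule them. Side by side, from the two files:

    // A55: split q-register load, interleaved with the multiply-accumulates
    "ldr d18, [%[f]]\n"
    "ldr x2, [%[f], #8]\n"
    "ins v18.d[1], x2\n"
    // A76: a single full-width load
    "ldr q18, [%[f]]\n"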
--- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/convolution_winograd_A76.cpp @@ -0,0 +1,725 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/fp16/convolution_winograd_transform.h" +#include "cpu/arm/fp16/convolution_winograd.h" + +EE convolution_winograd_A76(TensorDesc inputDesc, + F16 *inArray, + TensorDesc filterDesc, + const F16 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F16 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + if (fdf != DF_HWNCN16) { + CHECK_STATUS(NOT_MATCH); + } + if (!(fh == 6 && fw == 6)) { + CHECK_STATUS(NOT_SUPPORTED); + } + + oc /= 8; + ic /= 8; + + U32 tile_h = (oh + 3) / 4; + U32 tile_w = (ow + 3) / 4; + // num of 6x6 tiles + I32 tiles = tile_h * tile_w; + U32 pad_left = paddingL; + U32 pad_right = paddingR + (tile_w * 4 - ow); + U32 pad_w_mod_4 = tile_w * 4 - ow; + U32 pad_top = paddingT; + U32 pad_bottom = paddingB + (tile_h * 4 - oh); + U32 pad_h_mod_4 = tile_h * 4 - oh; + U32 ih_pad = ih + pad_top + pad_bottom; + U32 iw_pad = iw + pad_left + pad_right; + // tmp = in_pad + itm + otm + // in_pad: ic*ih_pad*iw_pad*8 + // itm: 6*6*ic*8*8 + // otm: oc*6*6*8*8 + F16 *inArray_pad = (F16 *)tmp; + F16 *itmArray = inArray_pad + ic * ih_pad * iw_pad * 8; + F16 *otmArray = itmArray + 6 * 6 * ic * 8 * 8; + + EE ret = SUCCESS; + // copy input into a input with padding + for (U32 n = 0; n < in; n++) { + F16 *inArray_pad_mov = inArray_pad; + F16 *inArray_mov = inArray + n * ic * ih * iw * 8; + for (U32 c = 0; c < ic; c++) { + memset(inArray_pad_mov, 0, pad_top * iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += pad_top * iw_pad * 8; + for (U32 h = pad_top; h < ih_pad - pad_bottom; h++) { + memset(inArray_pad_mov, 0, pad_left * 8 * bytesOf(idt)); + inArray_pad_mov += pad_left * 8; + memcpy(inArray_pad_mov, inArray_mov, 
iw * 8 * bytesOf(idt)); + inArray_pad_mov += iw * 8; + inArray_mov += iw * 8; + memset(inArray_pad_mov, 0, pad_right * 8 * bytesOf(idt)); + inArray_pad_mov += pad_right * 8; + } + memset(inArray_pad_mov, 0, pad_bottom * iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += pad_bottom * iw_pad * 8; + } + + // tiles / 8 + for (I32 hw = 0; hw < tiles - 7; hw += 8) { + const F16 *ftm_0 = filterArray; + F16 *otm_0 = otmArray; + // in trans + // NCHWc8 => (6*6)*C*c8*hw8 + for (U32 c = 0; c < ic; c++) { + F16 *inArray_pad_mov = inArray_pad + c * ih_pad * iw_pad * 8; + F16 *itmArray_mov = itmArray + c * 8 * 8; + F16 *Iw_ptr[36]; + F16 Iw[8][36][8]; + F16 *I[8][36]; + U32 h[8]; + U32 w[8]; + for (U32 index = 0; index < 8; index++) { + h[index] = ((hw + index) / tile_w) * 4; + w[index] = ((hw + index) % tile_w) * 4; + } + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + for (U32 index = 0; index < 8; index++) { + I[index][i * 6 + j] = + inArray_pad_mov + (h[index] + i) * iw_pad * 8 + (w[index] + j) * 8; + } + } + } + for (U32 index = 0; index < 8; index++) { + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw[index][i]; + } + trans_I_4x4_3x3(Iw_ptr, I[index]); + } + for (U32 i = 0; i < 36; i++) { + F16 *itm = itmArray_mov + i * ic * 8 * 8; + float16x8_t v0 = vld1q_f16(Iw[0][i]); + float16x8_t v1 = vld1q_f16(Iw[1][i]); + float16x8_t v2 = vld1q_f16(Iw[2][i]); + float16x8_t v3 = vld1q_f16(Iw[3][i]); + float16x8_t v4 = vld1q_f16(Iw[4][i]); + float16x8_t v5 = vld1q_f16(Iw[5][i]); + float16x8_t v6 = vld1q_f16(Iw[6][i]); + float16x8_t v7 = vld1q_f16(Iw[7][i]); + vst1q_f16(itm, + vzip1q_f16(vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(itm + 8, + vzip2q_f16(vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(itm + 8 * 2, + vzip1q_f16(vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(itm + 8 * 3, + vzip2q_f16(vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); + vst1q_f16(itm + 8 * 4, + vzip1q_f16(vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(itm + 8 * 5, + vzip2q_f16(vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(itm + 8 * 6, + vzip1q_f16(vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + vst1q_f16(itm + 8 * 7, + vzip2q_f16(vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); + } + } + for (I32 o = 0; o < I32(oc - 1); o += 2) { + const F16 *b_0 = biasArray + o * 8; + const F16 *b_1 = b_0 + 8; + F16 *itm_0 = itmArray; + // dot prod + // (6*6)*C*c8*hw8 times O*(6*6)*C*c8*o16 = O*(6*6)*hw8*o16 + for (U32 idx = 0; idx < 36; idx++) { + __asm__ __volatile__("mov x0, %[ic]\n" // ic_blk + "eor v2.16b, v2.16b, v2.16b\n" // out_o0hw0 + "ldr q0, [%[in]]\n" // in_hw0 + "eor v4.16b, v4.16b, v4.16b\n" // out_o0hw1 + "eor v6.16b, v6.16b, v6.16b\n" // out_o0hw2 + "eor v8.16b, v8.16b, v8.16b\n" // out_o0hw3 + "ldr q18, [%[f]]\n" // f_o0c0 + "eor v10.16b, v10.16b, v10.16b\n" // out_o0hw4 + "eor v12.16b, v12.16b, v12.16b\n" // out_o0hw5 + "eor v14.16b, v14.16b, v14.16b\n" // out_o0hw6 + "ldr q19, [%[f], #16]\n" // f_o1c0 + "eor v16.16b, v16.16b, v16.16b\n" // out_o0hw7 + "eor v3.16b, v3.16b, 
v3.16b\n" // out_o1hw0 + "eor v5.16b, v5.16b, v5.16b\n" // out_o1hw1 + "eor v7.16b, v7.16b, v7.16b\n" // out_o1hw2 + "eor v9.16b, v9.16b, v9.16b\n" // out_o1hw3 + "eor v11.16b, v11.16b, v11.16b\n" // out_o1hw4 + "eor v13.16b, v13.16b, v13.16b\n" // out_o1hw5 + "eor v15.16b, v15.16b, v15.16b\n" // out_o1hw6 + "eor v17.16b, v17.16b, v17.16b\n" // out_o1hw7 + "0:\n" + "ldr q1, [%[in], #16]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v4.8h, v18.8h, v0.h[1]\n" + "ldr q20, [%[f], #32]\n" // f_o0c0 + "fmla v6.8h, v18.8h, v0.h[2]\n" + "fmla v8.8h, v18.8h, v0.h[3]\n" + "ldr q21, [%[f], #48]\n" // f_o1c0 + "fmla v10.8h, v18.8h, v0.h[4]\n" + "fmla v12.8h, v18.8h, v0.h[5]\n" + "fmla v14.8h, v18.8h, v0.h[6]\n" + "fmla v16.8h, v18.8h, v0.h[7]\n" + "fmla v3.8h, v19.8h, v0.h[0]\n" + "fmla v5.8h, v19.8h, v0.h[1]\n" + "fmla v7.8h, v19.8h, v0.h[2]\n" + "fmla v9.8h, v19.8h, v0.h[3]\n" + "fmla v11.8h, v19.8h, v0.h[4]\n" + "fmla v13.8h, v19.8h, v0.h[5]\n" + "fmla v15.8h, v19.8h, v0.h[6]\n" + "fmla v17.8h, v19.8h, v0.h[7]\n" + + "ldr q0, [%[in], #32]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "fmla v4.8h, v20.8h, v1.h[1]\n" + "ldr q18, [%[f], #64]\n" // f_o0c0 + "fmla v6.8h, v20.8h, v1.h[2]\n" + "fmla v8.8h, v20.8h, v1.h[3]\n" + "ldr q19, [%[f], #80]\n" // f_o1c0 + "fmla v10.8h, v20.8h, v1.h[4]\n" + "fmla v12.8h, v20.8h, v1.h[5]\n" + "fmla v14.8h, v20.8h, v1.h[6]\n" + "fmla v16.8h, v20.8h, v1.h[7]\n" + "fmla v3.8h, v21.8h, v1.h[0]\n" + "add %[in], %[in], #32\n" + "fmla v5.8h, v21.8h, v1.h[1]\n" + "add %[f], %[f], #64\n" + "fmla v7.8h, v21.8h, v1.h[2]\n" + "subs x0, x0, #2\n" + "fmla v9.8h, v21.8h, v1.h[3]\n" + "fmla v11.8h, v21.8h, v1.h[4]\n" + "fmla v13.8h, v21.8h, v1.h[5]\n" + "fmla v15.8h, v21.8h, v1.h[6]\n" + "fmla v17.8h, v21.8h, v1.h[7]\n" + "bne 0b\n" + "st1 { v2.8h, v3.8h, v4.8h, v5.8h}, [%[out]], #64\n" + "st1 { v6.8h, v7.8h, v8.8h, v9.8h}, [%[out]], #64\n" + "st1 {v10.8h, v11.8h, v12.8h, v13.8h}, [%[out]], #64\n" + "st1 {v14.8h, v15.8h, v16.8h, v17.8h}, [%[out]], #64\n" + : [out] "+r"(otm_0), [in] "+r"(itm_0), [f] "+r"(ftm_0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "x0"); + } + // out trans + // O*(6*6)*hw8*o16 => NOHWo8 + for (U32 hw8 = 0; hw8 < 8; hw8++) { + U32 h = (hw + hw8) / tile_w; + U32 w = (hw + hw8) % tile_w; + F16 *out_0 = outArray + n * oc * oh * ow * 8 + o * oh * ow * 8 + + h * 4 * ow * 8 + w * 4 * 8; + F16 *out_1 = out_0 + oh * ow * 8; + U32 otm_off_0 = o * 8 * 36 * 8 + hw8 * 16; + U32 otm_off_1 = otm_off_0 + 8; + + F16 *Ow_0[36]; + F16 *Ow_1[36]; + F16 *O_0[16]; + F16 *O_1[16]; + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + otm_off_0 + idx * 8 * 16; + Ow_1[idx] = otmArray + otm_off_1 + idx * 8 * 16; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + O_1[i * 4 + j] = out_1 + i * ow * 8 + j * 8; + } + } + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + } + } + if (oc & 1) { + F16 *itm_0 = itmArray; + const F16 *ftm_0 = filterArray + (oc - 1) * 36 * ic * 8 * 8; + F16 *otm_0 = otmArray + (oc - 1) * 36 * 8 * 8; + const F16 *b_0 = biasArray + (oc - 1) * 8; + // dot prod + // (6*6)*C*c8*hw8 times O*(6*6)*C*c8*o8 = O*(6*6)*hw8*o8 + for (U32 idx = 0; 
idx < 36; idx++) { + __asm__ __volatile__("mov x0, %[ic]\n" // ic_blk + "eor v2.16b, v2.16b, v2.16b\n" // out_o0hw0 + "ldr q0, [%[in]]\n" // in_hw0 + "eor v3.16b, v3.16b, v3.16b\n" // out_o0hw1 + "eor v4.16b, v4.16b, v4.16b\n" // out_o0hw2 + "eor v5.16b, v5.16b, v5.16b\n" // out_o0hw3 + "ldr q18, [%[f]]\n" // f_o0c0 + "eor v6.16b, v6.16b, v6.16b\n" // out_o0hw4 + "eor v7.16b, v7.16b, v7.16b\n" // out_o0hw5 + "eor v8.16b, v8.16b, v8.16b\n" // out_o0hw6 + "eor v9.16b, v9.16b, v9.16b\n" // out_o0hw7 + "0:\n" + "ldr q1, [%[in], #16]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ldr q20, [%[f], #16]\n" // f_o0c0 + "fmla v5.8h, v18.8h, v0.h[3]\n" + "fmla v6.8h, v18.8h, v0.h[4]\n" + "fmla v7.8h, v18.8h, v0.h[5]\n" + "fmla v8.8h, v18.8h, v0.h[6]\n" + "subs x0, x0, #2\n" + "fmla v9.8h, v18.8h, v0.h[7]\n" + + "ldr q0, [%[in], #32]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "fmla v3.8h, v20.8h, v1.h[1]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ldr q18, [%[f], #32]\n" // f_o0c0 + "fmla v5.8h, v20.8h, v1.h[3]\n" + "fmla v6.8h, v20.8h, v1.h[4]\n" + "fmla v7.8h, v20.8h, v1.h[5]\n" + "add %[in], %[in], #32\n" + "fmla v8.8h, v20.8h, v1.h[6]\n" + "add %[f], %[f], #32\n" + "fmla v9.8h, v20.8h, v1.h[7]\n" + "bne 0b\n" + "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%[out]], #64\n" + "st1 {v6.8h, v7.8h, v8.8h, v9.8h}, [%[out]], #64\n" + : [out] "+r"(otm_0), [in] "+r"(itm_0), [f] "+r"(ftm_0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v18", "v20", "x0"); + } + // out trans + // O*(6*6)*hw8*o8 => NOWHo8 + for (U32 hw8 = 0; hw8 < 8; hw8++) { + U32 h = (hw + hw8) / tile_w; + U32 w = (hw + hw8) % tile_w; + F16 *out_0 = outArray + n * oc * oh * ow * 8 + (oc - 1) * oh * ow * 8 + + h * 4 * ow * 8 + w * 4 * 8; + U32 otm_off_0 = (oc - 1) * 8 * 36 * 8 + hw8 * 8; + + F16 *Ow_0[36]; + F16 *O_0[16]; + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + otm_off_0 + idx * 8 * 8; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + } + } + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + } + } + } + + // tiles_reminder % 8 / 4 + I32 tiles_s = (tiles / 8) * 8; + for (I32 hw = tiles_s; hw < tiles - 3; hw += 4) { + const F16 *ftm_0 = filterArray; + F16 *otm_0 = otmArray; + // in trans + // NCHWc8 => (6*6)*C*c8*hw4 + for (U32 c = 0; c < ic; c++) { + F16 *inArray_pad_mov = inArray_pad + c * ih_pad * iw_pad * 8; + F16 *Iw_ptr[36]; + F16 Iw0[36][8]; + F16 *I0[36]; + F16 Iw1[36][8]; + F16 *I1[36]; + F16 Iw2[36][8]; + F16 *I2[36]; + F16 Iw3[36][8]; + F16 *I3[36]; + F16 *itmArray_mov = itmArray + c * 8 * 4; + U32 h0 = (hw / tile_w) * 4; + U32 w0 = (hw % tile_w) * 4; + U32 h1 = ((hw + 1) / tile_w) * 4; + U32 w1 = ((hw + 1) % tile_w) * 4; + U32 h2 = ((hw + 2) / tile_w) * 4; + U32 w2 = ((hw + 2) % tile_w) * 4; + U32 h3 = ((hw + 3) / tile_w) * 4; + U32 w3 = ((hw + 3) % tile_w) * 4; + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + I0[i * 6 + j] = inArray_pad_mov + (h0 + i) * iw_pad * 8 + (w0 + j) * 8; + I1[i * 6 + j] = inArray_pad_mov + (h1 + i) * iw_pad * 8 + (w1 + j) * 8; + I2[i * 6 + j] = inArray_pad_mov + (h2 + i) * iw_pad * 8 + (w2 + j) * 8; + I3[i * 6 + j] = inArray_pad_mov + (h3 + i) * iw_pad * 8 + (w3 + j) * 8; + } + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw0[i]; + } + trans_I_4x4_3x3(Iw_ptr, I0); + for (U32 i = 0; i < 36; i++) { + 
Iw_ptr[i] = Iw1[i]; + } + trans_I_4x4_3x3(Iw_ptr, I1); + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw2[i]; + } + trans_I_4x4_3x3(Iw_ptr, I2); + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw3[i]; + } + trans_I_4x4_3x3(Iw_ptr, I3); + for (U32 i = 0; i < 36; i++) { + F16 *itm = itmArray_mov + i * ic * 8 * 4; + + // for (U32 c8 = 0; c8 < 8; c8++) { + // itm[c8*4] = Iw0[i][c8]; + // itm[c8*4 + 1] = Iw1[i][c8]; + // itm[c8*4 + 2] = Iw2[i][c8]; + // itm[c8*4 + 3] = Iw3[i][c8]; + // } + + __asm__ __volatile__("ldr q0, [%[in_0]]\n" + "ldr q1, [%[in_1]]\n" + "ldr q2, [%[in_2]]\n" + "ldr q3, [%[in_3]]\n" + "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[itm]]\n" + : [itm] "+r"(itm) + : [in_0] "r"(Iw0[i]), [in_1] "r"(Iw1[i]), + [in_2] "r"(Iw2[i]), [in_3] "r"(Iw3[i]) + : "memory", "cc", "v0", "v1", "v2", "v3"); + } + } + for (I32 o = 0; o < I32(oc - 1); o += 2) { + const F16 *b_0 = biasArray + o * 8; + const F16 *b_1 = b_0 + 8; + F16 *itm_0 = itmArray; + // dot prod + // (6*6)*C*c8*hw4 times O*(6*6)*C*c8*o16 = O*(6*6)*hw4*o16 + for (U32 idx = 0; idx < 36; idx++) { + __asm__ __volatile__("mov x0, %[ic]\n" // ic_blk + "eor v2.16b, v2.16b, v2.16b\n" // out_o0hw0 + "ldr d0, [%[in]]\n" // in_hw0 + "eor v4.16b, v4.16b, v4.16b\n" // out_o0hw1 + "ldr q18, [%[f]]\n" // f_o0c0 + "eor v6.16b, v6.16b, v6.16b\n" // out_o0hw2 + "eor v8.16b, v8.16b, v8.16b\n" // out_o0hw3 + "ldr q19, [%[f], #16]\n" // f_o1c0 + "eor v3.16b, v3.16b, v3.16b\n" // out_o1hw0 + "eor v5.16b, v5.16b, v5.16b\n" // out_o1hw1 + "eor v7.16b, v7.16b, v7.16b\n" // out_o1hw2 + "eor v9.16b, v9.16b, v9.16b\n" // out_o1hw3 + "0:\n" + "ldr d1, [%[in], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr q20, [%[f], #32]\n" // f_o0c0 + "fmla v4.8h, v18.8h, v0.h[1]\n" + "fmla v6.8h, v18.8h, v0.h[2]\n" + "fmla v8.8h, v18.8h, v0.h[3]\n" + "ldr q21, [%[f], #48]\n" // f_o1c0 + "fmla v3.8h, v19.8h, v0.h[0]\n" + "fmla v5.8h, v19.8h, v0.h[1]\n" + "fmla v7.8h, v19.8h, v0.h[2]\n" + "subs x0, x0, #2\n" + "fmla v9.8h, v19.8h, v0.h[3]\n" + + "ldr d0, [%[in], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr q18, [%[f], #64]\n" // f_o0c0 + "fmla v4.8h, v20.8h, v1.h[1]\n" + "fmla v6.8h, v20.8h, v1.h[2]\n" + "fmla v8.8h, v20.8h, v1.h[3]\n" + "ldr q19, [%[f], #80]\n" // f_o1c0 + "fmla v3.8h, v21.8h, v1.h[0]\n" + "fmla v5.8h, v21.8h, v1.h[1]\n" + "fmla v7.8h, v21.8h, v1.h[2]\n" + "add %[in], %[in], #16\n" + "fmla v9.8h, v21.8h, v1.h[3]\n" + "add %[f], %[f], #64\n" + "bne 0b\n" + "st1 { v2.8h, v3.8h, v4.8h, v5.8h}, [%[out]], #64\n" + "st1 { v6.8h, v7.8h, v8.8h, v9.8h}, [%[out]], #64\n" + : [out] "+r"(otm_0), [in] "+r"(itm_0), [f] "+r"(ftm_0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v18", "v19", "v20", "v21", "x0"); + } + // out trans + // O*(6*6)*hw4*o16 => NOWHo8 + for (U32 hw4 = 0; hw4 < 4; hw4++) { + U32 h = (hw + hw4) / tile_w; + U32 w = (hw + hw4) % tile_w; + F16 *out_0 = outArray + n * oc * oh * ow * 8 + o * oh * ow * 8 + + h * 4 * ow * 8 + w * 4 * 8; + F16 *out_1 = out_0 + oh * ow * 8; + U32 otm_off_0 = o * 8 * 36 * 4 + hw4 * 16; + U32 otm_off_1 = otm_off_0 + 8; + + F16 *Ow_0[36]; + F16 *Ow_1[36]; + F16 *O_0[16]; + F16 *O_1[16]; + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + otm_off_0 + idx * 4 * 16; + Ow_1[idx] = otmArray + otm_off_1 + idx * 4 * 16; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + O_1[i * 4 + j] = out_1 + i * ow * 8 + j * 8; + } + } + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, 
pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + } + } + if (oc & 1) { + F16 *itm_0 = itmArray; + const F16 *ftm_0 = filterArray + (oc - 1) * 8 * 36 * ic * 8; + F16 *otm_0 = otmArray + (oc - 1) * 8 * 36 * 4; + const F16 *b_0 = biasArray + (oc - 1) * 8; + // dot prod + // (6*6)*C*c8*hw4 times O*(6*6)*C*c8*o8 = O*(6*6)*hw4*o8 + for (U32 idx = 0; idx < 36; idx++) { + __asm__ __volatile__( + "mov x0, %[ic]\n" // ic_blk + "eor v2.16b, v2.16b, v2.16b\n" // out_o0hw0 + "ldr d0, [%[in]]\n" // in_hw0 + "eor v3.16b, v3.16b, v3.16b\n" // out_o0hw1 + "ldr q18, [%[f]]\n" // f_o0c0 + "eor v4.16b, v4.16b, v4.16b\n" // out_o0hw2 + "eor v5.16b, v5.16b, v5.16b\n" // out_o0hw3 + "0:\n" + "ldr d1, [%[in], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr q20, [%[f], #16]\n" // f_o0c0 + "fmla v3.8h, v18.8h, v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "subs x0, x0, #2\n" + + "ldr d0, [%[in], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr q18, [%[f], #32]\n" // f_o0c0 + "fmla v3.8h, v20.8h, v1.h[1]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "fmla v5.8h, v20.8h, v1.h[3]\n" + "add %[in], %[in], #16\n" + "add %[f], %[f], #32\n" + "bne 0b\n" + "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%[out]], #64\n" + : [out] "+r"(otm_0), [in] "+r"(itm_0), [f] "+r"(ftm_0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v18", "v20", "x0"); + } + // out trans + // O*(6*6)*hw4*o8 => NOWHo8 + for (U32 hw4 = 0; hw4 < 4; hw4++) { + U32 h = (hw + hw4) / tile_w; + U32 w = (hw + hw4) % tile_w; + F16 *out_0 = outArray + n * oc * oh * ow * 8 + (oc - 1) * oh * ow * 8 + + h * 4 * ow * 8 + w * 4 * 8; + U32 otm_off_0 = (oc - 1) * 8 * 36 * 4 + hw4 * 8; + + F16 *Ow_0[36]; + F16 *O_0[16]; + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + otm_off_0 + idx * 4 * 8; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + } + } + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + } + } + } + + // tiles_reminder % 4 + tiles_s = (tiles / 4) * 4; + for (I32 hw = tiles_s; hw < tiles; hw++) { + const F16 *ftm_0 = filterArray; + F16 *otm_0 = otmArray; + // in trans + // NCHWc8 => (6*6)*C*c8*hw1 + for (U32 c = 0; c < ic; c++) { + F16 *inArray_pad_mov = inArray_pad + c * ih_pad * iw_pad * 8; + F16 *Iw_ptr[36]; + F16 Iw0[36][8]; + F16 *I0[36]; + F16 *itmArray_mov = itmArray + c * 8; + U32 h0 = (hw / tile_w) * 4; + U32 w0 = (hw % tile_w) * 4; + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + I0[i * 6 + j] = inArray_pad_mov + (h0 + i) * iw_pad * 8 + (w0 + j) * 8; + } + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw0[i]; + } + trans_I_4x4_3x3(Iw_ptr, I0); + for (U32 i = 0; i < 36; i++) { + F16 *itm = itmArray_mov + i * ic * 8; + + // for (U32 c8 = 0; c8 < 8; c8++) { + // itm[c8] = Iw0[i][c8]; + // } + memcpy(itm, Iw0[i], 8 * bytesOf(idt)); + } + } + for (I32 o = 0; o < I32(oc - 1); o += 2) { + const F16 *b_0 = biasArray + o * 8; + const F16 *b_1 = b_0 + 8; + F16 *itm_0 = itmArray; + // dot prod + // (6*6)*C*c8*hw1 times O*(6*6)*C*c8*o16 = O*(6*6)*hw1*o16 + for (U32 idx = 0; idx < 36; idx++) { + __asm__ __volatile__( + "mov x0, %[ic]\n" // ic_blk + "eor v2.16b, v2.16b, v2.16b\n" // out_o0hw0 + "ldr h0, [%[in]]\n" // in_hw0 + "eor v3.16b, v3.16b, v3.16b\n" // out_o1hw0 + "ldr 
q18, [%[f]]\n" // f_o0c0 + "ldr q19, [%[f], #16]\n" // f_o1c0 + "0:\n" + "ldr h1, [%[in], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr q20, [%[f], #32]\n" // f_o0c0 + "fmla v3.8h, v19.8h, v0.h[0]\n" + "ldr q21, [%[f], #48]\n" // f_o1c0 + "subs x0, x0, #2\n" + + "ldr h0, [%[in], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr q18, [%[f], #64]\n" // f_o0c0 + "fmla v3.8h, v21.8h, v1.h[0]\n" + "ldr q19, [%[f], #80]\n" // f_o1c0 + "add %[in], %[in], #4\n" + "add %[f], %[f], #64\n" + "bne 0b\n" + "st1 {v2.8h, v3.8h}, [%[out]], #32\n" + : [out] "+r"(otm_0), [in] "+r"(itm_0), [f] "+r"(ftm_0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v2", "v3", "v18", "v19", "v20", "v21", "x0"); + } + // out trans + // O*(6*6)*hw1*o16 => NOWHo8 + U32 h = hw / tile_w; + U32 w = hw % tile_w; + F16 *out_0 = + outArray + n * oc * oh * ow * 8 + o * oh * ow * 8 + h * 4 * ow * 8 + w * 4 * 8; + F16 *out_1 = out_0 + oh * ow * 8; + U32 otm_off_0 = o * 8 * 36; + U32 otm_off_1 = otm_off_0 + 8; + + F16 *Ow_0[36]; + F16 *Ow_1[36]; + F16 *O_0[16]; + F16 *O_1[16]; + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + otm_off_0 + idx * 16; + Ow_1[idx] = otmArray + otm_off_1 + idx * 16; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + O_1[i * 4 + j] = out_1 + i * ow * 8 + j * 8; + } + } + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + } + if (oc & 1) { + F16 *itm_0 = itmArray; + const F16 *ftm_0 = filterArray + (oc - 1) * 8 * 36 * ic * 8; + F16 *otm_0 = otmArray + (oc - 1) * 8 * 36; + const F16 *b_0 = biasArray + (oc - 1) * 8; + // dot prod + // (6*6)*C*c8*hw1 times O*(6*6)*C*c8*o8 = O*(6*6)*hw1*o8 + for (U32 idx = 0; idx < 36; idx++) { + __asm__ __volatile__("mov x0, %[ic]\n" // ic_blk + "eor v2.16b, v2.16b, v2.16b\n" // out_o0hw0 + "ldr s0, [%[in]]\n" // in_hw0 + "ldr q18, [%[f]]\n" // f_o0c0 + "0:\n" + "ldr h1, [%[in], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr q20, [%[f], #16]\n" // f_o0c0 + "subs x0, x0, #2\n" + + "ldr h0, [%[in], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr q18, [%[f], #32]\n" // f_o0c0 + "add %[in], %[in], #4\n" + "add %[f], %[f], #32\n" + "bne 0b\n" + "st1 {v2.8h}, [%[out]], #16\n" + : [out] "+r"(otm_0), [in] "+r"(itm_0), [f] "+r"(ftm_0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v2", "v18", "v20", "x0"); + } + // out trans + // O*(6*6)*hw1*o8 => NOWHo8 + U32 h = hw / tile_w; + U32 w = hw % tile_w; + F16 *out_0 = outArray + n * oc * oh * ow * 8 + (oc - 1) * oh * ow * 8 + + h * 4 * ow * 8 + w * 4 * 8; + U32 otm_off_0 = (oc - 1) * 8 * 36; + + F16 *Ow_0[36]; + F16 *O_0[16]; + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + otm_off_0 + idx * 8; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + } + } + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + } + } + } + return ret; +} diff --git a/tensor_computing/src/cpu/arm/fp16/convolution_winograd_transform.h b/compute/tensor/src/cpu/arm/fp16/convolution_winograd_transform.h similarity index 78% rename from tensor_computing/src/cpu/arm/fp16/convolution_winograd_transform.h rename to compute/tensor/src/cpu/arm/fp16/convolution_winograd_transform.h index 
18893534..45c7206d 100644 --- a/tensor_computing/src/cpu/arm/fp16/convolution_winograd_transform.h +++ b/compute/tensor/src/cpu/arm/fp16/convolution_winograd_transform.h @@ -1,26 +1,25 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
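// The transforms below implement Winograd F(4x4, 3x3): every 6x6 input
// tile d produces a 4x4 output tile via Y = A^T[(G g G^T) .* (B^T d B)]A,
// so each group of 16 outputs costs 36 elementwise multiplies instead of
// the 144 a direct 3x3 convolution would need. trans_W_4x4_3x3 applies
// G g G^T to a filter, trans_I_4x4_3x3 applies B^T d B to an input tile,
// and trans_O_4x4_3x3 applies the final A^T (.) A with bias and activation
// fused in. A minimal scalar sketch of the dataflow, assuming generic
// trans_filter/trans_input/trans_output helpers that stand in for the
// vectorized versions in this header:
//
//   U[36] = trans_filter(g);        // G g G^T, done once per filter
//   for (each 6x6 input tile d) {
//       V[36] = trans_input(d);     // B^T d B
//       for (p = 0; p < 36; p++)
//           M[p] = U[p] * V[p];     // the only multiplies
//       Y = trans_output(M, bias);  // A^T M A, then bias + activation
//   }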
- #ifndef _H_WINOGRAD_TRANSFORM #define _H_WINOGRAD_TRANSFORM -#ifdef _USE_FP16 #include <math.h> #include <arm_neon.h> #include "cpu/arm/fp16/arm_functions_fp16.h" -inline void trans_W_4x4_3x3(F16 *Fw[36], F16* const F[9]) { +inline void trans_W_4x4_3x3(F16 *Fw[36], F16 *const F[9]) +{ F16 T[6][3][8]; float16x8_t v_01666 = vmovq_n_f16(0.1666666666666667f); @@ -31,9 +30,9 @@ inline void trans_W_4x4_3x3(F16 *Fw[36], F16* const F[9]) { float16x8_t v_025 = vmovq_n_f16(0.25f); for (U32 i = 0; i < 3; i++) { - float16x8_t v_F0 = vld1q_f16(F[0*3+i]); - float16x8_t v_F1 = vld1q_f16(F[1*3+i]); - float16x8_t v_F2 = vld1q_f16(F[2*3+i]); + float16x8_t v_F0 = vld1q_f16(F[0 * 3 + i]); + float16x8_t v_F1 = vld1q_f16(F[1 * 3 + i]); + float16x8_t v_F2 = vld1q_f16(F[2 * 3 + i]); float16x8_t v_t0 = vmulq_f16(v_01666, v_F2); float16x8_t v_t1 = vsubq_f16(vmulq_f16(v_minus_01666, v_F0), v_t0); @@ -67,17 +66,25 @@ inline void trans_W_4x4_3x3(F16 *Fw[36], F16* const F[9]) { float16x8_t v_Fw3 = vfmaq_f16(v_t2, v_00833, v_T1); float16x8_t v_Fw4 = vfmaq_f16(v_t2, v_minus_00833, v_T1); - vst1q_f16(Fw[i*6+0], v_Fw0); - vst1q_f16(Fw[i*6+1], v_Fw1); - vst1q_f16(Fw[i*6+2], v_Fw2); - vst1q_f16(Fw[i*6+3], v_Fw3); - vst1q_f16(Fw[i*6+4], v_Fw4); - vst1q_f16(Fw[i*6+5], v_T2); + vst1q_f16(Fw[i * 6 + 0], v_Fw0); + vst1q_f16(Fw[i * 6 + 1], v_Fw1); + vst1q_f16(Fw[i * 6 + 2], v_Fw2); + vst1q_f16(Fw[i * 6 + 3], v_Fw3); + vst1q_f16(Fw[i * 6 + 4], v_Fw4); + vst1q_f16(Fw[i * 6 + 5], v_T2); } } -inline EE trans_O_4x4_3x3(F16* const Ow[36], F16 *O[16], const F16* bias, - U32 h, U32 w, U32 _pad_h_mod_4, U32 _pad_w_mod_4, U32 oh, U32 ow, ActivationDesc activationDesc) +inline EE trans_O_4x4_3x3(F16 *const Ow[36], + F16 *O[16], + const F16 *bias, + U32 h, + U32 w, + U32 _pad_h_mod_4, + U32 _pad_w_mod_4, + U32 oh, + U32 ow, + ActivationParamSpec activationDesc) { F16 T[4][6][8]; // bias @@ -90,11 +97,11 @@ inline EE trans_O_4x4_3x3(F16* const Ow[36], F16 *O[16], const F16* bias, for (U32 i = 0; i < 6; i++) { float16x8_t v_Ow0 = vld1q_f16(Ow[i]); - float16x8_t v_Ow1 = vld1q_f16(Ow[1*6+i]); - float16x8_t v_Ow2 = vld1q_f16(Ow[2*6+i]); - float16x8_t v_Ow3 = vld1q_f16(Ow[3*6+i]); - float16x8_t v_Ow4 = vld1q_f16(Ow[4*6+i]); - float16x8_t v_Ow5 = vld1q_f16(Ow[5*6+i]); + float16x8_t v_Ow1 = vld1q_f16(Ow[1 * 6 + i]); + float16x8_t v_Ow2 = vld1q_f16(Ow[2 * 6 + i]); + float16x8_t v_Ow3 = vld1q_f16(Ow[3 * 6 + i]); + float16x8_t v_Ow4 = vld1q_f16(Ow[4 * 6 + i]); + float16x8_t v_Ow5 = vld1q_f16(Ow[5 * 6 + i]); float16x8_t v_t0 = vaddq_f16(v_Ow1, v_Ow2); float16x8_t v_t1 = vaddq_f16(v_Ow3, v_Ow4); @@ -143,55 +150,55 @@ inline EE trans_O_4x4_3x3(F16* const Ow[36], F16 *O[16], const F16* bias, switch (activationDesc.mode) { case ACTIVATION_NULL: { if (pad_w_mod_4 == 0) { - vst1q_f16(O[i*4+0], vaddq_f16(v_O0, v_b)); - vst1q_f16(O[i*4+1], vaddq_f16(v_O1, v_b)); - vst1q_f16(O[i*4+2], vaddq_f16(v_O2, v_b)); - vst1q_f16(O[i*4+3], vaddq_f16(v_O3, v_b)); + vst1q_f16(O[i * 4 + 0], vaddq_f16(v_O0, v_b)); + vst1q_f16(O[i * 4 + 1], vaddq_f16(v_O1, v_b)); + vst1q_f16(O[i * 4 + 2], vaddq_f16(v_O2, v_b)); + vst1q_f16(O[i * 4 + 3], vaddq_f16(v_O3, v_b)); } else if (pad_w_mod_4 == 1) { - vst1q_f16(O[i*4+0], vaddq_f16(v_O0, v_b)); - vst1q_f16(O[i*4+1], vaddq_f16(v_O1, v_b)); - vst1q_f16(O[i*4+2], vaddq_f16(v_O2, v_b)); + vst1q_f16(O[i * 4 + 0], vaddq_f16(v_O0, v_b)); + vst1q_f16(O[i * 4 + 1], vaddq_f16(v_O1, v_b)); + vst1q_f16(O[i * 4 + 2], vaddq_f16(v_O2, v_b)); } else if (pad_w_mod_4 == 2) { - vst1q_f16(O[i*4+0], vaddq_f16(v_O0, v_b)); - vst1q_f16(O[i*4+1], vaddq_f16(v_O1, v_b)); +
vst1q_f16(O[i * 4 + 0], vaddq_f16(v_O0, v_b)); + vst1q_f16(O[i * 4 + 1], vaddq_f16(v_O1, v_b)); } else if (pad_w_mod_4 == 3) { - vst1q_f16(O[i*4+0], vaddq_f16(v_O0, v_b)); + vst1q_f16(O[i * 4 + 0], vaddq_f16(v_O0, v_b)); } break; } case ACTIVATION_RELU: { if (pad_w_mod_4 == 0) { - vst1q_f16(O[i*4+0], vmaxq_f16(vaddq_f16(v_O0, v_b), v_0)); - vst1q_f16(O[i*4+1], vmaxq_f16(vaddq_f16(v_O1, v_b), v_0)); - vst1q_f16(O[i*4+2], vmaxq_f16(vaddq_f16(v_O2, v_b), v_0)); - vst1q_f16(O[i*4+3], vmaxq_f16(vaddq_f16(v_O3, v_b), v_0)); + vst1q_f16(O[i * 4 + 0], vmaxq_f16(vaddq_f16(v_O0, v_b), v_0)); + vst1q_f16(O[i * 4 + 1], vmaxq_f16(vaddq_f16(v_O1, v_b), v_0)); + vst1q_f16(O[i * 4 + 2], vmaxq_f16(vaddq_f16(v_O2, v_b), v_0)); + vst1q_f16(O[i * 4 + 3], vmaxq_f16(vaddq_f16(v_O3, v_b), v_0)); } else if (pad_w_mod_4 == 1) { - vst1q_f16(O[i*4+0], vmaxq_f16(vaddq_f16(v_O0, v_b), v_0)); - vst1q_f16(O[i*4+1], vmaxq_f16(vaddq_f16(v_O1, v_b), v_0)); - vst1q_f16(O[i*4+2], vmaxq_f16(vaddq_f16(v_O2, v_b), v_0)); + vst1q_f16(O[i * 4 + 0], vmaxq_f16(vaddq_f16(v_O0, v_b), v_0)); + vst1q_f16(O[i * 4 + 1], vmaxq_f16(vaddq_f16(v_O1, v_b), v_0)); + vst1q_f16(O[i * 4 + 2], vmaxq_f16(vaddq_f16(v_O2, v_b), v_0)); } else if (pad_w_mod_4 == 2) { - vst1q_f16(O[i*4+0], vmaxq_f16(vaddq_f16(v_O0, v_b), v_0)); - vst1q_f16(O[i*4+1], vmaxq_f16(vaddq_f16(v_O1, v_b), v_0)); + vst1q_f16(O[i * 4 + 0], vmaxq_f16(vaddq_f16(v_O0, v_b), v_0)); + vst1q_f16(O[i * 4 + 1], vmaxq_f16(vaddq_f16(v_O1, v_b), v_0)); } else if (pad_w_mod_4 == 3) { - vst1q_f16(O[i*4+0], vmaxq_f16(vaddq_f16(v_O0, v_b), v_0)); + vst1q_f16(O[i * 4 + 0], vmaxq_f16(vaddq_f16(v_O0, v_b), v_0)); } break; } case ACTIVATION_SIGMOID: { if (pad_w_mod_4 == 0) { - vst1q_f16(O[i*4+0], vsigmoidq_f16(vaddq_f16(v_O0, v_b))); - vst1q_f16(O[i*4+1], vsigmoidq_f16(vaddq_f16(v_O1, v_b))); - vst1q_f16(O[i*4+2], vsigmoidq_f16(vaddq_f16(v_O2, v_b))); - vst1q_f16(O[i*4+3], vsigmoidq_f16(vaddq_f16(v_O3, v_b))); + vst1q_f16(O[i * 4 + 0], vsigmoidq_f16(vaddq_f16(v_O0, v_b))); + vst1q_f16(O[i * 4 + 1], vsigmoidq_f16(vaddq_f16(v_O1, v_b))); + vst1q_f16(O[i * 4 + 2], vsigmoidq_f16(vaddq_f16(v_O2, v_b))); + vst1q_f16(O[i * 4 + 3], vsigmoidq_f16(vaddq_f16(v_O3, v_b))); } else if (pad_w_mod_4 == 1) { - vst1q_f16(O[i*4+0], vsigmoidq_f16(vaddq_f16(v_O0, v_b))); - vst1q_f16(O[i*4+1], vsigmoidq_f16(vaddq_f16(v_O1, v_b))); - vst1q_f16(O[i*4+2], vsigmoidq_f16(vaddq_f16(v_O2, v_b))); + vst1q_f16(O[i * 4 + 0], vsigmoidq_f16(vaddq_f16(v_O0, v_b))); + vst1q_f16(O[i * 4 + 1], vsigmoidq_f16(vaddq_f16(v_O1, v_b))); + vst1q_f16(O[i * 4 + 2], vsigmoidq_f16(vaddq_f16(v_O2, v_b))); } else if (pad_w_mod_4 == 2) { - vst1q_f16(O[i*4+0], vsigmoidq_f16(vaddq_f16(v_O0, v_b))); - vst1q_f16(O[i*4+1], vsigmoidq_f16(vaddq_f16(v_O1, v_b))); + vst1q_f16(O[i * 4 + 0], vsigmoidq_f16(vaddq_f16(v_O0, v_b))); + vst1q_f16(O[i * 4 + 1], vsigmoidq_f16(vaddq_f16(v_O1, v_b))); } else if (pad_w_mod_4 == 3) { - vst1q_f16(O[i*4+0], vsigmoidq_f16(vaddq_f16(v_O0, v_b))); + vst1q_f16(O[i * 4 + 0], vsigmoidq_f16(vaddq_f16(v_O0, v_b))); } break; } @@ -202,8 +209,7 @@ inline EE trans_O_4x4_3x3(F16* const Ow[36], F16 *O[16], const F16* bias, return SUCCESS; } - -inline void trans_I_4x4_3x3(F16 *Iw[36], F16* const I[36]) +inline void trans_I_4x4_3x3(F16 *Iw[36], F16 *const I[36]) { F16 T[6][6][8]; @@ -213,12 +219,12 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16* const I[36]) float16x8_t v_minus_5 = vmovq_n_f16(-5); for (U32 i = 0; i < 6; i++) { - float16x8_t v_I0 = vld1q_f16(I[0*6+i]); - float16x8_t v_I1 = vld1q_f16(I[1*6+i]); - float16x8_t v_I2 = 
vld1q_f16(I[2*6+i]); - float16x8_t v_I3 = vld1q_f16(I[3*6+i]); - float16x8_t v_I4 = vld1q_f16(I[4*6+i]); - float16x8_t v_I5 = vld1q_f16(I[5*6+i]); + float16x8_t v_I0 = vld1q_f16(I[0 * 6 + i]); + float16x8_t v_I1 = vld1q_f16(I[1 * 6 + i]); + float16x8_t v_I2 = vld1q_f16(I[2 * 6 + i]); + float16x8_t v_I3 = vld1q_f16(I[3 * 6 + i]); + float16x8_t v_I4 = vld1q_f16(I[4 * 6 + i]); + float16x8_t v_I5 = vld1q_f16(I[5 * 6 + i]); float16x8_t v_t0 = vfmaq_f16(v_I4, v_I2, v_minus_4); float16x8_t v_t1 = vfmaq_f16(v_I3, v_I1, v_minus_4); @@ -298,11 +304,11 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16* const I[36]) check[c] = tmp; } } - memcpy(Iw[i*6+0], check, 8*bytesOf(DT_F16)); + memcpy(Iw[i * 6 + 0], check, 8 * bytesOf(DT_F16)); } else { - vst1q_f16(Iw[i*6+0], v_Iw0); + vst1q_f16(Iw[i * 6 + 0], v_Iw0); } - + max = vmaxvq_f16(v_Iw1); min = vminvq_f16(v_Iw1); if (UNI_ISNAN(max) || UNI_ISINF(max) || UNI_ISNAN(min) || UNI_ISINF(min)) { @@ -337,11 +343,11 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16* const I[36]) check[c] = tmp; } } - memcpy(Iw[i*6+1], check, 8*bytesOf(DT_F16)); + memcpy(Iw[i * 6 + 1], check, 8 * bytesOf(DT_F16)); } else { - vst1q_f16(Iw[i*6+1], v_Iw1); + vst1q_f16(Iw[i * 6 + 1], v_Iw1); } - + max = vmaxvq_f16(v_Iw2); min = vminvq_f16(v_Iw2); if (UNI_ISNAN(max) || UNI_ISINF(max) || UNI_ISNAN(min) || UNI_ISINF(min)) { @@ -376,11 +382,11 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16* const I[36]) check[c] = tmp; } } - memcpy(Iw[i*6+2], check, 8*bytesOf(DT_F16)); + memcpy(Iw[i * 6 + 2], check, 8 * bytesOf(DT_F16)); } else { - vst1q_f16(Iw[i*6+2], v_Iw2); + vst1q_f16(Iw[i * 6 + 2], v_Iw2); } - + max = vmaxvq_f16(v_Iw3); min = vminvq_f16(v_Iw3); if (UNI_ISNAN(max) || UNI_ISINF(max) || UNI_ISNAN(min) || UNI_ISINF(min)) { @@ -415,11 +421,11 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16* const I[36]) check[c] = tmp; } } - memcpy(Iw[i*6+3], check, 8*bytesOf(DT_F16)); + memcpy(Iw[i * 6 + 3], check, 8 * bytesOf(DT_F16)); } else { - vst1q_f16(Iw[i*6+3], v_Iw3); + vst1q_f16(Iw[i * 6 + 3], v_Iw3); } - + max = vmaxvq_f16(v_Iw4); min = vminvq_f16(v_Iw4); if (UNI_ISNAN(max) || UNI_ISINF(max) || UNI_ISNAN(min) || UNI_ISINF(min)) { @@ -454,11 +460,11 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16* const I[36]) check[c] = tmp; } } - memcpy(Iw[i*6+4], check, 8*bytesOf(DT_F16)); + memcpy(Iw[i * 6 + 4], check, 8 * bytesOf(DT_F16)); } else { - vst1q_f16(Iw[i*6+4], v_Iw4); + vst1q_f16(Iw[i * 6 + 4], v_Iw4); } - + max = vmaxvq_f16(v_Iw5); min = vminvq_f16(v_Iw5); if (UNI_ISNAN(max) || UNI_ISINF(max) || UNI_ISNAN(min) || UNI_ISINF(min)) { @@ -493,11 +499,10 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16* const I[36]) check[c] = tmp; } } - memcpy(Iw[i*6+5], check, 8*bytesOf(DT_F16)); + memcpy(Iw[i * 6 + 5], check, 8 * bytesOf(DT_F16)); } else { - vst1q_f16(Iw[i*6+5], v_Iw5); + vst1q_f16(Iw[i * 6 + 5], v_Iw5); } } } #endif -#endif diff --git a/compute/tensor/src/cpu/arm/fp16/deconvolution_transform.cpp b/compute/tensor/src/cpu/arm/fp16/deconvolution_transform.cpp new file mode 100644 index 00000000..f54f6ad7 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/deconvolution_transform.cpp @@ -0,0 +1,97 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/transform_functions.h" +#include "cpu/arm/fp16/tensor_computing_fp16.h" + +inline EE deconvolution_transform_filter_kernel_fp16(TensorDesc filterDesc, + const F16 *filterArray, + TensorDesc *ftmDesc, + F16 *ftmArray, + DataFormat ftmDataFormat) +{ + // Procedure should be the same, but fhfw is reversed + if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) { + CHECK_STATUS(NULL_POINTER); + } + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + if (fdf == ftmDataFormat) { + *ftmDesc = filterDesc; + memcpy(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); + return SUCCESS; + } + if (fdf != DF_NCHW) { + CHECK_STATUS(NOT_SUPPORTED); + } + EE ret = SUCCESS; + switch (ftmDataFormat) { + case DF_NHWCN16: { + /* + * CNHW => NHWCN16 + * if there is remainder, it should be NHWCN8 + */ + *ftmDesc = tensor4df(fdt, ftmDataFormat, fc, fn, fh, fw); + transformCNHWToNHWCNx(filterDesc, filterArray, *ftmDesc, ftmArray); + break; + } + case DF_HWNCN16: { + /* + * CNHW => NHWCN16 + NHWCN8 if there is remainder divided by 16 + */ + *ftmDesc = tensor4df(fdt, ftmDataFormat, fc, fn, 6, 6); + transformCNHWToHWNCNx(filterDesc, filterArray, *ftmDesc, ftmArray); + break; + } + case DF_NCHWC8: { + *ftmDesc = tensor4df(fdt, DF_NCHWC8, fn, fc, fh, fw); + transformCNHWToNCHWC8(filterDesc, filterArray, *ftmDesc, ftmArray); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE deconvolution_transform_filter_fp16(TensorDesc filterDesc, + const F16 *filter, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + F16 *filterTransformed) +{ + DataFormat ftmDataFormat; + switch (algorithm) { + case CONVOLUTION_ALGORITHM_WINOGRAD: + ftmDataFormat = DF_HWNCN16; + break; + case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: + ftmDataFormat = DF_NHWCN16; + break; + case CONVOLUTION_ALGORITHM_GEMM: + ftmDataFormat = DF_NHWCN16; + break; + case CONVOLUTION_ALGORITHM_GROUP_DECONV: + ftmDataFormat = DF_NCHWC8; + break; + default: + return NOT_MATCH; + } + EE ret = deconvolution_transform_filter_kernel_fp16( + filterDesc, filter, ftmDesc, filterTransformed, ftmDataFormat); + CHECK_STATUS(ret); + return ret; +} diff --git a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution.cpp b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution.cpp new file mode 100644 index 00000000..af32a944 --- /dev/null +++ 
b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution.cpp @@ -0,0 +1,93 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/fp16/tensor_computing_fp16.h" +#include "cpu/arm/fp16/depthwise_pointwise_convolution_direct.h" +#include "cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h" +#include "cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1.h" + +EE depthwise_pointwise_convolution_fp16(TensorDesc inputDesc, + F16 *input, + TensorDesc dwFilterDesc, + const F16 *dwFilter, + TensorDesc pwFilterDesc, + const F16 *pwFilter, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc dwBiasDesc, + const F16 *dwBias, + TensorDesc pwBiasDesc, + const F16 *pwBias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *output, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch) +{ + if (nullptr == input || nullptr == dwFilter || nullptr == output || nullptr == dwBias || + nullptr == tmp) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (!(idt == DT_F16 && fdt == DT_F16 && odt == DT_F16)) { + CHECK_STATUS(NOT_MATCH); + } + if (!(idf == DF_NCHWC8 && odf == DF_NCHWC8)) { + CHECK_STATUS(NOT_MATCH); + } + if (ic != fc) { + CHECK_STATUS(NOT_MATCH); + } + + EE ret = SUCCESS; + switch (algorithm) { + case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: + ret = depthwise_pointwise_convolution_direct(inputDesc, input, dwFilterDesc, dwFilter, + pwFilterDesc, pwFilter, convParamSpec, dwBiasDesc, dwBias, pwBiasDesc, pwBias, + tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec, + pointwiseActivationParamSpec, arch); + break; + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: + ret = depthwise_pointwise_convolution_direct(inputDesc, input, dwFilterDesc, dwFilter, + pwFilterDesc, pwFilter, convParamSpec, dwBiasDesc, dwBias, pwBiasDesc, pwBias, + tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec, + pointwiseActivationParamSpec, arch); + break; + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT_NO_PADDING: + ret = 
depthwise_pointwise_convolution_direct_no_padding(inputDesc, input, dwFilterDesc, + dwFilter, pwFilterDesc, pwFilter, convParamSpec, dwBiasDesc, dwBias, pwBiasDesc, + pwBias, tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec, + pointwiseActivationParamSpec, arch); + break; + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_3X3S1P1: + ret = depthwise_pointwise_convolution_3x3s1p1(inputDesc, input, dwFilterDesc, dwFilter, + pwFilterDesc, pwFilter, convParamSpec, dwBiasDesc, dwBias, pwBiasDesc, pwBias, + tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec, + pointwiseActivationParamSpec, arch); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1.h b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1.h new file mode 100644 index 00000000..1a6e644c --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1.h @@ -0,0 +1,95 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
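+
+// This header declares the fused depthwise (3x3, stride 1, pad 1) plus
+// pointwise (1x1) convolution kernel, with separate Cortex-A55 and
+// Cortex-A76 implementations that differ mainly in how loads are
+// scheduled against the fp16 FMLAs; the inline wrapper below dispatches
+// on Arch. Conceptually (a rough sketch, with dw3x3/pw1x1 as
+// hypothetical stand-ins rather than real functions in this codebase):
+//
+//   tmp = dw3x3(in, dwFilter, dwBias, depthwiseActivation);   // per-channel 3x3
+//   out = pw1x1(tmp, pwFilter, pwBias, pointwiseActivation);  // 1x1 across channels
+//
+// with the intermediate kept in the caller-provided tmp buffer so it
+// never round-trips through a separate layer.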
+ +#ifndef _H_DEPTHWISE_POINTWISE_CONVOLUTION_3X3S1P1 +#define _H_DEPTHWISE_POINTWISE_CONVOLUTION_3X3S1P1 + +#include "sys.h" +#include "types.h" +#include "error.h" + +EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec); + +EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec); + +inline EE depthwise_pointwise_convolution_3x3s1p1(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch) +{ + EE ret = SUCCESS; + switch (arch) { + case ARM_A55: + ret = depthwise_pointwise_convolution_3x3s1p1_A55(inputDesc, inArray, dwFilterDesc, + dwFilterArray, pwFilterDesc, pwFilterArray, convParamSpec, dwBiasDesc, dwBiasArray, + pwBiasDesc, pwBiasArray, tmpBytes, tmp, outputDesc, outArray, + depthwiseActivationParamSpec, pointwiseActivationParamSpec); + break; + case ARM_A76: + ret = depthwise_pointwise_convolution_3x3s1p1_A76(inputDesc, inArray, dwFilterDesc, + dwFilterArray, pwFilterDesc, pwFilterArray, convParamSpec, dwBiasDesc, dwBiasArray, + pwBiasDesc, pwBiasArray, tmpBytes, tmp, outputDesc, outArray, + depthwiseActivationParamSpec, pointwiseActivationParamSpec); + break; + default: + return NOT_SUPPORTED; + } + return ret; +} +#endif diff --git a/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1_A55.cpp b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1_A55.cpp similarity index 74% rename from tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1_A55.cpp rename to compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1_A55.cpp index 9262862d..7b30c26c 100644 --- a/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1_A55.cpp +++ b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1_A55.cpp @@ -1,31 +1,38 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include "cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1.h" -EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc) +EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec) { - UNUSED(biasDesc); UNUSED(tmpBytes); - UNUSED(convDesc); + UNUSED(convParamSpec); DataType idt, fdt, odt; DataFormat idf, fdf, odf; @@ -33,28 +40,29 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra U32 fn, fc, fh, fw; U32 on, oc, oh, ow; CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - if (fdf != DF_CHWC8_NCN16) + if (dwFilterDesc.df != DF_NCHWC8 || pwFilterDesc.df != DF_NHWCN16) { CHECK_STATUS(NOT_MATCH); + } oc /= 8; ic /= 8; I32 ohow = oh * ow; - F16 *pwArray = (F16*)tmp; + F16 *pwArray = (F16 *)tmp; for (U32 n = 0; n < in; n++) { // dw_conv + padding for (U32 c = 0; c < ic; c++) { - const 
F16 *b = biasArray + c*8; - F16 *in_c = inArray + c*ih*iw*8; - const F16 *f = filterArray + c*fh*fw*8; - F16 *out = pwArray + c*ohow*8; + const F16 *b = dwBiasArray + c * 8; + F16 *in_c = inArray + c * ih * iw * 8; + const F16 *f = dwFilterArray + c * fh * fw * 8; + F16 *out = pwArray + c * ohow * 8; F16 *in0 = in_c; - F16 *in1 = in0 + iw*8; - F16 *in2 = in1 + iw*8; + F16 *in1 = in0 + iw * 8; + F16 *in2 = in1 + iw * 8; __asm__ __volatile__( "mov x0, %[w]\n" "ldr q28, [%[b]]\n" @@ -72,10 +80,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "ldr q19, [%[in_1], #16]\n" "ldr q20, [%[in_1], #32]\n" "ldr q21, [%[in_1], #48]\n" - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 "ldr q17, [%[in_0], #64]\n" "fmla v10.8h, v3.8h, v13.8h\n" @@ -117,10 +125,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "add %[in_0], %[in_0], #48\n" "add %[in_1], %[in_1], #48\n" - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse "bne 111f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" @@ -128,13 +136,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "111:\n" "cmp %[depthwiseActivationMode], %[am_relu6]\n" "bne 112f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) "fmin v10.8h, v10.8h, v22.8h\n" "fmin v11.8h, v11.8h, v22.8h\n" "fmin v12.8h, v12.8h, v22.8h\n" @@ -142,13 +150,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "112:\n" "cmp %[depthwiseActivationMode], %[am_h_swish]\n" "bne 113f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three "fadd v27.8h, v9.8h, v22.8h\n" "fadd v29.8h, v10.8h, v22.8h\n" "fadd v30.8h, v11.8h, v22.8h\n" "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six + "movi v22.8h, #0x46, lsl #8\n" // six "fmax v27.8h, v27.8h, v17.8h\n" "fmax v29.8h, v29.8h, v17.8h\n" "fmax v30.8h, v30.8h, v17.8h\n" @@ -170,10 +178,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" "0:\n" - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 "ldr d17, [%[in_0], #64]\n" "fmla v9.8h, v3.8h, v13.8h\n" @@ -236,10 +244,10 @@ EE 
depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "add %[in_0], %[in_0], #64\n" "add %[in_1], %[in_1], #64\n" - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse "bne 211f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" @@ -247,13 +255,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "211:\n" "cmp %[depthwiseActivationMode], %[am_relu6]\n" "bne 212f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) "fmin v10.8h, v10.8h, v22.8h\n" "fmin v11.8h, v11.8h, v22.8h\n" "fmin v12.8h, v12.8h, v22.8h\n" @@ -261,13 +269,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "212:\n" "cmp %[depthwiseActivationMode], %[am_h_swish]\n" "bne 213f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three "fadd v27.8h, v9.8h, v22.8h\n" "fadd v29.8h, v10.8h, v22.8h\n" "fadd v30.8h, v11.8h, v22.8h\n" "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six + "movi v22.8h, #0x46, lsl #8\n" // six "fmax v27.8h, v27.8h, v17.8h\n" "fmax v29.8h, v29.8h, v17.8h\n" "fmax v30.8h, v30.8h, v17.8h\n" @@ -290,10 +298,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" "bne 0b\n" - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 "ldr q17, [%[in_0], #64]\n" "fmla v9.8h, v3.8h, v13.8h\n" @@ -322,10 +330,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "fmla v10.8h, v8.8h, v21.8h\n" "fmla v11.8h, v8.8h, v22.8h\n" - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse "bne 311f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" @@ -333,13 +341,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "311:\n" "cmp %[depthwiseActivationMode], %[am_relu6]\n" "bne 312f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax 
v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) "fmin v10.8h, v10.8h, v22.8h\n" "fmin v11.8h, v11.8h, v22.8h\n" "fmin v12.8h, v12.8h, v22.8h\n" @@ -347,13 +355,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "312:\n" "cmp %[depthwiseActivationMode], %[am_h_swish]\n" "bne 313f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three "fadd v27.8h, v9.8h, v22.8h\n" "fadd v29.8h, v10.8h, v22.8h\n" "fadd v30.8h, v11.8h, v22.8h\n" "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six + "movi v22.8h, #0x46, lsl #8\n" // six "fmax v27.8h, v27.8h, v17.8h\n" "fmax v29.8h, v29.8h, v17.8h\n" "fmax v30.8h, v30.8h, v17.8h\n" @@ -373,25 +381,20 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "313:\n" "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" - :[out]"+r"(out), - [in_0]"+r"(in0), - [in_1]"+r"(in1) - :[f]"r"(f), - [b]"r"(b), - [w]"r"((I64)ow-8), - [depthwiseActivationMode]"r"((I64)depthwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3" - ); - - for (U32 h = 0; h < oh-2; h++) { - in0 = in_c + h*iw*8; - in1 = in0 + iw*8; - in2 = in1 + iw*8; + : [out] "+r"(out), [in_0] "+r"(in0), [in_1] "+r"(in1) + : [f] "r"(f), [b] "r"(b), [w] "r"((I64)ow - 8), + [depthwiseActivationMode] "r"((I64)depthwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", + "x3"); + + for (U32 h = 0; h < oh - 2; h++) { + in0 = in_c + h * iw * 8; + in1 = in0 + iw * 8; + in2 = in1 + iw * 8; __asm__ __volatile__( "mov x0, %[w]\n" "ldr q28, [%[b]]\n" @@ -417,10 +420,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "ldr q25, [%[in_2], #32]\n" "ldr q26, [%[in_2], #48]\n" - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 "ldr q17, [%[in_0], #64]\n" "fmla v10.8h, v0.8h, v13.8h\n" @@ -480,10 +483,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "add %[in_1], %[in_1], #48\n" "add %[in_2], %[in_2], #48\n" - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse "bne 111f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" 
"fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" @@ -491,13 +494,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "111:\n" "cmp %[depthwiseActivationMode], %[am_relu6]\n" "bne 112f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) "fmin v10.8h, v10.8h, v22.8h\n" "fmin v11.8h, v11.8h, v22.8h\n" "fmin v12.8h, v12.8h, v22.8h\n" @@ -505,13 +508,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "112:\n" "cmp %[depthwiseActivationMode], %[am_h_swish]\n" "bne 113f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three "fadd v27.8h, v9.8h, v22.8h\n" "fadd v29.8h, v10.8h, v22.8h\n" "fadd v30.8h, v11.8h, v22.8h\n" "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six + "movi v22.8h, #0x46, lsl #8\n" // six "fmax v27.8h, v27.8h, v17.8h\n" "fmax v29.8h, v29.8h, v17.8h\n" "fmax v30.8h, v30.8h, v17.8h\n" @@ -533,10 +536,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" "0:\n" - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 "ldr d17, [%[in_0], #64]\n" "fmla v9.8h, v0.8h, v13.8h\n" @@ -604,7 +607,6 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "fmla v11.8h, v8.8h, v27.8h\n" "fmla v12.8h, v8.8h, v23.8h\n" - "ldr d16, [%[in_0], #112]\n" "mov v15.16b, v14.16b\n" "ldr x1, [%[in_0], #120]\n" @@ -628,10 +630,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "add %[in_1], %[in_1], #64\n" "add %[in_2], %[in_2], #64\n" - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse "bne 211f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" @@ -639,13 +641,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "211:\n" "cmp %[depthwiseActivationMode], %[am_relu6]\n" "bne 212f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) "fmin v10.8h, v10.8h, v22.8h\n" "fmin v11.8h, v11.8h, v22.8h\n" "fmin v12.8h, v12.8h, v22.8h\n" @@ -653,13 +655,13 @@ EE 
depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "212:\n" "cmp %[depthwiseActivationMode], %[am_h_swish]\n" "bne 213f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three "fadd v27.8h, v9.8h, v22.8h\n" "fadd v29.8h, v10.8h, v22.8h\n" "fadd v30.8h, v11.8h, v22.8h\n" "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six + "movi v22.8h, #0x46, lsl #8\n" // six "fmax v27.8h, v27.8h, v17.8h\n" "fmax v29.8h, v29.8h, v17.8h\n" "fmax v30.8h, v30.8h, v17.8h\n" @@ -682,10 +684,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" "bne 0b\n" - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 "ldr q17, [%[in_0], #64]\n" "fmla v9.8h, v0.8h, v13.8h\n" @@ -726,10 +728,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "fmla v10.8h, v8.8h, v26.8h\n" "fmla v11.8h, v8.8h, v27.8h\n" - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse "bne 311f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" @@ -737,13 +739,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "311:\n" "cmp %[depthwiseActivationMode], %[am_relu6]\n" "bne 312f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) "fmin v10.8h, v10.8h, v22.8h\n" "fmin v11.8h, v11.8h, v22.8h\n" "fmin v12.8h, v12.8h, v22.8h\n" @@ -751,13 +753,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "312:\n" "cmp %[depthwiseActivationMode], %[am_h_swish]\n" "bne 313f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three "fadd v27.8h, v9.8h, v22.8h\n" "fadd v29.8h, v10.8h, v22.8h\n" "fadd v30.8h, v11.8h, v22.8h\n" "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six + "movi v22.8h, #0x46, lsl #8\n" // six "fmax v27.8h, v27.8h, v17.8h\n" "fmax v29.8h, v29.8h, v17.8h\n" "fmax v30.8h, v30.8h, v17.8h\n" @@ -777,25 +779,19 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "313:\n" "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" - :[out]"+r"(out), - [in_0]"+r"(in0), - [in_1]"+r"(in1), - [in_2]"+r"(in2) - :[f]"r"(f), - [b]"r"(b), - [w]"r"((I64)ow-8), - [depthwiseActivationMode]"r"((I64)depthwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - 
[am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3" - ); + : [out] "+r"(out), [in_0] "+r"(in0), [in_1] "+r"(in1), [in_2] "+r"(in2) + : [f] "r"(f), [b] "r"(b), [w] "r"((I64)ow - 8), + [depthwiseActivationMode] "r"((I64)depthwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "x0", "x1", "x2", "x3"); } - in0 = in_c + (ih-2)*iw*8; - in1 = in0 + iw*8; - in2 = in1 + iw*8; + in0 = in_c + (ih - 2) * iw * 8; + in1 = in0 + iw * 8; + in2 = in1 + iw * 8; __asm__ __volatile__( "mov x0, %[w]\n" "ldr q28, [%[b]]\n" @@ -814,10 +810,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "ldr q20, [%[in_1], #32]\n" "ldr q21, [%[in_1], #48]\n" - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 "ldr q17, [%[in_0], #64]\n" "fmla v10.8h, v0.8h, v13.8h\n" @@ -859,10 +855,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "add %[in_0], %[in_0], #48\n" "add %[in_1], %[in_1], #48\n" - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse "bne 111f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" @@ -870,13 +866,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "111:\n" "cmp %[depthwiseActivationMode], %[am_relu6]\n" "bne 112f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) "fmin v10.8h, v10.8h, v22.8h\n" "fmin v11.8h, v11.8h, v22.8h\n" "fmin v12.8h, v12.8h, v22.8h\n" @@ -884,13 +880,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "112:\n" "cmp %[depthwiseActivationMode], %[am_h_swish]\n" "bne 113f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three "fadd v27.8h, v9.8h, v22.8h\n" "fadd v29.8h, v10.8h, v22.8h\n" "fadd v30.8h, v11.8h, v22.8h\n" "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six + "movi v22.8h, #0x46, lsl #8\n" // six "fmax v27.8h, 
v27.8h, v17.8h\n" "fmax v29.8h, v29.8h, v17.8h\n" "fmax v30.8h, v30.8h, v17.8h\n" @@ -912,10 +908,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" "0:\n" - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 "ldr d17, [%[in_0], #64]\n" "fmla v9.8h, v0.8h, v13.8h\n" @@ -962,7 +958,6 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "fmla v11.8h, v5.8h, v22.8h\n" "fmla v12.8h, v5.8h, v18.8h\n" - "ldr d16, [%[in_0], #112]\n" "mov v15.16b, v14.16b\n" "ldr x1, [%[in_0], #120]\n" @@ -979,10 +974,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "add %[in_0], %[in_0], #64\n" "add %[in_1], %[in_1], #64\n" - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse "bne 211f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" @@ -990,13 +985,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "211:\n" "cmp %[depthwiseActivationMode], %[am_relu6]\n" "bne 212f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) "fmin v10.8h, v10.8h, v22.8h\n" "fmin v11.8h, v11.8h, v22.8h\n" "fmin v12.8h, v12.8h, v22.8h\n" @@ -1004,13 +999,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "212:\n" "cmp %[depthwiseActivationMode], %[am_h_swish]\n" "bne 213f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three "fadd v27.8h, v9.8h, v22.8h\n" "fadd v29.8h, v10.8h, v22.8h\n" "fadd v30.8h, v11.8h, v22.8h\n" "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six + "movi v22.8h, #0x46, lsl #8\n" // six "fmax v27.8h, v27.8h, v17.8h\n" "fmax v29.8h, v29.8h, v17.8h\n" "fmax v30.8h, v30.8h, v17.8h\n" @@ -1033,10 +1028,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" "bne 0b\n" - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 "ldr q17, [%[in_0], #64]\n" "fmla v9.8h, v0.8h, v13.8h\n" @@ -1065,10 +1060,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "fmla v10.8h, v5.8h, v21.8h\n" "fmla v11.8h, v5.8h, v22.8h\n" - "cmp %[depthwiseActivationMode], 
%[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse "bne 311f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" @@ -1076,13 +1071,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "311:\n" "cmp %[depthwiseActivationMode], %[am_relu6]\n" "bne 312f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) "fmin v10.8h, v10.8h, v22.8h\n" "fmin v11.8h, v11.8h, v22.8h\n" "fmin v12.8h, v12.8h, v22.8h\n" @@ -1090,13 +1085,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "312:\n" "cmp %[depthwiseActivationMode], %[am_h_swish]\n" "bne 313f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three "fadd v27.8h, v9.8h, v22.8h\n" "fadd v29.8h, v10.8h, v22.8h\n" "fadd v30.8h, v11.8h, v22.8h\n" "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six + "movi v22.8h, #0x46, lsl #8\n" // six "fmax v27.8h, v27.8h, v17.8h\n" "fmax v29.8h, v29.8h, v17.8h\n" "fmax v30.8h, v30.8h, v17.8h\n" @@ -1116,34 +1111,29 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "313:\n" "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" - :[out]"+r"(out), - [in_0]"+r"(in0), - [in_1]"+r"(in1) - :[f]"r"(f), - [b]"r"(b), - [w]"r"((I64)ow-8), - [depthwiseActivationMode]"r"((I64)depthwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3" - ); + : [out] "+r"(out), [in_0] "+r"(in0), [in_1] "+r"(in1) + : [f] "r"(f), [b] "r"(b), [w] "r"((I64)ow - 8), + [depthwiseActivationMode] "r"((I64)depthwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", + "x3"); } // pw_conv - for (I32 hw = 0; hw < ohow-7; hw+=8) { - const F16 *b0 = biasArray + ic*8; + for (I32 hw = 0; hw < ohow - 7; hw += 8) { + const F16 *b0 = pwBiasArray; const F16 *b1 = b0 + 8; - const F16 *f_o0c0 = filterArray + ic*fh*fw*8; - F16 *in_pack = pwArray + ohow*ic*8; + const F16 *f_o0c0 = pwFilterArray; + F16 *in_pack = pwArray + ohow * ic * 8; // pack input // NCHWc8 => NHWChw8 for (U32 c = 0; c < ic; c++) { - F16 *in_pack_c8hw8 = in_pack + c*8*8; + 
F16 *in_pack_c8hw8 = in_pack + c * 8 * 8; // it is 2% faster than in_hw8c8 = ... + hw*8; Amazing! - F16 *in_hw8c8 = pwArray + c*ohow*8; + F16 *in_hw8c8 = pwArray + c * ohow * 8; // // for (U32 c8 = 0; c8 < 8; c8++) { // for (U32 hw8 = 0; hw8 < 8; hw8++) { @@ -1151,102 +1141,94 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra // } // } // - float16x8_t v0 = vld1q_f16(in_hw8c8 + hw*8); - float16x8_t v1 = vld1q_f16(in_hw8c8 + hw*8 + 8); - float16x8_t v2 = vld1q_f16(in_hw8c8 + hw*8 + 8*2); - float16x8_t v3 = vld1q_f16(in_hw8c8 + hw*8 + 8*3); - float16x8_t v4 = vld1q_f16(in_hw8c8 + hw*8 + 8*4); - float16x8_t v5 = vld1q_f16(in_hw8c8 + hw*8 + 8*5); - float16x8_t v6 = vld1q_f16(in_hw8c8 + hw*8 + 8*6); - float16x8_t v7 = vld1q_f16(in_hw8c8 + hw*8 + 8*7); + float16x8_t v0 = vld1q_f16(in_hw8c8 + hw * 8); + float16x8_t v1 = vld1q_f16(in_hw8c8 + hw * 8 + 8); + float16x8_t v2 = vld1q_f16(in_hw8c8 + hw * 8 + 8 * 2); + float16x8_t v3 = vld1q_f16(in_hw8c8 + hw * 8 + 8 * 3); + float16x8_t v4 = vld1q_f16(in_hw8c8 + hw * 8 + 8 * 4); + float16x8_t v5 = vld1q_f16(in_hw8c8 + hw * 8 + 8 * 5); + float16x8_t v6 = vld1q_f16(in_hw8c8 + hw * 8 + 8 * 6); + float16x8_t v7 = vld1q_f16(in_hw8c8 + hw * 8 + 8 * 7); vst1q_f16(in_pack_c8hw8, - vzip1q_f16( - vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip1q_f16(vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); vst1q_f16(in_pack_c8hw8 + 8, - vzip2q_f16( - vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip2q_f16(vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*2, - vzip1q_f16( - vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vst1q_f16(in_pack_c8hw8 + 8 * 2, + vzip1q_f16(vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*3, - vzip2q_f16( - vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vst1q_f16(in_pack_c8hw8 + 8 * 3, + vzip2q_f16(vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*4, - vzip1q_f16( - vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vst1q_f16(in_pack_c8hw8 + 8 * 4, + vzip1q_f16(vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*5, - vzip2q_f16( - vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vst1q_f16(in_pack_c8hw8 + 8 * 5, + vzip2q_f16(vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*6, - vzip1q_f16( - vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vst1q_f16(in_pack_c8hw8 + 8 * 6, + vzip1q_f16(vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*7, - vzip2q_f16( - vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vst1q_f16(in_pack_c8hw8 + 8 * 7, + vzip2q_f16(vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); } // compute - for (I32 o = 0; o < I32(oc-1); o+=2) { + for (I32 o = 0; o < I32(oc - 1); o += 2) { F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; // bias const F16 *b_o0 = 
b0; const F16 *b_o1 = b1; __asm__ __volatile__( - "ldr d22, [%[b_0]]\n" //b_o0 + "ldr d22, [%[b_0]]\n" // b_o0 "ldr x1, [%[b_0], #8]\n" "ins v22.d[1], x1\n" - "ldr d23, [%[b_1]]\n" //b_o1 + "ldr d23, [%[b_1]]\n" // b_o1 "ldr x2, [%[b_1], #8]\n" "ins v23.d[1], x2\n" - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 "ldr x1, [%[in_0], #8]\n" - "mov v4.16b, v22.16b\n" //out_o0hw2 + "mov v4.16b, v22.16b\n" // out_o0hw2 "ins v0.d[1], x1\n" - "mov v5.16b, v22.16b\n" //out_o0hw3 - "ldr d18, [%[f_0]]\n" //f_o0c0 - "mov v6.16b, v22.16b\n" //out_o0hw4 + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v6.16b, v22.16b\n" // out_o0hw4 "ldr x2, [%[f_0], #8]\n" - "mov v7.16b, v22.16b\n" //out_o0hw5 + "mov v7.16b, v22.16b\n" // out_o0hw5 "ins v18.d[1], x2\n" - "mov v8.16b, v22.16b\n" //out_o0hw6 - "ldr d19, [%[f_0], #16]\n" //f_o1c0 - "mov v9.16b, v22.16b\n" //out_o0hw7 + "mov v8.16b, v22.16b\n" // out_o0hw6 + "ldr d19, [%[f_0], #16]\n" // f_o1c0 + "mov v9.16b, v22.16b\n" // out_o0hw7 "ldr x3, [%[f_0], #24]\n" - "mov v10.16b, v23.16b\n" //out_o1hw0 + "mov v10.16b, v23.16b\n" // out_o1hw0 "ins v19.d[1], x3\n" - "mov v11.16b, v23.16b\n" //out_o1hw1 - "mov v12.16b, v23.16b\n" //out_o1hw2 - "mov v13.16b, v23.16b\n" //out_o1hw3 - "mov v14.16b, v23.16b\n" //out_o1hw4 - "mov v15.16b, v23.16b\n" //out_o1hw5 - "mov v16.16b, v23.16b\n" //out_o1hw6 - "mov v17.16b, v23.16b\n" //out_o1hw7 + "mov v11.16b, v23.16b\n" // out_o1hw1 + "mov v12.16b, v23.16b\n" // out_o1hw2 + "mov v13.16b, v23.16b\n" // out_o1hw3 + "mov v14.16b, v23.16b\n" // out_o1hw4 + "mov v15.16b, v23.16b\n" // out_o1hw5 + "mov v16.16b, v23.16b\n" // out_o1hw6 + "mov v17.16b, v23.16b\n" // out_o1hw7 "0:\n" - "ldr d1, [%[in_0], #16]\n" //in_hw0 + "ldr d1, [%[in_0], #16]\n" // in_hw0 "fmla v2.8h, v18.8h, v0.h[0]\n" "ldr x1, [%[in_0], #24]\n" "fmla v3.8h, v18.8h, v0.h[1]\n" "ins v1.d[1], x1\n" "fmla v4.8h, v18.8h, v0.h[2]\n" - "ldr d20, [%[f_0], #32]\n" //f_o0c0 + "ldr d20, [%[f_0], #32]\n" // f_o0c0 "fmla v5.8h, v18.8h, v0.h[3]\n" "ldr x2, [%[f_0], #40]\n" "fmla v6.8h, v18.8h, v0.h[4]\n" "ins v20.d[1], x2\n" "fmla v7.8h, v18.8h, v0.h[5]\n" - "ldr d21, [%[f_0], #48]\n" //f_o1c0 + "ldr d21, [%[f_0], #48]\n" // f_o1c0 "fmla v8.8h, v18.8h, v0.h[6]\n" "ldr x3, [%[f_0], #56]\n" "fmla v9.8h, v18.8h, v0.h[7]\n" @@ -1260,19 +1242,19 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "fmla v16.8h, v19.8h, v0.h[6]\n" "fmla v17.8h, v19.8h, v0.h[7]\n" - "ldr d0, [%[in_0], #32]\n" //in_hw0 + "ldr d0, [%[in_0], #32]\n" // in_hw0 "fmla v2.8h, v20.8h, v1.h[0]\n" "ldr x1, [%[in_0], #40]\n" "fmla v3.8h, v20.8h, v1.h[1]\n" "ins v0.d[1], x1\n" "fmla v4.8h, v20.8h, v1.h[2]\n" - "ldr d18, [%[f_0], #64]\n" //f_o0c0 + "ldr d18, [%[f_0], #64]\n" // f_o0c0 "fmla v5.8h, v20.8h, v1.h[3]\n" "ldr x2, [%[f_0], #72]\n" "fmla v6.8h, v20.8h, v1.h[4]\n" "ins v18.d[1], x2\n" "fmla v7.8h, v20.8h, v1.h[5]\n" - "ldr d19, [%[f_0], #80]\n" //f_o1c0 + "ldr d19, [%[f_0], #80]\n" // f_o1c0 "fmla v8.8h, v20.8h, v1.h[6]\n" "ldr x3, [%[f_0], #88]\n" "fmla v9.8h, v20.8h, v1.h[7]\n" @@ -1292,7 +1274,7 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "cmp %[pointwiseActivationMode], %[am_relu]\n" "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero + "eor v0.16b, v0.16b, v0.16b\n" // 
zero "fmax v2.8h, v2.8h, v0.8h\n" "fmax v3.8h, v3.8h, v0.8h\n" "fmax v4.8h, v4.8h, v0.8h\n" @@ -1313,8 +1295,8 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "11:\n" "cmp %[pointwiseActivationMode], %[am_relu6]\n" "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six "fmax v2.8h, v2.8h, v0.8h\n" "fmax v3.8h, v3.8h, v0.8h\n" "fmax v4.8h, v4.8h, v0.8h\n" @@ -1351,9 +1333,9 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "12:\n" "cmp %[pointwiseActivationMode], %[am_h_swish]\n" "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three "fadd v19.8h, v2.8h, v18.8h\n" "fadd v20.8h, v3.8h, v18.8h\n" "fadd v21.8h, v4.8h, v18.8h\n" @@ -1437,72 +1419,66 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "fmul v17.8h, v26.8h, v17.8h\n" "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q4, [%[out_0], #32]\n" //out_o0hw2 - "str q5, [%[out_0], #48]\n" //out_o0hw3 - "str q6, [%[out_0], #64]\n" //out_o0hw4 - "str q7, [%[out_0], #80]\n" //out_o0hw5 - "str q8, [%[out_0], #96]\n" //out_o0hw6 - "str q9, [%[out_0], #112]\n" //out_o0hw7 - "str q10, [%[out_1]]\n" //out_o1hw0 - "str q11, [%[out_1], #16]\n" //out_o1hw1 - "str q12, [%[out_1], #32]\n" //out_o1hw2 - "str q13, [%[out_1], #48]\n" //out_o1hw3 - "str q14, [%[out_1], #64]\n" //out_o1hw4 - "str q15, [%[out_1], #80]\n" //out_o1hw5 - "str q16, [%[out_1], #96]\n" //out_o1hw6 - "str q17, [%[out_1], #112]\n" //out_o1hw7 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "x0", "x1", "x2", "x3" - ); + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q6, [%[out_0], #64]\n" // out_o0hw4 + "str q7, [%[out_0], #80]\n" // out_o0hw5 + "str q8, [%[out_0], #96]\n" // out_o0hw6 + "str q9, [%[out_0], #112]\n" // out_o0hw7 + "str q10, [%[out_1]]\n" // out_o1hw0 + "str q11, [%[out_1], #16]\n" // out_o1hw1 + "str q12, [%[out_1], #32]\n" // out_o1hw2 + "str q13, [%[out_1], #48]\n" // out_o1hw3 + "str q14, [%[out_1], #64]\n" // out_o1hw4 + "str q15, [%[out_1], #80]\n" // out_o1hw5 + "str q16, [%[out_1], #96]\n" // out_o1hw6 + "str q17, [%[out_1], #112]\n" // out_o1hw7 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", 
"v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "x0", "x1", "x2", "x3"); b0 += 16; b1 += 16; } if (oc & 1) { // oc%2 != 0 - const F16 *f_r = filterArray + ic*fh*fw*8 + (oc-1)*8*ic*8; + const F16 *f_r = pwFilterArray + (oc - 1) * 8 * ic * 8; F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; // bias - const F16 *b_o0 = biasArray + ic*8 + (oc-1)*8; + const F16 *b_o0 = pwBiasArray + (oc - 1) * 8; __asm__ __volatile__( - "ldr q12, [%[b_0]]\n" //b_o0 - "mov x0, %[ic]\n" // ic_blk - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v12.16b\n" //out_o0hw0 + "ldr q12, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v12.16b\n" // out_o0hw0 "ldr x1, [%[in_0], #8]\n" - "mov v3.16b, v12.16b\n" //out_o0hw1 + "mov v3.16b, v12.16b\n" // out_o0hw1 "ins v0.d[1], x1\n" - "mov v4.16b, v12.16b\n" //out_o0hw2 - "ldr d10, [%[f_0]]\n" //f_o0c0 - "mov v5.16b, v12.16b\n" //out_o0hw3 + "mov v4.16b, v12.16b\n" // out_o0hw2 + "ldr d10, [%[f_0]]\n" // f_o0c0 + "mov v5.16b, v12.16b\n" // out_o0hw3 "ldr x2, [%[f_0], #8]\n" - "mov v6.16b, v12.16b\n" //out_o0hw4 + "mov v6.16b, v12.16b\n" // out_o0hw4 "ins v10.d[1], x2\n" - "mov v7.16b, v12.16b\n" //out_o0hw5 - "mov v8.16b, v12.16b\n" //out_o0hw6 - "mov v9.16b, v12.16b\n" //out_o0hw7 + "mov v7.16b, v12.16b\n" // out_o0hw5 + "mov v8.16b, v12.16b\n" // out_o0hw6 + "mov v9.16b, v12.16b\n" // out_o0hw7 "0:\n" - "ldr d1, [%[in_0], #16]\n" //in_hw0 + "ldr d1, [%[in_0], #16]\n" // in_hw0 "fmla v2.8h, v10.8h, v0.h[0]\n" "ldr x1, [%[in_0], #24]\n" "fmla v3.8h, v10.8h, v0.h[1]\n" "ins v1.d[1], x1\n" "fmla v4.8h, v10.8h, v0.h[2]\n" - "ldr d11, [%[f_0], #16]\n" //f_o0c0 + "ldr d11, [%[f_0], #16]\n" // f_o0c0 "fmla v5.8h, v10.8h, v0.h[3]\n" "ldr x2, [%[f_0], #24]\n" "fmla v6.8h, v10.8h, v0.h[4]\n" @@ -1512,13 +1488,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "fmla v8.8h, v10.8h, v0.h[6]\n" "fmla v9.8h, v10.8h, v0.h[7]\n" - "ldr d0, [%[in_0], #32]\n" //in_hw0 + "ldr d0, [%[in_0], #32]\n" // in_hw0 "fmla v2.8h, v11.8h, v1.h[0]\n" "ldr x1, [%[in_0], #40]\n" "fmla v3.8h, v11.8h, v1.h[1]\n" "ins v0.d[1], x1\n" "fmla v4.8h, v11.8h, v1.h[2]\n" - "ldr d10, [%[f_0], #32]\n" //f_o0c0 + "ldr d10, [%[f_0], #32]\n" // f_o0c0 "fmla v5.8h, v11.8h, v1.h[3]\n" "ldr x2, [%[f_0], #40]\n" "fmla v6.8h, v11.8h, v1.h[4]\n" @@ -1532,7 +1508,7 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "cmp %[pointwiseActivationMode], %[am_relu]\n" "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero + "eor v0.16b, v0.16b, v0.16b\n" // zero "fmax v2.8h, v2.8h, v0.8h\n" "fmax v3.8h, v3.8h, v0.8h\n" "fmax v4.8h, v4.8h, v0.8h\n" @@ -1545,8 +1521,8 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "11:\n" "cmp %[pointwiseActivationMode], %[am_relu6]\n" "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six "fmax v2.8h, v2.8h, v0.8h\n" "fmax v3.8h, v3.8h, v0.8h\n" "fmax v4.8h, v4.8h, v0.8h\n" @@ -1564,12 +1540,12 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "fmin v8.8h, v8.8h, v1.8h\n" "fmin v9.8h, v9.8h, v1.8h\n" - "12:\n" + "12:\n" "cmp %[pointwiseActivationMode], %[am_h_swish]\n" "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl 
#8\n" //six - "movi v10.8h, #0x42, lsl #8\n" //three + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v10.8h, #0x42, lsl #8\n" // three "fadd v11.8h, v2.8h, v10.8h\n" "fadd v12.8h, v3.8h, v10.8h\n" "fadd v13.8h, v4.8h, v10.8h\n" @@ -1612,26 +1588,21 @@ EE depthwise_pointwise_convolution_3x3s1p1_A55(TensorDesc inputDesc, F16* inArra "fmul v9.8h, v18.8h, v9.8h\n" "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw0 - "str q4, [%[out_0], #32]\n" //out_o0hw0 - "str q5, [%[out_0], #48]\n" //out_o0hw0 - "str q6, [%[out_0], #64]\n" //out_o0hw0 - "str q7, [%[out_0], #80]\n" //out_o0hw0 - "str q8, [%[out_0], #96]\n" //out_o0hw0 - "str q9, [%[out_0], #112]\n" //out_o0hw0 - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "x0", "x1", "x2" - ); + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw0 + "str q4, [%[out_0], #32]\n" // out_o0hw0 + "str q5, [%[out_0], #48]\n" // out_o0hw0 + "str q6, [%[out_0], #64]\n" // out_o0hw0 + "str q7, [%[out_0], #80]\n" // out_o0hw0 + "str q8, [%[out_0], #96]\n" // out_o0hw0 + "str q9, [%[out_0], #112]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "x0", "x1", "x2"); } } } diff --git a/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1_A76.cpp b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1_A76.cpp similarity index 72% rename from tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1_A76.cpp rename to compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1_A76.cpp index f790ff80..e9340602 100644 --- a/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1_A76.cpp +++ b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1_A76.cpp @@ -1,31 +1,38 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include "cpu/arm/fp16/depthwise_pointwise_convolution_3x3s1p1.h" -EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc) +EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec) { - UNUSED(biasDesc); UNUSED(tmpBytes); - UNUSED(convDesc); + UNUSED(convParamSpec); DataType idt, fdt, odt; DataFormat idf, fdf, odf; @@ -33,28 +40,29 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra U32 fn, fc, fh, fw; U32 on, oc, oh, ow; CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - if (fdf != DF_CHWC8_NCN16) + if (dwFilterDesc.df != DF_NCHWC8 || pwFilterDesc.df != DF_NHWCN16) { CHECK_STATUS(NOT_MATCH); + } oc /= 8; ic /= 8; I32 ohow = oh * ow; - F16 *pwArray = (F16*)tmp; + F16 *pwArray = (F16 *)tmp; for (U32 n = 0; n < in; n++) { // dw_conv + padding for (U32 c = 0; c < ic; c++) { - const 
F16 *b = biasArray + c*8; - F16 *in_c = inArray + c*ih*iw*8; - const F16 *f = filterArray + c*fh*fw*8; - F16 *out = pwArray + c*ohow*8; + const F16 *b = dwBiasArray + c * 8; + F16 *in_c = inArray + c * ih * iw * 8; + const F16 *f = dwFilterArray + c * fh * fw * 8; + F16 *out = pwArray + c * ohow * 8; F16 *in0 = in_c; - F16 *in1 = in0 + iw*8; - F16 *in2 = in1 + iw*8; + F16 *in1 = in0 + iw * 8; + F16 *in2 = in1 + iw * 8; __asm__ __volatile__( "mov x0, %[w]\n" "ldr q28, [%[b]]\n" @@ -72,10 +80,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "ldr q19, [%[in_1], #16]\n" "ldr q20, [%[in_1], #32]\n" "ldr q21, [%[in_1], #48]\n" - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 "ldr q17, [%[in_0], #64]\n" "fmla v10.8h, v3.8h, v13.8h\n" @@ -117,10 +125,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "add %[in_0], %[in_0], #48\n" "add %[in_1], %[in_1], #48\n" - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse "bne 111f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" @@ -128,13 +136,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "111:\n" "cmp %[depthwiseActivationMode], %[am_relu6]\n" "bne 112f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) "fmin v10.8h, v10.8h, v22.8h\n" "fmin v11.8h, v11.8h, v22.8h\n" "fmin v12.8h, v12.8h, v22.8h\n" @@ -142,13 +150,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "112:\n" "cmp %[depthwiseActivationMode], %[am_h_swish]\n" "bne 113f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three "fadd v27.8h, v9.8h, v22.8h\n" "fadd v29.8h, v10.8h, v22.8h\n" "fadd v30.8h, v11.8h, v22.8h\n" "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six + "movi v22.8h, #0x46, lsl #8\n" // six "fmax v27.8h, v27.8h, v17.8h\n" "fmax v29.8h, v29.8h, v17.8h\n" "fmax v30.8h, v30.8h, v17.8h\n" @@ -170,10 +178,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" "0:\n" - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 "ldr q17, [%[in_0], #64]\n" "fmla v9.8h, v3.8h, v13.8h\n" @@ -220,10 +228,10 @@ EE 
depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "add %[in_0], %[in_0], #64\n" "add %[in_1], %[in_1], #64\n" - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse "bne 211f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" @@ -231,13 +239,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "211:\n" "cmp %[depthwiseActivationMode], %[am_relu6]\n" "bne 212f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) "fmin v10.8h, v10.8h, v22.8h\n" "fmin v11.8h, v11.8h, v22.8h\n" "fmin v12.8h, v12.8h, v22.8h\n" @@ -245,13 +253,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "212:\n" "cmp %[depthwiseActivationMode], %[am_h_swish]\n" "bne 213f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three "fadd v27.8h, v9.8h, v22.8h\n" "fadd v29.8h, v10.8h, v22.8h\n" "fadd v30.8h, v11.8h, v22.8h\n" "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six + "movi v22.8h, #0x46, lsl #8\n" // six "fmax v27.8h, v27.8h, v17.8h\n" "fmax v29.8h, v29.8h, v17.8h\n" "fmax v30.8h, v30.8h, v17.8h\n" @@ -274,10 +282,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" "bne 0b\n" - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 "ldr q17, [%[in_0], #64]\n" "fmla v9.8h, v3.8h, v13.8h\n" @@ -306,10 +314,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "fmla v10.8h, v8.8h, v21.8h\n" "fmla v11.8h, v8.8h, v22.8h\n" - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse "bne 311f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" @@ -317,13 +325,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "311:\n" "cmp %[depthwiseActivationMode], %[am_relu6]\n" "bne 312f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax 
v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) "fmin v10.8h, v10.8h, v22.8h\n" "fmin v11.8h, v11.8h, v22.8h\n" "fmin v12.8h, v12.8h, v22.8h\n" @@ -331,13 +339,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "312:\n" "cmp %[depthwiseActivationMode], %[am_h_swish]\n" "bne 313f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three "fadd v27.8h, v9.8h, v22.8h\n" "fadd v29.8h, v10.8h, v22.8h\n" "fadd v30.8h, v11.8h, v22.8h\n" "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six + "movi v22.8h, #0x46, lsl #8\n" // six "fmax v27.8h, v27.8h, v17.8h\n" "fmax v29.8h, v29.8h, v17.8h\n" "fmax v30.8h, v30.8h, v17.8h\n" @@ -357,25 +365,20 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "313:\n" "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" - :[out]"+r"(out), - [in_0]"+r"(in0), - [in_1]"+r"(in1) - :[f]"r"(f), - [b]"r"(b), - [w]"r"((I64)ow-8), - [depthwiseActivationMode]"r"((I64)depthwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3" - ); - - for (U32 h = 0; h < oh-2; h++) { - in0 = in_c + h*iw*8; - in1 = in0 + iw*8; - in2 = in1 + iw*8; + : [out] "+r"(out), [in_0] "+r"(in0), [in_1] "+r"(in1) + : [f] "r"(f), [b] "r"(b), [w] "r"((I64)ow - 8), + [depthwiseActivationMode] "r"((I64)depthwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", + "x3"); + + for (U32 h = 0; h < oh - 2; h++) { + in0 = in_c + h * iw * 8; + in1 = in0 + iw * 8; + in2 = in1 + iw * 8; __asm__ __volatile__( "mov x0, %[w]\n" "ldr q28, [%[b]]\n" @@ -401,10 +404,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "ldr q25, [%[in_2], #32]\n" "ldr q26, [%[in_2], #48]\n" - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 "ldr q17, [%[in_0], #64]\n" "fmla v10.8h, v0.8h, v13.8h\n" @@ -464,10 +467,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "add %[in_1], %[in_1], #48\n" "add %[in_2], %[in_2], #48\n" - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse "bne 111f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" 
"fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" @@ -475,13 +478,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "111:\n" "cmp %[depthwiseActivationMode], %[am_relu6]\n" "bne 112f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) "fmin v10.8h, v10.8h, v22.8h\n" "fmin v11.8h, v11.8h, v22.8h\n" "fmin v12.8h, v12.8h, v22.8h\n" @@ -489,13 +492,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "112:\n" "cmp %[depthwiseActivationMode], %[am_h_swish]\n" "bne 113f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three "fadd v27.8h, v9.8h, v22.8h\n" "fadd v29.8h, v10.8h, v22.8h\n" "fadd v30.8h, v11.8h, v22.8h\n" "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six + "movi v22.8h, #0x46, lsl #8\n" // six "fmax v27.8h, v27.8h, v17.8h\n" "fmax v29.8h, v29.8h, v17.8h\n" "fmax v30.8h, v30.8h, v17.8h\n" @@ -517,10 +520,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" "0:\n" - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 "ldr q17, [%[in_0], #64]\n" "fmla v9.8h, v0.8h, v13.8h\n" @@ -570,7 +573,6 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "fmla v11.8h, v8.8h, v27.8h\n" "fmla v12.8h, v8.8h, v23.8h\n" - "ldr q16, [%[in_0], #112]\n" "mov v15.16b, v14.16b\n" "mov v20.16b, v19.16b\n" @@ -588,10 +590,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "add %[in_1], %[in_1], #64\n" "add %[in_2], %[in_2], #64\n" - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse "bne 211f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" @@ -599,13 +601,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "211:\n" "cmp %[depthwiseActivationMode], %[am_relu6]\n" "bne 212f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) "fmin v10.8h, v10.8h, v22.8h\n" "fmin v11.8h, v11.8h, v22.8h\n" "fmin v12.8h, v12.8h, v22.8h\n" @@ -613,13 +615,13 @@ EE 
depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "212:\n" "cmp %[depthwiseActivationMode], %[am_h_swish]\n" "bne 213f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three "fadd v27.8h, v9.8h, v22.8h\n" "fadd v29.8h, v10.8h, v22.8h\n" "fadd v30.8h, v11.8h, v22.8h\n" "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six + "movi v22.8h, #0x46, lsl #8\n" // six "fmax v27.8h, v27.8h, v17.8h\n" "fmax v29.8h, v29.8h, v17.8h\n" "fmax v30.8h, v30.8h, v17.8h\n" @@ -642,10 +644,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" "bne 0b\n" - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 "ldr q17, [%[in_0], #64]\n" "fmla v9.8h, v0.8h, v13.8h\n" @@ -686,10 +688,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "fmla v10.8h, v8.8h, v26.8h\n" "fmla v11.8h, v8.8h, v27.8h\n" - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse "bne 311f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" @@ -697,13 +699,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "311:\n" "cmp %[depthwiseActivationMode], %[am_relu6]\n" "bne 312f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) "fmin v10.8h, v10.8h, v22.8h\n" "fmin v11.8h, v11.8h, v22.8h\n" "fmin v12.8h, v12.8h, v22.8h\n" @@ -711,13 +713,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "312:\n" "cmp %[depthwiseActivationMode], %[am_h_swish]\n" "bne 313f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three "fadd v27.8h, v9.8h, v22.8h\n" "fadd v29.8h, v10.8h, v22.8h\n" "fadd v30.8h, v11.8h, v22.8h\n" "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six + "movi v22.8h, #0x46, lsl #8\n" // six "fmax v27.8h, v27.8h, v17.8h\n" "fmax v29.8h, v29.8h, v17.8h\n" "fmax v30.8h, v30.8h, v17.8h\n" @@ -737,25 +739,19 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "313:\n" "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" - :[out]"+r"(out), - [in_0]"+r"(in0), - [in_1]"+r"(in1), - [in_2]"+r"(in2) - :[f]"r"(f), - [b]"r"(b), - [w]"r"((I64)ow-8), - [depthwiseActivationMode]"r"((I64)depthwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - 
[am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3" - ); + : [out] "+r"(out), [in_0] "+r"(in0), [in_1] "+r"(in1), [in_2] "+r"(in2) + : [f] "r"(f), [b] "r"(b), [w] "r"((I64)ow - 8), + [depthwiseActivationMode] "r"((I64)depthwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "x0", "x1", "x2", "x3"); } - in0 = in_c + (ih-2)*iw*8; - in1 = in0 + iw*8; - in2 = in1 + iw*8; + in0 = in_c + (ih - 2) * iw * 8; + in1 = in0 + iw * 8; + in2 = in1 + iw * 8; __asm__ __volatile__( "mov x0, %[w]\n" "ldr q28, [%[b]]\n" @@ -774,10 +770,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "ldr q20, [%[in_1], #32]\n" "ldr q21, [%[in_1], #48]\n" - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 "ldr q17, [%[in_0], #64]\n" "fmla v10.8h, v0.8h, v13.8h\n" @@ -819,10 +815,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "add %[in_0], %[in_0], #48\n" "add %[in_1], %[in_1], #48\n" - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse "bne 111f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" @@ -830,13 +826,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "111:\n" "cmp %[depthwiseActivationMode], %[am_relu6]\n" "bne 112f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) "fmin v10.8h, v10.8h, v22.8h\n" "fmin v11.8h, v11.8h, v22.8h\n" "fmin v12.8h, v12.8h, v22.8h\n" @@ -844,13 +840,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "112:\n" "cmp %[depthwiseActivationMode], %[am_h_swish]\n" "bne 113f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three "fadd v27.8h, v9.8h, v22.8h\n" "fadd v29.8h, v10.8h, v22.8h\n" "fadd v30.8h, v11.8h, v22.8h\n" "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six + "movi v22.8h, #0x46, lsl #8\n" // six "fmax v27.8h, 
v27.8h, v17.8h\n" "fmax v29.8h, v29.8h, v17.8h\n" "fmax v30.8h, v30.8h, v17.8h\n" @@ -872,10 +868,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" "0:\n" - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 "ldr q17, [%[in_0], #64]\n" "fmla v9.8h, v0.8h, v13.8h\n" @@ -910,7 +906,6 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "fmla v11.8h, v5.8h, v22.8h\n" "fmla v12.8h, v5.8h, v18.8h\n" - "ldr q16, [%[in_0], #112]\n" "mov v15.16b, v14.16b\n" "mov v20.16b, v19.16b\n" @@ -923,10 +918,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "add %[in_0], %[in_0], #64\n" "add %[in_1], %[in_1], #64\n" - "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse "bne 211f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" @@ -934,13 +929,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "211:\n" "cmp %[depthwiseActivationMode], %[am_relu6]\n" "bne 212f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) "fmin v10.8h, v10.8h, v22.8h\n" "fmin v11.8h, v11.8h, v22.8h\n" "fmin v12.8h, v12.8h, v22.8h\n" @@ -948,13 +943,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "212:\n" "cmp %[depthwiseActivationMode], %[am_h_swish]\n" "bne 213f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three "fadd v27.8h, v9.8h, v22.8h\n" "fadd v29.8h, v10.8h, v22.8h\n" "fadd v30.8h, v11.8h, v22.8h\n" "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six + "movi v22.8h, #0x46, lsl #8\n" // six "fmax v27.8h, v27.8h, v17.8h\n" "fmax v29.8h, v29.8h, v17.8h\n" "fmax v30.8h, v30.8h, v17.8h\n" @@ -977,10 +972,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" "bne 0b\n" - "mov v9.16b, v28.16b\n" //out_0 - "mov v10.16b, v28.16b\n" //out_1 - "mov v11.16b, v28.16b\n" //out_2 - "mov v12.16b, v28.16b\n" //out_3 + "mov v9.16b, v28.16b\n" // out_0 + "mov v10.16b, v28.16b\n" // out_1 + "mov v11.16b, v28.16b\n" // out_2 + "mov v12.16b, v28.16b\n" // out_3 "ldr q17, [%[in_0], #64]\n" "fmla v9.8h, v0.8h, v13.8h\n" @@ -1009,10 +1004,10 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "fmla v10.8h, v5.8h, v21.8h\n" "fmla v11.8h, v5.8h, v22.8h\n" - "cmp %[depthwiseActivationMode], %[am_relu]\n" // 
v17, v22, v27, 29, 30, 31 will be reuse + "cmp %[depthwiseActivationMode], %[am_relu]\n" // v17, v22, v27, 29, 30, 31 will be reuse "bne 311f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" @@ -1020,13 +1015,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "311:\n" "cmp %[depthwiseActivationMode], %[am_relu6]\n" "bne 312f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x46, lsl #8\n" //six - "fmax v9.8h, v9.8h, v17.8h\n" //max(v9, 0) + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x46, lsl #8\n" // six + "fmax v9.8h, v9.8h, v17.8h\n" // max(v9, 0) "fmax v10.8h, v10.8h, v17.8h\n" "fmax v11.8h, v11.8h, v17.8h\n" "fmax v12.8h, v12.8h, v17.8h\n" - "fmin v9.8h, v9.8h, v22.8h\n" //min(v9, 6) + "fmin v9.8h, v9.8h, v22.8h\n" // min(v9, 6) "fmin v10.8h, v10.8h, v22.8h\n" "fmin v11.8h, v11.8h, v22.8h\n" "fmin v12.8h, v12.8h, v22.8h\n" @@ -1034,13 +1029,13 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "312:\n" "cmp %[depthwiseActivationMode], %[am_h_swish]\n" "bne 313f\n" - "eor v17.16b, v17.16b, v17.16b\n" //zero - "movi v22.8h, #0x42, lsl #8\n" //three + "eor v17.16b, v17.16b, v17.16b\n" // zero + "movi v22.8h, #0x42, lsl #8\n" // three "fadd v27.8h, v9.8h, v22.8h\n" "fadd v29.8h, v10.8h, v22.8h\n" "fadd v30.8h, v11.8h, v22.8h\n" "fadd v31.8h, v12.8h, v22.8h\n" - "movi v22.8h, #0x46, lsl #8\n" //six + "movi v22.8h, #0x46, lsl #8\n" // six "fmax v27.8h, v27.8h, v17.8h\n" "fmax v29.8h, v29.8h, v17.8h\n" "fmax v30.8h, v30.8h, v17.8h\n" @@ -1060,34 +1055,29 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "313:\n" "st1 {v9.8h, v10.8h, v11.8h, v12.8h}, [%[out]], #64\n" - :[out]"+r"(out), - [in_0]"+r"(in0), - [in_1]"+r"(in1) - :[f]"r"(f), - [b]"r"(b), - [w]"r"((I64)ow-8), - [depthwiseActivationMode]"r"((I64)depthwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3" - ); + : [out] "+r"(out), [in_0] "+r"(in0), [in_1] "+r"(in1) + : [f] "r"(f), [b] "r"(b), [w] "r"((I64)ow - 8), + [depthwiseActivationMode] "r"((I64)depthwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", + "x3"); } // pw_conv - for (I32 hw = 0; hw < ohow-7; hw+=8) { - const F16 *b0 = biasArray + ic*8; + for (I32 hw = 0; hw < ohow - 7; hw += 8) { + const F16 *b0 = pwBiasArray; const F16 *b1 = b0 + 8; - const F16 *f_o0c0 = filterArray + ic*fh*fw*8; - F16 *in_pack = pwArray + ohow*ic*8; + const F16 *f_o0c0 = pwFilterArray; + F16 *in_pack = pwArray + ohow * ic * 8; // pack input // NCHWc8 => NHWChw8 for (U32 c = 0; c < ic; c++) { - F16 *in_pack_c8hw8 = in_pack + c*8*8; + F16 
*in_pack_c8hw8 = in_pack + c * 8 * 8; // it is 2% faster than in_hw8c8 = ... + hw*8; Amazing! - F16 *in_hw8c8 = pwArray + c*ohow*8; + F16 *in_hw8c8 = pwArray + c * ohow * 8; // // for (U32 c8 = 0; c8 < 8; c8++) { // for (U32 hw8 = 0; hw8 < 8; hw8++) { @@ -1095,88 +1085,80 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra // } // } // - float16x8_t v0 = vld1q_f16(in_hw8c8 + hw*8); - float16x8_t v1 = vld1q_f16(in_hw8c8 + hw*8 + 8); - float16x8_t v2 = vld1q_f16(in_hw8c8 + hw*8 + 8*2); - float16x8_t v3 = vld1q_f16(in_hw8c8 + hw*8 + 8*3); - float16x8_t v4 = vld1q_f16(in_hw8c8 + hw*8 + 8*4); - float16x8_t v5 = vld1q_f16(in_hw8c8 + hw*8 + 8*5); - float16x8_t v6 = vld1q_f16(in_hw8c8 + hw*8 + 8*6); - float16x8_t v7 = vld1q_f16(in_hw8c8 + hw*8 + 8*7); + float16x8_t v0 = vld1q_f16(in_hw8c8 + hw * 8); + float16x8_t v1 = vld1q_f16(in_hw8c8 + hw * 8 + 8); + float16x8_t v2 = vld1q_f16(in_hw8c8 + hw * 8 + 8 * 2); + float16x8_t v3 = vld1q_f16(in_hw8c8 + hw * 8 + 8 * 3); + float16x8_t v4 = vld1q_f16(in_hw8c8 + hw * 8 + 8 * 4); + float16x8_t v5 = vld1q_f16(in_hw8c8 + hw * 8 + 8 * 5); + float16x8_t v6 = vld1q_f16(in_hw8c8 + hw * 8 + 8 * 6); + float16x8_t v7 = vld1q_f16(in_hw8c8 + hw * 8 + 8 * 7); vst1q_f16(in_pack_c8hw8, - vzip1q_f16( - vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip1q_f16(vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); vst1q_f16(in_pack_c8hw8 + 8, - vzip2q_f16( - vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vzip2q_f16(vzip1q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), vzip1q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*2, - vzip1q_f16( - vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vst1q_f16(in_pack_c8hw8 + 8 * 2, + vzip1q_f16(vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*3, - vzip2q_f16( - vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), + vst1q_f16(in_pack_c8hw8 + 8 * 3, + vzip2q_f16(vzip2q_f16(vzip1q_f16(v0, v4), vzip1q_f16(v2, v6)), vzip2q_f16(vzip1q_f16(v1, v5), vzip1q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*4, - vzip1q_f16( - vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vst1q_f16(in_pack_c8hw8 + 8 * 4, + vzip1q_f16(vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*5, - vzip2q_f16( - vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vst1q_f16(in_pack_c8hw8 + 8 * 5, + vzip2q_f16(vzip1q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), vzip1q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*6, - vzip1q_f16( - vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vst1q_f16(in_pack_c8hw8 + 8 * 6, + vzip1q_f16(vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); - vst1q_f16(in_pack_c8hw8 + 8*7, - vzip2q_f16( - vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), + vst1q_f16(in_pack_c8hw8 + 8 * 7, + vzip2q_f16(vzip2q_f16(vzip2q_f16(v0, v4), vzip2q_f16(v2, v6)), vzip2q_f16(vzip2q_f16(v1, v5), vzip2q_f16(v3, v7)))); } // compute - for (I32 o = 0; o < I32(oc-1); o+=2) { + for (I32 o = 0; o < I32(oc - 1); o += 2) { F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; // bias const F16 *b_o0 = b0; 
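// The inline asm that follows is the FP16 pointwise (1x1) micro-kernel: it
// computes two 8-channel output blocks (o0/o1) for eight hw positions,
// accumulating over ic*8 packed input channels with fmla against the
// broadcast lanes v0.h[0..7]. A minimal scalar sketch of the same
// accumulation (assuming the NHWChw8 packing built above, and with each
// accumulator initialized to its bias, as v2..v17 are seeded from v22/v23):
//
//   for (U32 k = 0; k < ic * 8; k++) {         // packed input channel
//       for (U32 p = 0; p < 8; p++) {          // hw position within the tile
//           for (U32 o8 = 0; o8 < 8; o8++) {   // channel within each block
//               out_o0hw0[p * 8 + o8] += f_o0c0[k * 16 + o8] * in_hw0[k * 8 + p];
//               out_o1hw0[p * 8 + o8] += f_o0c0[k * 16 + 8 + o8] * in_hw0[k * 8 + p];
//           }
//       }
//   }
//
// The activation constants are built with movi+lsl on FP16 bit patterns
// (0x4200 == 3.0h, 0x4600 == 6.0h), so h-swish reduces to
// x * min(max(x + 3, 0), 6) / 6 entirely in registers.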
const F16 *b_o1 = b1; __asm__ __volatile__( - "ldr q22, [%[b_0]]\n" //b_o0 - "ldr q23, [%[b_1]]\n" //b_o1 - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr q0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "mov v4.16b, v22.16b\n" //out_o0hw2 - "mov v5.16b, v22.16b\n" //out_o0hw3 - "ldr q18, [%[f_0]]\n" //f_o0c0 - "mov v6.16b, v22.16b\n" //out_o0hw4 - "mov v7.16b, v22.16b\n" //out_o0hw5 - "mov v8.16b, v22.16b\n" //out_o0hw6 - "ldr q19, [%[f_0], #16]\n" //f_o1c0 - "mov v9.16b, v22.16b\n" //out_o0hw7 - "mov v10.16b, v23.16b\n" //out_o1hw0 - "mov v11.16b, v23.16b\n" //out_o1hw1 - "mov v12.16b, v23.16b\n" //out_o1hw2 - "mov v13.16b, v23.16b\n" //out_o1hw3 - "mov v14.16b, v23.16b\n" //out_o1hw4 - "mov v15.16b, v23.16b\n" //out_o1hw5 - "mov v16.16b, v23.16b\n" //out_o1hw6 - "mov v17.16b, v23.16b\n" //out_o1hw7 + "ldr q22, [%[b_0]]\n" // b_o0 + "ldr q23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr q0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov v6.16b, v22.16b\n" // out_o0hw4 + "mov v7.16b, v22.16b\n" // out_o0hw5 + "mov v8.16b, v22.16b\n" // out_o0hw6 + "ldr q19, [%[f_0], #16]\n" // f_o1c0 + "mov v9.16b, v22.16b\n" // out_o0hw7 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "mov v11.16b, v23.16b\n" // out_o1hw1 + "mov v12.16b, v23.16b\n" // out_o1hw2 + "mov v13.16b, v23.16b\n" // out_o1hw3 + "mov v14.16b, v23.16b\n" // out_o1hw4 + "mov v15.16b, v23.16b\n" // out_o1hw5 + "mov v16.16b, v23.16b\n" // out_o1hw6 + "mov v17.16b, v23.16b\n" // out_o1hw7 "0:\n" - "ldr q1, [%[in_0], #16]\n" //in_hw0 + "ldr q1, [%[in_0], #16]\n" // in_hw0 "fmla v2.8h, v18.8h, v0.h[0]\n" "fmla v3.8h, v18.8h, v0.h[1]\n" "fmla v4.8h, v18.8h, v0.h[2]\n" - "ldr q20, [%[f_0], #32]\n" //f_o0c0 + "ldr q20, [%[f_0], #32]\n" // f_o0c0 "fmla v5.8h, v18.8h, v0.h[3]\n" "fmla v6.8h, v18.8h, v0.h[4]\n" "fmla v7.8h, v18.8h, v0.h[5]\n" - "ldr q21, [%[f_0], #48]\n" //f_o1c0 + "ldr q21, [%[f_0], #48]\n" // f_o1c0 "fmla v8.8h, v18.8h, v0.h[6]\n" "fmla v9.8h, v18.8h, v0.h[7]\n" "fmla v10.8h, v19.8h, v0.h[0]\n" @@ -1188,15 +1170,15 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "fmla v16.8h, v19.8h, v0.h[6]\n" "fmla v17.8h, v19.8h, v0.h[7]\n" - "ldr q0, [%[in_0], #32]\n" //in_hw0 + "ldr q0, [%[in_0], #32]\n" // in_hw0 "fmla v2.8h, v20.8h, v1.h[0]\n" "fmla v3.8h, v20.8h, v1.h[1]\n" "fmla v4.8h, v20.8h, v1.h[2]\n" - "ldr q18, [%[f_0], #64]\n" //f_o0c0 + "ldr q18, [%[f_0], #64]\n" // f_o0c0 "fmla v5.8h, v20.8h, v1.h[3]\n" "fmla v6.8h, v20.8h, v1.h[4]\n" "fmla v7.8h, v20.8h, v1.h[5]\n" - "ldr q19, [%[f_0], #80]\n" //f_o1c0 + "ldr q19, [%[f_0], #80]\n" // f_o1c0 "fmla v8.8h, v20.8h, v1.h[6]\n" "fmla v9.8h, v20.8h, v1.h[7]\n" "fmla v10.8h, v21.8h, v1.h[0]\n" @@ -1214,7 +1196,7 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "cmp %[pointwiseActivationMode], %[am_relu]\n" "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero + "eor v0.16b, v0.16b, v0.16b\n" // zero "fmax v2.8h, v2.8h, v0.8h\n" "fmax v3.8h, v3.8h, v0.8h\n" "fmax v4.8h, v4.8h, v0.8h\n" @@ -1235,8 +1217,8 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "11:\n" "cmp %[pointwiseActivationMode], %[am_relu6]\n" "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl 
#8\n" // six "fmax v2.8h, v2.8h, v0.8h\n" "fmax v3.8h, v3.8h, v0.8h\n" "fmax v4.8h, v4.8h, v0.8h\n" @@ -1273,9 +1255,9 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "12:\n" "cmp %[pointwiseActivationMode], %[am_h_swish]\n" "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three "fadd v19.8h, v2.8h, v18.8h\n" "fadd v20.8h, v3.8h, v18.8h\n" "fadd v21.8h, v4.8h, v18.8h\n" @@ -1359,66 +1341,60 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "fmul v17.8h, v26.8h, v17.8h\n" "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q4, [%[out_0], #32]\n" //out_o0hw2 - "str q5, [%[out_0], #48]\n" //out_o0hw3 - "str q6, [%[out_0], #64]\n" //out_o0hw4 - "str q7, [%[out_0], #80]\n" //out_o0hw5 - "str q8, [%[out_0], #96]\n" //out_o0hw6 - "str q9, [%[out_0], #112]\n" //out_o0hw7 - "str q10, [%[out_1]]\n" //out_o1hw0 - "str q11, [%[out_1], #16]\n" //out_o1hw1 - "str q12, [%[out_1], #32]\n" //out_o1hw2 - "str q13, [%[out_1], #48]\n" //out_o1hw3 - "str q14, [%[out_1], #64]\n" //out_o1hw4 - "str q15, [%[out_1], #80]\n" //out_o1hw5 - "str q16, [%[out_1], #96]\n" //out_o1hw6 - "str q17, [%[out_1], #112]\n" //out_o1hw7 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "x0", "x1", "x2", "x3" - ); + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q6, [%[out_0], #64]\n" // out_o0hw4 + "str q7, [%[out_0], #80]\n" // out_o0hw5 + "str q8, [%[out_0], #96]\n" // out_o0hw6 + "str q9, [%[out_0], #112]\n" // out_o0hw7 + "str q10, [%[out_1]]\n" // out_o1hw0 + "str q11, [%[out_1], #16]\n" // out_o1hw1 + "str q12, [%[out_1], #32]\n" // out_o1hw2 + "str q13, [%[out_1], #48]\n" // out_o1hw3 + "str q14, [%[out_1], #64]\n" // out_o1hw4 + "str q15, [%[out_1], #80]\n" // out_o1hw5 + "str q16, [%[out_1], #96]\n" // out_o1hw6 + "str q17, [%[out_1], #112]\n" // out_o1hw7 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "x0", "x1", "x2", "x3"); b0 += 16; b1 += 16; } if (oc & 1) { // oc%2 != 0 - const F16 *f_r = filterArray + ic*fh*fw*8 + (oc-1)*8*ic*8; + const F16 *f_r = pwFilterArray + (oc - 1) * 8 * ic * 8; F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; + F16 *out_o0hw0 = outArray + n * 
oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; // bias - const F16 *b_o0 = biasArray + ic*8 + (oc-1)*8; + const F16 *b_o0 = pwBiasArray + (oc - 1) * 8; __asm__ __volatile__( - "ldr q12, [%[b_0]]\n" //b_o0 - "mov x0, %[ic]\n" // ic_blk - "ldr q0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v12.16b\n" //out_o0hw0 - "mov v3.16b, v12.16b\n" //out_o0hw1 - "mov v4.16b, v12.16b\n" //out_o0hw2 - "ldr q10, [%[f_0]]\n" //f_o0c0 - "mov v5.16b, v12.16b\n" //out_o0hw3 - "mov v6.16b, v12.16b\n" //out_o0hw4 - "mov v7.16b, v12.16b\n" //out_o0hw5 - "mov v8.16b, v12.16b\n" //out_o0hw6 - "mov v9.16b, v12.16b\n" //out_o0hw7 + "ldr q12, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr q0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v12.16b\n" // out_o0hw0 + "mov v3.16b, v12.16b\n" // out_o0hw1 + "mov v4.16b, v12.16b\n" // out_o0hw2 + "ldr q10, [%[f_0]]\n" // f_o0c0 + "mov v5.16b, v12.16b\n" // out_o0hw3 + "mov v6.16b, v12.16b\n" // out_o0hw4 + "mov v7.16b, v12.16b\n" // out_o0hw5 + "mov v8.16b, v12.16b\n" // out_o0hw6 + "mov v9.16b, v12.16b\n" // out_o0hw7 "0:\n" - "ldr q1, [%[in_0], #16]\n" //in_hw0 + "ldr q1, [%[in_0], #16]\n" // in_hw0 "fmla v2.8h, v10.8h, v0.h[0]\n" "fmla v3.8h, v10.8h, v0.h[1]\n" "fmla v4.8h, v10.8h, v0.h[2]\n" - "ldr q11, [%[f_0], #16]\n" //f_o0c0 + "ldr q11, [%[f_0], #16]\n" // f_o0c0 "fmla v5.8h, v10.8h, v0.h[3]\n" "fmla v6.8h, v10.8h, v0.h[4]\n" "fmla v7.8h, v10.8h, v0.h[5]\n" @@ -1426,11 +1402,11 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "fmla v8.8h, v10.8h, v0.h[6]\n" "fmla v9.8h, v10.8h, v0.h[7]\n" - "ldr q0, [%[in_0], #32]\n" //in_hw0 + "ldr q0, [%[in_0], #32]\n" // in_hw0 "fmla v2.8h, v11.8h, v1.h[0]\n" "fmla v3.8h, v11.8h, v1.h[1]\n" "fmla v4.8h, v11.8h, v1.h[2]\n" - "ldr q10, [%[f_0], #32]\n" //f_o0c0 + "ldr q10, [%[f_0], #32]\n" // f_o0c0 "fmla v5.8h, v11.8h, v1.h[3]\n" "fmla v6.8h, v11.8h, v1.h[4]\n" "fmla v7.8h, v11.8h, v1.h[5]\n" @@ -1442,7 +1418,7 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "cmp %[pointwiseActivationMode], %[am_relu]\n" "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero + "eor v0.16b, v0.16b, v0.16b\n" // zero "fmax v2.8h, v2.8h, v0.8h\n" "fmax v3.8h, v3.8h, v0.8h\n" "fmax v4.8h, v4.8h, v0.8h\n" @@ -1455,8 +1431,8 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "11:\n" "cmp %[pointwiseActivationMode], %[am_relu6]\n" "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six "fmax v2.8h, v2.8h, v0.8h\n" "fmax v3.8h, v3.8h, v0.8h\n" "fmax v4.8h, v4.8h, v0.8h\n" @@ -1474,12 +1450,12 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "fmin v8.8h, v8.8h, v1.8h\n" "fmin v9.8h, v9.8h, v1.8h\n" - "12:\n" + "12:\n" "cmp %[pointwiseActivationMode], %[am_h_swish]\n" "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v10.8h, #0x42, lsl #8\n" //three + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v10.8h, #0x42, lsl #8\n" // three "fadd v11.8h, v2.8h, v10.8h\n" "fadd v12.8h, v3.8h, v10.8h\n" "fadd v13.8h, v4.8h, v10.8h\n" @@ -1522,26 +1498,21 @@ EE depthwise_pointwise_convolution_3x3s1p1_A76(TensorDesc inputDesc, F16* inArra "fmul v9.8h, v18.8h, v9.8h\n" "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw0 - "str q4, [%[out_0], #32]\n" //out_o0hw0 - "str q5, [%[out_0], #48]\n" //out_o0hw0 - "str 
q6, [%[out_0], #64]\n" //out_o0hw0 - "str q7, [%[out_0], #80]\n" //out_o0hw0 - "str q8, [%[out_0], #96]\n" //out_o0hw0 - "str q9, [%[out_0], #112]\n" //out_o0hw0 - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "x0", "x1", "x2" - ); + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw0 + "str q4, [%[out_0], #32]\n" // out_o0hw0 + "str q5, [%[out_0], #48]\n" // out_o0hw0 + "str q6, [%[out_0], #64]\n" // out_o0hw0 + "str q7, [%[out_0], #80]\n" // out_o0hw0 + "str q8, [%[out_0], #96]\n" // out_o0hw0 + "str q9, [%[out_0], #112]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "x0", "x1", "x2"); } } } diff --git a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct.h b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct.h new file mode 100644 index 00000000..ef34d886 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct.h @@ -0,0 +1,96 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
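// Annotation: the activation branches in the kernels above build their fp16
// constants with movi -- "#0x46, lsl #8" replicates 0x4600 (fp16 6.0) across
// the lanes and "#0x42, lsl #8" replicates 0x4200 (fp16 3.0). The
// fadd/fmax/fmin/fdiv/fmul sequences then evaluate the standard definitions;
// a scalar reference, assuming plain C++ (helper names are ours):
#include <algorithm>

static float relu6_ref(float x)
{
    return std::min(std::max(x, 0.0f), 6.0f);  // fmax with 0, fmin with 6
}

static float hswish_ref(float x)
{
    // fadd (+3), fmax (clamp low), fmin (clamp high), fdiv (/6), fmul (*x)
    return x * (std::min(std::max(x + 3.0f, 0.0f), 6.0f) / 6.0f);
}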
+ +#ifndef _H_DEPTHWISE_POINTWISE_CONVOLUTION_DIRECT +#define _H_DEPTHWISE_POINTWISE_CONVOLUTION_DIRECT + +#include +#include "sys.h" +#include "types.h" +#include "error.h" + +EE depthwise_pointwise_convolution_direct_A55(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec); + +EE depthwise_pointwise_convolution_direct_A76(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec); + +inline EE depthwise_pointwise_convolution_direct(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch) +{ + EE ret = SUCCESS; + switch (arch) { + case ARM_A55: + ret = depthwise_pointwise_convolution_direct_A55(inputDesc, inArray, dwFilterDesc, + dwFilterArray, pwFilterDesc, pwFilterArray, convParamSpec, dwBiasDesc, dwBiasArray, + pwBiasDesc, pwBiasArray, tmpBytes, tmp, outputDesc, outArray, + depthwiseActivationParamSpec, pointwiseActivationParamSpec); + break; + case ARM_A76: + ret = depthwise_pointwise_convolution_direct_A76(inputDesc, inArray, dwFilterDesc, + dwFilterArray, pwFilterDesc, pwFilterArray, convParamSpec, dwBiasDesc, dwBiasArray, + pwBiasDesc, pwBiasArray, tmpBytes, tmp, outputDesc, outArray, + depthwiseActivationParamSpec, pointwiseActivationParamSpec); + break; + default: + return NOT_SUPPORTED; + } + return ret; +} +#endif diff --git a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A55.cpp b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A55.cpp new file mode 100644 index 00000000..2998aa5a --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A55.cpp @@ -0,0 +1,1417 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/fp16/depthwise_pointwise_convolution_direct.h" + +EE depthwise_pointwise_convolution_direct_A55(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec) +{ + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + + if (dwFilterDesc.df != DF_NCHWC8) { + CHECK_STATUS(NOT_MATCH); + } + if (pwFilterArray != nullptr && pwFilterDesc.df != DF_NHWCN16) { + CHECK_STATUS(NOT_MATCH); + } + + oc /= 8; + ic /= 8; + + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + U32 ihiw = ih * iw; + I32 ohow = oh * ow; + F16 *pwArray = (F16 *)tmp + ic * ih_pad * iw_pad * 8; + for (U32 n = 0; n < in; n++) { + // cpy input into a input with padding + F16 *inArray_pad = (F16 *)tmp; + F16 *inArray_pad_mov = inArray_pad; + F16 *inArray_mov = inArray + n * ic * ihiw * 8; + for (U32 c = 0; c < ic; c++) { + if (paddingT > 0) { + memset(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(fdt)); + inArray_pad_mov += paddingT * iw_pad * 8; + } + for (U32 h = paddingT; h < ih_pad - paddingB; h++) { + memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(fdt)); + inArray_pad_mov += paddingL * 8; + memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(fdt)); + inArray_pad_mov += iw * 8; + inArray_mov += iw * 8; + memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(fdt)); + inArray_pad_mov += paddingR * 8; + } + if (paddingB > 0) { + memset(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(fdt)); + inArray_pad_mov += paddingB * iw_pad * 8; + } + + // dw_conv + const F16 *b = dwBiasArray + c * 8; + F16 *in_pad = inArray_pad + c * ih_pad * iw_pad * 8; + const F16 *f = dwFilterArray + c * fh * fw * 8; + // ohow / 8 + for (I32 hw = 0; hw < ohow - 7; hw += 8) { + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + U32 in_h_1 = (hw + 1) / ow * strideH; + U32 in_w_1 = (hw + 1) % ow * strideW; + U32 in_h_2 = (hw + 2) / ow * strideH; + U32 in_w_2 = (hw + 2) % ow * strideW; + U32 in_h_3 = (hw + 3) / ow * strideH; + U32 in_w_3 = (hw + 3) % ow * strideW; + U32 
in_h_4 = (hw + 4) / ow * strideH; + U32 in_w_4 = (hw + 4) % ow * strideW; + U32 in_h_5 = (hw + 5) / ow * strideH; + U32 in_w_5 = (hw + 5) % ow * strideW; + U32 in_h_6 = (hw + 6) / ow * strideH; + U32 in_w_6 = (hw + 6) % ow * strideW; + U32 in_h_7 = (hw + 7) / ow * strideH; + U32 in_w_7 = (hw + 7) % ow * strideW; + // TODO handle asm combined with c. No guarantee that compile will not use vec reg in c. + __asm__ __volatile__( + "ldr q8, [%[b]]\n" + "mov v0.16b, v8.16b\n" + "mov v1.16b, v8.16b\n" + "mov v2.16b, v8.16b\n" + "mov v3.16b, v8.16b\n" + "mov v4.16b, v8.16b\n" + "mov v5.16b, v8.16b\n" + "mov v6.16b, v8.16b\n" + "mov v7.16b, v8.16b\n" + : + : [b] "r"(b) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const F16 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + F16 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + F16 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F16 *in_1 = in_idx + in_h_1 * iw_pad * 8 + in_w_1 * 8; + F16 *in_2 = in_idx + in_h_2 * iw_pad * 8 + in_w_2 * 8; + F16 *in_3 = in_idx + in_h_3 * iw_pad * 8 + in_w_3 * 8; + F16 *in_4 = in_idx + in_h_4 * iw_pad * 8 + in_w_4 * 8; + F16 *in_5 = in_idx + in_h_5 * iw_pad * 8 + in_w_5 * 8; + F16 *in_6 = in_idx + in_h_6 * iw_pad * 8 + in_w_6 * 8; + F16 *in_7 = in_idx + in_h_7 * iw_pad * 8 + in_w_7 * 8; + __asm__ __volatile__("ldr q17, [%[f0]]\n" + "ldr q9, [%[in0]]\n" + "ldr q10, [%[in1]]\n" + "ldr q11, [%[in2]]\n" + "ldr q12, [%[in3]]\n" + "ldr q13, [%[in4]]\n" + "ldr q14, [%[in5]]\n" + "ldr q15, [%[in6]]\n" + "ldr q16, [%[in7]]\n" + "fmla v0.8h, v9.8h, v17.8h\n" + "fmla v1.8h, v10.8h, v17.8h\n" + "fmla v2.8h, v11.8h, v17.8h\n" + "fmla v3.8h, v12.8h, v17.8h\n" + "fmla v4.8h, v13.8h, v17.8h\n" + "fmla v5.8h, v14.8h, v17.8h\n" + "fmla v6.8h, v15.8h, v17.8h\n" + "fmla v7.8h, v16.8h, v17.8h\n" + : + : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), + [in3] "r"(in_3), [in4] "r"(in_4), [in5] "r"(in_5), + [in6] "r"(in_6), [in7] "r"(in_7), [f0] "r"(f_0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "fmax v0.8h, v0.8h, v31.8h\n" + "fmax v1.8h, v1.8h, v31.8h\n" + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v6.8h, v6.8h, v31.8h\n" + "fmax v7.8h, v7.8h, v31.8h\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v31"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v0.8h, v0.8h, v31.8h\n" + "fmax v1.8h, v1.8h, v31.8h\n" + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v6.8h, v6.8h, v31.8h\n" + "fmax v7.8h, v7.8h, v31.8h\n" + "fmin v0.8h, v0.8h, v30.8h\n" + "fmin v1.8h, v1.8h, v30.8h\n" + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + "fmin v6.8h, v6.8h, v30.8h\n" + "fmin v7.8h, v7.8h, v30.8h\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v30", "v31"); + break; + } + case ACTIVATION_H_SWISH: 
{ + __asm__ __volatile__("movi v29.8h, #0x42, lsl #8\n" // three + "movi v30.8h, #0x46, lsl #8\n" // six + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fadd v21.8h, v0.8h, v29.8h\n" + "fadd v22.8h, v1.8h, v29.8h\n" + "fadd v23.8h, v2.8h, v29.8h\n" + "fadd v24.8h, v3.8h, v29.8h\n" + "fadd v25.8h, v4.8h, v29.8h\n" + "fadd v26.8h, v5.8h, v29.8h\n" + "fadd v27.8h, v6.8h, v29.8h\n" + "fadd v28.8h, v7.8h, v29.8h\n" + "fmax v21.8h, v21.8h, v31.8h\n" + "fmax v22.8h, v22.8h, v31.8h\n" + "fmax v23.8h, v23.8h, v31.8h\n" + "fmax v24.8h, v24.8h, v31.8h\n" + "fmax v25.8h, v25.8h, v31.8h\n" + "fmax v26.8h, v26.8h, v31.8h\n" + "fmax v27.8h, v27.8h, v31.8h\n" + "fmax v28.8h, v28.8h, v31.8h\n" + "fmin v21.8h, v21.8h, v30.8h\n" + "fmin v22.8h, v22.8h, v30.8h\n" + "fmin v23.8h, v23.8h, v30.8h\n" + "fmin v24.8h, v24.8h, v30.8h\n" + "fmin v25.8h, v25.8h, v30.8h\n" + "fmin v26.8h, v26.8h, v30.8h\n" + "fmin v27.8h, v27.8h, v30.8h\n" + "fmin v28.8h, v28.8h, v30.8h\n" + "fdiv v21.8h, v21.8h, v30.8h\n" + "fdiv v22.8h, v22.8h, v30.8h\n" + "fdiv v23.8h, v23.8h, v30.8h\n" + "fdiv v24.8h, v24.8h, v30.8h\n" + "fdiv v25.8h, v25.8h, v30.8h\n" + "fdiv v26.8h, v26.8h, v30.8h\n" + "fdiv v27.8h, v27.8h, v30.8h\n" + "fdiv v28.8h, v28.8h, v30.8h\n" + "fmul v0.8h, v0.8h, v21.8h\n" + "fmul v1.8h, v1.8h, v22.8h\n" + "fmul v2.8h, v2.8h, v23.8h\n" + "fmul v3.8h, v3.8h, v24.8h\n" + "fmul v4.8h, v4.8h, v25.8h\n" + "fmul v5.8h, v5.8h, v26.8h\n" + "fmul v6.8h, v6.8h, v27.8h\n" + "fmul v7.8h, v7.8h, v28.8h\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + break; + } + default: + return NOT_SUPPORTED; + } + + if (pwFilterArray != nullptr) { + F16 *pw_pack_0 = pwArray + hw * ic * 8 + c * 8 * 8; + __asm__ __volatile__("zip1 v8.8h, v0.8h, v4.8h\n" + "zip1 v9.8h, v2.8h, v6.8h\n" + "zip1 v10.8h, v1.8h, v5.8h\n" + "zip1 v11.8h, v3.8h, v7.8h\n" + "zip2 v0.8h, v0.8h, v4.8h\n" + "zip2 v2.8h, v2.8h, v6.8h\n" + "zip2 v1.8h, v1.8h, v5.8h\n" + "zip2 v3.8h, v3.8h, v7.8h\n" + "zip1 v12.8h, v8.8h, v9.8h\n" + "zip1 v13.8h, v10.8h, v11.8h\n" + "zip2 v8.8h, v8.8h, v9.8h\n" + "zip2 v10.8h, v10.8h, v11.8h\n" + "zip1 v14.8h, v0.8h, v2.8h\n" + "zip1 v15.8h, v1.8h, v3.8h\n" + "zip2 v0.8h, v0.8h, v2.8h\n" + "zip2 v1.8h, v1.8h, v3.8h\n" + "zip1 v16.8h, v12.8h, v13.8h\n" + "zip2 v12.8h, v12.8h, v13.8h\n" + "zip1 v17.8h, v8.8h, v10.8h\n" + "zip2 v8.8h, v8.8h, v10.8h\n" + "zip1 v18.8h, v14.8h, v15.8h\n" + "zip2 v14.8h, v14.8h, v15.8h\n" + "zip1 v19.8h, v0.8h, v1.8h\n" + "zip2 v0.8h, v0.8h, v1.8h\n" + "str q16, [%[pw0]]\n" + "str q12, [%[pw0], #16]\n" + "str q17, [%[pw0], #32]\n" + "str q8, [%[pw0], #48]\n" + "str q18, [%[pw0], #64]\n" + "str q14, [%[pw0], #80]\n" + "str q19, [%[pw0], #96]\n" + "str q0, [%[pw0], #112]\n" + : [pw0] "+r"(pw_pack_0) + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19"); + } else { + F16 *out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; + __asm__ __volatile__("str q0, [%[out]]\n" + "str q1, [%[out], #16]\n" + "str q2, [%[out], #32]\n" + "str q3, [%[out], #48]\n" + "str q4, [%[out], #64]\n" + "str q5, [%[out], #80]\n" + "str q6, [%[out], #96]\n" + "str q7, [%[out], #112]\n" + : [out] "+r"(out_ptr) + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19"); + } + } + + // ohow_reminder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + 
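// Annotation: the ohow = oh * ow output pixels are covered in three passes --
// the 8-wide loop above over [0, (ohow / 8) * 8), the 4-wide loop below over
// [(ohow / 8) * 8, (ohow / 4) * 4), and a final scalar loop over
// [(ohow / 4) * 4, ohow). The three half-open ranges partition [0, ohow) for
// any ohow >= 0, so every pixel is produced exactly once.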
for (I32 hw = ohow_s; hw < ohow - 3; hw += 4) { + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + U32 in_h_1 = (hw + 1) / ow * strideH; + U32 in_w_1 = (hw + 1) % ow * strideW; + U32 in_h_2 = (hw + 2) / ow * strideH; + U32 in_w_2 = (hw + 2) % ow * strideW; + U32 in_h_3 = (hw + 3) / ow * strideH; + U32 in_w_3 = (hw + 3) % ow * strideW; + // TODO handle asm combined with c. No guarantee that compile will not use vec reg in c. + __asm__ __volatile__("ldr q8, [%[b]]\n" + "mov v0.16b, v8.16b\n" + "mov v1.16b, v8.16b\n" + "mov v2.16b, v8.16b\n" + "mov v3.16b, v8.16b\n" + : + : [b] "r"(b) + : "memory", "cc", "v0", "v1", "v2", "v3", "v8"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const F16 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + F16 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + F16 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F16 *in_1 = in_idx + in_h_1 * iw_pad * 8 + in_w_1 * 8; + F16 *in_2 = in_idx + in_h_2 * iw_pad * 8 + in_w_2 * 8; + F16 *in_3 = in_idx + in_h_3 * iw_pad * 8 + in_w_3 * 8; + __asm__ __volatile__("ldr q17, [%[f0]]\n" + "ldr q9, [%[in0]]\n" + "ldr q10, [%[in1]]\n" + "ldr q11, [%[in2]]\n" + "ldr q12, [%[in3]]\n" + "fmla v0.8h, v9.8h, v17.8h\n" + "fmla v1.8h, v10.8h, v17.8h\n" + "fmla v2.8h, v11.8h, v17.8h\n" + "fmla v3.8h, v12.8h, v17.8h\n" + : + : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), + [in3] "r"(in_3), [f0] "r"(f_0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v9", "v10", + "v11", "v12", "v17"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "fmax v0.8h, v0.8h, v31.8h\n" + "fmax v1.8h, v1.8h, v31.8h\n" + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v0.8h, v0.8h, v31.8h\n" + "fmax v1.8h, v1.8h, v31.8h\n" + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmin v0.8h, v0.8h, v30.8h\n" + "fmin v1.8h, v1.8h, v30.8h\n" + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v30", "v31"); + break; + } + case ACTIVATION_H_SWISH: { + __asm__ __volatile__("movi v29.8h, #0x42, lsl #8\n" // three + "movi v30.8h, #0x46, lsl #8\n" // six + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fadd v25.8h, v0.8h, v29.8h\n" + "fadd v26.8h, v1.8h, v29.8h\n" + "fadd v27.8h, v2.8h, v29.8h\n" + "fadd v28.8h, v3.8h, v29.8h\n" + "fmax v25.8h, v25.8h, v31.8h\n" + "fmax v26.8h, v26.8h, v31.8h\n" + "fmax v27.8h, v27.8h, v31.8h\n" + "fmax v28.8h, v28.8h, v31.8h\n" + "fmin v25.8h, v25.8h, v30.8h\n" + "fmin v26.8h, v26.8h, v30.8h\n" + "fmin v27.8h, v27.8h, v30.8h\n" + "fmin v28.8h, v28.8h, v30.8h\n" + "fdiv v25.8h, v25.8h, v30.8h\n" + "fdiv v26.8h, v26.8h, v30.8h\n" + "fdiv v27.8h, v27.8h, v30.8h\n" + "fdiv v28.8h, v28.8h, v30.8h\n" + "fmul v0.8h, v0.8h, v25.8h\n" + "fmul v1.8h, v1.8h, v26.8h\n" + "fmul v2.8h, v2.8h, v27.8h\n" + "fmul v3.8h, v3.8h, v28.8h\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + break; + } + default: + return NOT_SUPPORTED; + } + + if (pwFilterArray != nullptr) { + F16 *pw_pack_0 = pwArray + hw * ic * 8 + c * 8 * 4; + __asm__ __volatile__("st4 {v0.8h, 
v1.8h, v2.8h, v3.8h}, [%[pw0]]\n" + : [pw0] "+r"(pw_pack_0) + : + : "memory", "cc", "v0", "v1", "v2", "v3"); + } else { + F16 *out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; + __asm__ __volatile__("st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[out]]\n" + : [out] "+r"(out_ptr) + : + : "memory", "cc", "v0", "v1", "v2", "v3"); + } + } + + // ohow_reminder % 4 + ohow_s = (ohow / 4) * 4; + for (I32 hw = ohow_s; hw < ohow; hw++) { + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + // TODO handle asm combined with c. No guarantee that compile will not use vec reg in c. + __asm__ __volatile__("ldr q8, [%[b]]\n" + "mov v0.16b, v8.16b\n" + : + : [b] "r"(b) + : "memory", "cc", "v0"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const F16 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + F16 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + F16 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + __asm__ __volatile__("ldr q17, [%[f0]]\n" + "ldr q9, [%[in0]]\n" + "fmla v0.8h, v9.8h, v17.8h\n" + : + : [in0] "r"(in_0), [f0] "r"(f_0) + : "memory", "cc", "v0", "v9", "v17"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "fmax v0.8h, v0.8h, v31.8h\n" + : + : + : "memory", "cc", "v0", "v31"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v0.8h, v0.8h, v31.8h\n" + "fmin v0.8h, v0.8h, v30.8h\n" + : + : + : "memory", "cc", "v0", "v30", "v31"); + break; + } + case ACTIVATION_H_SWISH: { + __asm__ __volatile__("movi v29.8h, #0x42, lsl #8\n" // three + "movi v30.8h, #0x46, lsl #8\n" // six + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fadd v28.8h, v0.8h, v29.8h\n" + "fmax v28.8h, v28.8h, v31.8h\n" + "fmin v28.8h, v28.8h, v30.8h\n" + "fdiv v28.8h, v28.8h, v30.8h\n" + "fmul v0.8h, v0.8h, v28.8h\n" + : + : + : "memory", "cc", "v0", "v28", "v29", "v30", "v31"); + break; + } + default: + return NOT_SUPPORTED; + } + + F16 *out_ptr; + if (pwFilterArray != nullptr) { + out_ptr = pwArray + hw * ic * 8 + c * 8; + } else { + out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; + } + __asm__ __volatile__("str q0, [%[out]]\n" + : [out] "+r"(out_ptr) + : + : "memory", "cc", "v0"); + } + } + + if (pwFilterArray == nullptr) { + continue; + } + // pw_conv + // ohow / 8 + for (I32 hw = 0; hw < ohow - 7; hw += 8) { + const F16 *b0 = pwBiasArray; + const F16 *b1 = b0 + 8; + F16 *in_pack = pwArray + hw * ic * 8; + const F16 *f_o0c0 = pwFilterArray; + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr d22, [%[b_0]]\n" // b_o0 + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "ldr d23, [%[b_1]]\n" // b_o1 + "ldr x2, [%[b_1], #8]\n" + "ins v23.d[1], x2\n" + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr x1, [%[in_0], #8]\n" + "mov v4.16b, v22.16b\n" // out_o0hw2 + "ins v0.d[1], x1\n" + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v6.16b, v22.16b\n" // out_o0hw4 + "ldr x2, [%[f_0], #8]\n" + "mov v7.16b, v22.16b\n" // out_o0hw5 + "ins v18.d[1], x2\n" 
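// Annotation: the ldr d / ldr x / ins triples here are the A55 idiom for a
// 128-bit load -- the low half goes through a 64-bit NEON load and the high
// half through a GPR load inserted into the upper lane, so the two halves
// can dual-issue between the fmla ops on the in-order A55 pipeline. The A76
// variant below uses a single "ldr q" for the same data.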
+ "mov v8.16b, v22.16b\n" // out_o0hw6 + "ldr d19, [%[f_0], #16]\n" // f_o1c0 + "mov v9.16b, v22.16b\n" // out_o0hw7 + "ldr x3, [%[f_0], #24]\n" + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ins v19.d[1], x3\n" + "mov v11.16b, v23.16b\n" // out_o1hw1 + "mov v12.16b, v23.16b\n" // out_o1hw2 + "mov v13.16b, v23.16b\n" // out_o1hw3 + "mov v14.16b, v23.16b\n" // out_o1hw4 + "mov v15.16b, v23.16b\n" // out_o1hw5 + "mov v16.16b, v23.16b\n" // out_o1hw6 + "mov v17.16b, v23.16b\n" // out_o1hw7 + "0:\n" + "ldr d1, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr x1, [%[in_0], #24]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ins v1.d[1], x1\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ldr d20, [%[f_0], #32]\n" // f_o0c0 + "fmla v5.8h, v18.8h, v0.h[3]\n" + "ldr x2, [%[f_0], #40]\n" + "fmla v6.8h, v18.8h, v0.h[4]\n" + "ins v20.d[1], x2\n" + "fmla v7.8h, v18.8h, v0.h[5]\n" + "ldr d21, [%[f_0], #48]\n" // f_o1c0 + "fmla v8.8h, v18.8h, v0.h[6]\n" + "ldr x3, [%[f_0], #56]\n" + "fmla v9.8h, v18.8h, v0.h[7]\n" + "ins v21.d[1], x3\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + "fmla v14.8h, v19.8h, v0.h[4]\n" + "fmla v15.8h, v19.8h, v0.h[5]\n" + "fmla v16.8h, v19.8h, v0.h[6]\n" + "fmla v17.8h, v19.8h, v0.h[7]\n" + + "ldr d0, [%[in_0], #32]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr x1, [%[in_0], #40]\n" + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ins v0.d[1], x1\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ldr d18, [%[f_0], #64]\n" // f_o0c0 + "fmla v5.8h, v20.8h, v1.h[3]\n" + "ldr x2, [%[f_0], #72]\n" + "fmla v6.8h, v20.8h, v1.h[4]\n" + "ins v18.d[1], x2\n" + "fmla v7.8h, v20.8h, v1.h[5]\n" + "ldr d19, [%[f_0], #80]\n" // f_o1c0 + "fmla v8.8h, v20.8h, v1.h[6]\n" + "ldr x3, [%[f_0], #88]\n" + "fmla v9.8h, v20.8h, v1.h[7]\n" + "ins v19.d[1], x3\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "add %[in_0], %[in_0], #32\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "add %[f_0], %[f_0], #64\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "subs x0, x0, #2\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "fmla v14.8h, v21.8h, v1.h[4]\n" + "fmla v15.8h, v21.8h, v1.h[5]\n" + "fmla v16.8h, v21.8h, v1.h[6]\n" + "fmla v17.8h, v21.8h, v1.h[7]\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmax v14.8h, v14.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v0.8h\n" + "fmax v16.8h, v16.8h, v0.8h\n" + "fmax v17.8h, v17.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmax v14.8h, v14.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v0.8h\n" + "fmax v16.8h, v16.8h, v0.8h\n" + "fmax v17.8h, v17.8h, v0.8h\n" + "fmin v2.8h, v2.8h, 
v1.8h\n" + "fmin v3.8h, v3.8h, v1.8h\n" + "fmin v4.8h, v4.8h, v1.8h\n" + "fmin v5.8h, v5.8h, v1.8h\n" + "fmin v6.8h, v6.8h, v1.8h\n" + "fmin v7.8h, v7.8h, v1.8h\n" + "fmin v8.8h, v8.8h, v1.8h\n" + "fmin v9.8h, v9.8h, v1.8h\n" + "fmin v10.8h, v10.8h, v1.8h\n" + "fmin v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v1.8h\n" + "fmin v13.8h, v13.8h, v1.8h\n" + "fmin v14.8h, v14.8h, v1.8h\n" + "fmin v15.8h, v15.8h, v1.8h\n" + "fmin v16.8h, v16.8h, v1.8h\n" + "fmin v17.8h, v17.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v19.8h, v2.8h, v18.8h\n" + "fadd v20.8h, v3.8h, v18.8h\n" + "fadd v21.8h, v4.8h, v18.8h\n" + "fadd v22.8h, v5.8h, v18.8h\n" + "fadd v23.8h, v6.8h, v18.8h\n" + "fadd v24.8h, v7.8h, v18.8h\n" + "fadd v25.8h, v8.8h, v18.8h\n" + "fadd v26.8h, v9.8h, v18.8h\n" + "fmax v19.8h, v19.8h, v0.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmax v21.8h, v21.8h, v0.8h\n" + "fmax v22.8h, v22.8h, v0.8h\n" + "fmax v23.8h, v23.8h, v0.8h\n" + "fmax v24.8h, v24.8h, v0.8h\n" + "fmax v25.8h, v25.8h, v0.8h\n" + "fmax v26.8h, v26.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v1.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fmin v21.8h, v21.8h, v1.8h\n" + "fmin v22.8h, v22.8h, v1.8h\n" + "fmin v23.8h, v23.8h, v1.8h\n" + "fmin v24.8h, v24.8h, v1.8h\n" + "fmin v25.8h, v25.8h, v1.8h\n" + "fmin v26.8h, v26.8h, v1.8h\n" + "fdiv v19.8h, v19.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fdiv v21.8h, v21.8h, v1.8h\n" + "fdiv v22.8h, v22.8h, v1.8h\n" + "fdiv v23.8h, v23.8h, v1.8h\n" + "fdiv v24.8h, v24.8h, v1.8h\n" + "fdiv v25.8h, v25.8h, v1.8h\n" + "fdiv v26.8h, v26.8h, v1.8h\n" + "fmul v2.8h, v19.8h, v2.8h\n" + "fmul v3.8h, v20.8h, v3.8h\n" + "fmul v4.8h, v21.8h, v4.8h\n" + "fmul v5.8h, v22.8h, v5.8h\n" + "fmul v6.8h, v23.8h, v6.8h\n" + "fmul v7.8h, v24.8h, v7.8h\n" + "fmul v8.8h, v25.8h, v8.8h\n" + "fmul v9.8h, v26.8h, v9.8h\n" + + "fadd v19.8h, v10.8h, v18.8h\n" + "fadd v20.8h, v11.8h, v18.8h\n" + "fadd v21.8h, v12.8h, v18.8h\n" + "fadd v22.8h, v13.8h, v18.8h\n" + "fadd v23.8h, v14.8h, v18.8h\n" + "fadd v24.8h, v15.8h, v18.8h\n" + "fadd v25.8h, v16.8h, v18.8h\n" + "fadd v26.8h, v17.8h, v18.8h\n" + "fmax v19.8h, v19.8h, v0.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmax v21.8h, v21.8h, v0.8h\n" + "fmax v22.8h, v22.8h, v0.8h\n" + "fmax v23.8h, v23.8h, v0.8h\n" + "fmax v24.8h, v24.8h, v0.8h\n" + "fmax v25.8h, v25.8h, v0.8h\n" + "fmax v26.8h, v26.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v1.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fmin v21.8h, v21.8h, v1.8h\n" + "fmin v22.8h, v22.8h, v1.8h\n" + "fmin v23.8h, v23.8h, v1.8h\n" + "fmin v24.8h, v24.8h, v1.8h\n" + "fmin v25.8h, v25.8h, v1.8h\n" + "fmin v26.8h, v26.8h, v1.8h\n" + "fdiv v19.8h, v19.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fdiv v21.8h, v21.8h, v1.8h\n" + "fdiv v22.8h, v22.8h, v1.8h\n" + "fdiv v23.8h, v23.8h, v1.8h\n" + "fdiv v24.8h, v24.8h, v1.8h\n" + "fdiv v25.8h, v25.8h, v1.8h\n" + "fdiv v26.8h, v26.8h, v1.8h\n" + "fmul v10.8h, v19.8h, v10.8h\n" + "fmul v11.8h, v20.8h, v11.8h\n" + "fmul v12.8h, v21.8h, v12.8h\n" + "fmul v13.8h, v22.8h, v13.8h\n" + "fmul v14.8h, v23.8h, v14.8h\n" + "fmul v15.8h, v24.8h, v15.8h\n" + "fmul v16.8h, v25.8h, v16.8h\n" + "fmul v17.8h, v26.8h, v17.8h\n" + + "13:\n" + "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%[out_0]], #64\n" + "st1 {v6.8h, v7.8h, v8.8h, v9.8h}, [%[out_0]], #64\n" + "st1 {v10.8h, v11.8h, v12.8h, v13.8h}, [%[out_1]], #64\n" + "st1 {v14.8h, v15.8h, 
v16.8h, v17.8h}, [%[out_1]], #64\n" + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "x0", "x1", "x2", "x3"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = pwFilterArray + (oc - 1) * 8 * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = pwBiasArray + (oc - 1) * 8; + __asm__ __volatile__( + "ldr q12, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v12.16b\n" // out_o0hw0 + "ldr x1, [%[in_0], #8]\n" + "mov v3.16b, v12.16b\n" // out_o0hw1 + "ins v0.d[1], x1\n" + "mov v4.16b, v12.16b\n" // out_o0hw2 + "ldr d10, [%[f_0]]\n" // f_o0c0 + "mov v5.16b, v12.16b\n" // out_o0hw3 + "ldr x2, [%[f_0], #8]\n" + "mov v6.16b, v12.16b\n" // out_o0hw4 + "ins v10.d[1], x2\n" + "mov v7.16b, v12.16b\n" // out_o0hw5 + "mov v8.16b, v12.16b\n" // out_o0hw6 + "mov v9.16b, v12.16b\n" // out_o0hw7 + "0:\n" + "ldr d1, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v10.8h, v0.h[0]\n" + "ldr x1, [%[in_0], #24]\n" + "fmla v3.8h, v10.8h, v0.h[1]\n" + "ins v1.d[1], x1\n" + "fmla v4.8h, v10.8h, v0.h[2]\n" + "ldr d11, [%[f_0], #16]\n" // f_o0c0 + "fmla v5.8h, v10.8h, v0.h[3]\n" + "ldr x2, [%[f_0], #24]\n" + "fmla v6.8h, v10.8h, v0.h[4]\n" + "ins v11.d[1], x2\n" + "fmla v7.8h, v10.8h, v0.h[5]\n" + "subs x0, x0, #2\n" + "fmla v8.8h, v10.8h, v0.h[6]\n" + "fmla v9.8h, v10.8h, v0.h[7]\n" + + "ldr d0, [%[in_0], #32]\n" // in_hw0 + "fmla v2.8h, v11.8h, v1.h[0]\n" + "ldr x1, [%[in_0], #40]\n" + "fmla v3.8h, v11.8h, v1.h[1]\n" + "ins v0.d[1], x1\n" + "fmla v4.8h, v11.8h, v1.h[2]\n" + "ldr d10, [%[f_0], #32]\n" // f_o0c0 + "fmla v5.8h, v11.8h, v1.h[3]\n" + "ldr x2, [%[f_0], #40]\n" + "fmla v6.8h, v11.8h, v1.h[4]\n" + "ins v10.d[1], x2\n" + "fmla v7.8h, v11.8h, v1.h[5]\n" + "add %[in_0], %[in_0], #32\n" + "fmla v8.8h, v11.8h, v1.h[6]\n" + "add %[f_0], %[f_0], #32\n" + "fmla v9.8h, v11.8h, v1.h[7]\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v3.8h, v3.8h, v1.8h\n" + "fmin v4.8h, v4.8h, v1.8h\n" + "fmin v5.8h, v5.8h, v1.8h\n" + "fmin v6.8h, v6.8h, v1.8h\n" + "fmin v7.8h, v7.8h, v1.8h\n" + "fmin v8.8h, v8.8h, v1.8h\n" + "fmin v9.8h, v9.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 
13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v10.8h, #0x42, lsl #8\n" // three + "fadd v11.8h, v2.8h, v10.8h\n" + "fadd v12.8h, v3.8h, v10.8h\n" + "fadd v13.8h, v4.8h, v10.8h\n" + "fadd v14.8h, v5.8h, v10.8h\n" + "fadd v15.8h, v6.8h, v10.8h\n" + "fadd v16.8h, v7.8h, v10.8h\n" + "fadd v17.8h, v8.8h, v10.8h\n" + "fadd v18.8h, v9.8h, v10.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmax v14.8h, v14.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v0.8h\n" + "fmax v16.8h, v16.8h, v0.8h\n" + "fmax v17.8h, v17.8h, v0.8h\n" + "fmax v18.8h, v18.8h, v0.8h\n" + "fmin v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v1.8h\n" + "fmin v13.8h, v13.8h, v1.8h\n" + "fmin v14.8h, v14.8h, v1.8h\n" + "fmin v15.8h, v15.8h, v1.8h\n" + "fmin v16.8h, v16.8h, v1.8h\n" + "fmin v17.8h, v17.8h, v1.8h\n" + "fmin v18.8h, v18.8h, v1.8h\n" + "fdiv v11.8h, v11.8h, v1.8h\n" + "fdiv v12.8h, v12.8h, v1.8h\n" + "fdiv v13.8h, v13.8h, v1.8h\n" + "fdiv v14.8h, v14.8h, v1.8h\n" + "fdiv v15.8h, v15.8h, v1.8h\n" + "fdiv v16.8h, v16.8h, v1.8h\n" + "fdiv v17.8h, v17.8h, v1.8h\n" + "fdiv v18.8h, v18.8h, v1.8h\n" + "fmul v2.8h, v11.8h, v2.8h\n" + "fmul v3.8h, v12.8h, v3.8h\n" + "fmul v4.8h, v13.8h, v4.8h\n" + "fmul v5.8h, v14.8h, v5.8h\n" + "fmul v6.8h, v15.8h, v6.8h\n" + "fmul v7.8h, v16.8h, v7.8h\n" + "fmul v8.8h, v17.8h, v8.8h\n" + "fmul v9.8h, v18.8h, v9.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw0 + "str q4, [%[out_0], #32]\n" // out_o0hw0 + "str q5, [%[out_0], #48]\n" // out_o0hw0 + "str q6, [%[out_0], #64]\n" // out_o0hw0 + "str q7, [%[out_0], #80]\n" // out_o0hw0 + "str q8, [%[out_0], #96]\n" // out_o0hw0 + "str q9, [%[out_0], #112]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "x0", "x1", "x2"); + } + } + + // ohow_remainder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + for (I32 hw = ohow_s; hw < ohow - 3; hw += 4) { + const F16 *b0 = pwBiasArray; + const F16 *b1 = b0 + 8; + const F16 *f_o0c0 = pwFilterArray; + F16 *in_pack = pwArray + hw * ic * 8; + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr d22, [%[b_0]]\n" // b_o0 + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "ldr d23, [%[b_1]]\n" // b_o1 + "ldr x2, [%[b_1], #8]\n" + "ins v23.d[1], x2\n" + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "ldr x2, [%[f_0], #8]\n" + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ins v18.d[1], x2\n" + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ldr d19, [%[f_0], #16]\n" // f_o1c0 + "mov v11.16b, v23.16b\n" // out_o1hw1 + "ldr x3, [%[f_0], #24]\n" + "mov v12.16b, v23.16b\n" // out_o1hw2 + "ins v19.d[1], x3\n" + "mov v13.16b, v23.16b\n" // out_o1hw3 + "0:\n" + "ldr d1, [%[in_0], #8]\n" // in_hw0 + "fmla 
v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f_0], #32]\n" // f_o0c0 + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ldr x2, [%[f_0], #40]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ins v20.d[1], x2\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "ldr d21, [%[f_0], #48]\n" // f_o1c0 + "fmla v10.8h, v19.8h, v0.h[0]\n" + "ldr x3, [%[f_0], #56]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "ins v21.d[1], x3\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "subs x0, x0, #2\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + + "ldr d0, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f_0], #64]\n" // f_o0c0 + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ldr x2, [%[f_0], #72]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ldr d19, [%[f_0], #80]\n" // f_o1c0 + "fmla v5.8h, v20.8h, v1.h[3]\n" + "ins v18.d[1], x2\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "ldr x3, [%[f_0], #88]\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "ins v19.d[1], x3\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "add %[in_0], %[in_0], #16\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "add %[f_0], %[f_0], #64\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v3.8h, v3.8h, v1.8h\n" + "fmin v4.8h, v4.8h, v1.8h\n" + "fmin v5.8h, v5.8h, v1.8h\n" + "fmin v10.8h, v10.8h, v1.8h\n" + "fmin v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v1.8h\n" + "fmin v13.8h, v13.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v6.8h, v2.8h, v18.8h\n" + "fadd v7.8h, v3.8h, v18.8h\n" + "fadd v8.8h, v4.8h, v18.8h\n" + "fadd v9.8h, v5.8h, v18.8h\n" + "fadd v19.8h, v10.8h, v18.8h\n" + "fadd v20.8h, v11.8h, v18.8h\n" + "fadd v21.8h, v12.8h, v18.8h\n" + "fadd v22.8h, v13.8h, v18.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmax v19.8h, v19.8h, v0.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmax v21.8h, v21.8h, v0.8h\n" + "fmax v22.8h, v22.8h, v0.8h\n" + "fmin v6.8h, v6.8h, v1.8h\n" + "fmin v7.8h, v7.8h, v1.8h\n" + "fmin v8.8h, v8.8h, v1.8h\n" + "fmin v9.8h, v9.8h, v1.8h\n" + "fmin v19.8h, v19.8h, v1.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fmin v21.8h, v21.8h, v1.8h\n" + "fmin v22.8h, v22.8h, v1.8h\n" + "fdiv v6.8h, v6.8h, v1.8h\n" + "fdiv v7.8h, v7.8h, v1.8h\n" + "fdiv v8.8h, v8.8h, v1.8h\n" + "fdiv v9.8h, v9.8h, v1.8h\n" + "fdiv v19.8h, v19.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fdiv v21.8h, v21.8h, v1.8h\n" + "fdiv v22.8h, v22.8h, v1.8h\n" + "fmul v2.8h, v6.8h, v2.8h\n" + "fmul v3.8h, v7.8h, v3.8h\n" + "fmul v4.8h, v8.8h, v4.8h\n" + "fmul v5.8h, v9.8h, v5.8h\n" + "fmul v10.8h, v19.8h, v10.8h\n" + "fmul v11.8h, v20.8h, v11.8h\n" + "fmul v12.8h, v21.8h, v12.8h\n" + "fmul v13.8h, v22.8h, 
v13.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q10, [%[out_1]]\n" // out_o1hw0 + "str q11, [%[out_1], #16]\n" // out_o1hw1 + "str q12, [%[out_1], #32]\n" // out_o1hw2 + "str q13, [%[out_1], #48]\n" // out_o1hw3 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v18", "v19", "v20", "v21", "v22", "v23", "x0", + "x1", "x2", "x3"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = pwFilterArray + (oc - 1) * 8 * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = pwBiasArray + (oc - 1) * 8; + __asm__ __volatile__( + "ldr d22, [%[b_0]]\n" // b_o0 + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "ldr x2, [%[f_0], #8]\n" + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ins v18.d[1], x2\n" + "0:\n" + "ldr d1, [%[in_0], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f_0], #16]\n" // f_o0c0 + "fmla v3.8h, v18.8h, v0.h[1]\n" + "ldr x2, [%[f_0], #24]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ins v20.d[1], x2\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "subs x0, x0, #2\n" + + "ldr d0, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f_0], #32]\n" // f_o0c0 + "fmla v3.8h, v20.8h, v1.h[1]\n" + "ldr x2, [%[f_0], #40]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ins v18.d[1], x2\n" + "fmla v5.8h, v20.8h, v1.h[3]\n" + "add %[in_0], %[in_0], #16\n" + "add %[f_0], %[f_0], #32\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v3.8h, v3.8h, v1.8h\n" + "fmin v4.8h, v4.8h, v1.8h\n" + "fmin v5.8h, v5.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v6.8h, v2.8h, v18.8h\n" + "fadd v7.8h, v3.8h, v18.8h\n" + "fadd v8.8h, v4.8h, v18.8h\n" + "fadd v9.8h, v5.8h, v18.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmin v6.8h, v6.8h, v1.8h\n" + "fmin v7.8h, v7.8h, v1.8h\n" + "fmin v8.8h, v8.8h, v1.8h\n" + "fmin v9.8h, v9.8h, v1.8h\n" + "fdiv v6.8h, v6.8h, v1.8h\n" + "fdiv v7.8h, v7.8h, v1.8h\n" + "fdiv v8.8h, v8.8h, v1.8h\n" + "fdiv v9.8h, v9.8h, v1.8h\n" + 
"fmul v2.8h, v6.8h, v2.8h\n" + "fmul v3.8h, v7.8h, v3.8h\n" + "fmul v4.8h, v8.8h, v4.8h\n" + "fmul v5.8h, v9.8h, v5.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v18", "v20", "v22", "x0", "x1", "x2"); + } + } + + // ohow_reminder % 4 + ohow_s = (ohow / 4) * 4; + for (I32 hw = ohow_s; hw < ohow; hw++) { + const F16 *b0 = pwBiasArray; + const F16 *b1 = b0 + 8; + const F16 *f_o0c0 = pwFilterArray; + F16 *in_pack = pwArray + hw * ic * 8; + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr d22, [%[b_0]]\n" // b_o0 + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "ldr d23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "ldr x2, [%[b_1], #8]\n" + "ins v23.d[1], x2\n" + "ldr h0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ldr x2, [%[f_0], #8]\n" + "ins v18.d[1], x2\n" + "ldr d19, [%[f_0], #16]\n" // f_o1c0 + "ldr x3, [%[f_0], #24]\n" + "ins v19.d[1], x3\n" + "0:\n" + "ldr h1, [%[in_0], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f_0], #32]\n" // f_o0c0 + "fmla v10.8h, v19.8h, v0.h[0]\n" + "ldr x2, [%[f_0], #40]\n" + "ins v20.d[1], x2\n" + "ldr d21, [%[f_0], #48]\n" // f_o1c0 + "subs x0, x0, #2\n" + "ldr x3, [%[f_0], #56]\n" + "ins v21.d[1], x3\n" + + "ldr h0, [%[in_0], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f_0], #64]\n" // f_o0c0 + "fmla v10.8h, v21.8h, v1.h[0]\n" + "ldr x2, [%[f_0], #72]\n" + "ins v18.d[1], x2\n" + "ldr d19, [%[f_0], #80]\n" // f_o1c0 + "add %[in_0], %[in_0], #4\n" + "ldr x3, [%[f_0], #88]\n" + "ins v19.d[1], x3\n" + "add %[f_0], %[f_0], #64\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v10.8h, v10.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v19.8h, v2.8h, v18.8h\n" + "fadd v20.8h, v10.8h, v18.8h\n" + "fmax v19.8h, v19.8h, v0.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v1.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fdiv v19.8h, v19.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fmul v2.8h, v19.8h, v2.8h\n" + "fmul v10.8h, v20.8h, v10.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q10, [%[out_1]]\n" // out_o1hw0 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] 
"+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v19", "v20", "v21", "v22", + "v23", "x0", "x1", "x2", "x3"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = pwFilterArray + (oc - 1) * 8 * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = pwBiasArray + (oc - 1) * 8; + __asm__ __volatile__( + "ldr d22, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr x1, [%[b_0], #8]\n" + "ins v22.d[1], x1\n" + "ldr h0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "ldr x2, [%[f_0], #8]\n" + "ins v18.d[1], x2\n" + "0:\n" + "ldr h1, [%[in_0], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr d20, [%[f_0], #16]\n" // f_o0c0 + "subs x0, x0, #2\n" + "ldr x2, [%[f_0], #24]\n" + "ins v20.d[1], x2\n" + + "ldr h0, [%[in_0], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr d18, [%[f_0], #32]\n" // f_o0c0 + "ldr x2, [%[f_0], #40]\n" + "ins v18.d[1], x2\n" + "add %[in_0], %[in_0], #4\n" + "add %[f_0], %[f_0], #32\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v20.8h, v2.8h, v18.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fmul v2.8h, v20.8h, v2.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v20", "v22", "x0", "x1", + "x2"); + } + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A76.cpp b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A76.cpp new file mode 100644 index 00000000..46d0c628 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A76.cpp @@ -0,0 +1,1334 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/fp16/depthwise_pointwise_convolution_direct.h" + +EE depthwise_pointwise_convolution_direct_A76(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec) +{ + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + + if (dwFilterDesc.df != DF_NCHWC8) { + CHECK_STATUS(NOT_MATCH); + } + if (pwFilterArray != nullptr && pwFilterDesc.df != DF_NHWCN16) { + CHECK_STATUS(NOT_MATCH); + } + + oc /= 8; + ic /= 8; + + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + U32 ihiw = ih * iw; + I32 ohow = oh * ow; + F16 *pwArray = (F16 *)tmp + ic * ih_pad * iw_pad * 8; + for (U32 n = 0; n < in; n++) { + F16 *inArray_pad = (F16 *)tmp; + F16 *inArray_pad_mov = inArray_pad; + F16 *inArray_mov = inArray + n * ic * ihiw * 8; + for (U32 c = 0; c < ic; c++) { + if (paddingT > 0) { + memset(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(fdt)); + inArray_pad_mov += paddingT * iw_pad * 8; + } + for (U32 h = paddingT; h < ih_pad - paddingB; h++) { + memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(fdt)); + inArray_pad_mov += paddingL * 8; + memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(fdt)); + inArray_pad_mov += iw * 8; + inArray_mov += iw * 8; + memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(fdt)); + inArray_pad_mov += paddingR * 8; + } + if (paddingB > 0) { + memset(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(fdt)); + inArray_pad_mov += paddingB * iw_pad * 
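// The memset/memcpy loop above builds a zero-padded copy of one NCHWc8 channel group
// (8 channels interleaved per pixel): paddingT rows of zeros, then each input row
// framed by paddingL/paddingR zeros, then paddingB rows of zeros. A simplified
// per-pixel equivalent over one channel group (src_c and dst_c are illustrative
// base pointers for that group, not names from this function):
for (U32 h = 0; h < ih_pad; h++) {
    for (U32 w = 0; w < iw_pad; w++) {
        bool inside = (h >= paddingT) && (h < paddingT + ih) &&
                      (w >= paddingL) && (w < paddingL + iw);
        F16 *dst = dst_c + (h * iw_pad + w) * 8;
        if (inside) {
            memcpy(dst, src_c + ((h - paddingT) * iw + (w - paddingL)) * 8, 8 * sizeof(F16));
        } else {
            memset(dst, 0, 8 * sizeof(F16));
        }
    }
}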
8; + } + + // dw_conv + const F16 *b = dwBiasArray + c * 8; + F16 *in_pad = inArray_pad + c * ih_pad * iw_pad * 8; + const F16 *f = dwFilterArray + c * fh * fw * 8; + // ohow / 8 + for (I32 hw = 0; hw < ohow - 7; hw += 8) { + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + U32 in_h_1 = (hw + 1) / ow * strideH; + U32 in_w_1 = (hw + 1) % ow * strideW; + U32 in_h_2 = (hw + 2) / ow * strideH; + U32 in_w_2 = (hw + 2) % ow * strideW; + U32 in_h_3 = (hw + 3) / ow * strideH; + U32 in_w_3 = (hw + 3) % ow * strideW; + U32 in_h_4 = (hw + 4) / ow * strideH; + U32 in_w_4 = (hw + 4) % ow * strideW; + U32 in_h_5 = (hw + 5) / ow * strideH; + U32 in_w_5 = (hw + 5) % ow * strideW; + U32 in_h_6 = (hw + 6) / ow * strideH; + U32 in_w_6 = (hw + 6) % ow * strideW; + U32 in_h_7 = (hw + 7) / ow * strideH; + U32 in_w_7 = (hw + 7) % ow * strideW; + // TODO handle asm combined with c. No guarantee that compile will not use vec reg in c. + __asm__ __volatile__( + "ldr q8, [%[b]]\n" + "mov v0.16b, v8.16b\n" + "mov v1.16b, v8.16b\n" + "mov v2.16b, v8.16b\n" + "mov v3.16b, v8.16b\n" + "mov v4.16b, v8.16b\n" + "mov v5.16b, v8.16b\n" + "mov v6.16b, v8.16b\n" + "mov v7.16b, v8.16b\n" + : + : [b] "r"(b) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const F16 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + F16 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + F16 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F16 *in_1 = in_idx + in_h_1 * iw_pad * 8 + in_w_1 * 8; + F16 *in_2 = in_idx + in_h_2 * iw_pad * 8 + in_w_2 * 8; + F16 *in_3 = in_idx + in_h_3 * iw_pad * 8 + in_w_3 * 8; + F16 *in_4 = in_idx + in_h_4 * iw_pad * 8 + in_w_4 * 8; + F16 *in_5 = in_idx + in_h_5 * iw_pad * 8 + in_w_5 * 8; + F16 *in_6 = in_idx + in_h_6 * iw_pad * 8 + in_w_6 * 8; + F16 *in_7 = in_idx + in_h_7 * iw_pad * 8 + in_w_7 * 8; + __asm__ __volatile__("ldr q17, [%[f0]]\n" + "ldr q9, [%[in0]]\n" + "ldr q10, [%[in1]]\n" + "ldr q11, [%[in2]]\n" + "ldr q12, [%[in3]]\n" + "ldr q13, [%[in4]]\n" + "ldr q14, [%[in5]]\n" + "ldr q15, [%[in6]]\n" + "ldr q16, [%[in7]]\n" + "fmla v0.8h, v9.8h, v17.8h\n" + "fmla v1.8h, v10.8h, v17.8h\n" + "fmla v2.8h, v11.8h, v17.8h\n" + "fmla v3.8h, v12.8h, v17.8h\n" + "fmla v4.8h, v13.8h, v17.8h\n" + "fmla v5.8h, v14.8h, v17.8h\n" + "fmla v6.8h, v15.8h, v17.8h\n" + "fmla v7.8h, v16.8h, v17.8h\n" + : + : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), + [in3] "r"(in_3), [in4] "r"(in_4), [in5] "r"(in_5), + [in6] "r"(in_6), [in7] "r"(in_7), [f0] "r"(f_0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "fmax v0.8h, v0.8h, v31.8h\n" + "fmax v1.8h, v1.8h, v31.8h\n" + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v6.8h, v6.8h, v31.8h\n" + "fmax v7.8h, v7.8h, v31.8h\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v31"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v0.8h, v0.8h, v31.8h\n" + "fmax v1.8h, v1.8h, v31.8h\n" + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax 
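// The eight in_h_k/in_w_k pairs above are one mapping applied to hw .. hw+7: a flat
// output index decomposes as (hw / ow, hw % ow), and the stride then gives the
// top-left corner of that output pixel's receptive field in the padded input.
// Hypothetical helper, for illustration only:
static inline void hw_to_input_origin(I32 hw, I32 ow, U32 strideH, U32 strideW, U32 *h, U32 *w)
{
    *h = (U32)(hw / ow) * strideH;  // output row -> input row of the filter origin
    *w = (U32)(hw % ow) * strideW;  // output col -> input col of the filter origin
}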
v3.8h, v3.8h, v31.8h\n" + "fmax v4.8h, v4.8h, v31.8h\n" + "fmax v5.8h, v5.8h, v31.8h\n" + "fmax v6.8h, v6.8h, v31.8h\n" + "fmax v7.8h, v7.8h, v31.8h\n" + "fmin v0.8h, v0.8h, v30.8h\n" + "fmin v1.8h, v1.8h, v30.8h\n" + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + "fmin v4.8h, v4.8h, v30.8h\n" + "fmin v5.8h, v5.8h, v30.8h\n" + "fmin v6.8h, v6.8h, v30.8h\n" + "fmin v7.8h, v7.8h, v30.8h\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v30", "v31"); + break; + } + case ACTIVATION_H_SWISH: { + __asm__ __volatile__("movi v29.8h, #0x42, lsl #8\n" // three + "movi v30.8h, #0x46, lsl #8\n" // six + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fadd v21.8h, v0.8h, v29.8h\n" + "fadd v22.8h, v1.8h, v29.8h\n" + "fadd v23.8h, v2.8h, v29.8h\n" + "fadd v24.8h, v3.8h, v29.8h\n" + "fadd v25.8h, v4.8h, v29.8h\n" + "fadd v26.8h, v5.8h, v29.8h\n" + "fadd v27.8h, v6.8h, v29.8h\n" + "fadd v28.8h, v7.8h, v29.8h\n" + "fmax v21.8h, v21.8h, v31.8h\n" + "fmax v22.8h, v22.8h, v31.8h\n" + "fmax v23.8h, v23.8h, v31.8h\n" + "fmax v24.8h, v24.8h, v31.8h\n" + "fmax v25.8h, v25.8h, v31.8h\n" + "fmax v26.8h, v26.8h, v31.8h\n" + "fmax v27.8h, v27.8h, v31.8h\n" + "fmax v28.8h, v28.8h, v31.8h\n" + "fmin v21.8h, v21.8h, v30.8h\n" + "fmin v22.8h, v22.8h, v30.8h\n" + "fmin v23.8h, v23.8h, v30.8h\n" + "fmin v24.8h, v24.8h, v30.8h\n" + "fmin v25.8h, v25.8h, v30.8h\n" + "fmin v26.8h, v26.8h, v30.8h\n" + "fmin v27.8h, v27.8h, v30.8h\n" + "fmin v28.8h, v28.8h, v30.8h\n" + "fdiv v21.8h, v21.8h, v30.8h\n" + "fdiv v22.8h, v22.8h, v30.8h\n" + "fdiv v23.8h, v23.8h, v30.8h\n" + "fdiv v24.8h, v24.8h, v30.8h\n" + "fdiv v25.8h, v25.8h, v30.8h\n" + "fdiv v26.8h, v26.8h, v30.8h\n" + "fdiv v27.8h, v27.8h, v30.8h\n" + "fdiv v28.8h, v28.8h, v30.8h\n" + "fmul v0.8h, v0.8h, v21.8h\n" + "fmul v1.8h, v1.8h, v22.8h\n" + "fmul v2.8h, v2.8h, v23.8h\n" + "fmul v3.8h, v3.8h, v24.8h\n" + "fmul v4.8h, v4.8h, v25.8h\n" + "fmul v5.8h, v5.8h, v26.8h\n" + "fmul v6.8h, v6.8h, v27.8h\n" + "fmul v7.8h, v7.8h, v28.8h\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + break; + } + default: + return NOT_SUPPORTED; + } + + if (pwFilterArray != nullptr) { + F16 *pw_pack_0 = pwArray + hw * ic * 8 + c * 8 * 8; + __asm__ __volatile__("zip1 v8.8h, v0.8h, v4.8h\n" + "zip1 v9.8h, v2.8h, v6.8h\n" + "zip1 v10.8h, v1.8h, v5.8h\n" + "zip1 v11.8h, v3.8h, v7.8h\n" + "zip2 v0.8h, v0.8h, v4.8h\n" + "zip2 v2.8h, v2.8h, v6.8h\n" + "zip2 v1.8h, v1.8h, v5.8h\n" + "zip2 v3.8h, v3.8h, v7.8h\n" + "zip1 v12.8h, v8.8h, v9.8h\n" + "zip1 v13.8h, v10.8h, v11.8h\n" + "zip2 v8.8h, v8.8h, v9.8h\n" + "zip2 v10.8h, v10.8h, v11.8h\n" + "zip1 v14.8h, v0.8h, v2.8h\n" + "zip1 v15.8h, v1.8h, v3.8h\n" + "zip2 v0.8h, v0.8h, v2.8h\n" + "zip2 v1.8h, v1.8h, v3.8h\n" + "zip1 v16.8h, v12.8h, v13.8h\n" + "zip2 v12.8h, v12.8h, v13.8h\n" + "zip1 v17.8h, v8.8h, v10.8h\n" + "zip2 v8.8h, v8.8h, v10.8h\n" + "zip1 v18.8h, v14.8h, v15.8h\n" + "zip2 v14.8h, v14.8h, v15.8h\n" + "zip1 v19.8h, v0.8h, v1.8h\n" + "zip2 v0.8h, v0.8h, v1.8h\n" + "str q16, [%[pw0]]\n" + "str q12, [%[pw0], #16]\n" + "str q17, [%[pw0], #32]\n" + "str q8, [%[pw0], #48]\n" + "str q18, [%[pw0], #64]\n" + "str q14, [%[pw0], #80]\n" + "str q19, [%[pw0], #96]\n" + "str q0, [%[pw0], #112]\n" + : [pw0] "+r"(pw_pack_0) + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19"); + } else { + F16 *out_ptr 
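// The zip1/zip2 network above is an 8x8 fp16 transpose. After the depthwise pass,
// v0..v7 each hold the 8 channels of one hw position; the pointwise microkernel
// instead wants each 16-byte group to hold one channel across 8 hw positions, so it
// can broadcast a single input value with v0.h[j]. Scalar equivalent of the pack
// (acc[p][ch] stands in for lane ch of v{p}; names are illustrative):
for (int ch = 0; ch < 8; ch++) {
    for (int p = 0; p < 8; p++) {
        pw_pack_0[ch * 8 + p] = acc[p][ch];  // [position][channel] -> [channel][position]
    }
}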
= outArray + ((n * ic + c) * ohow + hw) * 8; + __asm__ __volatile__("str q0, [%[out]]\n" + "str q1, [%[out], #16]\n" + "str q2, [%[out], #32]\n" + "str q3, [%[out], #48]\n" + "str q4, [%[out], #64]\n" + "str q5, [%[out], #80]\n" + "str q6, [%[out], #96]\n" + "str q7, [%[out], #112]\n" + : [out] "+r"(out_ptr) + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19"); + } + } + + // ohow_reminder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + for (I32 hw = ohow_s; hw < ohow - 3; hw += 4) { + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + U32 in_h_1 = (hw + 1) / ow * strideH; + U32 in_w_1 = (hw + 1) % ow * strideW; + U32 in_h_2 = (hw + 2) / ow * strideH; + U32 in_w_2 = (hw + 2) % ow * strideW; + U32 in_h_3 = (hw + 3) / ow * strideH; + U32 in_w_3 = (hw + 3) % ow * strideW; + // TODO handle asm combined with c. No guarantee that compile will not use vec reg in c. + __asm__ __volatile__("ldr q8, [%[b]]\n" + "mov v0.16b, v8.16b\n" + "mov v1.16b, v8.16b\n" + "mov v2.16b, v8.16b\n" + "mov v3.16b, v8.16b\n" + : + : [b] "r"(b) + : "memory", "cc", "v0", "v1", "v2", "v3", "v8"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const F16 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + F16 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + F16 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F16 *in_1 = in_idx + in_h_1 * iw_pad * 8 + in_w_1 * 8; + F16 *in_2 = in_idx + in_h_2 * iw_pad * 8 + in_w_2 * 8; + F16 *in_3 = in_idx + in_h_3 * iw_pad * 8 + in_w_3 * 8; + __asm__ __volatile__("ldr q17, [%[f0]]\n" + "ldr q9, [%[in0]]\n" + "ldr q10, [%[in1]]\n" + "ldr q11, [%[in2]]\n" + "ldr q12, [%[in3]]\n" + "fmla v0.8h, v9.8h, v17.8h\n" + "fmla v1.8h, v10.8h, v17.8h\n" + "fmla v2.8h, v11.8h, v17.8h\n" + "fmla v3.8h, v12.8h, v17.8h\n" + : + : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), + [in3] "r"(in_3), [f0] "r"(f_0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v9", "v10", + "v11", "v12", "v17"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "fmax v0.8h, v0.8h, v31.8h\n" + "fmax v1.8h, v1.8h, v31.8h\n" + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v0.8h, v0.8h, v31.8h\n" + "fmax v1.8h, v1.8h, v31.8h\n" + "fmax v2.8h, v2.8h, v31.8h\n" + "fmax v3.8h, v3.8h, v31.8h\n" + "fmin v0.8h, v0.8h, v30.8h\n" + "fmin v1.8h, v1.8h, v30.8h\n" + "fmin v2.8h, v2.8h, v30.8h\n" + "fmin v3.8h, v3.8h, v30.8h\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v30", "v31"); + break; + } + case ACTIVATION_H_SWISH: { + __asm__ __volatile__("movi v29.8h, #0x42, lsl #8\n" // three + "movi v30.8h, #0x46, lsl #8\n" // six + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fadd v25.8h, v0.8h, v29.8h\n" + "fadd v26.8h, v1.8h, v29.8h\n" + "fadd v27.8h, v2.8h, v29.8h\n" + "fadd v28.8h, v3.8h, v29.8h\n" + "fmax v25.8h, v25.8h, v31.8h\n" + "fmax v26.8h, v26.8h, v31.8h\n" + "fmax v27.8h, v27.8h, v31.8h\n" + "fmax v28.8h, v28.8h, v31.8h\n" + "fmin v25.8h, v25.8h, v30.8h\n" + "fmin v26.8h, v26.8h, v30.8h\n" + "fmin v27.8h, v27.8h, v30.8h\n" + "fmin v28.8h, v28.8h, v30.8h\n" + "fdiv 
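// Tile scheduling used throughout this file: the flattened hw dimension is consumed
// in 8-wide tiles, then at most one 4-wide tile, then single positions, so every
// branch reuses the same indexing. Sketch of the driver shape (process_tileN are
// hypothetical stand-ins for the inlined asm blocks above and below):
I32 hw = 0;
for (; hw < ohow - 7; hw += 8) {
    process_tile8(hw);  // main path: 8 output positions per iteration
}
for (; hw < ohow - 3; hw += 4) {
    process_tile4(hw);  // 4-wide remainder tile, taken at most once
}
for (; hw < ohow; hw++) {
    process_tile1(hw);  // scalar tail: 0..3 positions
}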
v25.8h, v25.8h, v30.8h\n" + "fdiv v26.8h, v26.8h, v30.8h\n" + "fdiv v27.8h, v27.8h, v30.8h\n" + "fdiv v28.8h, v28.8h, v30.8h\n" + "fmul v0.8h, v0.8h, v25.8h\n" + "fmul v1.8h, v1.8h, v26.8h\n" + "fmul v2.8h, v2.8h, v27.8h\n" + "fmul v3.8h, v3.8h, v28.8h\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + break; + } + default: + return NOT_SUPPORTED; + } + + if (pwFilterArray != nullptr) { + F16 *pw_pack_0 = pwArray + hw * ic * 8 + c * 8 * 4; + __asm__ __volatile__("st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[pw0]]\n" + : [pw0] "+r"(pw_pack_0) + : + : "memory", "cc", "v0", "v1", "v2", "v3"); + } else { + F16 *out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; + __asm__ __volatile__("st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[out]]\n" + : [out] "+r"(out_ptr) + : + : "memory", "cc", "v0", "v1", "v2", "v3"); + } + } + + // ohow_reminder % 4 + ohow_s = (ohow / 4) * 4; + for (I32 hw = ohow_s; hw < ohow; hw++) { + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + // TODO handle asm combined with c. No guarantee that compile will not use vec reg in c. + __asm__ __volatile__("ldr q8, [%[b]]\n" + "mov v0.16b, v8.16b\n" + : + : [b] "r"(b) + : "memory", "cc", "v0"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const F16 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + F16 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + F16 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + __asm__ __volatile__("ldr q17, [%[f0]]\n" + "ldr q9, [%[in0]]\n" + "fmla v0.8h, v9.8h, v17.8h\n" + : + : [in0] "r"(in_0), [f0] "r"(f_0) + : "memory", "cc", "v0", "v9", "v17"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "fmax v0.8h, v0.8h, v31.8h\n" + : + : + : "memory", "cc", "v0", "v31"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "movi v30.8h, #0x46, lsl #8\n" // six + "fmax v0.8h, v0.8h, v31.8h\n" + "fmin v0.8h, v0.8h, v30.8h\n" + : + : + : "memory", "cc", "v0", "v30", "v31"); + break; + } + case ACTIVATION_H_SWISH: { + __asm__ __volatile__("movi v29.8h, #0x42, lsl #8\n" // three + "movi v30.8h, #0x46, lsl #8\n" // six + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fadd v28.8h, v0.8h, v29.8h\n" + "fmax v28.8h, v28.8h, v31.8h\n" + "fmin v28.8h, v28.8h, v30.8h\n" + "fdiv v28.8h, v28.8h, v30.8h\n" + "fmul v0.8h, v0.8h, v28.8h\n" + : + : + : "memory", "cc", "v0", "v28", "v29", "v30", "v31"); + break; + } + default: + return NOT_SUPPORTED; + } + + F16 *out_ptr; + if (pwFilterArray != nullptr) { + out_ptr = pwArray + hw * ic * 8 + c * 8; + } else { + out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; + } + __asm__ __volatile__("str q0, [%[out]]\n" + : [out] "+r"(out_ptr) + : + : "memory", "cc", "v0"); + } + } + + if (pwFilterArray == nullptr) { + continue; + } + // pw_conv + // ohow / 8 + for (I32 hw = 0; hw < ohow - 7; hw += 8) { + const F16 *b0 = pwBiasArray; + const F16 *b1 = b0 + 8; + F16 *in_pack = pwArray + hw * ic * 8; + const F16 *f_o0c0 = pwFilterArray; + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr q22, [%[b_0]]\n" // b_o0 + "ldr q23, [%[b_1]]\n" 
// b_o1 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr q0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov v6.16b, v22.16b\n" // out_o0hw4 + "mov v7.16b, v22.16b\n" // out_o0hw5 + "mov v8.16b, v22.16b\n" // out_o0hw6 + "ldr q19, [%[f_0], #16]\n" // f_o1c0 + "mov v9.16b, v22.16b\n" // out_o0hw7 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "mov v11.16b, v23.16b\n" // out_o1hw1 + "mov v12.16b, v23.16b\n" // out_o1hw2 + "mov v13.16b, v23.16b\n" // out_o1hw3 + "mov v14.16b, v23.16b\n" // out_o1hw4 + "mov v15.16b, v23.16b\n" // out_o1hw5 + "mov v16.16b, v23.16b\n" // out_o1hw6 + "mov v17.16b, v23.16b\n" // out_o1hw7 + "0:\n" + "ldr q1, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "fmla v3.8h, v18.8h, v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "ldr q20, [%[f_0], #32]\n" // f_o0c0 + "fmla v5.8h, v18.8h, v0.h[3]\n" + "fmla v6.8h, v18.8h, v0.h[4]\n" + "fmla v7.8h, v18.8h, v0.h[5]\n" + "ldr q21, [%[f_0], #48]\n" // f_o1c0 + "fmla v8.8h, v18.8h, v0.h[6]\n" + "fmla v9.8h, v18.8h, v0.h[7]\n" + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + "fmla v14.8h, v19.8h, v0.h[4]\n" + "fmla v15.8h, v19.8h, v0.h[5]\n" + "fmla v16.8h, v19.8h, v0.h[6]\n" + "fmla v17.8h, v19.8h, v0.h[7]\n" + + "ldr q0, [%[in_0], #32]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "fmla v3.8h, v20.8h, v1.h[1]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "ldr q18, [%[f_0], #64]\n" // f_o0c0 + "fmla v5.8h, v20.8h, v1.h[3]\n" + "fmla v6.8h, v20.8h, v1.h[4]\n" + "fmla v7.8h, v20.8h, v1.h[5]\n" + "ldr q19, [%[f_0], #80]\n" // f_o1c0 + "fmla v8.8h, v20.8h, v1.h[6]\n" + "fmla v9.8h, v20.8h, v1.h[7]\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "add %[in_0], %[in_0], #32\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "add %[f_0], %[f_0], #64\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "subs x0, x0, #2\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "fmla v14.8h, v21.8h, v1.h[4]\n" + "fmla v15.8h, v21.8h, v1.h[5]\n" + "fmla v16.8h, v21.8h, v1.h[6]\n" + "fmla v17.8h, v21.8h, v1.h[7]\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmax v14.8h, v14.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v0.8h\n" + "fmax v16.8h, v16.8h, v0.8h\n" + "fmax v17.8h, v17.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmax v14.8h, v14.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v0.8h\n" + "fmax v16.8h, v16.8h, v0.8h\n" + "fmax v17.8h, v17.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v3.8h, 
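// The "0:" loop above is a 16x8 GEMM microkernel: 16 output channels (two 8-channel
// blocks, spread across v2..v17) by 8 hw positions, accumulated over all ic*8 input
// channels and unrolled by two channels per iteration. The NHWCN16 filter layout
// keeps the 16 weights of one input channel contiguous. A scalar model of the same
// computation (acc, filt, and in_pack_k are illustrative names):
for (int k = 0; k < ic8; k++) {          // ic8 == ic * 8 packed input channels
    for (int j = 0; j < 8; j++) {        // 8 hw positions (the v0.h[j] broadcasts)
        F16 x = in_pack_k[k * 8 + j];
        for (int o = 0; o < 16; o++) {   // two 8-channel output blocks
            acc[o][j] += filt[k * 16 + o] * x;
        }
    }
}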
v3.8h, v1.8h\n" + "fmin v4.8h, v4.8h, v1.8h\n" + "fmin v5.8h, v5.8h, v1.8h\n" + "fmin v6.8h, v6.8h, v1.8h\n" + "fmin v7.8h, v7.8h, v1.8h\n" + "fmin v8.8h, v8.8h, v1.8h\n" + "fmin v9.8h, v9.8h, v1.8h\n" + "fmin v10.8h, v10.8h, v1.8h\n" + "fmin v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v1.8h\n" + "fmin v13.8h, v13.8h, v1.8h\n" + "fmin v14.8h, v14.8h, v1.8h\n" + "fmin v15.8h, v15.8h, v1.8h\n" + "fmin v16.8h, v16.8h, v1.8h\n" + "fmin v17.8h, v17.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v19.8h, v2.8h, v18.8h\n" + "fadd v20.8h, v3.8h, v18.8h\n" + "fadd v21.8h, v4.8h, v18.8h\n" + "fadd v22.8h, v5.8h, v18.8h\n" + "fadd v23.8h, v6.8h, v18.8h\n" + "fadd v24.8h, v7.8h, v18.8h\n" + "fadd v25.8h, v8.8h, v18.8h\n" + "fadd v26.8h, v9.8h, v18.8h\n" + "fmax v19.8h, v19.8h, v0.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmax v21.8h, v21.8h, v0.8h\n" + "fmax v22.8h, v22.8h, v0.8h\n" + "fmax v23.8h, v23.8h, v0.8h\n" + "fmax v24.8h, v24.8h, v0.8h\n" + "fmax v25.8h, v25.8h, v0.8h\n" + "fmax v26.8h, v26.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v1.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fmin v21.8h, v21.8h, v1.8h\n" + "fmin v22.8h, v22.8h, v1.8h\n" + "fmin v23.8h, v23.8h, v1.8h\n" + "fmin v24.8h, v24.8h, v1.8h\n" + "fmin v25.8h, v25.8h, v1.8h\n" + "fmin v26.8h, v26.8h, v1.8h\n" + "fdiv v19.8h, v19.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fdiv v21.8h, v21.8h, v1.8h\n" + "fdiv v22.8h, v22.8h, v1.8h\n" + "fdiv v23.8h, v23.8h, v1.8h\n" + "fdiv v24.8h, v24.8h, v1.8h\n" + "fdiv v25.8h, v25.8h, v1.8h\n" + "fdiv v26.8h, v26.8h, v1.8h\n" + "fmul v2.8h, v19.8h, v2.8h\n" + "fmul v3.8h, v20.8h, v3.8h\n" + "fmul v4.8h, v21.8h, v4.8h\n" + "fmul v5.8h, v22.8h, v5.8h\n" + "fmul v6.8h, v23.8h, v6.8h\n" + "fmul v7.8h, v24.8h, v7.8h\n" + "fmul v8.8h, v25.8h, v8.8h\n" + "fmul v9.8h, v26.8h, v9.8h\n" + + "fadd v19.8h, v10.8h, v18.8h\n" + "fadd v20.8h, v11.8h, v18.8h\n" + "fadd v21.8h, v12.8h, v18.8h\n" + "fadd v22.8h, v13.8h, v18.8h\n" + "fadd v23.8h, v14.8h, v18.8h\n" + "fadd v24.8h, v15.8h, v18.8h\n" + "fadd v25.8h, v16.8h, v18.8h\n" + "fadd v26.8h, v17.8h, v18.8h\n" + "fmax v19.8h, v19.8h, v0.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmax v21.8h, v21.8h, v0.8h\n" + "fmax v22.8h, v22.8h, v0.8h\n" + "fmax v23.8h, v23.8h, v0.8h\n" + "fmax v24.8h, v24.8h, v0.8h\n" + "fmax v25.8h, v25.8h, v0.8h\n" + "fmax v26.8h, v26.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v1.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fmin v21.8h, v21.8h, v1.8h\n" + "fmin v22.8h, v22.8h, v1.8h\n" + "fmin v23.8h, v23.8h, v1.8h\n" + "fmin v24.8h, v24.8h, v1.8h\n" + "fmin v25.8h, v25.8h, v1.8h\n" + "fmin v26.8h, v26.8h, v1.8h\n" + "fdiv v19.8h, v19.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fdiv v21.8h, v21.8h, v1.8h\n" + "fdiv v22.8h, v22.8h, v1.8h\n" + "fdiv v23.8h, v23.8h, v1.8h\n" + "fdiv v24.8h, v24.8h, v1.8h\n" + "fdiv v25.8h, v25.8h, v1.8h\n" + "fdiv v26.8h, v26.8h, v1.8h\n" + "fmul v10.8h, v19.8h, v10.8h\n" + "fmul v11.8h, v20.8h, v11.8h\n" + "fmul v12.8h, v21.8h, v12.8h\n" + "fmul v13.8h, v22.8h, v13.8h\n" + "fmul v14.8h, v23.8h, v14.8h\n" + "fmul v15.8h, v24.8h, v15.8h\n" + "fmul v16.8h, v25.8h, v16.8h\n" + "fmul v17.8h, v26.8h, v17.8h\n" + + "13:\n" + "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%[out_0]], #64\n" + "st1 {v6.8h, v7.8h, v8.8h, v9.8h}, [%[out_0]], #64\n" + "st1 {v10.8h, v11.8h, v12.8h, v13.8h}, [%[out_1]], #64\n" + "st1 {v14.8h, v15.8h, v16.8h, v17.8h}, [%[out_1]], 
#64\n" + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "x0", "x1", "x2", "x3"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = pwFilterArray + (oc - 1) * 8 * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = pwBiasArray + (oc - 1) * 8; + __asm__ __volatile__( + "ldr q12, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr q0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v12.16b\n" // out_o0hw0 + "mov v3.16b, v12.16b\n" // out_o0hw1 + "mov v4.16b, v12.16b\n" // out_o0hw2 + "ldr q10, [%[f_0]]\n" // f_o0c0 + "mov v5.16b, v12.16b\n" // out_o0hw3 + "mov v6.16b, v12.16b\n" // out_o0hw4 + "mov v7.16b, v12.16b\n" // out_o0hw5 + "mov v8.16b, v12.16b\n" // out_o0hw6 + "mov v9.16b, v12.16b\n" // out_o0hw7 + "0:\n" + "ldr q1, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v10.8h, v0.h[0]\n" + "fmla v3.8h, v10.8h, v0.h[1]\n" + "fmla v4.8h, v10.8h, v0.h[2]\n" + "ldr q11, [%[f_0], #16]\n" // f_o0c0 + "fmla v5.8h, v10.8h, v0.h[3]\n" + "fmla v6.8h, v10.8h, v0.h[4]\n" + "fmla v7.8h, v10.8h, v0.h[5]\n" + "subs x0, x0, #2\n" + "fmla v8.8h, v10.8h, v0.h[6]\n" + "fmla v9.8h, v10.8h, v0.h[7]\n" + + "ldr q0, [%[in_0], #32]\n" // in_hw0 + "fmla v2.8h, v11.8h, v1.h[0]\n" + "fmla v3.8h, v11.8h, v1.h[1]\n" + "fmla v4.8h, v11.8h, v1.h[2]\n" + "ldr q10, [%[f_0], #32]\n" // f_o0c0 + "fmla v5.8h, v11.8h, v1.h[3]\n" + "fmla v6.8h, v11.8h, v1.h[4]\n" + "fmla v7.8h, v11.8h, v1.h[5]\n" + "add %[in_0], %[in_0], #32\n" + "fmla v8.8h, v11.8h, v1.h[6]\n" + "add %[f_0], %[f_0], #32\n" + "fmla v9.8h, v11.8h, v1.h[7]\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v3.8h, v3.8h, v1.8h\n" + "fmin v4.8h, v4.8h, v1.8h\n" + "fmin v5.8h, v5.8h, v1.8h\n" + "fmin v6.8h, v6.8h, v1.8h\n" + "fmin v7.8h, v7.8h, v1.8h\n" + "fmin v8.8h, v8.8h, v1.8h\n" + "fmin v9.8h, v9.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v10.8h, #0x42, lsl #8\n" // three + "fadd v11.8h, v2.8h, v10.8h\n" + "fadd v12.8h, v3.8h, v10.8h\n" + "fadd v13.8h, v4.8h, v10.8h\n" + "fadd v14.8h, v5.8h, v10.8h\n" + "fadd v15.8h, v6.8h, v10.8h\n" + "fadd v16.8h, v7.8h, v10.8h\n" + "fadd 
v17.8h, v8.8h, v10.8h\n" + "fadd v18.8h, v9.8h, v10.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmax v14.8h, v14.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v0.8h\n" + "fmax v16.8h, v16.8h, v0.8h\n" + "fmax v17.8h, v17.8h, v0.8h\n" + "fmax v18.8h, v18.8h, v0.8h\n" + "fmin v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v1.8h\n" + "fmin v13.8h, v13.8h, v1.8h\n" + "fmin v14.8h, v14.8h, v1.8h\n" + "fmin v15.8h, v15.8h, v1.8h\n" + "fmin v16.8h, v16.8h, v1.8h\n" + "fmin v17.8h, v17.8h, v1.8h\n" + "fmin v18.8h, v18.8h, v1.8h\n" + "fdiv v11.8h, v11.8h, v1.8h\n" + "fdiv v12.8h, v12.8h, v1.8h\n" + "fdiv v13.8h, v13.8h, v1.8h\n" + "fdiv v14.8h, v14.8h, v1.8h\n" + "fdiv v15.8h, v15.8h, v1.8h\n" + "fdiv v16.8h, v16.8h, v1.8h\n" + "fdiv v17.8h, v17.8h, v1.8h\n" + "fdiv v18.8h, v18.8h, v1.8h\n" + "fmul v2.8h, v11.8h, v2.8h\n" + "fmul v3.8h, v12.8h, v3.8h\n" + "fmul v4.8h, v13.8h, v4.8h\n" + "fmul v5.8h, v14.8h, v5.8h\n" + "fmul v6.8h, v15.8h, v6.8h\n" + "fmul v7.8h, v16.8h, v7.8h\n" + "fmul v8.8h, v17.8h, v8.8h\n" + "fmul v9.8h, v18.8h, v9.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q6, [%[out_0], #64]\n" // out_o0hw4 + "str q7, [%[out_0], #80]\n" // out_o0hw5 + "str q8, [%[out_0], #96]\n" // out_o0hw6 + "str q9, [%[out_0], #112]\n" // out_o0hw7 + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "x0", "x1", "x2"); + } + } + + // ohow_remainder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + for (I32 hw = ohow_s; hw < ohow - 3; hw += 4) { + const F16 *b0 = pwBiasArray; + const F16 *b1 = b0 + 8; + const F16 *f_o0c0 = pwFilterArray; + F16 *in_pack = pwArray + hw * ic * 8; + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr q22, [%[b_0]]\n" // b_o0 + "ldr q23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "mov v5.16b, v22.16b\n" // out_o0hw3 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ldr q19, [%[f_0], #16]\n" // f_o1c0 + "mov v11.16b, v23.16b\n" // out_o1hw1 + "mov v12.16b, v23.16b\n" // out_o1hw2 + "mov v13.16b, v23.16b\n" // out_o1hw3 + "0:\n" + "ldr d1, [%[in_0], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr q20, [%[f_0], #32]\n" // f_o0c0 + "fmla v3.8h, v18.8h, v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "ldr q21, [%[f_0], #48]\n" // f_o1c0 + "fmla v10.8h, v19.8h, v0.h[0]\n" + "fmla v11.8h, v19.8h, v0.h[1]\n" + "fmla v12.8h, v19.8h, v0.h[2]\n" + "subs x0, x0, #2\n" + "fmla v13.8h, v19.8h, v0.h[3]\n" + + "ldr d0, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr q18, [%[f_0], #64]\n" // f_o0c0 + "fmla v3.8h, v20.8h, v1.h[1]\n" + "fmla v4.8h,
v20.8h, v1.h[2]\n" + "ldr q19, [%[f_0], #80]\n" // f_o1c0 + "fmla v5.8h, v20.8h, v1.h[3]\n" + "fmla v10.8h, v21.8h, v1.h[0]\n" + "fmla v11.8h, v21.8h, v1.h[1]\n" + "fmla v12.8h, v21.8h, v1.h[2]\n" + "add %[in_0], %[in_0], #16\n" + "fmla v13.8h, v21.8h, v1.h[3]\n" + "add %[f_0], %[f_0], #64\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v3.8h, v3.8h, v1.8h\n" + "fmin v4.8h, v4.8h, v1.8h\n" + "fmin v5.8h, v5.8h, v1.8h\n" + "fmin v10.8h, v10.8h, v1.8h\n" + "fmin v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v1.8h\n" + "fmin v13.8h, v13.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v6.8h, v2.8h, v18.8h\n" + "fadd v7.8h, v3.8h, v18.8h\n" + "fadd v8.8h, v4.8h, v18.8h\n" + "fadd v9.8h, v5.8h, v18.8h\n" + "fadd v19.8h, v10.8h, v18.8h\n" + "fadd v20.8h, v11.8h, v18.8h\n" + "fadd v21.8h, v12.8h, v18.8h\n" + "fadd v22.8h, v13.8h, v18.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmax v19.8h, v19.8h, v0.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmax v21.8h, v21.8h, v0.8h\n" + "fmax v22.8h, v22.8h, v0.8h\n" + "fmin v6.8h, v6.8h, v1.8h\n" + "fmin v7.8h, v7.8h, v1.8h\n" + "fmin v8.8h, v8.8h, v1.8h\n" + "fmin v9.8h, v9.8h, v1.8h\n" + "fmin v19.8h, v19.8h, v1.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fmin v21.8h, v21.8h, v1.8h\n" + "fmin v22.8h, v22.8h, v1.8h\n" + "fdiv v6.8h, v6.8h, v1.8h\n" + "fdiv v7.8h, v7.8h, v1.8h\n" + "fdiv v8.8h, v8.8h, v1.8h\n" + "fdiv v9.8h, v9.8h, v1.8h\n" + "fdiv v19.8h, v19.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fdiv v21.8h, v21.8h, v1.8h\n" + "fdiv v22.8h, v22.8h, v1.8h\n" + "fmul v2.8h, v6.8h, v2.8h\n" + "fmul v3.8h, v7.8h, v3.8h\n" + "fmul v4.8h, v8.8h, v4.8h\n" + "fmul v5.8h, v9.8h, v5.8h\n" + "fmul v10.8h, v19.8h, v10.8h\n" + "fmul v11.8h, v20.8h, v11.8h\n" + "fmul v12.8h, v21.8h, v12.8h\n" + "fmul v13.8h, v22.8h, v13.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q10, [%[out_1]]\n" // out_o1hw0 + "str q11, [%[out_1], #16]\n" // out_o1hw1 + "str q12, [%[out_1], #32]\n" // out_o1hw2 + "str q13, [%[out_1], #48]\n" // out_o1hw3 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", 
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v18", "v19", "v20", "v21", "v22", "v23", "x0", + "x1", "x2", "x3"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = pwFilterArray + (oc - 1) * 8 * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = pwBiasArray + (oc - 1) * 8; + __asm__ __volatile__( + "ldr q22, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "mov v5.16b, v22.16b\n" // out_o0hw3 + "0:\n" + "ldr d1, [%[in_0], #8]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr q20, [%[f_0], #16]\n" // f_o0c0 + "fmla v3.8h, v18.8h, v0.h[1]\n" + "fmla v4.8h, v18.8h, v0.h[2]\n" + "fmla v5.8h, v18.8h, v0.h[3]\n" + "subs x0, x0, #2\n" + + "ldr d0, [%[in_0], #16]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr q18, [%[f_0], #32]\n" // f_o0c0 + "fmla v3.8h, v20.8h, v1.h[1]\n" + "fmla v4.8h, v20.8h, v1.h[2]\n" + "fmla v5.8h, v20.8h, v1.h[3]\n" + "add %[in_0], %[in_0], #16\n" + "add %[f_0], %[f_0], #32\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v3.8h, v3.8h, v0.8h\n" + "fmax v4.8h, v4.8h, v0.8h\n" + "fmax v5.8h, v5.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v3.8h, v3.8h, v1.8h\n" + "fmin v4.8h, v4.8h, v1.8h\n" + "fmin v5.8h, v5.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v6.8h, v2.8h, v18.8h\n" + "fadd v7.8h, v3.8h, v18.8h\n" + "fadd v8.8h, v4.8h, v18.8h\n" + "fadd v9.8h, v5.8h, v18.8h\n" + "fmax v6.8h, v6.8h, v0.8h\n" + "fmax v7.8h, v7.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" + "fmax v9.8h, v9.8h, v0.8h\n" + "fmin v6.8h, v6.8h, v1.8h\n" + "fmin v7.8h, v7.8h, v1.8h\n" + "fmin v8.8h, v8.8h, v1.8h\n" + "fmin v9.8h, v9.8h, v1.8h\n" + "fdiv v6.8h, v6.8h, v1.8h\n" + "fdiv v7.8h, v7.8h, v1.8h\n" + "fdiv v8.8h, v8.8h, v1.8h\n" + "fdiv v9.8h, v9.8h, v1.8h\n" + "fmul v2.8h, v6.8h, v2.8h\n" + "fmul v3.8h, v7.8h, v3.8h\n" + "fmul v4.8h, v8.8h, v4.8h\n" + "fmul v5.8h, v9.8h, v5.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v18", "v20", "v22", "x0", "x1", "x2"); + } + } + + // ohow_reminder % 4 + ohow_s = (ohow / 4) * 4; + for (I32 hw = ohow_s; hw < ohow; hw++) { + const F16 *b0 = pwBiasArray; + const F16 *b1 = b0 + 8; + const F16 *f_o0c0 = pwFilterArray; + F16 
*in_pack = pwArray + hw * ic * 8; + for (I32 o = 0; o < I32(oc - 1); o += 2) { + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; + // bias + const F16 *b_o0 = b0; + const F16 *b_o1 = b1; + __asm__ __volatile__( + "ldr q22, [%[b_0]]\n" // b_o0 + "ldr q23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "ldr h0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ldr q19, [%[f_0], #16]\n" // f_o1c0 + "0:\n" + "ldr h1, [%[in_0], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr q20, [%[f_0], #32]\n" // f_o0c0 + "fmla v10.8h, v19.8h, v0.h[0]\n" + "ldr q21, [%[f_0], #48]\n" // f_o1c0 + "subs x0, x0, #2\n" + + "ldr h0, [%[in_0], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr q18, [%[f_0], #64]\n" // f_o0c0 + "fmla v10.8h, v21.8h, v1.h[0]\n" + "ldr q19, [%[f_0], #80]\n" // f_o1c0 + "add %[in_0], %[in_0], #4\n" + "add %[f_0], %[f_0], #64\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + "fmin v10.8h, v10.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v19.8h, v2.8h, v18.8h\n" + "fadd v20.8h, v10.8h, v18.8h\n" + "fmax v19.8h, v19.8h, v0.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v1.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fdiv v19.8h, v19.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fmul v2.8h, v19.8h, v2.8h\n" + "fmul v10.8h, v20.8h, v10.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q10, [%[out_1]]\n" // out_o1hw0 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v19", "v20", "v21", "v22", + "v23", "x0", "x1", "x2", "x3"); + b0 += 16; + b1 += 16; + } + if (oc & 1) { + // oc%2 != 0 + const F16 *f_r = pwFilterArray + (oc - 1) * 8 * ic * 8; + F16 *in_hw0 = in_pack; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; + // bias + const F16 *b_o0 = pwBiasArray + (oc - 1) * 8; + __asm__ __volatile__( + "ldr q22, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr h0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "0:\n" + "ldr h1, [%[in_0], #2]\n" // in_hw0 + "fmla v2.8h, v18.8h, v0.h[0]\n" + "ldr q20, [%[f_0], #16]\n" // f_o0c0 + "subs x0, x0, #2\n" + + "ldr h0, [%[in_0], #4]\n" // in_hw0 + "fmla v2.8h, v20.8h, v1.h[0]\n" + "ldr q18, [%[f_0], #32]\n" // f_o0c0 + "add %[in_0], %[in_0], #4\n" + "add %[f_0], %[f_0], #32\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v2.8h, v2.8h, v0.8h\n" + + "11:\n" + "cmp 
%[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "fmax v2.8h, v2.8h, v0.8h\n" + "fmin v2.8h, v2.8h, v1.8h\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three + "fadd v20.8h, v2.8h, v18.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fdiv v20.8h, v20.8h, v1.8h\n" + "fmul v2.8h, v20.8h, v2.8h\n" + + "13:\n" + "str q2, [%[out_0]]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v20", "v22", "x0", "x1", + "x2"); + } + } + } + return SUCCESS; +} diff --git a/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h similarity index 51% rename from tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h rename to compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h index 82877ff9..8e684953 100644 --- a/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h +++ b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h @@ -1,35 +1,36 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- #ifndef _H_DEPTHWISE_POINTWISE_CONVOLUTION_DIRECT_NO_PADDING #define _H_DEPTHWISE_POINTWISE_CONVOLUTION_DIRECT_NO_PADDING #include - #include "sys.h" -#include "type.h" +#include "types.h" #include "error.h" -#include "tensor_desc.h" -#include "tensor_computing_type.h" - inline void calc_eight_channel_elements(I32 hw, - I32 ih_base, I32 ih, I32 iw, - I32 fh, I32 fw, + I32 ih_base, + I32 ih, + I32 iw, + I32 fh, + I32 fw, I32 ow, F16 *inArray, - I32 strideH, I32 strideW, I32 paddingT, I32 paddingL, + I32 strideH, + I32 strideW, + I32 paddingT, + I32 paddingL, const F16 *filterArray, float16x8_t bias, F16 *output) @@ -40,7 +41,7 @@ inline void calc_eight_channel_elements(I32 hw, I32 ih_start = h * strideH - paddingT; I32 iw_start = w * strideW - paddingL; I32 fh_start = 0; - if (ih_start < 0) { + if (ih_start < 0) { fh_start -= ih_start; } I32 fw_start = 0; @@ -49,14 +50,16 @@ inline void calc_eight_channel_elements(I32 hw, } for (I32 fh_idx = fh_start; fh_idx < fh; fh_idx++) { I32 ih_idx = ih_start + fh_idx; - if (ih_idx >= ih) + if (ih_idx >= ih) { break; + } I32 iw_base = ((ih_base + ih_idx) * iw); I32 filter_index = (fh_idx * fw + fw_start) * 8; - for (I32 fw_idx = fw_start; fw_idx < fw; fw_idx++, filter_index+=8) { + for (I32 fw_idx = fw_start; fw_idx < fw; fw_idx++, filter_index += 8) { I32 iw_idx = iw_start + fw_idx; - if (iw_idx >= iw) + if (iw_idx >= iw) { break; + } { U32 in_index = (iw_base + iw_idx) * 8; float16x8_t v1 = vld1q_f16(inArray + in_index); @@ -68,55 +71,74 @@ inline void calc_eight_channel_elements(I32 hw, vst1q_f16(output, v0); } -EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc); +EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec); -EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc); +EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec); -inline EE depthwise_pointwise_convolution_direct_no_padding(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc 
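// Worked example of the boundary clipping above, assuming paddingT = 1 and
// strideH = 1: output row h = 0 gives ih_start = 0 * 1 - 1 = -1, so
// fh_start = 0 - (-1) = 1 and the filter row that would read input row -1 is
// skipped; filter rows running past the bottom are cut by the ih_idx >= ih break
// (and fw_start / iw_idx >= iw do the same for columns). The kernel therefore
// never reads out-of-bounds input, which is what lets this variant skip the
// padded input copy entirely ("no_padding").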
biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc, +inline EE depthwise_pointwise_convolution_direct_no_padding(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, Arch arch) { EE ret = SUCCESS; switch (arch) { case ARM_A55: ret = depthwise_pointwise_convolution_direct_no_padding_A55(inputDesc, inArray, - filterDesc, filterArray, - convDesc, - biasDesc, biasArray, - tmpBytes, tmp, - outputDesc, outArray, - depthwiseActivationDesc, - pointwiseActivationDesc); + dwFilterDesc, dwFilterArray, pwFilterDesc, pwFilterArray, convParamSpec, dwBiasDesc, + dwBiasArray, pwBiasDesc, pwBiasArray, tmpBytes, tmp, outputDesc, outArray, + depthwiseActivationParamSpec, pointwiseActivationParamSpec); break; case ARM_A76: ret = depthwise_pointwise_convolution_direct_no_padding_A76(inputDesc, inArray, - filterDesc, filterArray, - convDesc, - biasDesc, biasArray, - tmpBytes, tmp, - outputDesc, outArray, - depthwiseActivationDesc, - pointwiseActivationDesc); + dwFilterDesc, dwFilterArray, pwFilterDesc, pwFilterArray, convParamSpec, dwBiasDesc, + dwBiasArray, pwBiasDesc, pwBiasArray, tmpBytes, tmp, outputDesc, outArray, + depthwiseActivationParamSpec, pointwiseActivationParamSpec); break; default: return NOT_SUPPORTED; diff --git a/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A55.cpp b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A55.cpp similarity index 64% rename from tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A55.cpp rename to compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A55.cpp index 270ca2c3..e86fe2c9 100644 --- a/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A55.cpp +++ b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A55.cpp @@ -1,30 +1,37 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include "cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h" #include "cpu/arm/fp16/arm_functions_fp16.h" -EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc) +EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec) { - UNUSED(biasDesc); UNUSED(tmpBytes); DataType idt, fdt, odt; @@ -33,88 +40,68 @@ EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F U32 fn, fc, fh, fw; U32 on, oc, oh, ow; CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingL = convDesc.padding_left; + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingL = convParamSpec.padding_left; - if (fdf != DF_CHWC8_NCN16) + if (dwFilterDesc.df != DF_NCHWC8 || pwFilterDesc.df != DF_NHWCN16) { CHECK_STATUS(NOT_MATCH); + } oc /= 8; ic /= 8; - I32 ohow = oh*ow; - F16 *pwArray = (F16*)tmp; + I32 ohow = oh * ow; + F16 *pwArray = (F16 *)tmp; F16 buffer[8]; for (U32 n = 0; n < in; n++) { for (U32 c = 0; c < ic; c++) { - const F16 *f = filterArray + c*fh*fw*8; - const F16 *b = biasArray + c*8; + const F16 *f = dwFilterArray + c * fh * fw * 8; + const F16 *b = dwBiasArray + c * 8; float16x8_t vv0 = vld1q_f16(b); I32 iter = 0; U32 ih_base = ((n * ic) + c) * ih; // nhwchw8 - for (; iter < ohow-7; iter += 8) { + for (; iter < ohow - 7; iter += 8) { U32 out_base = iter * ic * 8 + c * 8 * 8; for (I32 j = 0; j < 8; j++) { I32 hw = iter + j; - calc_eight_channel_elements(hw, - ih_base, ih, iw, - fh, fw, - ow, - inArray, - strideH, strideW, paddingT, paddingL, - f, 
- vv0, - buffer); - CHECK_STATUS(activation_fp16(buffer, 8, depthwiseActivationDesc, buffer)); + calc_eight_channel_elements(hw, ih_base, ih, iw, fh, fw, ow, inArray, strideH, + strideW, paddingT, paddingL, f, vv0, buffer); + CHECK_STATUS(activation_fp16(buffer, 8, depthwiseActivationParamSpec, buffer)); U32 out_index = out_base + j; - for (I32 i = 0; i < 8; i++, out_index+=8) { + for (I32 i = 0; i < 8; i++, out_index += 8) { pwArray[out_index] = buffer[i]; } } } // nhwchw4 - for (; iter < ohow-3; iter += 4) { + for (; iter < ohow - 3; iter += 4) { U32 out_base = iter * ic * 8 + c * 8 * 4; for (I32 j = 0; j < 4; j++) { I32 hw = iter + j; - calc_eight_channel_elements(hw, - ih_base, ih, iw, - fh, fw, - ow, - inArray, - strideH, strideW, paddingT, paddingL, - f, - vv0, - buffer); - CHECK_STATUS(activation_fp16(buffer, 8, depthwiseActivationDesc, buffer)); + calc_eight_channel_elements(hw, ih_base, ih, iw, fh, fw, ow, inArray, strideH, + strideW, paddingT, paddingL, f, vv0, buffer); + CHECK_STATUS(activation_fp16(buffer, 8, depthwiseActivationParamSpec, buffer)); U32 out_index = out_base + j; - for (I32 i = 0; i < 8; i++, out_index+=4) { + for (I32 i = 0; i < 8; i++, out_index += 4) { pwArray[out_index] = buffer[i]; } } } // nhwchw1 - for (; iter < ohow; iter ++) { + for (; iter < ohow; iter++) { U32 out_base = iter * ic * 8 + c * 8; for (I32 j = 0; j < 1; j++) { I32 hw = iter + j; - calc_eight_channel_elements(hw, - ih_base, ih, iw, - fh, fw, - ow, - inArray, - strideH, strideW, paddingT, paddingL, - f, - vv0, - buffer); - CHECK_STATUS(activation_fp16(buffer, 8, depthwiseActivationDesc, buffer)); + calc_eight_channel_elements(hw, ih_base, ih, iw, fh, fw, ow, inArray, strideH, + strideW, paddingT, paddingL, f, vv0, buffer); + CHECK_STATUS(activation_fp16(buffer, 8, depthwiseActivationParamSpec, buffer)); U32 out_index = out_base + j; for (I32 i = 0; i < 8; i++, out_index++) { pwArray[out_index] = buffer[i]; @@ -125,65 +112,65 @@ EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F // pw_conv // ohow / 8 - for (I32 hw = 0; hw < ohow-7; hw+=8) { - const F16 *b0 = biasArray + ic*8; + for (I32 hw = 0; hw < ohow - 7; hw += 8) { + const F16 *b0 = pwBiasArray; const F16 *b1 = b0 + 8; - F16 *in_pack = pwArray + hw*ic*8; - const F16 *f_o0c0 = filterArray + ic*fh*fw*8; - for (I32 o = 0; o < I32(oc-1); o+=2) { + F16 *in_pack = pwArray + hw * ic * 8; + const F16 *f_o0c0 = pwFilterArray; + for (I32 o = 0; o < I32(oc - 1); o += 2) { F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; // bias const F16 *b_o0 = b0; const F16 *b_o1 = b1; __asm__ __volatile__( - "ldr d22, [%[b_0]]\n" //b_o0 + "ldr d22, [%[b_0]]\n" // b_o0 "ldr x1, [%[b_0], #8]\n" "ins v22.d[1], x1\n" - "ldr d23, [%[b_1]]\n" //b_o1 + "ldr d23, [%[b_1]]\n" // b_o1 "ldr x2, [%[b_1], #8]\n" "ins v23.d[1], x2\n" - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 "ldr x1, [%[in_0], #8]\n" - "mov v4.16b, v22.16b\n" //out_o0hw2 + "mov v4.16b, v22.16b\n" // out_o0hw2 "ins v0.d[1], x1\n" - "mov v5.16b, v22.16b\n" //out_o0hw3 - "ldr d18, [%[f_0]]\n" //f_o0c0 - "mov v6.16b, v22.16b\n" //out_o0hw4 + "mov v5.16b, v22.16b\n" 
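The depthwise stage does not write its result in the output tensor layout; it transposes each 8-channel result into pwArray so the pointwise kernel can later read one contiguous run per hw tile. A scalar sketch of the 8-wide pack, with the index math copied from the loop above (the helper name is hypothetical, not from the patch):

typedef __fp16 F16;
typedef uint32_t U32;

// Pack position j of an 8-wide hw tile starting at `iter` (nhwchw8 block).
static inline void pack_hw8(
    F16 *pwArray, const F16 *buffer, U32 iter, U32 c, U32 ic, U32 j)
{
    U32 out_base = iter * ic * 8 + c * 8 * 8;  // 8 channels x 8 positions per block
    for (U32 i = 0; i < 8; i++) {
        pwArray[out_base + j + i * 8] = buffer[i];  // channel i, position j
    }
}

The 4-wide and 1-wide tails use the same scheme with inner strides 4 and 1, which is why the pointwise kernels below can index the packed data uniformly as pwArray + hw * ic * 8.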
// out_o0hw3 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v6.16b, v22.16b\n" // out_o0hw4 "ldr x2, [%[f_0], #8]\n" - "mov v7.16b, v22.16b\n" //out_o0hw5 + "mov v7.16b, v22.16b\n" // out_o0hw5 "ins v18.d[1], x2\n" - "mov v8.16b, v22.16b\n" //out_o0hw6 - "ldr d19, [%[f_0], #16]\n" //f_o1c0 - "mov v9.16b, v22.16b\n" //out_o0hw7 + "mov v8.16b, v22.16b\n" // out_o0hw6 + "ldr d19, [%[f_0], #16]\n" // f_o1c0 + "mov v9.16b, v22.16b\n" // out_o0hw7 "ldr x3, [%[f_0], #24]\n" - "mov v10.16b, v23.16b\n" //out_o1hw0 + "mov v10.16b, v23.16b\n" // out_o1hw0 "ins v19.d[1], x3\n" - "mov v11.16b, v23.16b\n" //out_o1hw1 - "mov v12.16b, v23.16b\n" //out_o1hw2 - "mov v13.16b, v23.16b\n" //out_o1hw3 - "mov v14.16b, v23.16b\n" //out_o1hw4 - "mov v15.16b, v23.16b\n" //out_o1hw5 - "mov v16.16b, v23.16b\n" //out_o1hw6 - "mov v17.16b, v23.16b\n" //out_o1hw7 + "mov v11.16b, v23.16b\n" // out_o1hw1 + "mov v12.16b, v23.16b\n" // out_o1hw2 + "mov v13.16b, v23.16b\n" // out_o1hw3 + "mov v14.16b, v23.16b\n" // out_o1hw4 + "mov v15.16b, v23.16b\n" // out_o1hw5 + "mov v16.16b, v23.16b\n" // out_o1hw6 + "mov v17.16b, v23.16b\n" // out_o1hw7 "0:\n" - "ldr d1, [%[in_0], #16]\n" //in_hw0 + "ldr d1, [%[in_0], #16]\n" // in_hw0 "fmla v2.8h, v18.8h, v0.h[0]\n" "ldr x1, [%[in_0], #24]\n" "fmla v3.8h, v18.8h, v0.h[1]\n" "ins v1.d[1], x1\n" "fmla v4.8h, v18.8h, v0.h[2]\n" - "ldr d20, [%[f_0], #32]\n" //f_o0c0 + "ldr d20, [%[f_0], #32]\n" // f_o0c0 "fmla v5.8h, v18.8h, v0.h[3]\n" "ldr x2, [%[f_0], #40]\n" "fmla v6.8h, v18.8h, v0.h[4]\n" "ins v20.d[1], x2\n" "fmla v7.8h, v18.8h, v0.h[5]\n" - "ldr d21, [%[f_0], #48]\n" //f_o1c0 + "ldr d21, [%[f_0], #48]\n" // f_o1c0 "fmla v8.8h, v18.8h, v0.h[6]\n" "ldr x3, [%[f_0], #56]\n" "fmla v9.8h, v18.8h, v0.h[7]\n" @@ -197,19 +184,19 @@ EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F "fmla v16.8h, v19.8h, v0.h[6]\n" "fmla v17.8h, v19.8h, v0.h[7]\n" - "ldr d0, [%[in_0], #32]\n" //in_hw0 + "ldr d0, [%[in_0], #32]\n" // in_hw0 "fmla v2.8h, v20.8h, v1.h[0]\n" "ldr x1, [%[in_0], #40]\n" "fmla v3.8h, v20.8h, v1.h[1]\n" "ins v0.d[1], x1\n" "fmla v4.8h, v20.8h, v1.h[2]\n" - "ldr d18, [%[f_0], #64]\n" //f_o0c0 + "ldr d18, [%[f_0], #64]\n" // f_o0c0 "fmla v5.8h, v20.8h, v1.h[3]\n" "ldr x2, [%[f_0], #72]\n" "fmla v6.8h, v20.8h, v1.h[4]\n" "ins v18.d[1], x2\n" "fmla v7.8h, v20.8h, v1.h[5]\n" - "ldr d19, [%[f_0], #80]\n" //f_o1c0 + "ldr d19, [%[f_0], #80]\n" // f_o1c0 "fmla v8.8h, v20.8h, v1.h[6]\n" "ldr x3, [%[f_0], #88]\n" "fmla v9.8h, v20.8h, v1.h[7]\n" @@ -229,7 +216,7 @@ EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F "cmp %[pointwiseActivationMode], %[am_relu]\n" "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero + "eor v0.16b, v0.16b, v0.16b\n" // zero "fmax v2.8h, v2.8h, v0.8h\n" "fmax v3.8h, v3.8h, v0.8h\n" "fmax v4.8h, v4.8h, v0.8h\n" @@ -250,8 +237,8 @@ EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F "11:\n" "cmp %[pointwiseActivationMode], %[am_relu6]\n" "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six "fmax v2.8h, v2.8h, v0.8h\n" "fmax v3.8h, v3.8h, v0.8h\n" "fmax v4.8h, v4.8h, v0.8h\n" @@ -288,9 +275,9 @@ EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F "12:\n" "cmp %[pointwiseActivationMode], %[am_h_swish]\n" "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three + "eor 
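The ReLU6 path clamps branch-free with fmax against zero and fmin against six; the six is materialized by movi v1.8h, #0x46, lsl #8, i.e. the per-lane bit pattern 0x4600, which is 6.0 in IEEE binary16 (and 0x4200, loaded for h-swish below, is 3.0). A quick host-side check, assuming an AArch64 toolchain where __fp16 uses the IEEE half storage format:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main()
{
    uint16_t bits6 = 0x4600, bits3 = 0x4200;  // movi #0x46,lsl#8 / #0x42,lsl#8
    __fp16 six, three;
    std::memcpy(&six, &bits6, sizeof(six));    // reinterpret the raw half bits
    std::memcpy(&three, &bits3, sizeof(three));
    std::printf("%.1f %.1f\n", (float)six, (float)three);  // prints: 6.0 3.0
    return 0;
}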
v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three "fadd v19.8h, v2.8h, v18.8h\n" "fadd v20.8h, v3.8h, v18.8h\n" "fadd v21.8h, v4.8h, v18.8h\n" @@ -378,56 +365,50 @@ EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F "st1 {v6.8h, v7.8h, v8.8h, v9.8h}, [%[out_0]], #64\n" "st1 {v10.8h, v11.8h, v12.8h, v13.8h}, [%[out_1]], #64\n" "st1 {v14.8h, v15.8h, v16.8h, v17.8h}, [%[out_1]], #64\n" - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "x0", "x1", "x2", "x3" - ); + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "x0", "x1", "x2", "x3"); b0 += 16; b1 += 16; } if (oc & 1) { // oc%2 != 0 - const F16 *f_r = filterArray + ic*fh*fw*8 + (oc-1)*8*ic*8; + const F16 *f_r = pwFilterArray + (oc - 1) * 8 * ic * 8; F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; // bias - const F16 *b_o0 = biasArray + ic*8 + (oc-1)*8; + const F16 *b_o0 = pwBiasArray + (oc - 1) * 8; __asm__ __volatile__( - "ldr q12, [%[b_0]]\n" //b_o0 - "mov x0, %[ic]\n" // ic_blk - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v12.16b\n" //out_o0hw0 + "ldr q12, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v12.16b\n" // out_o0hw0 "ldr x1, [%[in_0], #8]\n" - "mov v3.16b, v12.16b\n" //out_o0hw1 + "mov v3.16b, v12.16b\n" // out_o0hw1 "ins v0.d[1], x1\n" - "mov v4.16b, v12.16b\n" //out_o0hw2 - "ldr d10, [%[f_0]]\n" //f_o0c0 - "mov v5.16b, v12.16b\n" //out_o0hw3 + "mov v4.16b, v12.16b\n" // out_o0hw2 + "ldr d10, [%[f_0]]\n" // f_o0c0 + "mov v5.16b, v12.16b\n" // out_o0hw3 "ldr x2, [%[f_0], #8]\n" - "mov v6.16b, v12.16b\n" //out_o0hw4 + "mov v6.16b, v12.16b\n" // out_o0hw4 "ins v10.d[1], x2\n" - "mov v7.16b, v12.16b\n" //out_o0hw5 - "mov v8.16b, v12.16b\n" //out_o0hw6 - "mov v9.16b, v12.16b\n" //out_o0hw7 + "mov v7.16b, v12.16b\n" // out_o0hw5 + "mov v8.16b, v12.16b\n" // out_o0hw6 + "mov v9.16b, v12.16b\n" // out_o0hw7 "0:\n" - "ldr d1, [%[in_0], #16]\n" //in_hw0 + "ldr d1, [%[in_0], #16]\n" // in_hw0 "fmla v2.8h, v10.8h, v0.h[0]\n" "ldr x1, [%[in_0], #24]\n" "fmla v3.8h, v10.8h, v0.h[1]\n" "ins v1.d[1], x1\n" "fmla v4.8h, v10.8h, v0.h[2]\n" - "ldr d11, [%[f_0], #16]\n" //f_o0c0 + "ldr d11, [%[f_0], #16]\n" // f_o0c0 "fmla v5.8h, v10.8h, v0.h[3]\n" "ldr x2, [%[f_0], #24]\n" "fmla v6.8h, v10.8h, v0.h[4]\n" @@ -437,13 +418,13 @@ EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc 
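The ACTIVATION_H_SWISH branch computes the standard hard-swish, x * clamp(x + 3, 0, 6) / 6, which is why it loads both the three and six constants before the fadd/fmax/fmin sequence. A scalar reference of what the vector code computes per lane, assuming ARMv8.2 __fp16 arithmetic (the asm interleaves these steps across many registers at once):

typedef __fp16 F16;  // assumed project typedef

static inline F16 h_swish(F16 x)
{
    F16 t = (F16)(x + (F16)3.0f);              // fadd vN, vx, v18 (three)
    t = t < (F16)0.0f ? (F16)0.0f : t;         // fmax vN, vN, v0 (zero)
    t = t > (F16)6.0f ? (F16)6.0f : t;         // fmin vN, vN, v1 (six)
    return (F16)(x * t / (F16)6.0f);           // final scale and multiply
}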
inputDesc, F "fmla v8.8h, v10.8h, v0.h[6]\n" "fmla v9.8h, v10.8h, v0.h[7]\n" - "ldr d0, [%[in_0], #32]\n" //in_hw0 + "ldr d0, [%[in_0], #32]\n" // in_hw0 "fmla v2.8h, v11.8h, v1.h[0]\n" "ldr x1, [%[in_0], #40]\n" "fmla v3.8h, v11.8h, v1.h[1]\n" "ins v0.d[1], x1\n" "fmla v4.8h, v11.8h, v1.h[2]\n" - "ldr d10, [%[f_0], #32]\n" //f_o0c0 + "ldr d10, [%[f_0], #32]\n" // f_o0c0 "fmla v5.8h, v11.8h, v1.h[3]\n" "ldr x2, [%[f_0], #40]\n" "fmla v6.8h, v11.8h, v1.h[4]\n" @@ -457,7 +438,7 @@ EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F "cmp %[pointwiseActivationMode], %[am_relu]\n" "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero + "eor v0.16b, v0.16b, v0.16b\n" // zero "fmax v2.8h, v2.8h, v0.8h\n" "fmax v3.8h, v3.8h, v0.8h\n" "fmax v4.8h, v4.8h, v0.8h\n" @@ -470,8 +451,8 @@ EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F "11:\n" "cmp %[pointwiseActivationMode], %[am_relu6]\n" "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six "fmax v2.8h, v2.8h, v0.8h\n" "fmax v3.8h, v3.8h, v0.8h\n" "fmax v4.8h, v4.8h, v0.8h\n" @@ -492,9 +473,9 @@ EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F "12:\n" "cmp %[pointwiseActivationMode], %[am_h_swish]\n" "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v10.8h, #0x42, lsl #8\n" //three + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v10.8h, #0x42, lsl #8\n" // three "fadd v11.8h, v2.8h, v10.8h\n" "fadd v12.8h, v3.8h, v10.8h\n" "fadd v13.8h, v4.8h, v10.8h\n" @@ -537,76 +518,71 @@ EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F "fmul v9.8h, v18.8h, v9.8h\n" "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw0 - "str q4, [%[out_0], #32]\n" //out_o0hw0 - "str q5, [%[out_0], #48]\n" //out_o0hw0 - "str q6, [%[out_0], #64]\n" //out_o0hw0 - "str q7, [%[out_0], #80]\n" //out_o0hw0 - "str q8, [%[out_0], #96]\n" //out_o0hw0 - "str q9, [%[out_0], #112]\n" //out_o0hw0 - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "x0", "x1", "x2" - ); + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw0 + "str q4, [%[out_0], #32]\n" // out_o0hw0 + "str q5, [%[out_0], #48]\n" // out_o0hw0 + "str q6, [%[out_0], #64]\n" // out_o0hw0 + "str q7, [%[out_0], #80]\n" // out_o0hw0 + "str q8, [%[out_0], #96]\n" // out_o0hw0 + "str q9, [%[out_0], #112]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "x0", "x1", "x2"); } } // ohow_remainder % 8 / 4 U32 ohow_s = (ohow / 8) * 8; - for (I32 hw = ohow_s; hw < ohow-3; 
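The reformatted operand lists keep the same extended-asm structure throughout: outputs first ("+r" pointers the block advances), then read-only inputs ("r"), then a clobber list naming "memory", "cc", and every v/x register the block writes. A minimal sketch in the same style, not taken from the patch, assuming ARMv8.2-A half-precision (fmla on .8h lanes):

typedef __fp16 F16;  // assumed project typedef

// dst[0..7] += a[0..7] * b[0..7] on fp16 lanes.
static inline void fmla8h(F16 *dst, const F16 *a, const F16 *b)
{
    __asm__ __volatile__(
        "ldr q0, [%[a]]\n"
        "ldr q1, [%[b]]\n"
        "ldr q2, [%[d]]\n"
        "fmla v2.8h, v0.8h, v1.8h\n"
        "str q2, [%[d]]\n"
        : [d] "+r"(dst)                  // read-write: asm loads and stores through it
        : [a] "r"(a), [b] "r"(b)         // read-only inputs
        : "memory", "v0", "v1", "v2");   // everything the block touches
}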
hw+=4) { - const F16 *b0 = biasArray + ic*8; + for (I32 hw = ohow_s; hw < ohow - 3; hw += 4) { + const F16 *b0 = pwBiasArray; const F16 *b1 = b0 + 8; - const F16 *f_o0c0 = filterArray + ic*fh*fw*8; - F16 *in_pack = pwArray + hw*ic*8; - for (I32 o = 0; o < I32(oc-1); o+=2) { + const F16 *f_o0c0 = pwFilterArray; + F16 *in_pack = pwArray + hw * ic * 8; + for (I32 o = 0; o < I32(oc - 1); o += 2) { F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; // bias const F16 *b_o0 = b0; const F16 *b_o1 = b1; __asm__ __volatile__( - "ldr d22, [%[b_0]]\n" //b_o0 + "ldr d22, [%[b_0]]\n" // b_o0 "ldr x1, [%[b_0], #8]\n" "ins v22.d[1], x1\n" - "ldr d23, [%[b_1]]\n" //b_o1 + "ldr d23, [%[b_1]]\n" // b_o1 "ldr x2, [%[b_1], #8]\n" "ins v23.d[1], x2\n" - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "ldr d18, [%[f_0]]\n" //f_o0c0 - "mov v4.16b, v22.16b\n" //out_o0hw2 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v4.16b, v22.16b\n" // out_o0hw2 "ldr x2, [%[f_0], #8]\n" - "mov v5.16b, v22.16b\n" //out_o0hw3 + "mov v5.16b, v22.16b\n" // out_o0hw3 "ins v18.d[1], x2\n" - "mov v10.16b, v23.16b\n" //out_o1hw0 - "ldr d19, [%[f_0], #16]\n" //f_o1c0 - "mov v11.16b, v23.16b\n" //out_o1hw1 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ldr d19, [%[f_0], #16]\n" // f_o1c0 + "mov v11.16b, v23.16b\n" // out_o1hw1 "ldr x3, [%[f_0], #24]\n" - "mov v12.16b, v23.16b\n" //out_o1hw2 + "mov v12.16b, v23.16b\n" // out_o1hw2 "ins v19.d[1], x3\n" - "mov v13.16b, v23.16b\n" //out_o1hw3 + "mov v13.16b, v23.16b\n" // out_o1hw3 "0:\n" - "ldr d1, [%[in_0], #8]\n" //in_hw0 + "ldr d1, [%[in_0], #8]\n" // in_hw0 "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr d20, [%[f_0], #32]\n" //f_o0c0 + "ldr d20, [%[f_0], #32]\n" // f_o0c0 "fmla v3.8h, v18.8h, v0.h[1]\n" "ldr x2, [%[f_0], #40]\n" "fmla v4.8h, v18.8h, v0.h[2]\n" "ins v20.d[1], x2\n" "fmla v5.8h, v18.8h, v0.h[3]\n" - "ldr d21, [%[f_0], #48]\n" //f_o1c0 + "ldr d21, [%[f_0], #48]\n" // f_o1c0 "fmla v10.8h, v19.8h, v0.h[0]\n" "ldr x3, [%[f_0], #56]\n" "fmla v11.8h, v19.8h, v0.h[1]\n" @@ -615,13 +591,13 @@ EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F "subs x0, x0, #2\n" "fmla v13.8h, v19.8h, v0.h[3]\n" - "ldr d0, [%[in_0], #16]\n" //in_hw0 + "ldr d0, [%[in_0], #16]\n" // in_hw0 "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr d18, [%[f_0], #64]\n" //f_o0c0 + "ldr d18, [%[f_0], #64]\n" // f_o0c0 "fmla v3.8h, v20.8h, v1.h[1]\n" "ldr x2, [%[f_0], #72]\n" "fmla v4.8h, v20.8h, v1.h[2]\n" - "ldr d19, [%[f_0], #80]\n" //f_o1c0 + "ldr d19, [%[f_0], #80]\n" // f_o1c0 "fmla v5.8h, v20.8h, v1.h[3]\n" "ins v18.d[1], x2\n" "fmla v10.8h, v21.8h, v1.h[0]\n" @@ -636,7 +612,7 @@ EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F "cmp %[pointwiseActivationMode], %[am_relu]\n" "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero + "eor v0.16b, v0.16b, v0.16b\n" // zero "fmax v2.8h, v2.8h, v0.8h\n" "fmax v3.8h, v3.8h, v0.8h\n" "fmax v4.8h, v4.8h, v0.8h\n" @@ -649,8 +625,8 @@ EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F "11:\n" "cmp %[pointwiseActivationMode], %[am_relu6]\n" "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - 
"movi v1.8h, #0x46, lsl #8\n" //six + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six "fmax v2.8h, v2.8h, v0.8h\n" "fmax v3.8h, v3.8h, v0.8h\n" "fmax v4.8h, v4.8h, v0.8h\n" @@ -671,9 +647,9 @@ EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F "12:\n" "cmp %[pointwiseActivationMode], %[am_h_swish]\n" "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three "fadd v6.8h, v2.8h, v18.8h\n" "fadd v7.8h, v3.8h, v18.8h\n" "fadd v8.8h, v4.8h, v18.8h\n" @@ -716,55 +692,50 @@ EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F "fmul v13.8h, v22.8h, v13.8h\n" "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q4, [%[out_0], #32]\n" //out_o0hw2 - "str q5, [%[out_0], #48]\n" //out_o0hw3 - "str q10, [%[out_1]]\n" //out_o1hw0 - "str q11, [%[out_1], #16]\n" //out_o1hw1 - "str q12, [%[out_1], #32]\n" //out_o1hw2 - "str q13, [%[out_1], #48]\n" //out_o1hw3 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v18", "v19", "v20", "v21", "v22", "v23", "x0", "x1", "x2", "x3" - ); + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q10, [%[out_1]]\n" // out_o1hw0 + "str q11, [%[out_1], #16]\n" // out_o1hw1 + "str q12, [%[out_1], #32]\n" // out_o1hw2 + "str q13, [%[out_1], #48]\n" // out_o1hw3 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v18", "v19", "v20", "v21", "v22", "v23", "x0", + "x1", "x2", "x3"); b0 += 16; b1 += 16; } if (oc & 1) { // oc%2 != 0 - const F16 *f_r = filterArray + ic*fh*fw*8 + (oc-1)*8*ic*8; + const F16 *f_r = pwFilterArray + (oc - 1) * 8 * ic * 8; F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; // bias - const F16 *b_o0 = biasArray + ic*8 + (oc-1)*8; + const F16 *b_o0 = pwBiasArray + (oc - 1) * 8; __asm__ __volatile__( - "ldr d22, [%[b_0]]\n" //b_o0 + "ldr d22, [%[b_0]]\n" // b_o0 "ldr x1, [%[b_0], #8]\n" "ins v22.d[1], x1\n" - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "ldr d18, [%[f_0]]\n" //f_o0c0 - "mov v4.16b, v22.16b\n" //out_o0hw2 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v4.16b, v22.16b\n" // 
out_o0hw2 "ldr x2, [%[f_0], #8]\n" - "mov v5.16b, v22.16b\n" //out_o0hw3 + "mov v5.16b, v22.16b\n" // out_o0hw3 "ins v18.d[1], x2\n" "0:\n" - "ldr d1, [%[in_0], #8]\n" //in_hw0 + "ldr d1, [%[in_0], #8]\n" // in_hw0 "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr d20, [%[f_0], #16]\n" //f_o0c0 + "ldr d20, [%[f_0], #16]\n" // f_o0c0 "fmla v3.8h, v18.8h, v0.h[1]\n" "ldr x2, [%[f_0], #24]\n" "fmla v4.8h, v18.8h, v0.h[2]\n" @@ -772,9 +743,9 @@ EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F "fmla v5.8h, v18.8h, v0.h[3]\n" "subs x0, x0, #2\n" - "ldr d0, [%[in_0], #16]\n" //in_hw0 + "ldr d0, [%[in_0], #16]\n" // in_hw0 "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr d18, [%[f_0], #32]\n" //f_o0c0 + "ldr d18, [%[f_0], #32]\n" // f_o0c0 "fmla v3.8h, v20.8h, v1.h[1]\n" "ldr x2, [%[f_0], #40]\n" "fmla v4.8h, v20.8h, v1.h[2]\n" @@ -786,7 +757,7 @@ EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F "cmp %[pointwiseActivationMode], %[am_relu]\n" "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero + "eor v0.16b, v0.16b, v0.16b\n" // zero "fmax v2.8h, v2.8h, v0.8h\n" "fmax v3.8h, v3.8h, v0.8h\n" "fmax v4.8h, v4.8h, v0.8h\n" @@ -795,8 +766,8 @@ EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F "11:\n" "cmp %[pointwiseActivationMode], %[am_relu6]\n" "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six "fmax v2.8h, v2.8h, v0.8h\n" "fmax v3.8h, v3.8h, v0.8h\n" "fmax v4.8h, v4.8h, v0.8h\n" @@ -809,9 +780,9 @@ EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F "12:\n" "cmp %[pointwiseActivationMode], %[am_h_swish]\n" "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three "fadd v6.8h, v2.8h, v18.8h\n" "fadd v7.8h, v3.8h, v18.8h\n" "fadd v8.8h, v4.8h, v18.8h\n" @@ -834,75 +805,70 @@ EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F "fmul v5.8h, v9.8h, v5.8h\n" "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q4, [%[out_0], #32]\n" //out_o0hw2 - "str q5, [%[out_0], #48]\n" //out_o0hw3 - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", - "v18", "v20", "v22", "x0", "x1", "x2" - ); + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v18", "v20", "v22", "x0", "x1", "x2"); } } // ohow_reminder % 4 ohow_s = (ohow / 4) * 4; for (I32 hw = ohow_s; hw < ohow; hw++) { - const F16 *b0 = biasArray + ic*8; + const F16 *b0 = 
pwBiasArray; const F16 *b1 = b0 + 8; - const F16 *f_o0c0 = filterArray + ic*fh*fw*8; - F16 *in_pack = pwArray + hw*ic*8; - for (I32 o = 0; o < I32(oc-1); o+=2) { + const F16 *f_o0c0 = pwFilterArray; + F16 *in_pack = pwArray + hw * ic * 8; + for (I32 o = 0; o < I32(oc - 1); o += 2) { F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; // bias const F16 *b_o0 = b0; const F16 *b_o1 = b1; __asm__ __volatile__( - "ldr d22, [%[b_0]]\n" //b_o0 + "ldr d22, [%[b_0]]\n" // b_o0 "ldr x1, [%[b_0], #8]\n" "ins v22.d[1], x1\n" - "ldr d23, [%[b_1]]\n" //b_o1 - "mov x0, %[ic]\n" //ic_blk + "ldr d23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk "ldr x2, [%[b_1], #8]\n" "ins v23.d[1], x2\n" - "ldr h0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d18, [%[f_0]]\n" //f_o0c0 - "mov v10.16b, v23.16b\n" //out_o1hw0 + "ldr h0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d18, [%[f_0]]\n" // f_o0c0 + "mov v10.16b, v23.16b\n" // out_o1hw0 "ldr x2, [%[f_0], #8]\n" "ins v18.d[1], x2\n" - "ldr d19, [%[f_0], #16]\n" //f_o1c0 + "ldr d19, [%[f_0], #16]\n" // f_o1c0 "ldr x3, [%[f_0], #24]\n" "ins v19.d[1], x3\n" "0:\n" - "ldr h1, [%[in_0], #2]\n" //in_hw0 + "ldr h1, [%[in_0], #2]\n" // in_hw0 "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr d20, [%[f_0], #32]\n" //f_o0c0 + "ldr d20, [%[f_0], #32]\n" // f_o0c0 "fmla v10.8h, v19.8h, v0.h[0]\n" "ldr x2, [%[f_0], #40]\n" "ins v20.d[1], x2\n" - "ldr d21, [%[f_0], #48]\n" //f_o1c0 + "ldr d21, [%[f_0], #48]\n" // f_o1c0 "subs x0, x0, #2\n" "ldr x3, [%[f_0], #56]\n" "ins v21.d[1], x3\n" - "ldr h0, [%[in_0], #4]\n" //in_hw0 + "ldr h0, [%[in_0], #4]\n" // in_hw0 "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr d18, [%[f_0], #64]\n" //f_o0c0 + "ldr d18, [%[f_0], #64]\n" // f_o0c0 "fmla v10.8h, v21.8h, v1.h[0]\n" "ldr x2, [%[f_0], #72]\n" "ins v18.d[1], x2\n" - "ldr d19, [%[f_0], #80]\n" //f_o1c0 + "ldr d19, [%[f_0], #80]\n" // f_o1c0 "add %[in_0], %[in_0], #4\n" "ldr x3, [%[f_0], #88]\n" "ins v19.d[1], x3\n" @@ -911,15 +877,15 @@ EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F "cmp %[pointwiseActivationMode], %[am_relu]\n" "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero + "eor v0.16b, v0.16b, v0.16b\n" // zero "fmax v2.8h, v2.8h, v0.8h\n" "fmax v10.8h, v10.8h, v0.8h\n" "11:\n" "cmp %[pointwiseActivationMode], %[am_relu6]\n" "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six "fmax v2.8h, v2.8h, v0.8h\n" "fmax v10.8h, v10.8h, v0.8h\n" "fmin v2.8h, v2.8h, v1.8h\n" @@ -928,9 +894,9 @@ EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F "12:\n" "cmp %[pointwiseActivationMode], %[am_h_swish]\n" "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three "fadd v19.8h, v2.8h, v18.8h\n" "fadd v20.8h, v10.8h, v18.8h\n" "fmax v19.8h, v19.8h, v0.8h\n" @@ -943,52 +909,47 @@ EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F "fmul v10.8h, v20.8h, v10.8h\n" "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q10, [%[out_1]]\n" //out_o1hw0 - :[out_0]"+r"(out_o0hw0), - 
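Stripped of tiling and scheduling, each pointwise micro-kernel is a small GEMV: for one spatial position and one block of 8 output channels, the accumulator starts as the bias and every input channel contributes one fmla with a broadcast input value. A hypothetical scalar model of the oc-remainder kernel above, with the weight layout inferred from the pointer strides in the asm (8 contiguous weights per input channel, consistent with DF_NHWCN16):

typedef __fp16 F16;    // assumed project typedefs
typedef uint32_t U32;

// One hw position, one block of 8 output channels; ic8 = ic * 8 channels.
static void pw_block8(const F16 *in_pack, const F16 *w, const F16 *bias,
    F16 *out, U32 ic8)
{
    for (U32 j = 0; j < 8; j++) {
        out[j] = bias[j];                         // mov v2.16b, v22.16b
    }
    for (U32 k = 0; k < ic8; k++) {               // the subs x0, x0, #2 loop, unrolled by 2
        for (U32 j = 0; j < 8; j++) {
            out[j] += in_pack[k] * w[k * 8 + j];  // fmla v2.8h, v18.8h, v0.h[0]
        }
    }
}

The wider tiles replicate this with 4 or 8 accumulator registers (one per hw position), and the paired-oc path walks a second weight stream interleaved in the same buffer, giving 16 output channels per input channel (hence the N16 in the filter format).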
[out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v10", "v18", "v19", "v20", "v21", "v22", "v23", "x0", "x1", "x2", "x3" - ); + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q10, [%[out_1]]\n" // out_o1hw0 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v19", "v20", "v21", "v22", + "v23", "x0", "x1", "x2", "x3"); b0 += 16; b1 += 16; } if (oc & 1) { // oc%2 != 0 - const F16 *f_r = filterArray + ic*fh*fw*8 + (oc-1)*8*ic*8; + const F16 *f_r = pwFilterArray + (oc - 1) * 8 * ic * 8; F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; // bias - const F16 *b_o0 = biasArray + ic*8 + (oc-1)*8; + const F16 *b_o0 = pwBiasArray + (oc - 1) * 8; __asm__ __volatile__( - "ldr d22, [%[b_0]]\n" //b_o0 - "mov x0, %[ic]\n" //ic_blk + "ldr d22, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk "ldr x1, [%[b_0], #8]\n" "ins v22.d[1], x1\n" - "ldr h0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d18, [%[f_0]]\n" //f_o0c0 + "ldr h0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d18, [%[f_0]]\n" // f_o0c0 "ldr x2, [%[f_0], #8]\n" "ins v18.d[1], x2\n" "0:\n" - "ldr h1, [%[in_0], #2]\n" //in_hw0 + "ldr h1, [%[in_0], #2]\n" // in_hw0 "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr d20, [%[f_0], #16]\n" //f_o0c0 + "ldr d20, [%[f_0], #16]\n" // f_o0c0 "subs x0, x0, #2\n" "ldr x2, [%[f_0], #24]\n" "ins v20.d[1], x2\n" - "ldr h0, [%[in_0], #4]\n" //in_hw0 + "ldr h0, [%[in_0], #4]\n" // in_hw0 "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr d18, [%[f_0], #32]\n" //f_o0c0 + "ldr d18, [%[f_0], #32]\n" // f_o0c0 "ldr x2, [%[f_0], #40]\n" "ins v18.d[1], x2\n" "add %[in_0], %[in_0], #4\n" @@ -997,23 +958,23 @@ EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F "cmp %[pointwiseActivationMode], %[am_relu]\n" "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero + "eor v0.16b, v0.16b, v0.16b\n" // zero "fmax v2.8h, v2.8h, v0.8h\n" "11:\n" "cmp %[pointwiseActivationMode], %[am_relu6]\n" "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six "fmax v2.8h, v2.8h, v0.8h\n" "fmin v2.8h, v2.8h, v1.8h\n" "12:\n" "cmp %[pointwiseActivationMode], %[am_h_swish]\n" "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three "fadd v20.8h, v2.8h, v18.8h\n" "fmax v20.8h, v20.8h, v0.8h\n" "fmin v20.8h, v20.8h, v1.8h\n" @@ -1021,18 +982,14 @@ EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, F "fmul v2.8h, v20.8h, v2.8h\n" "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - :[out_0]"+r"(out_o0hw0), - 
[in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v10", "v18", "v20", "v22", "x0", "x1", "x2" - ); + "str q2, [%[out_0]]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v20", "v22", "x0", "x1", + "x2"); } } } diff --git a/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A76.cpp b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A76.cpp similarity index 61% rename from tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A76.cpp rename to compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A76.cpp index bf501f4f..24bcfb4a 100644 --- a/tensor_computing/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A76.cpp +++ b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A76.cpp @@ -1,30 +1,37 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- #include "cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h" #include "cpu/arm/fp16/arm_functions_fp16.h" -EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, F16* inArray, - TensorDesc filterDesc, const F16* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F16* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F16* outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc) +EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, + F16 *inArray, + TensorDesc dwFilterDesc, + const F16 *dwFilterArray, + TensorDesc pwFilterDesc, + const F16 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F16 *dwBiasArray, + TensorDesc pwBiasDesc, + const F16 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec) { - UNUSED(biasDesc); UNUSED(tmpBytes); DataType idt, fdt, odt; @@ -33,88 +40,68 @@ EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, F U32 fn, fc, fh, fw; U32 on, oc, oh, ow; CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingL = convDesc.padding_left; + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingL = convParamSpec.padding_left; - if (fdf != DF_CHWC8_NCN16) + if (dwFilterDesc.df != DF_NCHWC8 || pwFilterDesc.df != DF_NHWCN16) { CHECK_STATUS(NOT_MATCH); + } oc /= 8; ic /= 8; - I32 ohow = oh*ow; - F16 *pwArray = (F16*)tmp; + I32 ohow = oh * ow; + F16 *pwArray = (F16 *)tmp; F16 buffer[8]; for (U32 n = 0; n < in; n++) { for (U32 c = 0; c < ic; c++) { - const F16 *f = filterArray + c*fh*fw*8; - const F16 *b = biasArray + c*8; + const F16 *f = dwFilterArray + c * fh * fw * 8; + const F16 *b = dwBiasArray + c * 8; float16x8_t vv0 = vld1q_f16(b); I32 iter = 0; U32 ih_base = ((n * ic) + c) * ih; // nhwchw8 - for (; iter < ohow-7; iter += 8) { + for (; iter < ohow - 7; iter += 8) { U32 out_base = iter * ic * 8 + c * 8 * 8; for (I32 j = 0; j < 8; j++) { I32 hw = iter + j; - calc_eight_channel_elements(hw, - ih_base, ih, iw, - fh, fw, - ow, - inArray, - strideH, strideW, paddingT, paddingL, - f, - vv0, - buffer); - CHECK_STATUS(activation_fp16(buffer, 8, depthwiseActivationDesc, buffer)); + calc_eight_channel_elements(hw, ih_base, ih, iw, fh, fw, ow, inArray, strideH, + strideW, paddingT, paddingL, f, vv0, buffer); + CHECK_STATUS(activation_fp16(buffer, 8, depthwiseActivationParamSpec, buffer)); U32 out_index = out_base + j; - for (I32 i = 0; i < 8; i++, out_index+=8) { + for (I32 i = 0; i < 8; i++, out_index += 8) { pwArray[out_index] = buffer[i]; } } } // nhwchw4 - for (; iter < ohow-3; iter += 4) { + for (; iter < ohow - 3; iter += 4) { U32 out_base = iter * ic * 8 + c * 8 * 4; for (I32 j = 0; j < 4; j++) { I32 hw = iter + j; - calc_eight_channel_elements(hw, - ih_base, ih, iw, - fh, fw, - ow, - inArray, - strideH, strideW, paddingT, paddingL, - f, - vv0, - buffer); - CHECK_STATUS(activation_fp16(buffer, 8, depthwiseActivationDesc, 
buffer)); + calc_eight_channel_elements(hw, ih_base, ih, iw, fh, fw, ow, inArray, strideH, + strideW, paddingT, paddingL, f, vv0, buffer); + CHECK_STATUS(activation_fp16(buffer, 8, depthwiseActivationParamSpec, buffer)); U32 out_index = out_base + j; - for (I32 i = 0; i < 8; i++, out_index+=4) { + for (I32 i = 0; i < 8; i++, out_index += 4) { pwArray[out_index] = buffer[i]; } } } // nhwchw1 - for (; iter < ohow; iter ++) { + for (; iter < ohow; iter++) { U32 out_base = iter * ic * 8 + c * 8; for (I32 j = 0; j < 1; j++) { I32 hw = iter + j; - calc_eight_channel_elements(hw, - ih_base, ih, iw, - fh, fw, - ow, - inArray, - strideH, strideW, paddingT, paddingL, - f, - vv0, - buffer); - CHECK_STATUS(activation_fp16(buffer, 8, depthwiseActivationDesc, buffer)); + calc_eight_channel_elements(hw, ih_base, ih, iw, fh, fw, ow, inArray, strideH, + strideW, paddingT, paddingL, f, vv0, buffer); + CHECK_STATUS(activation_fp16(buffer, 8, depthwiseActivationParamSpec, buffer)); U32 out_index = out_base + j; for (I32 i = 0; i < 8; i++, out_index++) { pwArray[out_index] = buffer[i]; @@ -125,51 +112,51 @@ EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, F // pw_conv // ohow / 8 - for (I32 hw = 0; hw < ohow-7; hw+=8) { - const F16 *b0 = biasArray + ic*8; + for (I32 hw = 0; hw < ohow - 7; hw += 8) { + const F16 *b0 = pwBiasArray; const F16 *b1 = b0 + 8; - F16 *in_pack = pwArray + hw*ic*8; - const F16 *f_o0c0 = filterArray + ic*fh*fw*8; - for (I32 o = 0; o < I32(oc-1); o+=2) { + F16 *in_pack = pwArray + hw * ic * 8; + const F16 *f_o0c0 = pwFilterArray; + for (I32 o = 0; o < I32(oc - 1); o += 2) { F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; // bias const F16 *b_o0 = b0; const F16 *b_o1 = b1; __asm__ __volatile__( - "ldr q22, [%[b_0]]\n" //b_o0 - "ldr q23, [%[b_1]]\n" //b_o1 - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr q0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "mov v4.16b, v22.16b\n" //out_o0hw2 - "mov v5.16b, v22.16b\n" //out_o0hw3 - "ldr q18, [%[f_0]]\n" //f_o0c0 - "mov v6.16b, v22.16b\n" //out_o0hw4 - "mov v7.16b, v22.16b\n" //out_o0hw5 - "mov v8.16b, v22.16b\n" //out_o0hw6 - "ldr q19, [%[f_0], #16]\n" //f_o1c0 - "mov v9.16b, v22.16b\n" //out_o0hw7 - "mov v10.16b, v23.16b\n" //out_o1hw0 - "mov v11.16b, v23.16b\n" //out_o1hw1 - "mov v12.16b, v23.16b\n" //out_o1hw2 - "mov v13.16b, v23.16b\n" //out_o1hw3 - "mov v14.16b, v23.16b\n" //out_o1hw4 - "mov v15.16b, v23.16b\n" //out_o1hw5 - "mov v16.16b, v23.16b\n" //out_o1hw6 - "mov v17.16b, v23.16b\n" //out_o1hw7 + "ldr q22, [%[b_0]]\n" // b_o0 + "ldr q23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr q0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "mov v5.16b, v22.16b\n" // out_o0hw3 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov v6.16b, v22.16b\n" // out_o0hw4 + "mov v7.16b, v22.16b\n" // out_o0hw5 + "mov v8.16b, v22.16b\n" // out_o0hw6 + "ldr q19, [%[f_0], #16]\n" // f_o1c0 + "mov v9.16b, v22.16b\n" // out_o0hw7 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "mov v11.16b, v23.16b\n" // out_o1hw1 + "mov v12.16b, v23.16b\n" // out_o1hw2 + "mov v13.16b, v23.16b\n" // out_o1hw3 + "mov v14.16b, v23.16b\n" // out_o1hw4 + "mov v15.16b, v23.16b\n" // out_o1hw5 + "mov v16.16b, v23.16b\n" // out_o1hw6 + "mov 
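The A76 file repeats the exact structure of the A55 one: a depthwise pass that packs into pwArray, then a pointwise sweep that tiles the oh*ow positions as 8, then 4, then 1, with a paired/odd split over output-channel blocks inside each tile. In outline (loop bounds as in the function, with I32/ohow from the surrounding scope):

I32 hw = 0;
for (; hw < ohow - 7; hw += 8) { /* 8-position micro-kernel, oc in pairs + odd tail */ }
for (; hw < ohow - 3; hw += 4) { /* 4-position micro-kernel */ }
for (; hw < ohow; hw++)        { /* single-position micro-kernel */ }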
v17.16b, v23.16b\n" // out_o1hw7 "0:\n" - "ldr q1, [%[in_0], #16]\n" //in_hw0 + "ldr q1, [%[in_0], #16]\n" // in_hw0 "fmla v2.8h, v18.8h, v0.h[0]\n" "fmla v3.8h, v18.8h, v0.h[1]\n" "fmla v4.8h, v18.8h, v0.h[2]\n" - "ldr q20, [%[f_0], #32]\n" //f_o0c0 + "ldr q20, [%[f_0], #32]\n" // f_o0c0 "fmla v5.8h, v18.8h, v0.h[3]\n" "fmla v6.8h, v18.8h, v0.h[4]\n" "fmla v7.8h, v18.8h, v0.h[5]\n" - "ldr q21, [%[f_0], #48]\n" //f_o1c0 + "ldr q21, [%[f_0], #48]\n" // f_o1c0 "fmla v8.8h, v18.8h, v0.h[6]\n" "fmla v9.8h, v18.8h, v0.h[7]\n" "fmla v10.8h, v19.8h, v0.h[0]\n" @@ -181,15 +168,15 @@ EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, F "fmla v16.8h, v19.8h, v0.h[6]\n" "fmla v17.8h, v19.8h, v0.h[7]\n" - "ldr q0, [%[in_0], #32]\n" //in_hw0 + "ldr q0, [%[in_0], #32]\n" // in_hw0 "fmla v2.8h, v20.8h, v1.h[0]\n" "fmla v3.8h, v20.8h, v1.h[1]\n" "fmla v4.8h, v20.8h, v1.h[2]\n" - "ldr q18, [%[f_0], #64]\n" //f_o0c0 + "ldr q18, [%[f_0], #64]\n" // f_o0c0 "fmla v5.8h, v20.8h, v1.h[3]\n" "fmla v6.8h, v20.8h, v1.h[4]\n" "fmla v7.8h, v20.8h, v1.h[5]\n" - "ldr q19, [%[f_0], #80]\n" //f_o1c0 + "ldr q19, [%[f_0], #80]\n" // f_o1c0 "fmla v8.8h, v20.8h, v1.h[6]\n" "fmla v9.8h, v20.8h, v1.h[7]\n" "fmla v10.8h, v21.8h, v1.h[0]\n" @@ -207,7 +194,7 @@ EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, F "cmp %[pointwiseActivationMode], %[am_relu]\n" "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero + "eor v0.16b, v0.16b, v0.16b\n" // zero "fmax v2.8h, v2.8h, v0.8h\n" "fmax v3.8h, v3.8h, v0.8h\n" "fmax v4.8h, v4.8h, v0.8h\n" @@ -228,8 +215,8 @@ EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, F "11:\n" "cmp %[pointwiseActivationMode], %[am_relu6]\n" "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six "fmax v2.8h, v2.8h, v0.8h\n" "fmax v3.8h, v3.8h, v0.8h\n" "fmax v4.8h, v4.8h, v0.8h\n" @@ -266,9 +253,9 @@ EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, F "12:\n" "cmp %[pointwiseActivationMode], %[am_h_swish]\n" "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three "fadd v19.8h, v2.8h, v18.8h\n" "fadd v20.8h, v3.8h, v18.8h\n" "fadd v21.8h, v4.8h, v18.8h\n" @@ -356,50 +343,44 @@ EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, F "st1 {v6.8h, v7.8h, v8.8h, v9.8h}, [%[out_0]], #64\n" "st1 {v10.8h, v11.8h, v12.8h, v13.8h}, [%[out_1]], #64\n" "st1 {v14.8h, v15.8h, v16.8h, v17.8h}, [%[out_1]], #64\n" - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "x0", "x1", "x2", "x3" - ); + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] 
"r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "x0", "x1", "x2", "x3"); b0 += 16; b1 += 16; } if (oc & 1) { // oc%2 != 0 - const F16 *f_r = filterArray + ic*fh*fw*8 + (oc-1)*8*ic*8; + const F16 *f_r = pwFilterArray + (oc - 1) * 8 * ic * 8; F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; // bias - const F16 *b_o0 = biasArray + ic*8 + (oc-1)*8; + const F16 *b_o0 = pwBiasArray + (oc - 1) * 8; __asm__ __volatile__( - "ldr q12, [%[b_0]]\n" //b_o0 - "mov x0, %[ic]\n" // ic_blk - "ldr q0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v12.16b\n" //out_o0hw0 - "mov v3.16b, v12.16b\n" //out_o0hw1 - "mov v4.16b, v12.16b\n" //out_o0hw2 - "ldr q10, [%[f_0]]\n" //f_o0c0 - "mov v5.16b, v12.16b\n" //out_o0hw3 - "mov v6.16b, v12.16b\n" //out_o0hw4 - "mov v7.16b, v12.16b\n" //out_o0hw5 - "mov v8.16b, v12.16b\n" //out_o0hw6 - "mov v9.16b, v12.16b\n" //out_o0hw7 + "ldr q12, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr q0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v12.16b\n" // out_o0hw0 + "mov v3.16b, v12.16b\n" // out_o0hw1 + "mov v4.16b, v12.16b\n" // out_o0hw2 + "ldr q10, [%[f_0]]\n" // f_o0c0 + "mov v5.16b, v12.16b\n" // out_o0hw3 + "mov v6.16b, v12.16b\n" // out_o0hw4 + "mov v7.16b, v12.16b\n" // out_o0hw5 + "mov v8.16b, v12.16b\n" // out_o0hw6 + "mov v9.16b, v12.16b\n" // out_o0hw7 "0:\n" - "ldr q1, [%[in_0], #16]\n" //in_hw0 + "ldr q1, [%[in_0], #16]\n" // in_hw0 "fmla v2.8h, v10.8h, v0.h[0]\n" "fmla v3.8h, v10.8h, v0.h[1]\n" "fmla v4.8h, v10.8h, v0.h[2]\n" - "ldr q11, [%[f_0], #16]\n" //f_o0c0 + "ldr q11, [%[f_0], #16]\n" // f_o0c0 "fmla v5.8h, v10.8h, v0.h[3]\n" "fmla v6.8h, v10.8h, v0.h[4]\n" "fmla v7.8h, v10.8h, v0.h[5]\n" @@ -407,11 +388,11 @@ EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, F "fmla v8.8h, v10.8h, v0.h[6]\n" "fmla v9.8h, v10.8h, v0.h[7]\n" - "ldr q0, [%[in_0], #32]\n" //in_hw0 + "ldr q0, [%[in_0], #32]\n" // in_hw0 "fmla v2.8h, v11.8h, v1.h[0]\n" "fmla v3.8h, v11.8h, v1.h[1]\n" "fmla v4.8h, v11.8h, v1.h[2]\n" - "ldr q10, [%[f_0], #32]\n" //f_o0c0 + "ldr q10, [%[f_0], #32]\n" // f_o0c0 "fmla v5.8h, v11.8h, v1.h[3]\n" "fmla v6.8h, v11.8h, v1.h[4]\n" "fmla v7.8h, v11.8h, v1.h[5]\n" @@ -423,7 +404,7 @@ EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, F "cmp %[pointwiseActivationMode], %[am_relu]\n" "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero + "eor v0.16b, v0.16b, v0.16b\n" // zero "fmax v2.8h, v2.8h, v0.8h\n" "fmax v3.8h, v3.8h, v0.8h\n" "fmax v4.8h, v4.8h, v0.8h\n" @@ -436,8 +417,8 @@ EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, F "11:\n" "cmp %[pointwiseActivationMode], %[am_relu6]\n" "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six "fmax v2.8h, v2.8h, v0.8h\n" "fmax v3.8h, v3.8h, v0.8h\n" "fmax v4.8h, v4.8h, v0.8h\n" @@ -458,9 +439,9 @@ EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, F "12:\n" "cmp %[pointwiseActivationMode], %[am_h_swish]\n" "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - 
"movi v1.8h, #0x46, lsl #8\n" //six - "movi v10.8h, #0x42, lsl #8\n" //three + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v10.8h, #0x42, lsl #8\n" // three "fadd v11.8h, v2.8h, v10.8h\n" "fadd v12.8h, v3.8h, v10.8h\n" "fadd v13.8h, v4.8h, v10.8h\n" @@ -503,78 +484,73 @@ EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, F "fmul v9.8h, v18.8h, v9.8h\n" "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw0 - "str q4, [%[out_0], #32]\n" //out_o0hw0 - "str q5, [%[out_0], #48]\n" //out_o0hw0 - "str q6, [%[out_0], #64]\n" //out_o0hw0 - "str q7, [%[out_0], #80]\n" //out_o0hw0 - "str q8, [%[out_0], #96]\n" //out_o0hw0 - "str q9, [%[out_0], #112]\n" //out_o0hw0 - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "x0", "x1", "x2" - ); + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw0 + "str q4, [%[out_0], #32]\n" // out_o0hw0 + "str q5, [%[out_0], #48]\n" // out_o0hw0 + "str q6, [%[out_0], #64]\n" // out_o0hw0 + "str q7, [%[out_0], #80]\n" // out_o0hw0 + "str q8, [%[out_0], #96]\n" // out_o0hw0 + "str q9, [%[out_0], #112]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "x0", "x1", "x2"); } } // ohow_remainder % 8 / 4 U32 ohow_s = (ohow / 8) * 8; - for (I32 hw = ohow_s; hw < ohow-3; hw+=4) { - const F16 *b0 = biasArray + ic*8; + for (I32 hw = ohow_s; hw < ohow - 3; hw += 4) { + const F16 *b0 = pwBiasArray; const F16 *b1 = b0 + 8; - const F16 *f_o0c0 = filterArray + ic*fh*fw*8; - F16 *in_pack = pwArray + hw*ic*8; - for (I32 o = 0; o < I32(oc-1); o+=2) { + const F16 *f_o0c0 = pwFilterArray; + F16 *in_pack = pwArray + hw * ic * 8; + for (I32 o = 0; o < I32(oc - 1); o += 2) { F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; // bias const F16 *b_o0 = b0; const F16 *b_o1 = b1; __asm__ __volatile__( - "ldr q22, [%[b_0]]\n" //b_o0 - "ldr q23, [%[b_1]]\n" //b_o1 - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "ldr q18, [%[f_0]]\n" //f_o0c0 - "mov v4.16b, v22.16b\n" //out_o0hw2 - "mov v5.16b, v22.16b\n" //out_o0hw3 - "mov v10.16b, v23.16b\n" //out_o1hw0 - "ldr q19, [%[f_0], #16]\n" //f_o1c0 - "mov v11.16b, v23.16b\n" //out_o1hw1 - "mov v12.16b, v23.16b\n" //out_o1hw2 - "mov v13.16b, v23.16b\n" //out_o1hw3 + "ldr q22, [%[b_0]]\n" // b_o0 + "ldr q23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr 
q18, [%[f_0]]\n" // f_o0c0 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "mov v5.16b, v22.16b\n" // out_o0hw3 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ldr q19, [%[f_0], #16]\n" // f_o1c0 + "mov v11.16b, v23.16b\n" // out_o1hw1 + "mov v12.16b, v23.16b\n" // out_o1hw2 + "mov v13.16b, v23.16b\n" // out_o1hw3 "0:\n" - "ldr d1, [%[in_0], #8]\n" //in_hw0 + "ldr d1, [%[in_0], #8]\n" // in_hw0 "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr q20, [%[f_0], #32]\n" //f_o0c0 + "ldr q20, [%[f_0], #32]\n" // f_o0c0 "fmla v3.8h, v18.8h, v0.h[1]\n" "fmla v4.8h, v18.8h, v0.h[2]\n" "fmla v5.8h, v18.8h, v0.h[3]\n" - "ldr q21, [%[f_0], #48]\n" //f_o1c0 + "ldr q21, [%[f_0], #48]\n" // f_o1c0 "fmla v10.8h, v19.8h, v0.h[0]\n" "fmla v11.8h, v19.8h, v0.h[1]\n" "fmla v12.8h, v19.8h, v0.h[2]\n" "subs x0, x0, #2\n" "fmla v13.8h, v19.8h, v0.h[3]\n" - "ldr d0, [%[in_0], #16]\n" //in_hw0 + "ldr d0, [%[in_0], #16]\n" // in_hw0 "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr q18, [%[f_0], #64]\n" //f_o0c0 + "ldr q18, [%[f_0], #64]\n" // f_o0c0 "fmla v3.8h, v20.8h, v1.h[1]\n" "fmla v4.8h, v20.8h, v1.h[2]\n" - "ldr q19, [%[f_0], #80]\n" //f_o1c0 + "ldr q19, [%[f_0], #80]\n" // f_o1c0 "fmla v5.8h, v20.8h, v1.h[3]\n" "fmla v10.8h, v21.8h, v1.h[0]\n" "fmla v11.8h, v21.8h, v1.h[1]\n" @@ -586,7 +562,7 @@ EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, F "cmp %[pointwiseActivationMode], %[am_relu]\n" "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero + "eor v0.16b, v0.16b, v0.16b\n" // zero "fmax v2.8h, v2.8h, v0.8h\n" "fmax v3.8h, v3.8h, v0.8h\n" "fmax v4.8h, v4.8h, v0.8h\n" @@ -599,8 +575,8 @@ EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, F "11:\n" "cmp %[pointwiseActivationMode], %[am_relu6]\n" "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six "fmax v2.8h, v2.8h, v0.8h\n" "fmax v3.8h, v3.8h, v0.8h\n" "fmax v4.8h, v4.8h, v0.8h\n" @@ -621,9 +597,9 @@ EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, F "12:\n" "cmp %[pointwiseActivationMode], %[am_h_swish]\n" "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three "fadd v6.8h, v2.8h, v18.8h\n" "fadd v7.8h, v3.8h, v18.8h\n" "fadd v8.8h, v4.8h, v18.8h\n" @@ -666,59 +642,54 @@ EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, F "fmul v13.8h, v22.8h, v13.8h\n" "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q4, [%[out_0], #32]\n" //out_o0hw2 - "str q5, [%[out_0], #48]\n" //out_o0hw3 - "str q10, [%[out_1]]\n" //out_o1hw0 - "str q11, [%[out_1], #16]\n" //out_o1hw1 - "str q12, [%[out_1], #32]\n" //out_o1hw2 - "str q13, [%[out_1], #48]\n" //out_o1hw3 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v18", "v19", "v20", "v21", "v22", "v23", "x0", "x1", "x2", "x3" - ); + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // 
out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + "str q10, [%[out_1]]\n" // out_o1hw0 + "str q11, [%[out_1], #16]\n" // out_o1hw1 + "str q12, [%[out_1], #32]\n" // out_o1hw2 + "str q13, [%[out_1], #48]\n" // out_o1hw3 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v18", "v19", "v20", "v21", "v22", "v23", "x0", + "x1", "x2", "x3"); b0 += 16; b1 += 16; } if (oc & 1) { // oc%2 != 0 - const F16 *f_r = filterArray + ic*fh*fw*8 + (oc-1)*8*ic*8; + const F16 *f_r = pwFilterArray + (oc - 1) * 8 * ic * 8; F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; // bias - const F16 *b_o0 = biasArray + ic*8 + (oc-1)*8; + const F16 *b_o0 = pwBiasArray + (oc - 1) * 8; __asm__ __volatile__( - "ldr q22, [%[b_0]]\n" //b_o0 - "mov x0, %[ic]\n" //ic_blk - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr d0, [%[in_0]]\n" //in_hw0 - "mov v3.16b, v22.16b\n" //out_o0hw1 - "ldr q18, [%[f_0]]\n" //f_o0c0 - "mov v4.16b, v22.16b\n" //out_o0hw2 - "mov v5.16b, v22.16b\n" //out_o0hw3 + "ldr q22, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr d0, [%[in_0]]\n" // in_hw0 + "mov v3.16b, v22.16b\n" // out_o0hw1 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov v4.16b, v22.16b\n" // out_o0hw2 + "mov v5.16b, v22.16b\n" // out_o0hw3 "0:\n" - "ldr d1, [%[in_0], #8]\n" //in_hw0 + "ldr d1, [%[in_0], #8]\n" // in_hw0 "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr q20, [%[f_0], #16]\n" //f_o0c0 + "ldr q20, [%[f_0], #16]\n" // f_o0c0 "fmla v3.8h, v18.8h, v0.h[1]\n" "fmla v4.8h, v18.8h, v0.h[2]\n" "fmla v5.8h, v18.8h, v0.h[3]\n" "subs x0, x0, #2\n" - "ldr d0, [%[in_0], #16]\n" //in_hw0 + "ldr d0, [%[in_0], #16]\n" // in_hw0 "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr q18, [%[f_0], #32]\n" //f_o0c0 + "ldr q18, [%[f_0], #32]\n" // f_o0c0 "fmla v3.8h, v20.8h, v1.h[1]\n" "fmla v4.8h, v20.8h, v1.h[2]\n" "fmla v5.8h, v20.8h, v1.h[3]\n" @@ -728,7 +699,7 @@ EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, F "cmp %[pointwiseActivationMode], %[am_relu]\n" "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero + "eor v0.16b, v0.16b, v0.16b\n" // zero "fmax v2.8h, v2.8h, v0.8h\n" "fmax v3.8h, v3.8h, v0.8h\n" "fmax v4.8h, v4.8h, v0.8h\n" @@ -737,8 +708,8 @@ EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, F "11:\n" "cmp %[pointwiseActivationMode], %[am_relu6]\n" "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six "fmax v2.8h, v2.8h, v0.8h\n" "fmax v3.8h, v3.8h, v0.8h\n" "fmax v4.8h, v4.8h, v0.8h\n" @@ -751,9 +722,9 @@ EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, F "12:\n" "cmp %[pointwiseActivationMode], %[am_h_swish]\n" "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl 
#8\n" // three "fadd v6.8h, v2.8h, v18.8h\n" "fadd v7.8h, v3.8h, v18.8h\n" "fadd v8.8h, v4.8h, v18.8h\n" @@ -776,76 +747,71 @@ EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, F "fmul v5.8h, v9.8h, v5.8h\n" "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q3, [%[out_0], #16]\n" //out_o0hw1 - "str q4, [%[out_0], #32]\n" //out_o0hw2 - "str q5, [%[out_0], #48]\n" //out_o0hw3 - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", - "v18", "v20", "v22", "x0", "x1", "x2" - ); + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q3, [%[out_0], #16]\n" // out_o0hw1 + "str q4, [%[out_0], #32]\n" // out_o0hw2 + "str q5, [%[out_0], #48]\n" // out_o0hw3 + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v18", "v20", "v22", "x0", "x1", "x2"); } } // ohow_reminder % 4 ohow_s = (ohow / 4) * 4; for (I32 hw = ohow_s; hw < ohow; hw++) { - const F16 *b0 = biasArray + ic*8; + const F16 *b0 = pwBiasArray; const F16 *b1 = b0 + 8; - const F16 *f_o0c0 = filterArray + ic*fh*fw*8; - F16 *in_pack = pwArray + hw*ic*8; - for (I32 o = 0; o < I32(oc-1); o+=2) { + const F16 *f_o0c0 = pwFilterArray; + F16 *in_pack = pwArray + hw * ic * 8; + for (I32 o = 0; o < I32(oc - 1); o += 2) { F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o1hw0 = out_o0hw0 + ohow*8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o1hw0 = out_o0hw0 + ohow * 8; // bias const F16 *b_o0 = b0; const F16 *b_o1 = b1; __asm__ __volatile__( - "ldr q22, [%[b_0]]\n" //b_o0 - "ldr q23, [%[b_1]]\n" //b_o1 - "mov x0, %[ic]\n" //ic_blk - "ldr h0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr q18, [%[f_0]]\n" //f_o0c0 - "mov v10.16b, v23.16b\n" //out_o1hw0 - "ldr q19, [%[f_0], #16]\n" //f_o1c0 + "ldr q22, [%[b_0]]\n" // b_o0 + "ldr q23, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "ldr h0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr q18, [%[f_0]]\n" // f_o0c0 + "mov v10.16b, v23.16b\n" // out_o1hw0 + "ldr q19, [%[f_0], #16]\n" // f_o1c0 "0:\n" - "ldr h1, [%[in_0], #2]\n" //in_hw0 + "ldr h1, [%[in_0], #2]\n" // in_hw0 "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr q20, [%[f_0], #32]\n" //f_o0c0 + "ldr q20, [%[f_0], #32]\n" // f_o0c0 "fmla v10.8h, v19.8h, v0.h[0]\n" - "ldr q21, [%[f_0], #48]\n" //f_o1c0 + "ldr q21, [%[f_0], #48]\n" // f_o1c0 "subs x0, x0, #2\n" - "ldr h0, [%[in_0], #4]\n" //in_hw0 + "ldr h0, [%[in_0], #4]\n" // in_hw0 "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr q18, [%[f_0], #64]\n" //f_o0c0 + "ldr q18, [%[f_0], #64]\n" // f_o0c0 "fmla v10.8h, v21.8h, v1.h[0]\n" - "ldr q19, [%[f_0], #80]\n" //f_o1c0 + "ldr q19, [%[f_0], #80]\n" // f_o1c0 "add %[in_0], %[in_0], #4\n" "add %[f_0], %[f_0], #64\n" "bne 0b\n" "cmp %[pointwiseActivationMode], %[am_relu]\n" "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero + "eor v0.16b, v0.16b, v0.16b\n" // zero "fmax v2.8h, 
v2.8h, v0.8h\n" "fmax v10.8h, v10.8h, v0.8h\n" "11:\n" "cmp %[pointwiseActivationMode], %[am_relu6]\n" "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six "fmax v2.8h, v2.8h, v0.8h\n" "fmax v10.8h, v10.8h, v0.8h\n" "fmin v2.8h, v2.8h, v1.8h\n" @@ -854,9 +820,9 @@ EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, F "12:\n" "cmp %[pointwiseActivationMode], %[am_h_swish]\n" "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three "fadd v19.8h, v2.8h, v18.8h\n" "fadd v20.8h, v10.8h, v18.8h\n" "fmax v19.8h, v19.8h, v0.8h\n" @@ -869,69 +835,64 @@ EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, F "fmul v10.8h, v20.8h, v10.8h\n" "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - "str q10, [%[out_1]]\n" //out_o1hw0 - :[out_0]"+r"(out_o0hw0), - [out_1]"+r"(out_o1hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v10", "v18", "v19", "v20", "v21", "v22", "v23", "x0", "x1", "x2", "x3" - ); + "str q2, [%[out_0]]\n" // out_o0hw0 + "str q10, [%[out_1]]\n" // out_o1hw0 + : [out_0] "+r"(out_o0hw0), [out_1] "+r"(out_o1hw0), [in_0] "+r"(in_hw0), + [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v19", "v20", "v21", "v22", + "v23", "x0", "x1", "x2", "x3"); b0 += 16; b1 += 16; } if (oc & 1) { // oc%2 != 0 - const F16 *f_r = filterArray + ic*fh*fw*8 + (oc-1)*8*ic*8; + const F16 *f_r = pwFilterArray + (oc - 1) * 8 * ic * 8; F16 *in_hw0 = in_pack; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + (oc-1)*ohow*8 + hw*8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + (oc - 1) * ohow * 8 + hw * 8; // bias - const F16 *b_o0 = biasArray + ic*8 + (oc-1)*8; + const F16 *b_o0 = pwBiasArray + (oc - 1) * 8; __asm__ __volatile__( - "ldr q22, [%[b_0]]\n" //b_o0 - "mov x0, %[ic]\n" //ic_blk - "ldr h0, [%[in_0]]\n" //in_hw0 - "mov v2.16b, v22.16b\n" //out_o0hw0 - "ldr q18, [%[f_0]]\n" //f_o0c0 + "ldr q22, [%[b_0]]\n" // b_o0 + "mov x0, %[ic]\n" // ic_blk + "ldr h0, [%[in_0]]\n" // in_hw0 + "mov v2.16b, v22.16b\n" // out_o0hw0 + "ldr q18, [%[f_0]]\n" // f_o0c0 "0:\n" - "ldr h1, [%[in_0], #2]\n" //in_hw0 + "ldr h1, [%[in_0], #2]\n" // in_hw0 "fmla v2.8h, v18.8h, v0.h[0]\n" - "ldr q20, [%[f_0], #16]\n" //f_o0c0 + "ldr q20, [%[f_0], #16]\n" // f_o0c0 "subs x0, x0, #2\n" - "ldr h0, [%[in_0], #4]\n" //in_hw0 + "ldr h0, [%[in_0], #4]\n" // in_hw0 "fmla v2.8h, v20.8h, v1.h[0]\n" - "ldr q18, [%[f_0], #32]\n" //f_o0c0 + "ldr q18, [%[f_0], #32]\n" // f_o0c0 "add %[in_0], %[in_0], #4\n" "add %[f_0], %[f_0], #32\n" "bne 0b\n" "cmp %[pointwiseActivationMode], %[am_relu]\n" "bne 11f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero + "eor v0.16b, v0.16b, v0.16b\n" // zero "fmax v2.8h, v2.8h, v0.8h\n" "11:\n" "cmp %[pointwiseActivationMode], 
%[am_relu6]\n" "bne 12f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six "fmax v2.8h, v2.8h, v0.8h\n" "fmin v2.8h, v2.8h, v1.8h\n" "12:\n" "cmp %[pointwiseActivationMode], %[am_h_swish]\n" "bne 13f\n" - "eor v0.16b, v0.16b, v0.16b\n" //zero - "movi v1.8h, #0x46, lsl #8\n" //six - "movi v18.8h, #0x42, lsl #8\n" //three + "eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v1.8h, #0x46, lsl #8\n" // six + "movi v18.8h, #0x42, lsl #8\n" // three "fadd v20.8h, v2.8h, v18.8h\n" "fmax v20.8h, v20.8h, v0.8h\n" "fmin v20.8h, v20.8h, v1.8h\n" @@ -939,18 +900,14 @@ EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, F "fmul v2.8h, v20.8h, v2.8h\n" "13:\n" - "str q2, [%[out_0]]\n" //out_o0hw0 - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_r) - :[ic]"r"((I64)ic*8), - [b_0]"r"(b_o0), - [pointwiseActivationMode]"r"((I64)pointwiseActivationDesc.mode), - [am_relu]"r"((I64)ACTIVATION_RELU), - [am_relu6]"r"((I64)ACTIVATION_RELU6), - [am_h_swish]"r"((I64)ACTIVATION_H_SWISH) - :"memory", "cc", "v0", "v1", "v2", "v10", "v18", "v20", "v22", "x0", "x1", "x2" - ); + "str q2, [%[out_0]]\n" // out_o0hw0 + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_r) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v10", "v18", "v20", "v22", "x0", "x1", + "x2"); } } } diff --git a/tensor_computing/src/cpu/arm/fp16/eltwise.cpp b/compute/tensor/src/cpu/arm/fp16/eltwise.cpp similarity index 56% rename from tensor_computing/src/cpu/arm/fp16/eltwise.cpp rename to compute/tensor/src/cpu/arm/fp16/eltwise.cpp index 69be7620..6f8fc40f 100644 --- a/tensor_computing/src/cpu/arm/fp16/eltwise.cpp +++ b/compute/tensor/src/cpu/arm/fp16/eltwise.cpp @@ -1,64 +1,52 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
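Note on the activation tail in the assembly above: after the accumulation loop, the kernel branches on pointwiseActivationMode. ReLU clamps at zero, ReLU6 additionally clamps at six (movi v1.8h, #0x46, lsl #8 materializes the fp16 bit pattern 0x4600 = 6.0), and h-swish computes x * clip(x + 3, 0, 6) / 6 (0x4200 = 3.0; the divide by six sits in the elided hunk lines). A minimal intrinsics sketch of the same dispatch, with a stand-in enum for bolt's ACTIVATION_* values:

```cpp
// Sketch only: the activation tail rewritten with NEON intrinsics.
// Assumes an AArch64 toolchain with native fp16 support (-march=armv8.2-a+fp16).
#include <arm_neon.h>

enum class ActMode { None, Relu, Relu6, HSwish }; // stand-ins for ACTIVATION_*

static inline float16x8_t activate_f16x8(float16x8_t x, ActMode mode)
{
    const float16x8_t zero = vdupq_n_f16(0);  // "eor v0.16b, v0.16b, v0.16b"
    const float16x8_t three = vdupq_n_f16(3); // movi #0x42, lsl #8 -> 0x4200 = 3.0
    const float16x8_t six = vdupq_n_f16(6);   // movi #0x46, lsl #8 -> 0x4600 = 6.0
    switch (mode) {
        case ActMode::Relu:
            return vmaxq_f16(x, zero);
        case ActMode::Relu6:
            return vminq_f16(vmaxq_f16(x, zero), six);
        case ActMode::HSwish: {
            float16x8_t t = vaddq_f16(x, three);    // x + 3
            t = vminq_f16(vmaxq_f16(t, zero), six); // clip to [0, 6]
            return vmulq_f16(x, vdivq_f16(t, six)); // x * clipped / 6
        }
        default:
            return x;
    }
}
```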
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#include #include "cpu/arm/fp16/tensor_computing_fp16.h" +#include "cpu/cpu_functions.h" -float16x8_t getHalfVector(void* input, int inputSize, int index) { - float16x8_t result; - if (inputSize == 1) { - result = vdupq_n_f16(*((F16*)input)); - return result; - } - int local = index % inputSize; - int remain = inputSize - local; - if (remain >= 8) { - result = vld1q_f16((F16*)(input) + local); - } else { - F16 buffer[8]; - F16 *ptr = (F16*)input; - memcpy(buffer, ptr+local, sizeof(F16)*remain); - for (int i = 0; i < 8 - remain; i++) { - buffer[remain+i] = ptr[i % inputSize]; - } - result = vld1q_f16(buffer); - } - return result; -} - -F32 getHalfScalar(void* input, int inputSize, int index) { - int local = index % inputSize; - return ((F16*)input)[local]; -} - -EE eltwise_fp16(std::vector<void*>input, std::vector<U32> inputSize, U32 num, U32 len, void *output, EltwiseMode eltwiseMode) { +EE eltwise_fp16(std::vector<void *> input, + std::vector<U32> inputSize, + U32 num, + U32 len, + void *output, + EltwiseMode eltwiseMode) +{ + F16 buffer[8]; U32 len_tail = len % 8; U32 len_main = len - len_tail; F16 *output_ptr = (F16 *)output; - for (U32 i = 0; i < len_main; i+=8){ - float16x8_t tmp_v = getHalfVector(input[0], inputSize[0], i); + F16 *tmp = buffer; + for (U32 i = 0; i < len_main; i += 8) { + get_vector((F16 *)input[0], inputSize[0], &tmp, 8, i, 8, buffer); + float16x8_t tmp_v = vld1q_f16(tmp); for (U32 j = 1; j < num; j++) { - float16x8_t value_v = getHalfVector(input[j], inputSize[j], i); + get_vector((F16 *)input[j], inputSize[j], &tmp, 8, i, 8, buffer); + float16x8_t value_v = vld1q_f16(tmp); switch (eltwiseMode) { case ELTWISE_SUM: - tmp_v = vaddq_f16(value_v, tmp_v); + tmp_v = vaddq_f16(tmp_v, value_v); break; case ELTWISE_MAX: - tmp_v = vmaxq_f16(value_v, tmp_v); + tmp_v = vmaxq_f16(tmp_v, value_v); break; case ELTWISE_PROD: - tmp_v = vmulq_f16(value_v, tmp_v); + tmp_v = vmulq_f16(tmp_v, value_v); + break; + case ELTWISE_SUB: + tmp_v = vsubq_f16(tmp_v, value_v); + break; + case ELTWISE_DIV: + tmp_v = vdivq_f16(tmp_v, value_v); break; default: return NOT_SUPPORTED; @@ -66,10 +54,12 @@ EE eltwise_fp16(std::vector<void*>input, std::vector<U32> inputSize, U32 num, U3 } vst1q_f16(output_ptr + i, tmp_v); } - for (U32 i = len_main; i < len; i++){ - F32 tmp_s = getHalfScalar(input[0], inputSize[0], i); + for (U32 i = len_main; i < len; i++) { + get_vector((F16 *)input[0], inputSize[0], &tmp, 8, i, 1, buffer); + F32 tmp_s = tmp[0]; for (U32 j = 1; j < num; j++) { - F32 value_s = getHalfScalar(input[j], inputSize[j], i); + get_vector((F16 *)input[j], inputSize[j], &tmp, 8, i, 1, buffer); + F32 value_s = tmp[0]; switch (eltwiseMode) { case ELTWISE_SUM: tmp_s = value_s + tmp_s; @@ -80,6 +70,12 @@ EE eltwise_fp16(std::vector<void*>input, std::vector<U32> inputSize, U32 num, U3 case ELTWISE_PROD: tmp_s *= value_s; break; + case ELTWISE_SUB: + tmp_s -= value_s; + break; + case ELTWISE_DIV: + tmp_s /= 
value_s; + break; default: return NOT_SUPPORTED; } @@ -87,4 +83,4 @@ EE eltwise_fp16(std::vectorinput, std::vector inputSize, U32 num, U3 output_ptr[i] = tmp_s; } return SUCCESS; -} +} diff --git a/compute/tensor/src/cpu/arm/fp16/lstm.cpp b/compute/tensor/src/cpu/arm/fp16/lstm.cpp new file mode 100644 index 00000000..ec00702b --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/lstm.cpp @@ -0,0 +1,263 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include "cpu/arm/fp16/tensor_computing_fp16.h" + +void mvm_nkn32(U32 fn, U32 fk, const F16 *filterArray, F16 *input, F16 *output) +{ + for (U32 n = 0; n < fn; n++) { + F16 *in = input; + const F16 *f = filterArray + n * fk * 32; + __asm__ __volatile__("ldr s0, [%[in]]\n" + "ldr q1, [%[out]]\n" + "ldr q2, [%[out], #16]\n" + "ldr q3, [%[out], #32]\n" + "ldr q4, [%[out], #48]\n" + "mov x0, %[k]\n" + "ldr q5, [%[f]]\n" + "ldr q6, [%[f], #16]\n" + "ldr q7, [%[f], #32]\n" + "ldr q8, [%[f], #48]\n" + "0:\n" + "prfm pldl2strm, [%[f], #4096]\n" + "prfm pldl1strm, [%[f], #1024]\n" + "ldr d9, [%[f], #64]\n" + "fmla v1.8h, v5.8h, v0.h[0]\n" + "ldr x9, [%[f], #72]\n" + "ins v9.d[1], x9\n" + "ldr d10, [%[f], #80]\n" + "fmla v2.8h, v6.8h, v0.h[0]\n" + "ldr x10, [%[f], #88]\n" + "ins v10.d[1], x10\n" + "ldr d11, [%[f], #96]\n" + "fmla v3.8h, v7.8h, v0.h[0]\n" + "ldr x11, [%[f], #104]\n" + "ins v11.d[1], x11\n" + "ldr d12, [%[f], #112]\n" + "fmla v4.8h, v8.8h, v0.h[0]\n" + "ldr x12, [%[f], #120]\n" + "ins v12.d[1], x12\n" + + "ldr d5, [%[f], #128]\n" + "fmla v1.8h, v9.8h, v0.h[1]\n" + "ldr x5, [%[f], #136]\n" + "ins v5.d[1], x5\n" + "ldr d6, [%[f], #144]\n" + "fmla v2.8h, v10.8h, v0.h[1]\n" + "ldr x6, [%[f], #152]\n" + "ins v6.d[1], x6\n" + "ldr d7, [%[f], #160]\n" + "fmla v3.8h, v11.8h, v0.h[1]\n" + "ldr x7, [%[f], #168]\n" + "ins v7.d[1], x7\n" + "ldr d8, [%[f], #176]\n" + "fmla v4.8h, v12.8h, v0.h[1]\n" + "ldr x8, [%[f], #184]\n" + "add %[in], %[in], #4\n" + "ins v8.d[1], x8\n" + "add %[f], %[f], #128\n" + "ldr s0, [%[in]]\n" + "sub x0, x0, #2\n" + + "cmp x0, #3\n" + "bgt 0b\n" + "ldr q9, [%[f], #64]\n" + "ldr q10, [%[f], #80]\n" + "ldr q11, [%[f], #96]\n" + "ldr q12, [%[f], #112]\n" + "fmla v1.8h, v5.8h, v0.h[0]\n" + "fmla v2.8h, v6.8h, v0.h[0]\n" + "fmla v3.8h, v7.8h, v0.h[0]\n" + "fmla v4.8h, v8.8h, v0.h[0]\n" + "fmla v1.8h, v9.8h, v0.h[1]\n" + "fmla v2.8h, v10.8h, v0.h[1]\n" + "fmla v3.8h, v11.8h, v0.h[1]\n" + "fmla v4.8h, v12.8h, v0.h[1]\n" + "cmp x0, #3\n" + "bne 1f\n" + 
"ldr h0, [%[in], #4]\n" + "ldr q5, [%[f], #128]\n" + "ldr q6, [%[f], #144]\n" + "ldr q7, [%[f], #160]\n" + "ldr q8, [%[f], #176]\n" + "fmla v1.8h, v5.8h, v0.h[0]\n" + "fmla v2.8h, v6.8h, v0.h[0]\n" + "fmla v3.8h, v7.8h, v0.h[0]\n" + "fmla v4.8h, v8.8h, v0.h[0]\n" + + "1:\n" + "str q1, [%[out]]\n" + "str q2, [%[out], #16]\n" + "str q3, [%[out], #32]\n" + "str q4, [%[out], #48]\n" + : [out] "+r"(output), [f] "+r"(f), [in] "+r"(in) + : [k] "r"((I64)fk) + : "memory", "cc", "x0", "x5", "x6", "x7", "x8", "x9", "x10", "x11", + "x12", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12"); + output += 32; + } +} + +EE rnncell_fp16(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + void *state, + U32 tmpBytes, + void *tmp, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + void *output, + Arch arch) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + UNUSED(arch); + if (nullptr == currentX || nullptr == filter || nullptr == bias || nullptr == state || + nullptr == tmp || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ix; + U32 on, oh; + U32 fk, fn; + CHECK_STATUS(tensor2dGet(xDesc, &idt, &idf, &in, &ix)); + CHECK_STATUS(tensor2dGet(filterDesc[0], &fdt, &fdf, &fn, &fk)); + CHECK_STATUS(tensor2dGet(hDesc, &odt, &odf, &on, &oh)); + if (fdf != DF_NKN32) { + CHECK_STATUS(NOT_MATCH); + } + fn /= 32; + + U32 batch = in; + I32 xDim = ix; + I32 hDim = rnnParamSpec.numOutput; + I32 column = (rnnParamSpec.numProjection > 0) ? rnnParamSpec.numProjection + : rnnParamSpec.numOutput; + if (!(idt == DT_F16 && fdt == DT_F16 && odt == DT_F16)) { + CHECK_STATUS(NOT_MATCH); + } + if (!(4 * column == (I32)fn * 32 && (ix + oh) == fk && in == on)) { + CHECK_STATUS(NOT_MATCH); + } + F32 forgetBias = rnnParamSpec.forgetBias; + ActivationMode activationMode = rnnParamSpec.activationMode; + if (activationMode != ACTIVATION_TANH) { + CHECK_STATUS(NOT_SUPPORTED); + } + + const F16 *currentXArray = (const F16 *)currentX; + F16 *lastStateArray = (F16 *)state; + F16 *lastHArray = lastStateArray + column; + F16 *tmpArray = (F16 *)tmp; + F16 *currentStateArray = (F16 *)state; + F16 *currentHArray = currentStateArray + column; + F16 *outputArray = (F16 *)output; + F16 *xhArray = tmpArray; + F16 *intermediateH = xhArray + (xDim + hDim); + U32 lastStateStride = column + hDim; + U32 lastHStride = column + hDim; + U32 currentStateStride = column + hDim; + U32 currentHStride = column + hDim; + float16x8_t forgetBiasVector = vdupq_n_f16(forgetBias); + for (U32 m = 0; m < batch; m++) { + F16 *lastBatchH = lastHArray + m * lastHStride; + memcpy(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F16)); + memcpy(xhArray + xDim, lastBatchH, hDim * sizeof(F16)); + + memcpy(intermediateH, bias[0], column * 4 * sizeof(F16)); + mvm_nkn32(fn, fk, (const F16 *)filter[0], xhArray, intermediateH); + + F16 *out_i = intermediateH; + F16 *out_g = out_i + column; + F16 *out_f = out_i + column * 2; + F16 *out_o = out_i + column * 3; + + F16 *lastBatchState = lastStateArray + m * lastStateStride; + F16 *currentBatchState = currentStateArray + m * currentStateStride; + F16 *currentBatchH = currentHArray + m * currentHStride; + F16 *currentOutput = outputArray + m * batchStrideH; + + F16 *tmpState, *tmpHH, *tmpH; + if (rnnParamSpec.zoneoutCell == 0) { + tmpState = currentBatchState; + } else { + tmpState = out_i; + } + if 
(rnnParamSpec.numProjection > 0) { + tmpHH = out_g; + tmpH = currentOutput; + } else { + tmpHH = currentOutput; + tmpH = out_g; + } + + I32 h = 0; + for (; h < column - 7; h += 8) { + float16x8_t out_i_v = vld1q_f16(out_i + h); + float16x8_t out_g_v = vld1q_f16(out_g + h); + float16x8_t out_f_v = vld1q_f16(out_f + h); + float16x8_t out_o_v = vld1q_f16(out_o + h); + float16x8_t C_v = vld1q_f16(lastBatchState + h); + float16x8_t I_v = vsigmoidq_f16(out_i_v); + float16x8_t F_v = vsigmoidq_f16(vaddq_f16(out_f_v, forgetBiasVector)); + float16x8_t O_v = vsigmoidq_f16(out_o_v); + float16x8_t G_v = vtanhq_f16(out_g_v); + C_v = vaddq_f16_f32(vmulq_f16(C_v, F_v), vmulq_f16(I_v, G_v)); + float16x8_t out_hidden_v = vmulq_f16(O_v, vtanhq_f16(C_v)); + vst1q_f16(tmpState + h, C_v); + vst1q_f16(tmpHH + h, out_hidden_v); + } + for (; h < column; h++) { + F16 C_s = lastBatchState[h]; + F16 I_s = 1.0 / (1.0 + exp(-out_i[h])); + F16 F_s = 1.0 / (1.0 + exp(-(out_f[h] + forgetBias))); + F16 O_s = 1.0 / (1.0 + exp(-out_o[h])); + F16 G_s = tanh(out_g[h]); + C_s = C_s * F_s + I_s * G_s; + F16 value = O_s * tanh(C_s); + tmpState[h] = C_s; + tmpHH[h] = value; + } + if (rnnParamSpec.zoneoutCell != 0) { + array_scale_f16(tmpState, tmpState, column, 1 - rnnParamSpec.zoneoutCell, 0); + array_scale_f16(lastBatchState, lastBatchState, column, rnnParamSpec.zoneoutCell, 0); + array_add_f16(tmpState, lastBatchState, currentBatchState, column); + } + + if (rnnParamSpec.numProjection > 0) { + memset(tmpH, 0, sizeof(F16) * hDim); + mvm_nkn32(hDim / 32, rnnParamSpec.numProjection, (const F16 *)filter[1], tmpHH, tmpH); + } + if (rnnParamSpec.zoneoutOutput != 0) { + if (rnnParamSpec.numProjection > 0) { + array_scale_f16(tmpH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + } else { + array_scale_f16(tmpHH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + } + array_scale_f16(lastBatchH, lastBatchH, hDim, rnnParamSpec.zoneoutOutput, 0); + array_add_f16(out_f, lastBatchH, currentBatchH, hDim); + } else { + memcpy(currentBatchH, currentOutput, sizeof(F16) * hDim); + } + } + return SUCCESS; +} diff --git a/tensor_computing/src/cpu/arm/fp16/normalization.cpp b/compute/tensor/src/cpu/arm/fp16/normalization.cpp similarity index 72% rename from tensor_computing/src/cpu/arm/fp16/normalization.cpp rename to compute/tensor/src/cpu/arm/fp16/normalization.cpp index 7621c132..503e2970 100644 --- a/tensor_computing/src/cpu/arm/fp16/normalization.cpp +++ b/compute/tensor/src/cpu/arm/fp16/normalization.cpp @@ -1,61 +1,62 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
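The rnncell_fp16 kernel above computes all four gate pre-activations (i, g, f, o) with a single fused matrix-vector product: mvm_nkn32 expects the weight matrix pre-transformed to DF_NKN32, in which every block of 32 output channels is contiguous for each input element, so one input scalar feeds four back-to-back q-register FMLAs. The recurrence the NEON loop then applies per element is the standard LSTM update, visible verbatim in the scalar tail; a self-contained scalar reference:

```cpp
// Scalar reference for one LSTM element, mirroring the vector loop above.
// i/g/f/o are the four gate pre-activations produced by mvm_nkn32; c is the
// previous cell state; forgetBias corresponds to rnnParamSpec.forgetBias.
#include <cmath>

static inline float sigmoidf(float x)
{
    return 1.0f / (1.0f + std::exp(-x));
}

static void lstm_element(
    float i, float g, float f, float o, float forgetBias, float &c, float &h)
{
    float I = sigmoidf(i);
    float F = sigmoidf(f + forgetBias); // bias keeps the forget gate open initially
    float O = sigmoidf(o);
    float G = std::tanh(g);
    c = c * F + I * G;    // new cell state
    h = O * std::tanh(c); // hidden output, before optional projection/zoneout
}
```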
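One more note before the normalization hunk: the eltwise_fp16 rewrite earlier in this patch drops the file-local getHalfVector/getHalfScalar helpers in favor of the shared get_vector, which broadcasts a shorter operand (a scalar, or a vector that repeats every inputSize elements) against the running element index. A sketch of the wrap-around load that the removed helper implemented and that get_vector is presumed to generalize:

```cpp
// Sketch of the broadcast load replaced by get_vector in eltwise_fp16.
// float16_t comes from arm_neon.h on fp16-capable AArch64 targets.
#include <arm_neon.h>
#include <cstring>

static float16x8_t load_broadcast_f16x8(const float16_t *data, int dataLen, int index)
{
    if (dataLen == 1) {
        return vdupq_n_f16(data[0]); // single value: broadcast to all lanes
    }
    int local = index % dataLen;     // position inside the short operand
    if (dataLen - local >= 8) {
        return vld1q_f16(data + local); // fast path: eight contiguous lanes
    }
    float16_t buffer[8];             // slow path: wrap around the end
    int remain = dataLen - local;
    std::memcpy(buffer, data + local, remain * sizeof(float16_t));
    for (int i = 0; i < 8 - remain; i++) {
        buffer[remain + i] = data[i % dataLen];
    }
    return vld1q_f16(buffer);
}
```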
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include #include "cpu/arm/fp16/tensor_computing_fp16.h" -inline void array_norm_scale_fp16(F16 *input, F16 *output, I32 len, F32 mean, F32 var, F16 *alpha, F16 *beta) { +inline void array_norm_scale_fp16( + F16 *input, F16 *output, I32 len, F32 mean, F32 var, F16 *alpha, F16 *beta) +{ F32 eps = 1e-6; F32 std_value = sqrt(var + eps); float16x8_t mean_v = vdupq_n_f16(mean); - float16x8_t std_v = vdupq_n_f16(std_value); + float16x8_t std_v = vdupq_n_f16(std_value); I32 i = 0; - for(i = 0; i < len - 7; i += 8){ + for (i = 0; i < len - 7; i += 8) { float16x8_t in = vld1q_f16(input + i); float16x8_t alpha_v = vld1q_f16(alpha + i); - float16x8_t beta_v = vld1q_f16(beta + i); + float16x8_t beta_v = vld1q_f16(beta + i); float16x8_t tmp_v = vsubq_f16(in, mean_v); tmp_v = vdivq_f16(tmp_v, std_v); tmp_v = vfmaq_f16(beta_v, alpha_v, tmp_v); - vst1q_f16(output+i, tmp_v); + vst1q_f16(output + i, tmp_v); } - for(; i < len; i++){ + for (; i < len; i++) { output[i] = alpha[i] * (input[i] - mean) / std_value + beta[i]; } } -EE layer_normalization_fp16(F16 *alpha, F16 *beta, - TensorDesc inputDesc, F16* input, - TensorDesc outputDesc, F16* output) +EE layer_normalization_fp16( + TensorDesc inputDesc, F16 *input, F16 *alpha, F16 *beta, TensorDesc outputDesc, F16 *output) { UNUSED(outputDesc); - if (nullptr == alpha || nullptr == beta || nullptr == input || nullptr == output) + if (nullptr == alpha || nullptr == beta || nullptr == input || nullptr == output) { CHECK_STATUS(NULL_POINTER); + } U32 size = tensorNumElements(inputDesc); I32 size_inner = inputDesc.dims[0]; I32 size_outer = size / size_inner; - for(I32 i = 0; i < size_outer; i++) { + for (I32 i = 0; i < size_outer; i++) { F16 *current_input = input + i * size_inner; F16 *current_output = output + i * size_inner; F32 mean = array_mean_f16(current_input, size_inner); - F32 var = array_var_f16(current_input, size_inner, mean); + F32 var = array_var_f16(current_input, size_inner, mean); array_norm_scale_fp16(current_input, current_output, size_inner, mean, var, alpha, beta); } - + return SUCCESS; } diff --git a/compute/tensor/src/cpu/arm/fp16/pooling.cpp b/compute/tensor/src/cpu/arm/fp16/pooling.cpp new file mode 100644 index 00000000..b8d87d25 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/pooling.cpp @@ -0,0 +1,91 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
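The layer_normalization_fp16 change above is mostly a signature reorder (tensor descriptors first, then alpha/beta), but the computation deserves a plain statement: every innermost row is normalized by its own mean and variance with eps = 1e-6, then given a per-element affine transform. A scalar reference:

```cpp
// Scalar reference for one layer-norm row, matching the fp16 kernel above.
#include <cmath>

static void layer_norm_row(
    const float *x, const float *alpha, const float *beta, int len, float *y)
{
    float mean = 0;
    for (int i = 0; i < len; i++) {
        mean += x[i];
    }
    mean /= len;
    float var = 0;
    for (int i = 0; i < len; i++) {
        float d = x[i] - mean;
        var += d * d;
    }
    var /= len;                          // biased variance, as the array_var_* helpers appear to compute
    float stdv = std::sqrt(var + 1e-6f); // eps matches the kernel's F32 eps = 1e-6
    for (int i = 0; i < len; i++) {
        y[i] = alpha[i] * (x[i] - mean) / stdv + beta[i];
    }
}
```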
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/fp16/tensor_computing_fp16.h" + +EE pooling_c8_fp16(const F16 *input, + U32 stride, + int hstart, + int hend, + int wstart, + int wend, + F16 *output, + PoolingParamSpec poolingParamSpec) +{ + EE ret = SUCCESS; + PoolingMode pm = poolingParamSpec.mode; + float16x8_t in1, out1; + float16x8_t poolSize = vdupq_n_f16(float16_t((hend - hstart) * (wend - wstart))); + out1 = vdupq_n_f16(float16_t((pm == POOLING_MAX) ? UNI_F16_MIN : 0)); + for (int kernelH = hstart; kernelH < hend; kernelH++) { + for (int kernelW = wstart; kernelW < wend; kernelW++) { + const U32 index = (kernelH * stride + kernelW) * 8; + in1 = vld1q_f16(input + index); + switch (pm) { + case POOLING_MAX: + out1 = vmaxq_f16(in1, out1); + break; + case POOLING_MEAN: + out1 = vaddq_f16(out1, in1); + break; + default: + ret = NOT_SUPPORTED; + break; + } + } + } + vst1q_f16(output, ((pm == POOLING_MAX) ? out1 : vdivq_f16(out1, poolSize))); + return ret; +} + +EE pooling_c8_big_fp16(const F16 *input, + U32 stride, + int hstart, + int hend, + int wstart, + int wend, + F16 *output, + int poolSize) +{ + EE ret = SUCCESS; + float32x4_t out0, out1; + float32x4_t p = vdupq_n_f32(poolSize); + float16x4_t in0, in1, temp0, temp1; + temp0 = vdup_n_f16(0); + temp1 = temp0; + out0 = vdupq_n_f32(0); + out1 = out0; + int count = 0; + for (int kernelH = hstart; kernelH < hend; kernelH++) { + for (int kernelW = wstart; kernelW < wend; kernelW++, count++) { + const U32 index = (kernelH * stride + kernelW) * 8; + in0 = vld1_f16(input + index); + in1 = vld1_f16(input + index + 4); + temp0 = vadd_f16(temp0, in0); + temp1 = vadd_f16(temp1, in1); + if (count % 256 == 255) { + out0 = vaddq_f32(out0, vcvt_f32_f16(temp0)); + out1 = vaddq_f32(out1, vcvt_f32_f16(temp1)); + temp0 = vdup_n_f16(0); + temp1 = temp0; + } + } + } + out0 = vaddq_f32(out0, vcvt_f32_f16(temp0)); + out1 = vaddq_f32(out1, vcvt_f32_f16(temp1)); + out0 = vdivq_f32(out0, p); + out1 = vdivq_f32(out1, p); + vst1_f16(output, vcvt_f16_f32(out0)); + vst1_f16(output + 4, vcvt_f16_f32(out1)); + return ret; +} diff --git a/compute/tensor/src/cpu/arm/fp16/prelu.cpp b/compute/tensor/src/cpu/arm/fp16/prelu.cpp new file mode 100644 index 00000000..a8fa835c --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/prelu.cpp @@ -0,0 +1,61 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
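pooling_c8_fp16 above processes one 8-channel block per call, holding the running max or sum in a single q register; pooling_c8_big_fp16 exists because a long fp16 mean-pool sum degrades (fp16 carries only 10 mantissa bits), so it drains the fp16 partial sums into fp32 accumulators every 256 additions. The accumulation idea in isolation, under an assumed four-lanes-per-step layout:

```cpp
// Mixed-precision accumulation as in pooling_c8_big_fp16: keep short-lived
// fp16 partial sums, flush them to fp32 before precision degrades.
#include <arm_neon.h>

static float32x4_t mean_pool_f16_to_f32(const float16_t *data, int steps)
{
    float32x4_t acc = vdupq_n_f32(0);
    float16x4_t partial = vdup_n_f16(0);
    for (int i = 0; i < steps; i++) {
        partial = vadd_f16(partial, vld1_f16(data + i * 4));
        if (i % 256 == 255) { // flush while the fp16 sum is still exact enough
            acc = vaddq_f32(acc, vcvt_f32_f16(partial));
            partial = vdup_n_f16(0);
        }
    }
    acc = vaddq_f32(acc, vcvt_f32_f16(partial));
    return vdivq_f32(acc, vdupq_n_f32((float)steps));
}
```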
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/tensor_computing_arm.h" +#include "cpu/arm/fp16/tensor_computing_fp16.h" + +EE prelu_fp16(TensorDesc inputDesc, + F16 *input, + F16 *weight, + PReLUParamSpec preluDesc, + TensorDesc outputDesc, + F16 *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, odt; + DataFormat idf, odf; + U32 in = 0, ic = 0, ih = 0, iw = 0, on = 0, oc = 0, oh = 0, ow = 0; + if (tensorIs4d(inputDesc) && tensorIs4d(outputDesc)) { + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + if (idf != DF_NCHWC8) { + CHECK_STATUS(NOT_SUPPORTED); + } + } else { + return NOT_SUPPORTED; + } + + CHECK_REQUIREMENT(in == on && ic == oc && ih == oh && iw == ow); + ic /= 8; + float16x8_t slope; + uint16x8_t mask; + float16x8_t in0, out0; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 hw = 0; hw < ih * iw; hw++) { + slope = preluDesc.propagate_down ? vdupq_n_f16(weight[0]) + : vld1q_f16(weight + c * 8); + in0 = vld1q_f16(input); + mask = vcleq_f16(in0, vdupq_n_f16(0.f)); + float16x8_t tmp = vmulq_f16(in0, slope); + out0 = vbslq_f16(mask, tmp, in0); + vst1q_f16(output, out0); + input += 8; + output += 8; + } + } + } + return SUCCESS; +} diff --git a/tensor_computing/src/cpu/arm/fp16/quantize.cpp b/compute/tensor/src/cpu/arm/fp16/quantize.cpp similarity index 73% rename from tensor_computing/src/cpu/arm/fp16/quantize.cpp rename to compute/tensor/src/cpu/arm/fp16/quantize.cpp index a9fbf7ea..8adfa1cd 100644 --- a/tensor_computing/src/cpu/arm/fp16/quantize.cpp +++ b/compute/tensor/src/cpu/arm/fp16/quantize.cpp @@ -1,37 +1,31 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
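The new PReLU kernel above is branch-free per lane: vcleq_f16 produces an all-ones mask in lanes where x <= 0, and vbslq_f16 selects slope * x in those lanes and x elsewhere; propagate_down switches between one shared slope and per-channel weights. The select in isolation:

```cpp
// The lane-select at the heart of prelu_fp16: PReLU(x) = x > 0 ? x : slope * x.
#include <arm_neon.h>

static inline float16x8_t prelu_f16x8(float16x8_t x, float16x8_t slope)
{
    uint16x8_t negMask = vcleq_f16(x, vdupq_n_f16(0)); // all-ones where x <= 0
    float16x8_t scaled = vmulq_f16(x, slope);          // slope * x
    return vbslq_f16(negMask, scaled, x);              // pick scaled on negative lanes
}
```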
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include #include #include "cpu/arm/fp16/tensor_computing_fp16.h" -inline void apply_scale_f16(U32 numData, F16* array, F16 scale, INT8* qArray) +inline void apply_scale_f16(U32 numData, F16 *array, F16 scale, INT8 *qArray, bool clamp) { - for (U32 i=0; i 127.0) { - qArray[i] = 127; - } else if (tmp < -127.0) { - qArray[i] = -127; - } else { - qArray[i] = round(tmp); - } + qArray[i] = round_towards_zero(tmp, clamp); } } -EE quantize_tensor_fp16(TensorDesc dDesc, const void* data, TensorDesc* qDesc, void* qData, F16 *scale) +EE quantize_tensor_fp16( + TensorDesc dDesc, const void *data, TensorDesc *qDesc, void *qData, F16 *scale) { if (nullptr == data || nullptr == qDesc || nullptr == qData || nullptr == scale) { CHECK_STATUS(NULL_POINTER); @@ -40,7 +34,7 @@ EE quantize_tensor_fp16(TensorDesc dDesc, const void* data, TensorDesc* qDesc, v DataFormat df; U32 n, c, h, w; if (tensorIs2d(dDesc)) { - CHECK_STATUS(tensor2dfGet(dDesc, &dt, &df, &n, &w)); + CHECK_STATUS(tensor2dGet(dDesc, &dt, &df, &n, &w)); c = 1; h = 1; } else if (tensorIs3d(dDesc)) { @@ -49,21 +43,21 @@ EE quantize_tensor_fp16(TensorDesc dDesc, const void* data, TensorDesc* qDesc, v } else { CHECK_STATUS(tensor4dGet(dDesc, &dt, &df, &n, &c, &h, &w)); } - + switch (dt) { case DT_F16: { switch (df) { - case DF_HWNCN8C4:{ // winograd - F16 *array = (F16*)data; - for (U32 idx=0; idx<36; idx++) { - float16x8_t tmp_v = vld1q_f16(array + idx*8*c); + case DF_HWNCN8C4: { // winograd + F16 *array = (F16 *)data; + for (U32 idx = 0; idx < 36; idx++) { + float16x8_t tmp_v = vld1q_f16(array + idx * 8 * c); float16x8_t max_v = tmp_v; float16x8_t min_v = tmp_v; - for (U32 o=0; o 0 + } else { // min > 0 scale[idx] = 127.0 / max; } - INT8* qArray = (INT8*)qData; - for (U32 o=0; o= 8); U32 i = 8; for (; i < numData - 7; i += 8) { - tmp_v = 
vld1q_f16(array+i); + tmp_v = vld1q_f16(array + i); max_v = vmaxq_f16(max_v, tmp_v); min_v = vminq_f16(min_v, tmp_v); } @@ -121,7 +122,9 @@ EE quantize_tensor_fp16(TensorDesc dDesc, const void* data, TensorDesc* qDesc, v } } if (max == 0 && min == 0) { - CHECK_STATUS(NOT_SUPPORTED); + *scale = 1; + memset(qData, 0, tensorNumBytes(*qDesc)); + break; } F16 scaleRaw; if (max > 0 && min < 0) { @@ -130,34 +133,26 @@ EE quantize_tensor_fp16(TensorDesc dDesc, const void* data, TensorDesc* qDesc, v scaleRaw = (scale_max < scale_min) ? scale_max : scale_min; } else if (max < 0) { scaleRaw = -127.0 / min; - } else { // min > 0 + } else { // min > 0 scaleRaw = 127.0 / max; } - DEBUG_info(max << " is the max FP16 value, and min value is " << min); + UNI_DEBUG_LOG("%f is the max FP16 value, and min value is %f\n", max, min); if (*scale < scaleRaw) { *scale = scaleRaw; } - INT8* qArray = (INT8*)qData; - apply_scale_f16(numData, array, *scale, qArray); - - if (tensorIs2d(dDesc)) { - *qDesc = tensor2df(DT_I8, df, n, w); - } else if (tensorIs3d(dDesc)) { - *qDesc = tensor3df(DT_I8, df, n, h, w); - } else { - *qDesc = tensor4df(DT_I8, df, n, c, h, w); - } + INT8 *qArray = (INT8 *)qData; + apply_scale_f16(numData, array, *scale, qArray, (*scale) != scaleRaw); break; } } break; } - default:{ + default: { CHECK_STATUS(NOT_SUPPORTED); break; } } - DEBUG_info(scale[0] << " is the quantization scale"); + UNI_DEBUG_LOG("%f is the quantization scale\n", scale[0]); return SUCCESS; } diff --git a/tensor_computing/src/cpu/arm/fp16/scale.cpp b/compute/tensor/src/cpu/arm/fp16/scale.cpp similarity index 64% rename from tensor_computing/src/cpu/arm/fp16/scale.cpp rename to compute/tensor/src/cpu/arm/fp16/scale.cpp index 351bd63f..80fb60ad 100644 --- a/tensor_computing/src/cpu/arm/fp16/scale.cpp +++ b/compute/tensor/src/cpu/arm/fp16/scale.cpp @@ -1,21 +1,21 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
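The quantization rework above changes three behaviors worth calling out: an all-zero tensor now yields scale 1 and a zeroed int8 buffer instead of failing with NOT_SUPPORTED; the symmetric scale is chosen so both extremes of the range map into [-127, 127]; and apply_scale_f16 clamps only when the caller holds on to a previously fixed, larger scale (the clamp flag is (*scale) != scaleRaw). A sketch of the scale selection, with std::lround standing in for the shared round_towards_zero helper (the exact rounding policy lives in that helper):

```cpp
// Sketch of symmetric int8 scale selection as in quantize_tensor_fp16.
#include <algorithm>
#include <cmath>
#include <cstdint>

static float choose_scale_i8(float minVal, float maxVal)
{
    if (maxVal == 0 && minVal == 0) {
        return 1.0f; // degenerate all-zero tensor: keep output zeroed
    }
    if (maxVal > 0 && minVal < 0) { // both signs present: the tighter ratio wins
        return std::min(127.0f / maxVal, -127.0f / minVal);
    }
    return (maxVal < 0) ? (-127.0f / minVal) : (127.0f / maxVal);
}

static int8_t quantize_one(float x, float scale, bool clamp)
{
    float v = x * scale;
    if (clamp) { // only needed when scale was fixed by an earlier tensor
        v = std::max(-127.0f, std::min(127.0f, v));
    }
    return (int8_t)std::lround(v);
}
```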
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include #include "cpu/arm/fp16/tensor_computing_fp16.h" -EE scale_nchwc8_fp16(F16* input, F16* alpha, F16* beta, I32 in, I32 ic, I32 elements_per_channel, F16* output) +EE scale_nchwc8_fp16( + F16 *input, F16 *alpha, F16 *beta, I32 in, I32 ic, I32 elements_per_channel, F16 *output) { float16x8_t one = vdupq_n_f16(1.); float16x8_t zero = vdupq_n_f16(0.); @@ -23,11 +23,11 @@ EE scale_nchwc8_fp16(F16* input, F16* alpha, F16* beta, I32 in, I32 ic, I32 elem for (I32 n = 0; n < in; n++) { for (I32 c = 0; c < ic; c += 8) { float16x8_t alpha_vec = (alpha == nullptr) ? one : vld1q_f16(alpha + c); - float16x8_t beta_vec = (beta == nullptr) ? zero : vld1q_f16(beta + c); + float16x8_t beta_vec = (beta == nullptr) ? zero : vld1q_f16(beta + c); for (I32 i = 0; i < elements_per_channel; i++) { float16x8_t in_vec = vld1q_f16(input + index); float16x8_t out_vec = vfmaq_f16(beta_vec, alpha_vec, in_vec); - vst1q_f16(output+index, out_vec); + vst1q_f16(output + index, out_vec); index += 8; } } @@ -35,7 +35,8 @@ EE scale_nchwc8_fp16(F16* input, F16* alpha, F16* beta, I32 in, I32 ic, I32 elem return SUCCESS; } -EE scale_nchw_fp16(F16* input, F16* alpha, F16* beta, I32 in, I32 ic, I32 elements_per_channel, F16* output) +EE scale_nchw_fp16( + F16 *input, F16 *alpha, F16 *beta, I32 in, I32 ic, I32 elements_per_channel, F16 *output) { float16x8_t one = vdupq_n_f16(1.); float16x8_t zero = vdupq_n_f16(0.); @@ -43,16 +44,18 @@ EE scale_nchw_fp16(F16* input, F16* alpha, F16* beta, I32 in, I32 ic, I32 elemen for (I32 n = 0; n < in; n++) { for (I32 c = 0; c < ic; c++) { float16x8_t alpha_vec = (alpha == nullptr) ? one : vdupq_n_f16(alpha[c]); - float16x8_t beta_vec = (beta == nullptr) ? zero : vdupq_n_f16(beta[c]); + float16x8_t beta_vec = (beta == nullptr) ? zero : vdupq_n_f16(beta[c]); I32 i = 0; - for (; i < elements_per_channel-7; i += 8) { + for (; i < elements_per_channel - 7; i += 8) { float16x8_t in_vec = vld1q_f16(input + index); float16x8_t out_vec = vfmaq_f16(beta_vec, alpha_vec, in_vec); - vst1q_f16(output+index, out_vec); + vst1q_f16(output + index, out_vec); index += 8; } for (; i < elements_per_channel; i++) { - output[index] = alpha[c] * input[index] + beta[c]; + float alpha_s = (alpha == nullptr) ? 1 : alpha[c]; + float beta_s = (beta == nullptr) ? 0 : beta[c]; + output[index] = alpha_s * input[index] + beta_s; index++; } } @@ -60,7 +63,8 @@ EE scale_nchw_fp16(F16* input, F16* alpha, F16* beta, I32 in, I32 ic, I32 elemen return SUCCESS; } -EE scale_nhwc_fp16(F16* input, F16* alpha, F16* beta, I32 in, I32 ic, I32 elements_per_channel, F16* output) +EE scale_nhwc_fp16( + F16 *input, F16 *alpha, F16 *beta, I32 in, I32 ic, I32 elements_per_channel, F16 *output) { float16x8_t one = vdupq_n_f16(1.); float16x8_t zero = vdupq_n_f16(0.); @@ -68,17 +72,18 @@ EE scale_nhwc_fp16(F16* input, F16* alpha, F16* beta, I32 in, I32 ic, I32 elemen for (I32 n = 0; n < in; n++) { for (I32 i = 0; i < elements_per_channel; i++) { I32 c = 0; - for (; c < ic-7; c += 8) { - float16x8_t alpha_vec = (alpha == nullptr) ? one : vld1q_f16(alpha+c); - float16x8_t beta_vec = (beta == nullptr) ? zero : vld1q_f16(beta+c); + for (; c < ic - 7; c += 8) { + float16x8_t alpha_vec = (alpha == nullptr) ? one : vld1q_f16(alpha + c); + float16x8_t beta_vec = (beta == nullptr) ? 
zero : vld1q_f16(beta + c); float16x8_t in_vec = vld1q_f16(input + index); float16x8_t out_vec = vfmaq_f16(beta_vec, alpha_vec, in_vec); - vst1q_f16(output+index, out_vec); + vst1q_f16(output + index, out_vec); index += 8; } for (; c < ic; c++) { - F32 beta_s = (beta == nullptr) ? 0 : beta[c]; - output[index] = alpha[c] * input[index] + beta_s; + float alpha_s = (alpha == nullptr) ? 1 : alpha[c]; + float beta_s = (beta == nullptr) ? 0 : beta[c]; + output[index] = alpha_s * input[index] + beta_s; index++; } } @@ -86,12 +91,22 @@ EE scale_nhwc_fp16(F16* input, F16* alpha, F16* beta, I32 in, I32 ic, I32 elemen return SUCCESS; } -EE scale_fp16(F16* input, I32 axis, I32 nDims, F16* alpha, F16* beta, I32 in, I32 ic, I32 elements_per_channel, F16* output) +EE scale_fp16(F16 *input, + I32 axis, + I32 nDims, + F16 *alpha, + F16 *beta, + I32 in, + I32 ic, + I32 elements_per_channel, + F16 *output) { - if (nullptr == input || nullptr == output) + if (nullptr == input || nullptr == output) { CHECK_STATUS(NULL_POINTER); + } EE ret = SUCCESS; - if (axis == 1 || axis == 0) { + // If ic is 1, it means that weights/vectors have only one param, so we need use the calculation logic of nchw. + if (axis == 1 || axis == 0 || ic == 1) { ret = scale_nchw_fp16(input, alpha, beta, in, ic, elements_per_channel, output); CHECK_STATUS(ret); } else if (axis == nDims - 1) { diff --git a/tensor_computing/src/cpu/arm/fp16/softmax.cpp b/compute/tensor/src/cpu/arm/fp16/softmax.cpp similarity index 76% rename from tensor_computing/src/cpu/arm/fp16/softmax.cpp rename to compute/tensor/src/cpu/arm/fp16/softmax.cpp index df416489..4a7396ce 100644 --- a/tensor_computing/src/cpu/arm/fp16/softmax.cpp +++ b/compute/tensor/src/cpu/arm/fp16/softmax.cpp @@ -1,24 +1,23 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
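Two real fixes hide in the scale hunks above: scale_fp16 now routes ic == 1 through the NCHW path so a single-element alpha/beta broadcasts correctly, and the scalar tails no longer dereference alpha unconditionally (the removed `output[index] = alpha[c] * input[index] + beta[c];` faulted whenever alpha was null). The operation all three layout variants implement, as a scalar reference:

```cpp
// Scalar reference for the scale op: per-channel y = alpha[c] * x + beta[c],
// with null alpha/beta defaulting to 1 and 0 exactly as in the kernels.
static void scale_nchw_ref(const float *x, const float *alpha, const float *beta,
    int n, int c, int elemPerChannel, float *y)
{
    int idx = 0;
    for (int b = 0; b < n; b++) {
        for (int ch = 0; ch < c; ch++) {
            float a = (alpha == nullptr) ? 1.0f : alpha[ch];
            float s = (beta == nullptr) ? 0.0f : beta[ch];
            for (int i = 0; i < elemPerChannel; i++, idx++) {
                y[idx] = a * x[idx] + s;
            }
        }
    }
}
```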
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include #include #include "cpu/arm/fp16/tensor_computing_fp16.h" -void softmax_lastAxis_fp16(const F16* input, I32 loopOuter, I32 loops, F16 *output) +void softmax_lastAxis_fp16(const F16 *input, I32 loopOuter, I32 loops, F16 *output) { - for(I32 i = 0; i < loopOuter; i++) { + for (I32 i = 0; i < loopOuter; i++) { const F16 *inputPtr = input + i * loops; F16 *outputPtr = output + i * loops; @@ -30,7 +29,7 @@ void softmax_lastAxis_fp16(const F16* input, I32 loopOuter, I32 loops, F16 *outp I32 j = 0; F32 sum_s = 0; - for(j = 0; j < loops - 7; j += 8) { + for (j = 0; j < loops - 7; j += 8) { float16x8_t in = vld1q_f16(inputPtr + j); sub_v = vsubq_f16(in, max_v); tmp_v = vexpq_f16_f32(sub_v); @@ -38,7 +37,7 @@ void softmax_lastAxis_fp16(const F16* input, I32 loopOuter, I32 loops, F16 *outp vst1q_f16(outputPtr + j, tmp_v); } sum_s += vaddvq_f16(sum_v); - for(; j < loops; j++){ + for (; j < loops; j++) { tmp_s = exp(inputPtr[j] - max_s); outputPtr[j] = tmp_s; sum_s += tmp_s; @@ -47,34 +46,35 @@ void softmax_lastAxis_fp16(const F16* input, I32 loopOuter, I32 loops, F16 *outp } } -void softmax_anyAxis_fp16(const F16* input, I32 loopOuter, I32 loops, I32 loopInner, F16 *output) +void softmax_anyAxis_fp16(const F16 *input, I32 loopOuter, I32 loops, I32 loopInner, F16 *output) { std::vector buffer(loopInner * 2); - F16* maxBuffer = &buffer[0] ; - F16* sumBuffer = &buffer[loopInner] ; + F16 *maxBuffer = &buffer[0]; + F16 *sumBuffer = &buffer[loopInner]; I32 k = 0; - for(I32 i = 0; i < loopOuter; i++) { - const F16* inputPtrBase = input + i * loops * loopInner; - F16* outputPtrBase = output + i * loops * loopInner; + for (I32 i = 0; i < loopOuter; i++) { + const F16 *inputPtrBase = input + i * loops * loopInner; + F16 *outputPtrBase = output + i * loops * loopInner; memcpy(maxBuffer, inputPtrBase, loopInner * sizeof(F16)); memset(sumBuffer, 0, loopInner * sizeof(F16)); for (I32 j = 1; j < loops; j++) { - const F16* inputPtr = inputPtrBase + j * loopInner; - for (k = 0; k < loopInner-7; k += 8) { + const F16 *inputPtr = inputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 7; k += 8) { float16x8_t in_v = vld1q_f16(inputPtr + k); float16x8_t out_v = vld1q_f16(maxBuffer + k); float16x8_t max_v = vmaxq_f16(in_v, out_v); vst1q_f16(maxBuffer + k, max_v); } - for (; k < loopInner; k++) + for (; k < loopInner; k++) { maxBuffer[k] = UNI_MAX(maxBuffer[k], inputPtr[k]); + } } for (I32 j = 0; j < loops; j++) { - const F16* inputPtr = inputPtrBase + j * loopInner; - F16* outputPtr = outputPtrBase + j * loopInner; - for (k = 0; k < loopInner-7; k += 8) { - float16x8_t in_v = vld1q_f16(inputPtr + k); + const F16 *inputPtr = inputPtrBase + j * loopInner; + F16 *outputPtr = outputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 7; k += 8) { + float16x8_t in_v = vld1q_f16(inputPtr + k); float16x8_t max_v = vld1q_f16(maxBuffer + k); float16x8_t sub_v = vsubq_f16(in_v, max_v); float16x8_t exp_v = vexpq_f16_f32(sub_v); @@ -89,8 +89,8 @@ void softmax_anyAxis_fp16(const F16* input, I32 loopOuter, I32 loops, I32 loopIn } } for (I32 j = 0; j < loops; j++) { - F16* outputPtr = outputPtrBase + j * loopInner; - for (k = 0; k < loopInner-7; k += 8) { + F16 *outputPtr = outputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 7; k += 8) { float16x8_t 
out_v = vld1q_f16(outputPtr + k); float16x8_t sum_v = vld1q_f16(sumBuffer + k); out_v = vdivq_f16(out_v, sum_v); @@ -103,14 +103,12 @@ void softmax_anyAxis_fp16(const F16* input, I32 loopOuter, I32 loops, I32 loopIn } } - -EE softmax_fp16(TensorDesc inputDesc, const F16* input, - int axis, - TensorDesc outputDesc, F16* output) +EE softmax_fp16(TensorDesc inputDesc, const F16 *input, int axis, TensorDesc outputDesc, F16 *output) { UNUSED(outputDesc); - if(nullptr == input || nullptr == output) + if (nullptr == input || nullptr == output) { CHECK_STATUS(NULL_POINTER); + } U32 size = tensorNumElements(inputDesc); axis = (axis + inputDesc.nDims) % inputDesc.nDims; @@ -118,8 +116,9 @@ EE softmax_fp16(TensorDesc inputDesc, const F16* input, I32 loops = inputDesc.dims[axis]; I32 loopInner = 1; - for (int i = 0; i < axis; i++) + for (int i = 0; i < axis; i++) { loopInner *= inputDesc.dims[i]; + } U32 loopOuter = size / loops / loopInner; if (loopInner == 1) { diff --git a/compute/tensor/src/cpu/arm/fp16/tensor_computing_fp16.h b/compute/tensor/src/cpu/arm/fp16/tensor_computing_fp16.h new file mode 100644 index 00000000..c6129be3 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp16/tensor_computing_fp16.h @@ -0,0 +1,178 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
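Both softmax paths above are written for numerical stability: they first find the running maximum, subtract it before exponentiation (vexpq_f16_f32 evaluates exp through fp32), and only then normalize by the accumulated sum, so no intermediate overflows fp16. The scalar shape of the computation:

```cpp
// Scalar reference for the stable softmax used by both paths above.
#include <cmath>

static void softmax_row(const float *x, int len, float *y)
{
    float maxv = x[0];
    for (int i = 1; i < len; i++) {
        maxv = (x[i] > maxv) ? x[i] : maxv;
    }
    float sum = 0;
    for (int i = 0; i < len; i++) {
        y[i] = std::exp(x[i] - maxv); // exp(x - max) <= 1, safe even in fp16
        sum += y[i];
    }
    for (int i = 0; i < len; i++) {
        y[i] /= sum;
    }
}
```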
+ +#ifndef _H_TENSOR_COMPUTING_FP16 +#define _H_TENSOR_COMPUTING_FP16 +#include <vector> + +#include "sys.h" +#include "error.h" +#include "types.h" +#include "cpu/arm/fp16/arm_functions_fp16.h" + +EE convolution_transform_filter_fp16(TensorDesc filterDesc, + const F16 *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + F16 *filterTransformed); + +EE convolution_fp16(TensorDesc inputDesc, + F16 *input, + TensorDesc filterDesc, + const F16 *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc biasDesc, + const F16 *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *output, + ActivationParamSpec activationDesc, + Arch arch); + +EE deconvolution_transform_filter_fp16(TensorDesc filterDesc, + const F16 *filter, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + F16 *filterTransformed); + +EE pooling_c8_fp16(const F16 *input, + U32 stride, + int hstart, + int hend, + int wstart, + int wend, + F16 *output, + PoolingParamSpec poolingParamSpec); + +EE pooling_c8_big_fp16(const F16 *input, + U32 stride, + int hstart, + int hend, + int wstart, + int wend, + F16 *output, + int poolSize); + +EE softmax_fp16( + TensorDesc inputDesc, const F16 *input, int axis, TensorDesc outputDesc, F16 *output); + +EE attention_fp16(U32 batch, + U32 numHeads, + I32 fromSequenceLength, + I32 toSequenceLength, + const F16 *input, + F16 *output); + +EE clip_fp16(F16 *input, F16 *output, I32 len, F32 minValue, F32 maxValue); + +EE concat_fp16(std::vector<TensorDesc> inputDesc, + std::vector<F16 *> input, + F16 *inputScale, + TensorDesc outputDesc, + F16 *output, + F16 *outputScale, + U32 concatDim); + +EE depthwise_pointwise_convolution_fp16(TensorDesc inputDesc, + F16 *input, + TensorDesc dwFilterDesc, + const F16 *dwFilter, + TensorDesc pwFilterDesc, + const F16 *pwFilter, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc dwBiasDesc, + const F16 *dwBias, + TensorDesc pwBiasDesc, + const F16 *pwBias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F16 *output, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch); + +EE eltwise_fp16(std::vector<void *> input, + std::vector<U32> inputSize, + U32 num, + U32 len, + void *output, + EltwiseMode eltwiseMode); + +EE rnncell_fp16(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + void *state, + U32 tmpBytes, + void *tmp, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + void *output, + Arch arch); + +EE power_fp16(TensorDesc inputDesc, + F16 *input, + F32 scale, + F32 shift, + F32 power, + TensorDesc outputDesc, + F16 *output); + +EE layer_normalization_fp16( + TensorDesc inputDesc, F16 *input, F16 *alpha, F16 *beta, TensorDesc outputDesc, F16 *output); + +EE scale_fp16(F16 *input, + I32 axis, + I32 nDims, + F16 *alpha, + F16 *beta, + I32 in, + I32 ic, + I32 elements_per_channel, + F16 *output); + +EE softmax_fp16(TensorDesc inputDesc, const F16 *input, TensorDesc outputDesc, F16 *output); + +EE check_fp16(TensorDesc inputDescA, + const F16 *inputA, + TensorDesc inputDescB, + const F16 *inputB, + CheckMode checkMode, + TensorDesc outputDesc, + I32 *output); + +EE quantize_tensor_fp16( + TensorDesc dDesc, const void *data, TensorDesc *qDesc, void *qData, F16 *scale); + +EE attention_mask_fp16(TensorDesc inputDesc, + const F16 
*input, + AttentionMaskParamSpec p, + TensorDesc outputDesc, + F16 *output); + +EE prelu_fp16(TensorDesc inputDesc, + F16 *input, + F16 *weight, + PReLUParamSpec preluDesc, + TensorDesc outputDesc, + F16 *output); +#endif diff --git a/tensor_computing/src/cpu/arm/fp32/arm_functions_fp32.h b/compute/tensor/src/cpu/arm/fp32/arm_functions_fp32.h similarity index 67% rename from tensor_computing/src/cpu/arm/fp32/arm_functions_fp32.h rename to compute/tensor/src/cpu/arm/fp32/arm_functions_fp32.h index 23da22ec..07e9a976 100644 --- a/tensor_computing/src/cpu/arm/fp32/arm_functions_fp32.h +++ b/compute/tensor/src/cpu/arm/fp32/arm_functions_fp32.h @@ -1,63 +1,70 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
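Every helper in the renamed arm_functions_fp32.h below follows the same shape: a NEON main loop that consumes four floats per iteration (the i < len - 3; i += 4 pattern) and a scalar tail that finishes the last len % 4 elements. A minimal scalar sketch of that idiom, with illustrative names (array_sum_ref is not part of the patch), assuming nothing beyond the loop structure visible in the diff:

#include <cstdio>

// Scalar stand-in for the NEON pattern: lanes[] plays the role of the
// float32x4_t accumulator, the final horizontal add mirrors vaddvq_f32.
float array_sum_ref(const float *data, int len)
{
    if (len <= 0) {
        return 0;
    }
    float lanes[4] = {0, 0, 0, 0};
    int i = 0;
    for (; i < len - 3; i += 4) {  // whole 4-lane groups, like vld1q_f32/vaddq_f32
        for (int l = 0; l < 4; l++) {
            lanes[l] += data[i + l];
        }
    }
    float sum = lanes[0] + lanes[1] + lanes[2] + lanes[3];  // vaddvq_f32
    for (; i < len; i++) {  // scalar tail: remaining len % 4 elements
        sum += data[i];
    }
    return sum;
}

int main()
{
    float x[7] = {1, 2, 3, 4, 5, 6, 7};
    printf("%f\n", array_sum_ref(x, 7));  // prints 28.000000
    return 0;
}

The len - 3 bound keeps the vector loop from reading past the array end, which is why every helper carries a scalar tail even when len is a multiple of 4.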
- #ifndef _H_ARM_FUNCTIONS_FP32 #define _H_ARM_FUNCTIONS_FP32 -#ifdef _USE_FP32 +#include <math.h> #include "arm_neon_expand.h" -#include <math.h> -#include "tensor_computing_type.h" +#include "types.h" // array sum -inline F32 array_sum_f32(const F32 *data, I32 len) { - if(len <= 0) return 0; +inline F32 array_sum_f32(const F32 *data, I32 len) +{ + if (len <= 0) { + return 0; + } I32 i = 0; F32 sum_s = 0; float32x4_t sum_v = vdupq_n_f32(0); - for(i = 0; i < len - 3; i+=4){ + for (i = 0; i < len - 3; i += 4) { float32x4_t in = vld1q_f32(data + i); sum_v = vaddq_f32(sum_v, in); } sum_s += vaddvq_f32(sum_v); - for(; i < len; i++){ + for (; i < len; i++) { sum_s += data[i]; } return sum_s; } // array mean -inline F32 array_mean_f32(const F32 *data, I32 len) { - if(len <= 0) return 0; +inline F32 array_mean_f32(const F32 *data, I32 len) +{ + if (len <= 0) { + return 0; + } return array_sum_f32(data, len) / len; } // array var -inline F32 array_var_f32(const F32 *data, I32 len, F32 mean) { - if(len <= 0) return 0; +inline F32 array_var_f32(const F32 *data, I32 len, F32 mean) +{ + if (len <= 0) { + return 0; + } I32 i = 0; F32 sum_s = 0; float32x4_t mean_v = vdupq_n_f32(mean); - for(i = 0; i < len - 3; i+=4){ + for (i = 0; i < len - 3; i += 4) { float32x4_t in = vld1q_f32(data + i); float32x4_t tmp_v = vsubq_f32(in, mean_v); float32x4_t sum_v = vmulq_f32(tmp_v, tmp_v); sum_s += vaddvq_f32(sum_v); } - for(; i < len; i++){ + for (; i < len; i++) { F32 in = data[i]; F32 tmp = in - mean; sum_s += tmp * tmp; @@ -66,13 +73,14 @@ inline F32 array_var_f32(const F32 *data, I32 len, F32 mean) { } // array max -inline F32 array_max_f32(const F32* data, I32 len) { +inline F32 array_max_f32(const F32 *data, I32 len) +{ F32 max_s = data[0]; I32 i = 0; - if(len >= 4){ + if (len >= 4) { float32x4_t max_v, tmp_v; max_v = vld1q_f32(data); - for (i = 4; i < len - 3; i+=4) { + for (i = 4; i < len - 3; i += 4) { tmp_v = vld1q_f32(data + i); max_v = vmaxq_f32(tmp_v, max_v); } @@ -80,39 +88,76 @@ inline F32 array_max_f32(const F32* data, I32 len) { } for (; i < len; i++) { - if(data[i] > max_s) + if (data[i] > max_s) { max_s = data[i]; + } } return max_s; } -inline void array_scale_f32(F32 *input, F32 *output, I32 len, F32 alpha, F32 beta) { - I32 i = 0; +inline void array_scale_f32(const F32 *input, F32 *output, I32 len, F32 alpha, F32 beta) +{ float32x4_t alpha_v = vdupq_n_f32(alpha); - float32x4_t beta_v = vdupq_n_f32(beta); - for (i = 0; i < len-3; i+=4) { + float32x4_t beta_v = vdupq_n_f32(beta); + I32 i = 0; + for (i = 0; i < len - 3; i += 4) { float32x4_t in = vld1q_f32(input + i); float32x4_t tmp_v = vfmaq_f32(beta_v, alpha_v, in); - vst1q_f32(output+i, tmp_v); + vst1q_f32(output + i, tmp_v); } for (; i < len; i++) { output[i] = alpha * input[i] + beta; } } -inline EE activation_fp32(F32* input, U32 len, ActivationDesc activationDesc, F32* output) +inline void array_power_f32(F32 *input, F32 *output, I32 len, F32 power) +{ + I32 i = 0; + if (power == -1) { + float32x4_t one_v = vdupq_n_f32(1); + for (i = 0; i < len - 3; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t tmp_v = vdivq_f32(one_v, in); + vst1q_f32(output + i, tmp_v); + } + } else if (power == 0.5) { +#ifdef __aarch64__ + for (i = 0; i < len - 3; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t tmp_v = vsqrtq_f32(in); + vst1q_f32(output + i, tmp_v); + } +#endif + } else if (power == 1) { + if (input != output) { + memcpy(output, input, len * sizeof(F32)); + } + i = len; + } else if (power == 2) { + for (i = 0; i < len - 3; i += 4) { +
float32x4_t in = vld1q_f32(input + i); + float32x4_t tmp_v = vmulq_f32(in, in); + vst1q_f32(output + i, tmp_v); + } + } + for (; i < len; i++) { + output[i] = powf(input[i], power); + } +} + +inline EE activation_fp32(F32 *input, U32 len, ActivationParamSpec activationDesc, F32 *output) { float32x4_t in, out; - float32x4_t zero = vdupq_n_f32(0.); - float32x4_t one = vdupq_n_f32(1.); + float32x4_t zero = vdupq_n_f32(0.); + float32x4_t one = vdupq_n_f32(1.); float32x4_t three = vdupq_n_f32(3.); - float32x4_t six = vdupq_n_f32(6.); + float32x4_t six = vdupq_n_f32(6.); U32 len_main = len / 4; U32 len_tail = len % 4; F32 value; - switch (activationDesc.mode){ + switch (activationDesc.mode) { case ACTIVATION_NULL: { break; } @@ -207,9 +252,9 @@ inline EE activation_fp32(F32* input, U32 len, ActivationDesc activationDesc, F3 } case ACTIVATION_GELU: { F32 two_div_PI_sqrt = sqrt(2 / 3.14159265358979323846); - float32x4_t vec0 = vdupq_n_f32(two_div_PI_sqrt); - float32x4_t vec1 = vdupq_n_f32(0.044715); - float32x4_t vec2 = vdupq_n_f32(0.5); + float32x4_t vec0 = vdupq_n_f32(two_div_PI_sqrt); + float32x4_t vec1 = vdupq_n_f32(0.044715); + float32x4_t vec2 = vdupq_n_f32(0.5); for (U32 i = 0; i < len_main; i++) { in = vld1q_f32(input); out = vmulq_f32(in, in); @@ -226,7 +271,7 @@ inline EE activation_fp32(F32* input, U32 len, ActivationDesc activationDesc, F3 } for (U32 i = 0; i < len_tail; i++) { value = input[i]; - value = two_div_PI_sqrt * (value + 0.044715 * pow(value, 3)); + value = two_div_PI_sqrt * (value + 0.044715 * powf(value, 3)); value = 1.0 - 2.0 / (exp(2.0 * value) + 1.0); value = 0.5 * (1.0 + value); value = input[i] * value; @@ -262,6 +307,27 @@ inline EE activation_fp32(F32* input, U32 len, ActivationDesc activationDesc, F3 } break; } + case ACTIVATION_MISH: { + for (U32 i = 0; i < len_main; i++) { + in = vld1q_f32(input); + out = vmulq_f32( + in, vtanhq_f32(vlogq_f32(vaddq_f32(vexpq_f32_03_percent_error(in), one)))); + vst1q_f32(output, out); + input += 4; + output += 4; + } + for (U32 i = 0; i < len_tail; i++) { + value = input[i] * tanh(log(exp(input[i]) + 1.0)); + output[i] = value; + } + break; + } + case ACTIVATION_GREATER: { + for (U32 i = 0; i < len; i++) { + output[i] = input[i] > 1 ? 
1 : 0; + } + break; + } default: return NOT_SUPPORTED; } @@ -269,20 +335,35 @@ inline EE activation_fp32(F32* input, U32 len, ActivationDesc activationDesc, F3 return SUCCESS; } -inline void array_add_f32(const F32* inputA, const F32* inputB, F32* output, I32 len) +inline void array_add_f32(const F32 *inputA, const F32 *inputB, F32 *output, I32 len) { I32 i = 0; - for(i = 0; i < len - 3; i+=4){ + for (i = 0; i < len - 3; i += 4) { float32x4_t a = vld1q_f32(inputA + i); float32x4_t b = vld1q_f32(inputB + i); float32x4_t c = vaddq_f32(a, b); - vst1q_f32(output+i, c); + vst1q_f32(output + i, c); } - for ( ; i < len; i++) { + for (; i < len; i++) { output[i] = inputA[i] + inputB[i]; } } -#endif +inline void array_square_and_add_f32(const F32 *inputA, const F32 *inputB, F32 *output, I32 len) +{ + I32 i = 0; + for (i = 0; i < len - 3; i += 4) { + float32x4_t a = vld1q_f32(inputA + i); + float32x4_t b = vld1q_f32(inputB + i); + b = vmulq_f32(b, b); + float32x4_t c = vaddq_f32(a, b); + vst1q_f32(output + i, c); + } + + for (; i < len; i++) { + output[i] = inputA[i] + inputB[i] * inputB[i]; + } +} + #endif diff --git a/tensor_computing/src/cpu/arm/fp32/attention.cpp b/compute/tensor/src/cpu/arm/fp32/attention.cpp similarity index 65% rename from tensor_computing/src/cpu/arm/fp32/attention.cpp rename to compute/tensor/src/cpu/arm/fp32/attention.cpp index 15ee1724..6861cae6 100644 --- a/tensor_computing/src/cpu/arm/fp32/attention.cpp +++ b/compute/tensor/src/cpu/arm/fp32/attention.cpp @@ -1,68 +1,74 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
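The scalar GELU tail in activation_fp32 above evaluates tanh through the identity tanh(v) = 1 - 2 / (exp(2 v) + 1), matching the vectorized exp-based path. A minimal stand-alone check of that identity (gelu_ref is an illustrative name, not part of the patch):

#include <cmath>
#include <cstdio>

// GELU tanh approximation, written exactly as the scalar tail computes it.
float gelu_ref(float x)
{
    const float c = sqrtf(2.0f / 3.14159265358979323846f);
    float v = c * (x + 0.044715f * x * x * x);
    float t = 1.0f - 2.0f / (expf(2.0f * v) + 1.0f);  // equals tanhf(v)
    return 0.5f * x * (1.0f + t);
}

int main()
{
    float xs[5] = {-2.0f, -0.5f, 0.0f, 0.5f, 2.0f};
    for (int i = 0; i < 5; i++) {
        float x = xs[i];
        float direct = 0.5f * x *
            (1.0f + tanhf(sqrtf(2.0f / 3.14159265358979323846f) * (x + 0.044715f * x * x * x)));
        printf("x=% .1f  exp-form=% .6f  tanh-form=% .6f\n", x, gelu_ref(x), direct);
    }
    return 0;
}

Both columns agree to float precision, which is why the kernel can reuse its exp approximation (vexpq_f32_03_percent_error) instead of carrying a separate tanh routine.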
- #include <string.h> #include "cpu/arm/fp32/tensor_computing_fp32.h" -EE attention_fp32(U32 batch, U32 numHeads, I32 fromSequenceLength, I32 toSequenceLength, const F32 *input, F32 *output) +EE attention_fp32(U32 batch, + U32 numHeads, + I32 fromSequenceLength, + I32 toSequenceLength, + const F32 *input, + F32 *output) { - if (nullptr == input || nullptr == output) + if (nullptr == input || nullptr == output) { CHECK_STATUS(NULL_POINTER); + } F32 mask_s = -10000.0; I32 count = array_sum_f32(input, toSequenceLength); I32 valid = UNI_MIN(count, fromSequenceLength); float32x4_t mask_v = vdupq_n_f32(mask_s); float32x4_t one_v = vdupq_n_f32(1.0); - for(U32 n = 0; n < batch; n++){ + for (U32 n = 0; n < batch; n++) { for (U32 i = 0; i < numHeads; i++) { if (i == 0) { for (I32 j = 0; j < valid; j++) { if (j == 0) { I32 k = 0; - for (; k < toSequenceLength-3; k+=4) { + for (; k < toSequenceLength - 3; k += 4) { float32x4_t in_v = vld1q_f32(input + k); float32x4_t tmp_v = vsubq_f32(one_v, in_v); tmp_v = vmulq_f32(tmp_v, mask_v); - vst1q_f32(output+k, tmp_v); + vst1q_f32(output + k, tmp_v); } for (; k < toSequenceLength; k++) { F32 value = (1 - input[k]) * mask_s; output[k] = value; } - } - else { - memcpy(output+j*toSequenceLength, output, toSequenceLength*sizeof(F32)); + } else { + memcpy( + output + j * toSequenceLength, output, toSequenceLength * sizeof(F32)); } } for (I32 j = valid; j < fromSequenceLength; j++) { if (j == valid) { I32 k = 0; - for (; k < toSequenceLength-3; k+=4) { - vst1q_f32(output+j*toSequenceLength+k, mask_v); + for (; k < toSequenceLength - 3; k += 4) { + vst1q_f32(output + j * toSequenceLength + k, mask_v); } for (; k < toSequenceLength; k++) { - output[j*toSequenceLength+k] = mask_s; + output[j * toSequenceLength + k] = mask_s; } - } - else { - memcpy(output+j*toSequenceLength, output+valid*toSequenceLength, toSequenceLength*sizeof(F32)); + } else { + memcpy(output + j * toSequenceLength, output + valid * toSequenceLength, + toSequenceLength * sizeof(F32)); } } } else { - memcpy(output+i*fromSequenceLength*toSequenceLength, output, fromSequenceLength*toSequenceLength*sizeof(F32)); + memcpy(output + i * fromSequenceLength * toSequenceLength, output, + fromSequenceLength * toSequenceLength * sizeof(F32)); } } diff --git a/tensor_computing/src/cpu/arm/fp32/attention_mask.cpp b/compute/tensor/src/cpu/arm/fp32/attention_mask.cpp similarity index 78% rename from tensor_computing/src/cpu/arm/fp32/attention_mask.cpp rename to compute/tensor/src/cpu/arm/fp32/attention_mask.cpp index 94ee7db0..3a34c6dc 100644 --- a/tensor_computing/src/cpu/arm/fp32/attention_mask.cpp +++ b/compute/tensor/src/cpu/arm/fp32/attention_mask.cpp @@ -1,27 +1,32 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include <string.h> #include "cpu/arm/fp32/tensor_computing_fp32.h" -EE attention_mask_fp32(TensorDesc inputDesc, const F32* input, - I32 attentionLength, bool sameLength, float maskValue, - TensorDesc outputDesc, F32* output) +EE attention_mask_fp32(TensorDesc inputDesc, + const F32 *input, + AttentionMaskParamSpec p, + TensorDesc outputDesc, + F32 *output) { UNUSED(outputDesc); - if (nullptr == input || nullptr == output) + if (nullptr == input || nullptr == output) { CHECK_STATUS(NULL_POINTER); + } + I32 attentionLength = p.attention_length; + bool sameLength = p.same_length; + float maskValue = p.mask; int qlen = inputDesc.dims[1]; int klen = inputDesc.dims[0]; int mlen = klen - qlen; @@ -48,9 +53,10 @@ EE attention_mask_fp32(TensorDesc inputDesc, const F32* input, } loops = UNI_MAX(loops, 0); start = UNI_MIN(start, klen); - if (start + loops > klen) + if (start + loops > klen) { loops = UNI_MAX(klen - start, 0); - memset(&mask[i*klen+start], 0, sizeof(F32)*loops); + } + memset(&mask[i * klen + start], 0, sizeof(F32) * loops); } } I32 loops = tensorNumElements(inputDesc) / length; @@ -58,13 +64,13 @@ EE attention_mask_fp32(TensorDesc inputDesc, const F32* input, float32x4_t mask_value_v = vdupq_n_f32(maskValue); for (int i = 0, index = 0; i < loops; i++) { int j = 0; - for (; j < length-3; j+=4) { - float32x4_t in = vld1q_f32(input+index); + for (; j < length - 3; j += 4) { + float32x4_t in = vld1q_f32(input + index); float32x4_t mask_v = vld1q_f32(&mask[j]); float32x4_t tmp_v = vsubq_f32(one_v, mask_v); tmp_v = vmulq_f32(in, tmp_v); tmp_v = vfmsq_f32(tmp_v, mask_value_v, mask_v); - vst1q_f32(output+index, tmp_v); + vst1q_f32(output + index, tmp_v); index += 4; } for (; j < length; j++) { diff --git a/tensor_computing/src/cpu/arm/fp32/check.cpp b/compute/tensor/src/cpu/arm/fp32/check.cpp
similarity index 72% rename from tensor_computing/src/cpu/arm/fp32/check.cpp rename to compute/tensor/src/cpu/arm/fp32/check.cpp index 4501f17f..1e6894c7 100644 --- a/tensor_computing/src/cpu/arm/fp32/check.cpp +++ b/compute/tensor/src/cpu/arm/fp32/check.cpp @@ -1,35 +1,40 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
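The check_fp32 kernel that follows reduces an element-wise comparison to one flag per outer slice: output[j] is 1 only when the predicate holds for every element of the slice. A scalar reference with illustrative names (check_great_ref is not part of the patch), assuming only the count == length reduction visible in the code below:

#include <cstdint>
#include <cstdio>

// Scalar equivalent of the CHECK_GREAT case: count matching elements,
// then test whether everything matched. In the NEON path, vcgtq_f32
// produces an all-ones lane per hit and the lanes are reduced afterwards.
int32_t check_great_ref(const float *a, const float *b, int32_t length)
{
    int32_t count = 0;
    for (int32_t i = 0; i < length; i++) {
        if (a[i] > b[i]) {
            count++;
        }
    }
    return count == length ? 1 : 0;  // 1 iff the predicate holds everywhere
}

int main()
{
    float a[4] = {2, 3, 4, 5};
    float b[4] = {1, 2, 3, 4};
    printf("%d\n", check_great_ref(a, b, 4));  // prints 1
    return 0;
}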
- #include "cpu/arm/fp32/tensor_computing_fp32.h" -EE check_fp32(TensorDesc inputDescA, const F32* inputA, - TensorDesc inputDescB, const F32* inputB, +EE check_fp32(TensorDesc inputDescA, + const F32 *inputA, + TensorDesc inputDescB, + const F32 *inputB, CheckMode checkMode, - TensorDesc outputDesc, I32* output) + TensorDesc outputDesc, + I32 *output) { - if (nullptr == inputA || nullptr == inputB || nullptr == output) + if (nullptr == inputA || nullptr == inputB || nullptr == output) { CHECK_STATUS(NULL_POINTER); + } - if (tensorNumElements(inputDescA) != tensorNumElements(inputDescB)) + if (tensorNumElements(inputDescA) != tensorNumElements(inputDescB)) { CHECK_STATUS(NOT_MATCH); + } U32 size = tensorNumElements(inputDescA); - U32 loopOuter = inputDescA.dims[inputDescA.nDims-1]; + U32 loopOuter = inputDescA.dims[inputDescA.nDims - 1]; I32 length = size / loopOuter; - if (tensorNumElements(outputDesc) != loopOuter) + if (tensorNumElements(outputDesc) != loopOuter) { CHECK_STATUS(NOT_MATCH); + } for (U32 j = 0; j < loopOuter; j++) { const F32 *arrayA = inputA + j * length; const F32 *arrayB = inputB + j * length; @@ -37,45 +42,51 @@ EE check_fp32(TensorDesc inputDescA, const F32* inputA, case CHECK_GREAT: { uint32x4_t count_v = vdupq_n_u32(0); I32 i = 0; - for (; i < length-3; i+=4) { + for (; i < length - 3; i += 4) { float32x4_t a = vld1q_f32(arrayA + i); float32x4_t b = vld1q_f32(arrayA + i); count_v = vaddq_u32(count_v, vcgtq_f32(a, b)); } I32 count = vaddvq_u32(count_v); - for (; i < length; i++) - if (arrayA[i] > arrayB[i]) - count ++; + for (; i < length; i++) { + if (arrayA[i] > arrayB[i]) { + count++; + } + } output[j] = (count == length); break; } case CHECK_GREATEQUAL: { uint32x4_t count_v = vdupq_n_u32(0); I32 i = 0; - for (; i < length-3; i+=4) { + for (; i < length - 3; i += 4) { float32x4_t a = vld1q_f32(arrayA + i); float32x4_t b = vld1q_f32(arrayA + i); count_v = vaddq_u32(count_v, vcgeq_f32(a, b)); } I32 count = vaddvq_u32(count_v); - for (; i < length; i++) - if (arrayA[i] >= arrayB[i]) - count ++; + for (; i < length; i++) { + if (arrayA[i] >= arrayB[i]) { + count++; + } + } output[j] = (count == length); break; } case CHECK_EQUAL: { uint32x4_t count_v = vdupq_n_u32(0); I32 i = 0; - for (; i < length-3; i+=4) { + for (; i < length - 3; i += 4) { float32x4_t a = vld1q_f32(arrayA + i); float32x4_t b = vld1q_f32(arrayA + i); count_v = vaddq_u32(count_v, vceqq_f32(a, b)); } I32 count = vaddvq_u32(count_v); - for (; i < length; i++) - if (arrayA[i] == arrayB[i]) - count ++; + for (; i < length; i++) { + if (arrayA[i] == arrayB[i]) { + count++; + } + } output[j] = (count == length); break; } diff --git a/tensor_computing/src/cpu/arm/fp32/clip.cpp b/compute/tensor/src/cpu/arm/fp32/clip.cpp similarity index 83% rename from tensor_computing/src/cpu/arm/fp32/clip.cpp rename to compute/tensor/src/cpu/arm/fp32/clip.cpp index 64b3b482..a0b591be 100644 --- a/tensor_computing/src/cpu/arm/fp32/clip.cpp +++ b/compute/tensor/src/cpu/arm/fp32/clip.cpp @@ -1,33 +1,32 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include "cpu/arm/fp32/tensor_computing_fp32.h" EE clip_fp32(F32 *input, F32 *output, I32 len, F32 minValue, F32 maxValue) { - if (nullptr == input - || nullptr == output) + if (nullptr == input || nullptr == output) { CHECK_STATUS(NULL_POINTER); + } float32x4_t min_v = vdupq_n_f32(minValue); - float32x4_t max_v = vdupq_n_f32(maxValue); + float32x4_t max_v = vdupq_n_f32(maxValue); I32 i = 0; for (i = 0; i < len - 3; i += 4) { float32x4_t in = vld1q_f32(input + i); float32x4_t tmp_v = vminq_f32(max_v, vmaxq_f32(min_v, in)); - vst1q_f32(output+i, tmp_v); + vst1q_f32(output + i, tmp_v); } for (; i < len; i++) { F32 value = input[i]; diff --git a/compute/tensor/src/cpu/arm/fp32/convolution.cpp b/compute/tensor/src/cpu/arm/fp32/convolution.cpp new file mode 100644 index 00000000..42f878c1 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/convolution.cpp @@ -0,0 +1,93 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" +#include "cpu/arm/fp32/tensor_computing_fp32.h" + +EE convolution_fp32(TensorDesc inputDesc, + F32 *input, + TensorDesc filterDesc, + const F32 *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc biasDesc, + const F32 *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *output, + ActivationParamSpec activationDesc, + Arch arch) +{ + UNUSED(arch); + if (nullptr == input || nullptr == filter || nullptr == output || nullptr == bias || + nullptr == tmp) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (!(idt == DT_F32 && fdt == DT_F32 && odt == DT_F32)) { + CHECK_STATUS(NOT_MATCH); + } + if (!(odf == DF_NCHWC8)) { + CHECK_STATUS(NOT_MATCH); + } + if (!(ic == fc && oc == fn)) { + CHECK_STATUS(NOT_MATCH); + } + + // In some cases when we adjust the model input, the input tensor of conv can change from NCHW to NCHWc8 + // In this case we can simply change the algo, because they both require the same filter transform + if (CONVOLUTION_ALGORITHM_GEMM_ICNCHW == algorithm && DF_NCHWC8 == idf) { + algorithm = CONVOLUTION_ALGORITHM_GEMM; + } + + EE ret = SUCCESS; + switch (algorithm) { + case CONVOLUTION_ALGORITHM_GEMM: +#ifdef __aarch64__ + ret = convolution_gemm_V8(inputDesc, input, filterDesc, filter, convParamSpec, biasDesc, + bias, tmpBytes, tmp, outputDesc, output, activationDesc); +#else + ret = convolution_gemm_V7(inputDesc, input, filterDesc, filter, convParamSpec, biasDesc, + bias, tmpBytes, tmp, outputDesc, output, activationDesc); +#endif + break; + case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: +#ifdef __aarch64__ + ret = convolution_gemm_icnchw_V8(inputDesc, input, filterDesc, filter, convParamSpec, + biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc); +#else + ret = convolution_gemm_icnchw_V7(inputDesc, input, filterDesc, filter, convParamSpec, + biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc); +#endif + break; + case CONVOLUTION_ALGORITHM_WINOGRAD: + ret = convolution_winograd_V8(inputDesc, input, filterDesc, filter, convParamSpec, + biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/fp32/convolution_gemm_V7.cpp b/compute/tensor/src/cpu/arm/fp32/convolution_gemm_V7.cpp new file mode 100644 index 00000000..392d5180 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/convolution_gemm_V7.cpp @@ -0,0 +1,677 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
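The convolution_fp32 dispatcher above quietly retargets GEMM_ICNCHW to plain GEMM when the input arrives as NCHWc8, since both algorithms consume the same transformed filter. A sketch of that dispatch rule with illustrative enums (the real code uses ConvolutionForwardAlgorithm and DataFormat):

// Illustrative names only; this mirrors the algorithm-adjustment branch above.
enum Algo { GEMM, GEMM_ICNCHW, WINOGRAD };
enum Layout { NCHW, NCHWC8 };

Algo pick_algo(Algo chosen, Layout inputLayout)
{
    // Both GEMM variants share one transformed filter layout,
    // so swapping here needs no filter re-transform.
    if (chosen == GEMM_ICNCHW && inputLayout == NCHWC8) {
        return GEMM;
    }
    return chosen;
}

The point of the rule is that algorithm selection happens once, at filter-transform time, while the input layout can still change later when the model input is adjusted; the cheap swap keeps the two decisions consistent.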
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef __aarch64__ +#include "cpu/arm/fp32/tensor_computing_fp32.h" +#include <string.h> +#ifdef _USE_OPENMP +#include <omp.h> +#endif + +EE convolution_gemm_V7(TensorDesc inputDesc, + F32 *inArray, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + if (fdf != DF_NHWCN8) { + CHECK_STATUS(NOT_MATCH); + } + + oc /= 8; + ic /= 8; + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + I32 ohow = oh * ow; + U32 ihiw = ih_pad * iw_pad; + F32 *inArray_pad; + EE ret = SUCCESS; + for (U32 n = 0; n < in; n++) { + if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { + inArray_pad = inArray + n * ic * ih * iw * 8; + } else { + // copy input into a padded buffer + inArray_pad = (F32 *)tmp; + F32 *inArray_pad_mov = inArray_pad; + F32 *inArray_mov = inArray + n * ic * ih * iw * 8; + for (U32 c = 0; c < ic; c++) { + for (U32 h = 0; h < paddingT; h++) { + memset(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += iw_pad * 8; + } + for (U32 h = paddingT; h < ih_pad - paddingB; h++) { + memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(idt)); + inArray_pad_mov += paddingL * 8; + memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); + inArray_pad_mov += iw * 8; + inArray_mov += iw * 8; + memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(idt)); + inArray_pad_mov += paddingR * 8; + } + for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { + memset(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += iw_pad * 8; + } + } + } + + // ohow / 6 +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (I32 hw = 0; hw < ohow - 5; hw += 6) { + const F32 *b0 = biasArray; + const F32 *b1 = biasArray + 4; +#ifdef _USE_OPENMP + //
For NDK on ARMv7, OpenMP loop cannot reference more than 14 outside variables + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 fh = filterDesc.dims[1]; + U32 fw = filterDesc.dims[0]; + U32 thread_private_buffer_offset = 6 * fh * fw * ic * 8 * omp_get_thread_num(); +#else + U32 thread_private_buffer_offset = 0; +#endif + F32 *in_pack = ((F32 *)tmp) + ic * ihiw * 8 + thread_private_buffer_offset; + // pack input + // NCHWc8 => NHWChw6 + im2col + U32 in_h[6] = {0}; + U32 in_w[6] = {0}; + for (U32 i = 0; i < 6; i++) { + in_h[i] = ((hw + i) / ow) * convParamSpec.stride_h; + in_w[i] = ((hw + i) % ow) * convParamSpec.stride_w; + } + + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F32 *in_hw6c8 = inArray_pad + c * ihiw * 8 + + fh_idx * convParamSpec.dilatedRate_h * iw_pad * 8 + + fw_idx * convParamSpec.dilatedRate_w * 8; + F32 *in_0 = in_hw6c8 + in_h[0] * iw_pad * 8 + in_w[0] * 8; + F32 *in_1 = in_hw6c8 + in_h[1] * iw_pad * 8 + in_w[1] * 8; + F32 *in_2 = in_hw6c8 + in_h[2] * iw_pad * 8 + in_w[2] * 8; + F32 *in_3 = in_hw6c8 + in_h[3] * iw_pad * 8 + in_w[3] * 8; + F32 *in_4 = in_hw6c8 + in_h[4] * iw_pad * 8 + in_w[4] * 8; + F32 *in_5 = in_hw6c8 + in_h[5] * iw_pad * 8 + in_w[5] * 8; + + // NHWChw6 + F32 *in_pack_c8hw6 = + in_pack + fh_idx * fw * ic * 6 * 8 + fw_idx * ic * 6 * 8 + c * 6 * 8; + + __asm__ __volatile__("vld1.f32 {d0-d3}, [%[in_0]]\n" + "vld1.f32 {d4-d7}, [%[in_1]]\n" + "vld1.f32 {d8-d11}, [%[in_2]]\n" + "vld1.f32 {d12-d15}, [%[in_3]]\n" + "vld1.f32 {d16-d19}, [%[in_4]]\n" + "vld1.f32 {d20-d23}, [%[in_5]]\n" + + "vzip.32 q0, q2\n" + "vzip.32 q4, q6\n" + "vzip.32 q8, q10\n" + + "vst1.f32 {d0}, [%[pack]]!\n" + "vst1.f32 {d8}, [%[pack]]!\n" + "vst1.f32 {d16}, [%[pack]]!\n" + "vst1.f32 {d1}, [%[pack]]!\n" + "vst1.f32 {d9}, [%[pack]]!\n" + "vst1.f32 {d17}, [%[pack]]!\n" + "vst1.f32 {d4}, [%[pack]]!\n" + "vst1.f32 {d12}, [%[pack]]!\n" + "vst1.f32 {d20}, [%[pack]]!\n" + "vst1.f32 {d5}, [%[pack]]!\n" + "vst1.f32 {d13}, [%[pack]]!\n" + "vst1.f32 {d21}, [%[pack]]!\n" + + "vzip.32 q1, q3\n" + "vzip.32 q5, q7\n" + "vzip.32 q9, q11\n" + + "vst1.f32 {d2}, [%[pack]]!\n" + "vst1.f32 {d10}, [%[pack]]!\n" + "vst1.f32 {d18}, [%[pack]]!\n" + "vst1.f32 {d3}, [%[pack]]!\n" + "vst1.f32 {d11}, [%[pack]]!\n" + "vst1.f32 {d19}, [%[pack]]!\n" + "vst1.f32 {d6}, [%[pack]]!\n" + "vst1.f32 {d14}, [%[pack]]!\n" + "vst1.f32 {d22}, [%[pack]]!\n" + "vst1.f32 {d7}, [%[pack]]!\n" + "vst1.f32 {d15}, [%[pack]]!\n" + "vst1.f32 {d23}, [%[pack]]!\n" + : [pack] "+r"(in_pack_c8hw6), [in_0] "+r"(in_0), + [in_1] "+r"(in_1), [in_2] "+r"(in_2), + [in_3] "+r"(in_3), [in_4] "+r"(in_4), [in_5] "+r"(in_5) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11"); + } + } + } + + // compute + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + const F32 *f_o0c0 = filterArray + o * 8 * fh * fw * ic * 8; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0 + o * 8; + const F32 *b_o1 = b1 + o * 8; + __asm__ __volatile__( + "vld1.f32 {d8-d9}, [%[b_0]]\n" + "vld1.f32 {d10-d11}, [%[b_1]]\n" + "vld1.f32 {d0-d3}, [%[in_0]]!\n" + "vld1.f32 {d4-d7}, [%[f_0]]!\n" + + "vmov.f32 q6, q4\n" + "vmov.f32 q8, q4\n" + "vmov.f32 q10, q4\n" + "vmov.f32 q12, q4\n" + "vmov.f32 q14, q4\n" + + "mov r2, %[ic]\n" + + "vmov.f32 q7, q5\n" + 
"vmov.f32 q9, q5\n" + "vmov.f32 q11, q5\n" + "vmov.f32 q13, q5\n" + "vmov.f32 q15, q5\n" + + "0:\n" + "vmla.f32 q4, q2, d0[0]\n" + "vmla.f32 q6, q2, d0[1]\n" + "vmla.f32 q8, q2, d1[0]\n" + "vmla.f32 q10, q2, d1[1]\n" + "vmla.f32 q12, q2, d2[0]\n" + "vmla.f32 q14, q2, d2[1]\n" + + "vld1.f32 {d4-d5}, [%[f_0]]!\n" + + "vmla.f32 q5, q3, d0[0]\n" + "vmla.f32 q7, q3, d0[1]\n" + "vmla.f32 q9, q3, d1[0]\n" + "vmla.f32 q11, q3, d1[1]\n" + "vld1.f32 {d0-d1}, [%[in_0]]!\n" + "vmla.f32 q13, q3, d2[0]\n" + "vmla.f32 q15, q3, d2[1]\n" + + "vld1.f32 {d6-d7}, [%[f_0]]!\n" + "subs r2, r2, #4\n" + + "vmla.f32 q4, q2, d3[0]\n" + "vmla.f32 q6, q2, d3[1]\n" + "vmla.f32 q8, q2, d0[0]\n" + "vmla.f32 q10, q2, d0[1]\n" + "vmla.f32 q12, q2, d1[0]\n" + "vmla.f32 q14, q2, d1[1]\n" + + "vld1.f32 {d4-d5}, [%[f_0]]!\n" + + "vmla.f32 q5, q3, d3[0]\n" + "vmla.f32 q7, q3, d3[1]\n" + "vld1.f32 {d2-d3}, [%[in_0]]!\n" + "vmla.f32 q9, q3, d0[0]\n" + "vmla.f32 q11, q3, d0[1]\n" + "vmla.f32 q13, q3, d1[0]\n" + "vmla.f32 q15, q3, d1[1]\n" + + "vld1.f32 {d6-d7}, [%[f_0]]!\n" + "vld1.f32 {d0-d1}, [%[in_0]]!\n" + + "vmla.f32 q4, q2, d2[0]\n" + "vmla.f32 q6, q2, d2[1]\n" + "vmla.f32 q8, q2, d3[0]\n" + "vmla.f32 q10, q2, d3[1]\n" + "vmla.f32 q12, q2, d0[0]\n" + "vmla.f32 q14, q2, d0[1]\n" + + "vld1.f32 {d4-d5}, [%[f_0]]!\n" + + "vmla.f32 q5, q3, d2[0]\n" + "vmla.f32 q7, q3, d2[1]\n" + "vmla.f32 q9, q3, d3[0]\n" + "vmla.f32 q11, q3, d3[1]\n" + "vld1.f32 {d2-d3}, [%[in_0]]!\n" + "vmla.f32 q13, q3, d0[0]\n" + "vmla.f32 q15, q3, d0[1]\n" + + "vld1.f32 {d6-d7}, [%[f_0]]!\n" + + "vmla.f32 q4, q2, d1[0]\n" + "vmla.f32 q6, q2, d1[1]\n" + "vmla.f32 q8, q2, d2[0]\n" + "vmla.f32 q10, q2, d2[1]\n" + "vmla.f32 q12, q2, d3[0]\n" + "vmla.f32 q14, q2, d3[1]\n" + + "vld1.f32 {d4-d5}, [%[f_0]]!\n" + + "vmla.f32 q5, q3, d1[0]\n" + "vmla.f32 q7, q3, d1[1]\n" + "vld1.f32 {d0-d1}, [%[in_0]]!\n" + "vmla.f32 q9, q3, d2[0]\n" + "vmla.f32 q11, q3, d2[1]\n" + "vmla.f32 q13, q3, d3[0]\n" + "vmla.f32 q15, q3, d3[1]\n" + + "vld1.f32 {d2-d3}, [%[in_0]]!\n" + "vld1.f32 {d6-d7}, [%[f_0]]!\n" + "bne 0b\n" + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15", "r2"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("veor q1, q1, q1\n" // zero + "vmax.f32 q4, q4, q1\n" + "vmax.f32 q5, q5, q1\n" + "vmax.f32 q6, q6, q1\n" + "vmax.f32 q7, q7, q1\n" + "vmax.f32 q8, q8, q1\n" + "vmax.f32 q9, q9, q1\n" + "vmax.f32 q10, q10, q1\n" + "vmax.f32 q11, q11, q1\n" + "vmax.f32 q12, q12, q1\n" + "vmax.f32 q13, q13, q1\n" + "vmax.f32 q14, q14, q1\n" + "vmax.f32 q15, q15, q1\n" + : + : + : "memory", "cc", "q1", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("veor q1, q1, q1\n" // zero + "vmov.f32 q2, #6.0\n" // six + "vmax.f32 q4, q4, q1\n" + "vmax.f32 q5, q5, q1\n" + "vmax.f32 q6, q6, q1\n" + "vmax.f32 q7, q7, q1\n" + "vmax.f32 q8, q8, q1\n" + "vmax.f32 q9, q9, q1\n" + "vmax.f32 q10, q10, q1\n" + "vmax.f32 q11, q11, q1\n" + "vmax.f32 q12, q12, q1\n" + "vmax.f32 q13, q13, q1\n" + "vmax.f32 q14, q14, q1\n" + "vmax.f32 q15, q15, q1\n" + "vmin.f32 q4, q4, q2\n" + "vmin.f32 q5, q5, q2\n" + "vmin.f32 q6, q6, q2\n" + "vmin.f32 q7, q7, q2\n" + "vmin.f32 q8, q8, q2\n" + "vmin.f32 q9, q9, q2\n" + "vmin.f32 q10, q10, q2\n" + 
"vmin.f32 q11, q11, q2\n" + "vmin.f32 q12, q12, q2\n" + "vmin.f32 q13, q13, q2\n" + "vmin.f32 q14, q14, q2\n" + "vmin.f32 q15, q15, q2\n" + : + : + : "memory", "cc", "q1", "q2", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("vst1.f32 {q4}, [%[out_0]]!\n" + "vst1.f32 {q5}, [%[out_0]]!\n" + "vst1.f32 {q6}, [%[out_0]]!\n" + "vst1.f32 {q7}, [%[out_0]]!\n" + "vst1.f32 {q8}, [%[out_0]]!\n" + "vst1.f32 {q9}, [%[out_0]]!\n" + "vst1.f32 {q10}, [%[out_0]]!\n" + "vst1.f32 {q11}, [%[out_0]]!\n" + "vst1.f32 {q12}, [%[out_0]]!\n" + "vst1.f32 {q13}, [%[out_0]]!\n" + "vst1.f32 {q14}, [%[out_0]]!\n" + "vst1.f32 {q15}, [%[out_0]]!\n" + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "q4", "q5", "q6", "q7", "q8", "q9", "q10", + "q11", "q12", "q13", "q14", "q15"); + } + } + + U32 ohow_s = (ohow / 6) * 6; + U32 ohow_tail = ohow - ohow_s; + + if (ohow_tail >= 4) { + I32 hw = ohow_s; + const F32 *b0 = biasArray; + const F32 *b1 = biasArray + 4; + F32 *in_pack = ((F32 *)tmp) + ic * ih_pad * iw_pad * 8; + // pack input + // NCHWc8 => NHWChw4 + im2col + U32 in_h[4] = {0}; + U32 in_w[4] = {0}; + + for (U32 i = 0; i < 4; i++) { + in_h[i] = ((hw + i) / ow) * convParamSpec.stride_h; + in_w[i] = ((hw + i) % ow) * convParamSpec.stride_w; + } + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F32 *in_hw4c8 = inArray_pad + c * ihiw * 8 + + fh_idx * convParamSpec.dilatedRate_h * iw_pad * 8 + + fw_idx * convParamSpec.dilatedRate_w * 8; + F32 *in_0 = in_hw4c8 + in_h[0] * iw_pad * 8 + in_w[0] * 8; + F32 *in_1 = in_hw4c8 + in_h[1] * iw_pad * 8 + in_w[1] * 8; + F32 *in_2 = in_hw4c8 + in_h[2] * iw_pad * 8 + in_w[2] * 8; + F32 *in_3 = in_hw4c8 + in_h[3] * iw_pad * 8 + in_w[3] * 8; + F32 *in_pack_c8hw4 = + in_pack + fh_idx * fw * ic * 8 * 4 + fw_idx * ic * 8 * 4 + c * 8 * 4; + + __asm__ __volatile__( + "vld1.f32 {d0-d3}, [%[in_0]]\n" + "vld1.f32 {d4-d7}, [%[in_1]]\n" + "vld1.f32 {d8-d11}, [%[in_2]]\n" + "vld1.f32 {d12-d15}, [%[in_3]]\n" + + "vzip.32 q0, q4\n" + "vzip.32 q2, q6\n" + + "vzip.32 q0, q2\n" + "vzip.32 q4, q6\n" + + "vst1.f32 {q0}, [%[pack]]!\n" + "vst1.f32 {q2}, [%[pack]]!\n" + "vst1.f32 {q4}, [%[pack]]!\n" + "vst1.f32 {q6}, [%[pack]]!\n" + + "vzip.32 q1, q5\n" + "vzip.32 q3, q7\n" + + "vzip.32 q1, q3\n" + "vzip.32 q5, q7\n" + + "vst1.f32 {q1}, [%[pack]]!\n" + "vst1.f32 {q3}, [%[pack]]!\n" + "vst1.f32 {q5}, [%[pack]]!\n" + "vst1.f32 {q7}, [%[pack]]!\n" + : [pack] "+r"(in_pack_c8hw4), [in_0] "+r"(in_0), [in_1] "+r"(in_1), + [in_2] "+r"(in_2), [in_3] "+r"(in_3) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); + } + } + } + + // compute + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + const F32 *f_o0c0 = filterArray + o * 8 * fh * fw * ic * 8; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__( + "vld1.f32 {d8-d9}, [%[b_0]]\n" + "vld1.f32 {d10-d11}, [%[b_1]]\n" + "vld1.f32 {d0-d1}, [%[in_0]]!\n" + "vld1.f32 {d4-d7}, [%[f_0]]!\n" + + "vmov.f32 q6, q4\n" + "vmov.f32 q8, q4\n" + "vmov.f32 q10, q4\n" + + "mov r2, %[ic]\n" + + "vmov.f32 q7, q5\n" + "vmov.f32 q9, q5\n" + "vmov.f32 q11, q5\n" + + "0:\n" + "vmla.f32 q4, q2, d0[0]\n" + "vmla.f32 q6, q2, d0[1]\n" + "vmla.f32 q8, q2, d1[0]\n" + "vmla.f32 q10, q2, d1[1]\n" + + "vld1.f32 {d2-d3}, [%[in_0]]!\n" + "vld1.f32 {d4-d5}, 
[%[f_0]]!\n" + + "vmla.f32 q5, q3, d0[0]\n" + "vmla.f32 q7, q3, d0[1]\n" + "vmla.f32 q9, q3, d1[0]\n" + "vmla.f32 q11, q3, d1[1]\n" + + "vld1.f32 {d6-d7}, [%[f_0]]!\n" + "subs r2, r2, #2\n" + + "vmla.f32 q4, q2, d2[0]\n" + "vmla.f32 q6, q2, d2[1]\n" + "vmla.f32 q8, q2, d3[0]\n" + "vmla.f32 q10, q2, d3[1]\n" + + "vld1.f32 {d0-d1}, [%[in_0]]!\n" + "vld1.f32 {d4-d5}, [%[f_0]]!\n" + + "vmla.f32 q5, q3, d2[0]\n" + "vmla.f32 q7, q3, d2[1]\n" + "vmla.f32 q9, q3, d3[0]\n" + "vmla.f32 q11, q3, d3[1]\n" + + "vld1.f32 {d6-d7}, [%[f_0]]!\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "r2"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("veor q1, q1, q1\n" // zero + "vmax.f32 q4, q4, q1\n" + "vmax.f32 q5, q5, q1\n" + "vmax.f32 q6, q6, q1\n" + "vmax.f32 q7, q7, q1\n" + "vmax.f32 q8, q8, q1\n" + "vmax.f32 q9, q9, q1\n" + "vmax.f32 q10, q10, q1\n" + "vmax.f32 q11, q11, q1\n" + : + : + : "memory", "cc", "q1", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("veor q1, q1, q1\n" // zero + "vmov.f32 q2, #6.0\n" // six + "vmax.f32 q4, q4, q1\n" + "vmax.f32 q5, q5, q1\n" + "vmax.f32 q6, q6, q1\n" + "vmax.f32 q7, q7, q1\n" + "vmax.f32 q8, q8, q1\n" + "vmax.f32 q9, q9, q1\n" + "vmax.f32 q10, q10, q1\n" + "vmax.f32 q11, q11, q1\n" + "vmin.f32 q4, q4, q2\n" + "vmin.f32 q5, q5, q2\n" + "vmin.f32 q6, q6, q2\n" + "vmin.f32 q7, q7, q2\n" + "vmin.f32 q8, q8, q2\n" + "vmin.f32 q9, q9, q2\n" + "vmin.f32 q10, q10, q2\n" + "vmin.f32 q11, q11, q2\n" + : + : + : "memory", "cc", "q1", "q2", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__( + "vst1.f32 {q4}, [%[out_0]]!\n" + "vst1.f32 {q5}, [%[out_0]]!\n" + "vst1.f32 {q6}, [%[out_0]]!\n" + "vst1.f32 {q7}, [%[out_0]]!\n" + "vst1.f32 {q8}, [%[out_0]]!\n" + "vst1.f32 {q9}, [%[out_0]]!\n" + "vst1.f32 {q10}, [%[out_0]]!\n" + "vst1.f32 {q11}, [%[out_0]]!\n" + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11"); + b0 += 8; + b1 += 8; + } + ohow_s += 4; + ohow_tail -= 4; + } + + // I32 ohow_s = (ohow / 4) * 4; + + for (I32 hw = ohow_s; hw < ohow; hw++) { + const F32 *b0 = biasArray; + const F32 *b1 = biasArray + 4; + F32 *in_pack = ((F32 *)tmp) + ic * ih_pad * iw_pad * 8; + // pack input + // NCHW => NCHWc8hw1 + im2col + U32 in_h_0 = (hw / ow) * convParamSpec.stride_h; + U32 in_w_0 = (hw % ow) * convParamSpec.stride_w; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F32 *in_hw1c8 = inArray_pad + c * ihiw * 8 + + fh_idx * convParamSpec.dilatedRate_h * iw_pad * 8 + + fw_idx * convParamSpec.dilatedRate_w * 8; + F32 *in_0 = in_hw1c8 + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F32 *in_pack_c8hw1 = + in_pack + fh_idx * fw * ic * 8 + fw_idx * ic * 8 + c * 8; + + memcpy(in_pack_c8hw1, in_0, 8 * bytesOf(idt)); + } + } + } + + // compute + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + const F32 *f_o0c0 = filterArray + o * 8 * fh * fw * ic * 8; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__( + "vld1.f32 {d8-d9}, [%[b_0]]\n" + "vld1.f32 {d10-d11}, 
[%[b_1]]\n" + "vld1.f32 {d0}, [%[in_0]]!\n" + "vld1.f32 {d4-d7}, [%[f_0]]!\n" + "mov r2, %[ic]\n" + "0:\n" + "vmla.f32 q4, q2, d0[0]\n" + + "vld1.f32 {d4-d5}, [%[f_0]]!\n" + + "vmla.f32 q5, q3, d0[0]\n" + + "vld1.f32 {d6-d7}, [%[f_0]]!\n" + "subs r2, r2, #2\n" + + "vmla.f32 q4, q2, d0[1]\n" + + "vld1.f32 {d4-d5}, [%[f_0]]!\n" + + "vmla.f32 q5, q3, d0[1]\n" + + "vld1.f32 {d0}, [%[in_0]]!\n" + "vld1.f32 {d6-d7}, [%[f_0]]!\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "q0", "q2", "q3", "q4", "q5", "r2"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("veor q1, q1, q1\n" // zero + "vmax.f32 q4, q4, q1\n" + "vmax.f32 q5, q5, q1\n" + : + : + : "memory", "cc", "q1", "q4", "v5"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("veor q1, q1, q1\n" // zero + "vmov.f32 q2, #6.0\n" // six + "vmax.f32 q4, q4, q1\n" + "vmax.f32 q5, q5, q1\n" + "vmin.f32 q4, q4, q2\n" + "vmin.f32 q5, q5, q2\n" + : + : + : "memory", "cc", "q1", "q2", "q4", "v5"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("vst1.f32 {q4}, [%[out_0]]!\n" + "vst1.f32 {q5}, [%[out_0]]!\n" + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "q4", "q5"); + b0 += 8; + b1 += 8; + } + } + } + return ret; +} +#endif diff --git a/compute/tensor/src/cpu/arm/fp32/convolution_gemm_V8.cpp b/compute/tensor/src/cpu/arm/fp32/convolution_gemm_V8.cpp new file mode 100644 index 00000000..9b38c51d --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/convolution_gemm_V8.cpp @@ -0,0 +1,1010 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#ifdef __aarch64__ +#include "cpu/arm/fp32/tensor_computing_fp32.h" +#include <string.h> +#ifdef _USE_OPENMP +#include <omp.h> +#endif + +EE convolution_gemm_V8(TensorDesc inputDesc, + F32 *inArray, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + + if (fdf != DF_NHWCN8) { + CHECK_STATUS(NOT_MATCH); + } + + oc /= 8; + ic /= 8; + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + I32 ohow = oh * ow; + U32 ihiw = ih_pad * iw_pad; + F32 *inArray_pad; + EE ret = SUCCESS; + for (U32 n = 0; n < in; n++) { + if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { + inArray_pad = inArray + n * ic * ih * iw * 8; + } else { + // copy input into a padded buffer + inArray_pad = (F32 *)tmp; + F32 *inArray_pad_mov = inArray_pad; + F32 *inArray_mov = inArray + n * ic * ih * iw * 8; + for (U32 c = 0; c < ic; c++) { + for (U32 h = 0; h < paddingT; h++) { + memset(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += iw_pad * 8; + } + for (U32 h = paddingT; h < ih_pad - paddingB; h++) { + memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(idt)); + inArray_pad_mov += paddingL * 8; + memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); + inArray_pad_mov += iw * 8; + inArray_mov += iw * 8; + memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(idt)); + inArray_pad_mov += paddingR * 8; + } + for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { + memset(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += iw_pad * 8; + } + } + } + // ohow / 12 +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (I32 hw = 0; hw < ohow - 11; hw += 12) { + const F32 *b0 = biasArray; + const F32 *b1 = biasArray + 4; +#ifdef _USE_OPENMP + U32 thread_private_buffer_offset = 12 * fh * fw * ic * 8 * omp_get_thread_num(); +#else + U32 thread_private_buffer_offset = 0; +#endif + F32 *in_pack = ((F32 *)tmp) + ic * ihiw * 8 + thread_private_buffer_offset; + // pack input + // NCHWc8 => NHWChw12 + im2col + U32 in_h[12] = {0}; + U32 in_w[12] = {0}; + for (U32 i = 0; i < 12; i++) { + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; + } + + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F32 *in_hw12c8 = inArray_pad + c * ihiw * 8 + + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + F32 *in_0 = in_hw12c8 + in_h[0] * iw_pad * 8 + in_w[0] * 8; + F32 *in_1 = in_hw12c8 + in_h[1] * iw_pad * 8 + in_w[1] * 8; + F32 *in_2 = in_hw12c8 + in_h[2] * iw_pad * 8 + in_w[2] * 8; + F32 *in_3 = in_hw12c8 + in_h[3] * iw_pad * 8 +
in_w[3] * 8; + F32 *in_4 = in_hw12c8 + in_h[4] * iw_pad * 8 + in_w[4] * 8; + F32 *in_5 = in_hw12c8 + in_h[5] * iw_pad * 8 + in_w[5] * 8; + F32 *in_6 = in_hw12c8 + in_h[6] * iw_pad * 8 + in_w[6] * 8; + F32 *in_7 = in_hw12c8 + in_h[7] * iw_pad * 8 + in_w[7] * 8; + F32 *in_8 = in_hw12c8 + in_h[8] * iw_pad * 8 + in_w[8] * 8; + F32 *in_9 = in_hw12c8 + in_h[9] * iw_pad * 8 + in_w[9] * 8; + F32 *in_10 = in_hw12c8 + in_h[10] * iw_pad * 8 + in_w[10] * 8; + F32 *in_11 = in_hw12c8 + in_h[11] * iw_pad * 8 + in_w[11] * 8; + + // NHWChw12 + F32 *in_pack_c8hw12 = + in_pack + fh_idx * fw * ic * 12 * 8 + fw_idx * ic * 12 * 8 + c * 12 * 8; + + __asm__ __volatile__( + "ldp q0, q1, [%[in_0]]\n" + "ldp q2, q3, [%[in_1]]\n" + "ldp q4, q5, [%[in_2]]\n" + "ldp q6, q7, [%[in_3]]\n" + + "ldp q8, q9, [%[in_4]]\n" + "ldp q10, q11, [%[in_5]]\n" + "ldp q12, q13, [%[in_6]]\n" + "ldp q14, q15, [%[in_7]]\n" + + "ldp q16, q17, [%[in_8]]\n" + "ldp q18, q19, [%[in_9]]\n" + "ldp q20, q21, [%[in_10]]\n" + "ldp q22, q23, [%[in_11]]\n" + + "zip1 v24.4s, v0.4s, v2.4s\n" + "zip2 v25.4s, v0.4s, v2.4s\n" + "zip1 v26.4s, v4.4s, v6.4s\n" + "zip2 v27.4s, v4.4s, v6.4s\n" + + "zip1 v0.2d, v24.2d, v26.2d\n" + "zip2 v2.2d, v24.2d, v26.2d\n" + "zip1 v4.2d, v25.2d, v27.2d\n" + "zip2 v6.2d, v25.2d, v27.2d\n" + + "zip1 v24.4s, v8.4s, v10.4s\n" + "zip2 v25.4s, v8.4s, v10.4s\n" + "zip1 v26.4s, v12.4s, v14.4s\n" + "zip2 v27.4s, v12.4s, v14.4s\n" + + "zip1 v8.2d, v24.2d, v26.2d\n" + "zip2 v10.2d, v24.2d, v26.2d\n" + "zip1 v12.2d, v25.2d, v27.2d\n" + "zip2 v14.2d, v25.2d, v27.2d\n" + + "zip1 v24.4s, v16.4s, v18.4s\n" + "zip2 v25.4s, v16.4s, v18.4s\n" + "zip1 v26.4s, v20.4s, v22.4s\n" + "zip2 v27.4s, v20.4s, v22.4s\n" + + "zip1 v16.2d, v24.2d, v26.2d\n" + "zip2 v18.2d, v24.2d, v26.2d\n" + "zip1 v20.2d, v25.2d, v27.2d\n" + "zip2 v22.2d, v25.2d, v27.2d\n" + + "stp q0, q8, [%[pack]]\n" + "str q16, [%[pack], #32]\n" + "stp q2, q10, [%[pack], 48]\n" + "str q18, [%[pack], #80]\n" + "stp q4, q12, [%[pack], #96]\n" + "str q20, [%[pack], #128]\n" + "stp q6, q14, [%[pack], #144]\n" + "str q22, [%[pack], #176]\n" + + "zip1 v24.4s, v1.4s, v3.4s\n" + "zip2 v25.4s, v1.4s, v3.4s\n" + "zip1 v26.4s, v5.4s, v7.4s\n" + "zip2 v27.4s, v5.4s, v7.4s\n" + + "zip1 v1.2d, v24.2d, v26.2d\n" + "zip2 v3.2d, v24.2d, v26.2d\n" + "zip1 v5.2d, v25.2d, v27.2d\n" + "zip2 v7.2d, v25.2d, v27.2d\n" + + "zip1 v24.4s, v9.4s, v11.4s\n" + "zip2 v25.4s, v9.4s, v11.4s\n" + "zip1 v26.4s, v13.4s, v15.4s\n" + "zip2 v27.4s, v13.4s, v15.4s\n" + + "zip1 v9.2d, v24.2d, v26.2d\n" + "zip2 v11.2d, v24.2d, v26.2d\n" + "zip1 v13.2d, v25.2d, v27.2d\n" + "zip2 v15.2d, v25.2d, v27.2d\n" + + "zip1 v24.4s, v17.4s, v19.4s\n" + "zip2 v25.4s, v17.4s, v19.4s\n" + "zip1 v26.4s, v21.4s, v23.4s\n" + "zip2 v27.4s, v21.4s, v23.4s\n" + + "zip1 v17.2d, v24.2d, v26.2d\n" + "zip2 v19.2d, v24.2d, v26.2d\n" + "zip1 v21.2d, v25.2d, v27.2d\n" + "zip2 v23.2d, v25.2d, v27.2d\n" + + "stp q1, q9, [%[pack], #192]\n" + "str q17, [%[pack], #224]\n" + "stp q3, q11, [%[pack], 240]\n" + "str q19, [%[pack], #272]\n" + "stp q5, q13, [%[pack], 288]\n" + "str q21, [%[pack], #320]\n" + "stp q7, q15, [%[pack], 336]\n" + "str q23, [%[pack], #368]\n" + : + : [pack] "r"(in_pack_c8hw12), [in_0] "r"(in_0), [in_1] "r"(in_1), + [in_2] "r"(in_2), [in_3] "r"(in_3), [in_4] "r"(in_4), [in_5] "r"(in_5), + [in_6] "r"(in_6), [in_7] "r"(in_7), [in_8] "r"(in_8), [in_9] "r"(in_9), + [in_10] "r"(in_10), [in_11] "r"(in_11) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", 
"v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); + } + } + } + + // compute + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + const F32 *f_o0c0 = filterArray + o * 8 * fh * fw * ic * 8; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0 + o * 8; + const F32 *b_o1 = b1 + o * 8; + __asm__ __volatile__( + "ldr q27, [%[b_0]]\n" + "ldr q28, [%[b_1]]\n" + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" + + "mov v5.16b, v27.16b\n" + "ldr q1, [%[in_0]]\n" // in_hw0 + "mov v7.16b, v27.16b\n" + "mov v9.16b, v27.16b\n" + "mov v11.16b, v27.16b\n" + "ldr q0, [%[f_0]]\n" // f_o0c0 + "mov v13.16b, v27.16b\n" + "mov v15.16b, v27.16b\n" + "mov v17.16b, v27.16b\n" + "ldr q3, [%[in_0], #16]\n" + "mov v19.16b, v27.16b\n" + "mov v21.16b, v27.16b\n" + "mov v23.16b, v27.16b\n" + "mov v25.16b, v27.16b\n" + + "mov v6.16b, v28.16b\n" + "mov v8.16b, v28.16b\n" + "mov v10.16b, v28.16b\n" + "mov v12.16b, v28.16b\n" + "mov v14.16b, v28.16b\n" + "mov v16.16b, v28.16b\n" + "mov v18.16b, v28.16b\n" + "mov v20.16b, v28.16b\n" + "mov v22.16b, v28.16b\n" + "mov v24.16b, v28.16b\n" + "mov v26.16b, v28.16b\n" + "0:\n" + "fmla v5.4s, v0.4s, v1.s[0]\n" + "fmla v7.4s, v0.4s, v1.s[1]\n" + "ldr q2, [x3, 32]\n" + "ldr q29, [x0, 16]\n" + "fmla v9.4s, v0.4s, v1.s[2]\n" + "fmla v11.4s, v0.4s, v1.s[3]\n" + + "fmla v13.4s, v0.4s, v3.s[0]\n" + "fmla v15.4s, v0.4s, v3.s[1]\n" + "fmla v17.4s, v0.4s, v3.s[2]\n" + "fmla v19.4s, v0.4s, v3.s[3]\n" + + "fmla v21.4s, v0.4s, v2.s[0]\n" + "fmla v23.4s, v0.4s, v2.s[1]\n" + "fmla v25.4s, v0.4s, v2.s[2]\n" + "fmla v27.4s, v0.4s, v2.s[3]\n" + + "fmla v6.4s, v29.4s, v1.s[0]\n" + "fmla v8.4s, v29.4s, v1.s[1]\n" + "fmla v10.4s, v29.4s, v1.s[2]\n" + "fmla v12.4s, v29.4s, v1.s[3]\n" + + "fmla v14.4s, v29.4s, v3.s[0]\n" + "fmla v16.4s, v29.4s, v3.s[1]\n" + "ldr q1, [x3, 48]!\n" + "ldr q0, [x0, 32]!\n" + "fmla v18.4s, v29.4s, v3.s[2]\n" + "fmla v20.4s, v29.4s, v3.s[3]\n" + + "fmla v22.4s, v29.4s, v2.s[0]\n" + "fmla v24.4s, v29.4s, v2.s[1]\n" + "ldr q3, [x3, 16]\n" + "subs x2, x2, #1\n" + "fmla v26.4s, v29.4s, v2.s[2]\n" + "fmla v28.4s, v29.4s, v2.s[3]\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", "x1", "x2", + "x3"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + "fmax v7.4s, v7.4s, v1.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "fmax v24.4s, v24.4s, v1.4s\n" + "fmax v25.4s, v25.4s, v1.4s\n" + "fmax v26.4s, v26.4s, v1.4s\n" + "fmax v27.4s, v27.4s, v1.4s\n" + "fmax v28.4s, v28.4s, v1.4s\n" + 
: + : + : "memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmov v30.4s, 6.0\n" // six + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + "fmax v7.4s, v7.4s, v1.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "fmax v24.4s, v24.4s, v1.4s\n" + "fmax v25.4s, v25.4s, v1.4s\n" + "fmax v26.4s, v26.4s, v1.4s\n" + "fmax v27.4s, v27.4s, v1.4s\n" + "fmax v28.4s, v28.4s, v1.4s\n" + + "fmin v5.4s, v5.4s, v30.4s\n" + "fmin v6.4s, v6.4s, v30.4s\n" + "fmin v7.4s, v7.4s, v30.4s\n" + "fmin v8.4s, v8.4s, v30.4s\n" + "fmin v9.4s, v9.4s, v30.4s\n" + "fmin v10.4s, v10.4s, v30.4s\n" + "fmin v11.4s, v11.4s, v30.4s\n" + "fmin v12.4s, v12.4s, v30.4s\n" + "fmin v13.4s, v13.4s, v30.4s\n" + "fmin v14.4s, v14.4s, v30.4s\n" + "fmin v15.4s, v15.4s, v30.4s\n" + "fmin v16.4s, v16.4s, v30.4s\n" + "fmin v17.4s, v17.4s, v30.4s\n" + "fmin v18.4s, v18.4s, v30.4s\n" + "fmin v19.4s, v19.4s, v30.4s\n" + "fmin v20.4s, v20.4s, v30.4s\n" + "fmin v21.4s, v21.4s, v30.4s\n" + "fmin v22.4s, v22.4s, v30.4s\n" + "fmin v23.4s, v23.4s, v30.4s\n" + "fmin v24.4s, v24.4s, v30.4s\n" + "fmin v25.4s, v25.4s, v30.4s\n" + "fmin v26.4s, v26.4s, v30.4s\n" + "fmin v27.4s, v27.4s, v30.4s\n" + "fmin v28.4s, v28.4s, v30.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v30"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q5, [%[out_0]]\n" + "str q6, [%[out_0], #16]\n" + "str q7, [%[out_0], #32]\n" + "str q8, [%[out_0], #48]\n" + "str q9, [%[out_0], #64]\n" + "str q10, [%[out_0], #80]\n" + "str q11, [%[out_0], #96]\n" + "str q12, [%[out_0], #112]\n" + "str q13, [%[out_0], #128]\n" + "str q14, [%[out_0], #144]\n" + "str q15, [%[out_0], #160]\n" + "str q16, [%[out_0], #176]\n" + "str q17, [%[out_0], #192]\n" + "str q18, [%[out_0], #208]\n" + "str q19, [%[out_0], #224]\n" + "str q20, [%[out_0], #240]\n" + "str q21, [%[out_0], #256]\n" + "str q22, [%[out_0], #272]\n" + "str q23, [%[out_0], #288]\n" + "str q24, [%[out_0], #304]\n" + "str q25, [%[out_0], #320]\n" + "str q26, [%[out_0], #336]\n" + "str q27, [%[out_0], #352]\n" + "str q28, [%[out_0], #368]\n" + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28"); + } + } + + U32 ohow_s = (ohow / 12) * 12; + U32 ohow_tail = ohow - ohow_s; + + if (ohow_tail >= 8) { + I32 hw = ohow_s; + const F32 *b0 = biasArray; + const F32 *b1 = biasArray + 4; + F32 *in_pack = ((F32 *)tmp) + ic * ih_pad * iw_pad * 8; + // pack input + // NCHWc8 => NHWChw8 + im2col + U32 in_h[8] = {0}; + U32 in_w[8] = {0}; + + for (U32 i = 0; i < 8; 
i++) { + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; + } + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F32 *in_hw8c8 = inArray_pad + c * ihiw * 8 + fh_idx * dilateH * iw_pad * 8 + + fw_idx * dilateW * 8; + F32 *in_0 = in_hw8c8 + in_h[0] * iw_pad * 8 + in_w[0] * 8; + F32 *in_1 = in_hw8c8 + in_h[1] * iw_pad * 8 + in_w[1] * 8; + F32 *in_2 = in_hw8c8 + in_h[2] * iw_pad * 8 + in_w[2] * 8; + F32 *in_3 = in_hw8c8 + in_h[3] * iw_pad * 8 + in_w[3] * 8; + F32 *in_4 = in_hw8c8 + in_h[4] * iw_pad * 8 + in_w[4] * 8; + F32 *in_5 = in_hw8c8 + in_h[5] * iw_pad * 8 + in_w[5] * 8; + F32 *in_6 = in_hw8c8 + in_h[6] * iw_pad * 8 + in_w[6] * 8; + F32 *in_7 = in_hw8c8 + in_h[7] * iw_pad * 8 + in_w[7] * 8; + F32 *in_pack_c8hw8 = + in_pack + fh_idx * fw * ic * 8 * 8 + fw_idx * ic * 8 * 8 + c * 8 * 8; + + __asm__ __volatile__("ldp q0, q1, [%[in_0]]\n" + "ldp q2, q3, [%[in_1]]\n" + "ldp q4, q5, [%[in_2]]\n" + "ldp q6, q7, [%[in_3]]\n" + + "ldp q8, q9, [%[in_4]]\n" + "ldp q10, q11, [%[in_5]]\n" + "ldp q12, q13, [%[in_6]]\n" + "ldp q14, q15, [%[in_7]]\n" + + "zip1 v24.4s, v0.4s, v2.4s\n" + "zip2 v25.4s, v0.4s, v2.4s\n" + "zip1 v26.4s, v4.4s, v6.4s\n" + "zip2 v27.4s, v4.4s, v6.4s\n" + + "zip1 v0.2d, v24.2d, v26.2d\n" + "zip2 v2.2d, v24.2d, v26.2d\n" + "zip1 v4.2d, v25.2d, v27.2d\n" + "zip2 v6.2d, v25.2d, v27.2d\n" + + "zip1 v24.4s, v8.4s, v10.4s\n" + "zip2 v25.4s, v8.4s, v10.4s\n" + "zip1 v26.4s, v12.4s, v14.4s\n" + "zip2 v27.4s, v12.4s, v14.4s\n" + + "zip1 v8.2d, v24.2d, v26.2d\n" + "zip2 v10.2d, v24.2d, v26.2d\n" + "zip1 v12.2d, v25.2d, v27.2d\n" + "zip2 v14.2d, v25.2d, v27.2d\n" + + "stp q0, q8, [%[pack]]\n" + "stp q2, q10, [%[pack], #32]\n" + "stp q4, q12, [%[pack], #64]\n" + "stp q6, q14, [%[pack], #96]\n" + + "zip1 v24.4s, v1.4s, v3.4s\n" + "zip2 v25.4s, v1.4s, v3.4s\n" + "zip1 v26.4s, v5.4s, v7.4s\n" + "zip2 v27.4s, v5.4s, v7.4s\n" + + "zip1 v1.2d, v24.2d, v26.2d\n" + "zip2 v3.2d, v24.2d, v26.2d\n" + "zip1 v5.2d, v25.2d, v27.2d\n" + "zip2 v7.2d, v25.2d, v27.2d\n" + + "zip1 v24.4s, v9.4s, v11.4s\n" + "zip2 v25.4s, v9.4s, v11.4s\n" + "zip1 v26.4s, v13.4s, v15.4s\n" + "zip2 v27.4s, v13.4s, v15.4s\n" + + "zip1 v9.2d, v24.2d, v26.2d\n" + "zip2 v11.2d, v24.2d, v26.2d\n" + "zip1 v13.2d, v25.2d, v27.2d\n" + "zip2 v15.2d, v25.2d, v27.2d\n" + + "stp q1, q9, [%[pack], #128]\n" + "stp q3, q11, [%[pack], #160]\n" + "stp q5, q13, [%[pack], #192]\n" + "stp q7, q15, [%[pack], #224]\n" + : + : [pack] "r"(in_pack_c8hw8), [in_0] "r"(in_0), + [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3), + [in_4] "r"(in_4), [in_5] "r"(in_5), [in_6] "r"(in_6), + [in_7] "r"(in_7) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v24", "v25", "v26", "v27"); + } + } + } + + // compute + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + const F32 *f_o0c0 = filterArray + o * 8 * fh * fw * ic * 8; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__( + "ldr q27, [%[b_0]]\n" + "ldr q28, [%[b_1]]\n" + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" + + "mov v5.16b, v27.16b\n" + "ldr q1, [%[in_0]]\n" // in_hw0 + "mov v7.16b, v27.16b\n" + "mov v9.16b, v27.16b\n" + "mov v11.16b, v27.16b\n" + "ldr q0, [%[f_0]]\n" // f_o0c0 + "mov v13.16b, v27.16b\n" + "mov v15.16b, v27.16b\n" + "mov 
v17.16b, v27.16b\n" + "mov v19.16b, v27.16b\n" + + "mov v6.16b, v28.16b\n" + "mov v8.16b, v28.16b\n" + "mov v10.16b, v28.16b\n" + "mov v12.16b, v28.16b\n" + "mov v14.16b, v28.16b\n" + "mov v16.16b, v28.16b\n" + "mov v18.16b, v28.16b\n" + "mov v20.16b, v28.16b\n" + "0:\n" + "ldr q3, [x3, 16]!\n" + "ldr q29, [x0, 16]\n" + "fmla v5.4s, v0.4s, v1.s[0]\n" + "fmla v7.4s, v0.4s, v1.s[1]\n" + "fmla v9.4s, v0.4s, v1.s[2]\n" + "fmla v11.4s, v0.4s, v1.s[3]\n" + + "fmla v13.4s, v0.4s, v3.s[0]\n" + "fmla v15.4s, v0.4s, v3.s[1]\n" + "fmla v17.4s, v0.4s, v3.s[2]\n" + "fmla v19.4s, v0.4s, v3.s[3]\n" + + "fmla v6.4s, v29.4s, v1.s[0]\n" + "fmla v8.4s, v29.4s, v1.s[1]\n" + "fmla v10.4s, v29.4s, v1.s[2]\n" + "fmla v12.4s, v29.4s, v1.s[3]\n" + + "fmla v14.4s, v29.4s, v3.s[0]\n" + "fmla v16.4s, v29.4s, v3.s[1]\n" + "ldr q1, [x3, 16]!\n" + "ldr q0, [x0, 32]!\n" + "subs x2, x2, #1\n" + "fmla v18.4s, v29.4s, v3.s[2]\n" + "fmla v20.4s, v29.4s, v3.s[3]\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v27", "v28", + "v29", "x0", "x1", "x2", "x3"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + "fmax v7.4s, v7.4s, v1.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmov v30.4s, 6.0\n" // six + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + "fmax v7.4s, v7.4s, v1.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + + "fmin v5.4s, v5.4s, v30.4s\n" + "fmin v6.4s, v6.4s, v30.4s\n" + "fmin v7.4s, v7.4s, v30.4s\n" + "fmin v8.4s, v8.4s, v30.4s\n" + "fmin v9.4s, v9.4s, v30.4s\n" + "fmin v10.4s, v10.4s, v30.4s\n" + "fmin v11.4s, v11.4s, v30.4s\n" + "fmin v12.4s, v12.4s, v30.4s\n" + "fmin v13.4s, v13.4s, v30.4s\n" + "fmin v14.4s, v14.4s, v30.4s\n" + "fmin v15.4s, v15.4s, v30.4s\n" + "fmin v16.4s, v16.4s, v30.4s\n" + "fmin v17.4s, v17.4s, v30.4s\n" + "fmin v18.4s, v18.4s, v30.4s\n" + "fmin v19.4s, v19.4s, v30.4s\n" + "fmin v20.4s, v20.4s, v30.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20", "v30"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q5, [%[out_0]]\n" + "str q6, 
[%[out_0], #16]\n" + "str q7, [%[out_0], #32]\n" + "str q8, [%[out_0], #48]\n" + "str q9, [%[out_0], #64]\n" + "str q10, [%[out_0], #80]\n" + "str q11, [%[out_0], #96]\n" + "str q12, [%[out_0], #112]\n" + "str q13, [%[out_0], #128]\n" + "str q14, [%[out_0], #144]\n" + "str q15, [%[out_0], #160]\n" + "str q16, [%[out_0], #176]\n" + "str q17, [%[out_0], #192]\n" + "str q18, [%[out_0], #208]\n" + "str q19, [%[out_0], #224]\n" + "str q20, [%[out_0], #240]\n" + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20"); + b0 += 8; + b1 += 8; + } + ohow_s += 8; + ohow_tail -= 8; + } + + if (ohow_tail >= 4) { + I32 hw = ohow_s; + const F32 *b0 = biasArray; + const F32 *b1 = biasArray + 4; + F32 *in_pack = ((F32 *)tmp) + ic * ih_pad * iw_pad * 8; + // pack input + // NCHWc8 => NHWChw4 + im2col + U32 in_h[4] = {0}; + U32 in_w[4] = {0}; + + for (U32 i = 0; i < 4; i++) { + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; + } + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F32 *in_hw4c8 = inArray_pad + c * ihiw * 8 + fh_idx * dilateH * iw_pad * 8 + + fw_idx * dilateW * 8; + F32 *in_0 = in_hw4c8 + in_h[0] * iw_pad * 8 + in_w[0] * 8; + F32 *in_1 = in_hw4c8 + in_h[1] * iw_pad * 8 + in_w[1] * 8; + F32 *in_2 = in_hw4c8 + in_h[2] * iw_pad * 8 + in_w[2] * 8; + F32 *in_3 = in_hw4c8 + in_h[3] * iw_pad * 8 + in_w[3] * 8; + F32 *in_pack_c8hw4 = + in_pack + fh_idx * fw * ic * 8 * 4 + fw_idx * ic * 8 * 4 + c * 8 * 4; + + __asm__ __volatile__( + "ldp q0, q4, [%[in_0]]\n" + "ldp q1, q5, [%[in_1]]\n" + "ldp q2, q6, [%[in_2]]\n" + "ldp q3, q7, [%[in_3]]\n" + + "st4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[pack]], #64\n" + "st4 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[pack]]\n" + : [pack] "+r"(in_pack_c8hw4) + : [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); + } + } + } + + // compute + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + const F32 *f_o0c0 = filterArray + o * 8 * fh * fw * ic * 8; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__( + "ldr q27, [%[b_0]]\n" + "ldr q28, [%[b_1]]\n" + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" + + "mov v5.16b, v27.16b\n" + "ldr q1, [%[in_0]]\n" // in_hw0 + "mov v7.16b, v27.16b\n" + "mov v9.16b, v27.16b\n" + "mov v11.16b, v27.16b\n" + "ldr q0, [%[f_0]]\n" // f_o0c0 + + "mov v6.16b, v28.16b\n" + "mov v8.16b, v28.16b\n" + "mov v10.16b, v28.16b\n" + "mov v12.16b, v28.16b\n" + "0:\n" + "ldr q3, [x3, 16]!\n" + "ldr q29, [x0, 16]\n" + "fmla v5.4s, v0.4s, v1.s[0]\n" + "fmla v7.4s, v0.4s, v1.s[1]\n" + "fmla v9.4s, v0.4s, v1.s[2]\n" + "fmla v11.4s, v0.4s, v1.s[3]\n" + + "fmla v6.4s, v29.4s, v1.s[0]\n" + "fmla v8.4s, v29.4s, v1.s[1]\n" + "ldr q0, [x0, 32]!\n" + "subs x2, x2, #1\n" + "fmla v10.4s, v29.4s, v1.s[2]\n" + "fmla v12.4s, v29.4s, v1.s[3]\n" + + "mov v1.16b, v3.16b\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v27", "v28", "v29", "x0", "x1", "x2", "x3"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case 
ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + "fmax v7.4s, v7.4s, v1.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmov v30.4s, 6.0\n" // six + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + "fmax v7.4s, v7.4s, v1.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + + "fmin v5.4s, v5.4s, v30.4s\n" + "fmin v6.4s, v6.4s, v30.4s\n" + "fmin v7.4s, v7.4s, v30.4s\n" + "fmin v8.4s, v8.4s, v30.4s\n" + "fmin v9.4s, v9.4s, v30.4s\n" + "fmin v10.4s, v10.4s, v30.4s\n" + "fmin v11.4s, v11.4s, v30.4s\n" + "fmin v12.4s, v12.4s, v30.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v30"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__( + "str q5, [%[out_0]]\n" + "str q6, [%[out_0], #16]\n" + "str q7, [%[out_0], #32]\n" + "str q8, [%[out_0], #48]\n" + "str q9, [%[out_0], #64]\n" + "str q10, [%[out_0], #80]\n" + "str q11, [%[out_0], #96]\n" + "str q12, [%[out_0], #112]\n" + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12"); + b0 += 8; + b1 += 8; + } + ohow_s += 4; + ohow_tail -= 4; + } + + for (I32 hw = ohow_s; hw < ohow; hw++) { + const F32 *b0 = biasArray; + const F32 *b1 = biasArray + 4; + F32 *in_pack = ((F32 *)tmp) + ic * ih_pad * iw_pad * 8; + // pack input + // NCHW => NCHWc8hw1 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F32 *in_hw1c8 = inArray_pad + c * ihiw * 8 + fh_idx * dilateH * iw_pad * 8 + + fw_idx * dilateW * 8; + F32 *in_0 = in_hw1c8 + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F32 *in_pack_c8hw1 = + in_pack + fh_idx * fw * ic * 8 + fw_idx * ic * 8 + c * 8; + + memcpy(in_pack_c8hw1, in_0, 8 * bytesOf(idt)); + } + } + } + + // compute + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + const F32 *f_o0c0 = filterArray + o * 8 * fh * fw * ic * 8; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__( + "ldr q5, [%[b_0]]\n" + "ldr q6, [%[b_1]]\n" + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" + + "ldr s1, [%[in_0]]\n" // in_hw0 + "ldp q0, q29, [%[f_0]]\n" // f_o0c0 + + "0:\n" + "ldp q30, q28, [x0, #32]\n" + "ldr s3, [x3, #4]\n" + "fmla v5.4s, v0.4s, v1.s[0]\n" + "fmla v6.4s, v29.4s, v1.s[0]\n" + + "ldr q0, [x0, #64]!\n" + "subs x2, x2, #2\n" + "ldr q29, [x0, #16]\n" + "ldr s1, [x3, #8]!\n" + "fmla v5.4s, v30.4s, v3.s[0]\n" + "fmla v6.4s, v28.4s, v3.s[0]\n" + + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v3", "v5", "v6", "v28", "v29", "v30", "x0", "x1", + "x2", "x3"); + + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ 
__volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmov v30.4s, 6.0\n" // six + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + + "fmin v5.4s, v5.4s, v30.4s\n" + "fmin v6.4s, v6.4s, v30.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6", "v30"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q5, [%[out_0]]\n" + "str q6, [%[out_0], #16]\n" + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v5", "v6"); + b0 += 8; + b1 += 8; + } + } + } + return ret; +} +#endif diff --git a/tensor_computing/src/cpu/arm/fp32/convolution_gemm_icnchw_V7.cpp b/compute/tensor/src/cpu/arm/fp32/convolution_gemm_icnchw_V7.cpp similarity index 61% rename from tensor_computing/src/cpu/arm/fp32/convolution_gemm_icnchw_V7.cpp rename to compute/tensor/src/cpu/arm/fp32/convolution_gemm_icnchw_V7.cpp index 0d90205f..9219c0b5 100644 --- a/tensor_computing/src/cpu/arm/fp32/convolution_gemm_icnchw_V7.cpp +++ b/compute/tensor/src/cpu/arm/fp32/convolution_gemm_icnchw_V7.cpp @@ -1,28 +1,32 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- 
 #ifndef __aarch64__
 #include <string.h>
 #include "cpu/arm/fp32/tensor_computing_fp32.h"
 
-EE convolution_gemm_icnchw_V7(TensorDesc inputDesc, F32* inArray,
-    TensorDesc filterDesc, const F32* filterArray,
-    ConvolutionDesc convDesc,
-    TensorDesc biasDesc, const F32* biasArray,
-    U32 tmpBytes, void* tmp,
-    TensorDesc outputDesc, F32* outArray,
-    ActivationDesc activationDesc)
+EE convolution_gemm_icnchw_V7(TensorDesc inputDesc,
+    F32 *inArray,
+    TensorDesc filterDesc,
+    const F32 *filterArray,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc biasDesc,
+    const F32 *biasArray,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    F32 *outArray,
+    ActivationParamSpec activationDesc)
 {
     UNUSED(biasDesc);
     UNUSED(tmpBytes);
@@ -35,14 +39,14 @@ EE convolution_gemm_icnchw_V7(TensorDesc inputDesc, F32* inArray,
     CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
     CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw));
     CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow));
-    U32 strideH = convDesc.stride_h;
-    U32 strideW = convDesc.stride_w;
-    U32 paddingT = convDesc.padding_top;
-    U32 paddingB = convDesc.padding_bottom;
-    U32 paddingL = convDesc.padding_left;
-    U32 paddingR = convDesc.padding_right;
-    U32 dilateH = convDesc.dilatedRate_h;
-    U32 dilateW = convDesc.dilatedRate_w;
+    U32 strideH = convParamSpec.stride_h;
+    U32 strideW = convParamSpec.stride_w;
+    U32 paddingT = convParamSpec.padding_top;
+    U32 paddingB = convParamSpec.padding_bottom;
+    U32 paddingL = convParamSpec.padding_left;
+    U32 paddingR = convParamSpec.padding_right;
+    U32 dilateH = convParamSpec.dilatedRate_h;
+    U32 dilateW = convParamSpec.dilatedRate_w;
 
     if (fdf != DF_NHWCN8) {
         CHECK_STATUS(NOT_MATCH);
@@ -62,34 +66,34 @@ EE convolution_gemm_icnchw_V7(TensorDesc inputDesc, F32* inArray,
     oc /= 8;
     U32 ih_pad = ih + paddingT + paddingB;
     U32 iw_pad = iw + paddingL + paddingR;
-    I32 ohow = oh*ow;
-    U32 ihiw = ih_pad*iw_pad;
+    I32 ohow = oh * ow;
+    U32 ihiw = ih_pad * iw_pad;
     F32 *inArray_pad;
     EE ret = SUCCESS;
     for (U32 n = 0; n < in; n++) {
         if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) {
-            inArray_pad = inArray + n*ic*ih*iw;
+            inArray_pad = inArray + n * ic * ih * iw;
         } else {
             // copy input into a input with padding
-            inArray_pad = (F32*)tmp;
+            inArray_pad = (F32 *)tmp;
             F32 *inArray_pad_mov = inArray_pad;
-            F32 *inArray_mov = inArray + n*ic*ih*iw;
+            F32 *inArray_mov = inArray + n * ic * ih * iw;
             for (U32 c = 0; c < ic; c++) {
                 for (U32 h = 0; h < paddingT; h++) {
-                    memset(inArray_pad_mov, 0, iw_pad*bytesOf(idt));
+                    memset(inArray_pad_mov, 0, iw_pad * bytesOf(idt));
                     inArray_pad_mov += iw_pad;
                 }
                 for (U32 h = paddingT; h < ih_pad - paddingB; h++) {
-                    memset(inArray_pad_mov, 0, paddingL*bytesOf(idt));
+                    memset(inArray_pad_mov, 0, paddingL * bytesOf(idt));
                     inArray_pad_mov += paddingL;
-                    memcpy(inArray_pad_mov, inArray_mov, iw*bytesOf(idt));
+                    memcpy(inArray_pad_mov, inArray_mov, iw * bytesOf(idt));
                     inArray_pad_mov += iw;
                     inArray_mov += iw;
-                    memset(inArray_pad_mov, 0, paddingR*bytesOf(idt));
+                    memset(inArray_pad_mov, 0, paddingR * bytesOf(idt));
                     inArray_pad_mov += paddingR;
                 }
                 for (U32 h = ih_pad - paddingB; h < ih_pad; h++) {
-                    memset(inArray_pad_mov, 0, iw_pad*bytesOf(idt));
+                    memset(inArray_pad_mov, 0, iw_pad * bytesOf(idt));
                     inArray_pad_mov += iw_pad;
                 }
             }
@@ -98,38 +102,39 @@ EE convolution_gemm_icnchw_V7(TensorDesc inputDesc, F32* inArray,
         for (I32 hw = 0; hw < ohow - 5; hw += 6) {
             const F32 *b0 = biasArray;
             const F32 *b1 = biasArray + 4;
-            F32 *in_pack = ((F32*)tmp) +
ic*ih_pad*iw_pad; + F32 *in_pack = ((F32 *)tmp) + ic * ih_pad * iw_pad; // pack input // NCHW => NHWChw12 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; - U32 in_h_1 = ((hw+1)/ow)*strideH; - U32 in_w_1 = ((hw+1)%ow)*strideW; - U32 in_h_2 = ((hw+2)/ow)*strideH; - U32 in_w_2 = ((hw+2)%ow)*strideW; - U32 in_h_3 = ((hw+3)/ow)*strideH; - U32 in_w_3 = ((hw+3)%ow)*strideW; - U32 in_h_4 = ((hw+4)/ow)*strideH; - U32 in_w_4 = ((hw+4)%ow)*strideW; - U32 in_h_5 = ((hw+5)/ow)*strideH; - U32 in_w_5 = ((hw+5)%ow)*strideW; + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + U32 in_h_1 = ((hw + 1) / ow) * strideH; + U32 in_w_1 = ((hw + 1) % ow) * strideW; + U32 in_h_2 = ((hw + 2) / ow) * strideH; + U32 in_w_2 = ((hw + 2) % ow) * strideW; + U32 in_h_3 = ((hw + 3) / ow) * strideH; + U32 in_w_3 = ((hw + 3) % ow) * strideW; + U32 in_h_4 = ((hw + 4) / ow) * strideH; + U32 in_w_4 = ((hw + 4) % ow) * strideW; + U32 in_h_5 = ((hw + 5) / ow) * strideH; + U32 in_w_5 = ((hw + 5) % ow) * strideW; for (U32 c = 0; c < ic; c++) { for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - F32 *in_hw = inArray_pad + c*ihiw + fh_idx*dilateH*iw_pad + dilateW*fw_idx; - F32 *in_0 = in_hw + in_h_0*iw_pad + in_w_0; - F32 *in_1 = in_hw + in_h_1*iw_pad + in_w_1; - F32 *in_2 = in_hw + in_h_2*iw_pad + in_w_2; - F32 *in_3 = in_hw + in_h_3*iw_pad + in_w_3; - F32 *in_4 = in_hw + in_h_4*iw_pad + in_w_4; - F32 *in_5 = in_hw + in_h_5*iw_pad + in_w_5; - F32 *in_pack_hw6 = in_pack + (fh_idx*fw*ic + fw_idx*ic + c)*6; + F32 *in_hw = + inArray_pad + c * ihiw + fh_idx * dilateH * iw_pad + dilateW * fw_idx; + F32 *in_0 = in_hw + in_h_0 * iw_pad + in_w_0; + F32 *in_1 = in_hw + in_h_1 * iw_pad + in_w_1; + F32 *in_2 = in_hw + in_h_2 * iw_pad + in_w_2; + F32 *in_3 = in_hw + in_h_3 * iw_pad + in_w_3; + F32 *in_4 = in_hw + in_h_4 * iw_pad + in_w_4; + F32 *in_5 = in_hw + in_h_5 * iw_pad + in_w_5; + F32 *in_pack_hw6 = in_pack + (fh_idx * fw * ic + fw_idx * ic + c) * 6; *in_pack_hw6 = *in_0; - *(in_pack_hw6+1) = *in_1; - *(in_pack_hw6+2) = *in_2; - *(in_pack_hw6+3) = *in_3; - *(in_pack_hw6+4) = *in_4; - *(in_pack_hw6+5) = *in_5; + *(in_pack_hw6 + 1) = *in_1; + *(in_pack_hw6 + 2) = *in_2; + *(in_pack_hw6 + 3) = *in_3; + *(in_pack_hw6 + 4) = *in_4; + *(in_pack_hw6 + 5) = *in_5; } } } @@ -137,8 +142,8 @@ EE convolution_gemm_icnchw_V7(TensorDesc inputDesc, F32* inArray, // compute for (I32 o = 0; o < I32(oc); o++) { F32 *in_hw0 = in_pack; - const F32 *f_o0c0 = filterArray + o*8*fh*fw*ic; - F32 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; + const F32 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; // bias const F32 *b_o0 = b0; @@ -147,12 +152,12 @@ EE convolution_gemm_icnchw_V7(TensorDesc inputDesc, F32* inArray, "vld1.f32 {d10-d11}, [%[b_0]]\n" "vld1.f32 {d12-d13}, [%[b_1]]\n" "mov r2, %[ic]\n" - - "vld1.f32 {d2-d3}, [%[in_0]]!\n" //in_hw0 + + "vld1.f32 {d2-d3}, [%[in_0]]!\n" // in_hw0 "vmov.f32 q7, q5\n" "vmov.f32 q9, q5\n" "vmov.f32 q11, q5\n" - "vld1.f32 {d0-d1}, [%[f_0]]!\n" //f_o0c0 + "vld1.f32 {d0-d1}, [%[f_0]]!\n" // f_o0c0 "vmov.f32 q13, q5\n" "vmov.f32 q15, q5\n" @@ -184,7 +189,7 @@ EE convolution_gemm_icnchw_V7(TensorDesc inputDesc, F32* inArray, "cmp %[activation], #0\n" "beq 1f\n" - "veor q1, q1, q1\n" //zero + "veor q1, q1, q1\n" // zero "vmax.f32 q5, q5, q1\n" "vmax.f32 q6, q6, q1\n" "vmax.f32 q7, q7, q1\n" @@ -210,17 +215,11 @@ EE convolution_gemm_icnchw_V7(TensorDesc inputDesc, 
F32* inArray, "vst1.f32 {d28-d29}, [%[out_0]]!\n" "vst1.f32 {d30-d31}, [%[out_0]]!\n" "vst1.f32 {d6-d7}, [%[out_0]]\n" - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*fh*fw), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1), - [activation]"r"(activation) - :"memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "q12", "q13", "q14", "q15", - "r2" - ); + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [activation] "r"(activation) + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15", "r2"); b0 += 8; b1 += 8; } @@ -233,30 +232,31 @@ EE convolution_gemm_icnchw_V7(TensorDesc inputDesc, F32* inArray, I32 hw = ohow_s; const F32 *b0 = biasArray; const F32 *b1 = biasArray + 4; - F32 *in_pack = ((F32*)tmp) + ic*ih_pad*iw_pad; + F32 *in_pack = ((F32 *)tmp) + ic * ih_pad * iw_pad; // pack input // NCHW => NHWChw4 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; - U32 in_h_1 = ((hw+1)/ow)*strideH; - U32 in_w_1 = ((hw+1)%ow)*strideW; - U32 in_h_2 = ((hw+2)/ow)*strideH; - U32 in_w_2 = ((hw+2)%ow)*strideW; - U32 in_h_3 = ((hw+3)/ow)*strideH; - U32 in_w_3 = ((hw+3)%ow)*strideW; + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + U32 in_h_1 = ((hw + 1) / ow) * strideH; + U32 in_w_1 = ((hw + 1) % ow) * strideW; + U32 in_h_2 = ((hw + 2) / ow) * strideH; + U32 in_w_2 = ((hw + 2) % ow) * strideW; + U32 in_h_3 = ((hw + 3) / ow) * strideH; + U32 in_w_3 = ((hw + 3) % ow) * strideW; for (U32 c = 0; c < ic; c++) { for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - F32 *in_hw = inArray_pad + c*ihiw + fh_idx*dilateH*iw_pad + dilateW*fw_idx; - F32 *in_0 = in_hw + in_h_0*iw_pad + in_w_0; - F32 *in_1 = in_hw + in_h_1*iw_pad + in_w_1; - F32 *in_2 = in_hw + in_h_2*iw_pad + in_w_2; - F32 *in_3 = in_hw + in_h_3*iw_pad + in_w_3; - F32 *in_pack_hw4 = in_pack + fh_idx*fw*ic*4 + fw_idx*ic*4 + c*4; + F32 *in_hw = + inArray_pad + c * ihiw + fh_idx * dilateH * iw_pad + dilateW * fw_idx; + F32 *in_0 = in_hw + in_h_0 * iw_pad + in_w_0; + F32 *in_1 = in_hw + in_h_1 * iw_pad + in_w_1; + F32 *in_2 = in_hw + in_h_2 * iw_pad + in_w_2; + F32 *in_3 = in_hw + in_h_3 * iw_pad + in_w_3; + F32 *in_pack_hw4 = in_pack + fh_idx * fw * ic * 4 + fw_idx * ic * 4 + c * 4; *in_pack_hw4 = *in_0; - *(in_pack_hw4+1) = *in_1; - *(in_pack_hw4+2) = *in_2; - *(in_pack_hw4+3) = *in_3; + *(in_pack_hw4 + 1) = *in_1; + *(in_pack_hw4 + 2) = *in_2; + *(in_pack_hw4 + 3) = *in_3; } } } @@ -264,8 +264,8 @@ EE convolution_gemm_icnchw_V7(TensorDesc inputDesc, F32* inArray, // compute for (I32 o = 0; o < I32(oc); o++) { F32 *in_hw0 = in_pack; - const F32 *f_o0c0 = filterArray + o*8*fh*fw*ic; - F32 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; + const F32 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; // bias const F32 *b_o0 = b0; const F32 *b_o1 = b1; @@ -274,34 +274,35 @@ EE convolution_gemm_icnchw_V7(TensorDesc inputDesc, F32* inArray, "vld1.f32 {d12-d13}, [%[b_1]]\n" "mov r2, %[ic]\n" - "vld1.f32 {d2-d3}, [%[in_0]]!\n" //in_hw0 + "vld1.f32 {d2-d3}, [%[in_0]]!\n" // in_hw0 "vmov.f32 q7, q5\n" "vmov.f32 q9, q5\n" "vmov.f32 q11, q5\n" - "vld1.f32 {d0-d1}, [%[f_0]]!\n" //f_o0c0 + "vld1.f32 {d0-d1}, [%[f_0]]!\n" // f_o0c0 "vmov.f32 q8, q6\n" "vmov.f32 q10, q6\n" "vmov.f32 q12, q6\n" "0:\n" 
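+                    // the rewritten loop stages the next input vector in q3
+                    // (d6-d7) and reloads the filter into q0 each iteration,
+                    // so every register the loop writes is declared below;
+                    // the old code clobbered q2, which is absent from the list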
- "vld1.f32 {d4-d5}, [%[in_0]]!\n" + "vld1.f32 {d6-d7}, [%[in_0]]!\n" "vld1.f32 {d8-d9}, [%[f_0]]!\n" "vmla.f32 q5, q0, d2[0]\n" "vmla.f32 q7, q0, d2[1]\n" "vmla.f32 q9, q0, d3[0]\n" "vmla.f32 q11, q0, d3[1]\n" + "vld1.f32 {d0-d1}, [%[f_0]]!\n" "vmla.f32 q6, q4, d2[0]\n" "vmla.f32 q8, q4, d2[1]\n" "subs r2, r2, #1\n" "vmla.f32 q10, q4, d3[0]\n" "vmla.f32 q12, q4, d3[1]\n" - "vmov.f32 q1, q2\n" + "vmov.f32 q1, q3\n" "bne 0b\n" "cmp %[activation], #0\n" "beq 1f\n" - "veor q1, q1, q1\n" //zero + "veor q1, q1, q1\n" // zero "vmax.f32 q5, q5, q1\n" "vmax.f32 q6, q6, q1\n" "vmax.f32 q7, q7, q1\n" @@ -319,16 +320,11 @@ EE convolution_gemm_icnchw_V7(TensorDesc inputDesc, F32* inArray, "vst1.f32 {d20-d21}, [%[out_0]]!\n" "vst1.f32 {d22-d23}, [%[out_0]]!\n" "vst1.f32 {d24-d25}, [%[out_0]]\n" - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*fh*fw), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1), - [activation]"r"(activation) - :"memory", "cc", "q0", "q1", "q3", "q5", "q6", "q7", "q8", "q9", "q10", "q11", - "q12", "q4", "r2" - ); + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [activation] "r"(activation) + : "memory", "cc", "q0", "q1", "q3", "q5", "q6", "q7", "q8", "q9", "q10", "q11", + "q12", "q4", "r2"); b0 += 8; b1 += 8; } @@ -339,17 +335,18 @@ EE convolution_gemm_icnchw_V7(TensorDesc inputDesc, F32* inArray, for (I32 hw = ohow_s; hw < ohow; hw++) { const F32 *b0 = biasArray; const F32 *b1 = biasArray + 4; - F32 *in_pack = ((F32*)tmp) + ic*ih_pad*iw_pad; + F32 *in_pack = ((F32 *)tmp) + ic * ih_pad * iw_pad; // pack input // NCHW => NCHWc8hw1 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; for (U32 c = 0; c < ic; c++) { for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - F32 *in_hw = inArray_pad + c*ihiw + fh_idx*dilateH*iw_pad + dilateW*fw_idx; - F32 *in_0 = in_hw + in_h_0*iw_pad + in_w_0; - F32 *in_pack_hw1 = in_pack + fh_idx*fw*ic + fw_idx*ic + c; + F32 *in_hw = + inArray_pad + c * ihiw + fh_idx * dilateH * iw_pad + dilateW * fw_idx; + F32 *in_0 = in_hw + in_h_0 * iw_pad + in_w_0; + F32 *in_pack_hw1 = in_pack + fh_idx * fw * ic + fw_idx * ic + c; *in_pack_hw1 = *in_0; } } @@ -358,8 +355,8 @@ EE convolution_gemm_icnchw_V7(TensorDesc inputDesc, F32* inArray, // compute for (I32 o = 0; o < I32(oc); o++) { F32 *in_hw0 = in_pack; - const F32 *f_o0c0 = filterArray + o*8*fh*fw*ic; - F32 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; + const F32 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; // bias const F32 *b_o0 = b0; const F32 *b_o1 = b1; @@ -379,21 +376,16 @@ EE convolution_gemm_icnchw_V7(TensorDesc inputDesc, F32* inArray, "cmp %[activation], #0\n" "beq 1f\n" - "veor q1, q1, q1\n" //zero + "veor q1, q1, q1\n" // zero "vmax.f32 q5, q5, q1\n" "vmax.f32 q6, q6, q1\n" "1:\n" "vst1.f32 {d10-d11}, [%[out_0]]!\n" "vst1.f32 {d12-d13}, [%[out_0]]\n" - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(in_hw0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*fh*fw), - [b_0]"r"(b_o0), - [b_1]"r"(b_o1), - [activation]"r"(activation) - :"memory", "cc", "q0", "q1", "q5", "q6", "q4", "r2" - ); + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [activation] "r"(activation) + : "memory", "cc", "q0", "q1", "q5", "q6", 
"q4", "r2"); b0 += 8; b1 += 8; } diff --git a/compute/tensor/src/cpu/arm/fp32/convolution_gemm_icnchw_V8.cpp b/compute/tensor/src/cpu/arm/fp32/convolution_gemm_icnchw_V8.cpp new file mode 100644 index 00000000..66c07de4 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/convolution_gemm_icnchw_V8.cpp @@ -0,0 +1,845 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifdef __aarch64__ +#include +#include "cpu/arm/fp32/tensor_computing_fp32.h" + +EE convolution_gemm_icnchw_V8(TensorDesc inputDesc, + F32 *inArray, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + + if (fdf != DF_NHWCN8) { + CHECK_STATUS(NOT_MATCH); + } + + oc /= 8; + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + I32 ohow = oh * ow; + U32 ihiw = ih_pad * iw_pad; + F32 *inArray_pad; + EE ret = SUCCESS; + for (U32 n = 0; n < in; n++) { + if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { + inArray_pad = inArray + n * ic * ih * iw; + } else { + // copy input into a input with padding + inArray_pad = (F32 *)tmp; + F32 *inArray_pad_mov = inArray_pad; + F32 *inArray_mov = inArray + n * ic * ih * iw; + for (U32 c = 0; c < ic; c++) { + for (U32 h = 0; h < paddingT; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(idt)); + inArray_pad_mov += iw_pad; + } + for (U32 h = paddingT; h < ih_pad - paddingB; h++) { + memset(inArray_pad_mov, 0, paddingL * bytesOf(idt)); + inArray_pad_mov += paddingL; + memcpy(inArray_pad_mov, inArray_mov, iw * bytesOf(idt)); + inArray_pad_mov += iw; + inArray_mov += 
iw; + memset(inArray_pad_mov, 0, paddingR * bytesOf(idt)); + inArray_pad_mov += paddingR; + } + for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { + memset(inArray_pad_mov, 0, iw_pad * bytesOf(idt)); + inArray_pad_mov += iw_pad; + } + } + } + // ohow / 12 + for (I32 hw = 0; hw < ohow - 11; hw += 12) { + const F32 *b0 = biasArray; + const F32 *b1 = biasArray + 4; + F32 *in_pack = ((F32 *)tmp) + ic * ih_pad * iw_pad; + // pack input + // NCHW => NHWChw12 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + U32 in_h_1 = ((hw + 1) / ow) * strideH; + U32 in_w_1 = ((hw + 1) % ow) * strideW; + U32 in_h_2 = ((hw + 2) / ow) * strideH; + U32 in_w_2 = ((hw + 2) % ow) * strideW; + U32 in_h_3 = ((hw + 3) / ow) * strideH; + U32 in_w_3 = ((hw + 3) % ow) * strideW; + U32 in_h_4 = ((hw + 4) / ow) * strideH; + U32 in_w_4 = ((hw + 4) % ow) * strideW; + U32 in_h_5 = ((hw + 5) / ow) * strideH; + U32 in_w_5 = ((hw + 5) % ow) * strideW; + U32 in_h_6 = ((hw + 6) / ow) * strideH; + U32 in_w_6 = ((hw + 6) % ow) * strideW; + U32 in_h_7 = ((hw + 7) / ow) * strideH; + U32 in_w_7 = ((hw + 7) % ow) * strideW; + U32 in_h_8 = ((hw + 8) / ow) * strideH; + U32 in_w_8 = ((hw + 8) % ow) * strideW; + U32 in_h_9 = ((hw + 9) / ow) * strideH; + U32 in_w_9 = ((hw + 9) % ow) * strideW; + U32 in_h_10 = ((hw + 10) / ow) * strideH; + U32 in_w_10 = ((hw + 10) % ow) * strideW; + U32 in_h_11 = ((hw + 11) / ow) * strideH; + U32 in_w_11 = ((hw + 11) % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F32 *in_hw = + inArray_pad + c * ihiw + fh_idx * dilateH * iw_pad + dilateW * fw_idx; + F32 *in_0 = in_hw + in_h_0 * iw_pad + in_w_0; + F32 *in_1 = in_hw + in_h_1 * iw_pad + in_w_1; + F32 *in_2 = in_hw + in_h_2 * iw_pad + in_w_2; + F32 *in_3 = in_hw + in_h_3 * iw_pad + in_w_3; + F32 *in_4 = in_hw + in_h_4 * iw_pad + in_w_4; + F32 *in_5 = in_hw + in_h_5 * iw_pad + in_w_5; + F32 *in_6 = in_hw + in_h_6 * iw_pad + in_w_6; + F32 *in_7 = in_hw + in_h_7 * iw_pad + in_w_7; + F32 *in_8 = in_hw + in_h_8 * iw_pad + in_w_8; + F32 *in_9 = in_hw + in_h_9 * iw_pad + in_w_9; + F32 *in_10 = in_hw + in_h_10 * iw_pad + in_w_10; + F32 *in_11 = in_hw + in_h_11 * iw_pad + in_w_11; + F32 *in_pack_hw12 = + in_pack + fh_idx * fw * ic * 12 + fw_idx * ic * 12 + c * 12; + *in_pack_hw12 = *in_0; + *(in_pack_hw12 + 1) = *in_1; + *(in_pack_hw12 + 2) = *in_2; + *(in_pack_hw12 + 3) = *in_3; + *(in_pack_hw12 + 4) = *in_4; + *(in_pack_hw12 + 5) = *in_5; + *(in_pack_hw12 + 6) = *in_6; + *(in_pack_hw12 + 7) = *in_7; + *(in_pack_hw12 + 8) = *in_8; + *(in_pack_hw12 + 9) = *in_9; + *(in_pack_hw12 + 10) = *in_10; + *(in_pack_hw12 + 11) = *in_11; + } + } + } + + // compute + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + const F32 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__("ldr q5, [%[b_0]]\n" + "ldr q6, [%[b_1]]\n" + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" + + "ldr q1, [%[in_0]]\n" // in_hw0 + "mov v7.16b, v5.16b\n" + "mov v9.16b, v5.16b\n" + "mov v11.16b, v5.16b\n" + "ldr q0, [%[f_0]]\n" // f_o0c0 + "mov v13.16b, v5.16b\n" + "mov v15.16b, v5.16b\n" + "mov v17.16b, v5.16b\n" + "ldr q3, [%[in_0], #16]\n" + "mov v19.16b, v5.16b\n" + "mov v21.16b, v5.16b\n" + "mov v23.16b, v5.16b\n" + "mov v25.16b, v5.16b\n" + "mov 
v27.16b, v5.16b\n" + + "mov v8.16b, v6.16b\n" + "mov v10.16b, v6.16b\n" + "mov v12.16b, v6.16b\n" + "mov v14.16b, v6.16b\n" + "mov v16.16b, v6.16b\n" + "mov v18.16b, v6.16b\n" + "mov v20.16b, v6.16b\n" + "mov v22.16b, v6.16b\n" + "mov v24.16b, v6.16b\n" + "mov v26.16b, v6.16b\n" + "mov v28.16b, v6.16b\n" + "0:\n" + "fmla v5.4s, v0.4s, v1.s[0]\n" + "fmla v7.4s, v0.4s, v1.s[1]\n" + "ldr q2, [x3, 32]\n" + "ldr q4, [x0, 16]\n" + "fmla v9.4s, v0.4s, v1.s[2]\n" + "fmla v11.4s, v0.4s, v1.s[3]\n" + + "fmla v13.4s, v0.4s, v3.s[0]\n" + "fmla v15.4s, v0.4s, v3.s[1]\n" + "fmla v17.4s, v0.4s, v3.s[2]\n" + "fmla v19.4s, v0.4s, v3.s[3]\n" + + "fmla v21.4s, v0.4s, v2.s[0]\n" + "fmla v23.4s, v0.4s, v2.s[1]\n" + "fmla v25.4s, v0.4s, v2.s[2]\n" + "fmla v27.4s, v0.4s, v2.s[3]\n" + + "fmla v6.4s, v4.4s, v1.s[0]\n" + "fmla v8.4s, v4.4s, v1.s[1]\n" + "fmla v10.4s, v4.4s, v1.s[2]\n" + "fmla v12.4s, v4.4s, v1.s[3]\n" + + "fmla v14.4s, v4.4s, v3.s[0]\n" + "fmla v16.4s, v4.4s, v3.s[1]\n" + "ldr q1, [x3, 48]!\n" + "ldr q0, [x0, 32]!\n" + "fmla v18.4s, v4.4s, v3.s[2]\n" + "fmla v20.4s, v4.4s, v3.s[3]\n" + + "fmla v22.4s, v4.4s, v2.s[0]\n" + "fmla v24.4s, v4.4s, v2.s[1]\n" + "ldr q3, [x3, 16]\n" + "subs x2, x2, #1\n" + "fmla v26.4s, v4.4s, v2.s[2]\n" + "fmla v28.4s, v4.4s, v2.s[3]\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v4", "v30", "x0", "x1", "x2", "x3"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + "fmax v7.4s, v7.4s, v1.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "fmax v24.4s, v24.4s, v1.4s\n" + "fmax v25.4s, v25.4s, v1.4s\n" + "fmax v26.4s, v26.4s, v1.4s\n" + "fmax v27.4s, v27.4s, v1.4s\n" + "fmax v28.4s, v28.4s, v1.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmov v30.4s, 6.0\n" // six + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + "fmax v7.4s, v7.4s, v1.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "fmax v24.4s, 
v24.4s, v1.4s\n" + "fmax v25.4s, v25.4s, v1.4s\n" + "fmax v26.4s, v26.4s, v1.4s\n" + "fmax v27.4s, v27.4s, v1.4s\n" + "fmax v28.4s, v28.4s, v1.4s\n" + + "fmin v5.4s, v5.4s, v30.4s\n" + "fmin v6.4s, v6.4s, v30.4s\n" + "fmin v7.4s, v7.4s, v30.4s\n" + "fmin v8.4s, v8.4s, v30.4s\n" + "fmin v9.4s, v9.4s, v30.4s\n" + "fmin v10.4s, v10.4s, v30.4s\n" + "fmin v11.4s, v11.4s, v30.4s\n" + "fmin v12.4s, v12.4s, v30.4s\n" + "fmin v13.4s, v13.4s, v30.4s\n" + "fmin v14.4s, v14.4s, v30.4s\n" + "fmin v15.4s, v15.4s, v30.4s\n" + "fmin v16.4s, v16.4s, v30.4s\n" + "fmin v17.4s, v17.4s, v30.4s\n" + "fmin v18.4s, v18.4s, v30.4s\n" + "fmin v19.4s, v19.4s, v30.4s\n" + "fmin v20.4s, v20.4s, v30.4s\n" + "fmin v21.4s, v21.4s, v30.4s\n" + "fmin v22.4s, v22.4s, v30.4s\n" + "fmin v23.4s, v23.4s, v30.4s\n" + "fmin v24.4s, v24.4s, v30.4s\n" + "fmin v25.4s, v25.4s, v30.4s\n" + "fmin v26.4s, v26.4s, v30.4s\n" + "fmin v27.4s, v27.4s, v30.4s\n" + "fmin v28.4s, v28.4s, v30.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v30"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q5, [%[out_0]]\n" + "str q6, [%[out_0], #16]\n" + "str q7, [%[out_0], #32]\n" + "str q8, [%[out_0], #48]\n" + "str q9, [%[out_0], #64]\n" + "str q10, [%[out_0], #80]\n" + "str q11, [%[out_0], #96]\n" + "str q12, [%[out_0], #112]\n" + "str q13, [%[out_0], #128]\n" + "str q14, [%[out_0], #144]\n" + "str q15, [%[out_0], #160]\n" + "str q16, [%[out_0], #176]\n" + "str q17, [%[out_0], #192]\n" + "str q18, [%[out_0], #208]\n" + "str q19, [%[out_0], #224]\n" + "str q20, [%[out_0], #240]\n" + "str q21, [%[out_0], #256]\n" + "str q22, [%[out_0], #272]\n" + "str q23, [%[out_0], #288]\n" + "str q24, [%[out_0], #304]\n" + "str q25, [%[out_0], #320]\n" + "str q26, [%[out_0], #336]\n" + "str q27, [%[out_0], #352]\n" + "str q28, [%[out_0], #368]\n" + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28"); + b0 += 8; + b1 += 8; + } + } + + U32 ohow_s = (ohow / 12) * 12; + U32 ohow_tail = ohow - ohow_s; + + if (ohow_tail >= 8) { + I32 hw = ohow_s; + const F32 *b0 = biasArray; + const F32 *b1 = biasArray + 4; + F32 *in_pack = ((F32 *)tmp) + ic * ih_pad * iw_pad; + // pack input + // NCHW => NHWChw8 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + U32 in_h_1 = ((hw + 1) / ow) * strideH; + U32 in_w_1 = ((hw + 1) % ow) * strideW; + U32 in_h_2 = ((hw + 2) / ow) * strideH; + U32 in_w_2 = ((hw + 2) % ow) * strideW; + U32 in_h_3 = ((hw + 3) / ow) * strideH; + U32 in_w_3 = ((hw + 3) % ow) * strideW; + U32 in_h_4 = ((hw + 4) / ow) * strideH; + U32 in_w_4 = ((hw + 4) % ow) * strideW; + U32 in_h_5 = ((hw + 5) / ow) * strideH; + U32 in_w_5 = ((hw + 5) % ow) * strideW; + U32 in_h_6 = ((hw + 6) / ow) * strideH; + U32 in_w_6 = ((hw + 6) % ow) * strideW; + U32 in_h_7 = ((hw + 7) / ow) * strideH; + U32 in_w_7 = ((hw + 7) % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F32 *in_hw = + inArray_pad + c * ihiw + fh_idx * dilateH * iw_pad + dilateW * fw_idx; + F32 *in_0 = in_hw + in_h_0 * iw_pad + in_w_0; + F32 *in_1 = in_hw + in_h_1 * iw_pad + in_w_1; + F32 *in_2 = in_hw + in_h_2 * iw_pad + in_w_2; + 
F32 *in_3 = in_hw + in_h_3 * iw_pad + in_w_3; + F32 *in_4 = in_hw + in_h_4 * iw_pad + in_w_4; + F32 *in_5 = in_hw + in_h_5 * iw_pad + in_w_5; + F32 *in_6 = in_hw + in_h_6 * iw_pad + in_w_6; + F32 *in_7 = in_hw + in_h_7 * iw_pad + in_w_7; + F32 *in_pack_hw8 = in_pack + fh_idx * fw * ic * 8 + fw_idx * ic * 8 + c * 8; + *in_pack_hw8 = *in_0; + *(in_pack_hw8 + 1) = *in_1; + *(in_pack_hw8 + 2) = *in_2; + *(in_pack_hw8 + 3) = *in_3; + *(in_pack_hw8 + 4) = *in_4; + *(in_pack_hw8 + 5) = *in_5; + *(in_pack_hw8 + 6) = *in_6; + *(in_pack_hw8 + 7) = *in_7; + } + } + } + + // compute + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + const F32 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__("ldr q5, [%[b_0]]\n" + "ldr q6, [%[b_1]]\n" + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" + + "ldr q1, [%[in_0]]\n" // in_hw0 + "mov v7.16b, v5.16b\n" + "mov v9.16b, v5.16b\n" + "mov v11.16b, v5.16b\n" + "ldr q0, [%[f_0]]\n" // f_o0c0 + "mov v13.16b, v5.16b\n" + "mov v15.16b, v5.16b\n" + "mov v17.16b, v5.16b\n" + "mov v19.16b, v5.16b\n" + + "mov v6.16b, v6.16b\n" + "mov v8.16b, v6.16b\n" + "mov v10.16b, v6.16b\n" + "mov v12.16b, v6.16b\n" + "mov v14.16b, v6.16b\n" + "mov v16.16b, v6.16b\n" + "mov v18.16b, v6.16b\n" + "mov v20.16b, v6.16b\n" + "0:\n" + "ldr q3, [x3, 16]!\n" + "ldr q4, [x0, 16]\n" + "fmla v5.4s, v0.4s, v1.s[0]\n" + "fmla v7.4s, v0.4s, v1.s[1]\n" + "fmla v9.4s, v0.4s, v1.s[2]\n" + "fmla v11.4s, v0.4s, v1.s[3]\n" + + "fmla v13.4s, v0.4s, v3.s[0]\n" + "fmla v15.4s, v0.4s, v3.s[1]\n" + "fmla v17.4s, v0.4s, v3.s[2]\n" + "fmla v19.4s, v0.4s, v3.s[3]\n" + + "fmla v6.4s, v4.4s, v1.s[0]\n" + "fmla v8.4s, v4.4s, v1.s[1]\n" + "fmla v10.4s, v4.4s, v1.s[2]\n" + "fmla v12.4s, v4.4s, v1.s[3]\n" + + "fmla v14.4s, v4.4s, v3.s[0]\n" + "fmla v16.4s, v4.4s, v3.s[1]\n" + "ldr q1, [x3, 16]!\n" + "ldr q0, [x0, 32]!\n" + "subs x2, x2, #1\n" + "fmla v18.4s, v4.4s, v3.s[2]\n" + "fmla v20.4s, v4.4s, v3.s[3]\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20", "v4", "x0", "x1", "x2", "x3"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + "fmax v7.4s, v7.4s, v1.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmov v30.4s, 6.0\n" // six + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + "fmax v7.4s, v7.4s, v1.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + 
"fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + + "fmin v5.4s, v5.4s, v30.4s\n" + "fmin v6.4s, v6.4s, v30.4s\n" + "fmin v7.4s, v7.4s, v30.4s\n" + "fmin v8.4s, v8.4s, v30.4s\n" + "fmin v9.4s, v9.4s, v30.4s\n" + "fmin v10.4s, v10.4s, v30.4s\n" + "fmin v11.4s, v11.4s, v30.4s\n" + "fmin v12.4s, v12.4s, v30.4s\n" + "fmin v13.4s, v13.4s, v30.4s\n" + "fmin v14.4s, v14.4s, v30.4s\n" + "fmin v15.4s, v15.4s, v30.4s\n" + "fmin v16.4s, v16.4s, v30.4s\n" + "fmin v17.4s, v17.4s, v30.4s\n" + "fmin v18.4s, v18.4s, v30.4s\n" + "fmin v19.4s, v19.4s, v30.4s\n" + "fmin v20.4s, v20.4s, v30.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20", "v30"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q5, [%[out_0]]\n" + "str q6, [%[out_0], #16]\n" + "str q7, [%[out_0], #32]\n" + "str q8, [%[out_0], #48]\n" + "str q9, [%[out_0], #64]\n" + "str q10, [%[out_0], #80]\n" + "str q11, [%[out_0], #96]\n" + "str q12, [%[out_0], #112]\n" + "str q13, [%[out_0], #128]\n" + "str q14, [%[out_0], #144]\n" + "str q15, [%[out_0], #160]\n" + "str q16, [%[out_0], #176]\n" + "str q17, [%[out_0], #192]\n" + "str q18, [%[out_0], #208]\n" + "str q19, [%[out_0], #224]\n" + "str q20, [%[out_0], #240]\n" + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20"); + b0 += 8; + b1 += 8; + } + ohow_s += 8; + ohow_tail -= 8; + } + + if (ohow_tail >= 4) { + I32 hw = ohow_s; + const F32 *b0 = biasArray; + const F32 *b1 = biasArray + 4; + F32 *in_pack = ((F32 *)tmp) + ic * ih_pad * iw_pad; + // pack input + // NCHW => NHWChw4 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + U32 in_h_1 = ((hw + 1) / ow) * strideH; + U32 in_w_1 = ((hw + 1) % ow) * strideW; + U32 in_h_2 = ((hw + 2) / ow) * strideH; + U32 in_w_2 = ((hw + 2) % ow) * strideW; + U32 in_h_3 = ((hw + 3) / ow) * strideH; + U32 in_w_3 = ((hw + 3) % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F32 *in_hw = + inArray_pad + c * ihiw + fh_idx * dilateH * iw_pad + dilateW * fw_idx; + F32 *in_0 = in_hw + in_h_0 * iw_pad + in_w_0; + F32 *in_1 = in_hw + in_h_1 * iw_pad + in_w_1; + F32 *in_2 = in_hw + in_h_2 * iw_pad + in_w_2; + F32 *in_3 = in_hw + in_h_3 * iw_pad + in_w_3; + F32 *in_pack_hw4 = in_pack + fh_idx * fw * ic * 4 + fw_idx * ic * 4 + c * 4; + *in_pack_hw4 = *in_0; + *(in_pack_hw4 + 1) = *in_1; + *(in_pack_hw4 + 2) = *in_2; + *(in_pack_hw4 + 3) = *in_3; + } + } + } + + // compute + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + const F32 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__("ldr q5, [%[b_0]]\n" + "ldr q6, [%[b_1]]\n" + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" + + "ldr q1, [%[in_0]]\n" // in_hw0 + "mov v7.16b, v5.16b\n" + "mov v9.16b, v5.16b\n" + "mov 
v11.16b, v5.16b\n" + "ldr q0, [%[f_0]]\n" // f_o0c0 + + "mov v6.16b, v6.16b\n" + "mov v8.16b, v6.16b\n" + "mov v10.16b, v6.16b\n" + "mov v12.16b, v6.16b\n" + "0:\n" + "ldr q3, [x3, 16]!\n" + "ldr q4, [x0, 16]\n" + "fmla v5.4s, v0.4s, v1.s[0]\n" + "fmla v7.4s, v0.4s, v1.s[1]\n" + "fmla v9.4s, v0.4s, v1.s[2]\n" + "fmla v11.4s, v0.4s, v1.s[3]\n" + + "fmla v6.4s, v4.4s, v1.s[0]\n" + "fmla v8.4s, v4.4s, v1.s[1]\n" + "ldr q0, [x0, 32]!\n" + "subs x2, x2, #1\n" + "fmla v10.4s, v4.4s, v1.s[2]\n" + "fmla v12.4s, v4.4s, v1.s[3]\n" + "mov v1.16b, v3.16b\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v4", "x0", "x1", "x2", "x3"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + "fmax v7.4s, v7.4s, v1.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmov v30.4s, 6.0\n" // six + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + "fmax v7.4s, v7.4s, v1.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + + "fmin v5.4s, v5.4s, v30.4s\n" + "fmin v6.4s, v6.4s, v30.4s\n" + "fmin v7.4s, v7.4s, v30.4s\n" + "fmin v8.4s, v8.4s, v30.4s\n" + "fmin v9.4s, v9.4s, v30.4s\n" + "fmin v10.4s, v10.4s, v30.4s\n" + "fmin v11.4s, v11.4s, v30.4s\n" + "fmin v12.4s, v12.4s, v30.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v30"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__( + "str q5, [%[out_0]]\n" + "str q6, [%[out_0], #16]\n" + "str q7, [%[out_0], #32]\n" + "str q8, [%[out_0], #48]\n" + "str q9, [%[out_0], #64]\n" + "str q10, [%[out_0], #80]\n" + "str q11, [%[out_0], #96]\n" + "str q12, [%[out_0], #112]\n" + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12"); + b0 += 8; + b1 += 8; + } + ohow_s += 4; + ohow_tail -= 4; + } + + for (I32 hw = ohow_s; hw < ohow; hw++) { + const F32 *b0 = biasArray; + const F32 *b1 = biasArray + 4; + F32 *in_pack = ((F32 *)tmp) + ic * ih_pad * iw_pad; + // pack input + // NCHW => NCHWc8hw1 + im2col + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; + for (U32 c = 0; c < ic; c++) { + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + F32 *in_hw = + inArray_pad + c * ihiw + fh_idx * dilateH * iw_pad + dilateW * fw_idx; + F32 *in_0 = in_hw + in_h_0 * iw_pad + in_w_0; + F32 *in_pack_hw1 = in_pack + fh_idx * fw * ic + fw_idx * ic + c; + *in_pack_hw1 = *in_0; + } + } + } + + // compute + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + const F32 *f_o0c0 = filterArray + o * 8 * fh * fw * ic; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__( + "ldr q5, [%[b_0]]\n" + "ldr q6, [%[b_1]]\n" + // give in address to x3 + "mov 
x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" + + "0:\n" + "ldr q0, [x0], #16\n" + "subs x2, x2, #1\n" + "ldr q4, [x0], #16\n" + "ldr s1, [x3], #4\n" + "fmla v5.4s, v0.4s, v1.s[0]\n" + "fmla v6.4s, v4.4s, v1.s[0]\n" + + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * fh * fw), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "v0", "v1", "v5", "v6", "v4", "x0", "x1", "x2", "x3"); + switch (activationDesc.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v1.16b, v1.16b, v1.16b\n" // zero + "fmov v30.4s, 6.0\n" // six + "fmax v5.4s, v5.4s, v1.4s\n" + "fmax v6.4s, v6.4s, v1.4s\n" + + "fmin v5.4s, v5.4s, v30.4s\n" + "fmin v6.4s, v6.4s, v30.4s\n" + : + : + : "memory", "cc", "v1", "v5", "v6", "v30"); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + __asm__ __volatile__("str q5, [%[out_0]]\n" + "str q6, [%[out_0], #16]\n" + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "v5", "v6"); + b0 += 8; + b1 += 8; + } + } + } + return ret; +} +#endif diff --git a/tensor_computing/src/cpu/arm/fp32/convolution_transform.cpp b/compute/tensor/src/cpu/arm/fp32/convolution_transform.cpp similarity index 63% rename from tensor_computing/src/cpu/arm/fp32/convolution_transform.cpp rename to compute/tensor/src/cpu/arm/fp32/convolution_transform.cpp index 82cd38ec..5378e368 100644 --- a/tensor_computing/src/cpu/arm/fp32/convolution_transform.cpp +++ b/compute/tensor/src/cpu/arm/fp32/convolution_transform.cpp @@ -1,34 +1,36 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include <string.h> #include "cpu/arm/fp32/tensor_computing_fp32.h" #include "cpu/arm/fp32/convolution_winograd_transform.h" -inline EE convolution_transform_filter_kernel_fp32(TensorDesc filterDesc, const F32* filterArray, - TensorDesc *ftmDesc, F32* ftmArray, +inline EE convolution_transform_filter_kernel_fp32(TensorDesc filterDesc, + const F32 *filterArray, + TensorDesc *ftmDesc, + F32 *ftmArray, DataFormat ftmDataFormat) { - if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) + if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) { CHECK_STATUS(NULL_POINTER); + } DataType fdt; DataFormat fdf; U32 fn, fc, fh, fw; CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); if (fdf == ftmDataFormat) { *ftmDesc = filterDesc; - memcpy(ftmArray, filterArray, fn*fc*fh*fw*bytesOf(fdt)); + memcpy(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); return SUCCESS; } if (fdf != DF_NCHW) { @@ -38,14 +40,15 @@ inline EE convolution_transform_filter_kernel_fp32(TensorDesc filterDesc, const switch (ftmDataFormat) { case DF_NHWCN8: { /* - * NCHW => NHWCN8 - */ + * NCHW => NHWCN8 + */ U32 oc = fn / 8; for (U32 o = 0; o < oc; o++) { - for (U32 hw = 0; hw < fh*fw; hw++) { + for (U32 hw = 0; hw < fh * fw; hw++) { for (U32 c = 0; c < fc; c++) { for (U32 o8 = 0; o8 < 8; o8++) { - ftmArray[o*fh*fw*fc*8 + hw*fc*8 + c*8 + o8] = filterArray[(o*8+o8)*fc*fh*fw + c*fh*fw + hw]; + ftmArray[o * fh * fw * fc * 8 + hw * fc * 8 + c * 8 + o8] = + filterArray[(o * 8 + o8) * fc * fh * fw + c * fh * fw + hw]; } } } @@ -54,14 +57,14 @@ inline EE convolution_transform_filter_kernel_fp32(TensorDesc filterDesc, const break; } case DF_HWNCN8: { - for (U32 o = 0; o < fn/8; o++) { + for (U32 o = 0; o < fn / 8; o++) { for (U32 c = 0; c < fc; c++) { // Each time deal with N4; 2 times we have N8 - U32 f_off_0 = (o*8)*fc*fh*fw + c*fh*fw; - U32 f_off_1 = (o*8+4)*fc*fh*fw + c*fh*fw; + U32 f_off_0 = (o * 8) * fc * fh * fw + c * fh * fw; + U32 f_off_1 = (o * 8 + 4) * fc * fh * fw + c * fh * fw; - U32 ftm_off_0 = o*36*fc*8 + c*8; - U32 ftm_off_1 = o*36*fc*8 + c*8 + 4; + U32 ftm_off_0 = o * 36 * fc * 8 + c * 8; + U32 ftm_off_1 = o * 36 * fc * 8 + c * 8 + 4; F32 F[9][4]; F32 *F_ptr[9]; @@ -69,22 +72,22 @@ inline EE convolution_transform_filter_kernel_fp32(TensorDesc filterDesc, const for (U32 hw = 0; hw < 9; hw++) { for (U32 oo = 0; oo < 4; oo++) { - F[hw][oo] = filterArray[f_off_0 + hw + oo*fc*fh*fw]; + F[hw][oo] = filterArray[f_off_0 + hw + oo * fc * fh * fw]; } F_ptr[hw] = F[hw]; } for (U32 hw = 0; hw < 36; hw++) { - Fw[hw] = ftmArray + ftm_off_0 + hw*fc*8; + Fw[hw] = ftmArray + ftm_off_0 + hw * fc * 8; } trans_W_4x4_3x3(Fw, F_ptr); for (U32 hw = 0; hw < 9; hw++) { for (U32 oo = 0; oo < 4; oo++) { - F[hw][oo] = filterArray[f_off_1 + hw + oo*fc*fh*fw]; + F[hw][oo] = filterArray[f_off_1 + hw + oo * fc * fh * fw]; } F_ptr[hw] = F[hw]; } for (U32 hw = 0; hw < 36; hw++) { - Fw[hw] = ftmArray + ftm_off_1 + hw*fc*8; + Fw[hw] = ftmArray + ftm_off_1 + hw * fc * 8; } trans_W_4x4_3x3(Fw, F_ptr); } @@ -99,9 +102,12 @@ inline EE convolution_transform_filter_kernel_fp32(TensorDesc filterDesc, const return ret; } -EE convolution_transform_filter_fp32(TensorDesc filterDesc, const F32* filter, +EE convolution_transform_filter_fp32(TensorDesc 
filterDesc, + const F32 *filter, + ConvolutionParamSpec convParamSpec, ConvolutionForwardAlgorithm algorithm, - TensorDesc *ftmDesc, F32* filterTransformed) + TensorDesc *ftmDesc, + F32 *filterTransformed) { DataFormat ftmDataFormat; switch (algorithm) { @@ -117,7 +123,22 @@ EE convolution_transform_filter_fp32(TensorDesc filterDesc, const F32* filter, default: return NOT_MATCH; } - EE ret = convolution_transform_filter_kernel_fp32(filterDesc, filter, ftmDesc, filterTransformed, ftmDataFormat); - CHECK_STATUS(ret); - return ret; + + U32 channelAxis = filterDesc.nDims - 1; + TensorDesc tmpFilterDesc = filterDesc; + tmpFilterDesc.dims[channelAxis] /= convParamSpec.group; + U32 fnPadding = tmpFilterDesc.dims[channelAxis]; + if (fnPadding % 8 != 0) { + fnPadding = (fnPadding / 8 + 1) * 8; + } + U32 originalTileSize = tensorNumElements(tmpFilterDesc); + for (U32 g = 0; g < convParamSpec.group; g++) { + CHECK_STATUS(convolution_transform_filter_kernel_fp32( + tmpFilterDesc, filter, ftmDesc, filterTransformed, ftmDataFormat)); + U32 newTileSize = tensorNumElements(*ftmDesc) / tmpFilterDesc.dims[channelAxis] * fnPadding; + filter += originalTileSize; + filterTransformed += newTileSize; + } + ftmDesc->dims[channelAxis] = filterDesc.dims[channelAxis]; + return SUCCESS; } diff --git a/tensor_computing/src/cpu/arm/fp32/convolution_winograd_V8.cpp b/compute/tensor/src/cpu/arm/fp32/convolution_winograd_V8.cpp similarity index 71% rename from tensor_computing/src/cpu/arm/fp32/convolution_winograd_V8.cpp rename to compute/tensor/src/cpu/arm/fp32/convolution_winograd_V8.cpp index 6782a3b2..8b39981a 100644 --- a/tensor_computing/src/cpu/arm/fp32/convolution_winograd_V8.cpp +++ b/compute/tensor/src/cpu/arm/fp32/convolution_winograd_V8.cpp @@ -1,27 +1,31 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include "cpu/arm/fp32/convolution_winograd_transform.h" #include "cpu/arm/fp32/tensor_computing_fp32.h" -EE convolution_winograd_V8(TensorDesc inputDesc, F32* inArray, - TensorDesc filterDesc, const F32* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const F32* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, F32* outArray, - ActivationDesc activationDesc) +EE convolution_winograd_V8(TensorDesc inputDesc, + F32 *inArray, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc) { #ifdef __aarch64__ UNUSED(biasDesc); @@ -35,15 +39,17 @@ EE convolution_winograd_V8(TensorDesc inputDesc, F32* inArray, CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; - if (fdf != DF_HWNCN8) + if (fdf != DF_HWNCN8) { CHECK_STATUS(NOT_MATCH); - if (!(fh == 6 && fw == 6)) + } + if (!(fh == 6 && fw == 6)) { CHECK_STATUS(NOT_SUPPORTED); + } oc /= 8; ic /= 8; @@ -53,49 +59,49 @@ EE convolution_winograd_V8(TensorDesc inputDesc, F32* inArray, // num of 6x6 tiles I32 tiles = tile_h * tile_w; U32 pad_left = paddingL; - U32 pad_right = paddingR + (tile_w*4 - ow); - U32 pad_w_mod_4 = tile_w*4 - ow; + U32 pad_right = paddingR + (tile_w * 4 - ow); + U32 pad_w_mod_4 = tile_w * 4 - ow; U32 pad_top = paddingT; - U32 pad_bottom = paddingB + (tile_h*4 - oh); - U32 pad_h_mod_4 = tile_h*4 - oh; + U32 pad_bottom = paddingB + (tile_h * 4 - oh); + U32 pad_h_mod_4 = tile_h * 4 - oh; U32 ih_pad = ih + pad_top + pad_bottom; U32 iw_pad = iw + pad_left + pad_right; // tmp = in_pad + itm + otm // in_pad: ic*ih_pad*iw_pad*8 // itm: 6*6*ic*12*8 // otm: 6*6*12*8 - F32* inArray_pad = (F32*)tmp; - F32* itmArray = inArray_pad + ic*ih_pad*iw_pad*8; - F32* otmArray = itmArray + 6*6*ic*12*8; + F32 *inArray_pad = (F32 *)tmp; + F32 *itmArray = inArray_pad + ic * ih_pad * iw_pad * 8; + F32 *otmArray = itmArray + 6 * 6 * ic * 12 * 8; EE ret = SUCCESS; // copy input into a input with padding for (U32 n = 0; n < in; n++) { F32 *inArray_pad_mov = inArray_pad; - F32 *inArray_mov = inArray + n*ic*ih*iw*8; + F32 *inArray_mov = inArray + n * ic * ih * iw * 8; for (U32 c = 0; c < ic; c++) { - memset(inArray_pad_mov, 0, pad_top*iw_pad*8*bytesOf(idt)); - inArray_pad_mov += pad_top*iw_pad*8; + memset(inArray_pad_mov, 0, pad_top * iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += pad_top * iw_pad * 8; for (U32 h = pad_top; h < ih_pad - pad_bottom; h++) { - memset(inArray_pad_mov, 0, pad_left*8*bytesOf(idt)); - inArray_pad_mov += pad_left*8; - memcpy(inArray_pad_mov, inArray_mov, iw*8*bytesOf(idt)); - inArray_pad_mov += iw*8; - inArray_mov += iw*8; - memset(inArray_pad_mov, 0, 
pad_right*8*bytesOf(idt)); - inArray_pad_mov += pad_right*8; + memset(inArray_pad_mov, 0, pad_left * 8 * bytesOf(idt)); + inArray_pad_mov += pad_left * 8; + memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); + inArray_pad_mov += iw * 8; + inArray_mov += iw * 8; + memset(inArray_pad_mov, 0, pad_right * 8 * bytesOf(idt)); + inArray_pad_mov += pad_right * 8; } - memset(inArray_pad_mov, 0, pad_bottom*iw_pad*8*bytesOf(idt)); - inArray_pad_mov += pad_bottom*iw_pad*8; + memset(inArray_pad_mov, 0, pad_bottom * iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += pad_bottom * iw_pad * 8; } // tiles / 12 - for (I32 hw = 0; hw < tiles-11; hw+=12) { + for (I32 hw = 0; hw < tiles - 11; hw += 12) { // in trans // NCHWc8 => (6*6)*C*c8*hw12 for (U32 c = 0; c < ic; c++) { - F32 *inArray_pad_mov = inArray_pad + c*ih_pad*iw_pad*8; - F32 *itmArray_mov = itmArray + c*12*8; + F32 *inArray_pad_mov = inArray_pad + c * ih_pad * iw_pad * 8; + F32 *itmArray_mov = itmArray + c * 12 * 8; F32 *Iw_ptr0[36]; F32 *Iw_ptr1[36]; F32 Iw[12][36][8]; @@ -104,14 +110,16 @@ EE convolution_winograd_V8(TensorDesc inputDesc, F32* inArray, U32 h[12]; U32 w[12]; for (U32 index = 0; index < 12; index++) { - h[index] = ((hw + index) / tile_w) * 4; - w[index] = ((hw + index) % tile_w) * 4; + h[index] = ((hw + index) / tile_w) * 4; + w[index] = ((hw + index) % tile_w) * 4; } for (U32 i = 0; i < 6; i++) { for (U32 j = 0; j < 6; j++) { for (U32 index = 0; index < 12; index++) { - I0[index][i*6 + j] = inArray_pad_mov + (h[index] + i) * iw_pad * 8 + (w[index] + j) * 8; - I1[index][i*6 + j] = inArray_pad_mov + (h[index] + i) * iw_pad * 8 + (w[index] + j) * 8 + 4; + I0[index][i * 6 + j] = + inArray_pad_mov + (h[index] + i) * iw_pad * 8 + (w[index] + j) * 8; + I1[index][i * 6 + j] = inArray_pad_mov + (h[index] + i) * iw_pad * 8 + + (w[index] + j) * 8 + 4; } } } @@ -124,7 +132,7 @@ EE convolution_winograd_V8(TensorDesc inputDesc, F32* inArray, trans_I_4x4_3x3(Iw_ptr1, I1[index]); } for (U32 i = 0; i < 36; i++) { - F32* itm = itmArray_mov + i*ic*8*12; + F32 *itm = itmArray_mov + i * ic * 8 * 12; __asm__ __volatile__( "ldp q0, q1, [%[in_0]]\n" @@ -220,48 +228,40 @@ EE convolution_winograd_V8(TensorDesc inputDesc, F32* inArray, "stp q7, q15, [%[pack], 336]\n" "str q23, [%[pack], #368]\n" : - :[pack]"r"(itm), - [in_0]"r"(Iw[0][i]), - [in_1]"r"(Iw[1][i]), - [in_2]"r"(Iw[2][i]), - [in_3]"r"(Iw[3][i]), - [in_4]"r"(Iw[4][i]), - [in_5]"r"(Iw[5][i]), - [in_6]"r"(Iw[6][i]), - [in_7]"r"(Iw[7][i]), - [in_8]"r"(Iw[8][i]), - [in_9]"r"(Iw[9][i]), - [in_10]"r"(Iw[10][i]), - [in_11]"r"(Iw[11][i]) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", - "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27" - ); + : [pack] "r"(itm), [in_0] "r"(Iw[0][i]), [in_1] "r"(Iw[1][i]), + [in_2] "r"(Iw[2][i]), [in_3] "r"(Iw[3][i]), [in_4] "r"(Iw[4][i]), + [in_5] "r"(Iw[5][i]), [in_6] "r"(Iw[6][i]), [in_7] "r"(Iw[7][i]), + [in_8] "r"(Iw[8][i]), [in_9] "r"(Iw[9][i]), [in_10] "r"(Iw[10][i]), + [in_11] "r"(Iw[11][i]) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); } } for (I32 o = 0; o < I32(oc); o++) { - const F32 *b_0 = biasArray + o*8; + const F32 *b_0 = biasArray + o * 8; const F32 *b_1 = b_0 + 4; // dot prod // (6*6)*C*c8*hw12 times O*(6*6)*C*c8*o8 = O*(6*6)*hw12*o8 for (U32 idx = 0; idx < 36; idx++) { - F32 *itm_0 
= itmArray + idx*12*ic*8; - const F32 *f_o0c0 = filterArray + o*8*36*ic*8 + idx*8*ic*8; - F32 *out_o0hw0 = otmArray + idx*12*8; + F32 *itm_0 = itmArray + idx * 12 * ic * 8; + const F32 *f_o0c0 = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; + F32 *out_o0hw0 = otmArray + idx * 12 * 8; __asm__ __volatile__( - //give in address to x3 + // give in address to x3 "mov x3, %[in_0]\n" - //give f address to x0 + // give f address to x0 "mov x0, %[f_0]\n" "mov x2, %[ic]\n" - + "eor v5.16b, v5.16b, v5.16b\n" - "ldr q1, [%[in_0]]\n" //in_hw0 + "ldr q1, [%[in_0]]\n" // in_hw0 "eor v6.16b, v6.16b, v6.16b\n" "eor v7.16b, v7.16b, v7.16b\n" "eor v8.16b, v8.16b, v8.16b\n" - "ldr q0, [%[f_0]]\n" //f_o0c0 + "ldr q0, [%[f_0]]\n" // f_o0c0 "eor v9.16b, v9.16b, v9.16b\n" "eor v10.16b, v10.16b, v10.16b\n" "eor v11.16b, v11.16b, v11.16b\n" @@ -345,38 +345,39 @@ EE convolution_winograd_V8(TensorDesc inputDesc, F32* inArray, "str q26, [%[out_0], #336]\n" "str q27, [%[out_0], #352]\n" "str q28, [%[out_0], #368]\n" - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(itm_0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8) - :"memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", "x1", "x2", "x3" - ); + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(itm_0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", + "x1", "x2", "x3"); } // out trans // O*(6*6)*hw12*o8 => NOHWo8 for (U32 hw12 = 0; hw12 < 12; hw12++) { - U32 h = (hw+hw12) / tile_w; - U32 w = (hw+hw12) % tile_w; - F32 *out_0 = outArray + n*oc*oh*ow*8 + o*oh*ow*8 + h*4*ow*8 + w*4*8; + U32 h = (hw + hw12) / tile_w; + U32 w = (hw + hw12) % tile_w; + F32 *out_0 = outArray + n * oc * oh * ow * 8 + o * oh * ow * 8 + + h * 4 * ow * 8 + w * 4 * 8; F32 *Ow_0[36]; F32 *Ow_1[36]; F32 *O_0[16]; F32 *O_1[16]; for (U32 idx = 0; idx < 36; idx++) { - Ow_0[idx] = otmArray + idx*12*8 + hw12*8; + Ow_0[idx] = otmArray + idx * 12 * 8 + hw12 * 8; Ow_1[idx] = Ow_0[idx] + 4; } for (U32 i = 0; i < 4; ++i) { for (U32 j = 0; j < 4; ++j) { - O_0[i*4 + j] = out_0 + i*ow*8 + j*8; - O_1[i*4 + j] = O_0[i*4 + j] + 4; + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + O_1[i * 4 + j] = O_0[i * 4 + j] + 4; } } - CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); } } } @@ -390,8 +391,8 @@ EE convolution_winograd_V8(TensorDesc inputDesc, F32* inArray, // in trans // NCHWc8 => (6*6)*C*c8*hw8 for (U32 c = 0; c < ic; c++) { - F32 *inArray_pad_mov = inArray_pad + c*ih_pad*iw_pad*8; - F32 *itmArray_mov = itmArray + c*8*8; + F32 *inArray_pad_mov = inArray_pad + c * ih_pad * iw_pad * 8; + F32 *itmArray_mov = itmArray + c * 8 * 8; F32 *Iw_ptr0[36]; F32 *Iw_ptr1[36]; F32 Iw[8][36][8]; @@ -400,14 +401,16 @@ EE convolution_winograd_V8(TensorDesc inputDesc, F32* inArray, U32 h[8]; U32 w[8]; for (U32 index = 0; index < 8; 
index++) { - h[index] = ((hw + index) / tile_w) * 4; - w[index] = ((hw + index) % tile_w) * 4; + h[index] = ((hw + index) / tile_w) * 4; + w[index] = ((hw + index) % tile_w) * 4; } for (U32 i = 0; i < 6; i++) { for (U32 j = 0; j < 6; j++) { for (U32 index = 0; index < 8; index++) { - I0[index][i*6 + j] = inArray_pad_mov + (h[index] + i) * iw_pad * 8 + (w[index] + j) * 8; - I1[index][i*6 + j] = inArray_pad_mov + (h[index] + i) * iw_pad * 8 + (w[index] + j) * 8 + 4; + I0[index][i * 6 + j] = + inArray_pad_mov + (h[index] + i) * iw_pad * 8 + (w[index] + j) * 8; + I1[index][i * 6 + j] = inArray_pad_mov + (h[index] + i) * iw_pad * 8 + + (w[index] + j) * 8 + 4; } } } @@ -420,7 +423,7 @@ EE convolution_winograd_V8(TensorDesc inputDesc, F32* inArray, trans_I_4x4_3x3(Iw_ptr1, I1[index]); } for (U32 i = 0; i < 36; i++) { - F32* itm = itmArray_mov + i*ic*8*8; + F32 *itm = itmArray_mov + i * ic * 8 * 8; __asm__ __volatile__( "ldp q0, q1, [%[in_0]]\n" @@ -483,44 +486,37 @@ EE convolution_winograd_V8(TensorDesc inputDesc, F32* inArray, "stp q5, q13, [%[pack], #192]\n" "stp q7, q15, [%[pack], #224]\n" : - :[pack]"r"(itm), - [in_0]"r"(Iw[0][i]), - [in_1]"r"(Iw[1][i]), - [in_2]"r"(Iw[2][i]), - [in_3]"r"(Iw[3][i]), - [in_4]"r"(Iw[4][i]), - [in_5]"r"(Iw[5][i]), - [in_6]"r"(Iw[6][i]), - [in_7]"r"(Iw[7][i]) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", - "v12", "v13", "v14", "v15", "v24", "v25", "v26", "v27" - ); + : [pack] "r"(itm), [in_0] "r"(Iw[0][i]), [in_1] "r"(Iw[1][i]), + [in_2] "r"(Iw[2][i]), [in_3] "r"(Iw[3][i]), [in_4] "r"(Iw[4][i]), + [in_5] "r"(Iw[5][i]), [in_6] "r"(Iw[6][i]), [in_7] "r"(Iw[7][i]) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v24", "v25", "v26", "v27"); } } for (I32 o = 0; o < I32(oc); o++) { - const F32 *b_0 = biasArray + o*8; + const F32 *b_0 = biasArray + o * 8; const F32 *b_1 = b_0 + 4; // dot prod // (6*6)*C*c8*hw8 times O*(6*6)*C*c8*o8 = O*(6*6)*hw8*o8 for (U32 idx = 0; idx < 36; idx++) { - F32 *itm_0 = itmArray + idx*8*ic*8; - const F32 *f_o0c0 = filterArray + o*8*36*ic*8 + idx*8*ic*8; - F32 *out_o0hw0 = otmArray + idx*8*8; + F32 *itm_0 = itmArray + idx * 8 * ic * 8; + const F32 *f_o0c0 = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; + F32 *out_o0hw0 = otmArray + idx * 8 * 8; __asm__ __volatile__( - //give in address to x3 + // give in address to x3 "mov x3, %[in_0]\n" - //give f address to x0 + // give f address to x0 "mov x0, %[f_0]\n" "mov x2, %[ic]\n" - + "eor v5.16b, v5.16b, v5.16b\n" - "ldr q1, [%[in_0]]\n" //in_hw0 + "ldr q1, [%[in_0]]\n" // in_hw0 "eor v6.16b, v6.16b, v6.16b\n" "eor v7.16b, v7.16b, v7.16b\n" "eor v8.16b, v8.16b, v8.16b\n" - "ldr q0, [%[f_0]]\n" //f_o0c0 + "ldr q0, [%[f_0]]\n" // f_o0c0 "eor v9.16b, v9.16b, v9.16b\n" "eor v10.16b, v10.16b, v10.16b\n" "eor v11.16b, v11.16b, v11.16b\n" @@ -576,38 +572,38 @@ EE convolution_winograd_V8(TensorDesc inputDesc, F32* inArray, "str q18, [%[out_0], #208]\n" "str q19, [%[out_0], #224]\n" "str q20, [%[out_0], #240]\n" - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(itm_0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8) - :"memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", - "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v27", "v28", "v29", "x0", "x1", "x2", "x3" - ); + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(itm_0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", 
"v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v27", + "v28", "v29", "x0", "x1", "x2", "x3"); } // out trans // O*(6*6)*hw8*o8 => NOHWo8 for (U32 hw8 = 0; hw8 < 8; hw8++) { - U32 h = (hw+hw8) / tile_w; - U32 w = (hw+hw8) % tile_w; - F32 *out_0 = outArray + n*oc*oh*ow*8 + o*oh*ow*8 + h*4*ow*8 + w*4*8; + U32 h = (hw + hw8) / tile_w; + U32 w = (hw + hw8) % tile_w; + F32 *out_0 = outArray + n * oc * oh * ow * 8 + o * oh * ow * 8 + + h * 4 * ow * 8 + w * 4 * 8; F32 *Ow_0[36]; F32 *Ow_1[36]; F32 *O_0[16]; F32 *O_1[16]; for (U32 idx = 0; idx < 36; idx++) { - Ow_0[idx] = otmArray + idx*8*8 + hw8*8; + Ow_0[idx] = otmArray + idx * 8 * 8 + hw8 * 8; Ow_1[idx] = Ow_0[idx] + 4; } for (U32 i = 0; i < 4; ++i) { for (U32 j = 0; j < 4; ++j) { - O_0[i*4 + j] = out_0 + i*ow*8 + j*8; - O_1[i*4 + j] = O_0[i*4 + j] + 4; + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + O_1[i * 4 + j] = O_0[i * 4 + j] + 4; } } - CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); } } tiles_s += 8; @@ -619,8 +615,8 @@ EE convolution_winograd_V8(TensorDesc inputDesc, F32* inArray, // in trans // NCHWc8 => (6*6)*C*c8*hw4 for (U32 c = 0; c < ic; c++) { - F32 *inArray_pad_mov = inArray_pad + c*ih_pad*iw_pad*8; - F32 *itmArray_mov = itmArray + c*8*4; + F32 *inArray_pad_mov = inArray_pad + c * ih_pad * iw_pad * 8; + F32 *itmArray_mov = itmArray + c * 8 * 4; F32 *Iw_ptr0[36]; F32 *Iw_ptr1[36]; F32 Iw[4][36][8]; @@ -629,14 +625,16 @@ EE convolution_winograd_V8(TensorDesc inputDesc, F32* inArray, U32 h[4]; U32 w[4]; for (U32 index = 0; index < 4; index++) { - h[index] = ((hw + index) / tile_w) * 4; - w[index] = ((hw + index) % tile_w) * 4; + h[index] = ((hw + index) / tile_w) * 4; + w[index] = ((hw + index) % tile_w) * 4; } for (U32 i = 0; i < 6; i++) { for (U32 j = 0; j < 6; j++) { for (U32 index = 0; index < 4; index++) { - I0[index][i*6 + j] = inArray_pad_mov + (h[index] + i) * iw_pad * 8 + (w[index] + j) * 8; - I1[index][i*6 + j] = inArray_pad_mov + (h[index] + i) * iw_pad * 8 + (w[index] + j) * 8 + 4; + I0[index][i * 6 + j] = + inArray_pad_mov + (h[index] + i) * iw_pad * 8 + (w[index] + j) * 8; + I1[index][i * 6 + j] = inArray_pad_mov + (h[index] + i) * iw_pad * 8 + + (w[index] + j) * 8 + 4; } } } @@ -649,7 +647,7 @@ EE convolution_winograd_V8(TensorDesc inputDesc, F32* inArray, trans_I_4x4_3x3(Iw_ptr1, I1[index]); } for (U32 i = 0; i < 36; i++) { - F32* itm = itmArray_mov + i*ic*8*4; + F32 *itm = itmArray_mov + i * ic * 8 * 4; __asm__ __volatile__( "ldp q0, q4, [%[in_0]]\n" @@ -660,39 +658,35 @@ EE convolution_winograd_V8(TensorDesc inputDesc, F32* inArray, "st4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[pack]], #64\n" "st4 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[pack]]\n" : - :[pack]"r"(itm), - [in_0]"r"(Iw[0][i]), - [in_1]"r"(Iw[1][i]), - [in_2]"r"(Iw[2][i]), - [in_3]"r"(Iw[3][i]) - :"memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - ); + : [pack] "r"(itm), [in_0] "r"(Iw[0][i]), [in_1] "r"(Iw[1][i]), + [in_2] "r"(Iw[2][i]), [in_3] "r"(Iw[3][i]) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } } for (I32 o = 0; o < I32(oc); o++) { - const F32 *b_0 = biasArray + o*8; + const F32 *b_0 = 
biasArray + o * 8; const F32 *b_1 = b_0 + 4; // dot prod // (6*6)*C*c8*hw4 times O*(6*6)*C*c8*o8 = O*(6*6)*hw4*o8 for (U32 idx = 0; idx < 36; idx++) { - F32 *itm_0 = itmArray + idx*4*ic*8; - const F32 *f_o0c0 = filterArray + o*8*36*ic*8 + idx*8*ic*8; - F32 *out_o0hw0 = otmArray + idx*4*8; + F32 *itm_0 = itmArray + idx * 4 * ic * 8; + const F32 *f_o0c0 = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; + F32 *out_o0hw0 = otmArray + idx * 4 * 8; __asm__ __volatile__( - //give in address to x3 + // give in address to x3 "mov x3, %[in_0]\n" - //give f address to x0 + // give f address to x0 "mov x0, %[f_0]\n" "mov x2, %[ic]\n" - + "eor v5.16b, v5.16b, v5.16b\n" - "ldr q1, [%[in_0]]\n" //in_hw0 + "ldr q1, [%[in_0]]\n" // in_hw0 "eor v6.16b, v6.16b, v6.16b\n" "eor v7.16b, v7.16b, v7.16b\n" "eor v8.16b, v8.16b, v8.16b\n" - "ldr q0, [%[f_0]]\n" //f_o0c0 + "ldr q0, [%[f_0]]\n" // f_o0c0 "eor v9.16b, v9.16b, v9.16b\n" "eor v10.16b, v10.16b, v10.16b\n" "eor v11.16b, v11.16b, v11.16b\n" @@ -722,37 +716,37 @@ EE convolution_winograd_V8(TensorDesc inputDesc, F32* inArray, "str q10, [%[out_0], #80]\n" "str q11, [%[out_0], #96]\n" "str q12, [%[out_0], #112]\n" - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(itm_0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8) - :"memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", - "v12", "v27", "v28", "v29", "x0", "x1", "x2", "x3" - ); + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(itm_0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v27", "v28", "v29", "x0", "x1", "x2", "x3"); } // out trans // O*(6*6)*hw4*o8 => NOHWo8 for (U32 hw4 = 0; hw4 < 4; hw4++) { - U32 h = (hw+hw4) / tile_w; - U32 w = (hw+hw4) % tile_w; - F32 *out_0 = outArray + n*oc*oh*ow*8 + o*oh*ow*8 + h*4*ow*8 + w*4*8; + U32 h = (hw + hw4) / tile_w; + U32 w = (hw + hw4) % tile_w; + F32 *out_0 = outArray + n * oc * oh * ow * 8 + o * oh * ow * 8 + + h * 4 * ow * 8 + w * 4 * 8; F32 *Ow_0[36]; F32 *Ow_1[36]; F32 *O_0[16]; F32 *O_1[16]; for (U32 idx = 0; idx < 36; idx++) { - Ow_0[idx] = otmArray + idx*4*8 + hw4*8; + Ow_0[idx] = otmArray + idx * 4 * 8 + hw4 * 8; Ow_1[idx] = Ow_0[idx] + 4; } for (U32 i = 0; i < 4; ++i) { for (U32 j = 0; j < 4; ++j) { - O_0[i*4 + j] = out_0 + i*ow*8 + j*8; - O_1[i*4 + j] = O_0[i*4 + j] + 4; + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + O_1[i * 4 + j] = O_0[i * 4 + j] + 4; } } - CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); } } tiles_s += 4; @@ -763,19 +757,19 @@ EE convolution_winograd_V8(TensorDesc inputDesc, F32* inArray, // in trans // NCHWc8 => (6*6)*C*c8*hw1 for (U32 c = 0; c < ic; c++) { - F32 *inArray_pad_mov = inArray_pad + c*ih_pad*iw_pad*8; - F32 *itmArray_mov = itmArray + c*8; + F32 *inArray_pad_mov = inArray_pad + c * ih_pad * iw_pad * 8; + F32 *itmArray_mov = itmArray + c * 8; F32 *Iw_ptr0[36]; F32 *Iw_ptr1[36]; F32 Iw[36][8]; F32 *I0[36]; F32 *I1[36]; - U32 h = (hw / tile_w) * 4;; + U32 h = (hw / tile_w) * 4; U32 w = (hw % tile_w) * 4; for (U32 i = 0; i < 6; i++) { for (U32 j = 0; j < 6; j++) { - I0[i*6 + j] = inArray_pad_mov + (h + i) * 
iw_pad * 8 + (w + j) * 8; - I1[i*6 + j] = inArray_pad_mov + (h + i) * iw_pad * 8 + (w + j) * 8 + 4; + I0[i * 6 + j] = inArray_pad_mov + (h + i) * iw_pad * 8 + (w + j) * 8; + I1[i * 6 + j] = inArray_pad_mov + (h + i) * iw_pad * 8 + (w + j) * 8 + 4; } } for (U32 i = 0; i < 36; i++) { @@ -785,30 +779,30 @@ EE convolution_winograd_V8(TensorDesc inputDesc, F32* inArray, trans_I_4x4_3x3(Iw_ptr0, I0); trans_I_4x4_3x3(Iw_ptr1, I1); for (U32 i = 0; i < 36; i++) { - F32* itm = itmArray_mov + i*ic*8; - memcpy(itm, Iw[i], 8*bytesOf(idt)); + F32 *itm = itmArray_mov + i * ic * 8; + memcpy(itm, Iw[i], 8 * bytesOf(idt)); } } for (I32 o = 0; o < I32(oc); o++) { - const F32 *b_0 = biasArray + o*8; + const F32 *b_0 = biasArray + o * 8; const F32 *b_1 = b_0 + 4; // dot prod // (6*6)*C*c8*hw1 times O*(6*6)*C*c8*o8 = O*(6*6)*hw1*o8 for (U32 idx = 0; idx < 36; idx++) { - F32 *itm_0 = itmArray + idx*ic*8; - const F32 *f_o0c0 = filterArray + o*8*36*ic*8 + idx*8*ic*8; - F32 *out_o0hw0 = otmArray + idx*8; + F32 *itm_0 = itmArray + idx * ic * 8; + const F32 *f_o0c0 = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; + F32 *out_o0hw0 = otmArray + idx * 8; __asm__ __volatile__( - "ldr s1, [%[in_0]]\n" //in_hw0 - "ldp q0, q29, [%[f_0]]\n" //f_o0c0 - //give in address to x3 + "ldr s1, [%[in_0]]\n" // in_hw0 + "ldp q0, q29, [%[f_0]]\n" // f_o0c0 + // give in address to x3 "mov x3, %[in_0]\n" - //give f address to x0 + // give f address to x0 "mov x0, %[f_0]\n" "mov x2, %[ic]\n" - + "eor v5.16b, v5.16b, v5.16b\n" "eor v6.16b, v6.16b, v6.16b\n" "0:\n" @@ -817,7 +811,6 @@ EE convolution_winograd_V8(TensorDesc inputDesc, F32* inArray, "fmla v5.4s, v0.4s, v1.s[0]\n" "fmla v6.4s, v29.4s, v1.s[0]\n" - "ldr q0, [x0, #64]!\n" "subs x2, x2, #2\n" "ldr q29, [x0, #16]\n" @@ -827,42 +820,42 @@ EE convolution_winograd_V8(TensorDesc inputDesc, F32* inArray, "bne 0b\n" "str q5, [%[out_0]]\n" "str q6, [%[out_0], #16]\n" - :[out_0]"+r"(out_o0hw0), - [in_0]"+r"(itm_0), - [f_0]"+r"(f_o0c0) - :[ic]"r"((I64)ic*8) - :"memory", "cc", "v0", "v1", "v3", "v5", "v6", "v28", "v29", "v30", "x0", "x1", "x2", "x3" - ); + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(itm_0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8) + : "memory", "cc", "v0", "v1", "v3", "v5", "v6", "v28", "v29", "v30", "x0", + "x1", "x2", "x3"); } // out trans // O*(6*6)*hw1*o8 => NOHWo8 U32 h = hw / tile_w; U32 w = hw % tile_w; - F32 *out_0 = outArray + n*oc*oh*ow*8 + o*oh*ow*8 + h*4*ow*8 + w*4*8; + F32 *out_0 = + outArray + n * oc * oh * ow * 8 + o * oh * ow * 8 + h * 4 * ow * 8 + w * 4 * 8; F32 *Ow_0[36]; F32 *Ow_1[36]; F32 *O_0[16]; F32 *O_1[16]; for (U32 idx = 0; idx < 36; idx++) { - Ow_0[idx] = otmArray + idx*8; + Ow_0[idx] = otmArray + idx * 8; Ow_1[idx] = Ow_0[idx] + 4; } for (U32 i = 0; i < 4; ++i) { for (U32 j = 0; j < 4; ++j) { - O_0[i*4 + j] = out_0 + i*ow*8 + j*8; - O_1[i*4 + j] = O_0[i*4 + j] + 4; + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + O_1[i * 4 + j] = O_0[i * 4 + j] + 4; } } - CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); - CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, tile_h-1, tile_w-1, activationDesc)); + CHECK_STATUS(trans_O_4x4_3x3(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); + CHECK_STATUS(trans_O_4x4_3x3(Ow_1, O_1, b_1, h, w, pad_h_mod_4, pad_w_mod_4, + tile_h - 1, tile_w - 1, activationDesc)); } } } return ret; #else // TODO - std::cerr << "[ERROR] currently not support ARMv7 convolution winograd" << std::endl;
#endif
}
diff --git a/tensor_computing/src/cpu/arm/fp32/convolution_winograd_transform.h b/compute/tensor/src/cpu/arm/fp32/convolution_winograd_transform.h
--- a/tensor_computing/src/cpu/arm/fp32/convolution_winograd_transform.h
+++ b/compute/tensor/src/cpu/arm/fp32/convolution_winograd_transform.h
 #include <arm_neon.h> #include 
"cpu/arm/fp32/arm_functions_fp32.h" -inline void trans_W_4x4_3x3(float* WTM[36], float* W[9]) +inline void trans_W_4x4_3x3(float *WTM[36], float *W[9]) { float T[6][3][4]; @@ -32,9 +31,9 @@ inline void trans_W_4x4_3x3(float* WTM[36], float* W[9]) float32x4_t v_025 = vmovq_n_f32(0.25f); for (int i = 0; i < 3; i++) { - float32x4_t v_W0 = vld1q_f32(W[0*3+i]); - float32x4_t v_W1 = vld1q_f32(W[1*3+i]); - float32x4_t v_W2 = vld1q_f32(W[2*3+i]); + float32x4_t v_W0 = vld1q_f32(W[0 * 3 + i]); + float32x4_t v_W1 = vld1q_f32(W[1 * 3 + i]); + float32x4_t v_W2 = vld1q_f32(W[2 * 3 + i]); float32x4_t v_t0 = vmulq_f32(v_01666, v_W2); float32x4_t v_t1 = vsubq_f32(vmulq_f32(v_minus_01666, v_W0), v_t0); @@ -68,17 +67,25 @@ inline void trans_W_4x4_3x3(float* WTM[36], float* W[9]) float32x4_t v_WTM3 = vfmaq_f32(v_t2, v_00833, v_T1); float32x4_t v_WTM4 = vfmaq_f32(v_t2, v_minus_00833, v_T1); - vst1q_f32(WTM[i*6+0], v_WTM0); - vst1q_f32(WTM[i*6+1], v_WTM1); - vst1q_f32(WTM[i*6+2], v_WTM2); - vst1q_f32(WTM[i*6+3], v_WTM3); - vst1q_f32(WTM[i*6+4], v_WTM4); - vst1q_f32(WTM[i*6+5], v_T2); + vst1q_f32(WTM[i * 6 + 0], v_WTM0); + vst1q_f32(WTM[i * 6 + 1], v_WTM1); + vst1q_f32(WTM[i * 6 + 2], v_WTM2); + vst1q_f32(WTM[i * 6 + 3], v_WTM3); + vst1q_f32(WTM[i * 6 + 4], v_WTM4); + vst1q_f32(WTM[i * 6 + 5], v_T2); } } -inline EE trans_O_4x4_3x3(float* OTM[36], float* O[16], const float* bias, - U32 h, U32 w, U32 _pad_h_mod_4, U32 _pad_w_mod_4, U32 oh, U32 ow, ActivationDesc activationDesc) +inline EE trans_O_4x4_3x3(float *OTM[36], + float *O[16], + const float *bias, + U32 h, + U32 w, + U32 _pad_h_mod_4, + U32 _pad_w_mod_4, + U32 oh, + U32 ow, + ActivationParamSpec activationDesc) { float T[4][6][4]; // bias @@ -91,11 +98,11 @@ inline EE trans_O_4x4_3x3(float* OTM[36], float* O[16], const float* bias, for (int i = 0; i < 6; i++) { float32x4_t v_OTM0 = vld1q_f32(OTM[i]); - float32x4_t v_OTM1 = vld1q_f32(OTM[1*6+i]); - float32x4_t v_OTM2 = vld1q_f32(OTM[2*6+i]); - float32x4_t v_OTM3 = vld1q_f32(OTM[3*6+i]); - float32x4_t v_OTM4 = vld1q_f32(OTM[4*6+i]); - float32x4_t v_OTM5 = vld1q_f32(OTM[5*6+i]); + float32x4_t v_OTM1 = vld1q_f32(OTM[1 * 6 + i]); + float32x4_t v_OTM2 = vld1q_f32(OTM[2 * 6 + i]); + float32x4_t v_OTM3 = vld1q_f32(OTM[3 * 6 + i]); + float32x4_t v_OTM4 = vld1q_f32(OTM[4 * 6 + i]); + float32x4_t v_OTM5 = vld1q_f32(OTM[5 * 6 + i]); float32x4_t v_t0 = vaddq_f32(v_OTM1, v_OTM2); float32x4_t v_t1 = vaddq_f32(v_OTM3, v_OTM4); @@ -144,37 +151,37 @@ inline EE trans_O_4x4_3x3(float* OTM[36], float* O[16], const float* bias, switch (activationDesc.mode) { case ACTIVATION_NULL: { if (pad_w_mod_4 == 0) { - vst1q_f32(O[i*4+0], vaddq_f32(v_O0, v_b)); - vst1q_f32(O[i*4+1], vaddq_f32(v_O1, v_b)); - vst1q_f32(O[i*4+2], vaddq_f32(v_O2, v_b)); - vst1q_f32(O[i*4+3], vaddq_f32(v_O3, v_b)); + vst1q_f32(O[i * 4 + 0], vaddq_f32(v_O0, v_b)); + vst1q_f32(O[i * 4 + 1], vaddq_f32(v_O1, v_b)); + vst1q_f32(O[i * 4 + 2], vaddq_f32(v_O2, v_b)); + vst1q_f32(O[i * 4 + 3], vaddq_f32(v_O3, v_b)); } else if (pad_w_mod_4 == 1) { - vst1q_f32(O[i*4+0], vaddq_f32(v_O0, v_b)); - vst1q_f32(O[i*4+1], vaddq_f32(v_O1, v_b)); - vst1q_f32(O[i*4+2], vaddq_f32(v_O2, v_b)); + vst1q_f32(O[i * 4 + 0], vaddq_f32(v_O0, v_b)); + vst1q_f32(O[i * 4 + 1], vaddq_f32(v_O1, v_b)); + vst1q_f32(O[i * 4 + 2], vaddq_f32(v_O2, v_b)); } else if (pad_w_mod_4 == 2) { - vst1q_f32(O[i*4+0], vaddq_f32(v_O0, v_b)); - vst1q_f32(O[i*4+1], vaddq_f32(v_O1, v_b)); + vst1q_f32(O[i * 4 + 0], vaddq_f32(v_O0, v_b)); + vst1q_f32(O[i * 4 + 1], vaddq_f32(v_O1, v_b)); } else if (pad_w_mod_4 == 3) { 
- vst1q_f32(O[i*4+0], vaddq_f32(v_O0, v_b)); + vst1q_f32(O[i * 4 + 0], vaddq_f32(v_O0, v_b)); } break; } case ACTIVATION_RELU: { if (pad_w_mod_4 == 0) { - vst1q_f32(O[i*4+0], vmaxq_f32(vaddq_f32(v_O0, v_b), v_0)); - vst1q_f32(O[i*4+1], vmaxq_f32(vaddq_f32(v_O1, v_b), v_0)); - vst1q_f32(O[i*4+2], vmaxq_f32(vaddq_f32(v_O2, v_b), v_0)); - vst1q_f32(O[i*4+3], vmaxq_f32(vaddq_f32(v_O3, v_b), v_0)); + vst1q_f32(O[i * 4 + 0], vmaxq_f32(vaddq_f32(v_O0, v_b), v_0)); + vst1q_f32(O[i * 4 + 1], vmaxq_f32(vaddq_f32(v_O1, v_b), v_0)); + vst1q_f32(O[i * 4 + 2], vmaxq_f32(vaddq_f32(v_O2, v_b), v_0)); + vst1q_f32(O[i * 4 + 3], vmaxq_f32(vaddq_f32(v_O3, v_b), v_0)); } else if (pad_w_mod_4 == 1) { - vst1q_f32(O[i*4+0], vmaxq_f32(vaddq_f32(v_O0, v_b), v_0)); - vst1q_f32(O[i*4+1], vmaxq_f32(vaddq_f32(v_O1, v_b), v_0)); - vst1q_f32(O[i*4+2], vmaxq_f32(vaddq_f32(v_O2, v_b), v_0)); + vst1q_f32(O[i * 4 + 0], vmaxq_f32(vaddq_f32(v_O0, v_b), v_0)); + vst1q_f32(O[i * 4 + 1], vmaxq_f32(vaddq_f32(v_O1, v_b), v_0)); + vst1q_f32(O[i * 4 + 2], vmaxq_f32(vaddq_f32(v_O2, v_b), v_0)); } else if (pad_w_mod_4 == 2) { - vst1q_f32(O[i*4+0], vmaxq_f32(vaddq_f32(v_O0, v_b), v_0)); - vst1q_f32(O[i*4+1], vmaxq_f32(vaddq_f32(v_O1, v_b), v_0)); + vst1q_f32(O[i * 4 + 0], vmaxq_f32(vaddq_f32(v_O0, v_b), v_0)); + vst1q_f32(O[i * 4 + 1], vmaxq_f32(vaddq_f32(v_O1, v_b), v_0)); } else if (pad_w_mod_4 == 3) { - vst1q_f32(O[i*4+0], vmaxq_f32(vaddq_f32(v_O0, v_b), v_0)); + vst1q_f32(O[i * 4 + 0], vmaxq_f32(vaddq_f32(v_O0, v_b), v_0)); } break; } @@ -185,7 +192,7 @@ inline EE trans_O_4x4_3x3(float* OTM[36], float* O[16], const float* bias, return SUCCESS; } -inline void trans_I_4x4_3x3(float* ITM[36], float* I[36]) +inline void trans_I_4x4_3x3(float *ITM[36], float *I[36]) { float T[6][6][4]; @@ -195,12 +202,12 @@ inline void trans_I_4x4_3x3(float* ITM[36], float* I[36]) float32x4_t v_minus_5 = vmovq_n_f32(-5); for (int i = 0; i < 6; i++) { - float32x4_t v_I0 = vld1q_f32(I[0*6+i]); - float32x4_t v_I1 = vld1q_f32(I[1*6+i]); - float32x4_t v_I2 = vld1q_f32(I[2*6+i]); - float32x4_t v_I3 = vld1q_f32(I[3*6+i]); - float32x4_t v_I4 = vld1q_f32(I[4*6+i]); - float32x4_t v_I5 = vld1q_f32(I[5*6+i]); + float32x4_t v_I0 = vld1q_f32(I[0 * 6 + i]); + float32x4_t v_I1 = vld1q_f32(I[1 * 6 + i]); + float32x4_t v_I2 = vld1q_f32(I[2 * 6 + i]); + float32x4_t v_I3 = vld1q_f32(I[3 * 6 + i]); + float32x4_t v_I4 = vld1q_f32(I[4 * 6 + i]); + float32x4_t v_I5 = vld1q_f32(I[5 * 6 + i]); float32x4_t v_t0 = vfmaq_f32(v_I4, v_I2, v_minus_4); float32x4_t v_t1 = vfmaq_f32(v_I3, v_I1, v_minus_4); @@ -245,13 +252,13 @@ inline void trans_I_4x4_3x3(float* ITM[36], float* I[36]) float32x4_t v_ITM3 = vaddq_f32(v_t3, v_t2); float32x4_t v_ITM4 = vsubq_f32(v_t2, v_t3); float32x4_t v_ITM5 = vfmaq_f32(v_t5, v_T3, v_minus_5); - - vst1q_f32(ITM[i*6+0], v_ITM0); - vst1q_f32(ITM[i*6+1], v_ITM1); - vst1q_f32(ITM[i*6+2], v_ITM2); - vst1q_f32(ITM[i*6+3], v_ITM3); - vst1q_f32(ITM[i*6+4], v_ITM4); - vst1q_f32(ITM[i*6+5], v_ITM5); + + vst1q_f32(ITM[i * 6 + 0], v_ITM0); + vst1q_f32(ITM[i * 6 + 1], v_ITM1); + vst1q_f32(ITM[i * 6 + 2], v_ITM2); + vst1q_f32(ITM[i * 6 + 3], v_ITM3); + vst1q_f32(ITM[i * 6 + 4], v_ITM4); + vst1q_f32(ITM[i * 6 + 5], v_ITM5); } } #endif diff --git a/compute/tensor/src/cpu/arm/fp32/deconvolution_transform.cpp b/compute/tensor/src/cpu/arm/fp32/deconvolution_transform.cpp new file mode 100644 index 00000000..4ed5962a --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/deconvolution_transform.cpp @@ -0,0 +1,89 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. 
All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/transform_functions.h" +#include "cpu/arm/fp32/tensor_computing_fp32.h" + +inline EE deconvolution_transform_filter_kernel_fp32(TensorDesc filterDesc, + const F32 *filterArray, + TensorDesc *ftmDesc, + F32 *ftmArray, + DataFormat ftmDataFormat) +{ + if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) { + CHECK_STATUS(NULL_POINTER); + } + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + if (fdf == ftmDataFormat) { + *ftmDesc = filterDesc; + memcpy(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); + return SUCCESS; + } + if (fdf != DF_NCHW) { + CHECK_STATUS(NOT_SUPPORTED); + } + EE ret = SUCCESS; + switch (ftmDataFormat) { + case DF_NHWCN8: { + *ftmDesc = tensor4df(fdt, ftmDataFormat, fc, fn, fh, fw); + transformCNHWToNHWCNx(filterDesc, filterArray, *ftmDesc, ftmArray); + break; + } + case DF_HWNCN8: { + *ftmDesc = tensor4df(fdt, ftmDataFormat, fc, fn, 6, 6); + transformCNHWToHWNCNx(filterDesc, filterArray, *ftmDesc, ftmArray); + break; + } + case DF_NCHWC8: { + *ftmDesc = tensor4df(fdt, DF_NCHWC8, fn, fc, fh, fw); + transformCNHWToNCHWC8(filterDesc, filterArray, *ftmDesc, ftmArray); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE deconvolution_transform_filter_fp32(TensorDesc filterDesc, + const F32 *filter, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + F32 *filterTransformed) +{ + DataFormat ftmDataFormat; + switch (algorithm) { + case CONVOLUTION_ALGORITHM_GEMM: + ftmDataFormat = DF_NHWCN8; + break; + case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: + ftmDataFormat = DF_NHWCN8; + break; + case CONVOLUTION_ALGORITHM_WINOGRAD: + ftmDataFormat = DF_HWNCN8; + break; + case CONVOLUTION_ALGORITHM_GROUP_DECONV: + ftmDataFormat = DF_NCHWC8; + break; + default: + return NOT_MATCH; + } + EE ret = deconvolution_transform_filter_kernel_fp32( + filterDesc, filter, ftmDesc, filterTransformed, ftmDataFormat); + CHECK_STATUS(ret); + return ret; +} diff --git a/compute/tensor/src/cpu/arm/fp32/depthwise_pointwise_convolution.cpp b/compute/tensor/src/cpu/arm/fp32/depthwise_pointwise_convolution.cpp new file mode 100644 index 00000000..530383ae --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/depthwise_pointwise_convolution.cpp @@ -0,0 +1,75 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
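The dispatcher defined in this new file does no arithmetic of its own: it validates the tensor descriptors and forwards to depthwise_pointwise_convolution_direct_V8 on AArch64 or depthwise_pointwise_convolution_direct_V7 on ARMv7. As orientation for what those kernels compute, here is a minimal scalar sketch of a fused depthwise-then-pointwise convolution over one spatial axis; it ignores stride, padding, dilation, activation, and the NCHWC8 tiling of the real code, and every name in it is illustrative rather than part of the patch:

    #include <cstddef>
    #include <vector>

    // Depthwise stage: each channel c is filtered with its own K-tap kernel.
    // Pointwise stage: a 1x1 convolution mixes the depthwise outputs across channels.
    std::vector<std::vector<float>> depthwise_pointwise_ref(
        const std::vector<std::vector<float>> &in,      // [C][W] input
        const std::vector<std::vector<float>> &dwFilt,  // [C][K] depthwise filters
        const std::vector<float> &dwBias,               // [C] depthwise bias
        const std::vector<std::vector<float>> &pwFilt,  // [O][C] pointwise filters
        const std::vector<float> &pwBias)               // [O] pointwise bias
    {
        const size_t C = in.size(), W = in[0].size(), K = dwFilt[0].size();
        const size_t OW = W - K + 1, O = pwFilt.size();
        std::vector<std::vector<float>> mid(C, std::vector<float>(OW));
        for (size_t c = 0; c < C; c++) {  // depthwise: no cross-channel sum
            for (size_t p = 0; p < OW; p++) {
                float acc = dwBias[c];
                for (size_t k = 0; k < K; k++) {
                    acc += in[c][p + k] * dwFilt[c][k];
                }
                mid[c][p] = acc;
            }
        }
        std::vector<std::vector<float>> out(O, std::vector<float>(OW));
        for (size_t o = 0; o < O; o++) {  // pointwise: 1x1 conv over channels
            for (size_t p = 0; p < OW; p++) {
                float acc = pwBias[o];
                for (size_t c = 0; c < C; c++) {
                    acc += mid[c][p] * pwFilt[o][c];
                }
                out[o][p] = acc;
            }
        }
        return out;
    }

The real kernels fuse the two stages so the depthwise result for a block of output pixels is repacked in registers and consumed by the pointwise GEMM without a round trip to memory.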
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/fp32/tensor_computing_fp32.h" +#include "cpu/arm/fp32/depthwise_pointwise_convolution.h" + +EE depthwise_pointwise_convolution_fp32(TensorDesc inputDesc, + F32 *input, + TensorDesc dwFilterDesc, + const F32 *dwFilter, + TensorDesc pwFilterDesc, + const F32 *pwFilter, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc dwBiasDesc, + const F32 *dwBias, + TensorDesc pwBiasDesc, + const F32 *pwBias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *output, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch) +{ + UNUSED(arch); + if (nullptr == input || nullptr == dwFilter || nullptr == output || nullptr == dwBias || + nullptr == tmp) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (!(idt == DT_F32 && fdt == DT_F32 && odt == DT_F32)) { + CHECK_STATUS(NOT_MATCH); + } + if (!(idf == DF_NCHWC8 && odf == DF_NCHWC8)) { + CHECK_STATUS(NOT_MATCH); + } + if (!(ic == fc)) { + CHECK_STATUS(NOT_MATCH); + } + + EE ret = NOT_MATCH; + if (algorithm == DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT || + algorithm == DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT) { +#ifdef __aarch64__ + ret = depthwise_pointwise_convolution_direct_V8(inputDesc, input, dwFilterDesc, dwFilter, + pwFilterDesc, pwFilter, convParamSpec, dwBiasDesc, dwBias, pwBiasDesc, pwBias, tmpBytes, + tmp, outputDesc, output, depthwiseActivationParamSpec, pointwiseActivationParamSpec); +#else + ret = depthwise_pointwise_convolution_direct_V7(inputDesc, input, dwFilterDesc, dwFilter, + pwFilterDesc, pwFilter, convParamSpec, dwBiasDesc, dwBias, pwBiasDesc, pwBias, tmpBytes, + tmp, outputDesc, output, depthwiseActivationParamSpec, pointwiseActivationParamSpec); +#endif + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/fp32/depthwise_pointwise_convolution.h b/compute/tensor/src/cpu/arm/fp32/depthwise_pointwise_convolution.h new file mode 100644 index 00000000..f27d0db3 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/depthwise_pointwise_convolution.h @@ -0,0 +1,59 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. 
All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_DEPTHWISE_POINTWISE_CONVOLUTION +#define _H_DEPTHWISE_POINTWISE_CONVOLUTION + +#include "sys.h" +#include "tensor_desc.h" +#include "types.h" + +#ifdef __aarch64__ +EE depthwise_pointwise_convolution_direct_V8(TensorDesc inputDesc, + F32 *inArray, + TensorDesc dwFilterDesc, + const F32 *dwFilterArray, + TensorDesc pwFilterDesc, + const F32 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F32 *dwBiasArray, + TensorDesc pwBiasDesc, + const F32 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec); +#else +EE depthwise_pointwise_convolution_direct_V7(TensorDesc inputDesc, + F32 *inArray, + TensorDesc dwFilterDesc, + const F32 *dwFilterArray, + TensorDesc pwFilterDesc, + const F32 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F32 *dwBiasArray, + TensorDesc pwBiasDesc, + const F32 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec); +#endif + +#endif diff --git a/compute/tensor/src/cpu/arm/fp32/depthwise_pointwise_convolution_direct_V7.cpp b/compute/tensor/src/cpu/arm/fp32/depthwise_pointwise_convolution_direct_V7.cpp new file mode 100644 index 00000000..4848caba --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/depthwise_pointwise_convolution_direct_V7.cpp @@ -0,0 +1,699 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
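The ARMv7 kernel defined in this file evaluates activations inline in NEON assembly. Its ACTIVATION_H_SWISH branch computes hswish(x) = x * relu6(x + 3) / 6, approximating the division by six with a vrecpe reciprocal estimate refined by one vrecps Newton-Raphson step. A sketch of the same computation with NEON intrinsics, assuming an ARM target (the helper name is illustrative):

    #include <arm_neon.h>

    // hswish(x) = x * min(max(x + 3, 0), 6) / 6, applied lane-wise.
    static inline float32x4_t hswish_sketch(float32x4_t x)
    {
        float32x4_t three = vmovq_n_f32(3.0f);
        float32x4_t six = vmovq_n_f32(6.0f);
        float32x4_t zero = vmovq_n_f32(0.0f);
        // relu6(x + 3)
        float32x4_t t = vminq_f32(vmaxq_f32(vaddq_f32(x, three), zero), six);
        // 1/6 via reciprocal estimate plus one Newton-Raphson refinement,
        // mirroring the vrecpe.f32 / vrecps.f32 / vmul.f32 sequence below.
        float32x4_t r = vrecpeq_f32(six);
        r = vmulq_f32(vrecpsq_f32(six, r), r);
        return vmulq_f32(vmulq_f32(x, t), r);
    }

In the assembly below the refinement consumes the register holding 6.0 (q14), which is why q14 is reloaded with the constant before the second half of the h-swish sequence.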
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef __aarch64__ +#include "cpu/arm/fp32/depthwise_pointwise_convolution.h" + +EE depthwise_pointwise_convolution_direct_V7(TensorDesc inputDesc, + F32 *inArray, + TensorDesc dwFilterDesc, + const F32 *dwFilterArray, + TensorDesc pwFilterDesc, + const F32 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F32 *dwBiasArray, + TensorDesc pwBiasDesc, + const F32 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec) +{ + UNUSED(dwBiasDesc); + UNUSED(pwBiasDesc); + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + + if (dwFilterDesc.df != DF_NCHWC8) { + CHECK_STATUS(NOT_MATCH); + } + if (pwFilterArray != nullptr && pwFilterDesc.df != DF_NHWCN8) { + CHECK_STATUS(NOT_MATCH); + } + + oc /= 8; + ic /= 8; + + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + U32 ihiw = ih * iw; + I32 ohow = oh * ow; + F32 *pwArray = (F32 *)tmp + ic * ih_pad * iw_pad * 8; + + for (U32 n = 0; n < in; n++) { + // copy input into a input with padding + F32 *inArray_pad = (F32 *)tmp; + F32 *inArray_pad_mov = inArray_pad; + F32 *inArray_mov = inArray + n * ic * ihiw * 8; + for (U32 c = 0; c < ic; c++) { + if (paddingT > 0) { + memset(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(fdt)); + inArray_pad_mov += paddingT * iw_pad * 8; + } + for (U32 h = paddingT; h < ih_pad - paddingB; h++) { + memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(fdt)); + inArray_pad_mov += paddingL * 8; + memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(fdt)); + inArray_pad_mov += iw * 8; + inArray_mov += iw * 8; + memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(fdt)); + inArray_pad_mov += paddingR * 8; + } + if (paddingB > 0) { + memset(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(fdt)); + inArray_pad_mov += paddingB * iw_pad * 8; + } + + const F32 *b = dwBiasArray + c * 8; + F32 *in_pad = inArray_pad + c * ih_pad * iw_pad * 8; + const F32 *f = dwFilterArray + c * fh * fw * 8; + // ohow / 4 + for (I32 hw = 0; hw < ohow - 3; hw += 4) { + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + U32 in_h_1 = (hw + 1) / ow * strideH; + U32 in_w_1 = (hw + 1) % ow * strideW; + U32 in_h_2 = (hw + 2) / ow * strideH; + U32 in_w_2 = (hw + 2) % ow * strideW; + U32 in_h_3 = (hw + 3) / ow * strideH; + U32 in_w_3 = (hw + 3) % ow * strideW; + + __asm__ __volatile__( + "vld1.f32 {d0-d3}, [%[b]]\n" + "vmov.f32 q2, q0\n" + "vmov.f32 q3, q1\n" + "vmov.f32 q4, q0\n" + "vmov.f32 q5, q1\n" + "vmov.f32 q6, q0\n" + 
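+ // vld1 filled q0/q1 with the 8 depthwise bias values; these vmov copies seed q2..q7 so all four output pixels accumulate from the bias.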
"vmov.f32 q7, q1\n" + : + : [b] "r"(b) + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const F32 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + F32 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + F32 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F32 *in_1 = in_idx + in_h_1 * iw_pad * 8 + in_w_1 * 8; + F32 *in_2 = in_idx + in_h_2 * iw_pad * 8 + in_w_2 * 8; + F32 *in_3 = in_idx + in_h_3 * iw_pad * 8 + in_w_3 * 8; + + __asm__ __volatile__("vld1.f32 {d28-d31}, [%[f0]]\n" + "vld1.f32 {d16-d19}, [%[in0]]\n" + "vld1.f32 {d20-d23}, [%[in1]]\n" + "vld1.f32 {d24-d27}, [%[in2]]\n" + + "vmla.f32 q0, q8, q14\n" + "vmla.f32 q1, q9, q15\n" + "vld1.f32 {d16-d19}, [%[in3]]\n" + "vmla.f32 q2, q10, q14\n" + "vmla.f32 q3, q11, q15\n" + "vmla.f32 q4, q12, q14\n" + "vmla.f32 q5, q13, q15\n" + "vmla.f32 q6, q8, q14\n" + "vmla.f32 q7, q9, q15\n" + : + : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), + [in3] "r"(in_3), [f0] "r"(f_0) + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("veor q15, q15, q15\n" // zero + "vmax.f32 q0, q0, q15\n" + "vmax.f32 q1, q1, q15\n" + "vmax.f32 q2, q2, q15\n" + "vmax.f32 q3, q3, q15\n" + "vmax.f32 q4, q4, q15\n" + "vmax.f32 q5, q5, q15\n" + "vmax.f32 q6, q6, q15\n" + "vmax.f32 q7, q7, q15\n" + : + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q15"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("veor q15, q15, q15\n" // zero + "vmov.f32 q14, #6.0\n" // six + "vmax.f32 q0, q0, q15\n" + "vmax.f32 q1, q1, q15\n" + "vmax.f32 q2, q2, q15\n" + "vmax.f32 q3, q3, q15\n" + "vmax.f32 q4, q4, q15\n" + "vmax.f32 q5, q5, q15\n" + "vmax.f32 q6, q6, q15\n" + "vmax.f32 q7, q7, q15\n" + + "vmin.f32 q0, q0, q14\n" + "vmin.f32 q1, q1, q14\n" + "vmin.f32 q2, q2, q14\n" + "vmin.f32 q3, q3, q14\n" + "vmin.f32 q4, q4, q14\n" + "vmin.f32 q5, q5, q14\n" + "vmin.f32 q6, q6, q14\n" + "vmin.f32 q7, q7, q14\n" + : + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q14", "q15"); + break; + } + case ACTIVATION_H_SWISH: { + __asm__ __volatile__("vmov.f32 q13, #3.0\n" // three + "vmov.f32 q14, #6.0\n" // six + "veor q15, q15, q15\n" // zero + "vadd.f32 q8, q0, q13\n" + "vadd.f32 q9, q1, q13\n" + "vadd.f32 q10, q2, q13\n" + "vadd.f32 q11, q3, q13\n" + "vmax.f32 q8, q8, q15\n" + "vmax.f32 q9, q9, q15\n" + "vmax.f32 q10, q10, q15\n" + "vmax.f32 q11, q11, q15\n" + "vmin.f32 q8, q8, q14\n" + "vmin.f32 q9, q9, q14\n" + "vmin.f32 q10, q10, q14\n" + "vmin.f32 q11, q11, q14\n" + "vrecpe.f32 q12, q14\n" + "vrecps.f32 q14, q14, q12\n" + "vmul.f32 q12, q14, q12\n" + "vmul.f32 q8, q8, q12\n" + "vmul.f32 q9, q9, q12\n" + "vmul.f32 q10, q10, q12\n" + "vmul.f32 q11, q11, q12\n" + "vmov.f32 q14, #6.0\n" // six + "vmul.f32 q0, q0, q8\n" + "vmul.f32 q1, q1, q9\n" + "vmul.f32 q2, q2, q10\n" + "vmul.f32 q3, q3, q11\n" + + "vadd.f32 q8, q4, q13\n" + "vadd.f32 q9, q5, q13\n" + "vadd.f32 q10, q6, q13\n" + "vadd.f32 q11, q7, q13\n" + "vmax.f32 q8, q8, q15\n" + "vmax.f32 q9, q9, q15\n" + "vmax.f32 q10, q10, q15\n" + "vmax.f32 q11, q11, q15\n" + "vmin.f32 q8, q8, q14\n" + "vmin.f32 q9, q9, q14\n" + "vmin.f32 q10, q10, q14\n" + "vmin.f32 q11, q11, q14\n" + "vrecpe.f32 q12, q14\n" + "vrecps.f32 
q14, q14, q12\n" + "vmul.f32 q12, q14, q12\n" + "vmul.f32 q8, q8, q12\n" + "vmul.f32 q9, q9, q12\n" + "vmul.f32 q10, q10, q12\n" + "vmul.f32 q11, q11, q12\n" + "vmul.f32 q4, q4, q8\n" + "vmul.f32 q5, q5, q9\n" + "vmul.f32 q6, q6, q10\n" + "vmul.f32 q7, q7, q11\n" + : + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15"); + break; + } + default: + return NOT_SUPPORTED; + } + + if (pwFilterArray != nullptr) { + F32 *pw_pack_0 = pwArray + hw * ic * 8 + c * 4 * 8; + __asm__ __volatile__( + "vzip.32 q0, q4\n" + "vzip.32 q2, q6\n" + "vzip.32 q1, q5\n" + "vzip.32 q3, q7\n" + + "vzip.32 q0, q2\n" + "vzip.32 q4, q6\n" + "vzip.32 q1, q3\n" + "vzip.32 q5, q7\n" + + "vst1.f32 {q0}, [%[pw0]]!\n" + "vst1.f32 {q2}, [%[pw0]]!\n" + "vst1.f32 {q4}, [%[pw0]]!\n" + "vst1.f32 {q6}, [%[pw0]]!\n" + "vst1.f32 {q1}, [%[pw0]]!\n" + "vst1.f32 {q3}, [%[pw0]]!\n" + "vst1.f32 {q5}, [%[pw0]]!\n" + "vst1.f32 {q7}, [%[pw0]]!\n" + : [pw0] "+r"(pw_pack_0) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); + } else { + F32 *out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; + __asm__ __volatile__( + "vstm %[out], {d0-d15}\n" + : [out] "+r"(out_ptr) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); + } + } + + // ohow_reminder % 4 + U32 ohow_s = (ohow / 4) * 4; + for (I32 hw = ohow_s; hw < ohow; hw++) { + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + + __asm__ __volatile__("vld1.f32 {d0-d3}, [%[b]]\n" + : + : [b] "r"(b) + : "memory", "cc", "q0", "q1"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const F32 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + F32 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + F32 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + __asm__ __volatile__( + "vld1.f32 {d28-d31}, [%[f0]]\n" + "vld1.f32 {d24-d27}, [%[in0]]\n" + + "vmla.f32 q0, q12, q14\n" + "vmla.f32 q1, q13, q15\n" + : + : [in0] "r"(in_0), [f0] "r"(f_0) + : "memory", "cc", "q0", "q1", "q12", "q13", "q14", "q15"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("veor q15, q15, q15\n" // zero + "vmax.f32 q0, q0, q15\n" + "vmax.f32 q1, q1, q15\n" + : + : + : "memory", "cc", "q0", "q1", "q15"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("veor q15, q15, q15\n" // zero + "vmov.f32 q14, #6.0\n" // six + "vmax.f32 q0, q0, q15\n" + "vmax.f32 q1, q1, q15\n" + + "vmin.f32 q0, q0, q14\n" + "vmin.f32 q1, q1, q14\n" + : + : + : "memory", "cc", "q0", "q1", "q14", "q15"); + break; + } + case ACTIVATION_H_SWISH: { + __asm__ __volatile__( + "vmov.f32 q13, #3.0\n" // three + "vmov.f32 q14, #6.0\n" // six + "veor q15, q15, q15\n" // zero + "vadd.f32 q11, q0, q13\n" + "vadd.f32 q12, q1, q13\n" + + "vmax.f32 q11, q11, q15\n" + "vmax.f32 q12, q12, q15\n" + + "vmin.f32 q11, q11, q14\n" + "vmin.f32 q12, q12, q14\n" + + "vrecpe.f32 q13, q14\n" + "vrecps.f32 q14, q14, q13\n" + "vmul.f32 q14, q14, q13\n" + "vmul.f32 q11, q11, q14\n" + "vmul.f32 q12, q12, q14\n" + + "vmul.f32 q0, q0, q11\n" + "vmul.f32 q1, q1, q12\n" + : + : + : "memory", "cc", "q0", "q1", "q11", "q12", "q13", "q14", "q15"); + break; + } + default: + return NOT_SUPPORTED; + } + + F32 *out_ptr; + if (pwFilterArray != nullptr) { + out_ptr = pwArray + hw * ic * 8 + c * 8; + } else { + out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; + } + __asm__ 
__volatile__("vst1.f32 {d0-d3}, [%[pw0]]\n" + : [pw0] "+r"(out_ptr) + : + : "memory", "cc", "q0", "q1"); + } + } + + if (pwFilterArray == nullptr) { + continue; + } + // pw_conv + // ohow / 4 + for (I32 hw = 0; hw < ohow - 3; hw += 4) { + const F32 *b0 = pwBiasArray; + const F32 *b1 = b0 + 4; + F32 *in_pack = pwArray + hw * ic * 8; + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + const F32 *f_o0c0 = pwFilterArray + o * 8 * ic * 8; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__("vld1.f32 {d0-d1}, [%[b_0]]\n" + "vld1.f32 {d2-d3}, [%[b_1]]\n" + "vld1.f32 {d12-d13}, [%[in_0]]!\n" + "vld1.f32 {d20-d23}, [%[f_0]]!\n" + + "vmov.f32 q2, q0\n" + "vmov.f32 q4, q0\n" + "vmov.f32 q8, q0\n" + + "mov r2, %[ic]\n" + + "vmov.f32 q3, q1\n" + "vmov.f32 q5, q1\n" + "vmov.f32 q9, q1\n" + + "0:\n" + "vmla.f32 q0, q10, d12[0]\n" + "vmla.f32 q2, q10, d12[1]\n" + "vmla.f32 q4, q10, d13[0]\n" + "vmla.f32 q8, q10, d13[1]\n" + + "vld1.f32 {d14-d15}, [%[in_0]]!\n" + "vld1.f32 {d20-d21}, [%[f_0]]!\n" + + "vmla.f32 q1, q11, d12[0]\n" + "vmla.f32 q3, q11, d12[1]\n" + "vmla.f32 q5, q11, d13[0]\n" + "vmla.f32 q9, q11, d13[1]\n" + + "vld1.f32 {d22-d23}, [%[f_0]]!\n" + "subs r2, r2, #2\n" + + "vmla.f32 q0, q10, d14[0]\n" + "vmla.f32 q2, q10, d14[1]\n" + "vmla.f32 q4, q10, d15[0]\n" + "vmla.f32 q8, q10, d15[1]\n" + + "vld1.f32 {d12-d13}, [%[in_0]]!\n" + "vld1.f32 {d20-d21}, [%[f_0]]!\n" + + "vmla.f32 q1, q11, d14[0]\n" + "vmla.f32 q3, q11, d14[1]\n" + "vmla.f32 q5, q11, d15[0]\n" + "vmla.f32 q9, q11, d15[1]\n" + + "vld1.f32 {d22-d23}, [%[f_0]]!\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", + "q7", "q8", "q9", "q10", "q11", "r2"); + + // activation + switch (pointwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("veor q15, q15, q15\n" // zero + "vmax.f32 q0, q0, q15\n" + "vmax.f32 q1, q1, q15\n" + "vmax.f32 q2, q2, q15\n" + "vmax.f32 q3, q3, q15\n" + "vmax.f32 q4, q4, q15\n" + "vmax.f32 q5, q5, q15\n" + "vmax.f32 q8, q8, q15\n" + "vmax.f32 q9, q9, q15\n" + : + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", + "q8", "q9", "q15"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("veor q15, q15, q15\n" // zero + "vmov.f32 q14, #6.0\n" // six + "vmax.f32 q0, q0, q15\n" + "vmax.f32 q1, q1, q15\n" + "vmax.f32 q2, q2, q15\n" + "vmax.f32 q3, q3, q15\n" + "vmax.f32 q4, q4, q15\n" + "vmax.f32 q5, q5, q15\n" + "vmax.f32 q8, q8, q15\n" + "vmax.f32 q9, q9, q15\n" + + "vmin.f32 q0, q0, q14\n" + "vmin.f32 q1, q1, q14\n" + "vmin.f32 q2, q2, q14\n" + "vmin.f32 q3, q3, q14\n" + "vmin.f32 q4, q4, q14\n" + "vmin.f32 q5, q5, q14\n" + "vmin.f32 q8, q8, q14\n" + "vmin.f32 q9, q9, q14\n" + : + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", + "q8", "q9", "q14", "q15"); + break; + } + case ACTIVATION_H_SWISH: { + __asm__ __volatile__("vmov.f32 q6, q8\n" + "vmov.f32 q7, q9\n" + + "vmov.f32 q13, #3.0\n" // three + "vmov.f32 q14, #6.0\n" // six + "veor q15, q15, q15\n" // zero + "vadd.f32 q8, q0, q13\n" + "vadd.f32 q9, q1, q13\n" + "vadd.f32 q10, q2, q13\n" + "vadd.f32 q11, q3, q13\n" + "vmax.f32 q8, q8, q15\n" + "vmax.f32 q9, q9, q15\n" + "vmax.f32 q10, q10, q15\n" + "vmax.f32 q11, q11, q15\n" + "vmin.f32 q8, q8, q14\n" + "vmin.f32 q9, q9, q14\n" + "vmin.f32 q10, q10, q14\n" + "vmin.f32 q11, q11, 
q14\n" + "vrecpe.f32 q12, q14\n" + "vrecps.f32 q14, q14, q12\n" + "vmul.f32 q12, q14, q12\n" + "vmul.f32 q8, q8, q12\n" + "vmul.f32 q9, q9, q12\n" + "vmul.f32 q10, q10, q12\n" + "vmul.f32 q11, q11, q12\n" + "vmov.f32 q14, #6.0\n" // six + "vmul.f32 q0, q0, q8\n" + "vmul.f32 q1, q1, q9\n" + "vmul.f32 q2, q2, q10\n" + "vmul.f32 q3, q3, q11\n" + + "vadd.f32 q8, q4, q13\n" + "vadd.f32 q9, q5, q13\n" + "vadd.f32 q10, q6, q13\n" + "vadd.f32 q11, q7, q13\n" + "vmax.f32 q8, q8, q15\n" + "vmax.f32 q9, q9, q15\n" + "vmax.f32 q10, q10, q15\n" + "vmax.f32 q11, q11, q15\n" + "vmin.f32 q8, q8, q14\n" + "vmin.f32 q9, q9, q14\n" + "vmin.f32 q10, q10, q14\n" + "vmin.f32 q11, q11, q14\n" + "vrecpe.f32 q12, q14\n" + "vrecps.f32 q14, q14, q12\n" + "vmul.f32 q12, q14, q12\n" + "vmul.f32 q8, q8, q12\n" + "vmul.f32 q9, q9, q12\n" + "vmul.f32 q10, q10, q12\n" + "vmul.f32 q11, q11, q12\n" + "vmul.f32 q4, q4, q8\n" + "vmul.f32 q5, q5, q9\n" + "vmul.f32 q6, q6, q10\n" + "vmul.f32 q7, q7, q11\n" + + "vmov.f32 q8, q6\n" + "vmov.f32 q9, q7\n" + : + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15"); + break; + } + default: + return NOT_SUPPORTED; + } + + __asm__ __volatile__( + "vst1.f32 {d0-d3}, [%[out_0]]!\n" + "vst1.f32 {d4-d7}, [%[out_0]]!\n" + "vst1.f32 {d8-d11}, [%[out_0]]!\n" + "vst1.f32 {d16-d19}, [%[out_0]]!\n" + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q8", "q9"); + b0 += 8; + b1 += 8; + } + } + + // ohow_reminder % 4 + U32 ohow_s = (ohow / 4) * 4; + for (I32 hw = ohow_s; hw < ohow; hw++) { + const F32 *b0 = pwBiasArray; + const F32 *b1 = b0 + 4; + F32 *in_pack = pwArray + hw * ic * 8; + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + const F32 *f_o0c0 = pwFilterArray + o * 8 * ic * 8; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__("vld1.f32 {d0-d1}, [%[b_0]]\n" + "vld1.f32 {d2-d3}, [%[b_1]]\n" + "vld1.f32 {d8}, [%[in_0]]!\n" + "vld1.f32 {d4-d7}, [%[f_0]]!\n" + "mov r2, %[ic]\n" + "0:\n" + "vmla.f32 q0, q2, d8[0]\n" + + "vld1.f32 {d4-d5}, [%[f_0]]!\n" + + "vmla.f32 q1, q3, d8[0]\n" + + "vld1.f32 {d6-d7}, [%[f_0]]!\n" + "subs r2, r2, #2\n" + + "vmla.f32 q0, q2, d8[1]\n" + + "vld1.f32 {d4-d5}, [%[f_0]]!\n" + + "vmla.f32 q1, q3, d8[1]\n" + + "vld1.f32 {d8}, [%[in_0]]!\n" + "vld1.f32 {d6-d7}, [%[f_0]]!\n" + "bne 0b\n" + : [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1) + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "r2"); + + switch (pointwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("veor q15, q15, q15\n" // zero + "vmax.f32 q0, q0, q15\n" + "vmax.f32 q1, q1, q15\n" + : + : + : "memory", "cc", "q0", "q1", "q15"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("veor q15, q15, q15\n" // zero + "vmov.f32 q14, #6.0\n" // six + "vmax.f32 q0, q0, q15\n" + "vmax.f32 q1, q1, q15\n" + + "vmin.f32 q0, q0, q14\n" + "vmin.f32 q1, q1, q14\n" + : + : + : "memory", "cc", "q0", "q1", "q14", "q15"); + break; + } + case ACTIVATION_H_SWISH: { + __asm__ __volatile__( + "vmov.f32 q13, #3.0\n" // three + "vmov.f32 q14, #6.0\n" // six + "veor q15, q15, q15\n" // zero + "vadd.f32 q11, q0, q13\n" + "vadd.f32 q12, q1, q13\n" + + "vmax.f32 q11, q11, q15\n" + "vmax.f32 q12, q12, q15\n" + + "vmin.f32 q11, q11, q14\n" + "vmin.f32 q12, q12, q14\n" + + "vrecpe.f32 
q13, q14\n" + "vrecps.f32 q14, q14, q13\n" + "vmul.f32 q14, q14, q13\n" + "vmul.f32 q11, q11, q14\n" + "vmul.f32 q12, q12, q14\n" + + "vmul.f32 q0, q0, q11\n" + "vmul.f32 q1, q1, q12\n" + : + : + : "memory", "cc", "q0", "q1", "q11", "q12", "q13", "q14", "q15"); + break; + } + default: + return NOT_SUPPORTED; + } + + __asm__ __volatile__("vst1.f32 {d0-d3}, [%[out_0]]\n" + : [out_0] "+r"(out_o0hw0) + : + : "memory", "cc", "q0", "q1"); + b0 += 8; + b1 += 8; + } + } + } + return SUCCESS; +} +#endif diff --git a/compute/tensor/src/cpu/arm/fp32/depthwise_pointwise_convolution_direct_V8.cpp b/compute/tensor/src/cpu/arm/fp32/depthwise_pointwise_convolution_direct_V8.cpp new file mode 100644 index 00000000..b3fc32ad --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/depthwise_pointwise_convolution_direct_V8.cpp @@ -0,0 +1,1264 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
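The aarch64 kernel that follows mirrors the armv7 version above: pad each 8-channel input block, run the depthwise pass with the output tile split into ohow/8, remainder-of-4 and remainder-of-1 paths, and, when a pointwise filter is present, pack the depthwise results for the 1x1 pointwise multiply. As a reading aid, here is a minimal scalar sketch (not part of the patch; the helper name and flattened argument list are invented for illustration) of what one depthwise channel block computes — the NEON code vectorizes exactly this index math across the 8 lanes of the NCHWC8 layout:

// Scalar reference for one c8 block of the depthwise stage (illustration only).
// in_pad:  padded input for this block, laid out [ih_pad][iw_pad][8]
// f:       depthwise filter for this block, laid out [fh][fw][8]
// bias:    8 bias values, used as the accumulator's initial value
// out:     output for this block, laid out [oh*ow][8]
static void scalar_dw_reference(const float *in_pad, const float *f, const float *bias,
    float *out, int oh, int ow, int iw_pad, int fh, int fw,
    int strideH, int strideW, int dilateH, int dilateW)
{
    for (int hw = 0; hw < oh * ow; hw++) {
        int in_h = hw / ow * strideH;  // same mapping as in_h_0 / in_w_0 in the kernels
        int in_w = hw % ow * strideW;
        for (int k = 0; k < 8; k++) {
            float acc = bias[k];
            for (int i = 0; i < fh; i++) {
                for (int j = 0; j < fw; j++) {
                    int h = in_h + i * dilateH;
                    int w = in_w + j * dilateW;
                    acc += in_pad[(h * iw_pad + w) * 8 + k] * f[(i * fw + j) * 8 + k];
                }
            }
            // the depthwise activation (e.g. h-swish: x * min(max(x+3, 0), 6) / 6)
            // is applied to acc after the accumulation loop
            out[hw * 8 + k] = acc;
        }
    }
}

Starting the accumulators from the bias is why every tile begins by broadcasting dwBiasArray into vector registers before the fh x fw loop. Note also that the h-swish blocks below compute min(max(x+3,0),6)/6 with fdiv, whereas the armv7 kernels above approximate the same division with a vrecpe/vrecps Newton step, since 32-bit NEON has no vector divide.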
+ +#ifdef __aarch64__ +#include "cpu/arm/fp32/depthwise_pointwise_convolution.h" + +EE depthwise_pointwise_convolution_direct_V8(TensorDesc inputDesc, + F32 *inArray, + TensorDesc dwFilterDesc, + const F32 *dwFilterArray, + TensorDesc pwFilterDesc, + const F32 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F32 *dwBiasArray, + TensorDesc pwBiasDesc, + const F32 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec) +{ + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + + if (dwFilterDesc.df != DF_NCHWC8) { + CHECK_STATUS(NOT_MATCH); + } + if (pwFilterArray != nullptr && pwFilterDesc.df != DF_NHWCN8) { + CHECK_STATUS(NOT_MATCH); + } + + oc /= 8; + ic /= 8; + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + U32 ihiw = ih * iw; + I32 ohow = oh * ow; + F32 *pwArray = (F32 *)tmp + ic * ih_pad * iw_pad * 8; + for (U32 n = 0; n < in; n++) { + F32 *inArray_pad = (F32 *)tmp; + F32 *inArray_pad_mov = inArray_pad; + F32 *inArray_mov = inArray + n * ic * ihiw * 8; + for (U32 c = 0; c < ic; c++) { + if (paddingT > 0) { + memset(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(fdt)); + inArray_pad_mov += paddingT * iw_pad * 8; + } + for (U32 h = paddingT; h < ih_pad - paddingB; h++) { + memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(fdt)); + inArray_pad_mov += paddingL * 8; + memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(fdt)); + inArray_pad_mov += iw * 8; + inArray_mov += iw * 8; + memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(fdt)); + inArray_pad_mov += paddingR * 8; + } + if (paddingB > 0) { + memset(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(fdt)); + inArray_pad_mov += paddingB * iw_pad * 8; + } + + // dw_conv + const F32 *b = dwBiasArray + c * 8; + F32 *in_pad = inArray_pad + c * ih_pad * iw_pad * 8; + const F32 *f = dwFilterArray + c * fh * fw * 8; + // ohow / 8 + for (I32 hw = 0; hw < ohow - 7; hw += 8) { + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + U32 in_h_1 = (hw + 1) / ow * strideH; + U32 in_w_1 = (hw + 1) % ow * strideW; + U32 in_h_2 = (hw + 2) / ow * strideH; + U32 in_w_2 = (hw + 2) % ow * strideW; + U32 in_h_3 = (hw + 3) / ow * strideH; + U32 in_w_3 = (hw + 3) % ow * strideW; + U32 in_h_4 = (hw + 4) / ow * strideH; + U32 in_w_4 = (hw + 4) % ow * strideW; + U32 in_h_5 = (hw + 5) / ow * strideH; + U32 in_w_5 = (hw + 5) % ow * strideW; + U32 in_h_6 = (hw + 6) / ow * strideH; + U32 in_w_6 = (hw + 6) % ow * strideW; + U32 in_h_7 = (hw + 7) / ow * strideH; + U32 in_w_7 = (hw + 7) % ow * strideW; + + __asm__ __volatile__("ldr q14, [%[b]]\n" + "ldr q15, [%[b], #16]\n" + "mov v0.16b, v14.16b\n" + "mov v1.16b, v15.16b\n" + "mov v2.16b, v14.16b\n" + "mov v3.16b, v15.16b\n" + "mov v4.16b, 
v14.16b\n" + "mov v5.16b, v15.16b\n" + "mov v6.16b, v14.16b\n" + "mov v7.16b, v15.16b\n" + "mov v8.16b, v14.16b\n" + "mov v9.16b, v15.16b\n" + "mov v10.16b, v14.16b\n" + "mov v11.16b, v15.16b\n" + "mov v12.16b, v14.16b\n" + "mov v13.16b, v15.16b\n" + : + : [b] "r"(b) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const F32 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + F32 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + F32 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F32 *in_1 = in_idx + in_h_1 * iw_pad * 8 + in_w_1 * 8; + F32 *in_2 = in_idx + in_h_2 * iw_pad * 8 + in_w_2 * 8; + F32 *in_3 = in_idx + in_h_3 * iw_pad * 8 + in_w_3 * 8; + F32 *in_4 = in_idx + in_h_4 * iw_pad * 8 + in_w_4 * 8; + F32 *in_5 = in_idx + in_h_5 * iw_pad * 8 + in_w_5 * 8; + F32 *in_6 = in_idx + in_h_6 * iw_pad * 8 + in_w_6 * 8; + F32 *in_7 = in_idx + in_h_7 * iw_pad * 8 + in_w_7 * 8; + __asm__ __volatile__("ldp q16, q17, [%[f0]]\n" + "ldp q30, q31, [%[in0]]\n" + "ldp q18, q19, [%[in1]]\n" + "ldp q20, q21, [%[in2]]\n" + "ldp q22, q23, [%[in3]]\n" + "ldp q24, q25, [%[in4]]\n" + "ldp q26, q27, [%[in5]]\n" + "ldp q28, q29, [%[in6]]\n" + + "fmla v0.4s, v30.4s, v16.4s\n" + "fmla v1.4s, v31.4s, v17.4s\n" + "fmla v2.4s, v18.4s, v16.4s\n" + "ldp q30, q31, [%[in7]]\n" + "fmla v3.4s, v19.4s, v17.4s\n" + "fmla v4.4s, v20.4s, v16.4s\n" + "fmla v5.4s, v21.4s, v17.4s\n" + "fmla v6.4s, v22.4s, v16.4s\n" + "fmla v7.4s, v23.4s, v17.4s\n" + "fmla v8.4s, v24.4s, v16.4s\n" + "fmla v9.4s, v25.4s, v17.4s\n" + "fmla v10.4s, v26.4s, v16.4s\n" + "fmla v11.4s, v27.4s, v17.4s\n" + "fmla v12.4s, v28.4s, v16.4s\n" + "fmla v13.4s, v29.4s, v17.4s\n" + "fmla v14.4s, v30.4s, v16.4s\n" + "fmla v15.4s, v31.4s, v17.4s\n" + : + : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), + [in3] "r"(in_3), [in4] "r"(in_4), [in5] "r"(in_5), + [in6] "r"(in_6), [in7] "r"(in_7), [f0] "r"(f_0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "v30", "v31"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "fmax v0.4s, v0.4s, v31.4s\n" + "fmax v1.4s, v1.4s, v31.4s\n" + "fmax v2.4s, v2.4s, v31.4s\n" + "fmax v3.4s, v3.4s, v31.4s\n" + "fmax v4.4s, v4.4s, v31.4s\n" + "fmax v5.4s, v5.4s, v31.4s\n" + "fmax v6.4s, v6.4s, v31.4s\n" + "fmax v7.4s, v7.4s, v31.4s\n" + "fmax v8.4s, v8.4s, v31.4s\n" + "fmax v9.4s, v9.4s, v31.4s\n" + "fmax v10.4s, v10.4s, v31.4s\n" + "fmax v11.4s, v11.4s, v31.4s\n" + "fmax v12.4s, v12.4s, v31.4s\n" + "fmax v13.4s, v13.4s, v31.4s\n" + "fmax v14.4s, v14.4s, v31.4s\n" + "fmax v15.4s, v15.4s, v31.4s\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v31"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "fmov v30.4s, 6.0\n" // six + "fmax v0.4s, v0.4s, v31.4s\n" + "fmax v1.4s, v1.4s, v31.4s\n" + "fmax v2.4s, v2.4s, v31.4s\n" + "fmax v3.4s, v3.4s, v31.4s\n" + "fmax v4.4s, v4.4s, v31.4s\n" + "fmax v5.4s, v5.4s, v31.4s\n" + "fmax v6.4s, v6.4s, v31.4s\n" + "fmax v7.4s, v7.4s, v31.4s\n" 
+ "fmax v8.4s, v8.4s, v31.4s\n" + "fmax v9.4s, v9.4s, v31.4s\n" + "fmax v10.4s, v10.4s, v31.4s\n" + "fmax v11.4s, v11.4s, v31.4s\n" + "fmax v12.4s, v12.4s, v31.4s\n" + "fmax v13.4s, v13.4s, v31.4s\n" + "fmax v14.4s, v14.4s, v31.4s\n" + "fmax v15.4s, v15.4s, v31.4s\n" + + "fmin v0.4s, v0.4s, v30.4s\n" + "fmin v1.4s, v1.4s, v30.4s\n" + "fmin v2.4s, v2.4s, v30.4s\n" + "fmin v3.4s, v3.4s, v30.4s\n" + "fmin v4.4s, v4.4s, v30.4s\n" + "fmin v5.4s, v5.4s, v30.4s\n" + "fmin v6.4s, v6.4s, v30.4s\n" + "fmin v7.4s, v7.4s, v30.4s\n" + "fmin v8.4s, v8.4s, v30.4s\n" + "fmin v9.4s, v9.4s, v30.4s\n" + "fmin v10.4s, v10.4s, v30.4s\n" + "fmin v11.4s, v11.4s, v30.4s\n" + "fmin v12.4s, v12.4s, v30.4s\n" + "fmin v13.4s, v13.4s, v30.4s\n" + "fmin v14.4s, v14.4s, v30.4s\n" + "fmin v15.4s, v15.4s, v30.4s\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v30", "v31"); + break; + } + case ACTIVATION_H_SWISH: { + __asm__ __volatile__("fmov v29.4s, 3.0\n" // three + "fmov v30.4s, 6.0\n" // six + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fadd v22.4s, v0.4s, v29.4s\n" + "fadd v23.4s, v1.4s, v29.4s\n" + "fadd v16.4s, v2.4s, v29.4s\n" + "fadd v17.4s, v3.4s, v29.4s\n" + "fadd v18.4s, v4.4s, v29.4s\n" + "fadd v19.4s, v5.4s, v29.4s\n" + "fadd v20.4s, v6.4s, v29.4s\n" + "fadd v21.4s, v7.4s, v29.4s\n" + + "fmax v22.4s, v22.4s, v31.4s\n" + "fmax v23.4s, v23.4s, v31.4s\n" + "fmax v16.4s, v16.4s, v31.4s\n" + "fmax v17.4s, v17.4s, v31.4s\n" + "fmax v18.4s, v18.4s, v31.4s\n" + "fmax v19.4s, v19.4s, v31.4s\n" + "fmax v20.4s, v20.4s, v31.4s\n" + "fmax v21.4s, v21.4s, v31.4s\n" + + "fmin v22.4s, v22.4s, v30.4s\n" + "fmin v23.4s, v23.4s, v30.4s\n" + "fmin v16.4s, v16.4s, v30.4s\n" + "fmin v17.4s, v17.4s, v30.4s\n" + "fmin v18.4s, v18.4s, v30.4s\n" + "fmin v19.4s, v19.4s, v30.4s\n" + "fmin v20.4s, v20.4s, v30.4s\n" + "fmin v21.4s, v21.4s, v30.4s\n" + + "fdiv v22.4s, v22.4s, v30.4s\n" + "fdiv v23.4s, v23.4s, v30.4s\n" + "fdiv v16.4s, v16.4s, v30.4s\n" + "fdiv v17.4s, v17.4s, v30.4s\n" + "fdiv v18.4s, v18.4s, v30.4s\n" + "fdiv v19.4s, v19.4s, v30.4s\n" + "fdiv v20.4s, v20.4s, v30.4s\n" + "fdiv v21.4s, v21.4s, v30.4s\n" + + "fmul v0.4s, v0.4s, v22.4s\n" + "fmul v1.4s, v1.4s, v23.4s\n" + "fmul v2.4s, v2.4s, v16.4s\n" + "fmul v3.4s, v3.4s, v17.4s\n" + "fmul v4.4s, v4.4s, v18.4s\n" + "fmul v5.4s, v5.4s, v19.4s\n" + "fmul v6.4s, v6.4s, v20.4s\n" + "fmul v7.4s, v7.4s, v21.4s\n" + + "fadd v22.4s, v8.4s, v29.4s\n" + "fadd v23.4s, v9.4s, v29.4s\n" + "fadd v16.4s, v10.4s, v29.4s\n" + "fadd v17.4s, v11.4s, v29.4s\n" + "fadd v18.4s, v12.4s, v29.4s\n" + "fadd v19.4s, v13.4s, v29.4s\n" + "fadd v20.4s, v14.4s, v29.4s\n" + "fadd v21.4s, v15.4s, v29.4s\n" + + "fmax v22.4s, v22.4s, v31.4s\n" + "fmax v23.4s, v23.4s, v31.4s\n" + "fmax v16.4s, v16.4s, v31.4s\n" + "fmax v17.4s, v17.4s, v31.4s\n" + "fmax v18.4s, v18.4s, v31.4s\n" + "fmax v19.4s, v19.4s, v31.4s\n" + "fmax v20.4s, v20.4s, v31.4s\n" + "fmax v21.4s, v21.4s, v31.4s\n" + + "fmin v22.4s, v22.4s, v30.4s\n" + "fmin v23.4s, v23.4s, v30.4s\n" + "fmin v16.4s, v16.4s, v30.4s\n" + "fmin v17.4s, v17.4s, v30.4s\n" + "fmin v18.4s, v18.4s, v30.4s\n" + "fmin v19.4s, v19.4s, v30.4s\n" + "fmin v20.4s, v20.4s, v30.4s\n" + "fmin v21.4s, v21.4s, v30.4s\n" + + "fdiv v22.4s, v22.4s, v30.4s\n" + "fdiv v23.4s, v23.4s, v30.4s\n" + "fdiv v16.4s, v16.4s, v30.4s\n" + "fdiv v17.4s, v17.4s, v30.4s\n" + "fdiv v18.4s, v18.4s, v30.4s\n" + "fdiv v19.4s, v19.4s, v30.4s\n" + "fdiv v20.4s, v20.4s, v30.4s\n" + "fdiv v21.4s, v21.4s, v30.4s\n" + + 
"fmul v8.4s, v8.4s, v22.4s\n" + "fmul v9.4s, v9.4s, v23.4s\n" + "fmul v10.4s, v10.4s, v16.4s\n" + "fmul v11.4s, v11.4s, v17.4s\n" + "fmul v12.4s, v12.4s, v18.4s\n" + "fmul v13.4s, v13.4s, v19.4s\n" + "fmul v14.4s, v14.4s, v20.4s\n" + "fmul v15.4s, v15.4s, v21.4s\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v29", "v30", "v31"); + break; + } + default: + return NOT_SUPPORTED; + } + + if (pwFilterArray != nullptr) { + F32 *pw_pack_0 = pwArray + hw * ic * 8 + c * 8 * 8; + __asm__ __volatile__("zip1 v16.4s, v0.4s, v2.4s\n" + "zip2 v17.4s, v0.4s, v2.4s\n" + "zip1 v18.4s, v4.4s, v6.4s\n" + "zip2 v19.4s, v4.4s, v6.4s\n" + "zip1 v0.2d, v16.2d, v18.2d\n" + "zip2 v2.2d, v16.2d, v18.2d\n" + "zip1 v4.2d, v17.2d, v19.2d\n" + "zip2 v6.2d, v17.2d, v19.2d\n" + + "zip1 v16.4s, v8.4s, v10.4s\n" + "zip2 v17.4s, v8.4s, v10.4s\n" + "zip1 v18.4s, v12.4s, v14.4s\n" + "zip2 v19.4s, v12.4s, v14.4s\n" + "zip1 v8.2d, v16.2d, v18.2d\n" + "zip2 v10.2d, v16.2d, v18.2d\n" + "zip1 v12.2d, v17.2d, v19.2d\n" + "zip2 v14.2d, v17.2d, v19.2d\n" + + "zip1 v16.4s, v1.4s, v3.4s\n" + "zip2 v17.4s, v1.4s, v3.4s\n" + "zip1 v18.4s, v5.4s, v7.4s\n" + "zip2 v19.4s, v5.4s, v7.4s\n" + "zip1 v1.2d, v16.2d, v18.2d\n" + "zip2 v3.2d, v16.2d, v18.2d\n" + "zip1 v5.2d, v17.2d, v19.2d\n" + "zip2 v7.2d, v17.2d, v19.2d\n" + + "zip1 v16.4s, v9.4s, v11.4s\n" + "zip2 v17.4s, v9.4s, v11.4s\n" + "zip1 v18.4s, v13.4s, v15.4s\n" + "zip2 v19.4s, v13.4s, v15.4s\n" + "zip1 v9.2d, v16.2d, v18.2d\n" + "zip2 v11.2d, v16.2d, v18.2d\n" + "zip1 v13.2d, v17.2d, v19.2d\n" + "zip2 v15.2d, v17.2d, v19.2d\n" + + "str q0, [%[pw0]]\n" + "str q8, [%[pw0], #16]\n" + "str q2, [%[pw0], #32]\n" + "str q10, [%[pw0], #48]\n" + "str q4, [%[pw0], #64]\n" + "str q12, [%[pw0], #80]\n" + "str q6, [%[pw0], #96]\n" + "str q14, [%[pw0], #112]\n" + "str q1, [%[pw0], #128]\n" + "str q9, [%[pw0], #144]\n" + "str q3, [%[pw0], #160]\n" + "str q11, [%[pw0], #176]\n" + "str q5, [%[pw0], #192]\n" + "str q13, [%[pw0], #208]\n" + "str q7, [%[pw0], #224]\n" + "str q15, [%[pw0], #240]\n" + : [pw0] "+r"(pw_pack_0) + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19"); + } else { + F32 *out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; + __asm__ __volatile__("stp q0, q1, [%[out]]\n" + "stp q2, q3, [%[out], #32]\n" + "stp q4, q5, [%[out], #64]\n" + "stp q6, q7, [%[out], #96]\n" + "stp q8, q9, [%[out], #128]\n" + "stp q10, q11, [%[out], #160]\n" + "stp q12, q13, [%[out], #192]\n" + "stp q14, q15, [%[out], #224]\n" + : [out] "+r"(out_ptr) + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"); + } + } + + // ohow_reminder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + for (I32 hw = ohow_s; hw < ohow - 3; hw += 4) { + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + U32 in_h_1 = (hw + 1) / ow * strideH; + U32 in_w_1 = (hw + 1) % ow * strideW; + U32 in_h_2 = (hw + 2) / ow * strideH; + U32 in_w_2 = (hw + 2) % ow * strideW; + U32 in_h_3 = (hw + 3) / ow * strideH; + U32 in_w_3 = (hw + 3) % ow * strideW; + + __asm__ __volatile__( + "ldr q14, [%[b]]\n" + "ldr q15, [%[b], #16]\n" + "mov v0.16b, v14.16b\n" + "mov v1.16b, v15.16b\n" + "mov v2.16b, v14.16b\n" + "mov v3.16b, v15.16b\n" + "mov v4.16b, v14.16b\n" + "mov v5.16b, v15.16b\n" + "mov v6.16b, v14.16b\n" + "mov 
v7.16b, v15.16b\n" + : + : [b] "r"(b) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v14", "v15"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const F32 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + F32 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + F32 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F32 *in_1 = in_idx + in_h_1 * iw_pad * 8 + in_w_1 * 8; + F32 *in_2 = in_idx + in_h_2 * iw_pad * 8 + in_w_2 * 8; + F32 *in_3 = in_idx + in_h_3 * iw_pad * 8 + in_w_3 * 8; + __asm__ __volatile__("ldp q14, q15, [%[f0]]\n" + "ldp q16, q17, [%[in0]]\n" + "ldp q18, q19, [%[in1]]\n" + "ldp q20, q21, [%[in2]]\n" + "ldp q22, q23, [%[in3]]\n" + + "fmla v0.4s, v16.4s, v14.4s\n" + "fmla v1.4s, v17.4s, v15.4s\n" + "fmla v2.4s, v18.4s, v14.4s\n" + "fmla v3.4s, v19.4s, v15.4s\n" + "fmla v4.4s, v20.4s, v14.4s\n" + "fmla v5.4s, v21.4s, v15.4s\n" + "fmla v6.4s, v22.4s, v14.4s\n" + "fmla v7.4s, v23.4s, v15.4s\n" + : + : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), + [in3] "r"(in_3), [f0] "r"(f_0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "fmax v0.4s, v0.4s, v31.4s\n" + "fmax v1.4s, v1.4s, v31.4s\n" + "fmax v2.4s, v2.4s, v31.4s\n" + "fmax v3.4s, v3.4s, v31.4s\n" + "fmax v4.4s, v4.4s, v31.4s\n" + "fmax v5.4s, v5.4s, v31.4s\n" + "fmax v6.4s, v6.4s, v31.4s\n" + "fmax v7.4s, v7.4s, v31.4s\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v31"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "fmov v30.4s, 6.0\n" // six + "fmax v0.4s, v0.4s, v31.4s\n" + "fmax v1.4s, v1.4s, v31.4s\n" + "fmax v2.4s, v2.4s, v31.4s\n" + "fmax v3.4s, v3.4s, v31.4s\n" + "fmax v4.4s, v4.4s, v31.4s\n" + "fmax v5.4s, v5.4s, v31.4s\n" + "fmax v6.4s, v6.4s, v31.4s\n" + "fmax v7.4s, v7.4s, v31.4s\n" + + "fmin v0.4s, v0.4s, v30.4s\n" + "fmin v1.4s, v1.4s, v30.4s\n" + "fmin v2.4s, v2.4s, v30.4s\n" + "fmin v3.4s, v3.4s, v30.4s\n" + "fmin v4.4s, v4.4s, v30.4s\n" + "fmin v5.4s, v5.4s, v30.4s\n" + "fmin v6.4s, v6.4s, v30.4s\n" + "fmin v7.4s, v7.4s, v30.4s\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v30", "v31"); + break; + } + case ACTIVATION_H_SWISH: { + __asm__ __volatile__("fmov v29.4s, 3.0\n" // three + "fmov v30.4s, 6.0\n" // six + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fadd v14.4s, v0.4s, v29.4s\n" + "fadd v15.4s, v1.4s, v29.4s\n" + "fadd v16.4s, v2.4s, v29.4s\n" + "fadd v17.4s, v3.4s, v29.4s\n" + "fadd v18.4s, v4.4s, v29.4s\n" + "fadd v19.4s, v5.4s, v29.4s\n" + "fadd v20.4s, v6.4s, v29.4s\n" + "fadd v21.4s, v7.4s, v29.4s\n" + + "fmax v14.4s, v14.4s, v31.4s\n" + "fmax v15.4s, v15.4s, v31.4s\n" + "fmax v16.4s, v16.4s, v31.4s\n" + "fmax v17.4s, v17.4s, v31.4s\n" + "fmax v18.4s, v18.4s, v31.4s\n" + "fmax v19.4s, v19.4s, v31.4s\n" + "fmax v20.4s, v20.4s, v31.4s\n" + "fmax v21.4s, v21.4s, v31.4s\n" + + "fmin v14.4s, v14.4s, v30.4s\n" + "fmin v15.4s, v15.4s, v30.4s\n" + "fmin v16.4s, v16.4s, v30.4s\n" + "fmin v17.4s, v17.4s, v30.4s\n" + "fmin v18.4s, v18.4s, v30.4s\n" + "fmin v19.4s, v19.4s, v30.4s\n" + "fmin v20.4s, v20.4s, v30.4s\n" + "fmin v21.4s, v21.4s, v30.4s\n" + + "fdiv v14.4s, v14.4s, v30.4s\n" + "fdiv 
v15.4s, v15.4s, v30.4s\n" + "fdiv v16.4s, v16.4s, v30.4s\n" + "fdiv v17.4s, v17.4s, v30.4s\n" + "fdiv v18.4s, v18.4s, v30.4s\n" + "fdiv v19.4s, v19.4s, v30.4s\n" + "fdiv v20.4s, v20.4s, v30.4s\n" + "fdiv v21.4s, v21.4s, v30.4s\n" + + "fmul v0.4s, v0.4s, v14.4s\n" + "fmul v1.4s, v1.4s, v15.4s\n" + "fmul v2.4s, v2.4s, v16.4s\n" + "fmul v3.4s, v3.4s, v17.4s\n" + "fmul v4.4s, v4.4s, v18.4s\n" + "fmul v5.4s, v5.4s, v19.4s\n" + "fmul v6.4s, v6.4s, v20.4s\n" + "fmul v7.4s, v7.4s, v21.4s\n" + : + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v29", "v30", "v31"); + break; + } + default: + return NOT_SUPPORTED; + } + + if (pwFilterArray != nullptr) { + F32 *pw_pack_0 = pwArray + hw * ic * 8 + c * 8 * 4; + __asm__ __volatile__("zip1 v16.4s, v0.4s, v2.4s\n" + "zip2 v17.4s, v0.4s, v2.4s\n" + "zip1 v18.4s, v4.4s, v6.4s\n" + "zip2 v19.4s, v4.4s, v6.4s\n" + "zip1 v0.2d, v16.2d, v18.2d\n" + "zip2 v2.2d, v16.2d, v18.2d\n" + "zip1 v4.2d, v17.2d, v19.2d\n" + "zip2 v6.2d, v17.2d, v19.2d\n" + + "zip1 v16.4s, v1.4s, v3.4s\n" + "zip2 v17.4s, v1.4s, v3.4s\n" + "zip1 v18.4s, v5.4s, v7.4s\n" + "zip2 v19.4s, v5.4s, v7.4s\n" + "zip1 v1.2d, v16.2d, v18.2d\n" + "zip2 v3.2d, v16.2d, v18.2d\n" + "zip1 v5.2d, v17.2d, v19.2d\n" + "zip2 v7.2d, v17.2d, v19.2d\n" + + "str q0, [%[pw0]]\n" + "str q2, [%[pw0], #16]\n" + "str q4, [%[pw0], #32]\n" + "str q6, [%[pw0], #48]\n" + "str q1, [%[pw0], #64]\n" + "str q3, [%[pw0], #80]\n" + "str q5, [%[pw0], #96]\n" + "str q7, [%[pw0], #112]\n" + : [pw0] "+r"(pw_pack_0) + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v16", "v17", "v18", "v19"); + } else { + F32 *out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; + __asm__ __volatile__( + "stp q0, q1, [%[out]]\n" + "stp q2, q3, [%[out], #32]\n" + "stp q4, q5, [%[out], #64]\n" + "stp q6, q7, [%[out], #96]\n" + : [out] "+r"(out_ptr) + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); + } + } + + // ohow_reminder % 4 + ohow_s = (ohow / 4) * 4; + for (I32 hw = ohow_s; hw < ohow; hw++) { + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + + __asm__ __volatile__("ldr q0, [%[b]]\n" + "ldr q1, [%[b], #16]\n" + : + : [b] "r"(b) + : "memory", "cc", "v0", "v1"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const F32 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + F32 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + F32 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + __asm__ __volatile__("ldp q14, q15, [%[f0]]\n" + "ldp q16, q17, [%[in0]]\n" + + "fmla v0.4s, v16.4s, v14.4s\n" + "fmla v1.4s, v17.4s, v15.4s\n" + : + : [in0] "r"(in_0), [f0] "r"(f_0) + : "memory", "cc", "v0", "v1", "v14", "v15"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "fmax v0.4s, v0.4s, v31.4s\n" + "fmax v1.4s, v1.4s, v31.4s\n" + : + : + : "memory", "cc", "v0", "v1", "v31"); + break; + } + case ACTIVATION_RELU6: { + __asm__ __volatile__("eor v31.16b, v31.16b, v31.16b\n" // zero + "fmov v30.4s, 6.0\n" // six + "fmax v0.4s, v0.4s, v31.4s\n" + "fmax v1.4s, v1.4s, v31.4s\n" + + "fmin v0.4s, v0.4s, v30.4s\n" + "fmin v1.4s, v1.4s, v30.4s\n" + : + : + : "memory", "cc", "v0", "v1", "v30", "v31"); + break; + } + case ACTIVATION_H_SWISH: { + __asm__ __volatile__( + "fmov v29.4s, 3.0\n" // three + "fmov v30.4s, 
6.0\n" // six + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fadd v14.4s, v0.4s, v29.4s\n" + "fadd v15.4s, v1.4s, v29.4s\n" + + "fmax v14.4s, v14.4s, v31.4s\n" + "fmax v15.4s, v15.4s, v31.4s\n" + + "fmin v14.4s, v14.4s, v30.4s\n" + "fmin v15.4s, v15.4s, v30.4s\n" + + "fdiv v14.4s, v14.4s, v30.4s\n" + "fdiv v15.4s, v15.4s, v30.4s\n" + + "fmul v0.4s, v0.4s, v14.4s\n" + "fmul v1.4s, v1.4s, v15.4s\n" + : + : + : "memory", "cc", "v0", "v1", "v14", "v15", "v29", "v30", "v31"); + break; + } + default: + return NOT_SUPPORTED; + } + + F32 *out_ptr; + if (pwFilterArray != nullptr) { + out_ptr = pwArray + hw * ic * 8 + c * 8; + } else { + out_ptr = outArray + ((n * ic + c) * ohow + hw) * 8; + } + __asm__ __volatile__("stp q0, q1, [%[out]]\n" + : [out] "+r"(out_ptr) + : + : "memory", "cc", "v0", "v1"); + } + } + + if (pwFilterArray == nullptr) { + continue; + } + // pw_conv + // ohow / 8 + for (I32 hw = 0; hw < ohow - 7; hw += 8) { + const F32 *b0 = pwBiasArray; + const F32 *b1 = b0 + 4; + F32 *in_pack = pwArray + hw * ic * 8; + const F32 *f_o0c0 = pwFilterArray; + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__( + "ldr q24, [%[b_0]]\n" // b_O0o[0:3] + "ldr q25, [%[b_1]]\n" // b_O1o[0:3] + "mov x0, %[ic]\n" // ic_blk + "mov v4.16b, v24.16b\n" + "ldr q0, [%[in_0]]\n" // in_hw0 + "mov v5.16b, v24.16b\n" + "ldr q1, [%[in_0], #16]\n" // in_hw0 + "mov v6.16b, v24.16b\n" + "ldr q20, [%[f_0]]\n" // f_o0c0 + "mov v7.16b, v24.16b\n" + "ldr q21, [%[f_0], #16]\n" // f_o0c0 + "mov v8.16b, v24.16b\n" + "mov v9.16b, v24.16b\n" + "mov v10.16b, v24.16b\n" + "mov v11.16b, v24.16b\n" + "mov v12.16b, v25.16b\n" + "mov v13.16b, v25.16b\n" + "mov v14.16b, v25.16b\n" + "mov v15.16b, v25.16b\n" + "mov v16.16b, v25.16b\n" + "mov v17.16b, v25.16b\n" + "mov v18.16b, v25.16b\n" + "mov v19.16b, v25.16b\n" + + "0:\n" + "fmla v4.4s, v20.4s, v0.s[0]\n" + "ldr q2, [%[in_0], #32]\n" + "fmla v5.4s, v20.4s, v0.s[1]\n" + "ldr q3, [%[in_0], #48]\n" + "fmla v6.4s, v20.4s, v0.s[2]\n" + "ldr q22, [%[f_0], #32]\n" + "fmla v7.4s, v20.4s, v0.s[3]\n" + "ldr q23, [%[f_0], #48]\n" + "fmla v8.4s, v20.4s, v1.s[0]\n" + "fmla v9.4s, v20.4s, v1.s[1]\n" + "fmla v10.4s, v20.4s, v1.s[2]\n" + "fmla v11.4s, v20.4s, v1.s[3]\n" + "fmla v12.4s, v21.4s, v0.s[0]\n" + "fmla v13.4s, v21.4s, v0.s[1]\n" + "fmla v14.4s, v21.4s, v0.s[2]\n" + "fmla v15.4s, v21.4s, v0.s[3]\n" + "fmla v16.4s, v21.4s, v1.s[0]\n" + "fmla v17.4s, v21.4s, v1.s[1]\n" + "fmla v18.4s, v21.4s, v1.s[2]\n" + "fmla v19.4s, v21.4s, v1.s[3]\n" + + "fmla v4.4s, v22.4s, v2.s[0]\n" + "ldr q0, [%[in_0], #64]!\n" + "fmla v5.4s, v22.4s, v2.s[1]\n" + "ldr q1, [%[in_0], #16]\n" + "fmla v6.4s, v22.4s, v2.s[2]\n" + "ldr q20, [%[f_0], #64]!\n" + "fmla v7.4s, v22.4s, v2.s[3]\n" + "ldr q21, [%[f_0], #16]\n" + "fmla v8.4s, v22.4s, v3.s[0]\n" + "fmla v9.4s, v22.4s, v3.s[1]\n" + "fmla v10.4s, v22.4s, v3.s[2]\n" + "fmla v11.4s, v22.4s, v3.s[3]\n" + "fmla v12.4s, v23.4s, v2.s[0]\n" + "fmla v13.4s, v23.4s, v2.s[1]\n" + "fmla v14.4s, v23.4s, v2.s[2]\n" + "fmla v15.4s, v23.4s, v2.s[3]\n" + "fmla v16.4s, v23.4s, v3.s[0]\n" + "fmla v17.4s, v23.4s, v3.s[1]\n" + "fmla v18.4s, v23.4s, v3.s[2]\n" + "fmla v19.4s, v23.4s, v3.s[3]\n" + "subs x0, x0, #2\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fmax v4.4s, v4.4s, v31.4s\n" + "fmax v5.4s, v5.4s, v31.4s\n" + "fmax v6.4s, v6.4s, 
v31.4s\n" + "fmax v7.4s, v7.4s, v31.4s\n" + "fmax v8.4s, v8.4s, v31.4s\n" + "fmax v9.4s, v9.4s, v31.4s\n" + "fmax v10.4s, v10.4s, v31.4s\n" + "fmax v11.4s, v11.4s, v31.4s\n" + "fmax v12.4s, v12.4s, v31.4s\n" + "fmax v13.4s, v13.4s, v31.4s\n" + "fmax v14.4s, v14.4s, v31.4s\n" + "fmax v15.4s, v15.4s, v31.4s\n" + "fmax v16.4s, v16.4s, v31.4s\n" + "fmax v17.4s, v17.4s, v31.4s\n" + "fmax v18.4s, v18.4s, v31.4s\n" + "fmax v19.4s, v19.4s, v31.4s\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fmov v30.4s, 6.0\n" // six + "fmax v4.4s, v4.4s, v31.4s\n" + "fmax v5.4s, v5.4s, v31.4s\n" + "fmax v6.4s, v6.4s, v31.4s\n" + "fmax v7.4s, v7.4s, v31.4s\n" + "fmax v8.4s, v8.4s, v31.4s\n" + "fmax v9.4s, v9.4s, v31.4s\n" + "fmax v10.4s, v10.4s, v31.4s\n" + "fmax v11.4s, v11.4s, v31.4s\n" + "fmax v12.4s, v12.4s, v31.4s\n" + "fmax v13.4s, v13.4s, v31.4s\n" + "fmax v14.4s, v14.4s, v31.4s\n" + "fmax v15.4s, v15.4s, v31.4s\n" + "fmax v16.4s, v16.4s, v31.4s\n" + "fmax v17.4s, v17.4s, v31.4s\n" + "fmax v18.4s, v18.4s, v31.4s\n" + "fmax v19.4s, v19.4s, v31.4s\n" + + "fmin v4.4s, v4.4s, v30.4s\n" + "fmin v5.4s, v5.4s, v30.4s\n" + "fmin v6.4s, v6.4s, v30.4s\n" + "fmin v7.4s, v7.4s, v30.4s\n" + "fmin v8.4s, v8.4s, v30.4s\n" + "fmin v9.4s, v9.4s, v30.4s\n" + "fmin v10.4s, v10.4s, v30.4s\n" + "fmin v11.4s, v11.4s, v30.4s\n" + "fmin v12.4s, v12.4s, v30.4s\n" + "fmin v13.4s, v13.4s, v30.4s\n" + "fmin v14.4s, v14.4s, v30.4s\n" + "fmin v15.4s, v15.4s, v30.4s\n" + "fmin v16.4s, v16.4s, v30.4s\n" + "fmin v17.4s, v17.4s, v30.4s\n" + "fmin v18.4s, v18.4s, v30.4s\n" + "fmin v19.4s, v19.4s, v30.4s\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "fmov v29.4s, 3.0\n" // three + "fmov v30.4s, 6.0\n" // six + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fadd v20.4s, v4.4s, v29.4s\n" + "fadd v21.4s, v5.4s, v29.4s\n" + "fadd v22.4s, v6.4s, v29.4s\n" + "fadd v23.4s, v7.4s, v29.4s\n" + "fadd v24.4s, v8.4s, v29.4s\n" + "fadd v25.4s, v9.4s, v29.4s\n" + "fadd v26.4s, v10.4s, v29.4s\n" + "fadd v27.4s, v11.4s, v29.4s\n" + + "fmax v20.4s, v20.4s, v31.4s\n" + "fmax v21.4s, v21.4s, v31.4s\n" + "fmax v22.4s, v22.4s, v31.4s\n" + "fmax v23.4s, v23.4s, v31.4s\n" + "fmax v24.4s, v24.4s, v31.4s\n" + "fmax v25.4s, v25.4s, v31.4s\n" + "fmax v26.4s, v26.4s, v31.4s\n" + "fmax v27.4s, v27.4s, v31.4s\n" + + "fmin v20.4s, v20.4s, v30.4s\n" + "fmin v21.4s, v21.4s, v30.4s\n" + "fmin v22.4s, v22.4s, v30.4s\n" + "fmin v23.4s, v23.4s, v30.4s\n" + "fmin v24.4s, v24.4s, v30.4s\n" + "fmin v25.4s, v25.4s, v30.4s\n" + "fmin v26.4s, v26.4s, v30.4s\n" + "fmin v27.4s, v27.4s, v30.4s\n" + + "fdiv v20.4s, v20.4s, v30.4s\n" + "fdiv v21.4s, v21.4s, v30.4s\n" + "fdiv v22.4s, v22.4s, v30.4s\n" + "fdiv v23.4s, v23.4s, v30.4s\n" + "fdiv v24.4s, v24.4s, v30.4s\n" + "fdiv v25.4s, v25.4s, v30.4s\n" + "fdiv v26.4s, v26.4s, v30.4s\n" + "fdiv v27.4s, v27.4s, v30.4s\n" + + "fmul v4.4s, v4.4s, v20.4s\n" + "fmul v5.4s, v5.4s, v21.4s\n" + "fmul v6.4s, v6.4s, v22.4s\n" + "fmul v7.4s, v7.4s, v23.4s\n" + "fmul v8.4s, v8.4s, v24.4s\n" + "fmul v9.4s, v9.4s, v25.4s\n" + "fmul v10.4s, v10.4s, v26.4s\n" + "fmul v11.4s, v11.4s, v27.4s\n" + + "fadd v20.4s, v12.4s, v29.4s\n" + "fadd v21.4s, v13.4s, v29.4s\n" + "fadd v22.4s, v14.4s, v29.4s\n" + "fadd v23.4s, v15.4s, v29.4s\n" + "fadd v24.4s, v16.4s, v29.4s\n" + "fadd v25.4s, v17.4s, v29.4s\n" + "fadd v26.4s, v18.4s, v29.4s\n" + "fadd v27.4s, v19.4s, v29.4s\n" + + "fmax v20.4s, v20.4s, v31.4s\n" + "fmax v21.4s, v21.4s, v31.4s\n" + "fmax 
v22.4s, v22.4s, v31.4s\n" + "fmax v23.4s, v23.4s, v31.4s\n" + "fmax v24.4s, v24.4s, v31.4s\n" + "fmax v25.4s, v25.4s, v31.4s\n" + "fmax v26.4s, v26.4s, v31.4s\n" + "fmax v27.4s, v27.4s, v31.4s\n" + + "fmin v20.4s, v20.4s, v30.4s\n" + "fmin v21.4s, v21.4s, v30.4s\n" + "fmin v22.4s, v22.4s, v30.4s\n" + "fmin v23.4s, v23.4s, v30.4s\n" + "fmin v24.4s, v24.4s, v30.4s\n" + "fmin v25.4s, v25.4s, v30.4s\n" + "fmin v26.4s, v26.4s, v30.4s\n" + "fmin v27.4s, v27.4s, v30.4s\n" + + "fdiv v20.4s, v20.4s, v30.4s\n" + "fdiv v21.4s, v21.4s, v30.4s\n" + "fdiv v22.4s, v22.4s, v30.4s\n" + "fdiv v23.4s, v23.4s, v30.4s\n" + "fdiv v24.4s, v24.4s, v30.4s\n" + "fdiv v25.4s, v25.4s, v30.4s\n" + "fdiv v26.4s, v26.4s, v30.4s\n" + "fdiv v27.4s, v27.4s, v30.4s\n" + + "fmul v12.4s, v12.4s, v20.4s\n" + "fmul v13.4s, v13.4s, v21.4s\n" + "fmul v14.4s, v14.4s, v22.4s\n" + "fmul v15.4s, v15.4s, v23.4s\n" + "fmul v16.4s, v16.4s, v24.4s\n" + "fmul v17.4s, v17.4s, v25.4s\n" + "fmul v18.4s, v18.4s, v26.4s\n" + "fmul v19.4s, v19.4s, v27.4s\n" + + "13:\n" + "str q4, [%[out_0]], #16\n" + "str q12, [%[out_0]], #16\n" + "str q5, [%[out_0]], #16\n" + "str q13, [%[out_0]], #16\n" + "str q6, [%[out_0]], #16\n" + "str q14, [%[out_0]], #16\n" + "str q7, [%[out_0]], #16\n" + "str q15, [%[out_0]], #16\n" + "str q8, [%[out_0]], #16\n" + "str q16, [%[out_0]], #16\n" + "str q9, [%[out_0]], #16\n" + "str q17, [%[out_0]], #16\n" + "str q10, [%[out_0]], #16\n" + "str q18, [%[out_0]], #16\n" + "str q11, [%[out_0]], #16\n" + "str q19, [%[out_0]], #16\n" + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v29", "v30", "v31", "x0", + "x1", "x2", "x3"); + b0 += 8; + b1 += 8; + } + } + + // ohow_remainder % 8 / 4 + U32 ohow_s = (ohow / 8) * 8; + for (I32 hw = ohow_s; hw < ohow - 3; hw += 4) { + const F32 *b0 = pwBiasArray; + const F32 *b1 = b0 + 4; + const F32 *f_o0c0 = pwFilterArray; + F32 *in_pack = pwArray + hw * ic * 8; + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__( + "ldr q24, [%[b_0]]\n" // b_o0 + "ldr q25, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "mov v4.16b, v24.16b\n" + "ldr q0, [%[in_0]]\n" // in_hw0 + "mov v5.16b, v24.16b\n" + "mov v6.16b, v24.16b\n" + "ldr q20, [%[f_0]]\n" // f_o0c0 + "mov v7.16b, v24.16b\n" + "ldr q21, [%[f_0], #16]\n" // f_o0c0 + "mov v12.16b, v25.16b\n" + "mov v13.16b, v25.16b\n" + "mov v14.16b, v25.16b\n" + "mov v15.16b, v25.16b\n" + + "0:\n" + "fmla v4.4s, v20.4s, v0.s[0]\n" + "ldr q2, [%[in_0], #16]\n" + "fmla v5.4s, v20.4s, v0.s[1]\n" + "ldr q22, [%[f_0], #32]\n" + "fmla v6.4s, v20.4s, v0.s[2]\n" + "ldr q23, [%[f_0], #48]\n" + "fmla v7.4s, v20.4s, v0.s[3]\n" + "fmla v12.4s, v21.4s, v0.s[0]\n" + "fmla v13.4s, v21.4s, v0.s[1]\n" + "fmla v14.4s, v21.4s, v0.s[2]\n" + "fmla v15.4s, v21.4s, v0.s[3]\n" + + "fmla v4.4s, v22.4s, v2.s[0]\n" + "ldr q0, [%[in_0], #32]!\n" + "fmla v5.4s, v22.4s, v2.s[1]\n" + "ldr q20, [%[f_0], #64]!\n" + "fmla v6.4s, v22.4s, v2.s[2]\n" + "ldr 
q21, [%[f_0], #16]\n" + "fmla v7.4s, v22.4s, v2.s[3]\n" + "fmla v12.4s, v23.4s, v2.s[0]\n" + "fmla v13.4s, v23.4s, v2.s[1]\n" + "fmla v14.4s, v23.4s, v2.s[2]\n" + "fmla v15.4s, v23.4s, v2.s[3]\n" + "subs x0, x0, #2\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fmax v4.4s, v4.4s, v31.4s\n" + "fmax v5.4s, v5.4s, v31.4s\n" + "fmax v6.4s, v6.4s, v31.4s\n" + "fmax v7.4s, v7.4s, v31.4s\n" + "fmax v12.4s, v12.4s, v31.4s\n" + "fmax v13.4s, v13.4s, v31.4s\n" + "fmax v14.4s, v14.4s, v31.4s\n" + "fmax v15.4s, v15.4s, v31.4s\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fmov v30.4s, 6.0\n" // six + "fmax v4.4s, v4.4s, v31.4s\n" + "fmax v5.4s, v5.4s, v31.4s\n" + "fmax v6.4s, v6.4s, v31.4s\n" + "fmax v7.4s, v7.4s, v31.4s\n" + "fmax v12.4s, v12.4s, v31.4s\n" + "fmax v13.4s, v13.4s, v31.4s\n" + "fmax v14.4s, v14.4s, v31.4s\n" + "fmax v15.4s, v15.4s, v31.4s\n" + + "fmin v4.4s, v4.4s, v30.4s\n" + "fmin v5.4s, v5.4s, v30.4s\n" + "fmin v6.4s, v6.4s, v30.4s\n" + "fmin v7.4s, v7.4s, v30.4s\n" + "fmin v12.4s, v12.4s, v30.4s\n" + "fmin v13.4s, v13.4s, v30.4s\n" + "fmin v14.4s, v14.4s, v30.4s\n" + "fmin v15.4s, v15.4s, v30.4s\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "fmov v29.4s, 3.0\n" // three + "fmov v30.4s, 6.0\n" // six + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fadd v20.4s, v4.4s, v29.4s\n" + "fadd v21.4s, v5.4s, v29.4s\n" + "fadd v22.4s, v6.4s, v29.4s\n" + "fadd v23.4s, v7.4s, v29.4s\n" + "fadd v24.4s, v12.4s, v29.4s\n" + "fadd v25.4s, v13.4s, v29.4s\n" + "fadd v26.4s, v14.4s, v29.4s\n" + "fadd v27.4s, v15.4s, v29.4s\n" + + "fmax v20.4s, v20.4s, v31.4s\n" + "fmax v21.4s, v21.4s, v31.4s\n" + "fmax v22.4s, v22.4s, v31.4s\n" + "fmax v23.4s, v23.4s, v31.4s\n" + "fmax v24.4s, v24.4s, v31.4s\n" + "fmax v25.4s, v25.4s, v31.4s\n" + "fmax v26.4s, v26.4s, v31.4s\n" + "fmax v27.4s, v27.4s, v31.4s\n" + + "fmin v20.4s, v20.4s, v30.4s\n" + "fmin v21.4s, v21.4s, v30.4s\n" + "fmin v22.4s, v22.4s, v30.4s\n" + "fmin v23.4s, v23.4s, v30.4s\n" + "fmin v24.4s, v24.4s, v30.4s\n" + "fmin v25.4s, v25.4s, v30.4s\n" + "fmin v26.4s, v26.4s, v30.4s\n" + "fmin v27.4s, v27.4s, v30.4s\n" + + "fdiv v20.4s, v20.4s, v30.4s\n" + "fdiv v21.4s, v21.4s, v30.4s\n" + "fdiv v22.4s, v22.4s, v30.4s\n" + "fdiv v23.4s, v23.4s, v30.4s\n" + "fdiv v24.4s, v24.4s, v30.4s\n" + "fdiv v25.4s, v25.4s, v30.4s\n" + "fdiv v26.4s, v26.4s, v30.4s\n" + "fdiv v27.4s, v27.4s, v30.4s\n" + + "fmul v4.4s, v4.4s, v20.4s\n" + "fmul v5.4s, v5.4s, v21.4s\n" + "fmul v6.4s, v6.4s, v22.4s\n" + "fmul v7.4s, v7.4s, v23.4s\n" + "fmul v12.4s, v12.4s, v24.4s\n" + "fmul v13.4s, v13.4s, v25.4s\n" + "fmul v14.4s, v14.4s, v26.4s\n" + "fmul v15.4s, v15.4s, v27.4s\n" + + "13:\n" + "str q4, [%[out_0]]\n" + "str q12, [%[out_0], #16]\n" + "str q5, [%[out_0], #32]\n" + "str q13, [%[out_0], #48]\n" + "str q6, [%[out_0], #64]\n" + "str q14, [%[out_0], #80]\n" + "str q7, [%[out_0], #96]\n" + "str q15, [%[out_0], #112]\n" + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v2", "v4", "v5", "v6", "v7", "v12", "v13", "v14", + "v15", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v29", 
"v30", + "v31", "x0", "x1", "x2", "x3"); + b0 += 8; + b1 += 8; + } + } + + // ohow_reminder % 4 + ohow_s = (ohow / 4) * 4; + for (I32 hw = ohow_s; hw < ohow; hw++) { + const F32 *b0 = pwBiasArray; + const F32 *b1 = b0 + 4; + const F32 *f_o0c0 = pwFilterArray; + F32 *in_pack = pwArray + hw * ic * 8; + for (I32 o = 0; o < I32(oc); o++) { + F32 *in_hw0 = in_pack; + F32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const F32 *b_o0 = b0; + const F32 *b_o1 = b1; + __asm__ __volatile__( + "ldr q4, [%[b_0]]\n" // b_o0 + "ldr q12, [%[b_1]]\n" // b_o1 + "mov x0, %[ic]\n" // ic_blk + "ldr s0, [%[in_0]]\n" // in_hw0 + "ldr q20, [%[f_0]]\n" // f_o0c0 + "ldr q21, [%[f_0], #16]\n" + "0:\n" + "ldr s2, [%[in_0], #4]\n" + "ldr q22, [%[f_0], #32]\n" + "ldr q23, [%[f_0], #48]\n" + "fmla v4.4s, v20.4s, v0.s[0]\n" + "fmla v12.4s, v21.4s, v0.s[0]\n" + + "ldr s0, [%[in_0], #8]!\n" + "ldr q20, [%[f_0], #64]!\n" + "ldr q21, [%[f_0], #16]\n" + "fmla v4.4s, v22.4s, v2.s[0]\n" + "fmla v12.4s, v23.4s, v2.s[0]\n" + "subs x0, x0, #2\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 11f\n" + "eor v0.16b, v0.16b, v0.16b\n" // zero + "fmax v4.4s, v4.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v0.4s\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 12f\n" + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fmov v30.4s, 6.0\n" // six + "fmax v4.4s, v4.4s, v31.4s\n" + "fmax v12.4s, v12.4s, v31.4s\n" + + "12:\n" + "cmp %[pointwiseActivationMode], %[am_h_swish]\n" + "bne 13f\n" + "fmov v29.4s, 3.0\n" // three + "fmov v30.4s, 6.0\n" // six + "eor v31.16b, v31.16b, v31.16b\n" // zero + "fadd v20.4s, v4.4s, v29.4s\n" + "fadd v24.4s, v12.4s, v29.4s\n" + + "fmax v20.4s, v20.4s, v31.4s\n" + "fmax v24.4s, v24.4s, v31.4s\n" + + "fmin v20.4s, v20.4s, v30.4s\n" + "fmin v24.4s, v24.4s, v30.4s\n" + + "fdiv v20.4s, v20.4s, v30.4s\n" + "fdiv v24.4s, v24.4s, v30.4s\n" + + "fmul v4.4s, v4.4s, v20.4s\n" + "fmul v12.4s, v12.4s, v24.4s\n" + + "13:\n" + "str q4, [%[out_0]]\n" + "str q12, [%[out_0], #16]\n" + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_o0), [b_1] "r"(b_o1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [am_h_swish] "r"((I64)ACTIVATION_H_SWISH) + : "memory", "cc", "v0", "v1", "v2", "v4", "v12", "v20", "v24", "v29", "v30", + "v31", "x0", "x1", "x2", "x3"); + b0 += 8; + b1 += 8; + } + } + } + return SUCCESS; +} +#endif diff --git a/tensor_computing/src/cpu/arm/fp32/eltwise.cpp b/compute/tensor/src/cpu/arm/fp32/eltwise.cpp similarity index 56% rename from tensor_computing/src/cpu/arm/fp32/eltwise.cpp rename to compute/tensor/src/cpu/arm/fp32/eltwise.cpp index 2e82ddc0..94d9f147 100644 --- a/tensor_computing/src/cpu/arm/fp32/eltwise.cpp +++ b/compute/tensor/src/cpu/arm/fp32/eltwise.cpp @@ -1,64 +1,53 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
 // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-#include <arm_neon.h>
 #include "cpu/arm/fp32/tensor_computing_fp32.h"
+#include "cpu/cpu_functions.h"
 
-float32x4_t getFloatVector(void* input, int inputSize, int index) {
-    float32x4_t result;
-    if (inputSize == 1) {
-        result = vdupq_n_f32(*((F32*)input));
-    }
-    int local = index % inputSize;
-    int remain = inputSize - local;
-    if (remain >= 4) {
-        result = vld1q_f32((F32*)(input) + local);
-    } else {
-        F32 buffer[4];
-        F32 *ptr = (F32*)input;
-        memcpy(buffer, ptr+local, sizeof(F32)*remain);
-        for (int i = 0; i < 4 - remain; i++) {
-            buffer[remain+i] = ptr[i % inputSize];
-        }
-        result = vld1q_f32(buffer);
-    }
-    return result;
-}
-
-F32 getFloatScalar(void* input, int inputSize, int index) {
-    int local = index % inputSize;
-    return ((F32*)input)[local];
-}
-
-EE eltwise_fp32(std::vector<void*>input, std::vector<U32> inputSize, U32 num, U32 len, void *output, EltwiseMode eltwiseMode) {
+EE eltwise_fp32(std::vector<void *> input,
+    std::vector<U32> inputSize,
+    U32 num,
+    U32 len,
+    void *output,
+    EltwiseMode eltwiseMode)
+{
+    F32 buffer[4];
     U32 len_tail = len % 4;
     U32 len_main = len - len_tail;
+    F32 *tmp = buffer;
     F32 *output_ptr = (F32 *)output;
-    for (U32 i = 0; i < len_main; i += 4){
-        float32x4_t tmp_v = getFloatVector(input[0], inputSize[0], i);
+    for (U32 i = 0; i < len_main; i += 4) {
+        get_vector((F32 *)input[0], inputSize[0], &tmp, 4, i, 4, buffer);
+        float32x4_t tmp_v = vld1q_f32(tmp);
         for (U32 j = 1; j < num; j++) {
-            float32x4_t value_v = getFloatVector(input[j], inputSize[j], i);
+            get_vector((F32 *)input[j], inputSize[j], &tmp, 4, i, 4, buffer);
+            float32x4_t value_v = vld1q_f32(tmp);
             switch (eltwiseMode) {
                 case ELTWISE_SUM:
-                    tmp_v = vaddq_f32(value_v, tmp_v);
+                    tmp_v = vaddq_f32(tmp_v, value_v);
                     break;
                 case ELTWISE_MAX:
-                    tmp_v = vmaxq_f32(value_v, tmp_v);
+                    tmp_v = vmaxq_f32(tmp_v, value_v);
                    break;
                case ELTWISE_PROD:
-                    tmp_v = vmulq_f32(value_v, tmp_v);
+                    tmp_v = vmulq_f32(tmp_v, value_v);
+                    break;
+                case ELTWISE_SUB:
+                    tmp_v = vsubq_f32(tmp_v, value_v);
+                    break;
+                case ELTWISE_DIV:
+                    tmp_v = vdivq_f32(tmp_v, value_v);
                     break;
                 default:
                     return NOT_SUPPORTED;
@@ -66,10 +55,12 @@ EE eltwise_fp32(std::vector<void*>input, std::vector<U32> inputSize, U32 num, U3
         }
         vst1q_f32(output_ptr + i, tmp_v);
     }
-    for (U32 i = len_main; i < len; i++){
-        F32 tmp_s = getFloatScalar(input[0], inputSize[0], i);
+    for (U32 i = len_main; i < len; i++) {
+        get_vector((F32 *)input[0], inputSize[0], &tmp, 4, i, 1, buffer);
+        F32 tmp_s = tmp[0];
         for (U32 j = 1; j < num; j++) {
-            F32 value_s = getFloatScalar(input[j], inputSize[j], i);
+            get_vector((F32 *)input[j], inputSize[j], &tmp, 4, i, 1, buffer);
+            F32 value_s = tmp[0];
             switch (eltwiseMode) {
                 case ELTWISE_SUM:
                     tmp_s = value_s + tmp_s;
@@ -80,6 +71,12 @@ EE eltwise_fp32(std::vector<void*>input, std::vector<U32> inputSize, U32 num, U3
                 case ELTWISE_PROD:
                     tmp_s *= value_s;
                     break;
+                case ELTWISE_SUB:
+                    tmp_s -= value_s;
+                    break;
+                case ELTWISE_DIV:
+                    tmp_s /= value_s;
+                    break;
                 default:
                     return NOT_SUPPORTED;
             }
@@ -87,4 +84,4 @@ EE eltwise_fp32(std::vector<void*>input, std::vector<U32> inputSize, U32 num, U3
         output_ptr[i] = tmp_s;
     }
     return SUCCESS;
-}
+}
diff --git a/compute/tensor/src/cpu/arm/fp32/lstm.cpp b/compute/tensor/src/cpu/arm/fp32/lstm.cpp
new file mode 100644
index 00000000..9365902a
--- /dev/null
+++ b/compute/tensor/src/cpu/arm/fp32/lstm.cpp
@@ -0,0 +1,467 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
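+
+// mvm_nkn32 computes a matrix-vector product over a filter stored in the
+// blocked NKN32 layout: fn blocks, each holding fk rows of 32 floats, with
+// results accumulated into the existing output (which already carries the
+// bias). A minimal scalar sketch of the same computation, for reference only;
+// the real kernel is the hand-written assembly below:
+//
+//     for (U32 n = 0; n < fn; n++) {
+//         for (U32 k = 0; k < fk; k++) {
+//             for (U32 i = 0; i < 32; i++) {
+//                 output[n * 32 + i] += filterArray[(n * fk + k) * 32 + i] * input[k];
+//             }
+//         }
+//     }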
+ +#include +#include "cpu/arm/fp32/tensor_computing_fp32.h" + +void mvm_nkn32(U32 fn, U32 fk, const F32 *filterArray, F32 *input, F32 *output) +{ +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 n = 0; n < fn; n++) { + F32 *in = input; + const F32 *f = filterArray + n * fk * 32; + F32 *out = output + n * 32; +#ifdef __aarch64__ + __asm__ __volatile__("ldr d0, [%[in]]\n" + "ldr q1, [%[out]]\n" + "ldr q2, [%[out], #16]\n" + "ldr q3, [%[out], #32]\n" + "ldr q4, [%[out], #48]\n" + "ldr q13, [%[out], #64]\n" + "ldr q14, [%[out], #80]\n" + "ldr q15, [%[out], #96]\n" + "ldr q16, [%[out], #112]\n" + "mov x0, %[k]\n" + "ldr q5, [%[f]]\n" + "ldr q6, [%[f], #16]\n" + "ldr q7, [%[f], #32]\n" + "ldr q8, [%[f], #48]\n" + "ldr q17, [%[f], #64]\n" + "ldr q18, [%[f], #80]\n" + "ldr q19, [%[f], #96]\n" + "ldr q20, [%[f], #112]\n" + "0:\n" + "prfm pldl2strm, [%[f], #4096]\n" + "prfm pldl1strm, [%[f], #1024]\n" + "ldr d9, [%[f], #128]\n" + "fmla v1.4s, v5.4s, v0.s[0]\n" + "ldr x9, [%[f], #136]\n" + "ins v9.d[1], x9\n" + "ldr d10, [%[f], #144]\n" + "fmla v2.4s, v6.4s, v0.s[0]\n" + "ldr x10, [%[f], #152]\n" + "ins v10.d[1], x10\n" + "ldr d11, [%[f], #160]\n" + "fmla v3.4s, v7.4s, v0.s[0]\n" + "ldr x11, [%[f], #168]\n" + "ins v11.d[1], x11\n" + "ldr d12, [%[f], #176]\n" + "fmla v4.4s, v8.4s, v0.s[0]\n" + "ldr x12, [%[f], #184]\n" + "ins v12.d[1], x12\n" + "ldr d21, [%[f], #192]\n" + "fmla v13.4s, v17.4s, v0.s[0]\n" + "ldr x9, [%[f], #200]\n" + "ins v21.d[1], x9\n" + "ldr d22, [%[f], #208]\n" + "fmla v14.4s, v18.4s, v0.s[0]\n" + "ldr x10, [%[f], #216]\n" + "ins v22.d[1], x10\n" + "ldr d23, [%[f], #224]\n" + "fmla v15.4s, v19.4s, v0.s[0]\n" + "ldr x11, [%[f], #232]\n" + "ins v23.d[1], x11\n" + "ldr d24, [%[f], #240]\n" + "fmla v16.4s, v20.4s, v0.s[0]\n" + "ldr x12, [%[f], #248]\n" + "ins v24.d[1], x12\n" + + "add %[f], %[f], #256\n" + "ldr d5, [%[f]]\n" + "fmla v1.4s, v9.4s, v0.s[1]\n" + "ldr x5, [%[f], #8]\n" + "ins v5.d[1], x5\n" + "ldr d6, [%[f], #16]\n" + "fmla v2.4s, v10.4s, v0.s[1]\n" + "ldr x6, [%[f], #24]\n" + "ins v6.d[1], x6\n" + "ldr d7, [%[f], #32]\n" + "fmla v3.4s, v11.4s, v0.s[1]\n" + "ldr x7, [%[f], #40]\n" + "ins v7.d[1], x7\n" + "ldr d8, [%[f], #48]\n" + "fmla v4.4s, v12.4s, v0.s[1]\n" + "ldr x8, [%[f], #56]\n" + "ins v8.d[1], x8\n" + "ldr d17, [%[f], #64]\n" + "fmla v13.4s, v21.4s, v0.s[1]\n" + "ldr x5, [%[f], #72]\n" + "ins v17.d[1], x5\n" + "ldr d18, [%[f], #80]\n" + "fmla v14.4s, v22.4s, v0.s[1]\n" + "ldr x6, [%[f], #88]\n" + "ins v18.d[1], x6\n" + "ldr d19, [%[f], #96]\n" + "fmla v15.4s, v23.4s, v0.s[1]\n" + "ldr x7, [%[f], #104]\n" + "ins v19.d[1], x7\n" + "ldr d20, [%[f], #112]\n" + "fmla v16.4s, v24.4s, v0.s[1]\n" + "ldr x8, [%[f], #120]\n" + "add %[in], %[in], #8\n" + "ins v20.d[1], x8\n" + + "ldr d0, [%[in]]\n" + "sub x0, x0, #2\n" + + "cmp x0, #3\n" + "bgt 0b\n" + "ldr q9, [%[f], #128]\n" + "ldr q10, [%[f], #144]\n" + "ldr q11, [%[f], #160]\n" + "ldr q12, [%[f], #176]\n" + "ldr q21, [%[f], #192]\n" + "ldr q22, [%[f], #208]\n" + "ldr q23, [%[f], #224]\n" + "ldr q24, [%[f], #240]\n" + "fmla v1.4s, v5.4s, v0.s[0]\n" + "fmla v2.4s, v6.4s, v0.s[0]\n" + "fmla v3.4s, v7.4s, v0.s[0]\n" + "fmla v4.4s, v8.4s, v0.s[0]\n" + "fmla v13.4s, v17.4s, v0.s[0]\n" + "fmla v14.4s, v18.4s, v0.s[0]\n" + "fmla v15.4s, v19.4s, v0.s[0]\n" + "fmla v16.4s, v20.4s, v0.s[0]\n" + "fmla v1.4s, v9.4s, v0.s[1]\n" + "fmla v2.4s, v10.4s, v0.s[1]\n" + "fmla v3.4s, v11.4s, v0.s[1]\n" + "fmla v4.4s, v12.4s, v0.s[1]\n" + "fmla v13.4s, v21.4s, v0.s[1]\n" + "fmla v14.4s, v22.4s, 
v0.s[1]\n" + "fmla v15.4s, v23.4s, v0.s[1]\n" + "fmla v16.4s, v24.4s, v0.s[1]\n" + "cmp x0, #3\n" + "bne 1f\n" + "add %[f], %[f], #256\n" + "ldr s0, [%[in], #8]\n" + "ldr q5, [%[f]]\n" + "ldr q6, [%[f], #16]\n" + "ldr q7, [%[f], #32]\n" + "ldr q8, [%[f], #48]\n" + "ldr q17, [%[f], #64]\n" + "ldr q18, [%[f], #80]\n" + "ldr q19, [%[f], #96]\n" + "ldr q20, [%[f], #112]\n" + "fmla v1.4s, v5.4s, v0.s[0]\n" + "fmla v2.4s, v6.4s, v0.s[0]\n" + "fmla v3.4s, v7.4s, v0.s[0]\n" + "fmla v4.4s, v8.4s, v0.s[0]\n" + "fmla v13.4s, v17.4s, v0.s[0]\n" + "fmla v14.4s, v18.4s, v0.s[0]\n" + "fmla v15.4s, v19.4s, v0.s[0]\n" + "fmla v16.4s, v20.4s, v0.s[0]\n" + + "1:\n" + "str q1, [%[out]]\n" + "str q2, [%[out], #16]\n" + "str q3, [%[out], #32]\n" + "str q4, [%[out], #48]\n" + "str q13, [%[out], #64]\n" + "str q14, [%[out], #80]\n" + "str q15, [%[out], #96]\n" + "str q16, [%[out], #112]\n" + : [out] "+r"(out), [f] "+r"(f), [in] "+r"(in) + : [k] "r"((I64)fk) + : "memory", "cc", "x0", "x5", "x6", "x7", "x8", "x9", "x10", "x11", + "x12", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24"); +#else + __asm__ __volatile__("vld1.f32 {d0[0]}, [%[in]]!\n" + "mov r2, %[out]\n" + "mov r3, %[out]\n" + "vld1.f32 {d2-d3}, [r2]!\n" + "vld1.f32 {d4-d5}, [r2]!\n" + "vld1.f32 {d6-d7}, [r2]!\n" + "vld1.f32 {d8-d9}, [r2]!\n" + "vld1.f32 {d18-d19}, [%[f]]!\n" + "vld1.f32 {d20-d21}, [%[f]]!\n" + "vld1.f32 {d22-d23}, [%[f]]!\n" + "vld1.f32 {d24-d25}, [%[f]]!\n" + "mov r4, %[k]\n" + "vld1.f32 {d10-d11}, [r2]!\n" + "vld1.f32 {d12-d13}, [r2]!\n" + "vld1.f32 {d14-d15}, [r2]!\n" + "vld1.f32 {d16-d17}, [r2]\n" + "vld1.f32 {d26-d27}, [%[f]]!\n" + "vld1.f32 {d28-d29}, [%[f]]!\n" + "0:\n" + "cmp r4, #3\n" + "ble 3f\n" + "pld [%[f], #374]\n" + "vld1.f32 {d30[0]}, [%[in]]!\n" + "vmla.f32 q1, q9, d0[0]\n" + "vld1.f32 {d18-d19}, [%[f]]!\n" + "vmla.f32 q2, q10, d0[0]\n" + "vld1.f32 {d20-d21}, [%[f]]!\n" + "vmla.f32 q3, q11, d0[0]\n" + "vld1.f32 {d22-d23}, [%[f]]!\n" + "vmla.f32 q4, q12, d0[0]\n" + "vld1.f32 {d24-d25}, [%[f]]!\n" + "vmla.f32 q5, q13, d0[0]\n" + "vld1.f32 {d26-d27}, [%[f]]!\n" + "vmla.f32 q6, q14, d0[0]\n" + "vld1.f32 {d28-d29}, [%[f]]!\n" + "vmla.f32 q7, q9, d0[0]\n" + "vld1.f32 {d18-d19}, [%[f]]!\n" + "vmla.f32 q8, q10, d0[0]\n" + "vld1.f32 {d20-d21}, [%[f]]!\n" + + "pld [%[f], #374]\n" + "vmov.f32 q0, q15\n" + "vld1.f32 {d30[0]}, [%[in]]!\n" + "vmla.f32 q1, q11, d0[0]\n" + "vld1.f32 {d22-d23}, [%[f]]!\n" + "vmla.f32 q2, q12, d0[0]\n" + "vld1.f32 {d24-d25}, [%[f]]!\n" + "vmla.f32 q3, q13, d0[0]\n" + "vld1.f32 {d26-d27}, [%[f]]!\n" + "vmla.f32 q4, q14, d0[0]\n" + "vld1.f32 {d28-d29}, [%[f]]!\n" + "vmla.f32 q5, q9, d0[0]\n" + "vld1.f32 {d18-d19}, [%[f]]!\n" + "vmla.f32 q6, q10, d0[0]\n" + "vld1.f32 {d20-d21}, [%[f]]!\n" + "vmla.f32 q7, q11, d0[0]\n" + "vld1.f32 {d22-d23}, [%[f]]!\n" + "vmla.f32 q8, q12, d0[0]\n" + "vld1.f32 {d24-d25}, [%[f]]!\n" + + "sub r4, r4, #3\n" + + "pld [%[f], #374]\n" + "vmov.f32 q0, q15\n" + "vld1.f32 {d30[0]}, [%[in]]!\n" + "vmla.f32 q1, q13, d0[0]\n" + "vld1.f32 {d26-d27}, [%[f]]!\n" + "vmla.f32 q2, q14, d0[0]\n" + "vld1.f32 {d28-d29}, [%[f]]!\n" + "vmla.f32 q3, q9, d0[0]\n" + "vld1.f32 {d18-d19}, [%[f]]!\n" + "vmla.f32 q4, q10, d0[0]\n" + "vld1.f32 {d20-d21}, [%[f]]!\n" + "vmla.f32 q5, q11, d0[0]\n" + "vld1.f32 {d22-d23}, [%[f]]!\n" + "vmla.f32 q6, q12, d0[0]\n" + "vld1.f32 {d24-d25}, [%[f]]!\n" + "vmla.f32 q7, q13, d0[0]\n" + "vld1.f32 {d26-d27}, [%[f]]!\n" + "vmla.f32 q8, q14, d0[0]\n" + 
"vld1.f32 {d28-d29}, [%[f]]!\n" + "vmov.f32 q0, q15\n" + "b 0b\n" + "3:\n" + "sub r4, r4, #1\n" + "vmla.f32 q1, q9, d0[0]\n" + "vld1.f32 {d18-d19}, [%[f]]!\n" + "vmla.f32 q2, q10, d0[0]\n" + "vld1.f32 {d20-d21}, [%[f]]!\n" + "vmla.f32 q3, q11, d0[0]\n" + "vmla.f32 q4, q12, d0[0]\n" + "vmla.f32 q5, q13, d0[0]\n" + "vmla.f32 q6, q14, d0[0]\n" + "vmla.f32 q7, q9, d0[0]\n" + "vmla.f32 q8, q10, d0[0]\n" + + "1:\n" + "cmp r4, #0\n" + "beq 2f\n" + "sub r4, r4, #1\n" + "vld1.f32 {d0[0]}, [%[in]]!\n" + "vld1.f32 {d18-d19}, [%[f]]!\n" + "vld1.f32 {d20-d21}, [%[f]]!\n" + "vld1.f32 {d22-d23}, [%[f]]!\n" + "vld1.f32 {d24-d25}, [%[f]]!\n" + "vmla.f32 q1, q9, d0[0]\n" + "vld1.f32 {d18-d19}, [%[f]]!\n" + "vmla.f32 q2, q10, d0[0]\n" + "vld1.f32 {d20-d21}, [%[f]]!\n" + "vmla.f32 q3, q11, d0[0]\n" + "vld1.f32 {d22-d23}, [%[f]]!\n" + "vmla.f32 q4, q12, d0[0]\n" + "vld1.f32 {d24-d25}, [%[f]]!\n" + "vmla.f32 q5, q9, d0[0]\n" + "vmla.f32 q6, q10, d0[0]\n" + "vmla.f32 q7, q11, d0[0]\n" + "vmla.f32 q8, q12, d0[0]\n" + "b 1b\n" + + "2:\n" + "vst1.f32 {d2-d3}, [r3]!\n" + "vst1.f32 {d4-d5}, [r3]!\n" + "vst1.f32 {d6-d7}, [r3]!\n" + "vst1.f32 {d8-d9}, [r3]!\n" + "vst1.f32 {d10-d11}, [r3]!\n" + "vst1.f32 {d12-d13}, [r3]!\n" + "vst1.f32 {d14-d15}, [r3]!\n" + "vst1.f32 {d16-d17}, [r3]\n" + : [f] "+r"(f), [in] "+r"(in) + : [k] "r"(fk), [out] "r"(out) + : "memory", "cc", "r2", "r3", "r4", "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); +#endif + } +} + +EE rnncell_fp32(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + void *state, + U32 tmpBytes, + void *tmp, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + void *output, + Arch arch) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + UNUSED(arch); + if (nullptr == currentX || nullptr == filter || nullptr == bias || nullptr == state || + nullptr == tmp || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ix; + U32 on, oh; + U32 fk, fn; + CHECK_STATUS(tensor2dGet(xDesc, &idt, &idf, &in, &ix)); + CHECK_STATUS(tensor2dGet(filterDesc[0], &fdt, &fdf, &fn, &fk)); + CHECK_STATUS(tensor2dGet(hDesc, &odt, &odf, &on, &oh)); + if (fdf != DF_NKN32) { + CHECK_STATUS(NOT_MATCH); + } + fn /= 32; + + U32 batch = in; + I32 xDim = ix; + I32 hDim = rnnParamSpec.numOutput; + I32 column = (rnnParamSpec.numProjection > 0) ? 
rnnParamSpec.numProjection + : rnnParamSpec.numOutput; + if (!(idt == DT_F32 && fdt == DT_F32 && odt == DT_F32)) { + CHECK_STATUS(NOT_MATCH); + } + if (!(4 * column == (I32)fn * 32 && (ix + oh) == fk && in == on)) { + CHECK_STATUS(NOT_MATCH); + } + F32 forgetBias = rnnParamSpec.forgetBias; + ActivationMode activationMode = rnnParamSpec.activationMode; + if (activationMode != ACTIVATION_TANH) { + CHECK_STATUS(NOT_SUPPORTED); + } + + const F32 *currentXArray = (const F32 *)currentX; + F32 *lastStateArray = (F32 *)state; + F32 *lastHArray = lastStateArray + column; + F32 *tmpArray = (F32 *)tmp; + F32 *currentStateArray = (F32 *)state; + F32 *currentHArray = currentStateArray + column; + F32 *outputArray = (F32 *)output; + F32 *xhArray = tmpArray; + F32 *intermediateH = xhArray + (xDim + hDim); + U32 lastStateStride = column + hDim; + U32 lastHStride = column + hDim; + U32 currentStateStride = column + hDim; + U32 currentHStride = column + hDim; + float32x4_t forgetBiasVector = vdupq_n_f32(forgetBias); + for (U32 m = 0; m < batch; m++) { + F32 *lastBatchH = lastHArray + m * lastHStride; + memcpy(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F32)); + memcpy(xhArray + xDim, lastBatchH, hDim * sizeof(F32)); + + memcpy(intermediateH, bias[0], column * 4 * sizeof(F32)); + mvm_nkn32(fn, fk, (const F32 *)filter[0], xhArray, intermediateH); + F32 *out_i = intermediateH; + F32 *out_g = out_i + column; + F32 *out_f = out_i + column * 2; + F32 *out_o = out_i + column * 3; + + F32 *lastBatchState = lastStateArray + m * lastStateStride; + F32 *currentBatchState = currentStateArray + m * currentStateStride; + F32 *currentBatchH = currentHArray + m * currentHStride; + F32 *currentOutput = outputArray + m * batchStrideH; + + F32 *tmpState, *tmpHH, *tmpH; + if (rnnParamSpec.zoneoutCell == 0) { + tmpState = currentBatchState; + } else { + tmpState = out_i; + } + if (rnnParamSpec.numProjection > 0) { + tmpHH = out_g; + tmpH = currentOutput; + } else { + tmpHH = currentOutput; + tmpH = out_g; + } + + I32 h = 0; + for (; h < column - 3; h += 4) { + float32x4_t out_i_v = vld1q_f32(out_i + h); + float32x4_t out_g_v = vld1q_f32(out_g + h); + float32x4_t out_f_v = vld1q_f32(out_f + h); + float32x4_t out_o_v = vld1q_f32(out_o + h); + float32x4_t C_v = vld1q_f32(lastBatchState + h); + float32x4_t I_v = vsigmoidq_f32(out_i_v); + float32x4_t F_v = vsigmoidq_f32(vaddq_f32(out_f_v, forgetBiasVector)); + float32x4_t O_v = vsigmoidq_f32(out_o_v); + float32x4_t G_v = vtanhq_f32(out_g_v); + C_v = vaddq_f32(vmulq_f32(C_v, F_v), vmulq_f32(I_v, G_v)); + float32x4_t out_hidden_v = vmulq_f32(O_v, vtanhq_f32(C_v)); + vst1q_f32(tmpState + h, C_v); + vst1q_f32(tmpHH + h, out_hidden_v); + } + for (; h < column; h++) { + F32 C_s = lastBatchState[h]; + F32 I_s = 1.0 / (1.0 + exp(-out_i[h])); + F32 F_s = 1.0 / (1.0 + exp(-(out_f[h] + forgetBias))); + F32 O_s = 1.0 / (1.0 + exp(-out_o[h])); + F32 G_s = tanh(out_g[h]); + C_s = C_s * F_s + I_s * G_s; + F32 value = O_s * tanh(C_s); + tmpState[h] = C_s; + tmpHH[h] = value; + } + if (rnnParamSpec.zoneoutCell != 0) { + array_scale_f32(tmpState, tmpState, column, 1 - rnnParamSpec.zoneoutCell, 0); + array_scale_f32(lastBatchState, lastBatchState, column, rnnParamSpec.zoneoutCell, 0); + array_add_f32(tmpState, lastBatchState, currentBatchState, column); + } + + if (rnnParamSpec.numProjection > 0) { + memset(tmpH, 0, sizeof(F32) * hDim); + mvm_nkn32(hDim / 32, rnnParamSpec.numProjection, (const F32 *)filter[1], tmpHH, tmpH); + } + if (rnnParamSpec.zoneoutOutput != 0) { + if 
(rnnParamSpec.numProjection > 0) { + array_scale_f32(tmpH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + } else { + array_scale_f32(tmpHH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + } + array_scale_f32(lastBatchH, lastBatchH, hDim, rnnParamSpec.zoneoutOutput, 0); + array_add_f32(out_f, lastBatchH, currentBatchH, hDim); + } else { + memcpy(currentBatchH, currentOutput, sizeof(F32) * hDim); + } + } + return SUCCESS; +} diff --git a/tensor_computing/src/cpu/arm/fp32/normalization.cpp b/compute/tensor/src/cpu/arm/fp32/normalization.cpp similarity index 72% rename from tensor_computing/src/cpu/arm/fp32/normalization.cpp rename to compute/tensor/src/cpu/arm/fp32/normalization.cpp index c03b6608..6604b485 100644 --- a/tensor_computing/src/cpu/arm/fp32/normalization.cpp +++ b/compute/tensor/src/cpu/arm/fp32/normalization.cpp @@ -1,61 +1,62 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
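+
+// Layer normalization, applied along the innermost dimension: for each inner
+// vector x of length size_inner,
+//     y[i] = alpha[i] * (x[i] - mean(x)) / sqrt(var(x) + eps) + beta[i]
+// with eps = 1e-6; array_norm_scale_fp32 below is the vectorized form of this
+// scalar expression (reference note).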
- #include #include "cpu/arm/fp32/tensor_computing_fp32.h" -inline void array_norm_scale_fp32(F32 *input, F32 *output, I32 len, F32 mean, F32 var, F32 *alpha, F32 *beta) { +inline void array_norm_scale_fp32( + F32 *input, F32 *output, I32 len, F32 mean, F32 var, F32 *alpha, F32 *beta) +{ F32 eps = 1e-6; F32 std_value = sqrt(var + eps); float32x4_t mean_v = vdupq_n_f32(mean); - float32x4_t std_v = vdupq_n_f32(std_value); + float32x4_t std_v = vdupq_n_f32(std_value); I32 i = 0; - for(i = 0; i < len - 3; i += 4){ + for (i = 0; i < len - 3; i += 4) { float32x4_t in = vld1q_f32(input + i); float32x4_t alpha_v = vld1q_f32(alpha + i); - float32x4_t beta_v = vld1q_f32(beta + i); + float32x4_t beta_v = vld1q_f32(beta + i); float32x4_t tmp_v = vsubq_f32(in, mean_v); tmp_v = vdivq_f32(tmp_v, std_v); tmp_v = vfmaq_f32(beta_v, alpha_v, tmp_v); - vst1q_f32(output+i, tmp_v); + vst1q_f32(output + i, tmp_v); } - for(; i < len; i++){ + for (; i < len; i++) { output[i] = alpha[i] * (input[i] - mean) / std_value + beta[i]; } } -EE layer_normalization_fp32(F32 *alpha, F32 *beta, - TensorDesc inputDesc, F32* input, - TensorDesc outputDesc, F32* output) +EE layer_normalization_fp32( + TensorDesc inputDesc, F32 *input, F32 *alpha, F32 *beta, TensorDesc outputDesc, F32 *output) { UNUSED(outputDesc); - if (nullptr == alpha || nullptr == beta || nullptr == input || nullptr == output) + if (nullptr == alpha || nullptr == beta || nullptr == input || nullptr == output) { CHECK_STATUS(NULL_POINTER); + } U32 size = tensorNumElements(inputDesc); I32 size_inner = inputDesc.dims[0]; I32 size_outer = size / size_inner; - for(I32 i = 0; i < size_outer; i++) { + for (I32 i = 0; i < size_outer; i++) { F32 *current_input = input + i * size_inner; F32 *current_output = output + i * size_inner; F32 mean = array_mean_f32(current_input, size_inner); - F32 var = array_var_f32(current_input, size_inner, mean); + F32 var = array_var_f32(current_input, size_inner, mean); array_norm_scale_fp32(current_input, current_output, size_inner, mean, var, alpha, beta); } - + return SUCCESS; } diff --git a/compute/tensor/src/cpu/arm/fp32/pooling.cpp b/compute/tensor/src/cpu/arm/fp32/pooling.cpp new file mode 100644 index 00000000..0249b731 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/pooling.cpp @@ -0,0 +1,86 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
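+
+// pooling_c8_fp32 below processes one pooling window for a single NCHWc8
+// block of eight channels: the window is [hstart, hend) x [wstart, wend) and
+// "stride" is assumed to be the row stride in pixels, each pixel holding 8
+// packed floats. A scalar sketch of the same computation (reference only,
+// not used by the kernel):
+//
+//     for (int c = 0; c < 8; c++) {
+//         F32 acc = (mode == POOLING_MAX) ? -FLT_MAX : 0;
+//         for (int h = hstart; h < hend; h++) {
+//             for (int w = wstart; w < wend; w++) {
+//                 F32 v = input[(h * stride + w) * 8 + c];
+//                 acc = (mode == POOLING_MAX) ? ((v > acc) ? v : acc) : (acc + v);
+//             }
+//         }
+//         output[c] = (mode == POOLING_MAX)
+//             ? acc
+//             : acc / ((hend - hstart) * (wend - wstart));
+//     }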
+ +#include +#include "cpu/arm/fp32/tensor_computing_fp32.h" + +EE pooling_c8_fp32(const F32 *input, + U32 stride, + int hstart, + int hend, + int wstart, + int wend, + F32 *output, + PoolingParamSpec poolingParamSpec) +{ + EE ret = SUCCESS; + PoolingMode pm = poolingParamSpec.mode; + float32x4_t in0, in1, out0, out1; + float32x4_t poolSize = vdupq_n_f32((hend - hstart) * (wend - wstart)); + out0 = vdupq_n_f32((pm == POOLING_MAX) ? -FLT_MAX : 0); + out1 = out0; + for (int kernelH = hstart; kernelH < hend; kernelH++) { + for (int kernelW = wstart; kernelW < wend; kernelW++) { + const U32 index = (kernelH * stride + kernelW) * 8; + in0 = vld1q_f32(input + index); + in1 = vld1q_f32(input + index + 4); + switch (pm) { + case POOLING_MAX: { + out0 = vmaxq_f32(in0, out0); + out1 = vmaxq_f32(in1, out1); + break; + } + case POOLING_MEAN: { + out0 = vaddq_f32(out0, in0); + out1 = vaddq_f32(out1, in1); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + } + } + vst1q_f32(output, ((pm == POOLING_MAX) ? out0 : vdivq_f32(out0, poolSize))); + vst1q_f32(output + 4, ((pm == POOLING_MAX) ? out1 : vdivq_f32(out1, poolSize))); + return ret; +} + +EE pooling_bp_c8_fp32(const F32 *input, + int hstart, + int hend, + int wstart, + int wend, + F32 *output, + U32 stride, + PoolingParamSpec poolingParamSpec) +{ + EE ret = SUCCESS; + PoolingMode pm = poolingParamSpec.mode; + if (pm != POOLING_MEAN) { + ret = NOT_SUPPORTED; + } + float32x4_t poolSize = vdupq_n_f32((hend - hstart) * (wend - wstart)); + float32x4_t in0 = vdivq_f32(vld1q_f32(input), poolSize); + float32x4_t in1 = vdivq_f32(vld1q_f32(input + 4), poolSize); + for (int kernelH = hstart; kernelH < hend; kernelH++) { + for (int kernelW = wstart; kernelW < wend; kernelW++) { + U32 index = (kernelH * stride + kernelW) * 8; + float32x4_t out0 = vaddq_f32(vld1q_f32(output + index), in0); + float32x4_t out1 = vaddq_f32(vld1q_f32(output + index + 4), in1); + vst1q_f32(output + index, out0); + vst1q_f32(output + index + 4, out1); + } + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/fp32/prelu.cpp b/compute/tensor/src/cpu/arm/fp32/prelu.cpp new file mode 100644 index 00000000..b19dcd56 --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/prelu.cpp @@ -0,0 +1,68 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
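+
+// PReLU: y = x for x > 0 and y = slope * x otherwise. With propagate_down the
+// slope is a single shared scalar (weight[0]); otherwise it is per-channel
+// (weight[c]). A scalar sketch of the NCHWc8 kernel below (reference only):
+//
+//     F32 slope = preluDesc.propagate_down ? weight[0] : weight[c];
+//     output[i] = (input[i] > 0) ? input[i] : slope * input[i];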
+ +#include "cpu/arm/tensor_computing_arm.h" +#include "cpu/arm/fp32/tensor_computing_fp32.h" + +EE prelu_fp32(TensorDesc inputDesc, + F32 *input, + F32 *weight, + PReLUParamSpec preluDesc, + TensorDesc outputDesc, + F32 *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, odt; + DataFormat idf, odf; + U32 in = 0, ic = 0, ih = 0, iw = 0, on = 0, oc = 0, oh = 0, ow = 0; + if (tensorIs4d(inputDesc) && tensorIs4d(outputDesc)) { + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + if (idf != DF_NCHWC8) { + CHECK_STATUS(NOT_SUPPORTED); + } + } else { + return NOT_SUPPORTED; + } + + CHECK_REQUIREMENT(in == on && ic == oc && ih == oh && iw == ow); + ic /= 8; + float32x4_t slope0, slope1; + uint32x4_t mask0, mask1; + float32x4_t in0, in1, out0, out1; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 hw = 0; hw < ih * iw; hw++) { + slope0 = preluDesc.propagate_down ? vdupq_n_f32(weight[0]) + : vld1q_f32(weight + c * 8); + slope1 = preluDesc.propagate_down ? vdupq_n_f32(weight[0]) + : vld1q_f32(weight + c * 8 + 4); + in0 = vld1q_f32(input); + in1 = vld1q_f32(input + 4); + mask0 = vcleq_f32(in0, vdupq_n_f32(0.f)); + mask1 = vcleq_f32(in1, vdupq_n_f32(0.f)); + float32x4_t tmp0 = vmulq_f32(in0, slope0); + float32x4_t tmp1 = vmulq_f32(in1, slope1); + out0 = vbslq_f32(mask0, tmp0, in0); + out1 = vbslq_f32(mask1, tmp1, in1); + vst1q_f32(output, out0); + vst1q_f32(output + 4, out1); + input += 8; + output += 8; + } + } + } + return SUCCESS; +} diff --git a/tensor_computing/src/cpu/arm/fp32/scale.cpp b/compute/tensor/src/cpu/arm/fp32/scale.cpp similarity index 67% rename from tensor_computing/src/cpu/arm/fp32/scale.cpp rename to compute/tensor/src/cpu/arm/fp32/scale.cpp index dbfbe567..882ed072 100644 --- a/tensor_computing/src/cpu/arm/fp32/scale.cpp +++ b/compute/tensor/src/cpu/arm/fp32/scale.cpp @@ -1,21 +1,21 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include #include "cpu/arm/fp32/tensor_computing_fp32.h" -EE scale_nchwc8_fp32(F32* input, F32* alpha, F32* beta, I32 in, I32 ic, I32 elements_per_channel, F32* output) +EE scale_nchwc8_fp32( + F32 *input, F32 *alpha, F32 *beta, I32 in, I32 ic, I32 elements_per_channel, F32 *output) { float32x4_t in_vec, out_vec; float32x4_t one = vdupq_n_f32(float32_t(1.)); @@ -30,11 +30,11 @@ EE scale_nchwc8_fp32(F32* input, F32* alpha, F32* beta, I32 in, I32 ic, I32 elem for (I32 i = 0; i < elements_per_channel; i++) { in_vec = vld1q_f32(input + index); out_vec = vfmaq_f32(beta_vec0, alpha_vec0, in_vec); - vst1q_f32(output+index, out_vec); + vst1q_f32(output + index, out_vec); in_vec = vld1q_f32(input + index + 4); out_vec = vfmaq_f32(beta_vec1, alpha_vec1, in_vec); - vst1q_f32(output+index+4, out_vec); + vst1q_f32(output + index + 4, out_vec); index += 8; } } @@ -42,7 +42,8 @@ EE scale_nchwc8_fp32(F32* input, F32* alpha, F32* beta, I32 in, I32 ic, I32 elem return SUCCESS; } -EE scale_nchw_fp32(F32* input, F32* alpha, F32* beta, I32 in, I32 ic, I32 elements_per_channel, F32* output) +EE scale_nchw_fp32( + F32 *input, F32 *alpha, F32 *beta, I32 in, I32 ic, I32 elements_per_channel, F32 *output) { float32x4_t one = vdupq_n_f32(1.); float32x4_t zero = vdupq_n_f32(0.); @@ -50,16 +51,18 @@ EE scale_nchw_fp32(F32* input, F32* alpha, F32* beta, I32 in, I32 ic, I32 elemen for (I32 n = 0; n < in; n++) { for (I32 c = 0; c < ic; c++) { float32x4_t alpha_vec = (alpha == nullptr) ? one : vdupq_n_f32(alpha[c]); - float32x4_t beta_vec = (beta == nullptr) ? zero : vdupq_n_f32(beta[c]); + float32x4_t beta_vec = (beta == nullptr) ? zero : vdupq_n_f32(beta[c]); I32 i = 0; - for (; i < elements_per_channel-3; i += 4) { + for (; i < elements_per_channel - 3; i += 4) { float32x4_t in_vec = vld1q_f32(input + index); float32x4_t out_vec = vfmaq_f32(beta_vec, alpha_vec, in_vec); - vst1q_f32(output+index, out_vec); + vst1q_f32(output + index, out_vec); index += 4; } for (; i < elements_per_channel; i++) { - output[index] = alpha[c] * input[index] + beta[c]; + float alpha_s = (alpha == nullptr) ? 1 : alpha[c]; + float beta_s = (beta == nullptr) ? 0 : beta[c]; + output[index] = alpha_s * input[index] + beta_s; index++; } } @@ -67,7 +70,8 @@ EE scale_nchw_fp32(F32* input, F32* alpha, F32* beta, I32 in, I32 ic, I32 elemen return SUCCESS; } -EE scale_nhwc_fp32(F32* input, F32* alpha, F32* beta, I32 in, I32 ic, I32 elements_per_channel, F32* output) +EE scale_nhwc_fp32( + F32 *input, F32 *alpha, F32 *beta, I32 in, I32 ic, I32 elements_per_channel, F32 *output) { float32x4_t one = vdupq_n_f32(1.); float32x4_t zero = vdupq_n_f32(0.); @@ -75,17 +79,18 @@ EE scale_nhwc_fp32(F32* input, F32* alpha, F32* beta, I32 in, I32 ic, I32 elemen for (I32 n = 0; n < in; n++) { for (I32 i = 0; i < elements_per_channel; i++) { I32 c = 0; - for (; c < ic-3; c += 4) { - float32x4_t alpha_vec = (alpha == nullptr) ? one : vld1q_f32(alpha+c); - float32x4_t beta_vec = (beta == nullptr) ? zero : vld1q_f32(beta+c); + for (; c < ic - 3; c += 4) { + float32x4_t alpha_vec = (alpha == nullptr) ? one : vld1q_f32(alpha + c); + float32x4_t beta_vec = (beta == nullptr) ? 
zero : vld1q_f32(beta + c);
                 float32x4_t in_vec = vld1q_f32(input + index);
                 float32x4_t out_vec = vfmaq_f32(beta_vec, alpha_vec, in_vec);
-                vst1q_f32(output+index, out_vec);
+                vst1q_f32(output + index, out_vec);
                 index += 4;
             }
             for (; c < ic; c++) {
-                F32 beta_s = (beta == nullptr) ? 0 : beta[c];
-                output[index] = alpha[c] * input[index] + beta_s;
+                float alpha_s = (alpha == nullptr) ? 1 : alpha[c];
+                float beta_s = (beta == nullptr) ? 0 : beta[c];
+                output[index] = alpha_s * input[index] + beta_s;
                 index++;
             }
         }
@@ -93,12 +98,22 @@ EE scale_nhwc_fp32(F32* input, F32* alpha, F32* beta, I32 in, I32 ic, I32 elemen
     return SUCCESS;
 }
 
-EE scale_fp32(F32* input, I32 axis, I32 nDims, F32* alpha, F32* beta, I32 in, I32 ic, I32 elements_per_channel, F32* output)
+EE scale_fp32(F32 *input,
+    I32 axis,
+    I32 nDims,
+    F32 *alpha,
+    F32 *beta,
+    I32 in,
+    I32 ic,
+    I32 elements_per_channel,
+    F32 *output)
 {
-    if (nullptr == input || nullptr == output)
+    if (nullptr == input || nullptr == output) {
         CHECK_STATUS(NULL_POINTER);
+    }
     EE ret = SUCCESS;
-    if (axis == 1 || axis == 0) {
+    // If ic is 1, the weight/bias vectors hold only one parameter, so we need to use the nchw calculation logic.
+    if (axis == 1 || axis == 0 || ic == 1) {
         ret = scale_nchw_fp32(input, alpha, beta, in, ic, elements_per_channel, output);
     } else if (axis == nDims - 1) {
         ret = scale_nhwc_fp32(input, alpha, beta, in, ic, elements_per_channel, output);
diff --git a/tensor_computing/src/cpu/arm/fp32/softmax.cpp b/compute/tensor/src/cpu/arm/fp32/softmax.cpp
similarity index 76%
rename from tensor_computing/src/cpu/arm/fp32/softmax.cpp
rename to compute/tensor/src/cpu/arm/fp32/softmax.cpp
index 8e88f58e..04597bc8 100644
--- a/tensor_computing/src/cpu/arm/fp32/softmax.cpp
+++ b/compute/tensor/src/cpu/arm/fp32/softmax.cpp
@@ -1,24 +1,23 @@
 // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
 // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include #include #include "cpu/arm/fp32/tensor_computing_fp32.h" -void softmax_lastAxis_fp32(const F32* input, I32 loopOuter, I32 loops, F32 *output) +void softmax_lastAxis_fp32(const F32 *input, I32 loopOuter, I32 loops, F32 *output) { - for(I32 i = 0; i < loopOuter; i++) { + for (I32 i = 0; i < loopOuter; i++) { const F32 *inputPtr = input + i * loops; F32 *outputPtr = output + i * loops; @@ -30,7 +29,7 @@ void softmax_lastAxis_fp32(const F32* input, I32 loopOuter, I32 loops, F32 *outp I32 j = 0; F32 sum_s = 0; - for(j = 0; j < loops-3; j += 4) { + for (j = 0; j < loops - 3; j += 4) { float32x4_t in = vld1q_f32(inputPtr + j); sub_v = vsubq_f32(in, max_v); tmp_v = vexpq_f32_03_percent_error(sub_v); @@ -38,7 +37,7 @@ void softmax_lastAxis_fp32(const F32* input, I32 loopOuter, I32 loops, F32 *outp vst1q_f32(outputPtr + j, tmp_v); } sum_s += vaddvq_f32(sum_v); - for(; j < loops; j++){ + for (; j < loops; j++) { tmp_s = exp(inputPtr[j] - max_s); outputPtr[j] = tmp_s; sum_s += tmp_s; @@ -47,34 +46,35 @@ void softmax_lastAxis_fp32(const F32* input, I32 loopOuter, I32 loops, F32 *outp } } -void softmax_anyAxis_fp32(const F32* input, I32 loopOuter, I32 loops, I32 loopInner, F32 *output) +void softmax_anyAxis_fp32(const F32 *input, I32 loopOuter, I32 loops, I32 loopInner, F32 *output) { std::vector buffer(loopInner * 2); - F32* maxBuffer = &buffer[0] ; - F32* sumBuffer = &buffer[loopInner] ; + F32 *maxBuffer = &buffer[0]; + F32 *sumBuffer = &buffer[loopInner]; I32 k = 0; - for(I32 i = 0; i < loopOuter; i++) { - const F32* inputPtrBase = input + i * loops * loopInner; - F32* outputPtrBase = output + i * loops * loopInner; + for (I32 i = 0; i < loopOuter; i++) { + const F32 *inputPtrBase = input + i * loops * loopInner; + F32 *outputPtrBase = output + i * loops * loopInner; memcpy(maxBuffer, inputPtrBase, loopInner * sizeof(F32)); memset(sumBuffer, 0, loopInner * sizeof(F32)); for (I32 j = 1; j < loops; j++) { - const F32* inputPtr = inputPtrBase + j * loopInner; - for (k = 0; k < loopInner-3; k += 4) { + const F32 *inputPtr = inputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 3; k += 4) { float32x4_t in_v = vld1q_f32(inputPtr + k); float32x4_t out_v = vld1q_f32(maxBuffer + k); float32x4_t max_v = vmaxq_f32(in_v, out_v); vst1q_f32(maxBuffer + k, max_v); } - for (; k < loopInner; k++) + for (; k < loopInner; k++) { maxBuffer[k] = UNI_MAX(maxBuffer[k], inputPtr[k]); + } } for (I32 j = 0; j < loops; j++) { - const F32* inputPtr = inputPtrBase + j * loopInner; - F32* outputPtr = outputPtrBase + j * loopInner; - for (k = 0; k < loopInner-3; k += 4) { - float32x4_t in_v = vld1q_f32(inputPtr + k); + const F32 *inputPtr = inputPtrBase + j * loopInner; + F32 *outputPtr = outputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 3; k += 4) { + float32x4_t in_v = vld1q_f32(inputPtr + k); float32x4_t max_v = vld1q_f32(maxBuffer + k); float32x4_t sub_v = vsubq_f32(in_v, max_v); float32x4_t exp_v = vexpq_f32_03_percent_error(sub_v); @@ -89,8 +89,8 @@ void softmax_anyAxis_fp32(const F32* input, I32 loopOuter, I32 loops, I32 loopIn } } for (I32 j = 0; j < loops; j++) { - F32* outputPtr = outputPtrBase + j * loopInner; - for (k = 0; k < loopInner-3; k += 4) { + F32 *outputPtr = outputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 3; 
k += 4) { float32x4_t out_v = vld1q_f32(outputPtr + k); float32x4_t sum_v = vld1q_f32(sumBuffer + k); out_v = vdivq_f32(out_v, sum_v); @@ -103,14 +103,12 @@ void softmax_anyAxis_fp32(const F32* input, I32 loopOuter, I32 loops, I32 loopIn } } - -EE softmax_fp32(TensorDesc inputDesc, const F32* input, - int axis, - TensorDesc outputDesc, F32* output) +EE softmax_fp32(TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output) { UNUSED(outputDesc); - if(nullptr == input || nullptr == output) + if (nullptr == input || nullptr == output) { CHECK_STATUS(NULL_POINTER); + } U32 size = tensorNumElements(inputDesc); axis = (axis + inputDesc.nDims) % inputDesc.nDims; @@ -118,8 +116,9 @@ EE softmax_fp32(TensorDesc inputDesc, const F32* input, I32 loops = inputDesc.dims[axis]; I32 loopInner = 1; - for (int i = 0; i < axis; i++) + for (int i = 0; i < axis; i++) { loopInner *= inputDesc.dims[i]; + } U32 loopOuter = size / loops / loopInner; if (loopInner == 1) { diff --git a/compute/tensor/src/cpu/arm/fp32/tensor_computing_fp32.h b/compute/tensor/src/cpu/arm/fp32/tensor_computing_fp32.h new file mode 100644 index 00000000..76d2b38f --- /dev/null +++ b/compute/tensor/src/cpu/arm/fp32/tensor_computing_fp32.h @@ -0,0 +1,242 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
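+
+// Declarations for the ARM fp32 kernels. As a reading aid for softmax_fp32
+// above: along dims[axis] it computes the numerically stable form
+//     y[j] = exp(x[j] - max(x)) / sum_k exp(x[k] - max(x)),
+// with loopInner = prod(dims[0 .. axis-1]) inner elements handled per step
+// when the axis is not the innermost dimension (reference note).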
+ +#ifndef _H_TENSOR_COMPUTING_FP32 +#define _H_TENSOR_COMPUTING_FP32 +#include +#include "sys.h" +#include "error.h" +#include "thread_affinity.h" +#include "types.h" +#include "cpu/arm/fp32/arm_functions_fp32.h" + +EE convolution_transform_filter_fp32(TensorDesc filterDesc, + const F32 *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + F32 *filterTransformed); + +EE convolution_fp32(TensorDesc inputDesc, + F32 *input, + TensorDesc filterDesc, + const F32 *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc biasDesc, + const F32 *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *output, + ActivationParamSpec activationDesc, + Arch arch); + +#ifdef __aarch64__ +EE convolution_gemm_V8(TensorDesc inputDesc, + F32 *inArray, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc); +#else +EE convolution_gemm_V7(TensorDesc inputDesc, + F32 *inArray, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc); +#endif + +#ifdef __aarch64__ +EE convolution_gemm_icnchw_V8(TensorDesc inputDesc, + F32 *inArray, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc); +#else +EE convolution_gemm_icnchw_V7(TensorDesc inputDesc, + F32 *inArray, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc); +#endif + +EE convolution_winograd_V8(TensorDesc inputDesc, + F32 *inArray, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc); + +EE deconvolution_transform_filter_fp32(TensorDesc filterDesc, + const F32 *filter, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + F32 *filterTransformed); + +EE pooling_c8_fp32(const F32 *input, + U32 stride, + int hstart, + int hend, + int wstart, + int wend, + F32 *output, + PoolingParamSpec poolingParamSpec); + +EE pooling_bp_c8_fp32(const F32 *input, + int hstart, + int hend, + int wstart, + int wend, + F32 *output, + U32 stride, + PoolingParamSpec poolingParamSpec); + +EE softmax_fp32( + TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output); + +EE concat_fp32(std::vector inputDesc, + std::vector input, + TensorDesc outputDesc, + void *output, + U32 concatDim); + +EE attention_fp32(U32 batch, + U32 numHeads, + I32 fromSequenceLength, + I32 toSequenceLength, + const F32 *input, + F32 *output); + +EE clip_fp32(F32 *input, F32 *output, I32 len, F32 minValue, F32 maxValue); + +EE depthwise_pointwise_convolution_fp32(TensorDesc inputDesc, + F32 *input, + TensorDesc dwFilterDesc, + const F32 *dwFilter, + TensorDesc pwFilterDesc, + const F32 *pwFilter, + ConvolutionParamSpec 
convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc dwBiasDesc, + const F32 *dwBias, + TensorDesc pwBiasDesc, + const F32 *pwBias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *output, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch); + +EE eltwise_fp32(std::vector input, + std::vector inputSize, + U32 num, + U32 len, + void *output, + EltwiseMode eltwiseMode); + +EE rnncell_fp32(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + void *state, + U32 tmpBytes, + void *tmp, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + void *output, + Arch arch); + +EE power_fp32(TensorDesc inputDesc, + F32 *input, + F32 scale, + F32 shift, + F32 power, + TensorDesc outputDesc, + F32 *output); + +EE layer_normalization_fp32( + TensorDesc inputDesc, F32 *input, F32 *alpha, F32 *beta, TensorDesc outputDesc, F32 *output); + +EE scale_fp32(F32 *input, + I32 axis, + I32 nDims, + F32 *alpha, + F32 *beta, + I32 in, + I32 ic, + I32 elements_per_channel, + F32 *output); + +EE softmax_fp32(TensorDesc inputDesc, const F32 *input, TensorDesc outputDesc, F32 *output); + +EE check_fp32(TensorDesc inputDescA, + const F32 *inputA, + TensorDesc inputDescB, + const F32 *inputB, + CheckMode checkMode, + TensorDesc outputDesc, + I32 *output); + +EE attention_mask_fp32(TensorDesc inputDesc, + const F32 *input, + AttentionMaskParamSpec p, + TensorDesc outputDesc, + F32 *output); + +EE prelu_fp32(TensorDesc inputDesc, + F32 *input, + F32 *weight, + PReLUParamSpec preluDesc, + TensorDesc outputDesc, + F32 *output); +#endif diff --git a/tensor_computing/src/cpu/arm/int8/arm_functions_int8.h b/compute/tensor/src/cpu/arm/int8/arm_functions_int8.h similarity index 81% rename from tensor_computing/src/cpu/arm/int8/arm_functions_int8.h rename to compute/tensor/src/cpu/arm/int8/arm_functions_int8.h index 10ebb20a..d1c1cebe 100644 --- a/tensor_computing/src/cpu/arm/int8/arm_functions_int8.h +++ b/compute/tensor/src/cpu/arm/int8/arm_functions_int8.h @@ -1,35 +1,35 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _H_ARM_FUNCTIONS_INT8 #define _H_ARM_FUNCTIONS_INT8 #include "arm_neon_expand.h" -inline EE activation_int8(INT8* input, U32 len, ActivationDesc activationDesc, INT8* output) +inline EE activation_int8(INT8 *input, U32 len, ActivationParamSpec activationDesc, INT8 *output) { int8x16_t in, out; - int8x16_t zero = vdupq_n_s8(0); + int8x16_t zero = vdupq_n_s8(0); U32 len_main = len / 16; U32 len_tail = len % 16; - switch (activationDesc.mode){ + switch (activationDesc.mode) { case ACTIVATION_NULL: { break; } case ACTIVATION_RELU: { - if (activationDesc.value[0] != 0) + if (activationDesc.value[0] != 0) { return NOT_SUPPORTED; + } for (U32 i = 0; i < len_main; i++) { in = vld1q_s8(input); out = vmaxq_s8(zero, in); diff --git a/tensor_computing/src/cpu/arm/int8/concat.cpp b/compute/tensor/src/cpu/arm/int8/concat.cpp similarity index 69% rename from tensor_computing/src/cpu/arm/int8/concat.cpp rename to compute/tensor/src/cpu/arm/int8/concat.cpp index 429d1b60..d0acd676 100644 --- a/tensor_computing/src/cpu/arm/int8/concat.cpp +++ b/compute/tensor/src/cpu/arm/int8/concat.cpp @@ -1,28 +1,32 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifdef _USE_INT8 #include #include "cpu/arm/int8/tensor_computing_int8.h" -EE concat_int8(std::vector inputDesc, std::vector input, F32* inputScale, - TensorDesc outputDesc, void* output, F32* outputScale, U32 concatDim) +EE concat_int8(std::vector inputDesc, + std::vector input, + F32 *inputScale, + int concatDim, + TensorDesc outputDesc, + void *output, + F32 *outputScale) { if (inputDesc.size() < 1) { CHECK_STATUS(NOT_MATCH); } - if(inputDesc.size() == 1) { + if (inputDesc.size() == 1) { memcpy(output, input[0], tensorNumBytes(outputDesc)); return SUCCESS; } @@ -33,7 +37,7 @@ EE concat_int8(std::vector inputDesc, std::vector input, F32* F32 min_scale = inputScale[0]; U32 min_idx = 0; - for (U32 i=1; i inputScale[i]) { min_scale = inputScale[i]; min_idx = i; @@ -41,14 +45,14 @@ EE concat_int8(std::vector inputDesc, std::vector input, F32* } *outputScale = min_scale; - for (U32 i=0; i= 0.9961) { // Even 128 will not be updated to 127 + if (rescale >= 0.9921) { // Even 127 will not be updated to 126 continue; } INT8 factor = rescale * 128; @@ -56,7 +60,7 @@ EE concat_int8(std::vector inputDesc, std::vector input, F32* if (factor < 2) { continue; } - + int8x8_t fact = vdup_n_s8(factor); U32 num = tensorNumElements(inputDesc[i]); @@ -65,50 +69,49 @@ EE concat_int8(std::vector inputDesc, std::vector input, F32* int8x8_t in[4]; int16x8_t in16[4]; - for (U32 i=0; i inputDesc, std::vector input, F32* } return SUCCESS; } - //channel - if(concatDim == 1) { - for(U32 j = 0; j < on; j++) { - for(U32 i = 0; i < inputDesc.size(); i++) { + // channel + if (concatDim == 1) { + for (U32 j = 0; j < on; j++) { + for (U32 i = 0; i < inputDesc.size(); i++) { CHECK_STATUS(tensor4dGet(inputDesc[i], &idt, &idf, &in, &ic, &ih, &iw)); if (odf != idf) { CHECK_STATUS(NOT_MATCH); @@ -133,8 +136,7 @@ EE concat_int8(std::vector inputDesc, std::vector input, F32* } return SUCCESS; } - } - else{ + } else { return NOT_MATCH; } return NOT_SUPPORTED; diff --git a/compute/tensor/src/cpu/arm/int8/convolution.cpp b/compute/tensor/src/cpu/arm/int8/convolution.cpp new file mode 100644 index 00000000..269ab97a --- /dev/null +++ b/compute/tensor/src/cpu/arm/int8/convolution.cpp @@ -0,0 +1,94 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifdef _USE_INT8 +#include "types.h" +#include "cpu/arm/int8/tensor_computing_int8.h" +#include "cpu/arm/int8/convolution_winograd.h" +#include "cpu/arm/int8/convolution_gemm.h" + +EE convolution_int8(TensorDesc inputDesc, + const INT8 *input, + TensorDesc filterDesc, + const INT8 *filter, + F16 *scales, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc biasDesc, + const F16 *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec activationDesc, + Arch arch) +{ + if (nullptr == input || nullptr == filter || nullptr == output || nullptr == bias || + nullptr == tmp) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (idt != DT_I8 && idt != DT_F16) { + CHECK_STATUS(NOT_MATCH); + } + if (fdt != DT_I8) { + CHECK_STATUS(NOT_MATCH); + } + if (odt != DT_F16 && odt != DT_I8) { + CHECK_STATUS(NOT_MATCH); + } + if (odf != DF_NCHWC8) { + CHECK_STATUS(NOT_MATCH); + } + if (!(ic == fc && oc == fn)) { + CHECK_STATUS(NOT_MATCH); + } + + const INT8 *inputPtr = input; + INT8 *tmpPtr = (INT8 *)tmp; + if (idf == DF_NCHW) { + TensorDesc prevDesc = inputDesc; + inputDesc.df = DF_NCHWC8; + CHECK_STATUS(transformNCHWToNCHWC8(prevDesc, input, inputDesc, tmpPtr)); + inputPtr = tmpPtr; + tmpPtr += tensorNumBytes(inputDesc); + tmpBytes -= tensorNumBytes(inputDesc); + } + + EE ret = SUCCESS; + switch (algorithm) { + case CONVOLUTION_ALGORITHM_WINOGRAD: + ret = convolution_winograd(inputDesc, inputPtr, scales, filterDesc, filter, scales + 2, + convParamSpec, biasDesc, bias, tmpBytes, tmpPtr, outputDesc, output, scales + 1, + activationDesc, arch); + break; + case CONVOLUTION_ALGORITHM_GEMM: + ret = convolution_gemm(inputDesc, inputPtr, scales, filterDesc, filter, scales + 2, + convParamSpec, biasDesc, bias, tmpBytes, tmpPtr, outputDesc, output, scales + 1, + activationDesc, arch); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} +#endif diff --git a/compute/tensor/src/cpu/arm/int8/convolution_gemm.h b/compute/tensor/src/cpu/arm/int8/convolution_gemm.h new file mode 100644 index 00000000..4ef9117a --- /dev/null +++ b/compute/tensor/src/cpu/arm/int8/convolution_gemm.h @@ -0,0 +1,502 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _H_CONVOLUTION_GEMM
+#define _H_CONVOLUTION_GEMM
+#ifdef _USE_INT8
+#include <arm_neon.h>
+
+#include "sys.h"
+#include "types.h"
+
+template <typename OT>
+EE convolution_gemm_A55(TensorDesc inputDesc,
+    const void *input,
+    F16 *inputScale,
+    TensorDesc filterDesc,
+    const void *filter,
+    F16 *filterScale,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc biasDesc,
+    const void *bias,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    F16 *outputScale,
+    ActivationParamSpec am);
+
+template <typename OT>
+EE convolution_gemm_A76(TensorDesc inputDesc,
+    const void *input,
+    F16 *inputScale,
+    TensorDesc filterDesc,
+    const void *filter,
+    F16 *filterScale,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc biasDesc,
+    const void *bias,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    F16 *outputScale,
+    ActivationParamSpec am);
+
+inline EE convolution_gemm(TensorDesc inputDesc,
+    const void *input,
+    F16 *inputScale,
+    TensorDesc filterDesc,
+    const void *filter,
+    F16 *filterScale,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc biasDesc,
+    const void *bias,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    F16 *outputScale,
+    ActivationParamSpec am,
+    Arch arch)
+{
+    EE ret = SUCCESS;
+    switch (arch) {
+        case ARM_A55: {
+            ret = convolution_gemm_A55(inputDesc, input, inputScale, filterDesc, filter,
+                filterScale, convParamSpec, biasDesc, bias, tmpBytes, tmp, outputDesc, output,
+                outputScale, am);
+            break;
+        }
+        case ARM_A76: {
+            ret = convolution_gemm_A76(inputDesc, input, inputScale, filterDesc, filter,
+                filterScale, convParamSpec, biasDesc, bias, tmpBytes, tmp, outputDesc, output,
+                outputScale, am);
+            break;
+        }
+        default: {
+            ret = NOT_SUPPORTED;
+            break;
+        }
+    }
+    return ret;
+}
+
+inline EE quantize_I32(U32 num_v, I32 *out_d, I32 factor, F32 scale, INT8 *out_q)
+{
+    // num_v is the number of q-form vectors (I32)
+    I32 *arr_d = out_d;
+    I32 fact = factor;
+    INT8 *arr_q = out_q;
+    U32 i28 = num_v / 28;  // The number of iterations, each handling 28 vectors
+
+    if (i28 > 0) {
+        __asm__ __volatile__("ld4 {v1.4s, v2.4s, v3.4s, v4.4s}, [%[out_d]], #64\n"
+                             "ldr s0, [%[factor]]\n"
+                             "ld4 {v5.4s, v6.4s, v7.4s, v8.4s}, [%[out_d]], #64\n"
+                             "mov x1, %[i]\n"
+                             "ld4 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[out_d]], #64\n"
+                             "dup v0.4s, v0.s[0]\n"
+                             "ld4 {v13.4s, v14.4s, v15.4s, v16.4s}, [%[out_d]], #64\n"
+                             "ld4 {v17.4s, v18.4s, v19.4s, v20.4s}, [%[out_d]], #64\n"
+                             "ld4 {v21.4s, v22.4s, v23.4s, v24.4s}, [%[out_d]], #64\n"
+
+                             "0:\n"
+                             "ld4 {v25.4s, v26.4s, v27.4s, v28.4s}, [%[out_d]], #64\n"
+                             "subs x1, x1, #1\n"
+
+                             "mul v4.4s, v4.4s, v0.4s\n"
+                             "mul v3.4s, v3.4s, v0.4s\n"
+                             "mul v2.4s, v2.4s, v0.4s\n"
+                             "mul v1.4s, v1.4s, v0.4s\n"
+
+                             "mul v8.4s, v8.4s, v0.4s\n"
+                             "sri v4.4s, v3.4s, #8\n"
+                             "mul v7.4s, v7.4s, v0.4s\n"
+                             "sri v2.4s, v1.4s, #8\n"
+                             "mul v6.4s, v6.4s, v0.4s\n"
+                             "mul v5.4s, v5.4s, v0.4s\n"
+                             "sri v4.4s, v2.4s, #16\n"
+
+                             "mul v12.4s, v12.4s, v0.4s\n"
+                             "sri v8.4s, v7.4s, #8\n"
+                             "mul v11.4s, v11.4s, v0.4s\n"
+                             "sri v6.4s, v5.4s, #8\n"
+                             "mul v10.4s, v10.4s, v0.4s\n"
+
"str q4, [%[out_q]], #16\n" + "ld4 {v1.4s, v2.4s, v3.4s, v4.4s}, [%[out_d]], #64\n" + "mul v9.4s, v9.4s, v0.4s\n" + "sri v8.4s, v6.4s, #16\n" + + "mul v16.4s, v16.4s, v0.4s\n" + "sri v12.4s, v11.4s, #8\n" + "mul v15.4s, v15.4s, v0.4s\n" + "sri v10.4s, v9.4s, #8\n" + "mul v14.4s, v14.4s, v0.4s\n" + "str q8, [%[out_q]], #16\n" + "ld4 {v5.4s, v6.4s, v7.4s, v8.4s}, [%[out_d]], #64\n" + "mul v13.4s, v13.4s, v0.4s\n" + "sri v12.4s, v10.4s, #16\n" + + "mul v20.4s, v20.4s, v0.4s\n" + "sri v16.4s, v15.4s, #8\n" + "mul v19.4s, v19.4s, v0.4s\n" + "sri v14.4s, v13.4s, #8\n" + "mul v18.4s, v18.4s, v0.4s\n" + "str q12, [%[out_q]], #16\n" + "ld4 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[out_d]], #64\n" + "mul v17.4s, v17.4s, v0.4s\n" + "sri v16.4s, v14.4s, #16\n" + + "mul v24.4s, v24.4s, v0.4s\n" + "sri v20.4s, v19.4s, #8\n" + "mul v23.4s, v23.4s, v0.4s\n" + "sri v18.4s, v17.4s, #8\n" + "mul v22.4s, v22.4s, v0.4s\n" + "str q16, [%[out_q]], #16\n" + "ld4 {v13.4s, v14.4s, v15.4s, v16.4s}, [%[out_d]], #64\n" + "mul v21.4s, v21.4s, v0.4s\n" + "sri v20.4s, v18.4s, #16\n" + + "mul v28.4s, v28.4s, v0.4s\n" + "sri v24.4s, v23.4s, #8\n" + "mul v27.4s, v27.4s, v0.4s\n" + "sri v22.4s, v21.4s, #8\n" + "mul v26.4s, v26.4s, v0.4s\n" + "str q20, [%[out_q]], #16\n" + "ld4 {v17.4s, v18.4s, v19.4s, v20.4s}, [%[out_d]], #64\n" + "mul v25.4s, v25.4s, v0.4s\n" + "sri v24.4s, v22.4s, #16\n" + + "sri v28.4s, v27.4s, #8\n" + "sri v26.4s, v25.4s, #8\n" + "str q24, [%[out_q]], #16\n" + "sri v28.4s, v26.4s, #16\n" + "ld4 {v21.4s, v22.4s, v23.4s, v24.4s}, [%[out_d]], #64\n" + "str q28, [%[out_q]], #16\n" + "bne 0b\n" + : [out_d] "+r"(arr_d), [out_q] "+r"(arr_q) + : [factor] "r"(&fact), [i] "r"((I64)i28) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", + "x1"); + arr_d -= 96; // Prefetched 24 extra vectors + } + + U32 remainder = num_v - i28 * 28; + + if (remainder % 4) { + for (U32 i = 0; i < 8; i++) { + arr_q[i] = round_towards_zero(arr_d[i] * scale); + } + arr_d += 8; + arr_q += 8; + remainder -= 2; + } + + switch (remainder) { + case 24: { + __asm__ __volatile__("ldr s0, [%[factor]]\n" + "dup v0.4s, v0.s[0]\n" + + "ld4 {v1.4s, v2.4s, v3.4s, v4.4s}, [%[out_d]], #64\n" + "ld4 {v5.4s, v6.4s, v7.4s, v8.4s}, [%[out_d]], #64\n" + "ld4 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[out_d]], #64\n" + "ld4 {v13.4s, v14.4s, v15.4s, v16.4s}, [%[out_d]], #64\n" + "ld4 {v17.4s, v18.4s, v19.4s, v20.4s}, [%[out_d]], #64\n" + "ld4 {v21.4s, v22.4s, v23.4s, v24.4s}, [%[out_d]], #64\n" + + "mul v4.4s, v4.4s, v0.4s\n" + "mul v3.4s, v3.4s, v0.4s\n" + "mul v2.4s, v2.4s, v0.4s\n" + "mul v1.4s, v1.4s, v0.4s\n" + + "mul v8.4s, v8.4s, v0.4s\n" + "sri v4.4s, v3.4s, #8\n" + "mul v7.4s, v7.4s, v0.4s\n" + "sri v2.4s, v1.4s, #8\n" + "mul v6.4s, v6.4s, v0.4s\n" + "mul v5.4s, v5.4s, v0.4s\n" + "sri v4.4s, v2.4s, #16\n" + + "mul v12.4s, v12.4s, v0.4s\n" + "sri v8.4s, v7.4s, #8\n" + "mul v11.4s, v11.4s, v0.4s\n" + "sri v6.4s, v5.4s, #8\n" + "mul v10.4s, v10.4s, v0.4s\n" + "str q4, [%[out_q]], #16\n" + "mul v9.4s, v9.4s, v0.4s\n" + "sri v8.4s, v6.4s, #16\n" + + "mul v16.4s, v16.4s, v0.4s\n" + "sri v12.4s, v11.4s, #8\n" + "mul v15.4s, v15.4s, v0.4s\n" + "sri v10.4s, v9.4s, #8\n" + "mul v14.4s, v14.4s, v0.4s\n" + "str q8, [%[out_q]], #16\n" + "mul v13.4s, v13.4s, v0.4s\n" + "sri v12.4s, v10.4s, #16\n" + + "mul v20.4s, v20.4s, v0.4s\n" + "sri v16.4s, v15.4s, #8\n" + "mul v19.4s, v19.4s, v0.4s\n" + "sri v14.4s, v13.4s, 
#8\n" + "mul v18.4s, v18.4s, v0.4s\n" + "str q12, [%[out_q]], #16\n" + "mul v17.4s, v17.4s, v0.4s\n" + "sri v16.4s, v14.4s, #16\n" + + "mul v24.4s, v24.4s, v0.4s\n" + "sri v20.4s, v19.4s, #8\n" + "mul v23.4s, v23.4s, v0.4s\n" + "sri v18.4s, v17.4s, #8\n" + "mul v22.4s, v22.4s, v0.4s\n" + "str q16, [%[out_q]], #16\n" + "mul v21.4s, v21.4s, v0.4s\n" + "sri v20.4s, v18.4s, #16\n" + + "sri v24.4s, v23.4s, #8\n" + "sri v22.4s, v21.4s, #8\n" + "str q20, [%[out_q]], #16\n" + "sri v24.4s, v22.4s, #16\n" + + "str q24, [%[out_q]], #16\n" + : [out_d] "+r"(arr_d), [out_q] "+r"(arr_q) + : [factor] "r"(&fact) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "x1"); + break; + } + case 20: { + __asm__ __volatile__("ldr s0, [%[factor]]\n" + "dup v0.4s, v0.s[0]\n" + + "ld4 {v1.4s, v2.4s, v3.4s, v4.4s}, [%[out_d]], #64\n" + "ld4 {v5.4s, v6.4s, v7.4s, v8.4s}, [%[out_d]], #64\n" + "ld4 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[out_d]], #64\n" + "ld4 {v13.4s, v14.4s, v15.4s, v16.4s}, [%[out_d]], #64\n" + "ld4 {v17.4s, v18.4s, v19.4s, v20.4s}, [%[out_d]], #64\n" + + "mul v4.4s, v4.4s, v0.4s\n" + "mul v3.4s, v3.4s, v0.4s\n" + "mul v2.4s, v2.4s, v0.4s\n" + "mul v1.4s, v1.4s, v0.4s\n" + + "mul v8.4s, v8.4s, v0.4s\n" + "sri v4.4s, v3.4s, #8\n" + "mul v7.4s, v7.4s, v0.4s\n" + "sri v2.4s, v1.4s, #8\n" + "mul v6.4s, v6.4s, v0.4s\n" + "mul v5.4s, v5.4s, v0.4s\n" + "sri v4.4s, v2.4s, #16\n" + + "mul v12.4s, v12.4s, v0.4s\n" + "sri v8.4s, v7.4s, #8\n" + "mul v11.4s, v11.4s, v0.4s\n" + "sri v6.4s, v5.4s, #8\n" + "mul v10.4s, v10.4s, v0.4s\n" + "str q4, [%[out_q]], #16\n" + "mul v9.4s, v9.4s, v0.4s\n" + "sri v8.4s, v6.4s, #16\n" + + "mul v16.4s, v16.4s, v0.4s\n" + "sri v12.4s, v11.4s, #8\n" + "mul v15.4s, v15.4s, v0.4s\n" + "sri v10.4s, v9.4s, #8\n" + "mul v14.4s, v14.4s, v0.4s\n" + "str q8, [%[out_q]], #16\n" + "mul v13.4s, v13.4s, v0.4s\n" + "sri v12.4s, v10.4s, #16\n" + + "mul v20.4s, v20.4s, v0.4s\n" + "sri v16.4s, v15.4s, #8\n" + "mul v19.4s, v19.4s, v0.4s\n" + "sri v14.4s, v13.4s, #8\n" + "mul v18.4s, v18.4s, v0.4s\n" + "str q12, [%[out_q]], #16\n" + "mul v17.4s, v17.4s, v0.4s\n" + "sri v16.4s, v14.4s, #16\n" + + "sri v20.4s, v19.4s, #8\n" + "sri v18.4s, v17.4s, #8\n" + "str q16, [%[out_q]], #16\n" + "sri v20.4s, v18.4s, #16\n" + + "str q20, [%[out_q]], #16\n" + : [out_d] "+r"(arr_d), [out_q] "+r"(arr_q) + : [factor] "r"(&fact) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20", "x1"); + break; + } + case 16: { + __asm__ __volatile__("ldr s0, [%[factor]]\n" + "dup v0.4s, v0.s[0]\n" + + "ld4 {v1.4s, v2.4s, v3.4s, v4.4s}, [%[out_d]], #64\n" + "ld4 {v5.4s, v6.4s, v7.4s, v8.4s}, [%[out_d]], #64\n" + "ld4 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[out_d]], #64\n" + "ld4 {v13.4s, v14.4s, v15.4s, v16.4s}, [%[out_d]], #64\n" + + "mul v4.4s, v4.4s, v0.4s\n" + "mul v3.4s, v3.4s, v0.4s\n" + "mul v2.4s, v2.4s, v0.4s\n" + "mul v1.4s, v1.4s, v0.4s\n" + + "mul v8.4s, v8.4s, v0.4s\n" + "sri v4.4s, v3.4s, #8\n" + "mul v7.4s, v7.4s, v0.4s\n" + "sri v2.4s, v1.4s, #8\n" + "mul v6.4s, v6.4s, v0.4s\n" + "mul v5.4s, v5.4s, v0.4s\n" + "sri v4.4s, v2.4s, #16\n" + + "mul v12.4s, v12.4s, v0.4s\n" + "sri v8.4s, v7.4s, #8\n" + "mul v11.4s, v11.4s, v0.4s\n" + "sri v6.4s, v5.4s, #8\n" + "mul v10.4s, v10.4s, v0.4s\n" + "str q4, [%[out_q]], #16\n" + "mul v9.4s, v9.4s, v0.4s\n" + "sri v8.4s, v6.4s, #16\n" + + "mul 
v16.4s, v16.4s, v0.4s\n" + "sri v12.4s, v11.4s, #8\n" + "mul v15.4s, v15.4s, v0.4s\n" + "sri v10.4s, v9.4s, #8\n" + "mul v14.4s, v14.4s, v0.4s\n" + "str q8, [%[out_q]], #16\n" + "mul v13.4s, v13.4s, v0.4s\n" + "sri v12.4s, v10.4s, #16\n" + + "sri v16.4s, v15.4s, #8\n" + "sri v14.4s, v13.4s, #8\n" + "str q12, [%[out_q]], #16\n" + "sri v16.4s, v14.4s, #16\n" + + "str q16, [%[out_q]], #16\n" + : [out_d] "+r"(arr_d), [out_q] "+r"(arr_q) + : [factor] "r"(&fact) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "x1"); + break; + } + case 12: { + __asm__ __volatile__("ldr s0, [%[factor]]\n" + "dup v0.4s, v0.s[0]\n" + + "ld4 {v1.4s, v2.4s, v3.4s, v4.4s}, [%[out_d]], #64\n" + "ld4 {v5.4s, v6.4s, v7.4s, v8.4s}, [%[out_d]], #64\n" + "ld4 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[out_d]], #64\n" + + "mul v4.4s, v4.4s, v0.4s\n" + "mul v3.4s, v3.4s, v0.4s\n" + "mul v2.4s, v2.4s, v0.4s\n" + "mul v1.4s, v1.4s, v0.4s\n" + + "mul v8.4s, v8.4s, v0.4s\n" + "sri v4.4s, v3.4s, #8\n" + "mul v7.4s, v7.4s, v0.4s\n" + "sri v2.4s, v1.4s, #8\n" + "mul v6.4s, v6.4s, v0.4s\n" + "mul v5.4s, v5.4s, v0.4s\n" + "sri v4.4s, v2.4s, #16\n" + + "mul v12.4s, v12.4s, v0.4s\n" + "sri v8.4s, v7.4s, #8\n" + "mul v11.4s, v11.4s, v0.4s\n" + "sri v6.4s, v5.4s, #8\n" + "mul v10.4s, v10.4s, v0.4s\n" + "str q4, [%[out_q]], #16\n" + "mul v9.4s, v9.4s, v0.4s\n" + "sri v8.4s, v6.4s, #16\n" + + "sri v12.4s, v11.4s, #8\n" + "sri v10.4s, v9.4s, #8\n" + "str q8, [%[out_q]], #16\n" + "sri v12.4s, v10.4s, #16\n" + + "str q12, [%[out_q]], #16\n" + : [out_d] "+r"(arr_d), [out_q] "+r"(arr_q) + : [factor] "r"(&fact) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "x1"); + break; + } + case 8: { + __asm__ __volatile__( + "ldr s0, [%[factor]]\n" + "dup v0.4s, v0.s[0]\n" + + "ld4 {v1.4s, v2.4s, v3.4s, v4.4s}, [%[out_d]], #64\n" + "ld4 {v5.4s, v6.4s, v7.4s, v8.4s}, [%[out_d]], #64\n" + + "mul v4.4s, v4.4s, v0.4s\n" + "mul v3.4s, v3.4s, v0.4s\n" + "mul v2.4s, v2.4s, v0.4s\n" + "mul v1.4s, v1.4s, v0.4s\n" + + "mul v8.4s, v8.4s, v0.4s\n" + "sri v4.4s, v3.4s, #8\n" + "mul v7.4s, v7.4s, v0.4s\n" + "sri v2.4s, v1.4s, #8\n" + "mul v6.4s, v6.4s, v0.4s\n" + "mul v5.4s, v5.4s, v0.4s\n" + "sri v4.4s, v2.4s, #16\n" + + "sri v8.4s, v7.4s, #8\n" + "sri v6.4s, v5.4s, #8\n" + "str q4, [%[out_q]], #16\n" + "sri v8.4s, v6.4s, #16\n" + + "str q8, [%[out_q]], #16\n" + : [out_d] "+r"(arr_d), [out_q] "+r"(arr_q) + : [factor] "r"(&fact) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "x1"); + break; + } + case 4: { + __asm__ __volatile__("ldr s0, [%[factor]]\n" + "dup v0.4s, v0.s[0]\n" + + "ld4 {v1.4s, v2.4s, v3.4s, v4.4s}, [%[out_d]], #64\n" + + "mul v4.4s, v4.4s, v0.4s\n" + "mul v3.4s, v3.4s, v0.4s\n" + "mul v2.4s, v2.4s, v0.4s\n" + "mul v1.4s, v1.4s, v0.4s\n" + + "sri v4.4s, v3.4s, #8\n" + "sri v2.4s, v1.4s, #8\n" + "sri v4.4s, v2.4s, #16\n" + + "str q4, [%[out_q]], #16\n" + : [out_d] "+r"(arr_d), [out_q] "+r"(arr_q) + : [factor] "r"(&fact) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "x1"); + break; + } + case 0: { + break; + } + default: { + return UNKNOWN; + } + } + return SUCCESS; +} +#endif +#endif diff --git a/tensor_computing/src/cpu/arm/int8/convolution_gemm_A55.cpp b/compute/tensor/src/cpu/arm/int8/convolution_gemm_A55.cpp similarity index 76% rename from tensor_computing/src/cpu/arm/int8/convolution_gemm_A55.cpp rename to compute/tensor/src/cpu/arm/int8/convolution_gemm_A55.cpp index c8fa2045..6023715e 
100644 --- a/tensor_computing/src/cpu/arm/int8/convolution_gemm_A55.cpp +++ b/compute/tensor/src/cpu/arm/int8/convolution_gemm_A55.cpp @@ -1,25 +1,36 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
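// convolution_gemm_A55 below quantizes an F16 input tensor on the fly when idt == DT_F16,
// using a symmetric scale of 127 / max|x| and saturating values to [-127, 127] (the kernel
// does the scan with float16x8_t max/min reductions; this scalar sketch is only a reference,
// and the function name is illustrative):
#include <math.h>
static F32 quantize_f16_symmetric(const F16 *in, U32 num, INT8 *out)
{
    F32 maxAbs = 0;
    for (U32 i = 0; i < num; i++) {
        F32 v = fabsf((F32)in[i]);
        if (v > maxAbs) {
            maxAbs = v;
        }
    }
    F32 scale = (maxAbs > 0) ? 127.0f / maxAbs : 1.0f;
    for (U32 i = 0; i < num; i++) {
        F32 t = (F32)in[i] * scale;
        out[i] = (INT8)(t > 127 ? 127 : (t < -127 ? -127 : t));  // saturate, as the old branchy code did
    }
    return scale;  // the kernel stores this back through *inputScale
}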
-
 #ifdef _USE_INT8
 #include <string.h>
 #include "cpu/arm/int8/convolution_gemm.h"
 
-template<typename OT>
-EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale, TensorDesc filterDesc, const void* filter, F16* filterScale,
-    ConvolutionDesc convDesc, TensorDesc biasDesc, const void* bias, U32 tmpBytes, void* tmp, TensorDesc outputDesc,
-    void* output, F16* outputScale, ActivationDesc activationDesc)
+template <typename OT>
+EE convolution_gemm_A55(TensorDesc inputDesc,
+    const void *input,
+    F16 *inputScale,
+    TensorDesc filterDesc,
+    const void *filter,
+    F16 *filterScale,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc biasDesc,
+    const void *bias,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    F16 *outputScale,
+    ActivationParamSpec activationDesc)
 {
     UNUSED(biasDesc);
     UNUSED(tmpBytes);
@@ -32,14 +43,14 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale
     CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
     CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw));
     CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow));
-    U32 strideH = convDesc.stride_h;
-    U32 strideW = convDesc.stride_w;
-    U32 paddingT = convDesc.padding_top;
-    U32 paddingB = convDesc.padding_bottom;
-    U32 paddingL = convDesc.padding_left;
-    U32 paddingR = convDesc.padding_right;
-    U32 dilateH = convDesc.dilatedRate_h;
-    U32 dilateW = convDesc.dilatedRate_w;
+    U32 strideH = convParamSpec.stride_h;
+    U32 strideW = convParamSpec.stride_w;
+    U32 paddingT = convParamSpec.padding_top;
+    U32 paddingB = convParamSpec.padding_bottom;
+    U32 paddingL = convParamSpec.padding_left;
+    U32 paddingR = convParamSpec.padding_right;
+    U32 dilateH = convParamSpec.dilatedRate_h;
+    U32 dilateW = convParamSpec.dilatedRate_w;
 
     if (fdf != DF_NCHWN8C4) {
         return NOT_MATCH;
@@ -52,37 +63,38 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale
         scale_known_bool = 1;
     }
 
-    INT8* inArray = (INT8*)input; // It will be updated if there is quantization
-    INT8* filterArray = (INT8*)filter;
-    F16* outArray = (F16*)output;
-    F16* biasArray = (F16*)bias;
-    INT8* in_pad = (INT8*)tmp;
+    INT8 *inArray = (INT8 *)input;  // It will be updated if there is quantization
+    INT8 *filterArray = (INT8 *)filter;
+    F16 *outArray = (F16 *)output;
+    F16 *biasArray = (F16 *)bias;
+    INT8 *in_pad = (INT8 *)tmp;
 
-    // both input and output are stored with C8
+    // both input and output are stored with C8
     oc /= 8;
     ic /= 8;
     U32 ih_pad = ih + paddingT + paddingB;
     U32 iw_pad = iw + paddingL + paddingR;
-    I32 ohow = oh*ow;
-    U32 ihiw = ih_pad*iw_pad;
+    I32 ohow = oh * ow;
+    U32 ihiw = ih_pad * iw_pad;
 
-    I32* biasScaled = (I32*)(in_pad + ic*ihiw*8 + 12*fh*fw*ic*8); // Initialize
+    I32 *biasScaled = (I32 *)(in_pad + ic * ihiw * 8 + 12 * fh * fw * ic * 8);  // Initialize
 
-    //double start, end;
-    I32 max_i32[4] = {0}; // To record max I32 values
-    I32 min_i32[4] = {0}; // To record min I32 values
+    // double start, end;
+    I32 max_i32[4] = {0};  // To record max I32 values
+    I32 min_i32[4] = {0};  // To record min I32 values
 
-    for (U32 n = 0; n < in; n++) {// for each batch
+    for (U32 n = 0; n < in; n++) {  // for each batch
         F16 scale_i = 1.0;
 
         // quantize input if necessary
         if (idt == DT_F16) {
-            //start = get_current_time_int8();
-            F16* in = ((F16*)input) + n*ic*ih*iw*8;
-            inArray = in_pad + ic*ihiw*8 + 12*fh*fw*ic*8; // After the space for padding and packing
+            // start = get_current_time_int8();
+            F16 *in = ((F16 *)input) + n * ic * ih * iw * 8;
+            inArray = in_pad + ic *
ihiw * 8 + + 12 * fh * fw * ic * 8; // After the space for padding and packing - U32 numData = ic*ih*iw*8; + U32 numData = ic * ih * iw * 8; if (*inputScale > 0) { scale_i = *inputScale; } else { @@ -90,8 +102,8 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale float16x8_t max_v = temp_v; float16x8_t min_v = temp_v; - for (U32 i=8; i 0 + } else { // min > 0 scale_i = 127.0 / max; } } for (U32 i = 0; i < numData; i++) { F32 temp = in[i] * scale_i; - if (temp > 127) { - inArray[i] = 127; - } else if (temp < -127) { - inArray[i] = -127; - } else { - inArray[i] = temp; - } + inArray[i] = round_towards_zero(temp, (*inputScale) != scale_i); } *inputScale = scale_i; } else { @@ -143,13 +149,13 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale min_i32[i] = thresholdN; } } - - if (odt == DT_I8) { // Scale the bias + + if (odt == DT_I8) { // Scale the bias if (idt == DT_F16) { - biasScaled += ic * ih * iw * 8 / bytesOf(DT_I32); // After the quantized input + biasScaled += ic * ih * iw * 8 / bytesOf(DT_I32); // After the quantized input } F32 scale = (*inputScale) * (*filterScale); - for (U32 i=0; i NHWChw12c4 + im2col U32 in_h[12]; U32 in_w[12]; for (U32 i = 0; i < 12; i++) { - in_h[i] = ((hw+i)/ow)*strideH; - in_w[i] = ((hw+i)%ow)*strideW; + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; } - for (U32 c = 0; c < ic; c++) {// for each 8 channels + for (U32 c = 0; c < ic; c++) { // for each 8 channels for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - INT8 *in_hw12c8 = inArray_pad + c*ihiw*8 + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - - INT8 *in_0 = in_hw12c8 + in_h[0]*iw_pad*8 + in_w[0]*8; - INT8 *in_1 = in_hw12c8 + in_h[1]*iw_pad*8 + in_w[1]*8; - INT8 *in_2 = in_hw12c8 + in_h[2]*iw_pad*8 + in_w[2]*8; - INT8 *in_3 = in_hw12c8 + in_h[3]*iw_pad*8 + in_w[3]*8; - INT8 *in_4 = in_hw12c8 + in_h[4]*iw_pad*8 + in_w[4]*8; - INT8 *in_5 = in_hw12c8 + in_h[5]*iw_pad*8 + in_w[5]*8; - INT8 *in_6 = in_hw12c8 + in_h[6]*iw_pad*8 + in_w[6]*8; - INT8 *in_7 = in_hw12c8 + in_h[7]*iw_pad*8 + in_w[7]*8; - INT8 *in_8 = in_hw12c8 + in_h[8]*iw_pad*8 + in_w[8]*8; - INT8 *in_9 = in_hw12c8 + in_h[9]*iw_pad*8 + in_w[9]*8; - INT8 *in_10 = in_hw12c8 + in_h[10]*iw_pad*8 + in_w[10]*8; - INT8 *in_11 = in_hw12c8 + in_h[11]*iw_pad*8 + in_w[11]*8; - + INT8 *in_hw12c8 = inArray_pad + c * ihiw * 8 + + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + + INT8 *in_0 = in_hw12c8 + in_h[0] * iw_pad * 8 + in_w[0] * 8; + INT8 *in_1 = in_hw12c8 + in_h[1] * iw_pad * 8 + in_w[1] * 8; + INT8 *in_2 = in_hw12c8 + in_h[2] * iw_pad * 8 + in_w[2] * 8; + INT8 *in_3 = in_hw12c8 + in_h[3] * iw_pad * 8 + in_w[3] * 8; + INT8 *in_4 = in_hw12c8 + in_h[4] * iw_pad * 8 + in_w[4] * 8; + INT8 *in_5 = in_hw12c8 + in_h[5] * iw_pad * 8 + in_w[5] * 8; + INT8 *in_6 = in_hw12c8 + in_h[6] * iw_pad * 8 + in_w[6] * 8; + INT8 *in_7 = in_hw12c8 + in_h[7] * iw_pad * 8 + in_w[7] * 8; + INT8 *in_8 = in_hw12c8 + in_h[8] * iw_pad * 8 + in_w[8] * 8; + INT8 *in_9 = in_hw12c8 + in_h[9] * iw_pad * 8 + in_w[9] * 8; + INT8 *in_10 = in_hw12c8 + in_h[10] * iw_pad * 8 + in_w[10] * 8; + INT8 *in_11 = in_hw12c8 + in_h[11] * iw_pad * 8 + in_w[11] * 8; + // in_pack (tmp) is reused for each tile // NHWChw12c4 - INT8 *in_pack_0 = in_pack + c*fh*fw*12*8 + fh_idx*fw*12*4 + fw_idx*12*4; - INT8 *in_pack_1 = in_pack_0 + fh*fw*12*4; + INT8 *in_pack_0 = + in_pack + c * fh * fw * 12 * 8 + fh_idx * fw * 12 * 4 + fw_idx * 12 * 4; + INT8 *in_pack_1 = in_pack_0 
+ fh * fw * 12 * 4; __asm__ __volatile__( "ldr d0, [%[in_0]]\n" @@ -249,7 +258,7 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale "trn2 v21.4s, v0.4s, v1.4s\n" "ldr x10, [%[in_10]]\n" - + "ldr d9, [%[in_9]]\n" "trn1 v24.4s, v4.4s, v5.4s\n" @@ -261,61 +270,51 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale "str q24, [%[pack_0], #16]\n" "ins v9.d[1], x11\n" - + "trn1 v28.4s, v8.4s, v9.4s\n" "str q21, [%[pack_1]]\n" "trn2 v29.4s, v8.4s, v9.4s\n" "str q25, [%[pack_1], #16]\n" - + "str q28, [%[pack_0], #32]\n" "str q29, [%[pack_1], #32]\n" : - :[pack_0]"r"(in_pack_0), - [pack_1]"r"(in_pack_1), - [in_0]"r"(in_0), - [in_1]"r"(in_1), - [in_2]"r"(in_2), - [in_3]"r"(in_3), - [in_4]"r"(in_4), - [in_5]"r"(in_5), - [in_6]"r"(in_6), - [in_7]"r"(in_7), - [in_8]"r"(in_8), - [in_9]"r"(in_9), - [in_10]"r"(in_10), - [in_11]"r"(in_11) - :"memory", "cc", "v0", "v1", "v4", "v5", "v8", "v9", "v20", "v21", "v24", "v25", "v28", "v29", "x2", "x3", "x6", "x7", "x10", "x11" - ); + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), [in_0] "r"(in_0), + [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3), [in_4] "r"(in_4), + [in_5] "r"(in_5), [in_6] "r"(in_6), [in_7] "r"(in_7), [in_8] "r"(in_8), + [in_9] "r"(in_9), [in_10] "r"(in_10), [in_11] "r"(in_11) + : "memory", "cc", "v0", "v1", "v4", "v5", "v8", "v9", "v20", "v21", + "v24", "v25", "v28", "v29", "x2", "x3", "x6", "x7", "x10", "x11"); } } } // compute - for (U32 o = 0; o < oc; o++) {// 8 output channels at a time + for (U32 o = 0; o < oc; o++) { // 8 output channels at a time INT8 *in_hw0 = in_pack; - INT8 *f_o0c0 = filterArray + o*8*fh*fw*ic*8; - I32 *out_buf = biasScaled + oc*8 + n*oc*ohow*8 + o*ohow*8 + hw*8;; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; + INT8 *f_o0c0 = filterArray + o * 8 * fh * fw * ic * 8; + I32 *out_buf = biasScaled + oc * 8 + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; // bias F16 *b_0 = b0; I32 *b_0_s = b0_s; __asm__ __volatile__( "cbz %[out_f16], 8f\n" "eor v5.16b, v5.16b, v5.16b\n" - "ldr d1, [%[in_0]]\n" //in_0 + "ldr d1, [%[in_0]]\n" // in_0 "eor v6.16b, v6.16b, v6.16b\n" "ldr x1, [%[in_0], #8]\n" "eor v7.16b, v7.16b, v7.16b\n" "ins v1.d[1], x1\n" "eor v8.16b, v8.16b, v8.16b\n" - "ldr d0, [%[f_0]]\n" //f_0 + "ldr d0, [%[f_0]]\n" // f_0 "eor v9.16b, v9.16b, v9.16b\n" "ldr x2, [%[f_0], #8]\n" "eor v10.16b, v10.16b, v10.16b\n" "ins v0.d[1], x2\n" "eor v11.16b, v11.16b, v11.16b\n" - "ldr d3, [%[in_0], #16]\n" //in_1 + "ldr d3, [%[in_0], #16]\n" // in_1 "eor v12.16b, v12.16b, v12.16b\n" "ldr x3, [%[in_0], #24]\n" "eor v13.16b, v13.16b, v13.16b\n" @@ -323,7 +322,7 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale "eor v14.16b, v14.16b, v14.16b\n" "eor v15.16b, v15.16b, v15.16b\n" "eor v16.16b, v16.16b, v16.16b\n" - + "eor v17.16b, v17.16b, v17.16b\n" "eor v18.16b, v18.16b, v18.16b\n" "eor v19.16b, v19.16b, v19.16b\n" @@ -341,19 +340,19 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale "8:\n" "ldp q29, q30, [%[b_0_s]]\n" "mov v5.16b, v29.16b\n" - "ldr d1, [%[in_0]]\n" //in_0 + "ldr d1, [%[in_0]]\n" // in_0 "mov v7.16b, v29.16b\n" "ldr x1, [%[in_0], #8]\n" "mov v9.16b, v29.16b\n" "ins v1.d[1], x1\n" "mov v11.16b, v29.16b\n" - "ldr d0, [%[f_0]]\n" //f_0 + "ldr d0, [%[f_0]]\n" // f_0 "mov v13.16b, v29.16b\n" "ldr x2, [%[f_0], #8]\n" "mov v15.16b, v29.16b\n" "ins v0.d[1], x2\n" "mov v17.16b, v29.16b\n" - "ldr d3, 
[%[in_0], #16]\n" //in_1 + "ldr d3, [%[in_0], #16]\n" // in_1 "mov v19.16b, v29.16b\n" "ldr x3, [%[in_0], #24]\n" "mov v21.16b, v29.16b\n" @@ -376,13 +375,13 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale "mov v28.16b, v30.16b\n" "7:\n" - //give in address to x3 + // give in address to x3 "mov x3, %[in_0]\n" - //give f address to x0 + // give f address to x0 "mov x0, %[f_0]\n" - "mov x2, %[ic]\n" //ic_blk + "mov x2, %[ic]\n" // ic_blk "0:\n" "sdot v5.4s, v0.16b, v1.4b[0]\n" "ldr d2, [x3, 32]\n" @@ -424,7 +423,7 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale "sdot v12.4s, v29.16b, v1.4b[3]\n" "ins v0.d[1], x17\n" - "ins v3.d[1], x16\n" + "ins v3.d[1], x16\n" "sdot v22.4s, v29.16b, v2.4b[0]\n" "mov v1.16b, v30.16b\n" @@ -531,7 +530,7 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale "fadd v27.8h, v0.8h, v27.8h\n" "cbz %[conv_relu], 1f\n" - "eor v1.16b, v1.16b, v1.16b\n" //zero + "eor v1.16b, v1.16b, v1.16b\n" // zero "fmax v5.8h, v5.8h, v1.8h\n" "fmax v7.8h, v7.8h, v1.8h\n" "fmax v9.8h, v9.8h, v1.8h\n" @@ -564,7 +563,7 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale "ldr q0, [%[min]]\n" "ldr q30, [%[max]]\n" "cbz %[conv_relu], 2f\n" - "eor v1.16b, v1.16b, v1.16b\n" //zero + "eor v1.16b, v1.16b, v1.16b\n" // zero "smax v5.4s, v5.4s, v1.4s\n" "smax v6.4s, v6.4s, v1.4s\n" "smax v7.4s, v7.4s, v1.4s\n" @@ -695,7 +694,7 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale "smax v30.4s, v13.4s, v30.4s\n" "smin v0.4s, v13.4s, v0.4s\n" "str q13, [%[out_buf], 128]\n" - + "smax v30.4s, v14.4s, v30.4s\n" "smin v0.4s, v14.4s, v0.4s\n" "str q14, [%[out_buf], 144]\n" @@ -747,21 +746,15 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale "5:\n" : - :[out_0]"r"(out_o0hw0), - [out_buf]"r"(out_buf), - [in_0]"r"(in_hw0), - [f_0]"r"(f_o0c0), - [ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_0), - [b_0_s]"r"(b_0_s), - [factor]"r"(factor_v), - [max]"r"(max_i32), - [min]"r"(min_i32), - [conv_relu]"r"(conv_relu_bool), - [out_f16]"r"(out_f16_bool), - [scale_known]"r"(scale_known_bool) - :"memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", "x1", "x2", "x3","x17","x16" - ); + : [out_0] "r"(out_o0hw0), [out_buf] "r"(out_buf), [in_0] "r"(in_hw0), + [f_0] "r"(f_o0c0), [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_0), + [b_0_s] "r"(b_0_s), [factor] "r"(factor_v), [max] "r"(max_i32), + [min] "r"(min_i32), [conv_relu] "r"(conv_relu_bool), + [out_f16] "r"(out_f16_bool), [scale_known] "r"(scale_known_bool) + : "memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", "x1", "x2", + "x3", "x17", "x16"); b0 += 8; b0_s += 8; } @@ -770,72 +763,67 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale // ohow_reminder % 12 / 8 I32 ohow_s = (ohow / 12) * 12; I32 ohow_tail = ohow - ohow_s; - + if (ohow_tail >= 8) { I32 hw = ohow_s; F16 *b0 = biasArray; I32 *b0_s = biasScaled; - INT8 *in_pack = ((INT8*)tmp) + ic*ih_pad*iw_pad*8; + INT8 *in_pack = ((INT8 *)tmp) + ic * ih_pad * iw_pad * 8; // pack input // NCHWc8 => NHWChw8c4 + im2col U32 in_h[8]; U32 in_w[8]; for (U32 i = 0; i < 8; 
i++) { - in_h[i] = ((hw+i)/ow)*strideH; - in_w[i] = ((hw+i)%ow)*strideW; + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; } for (U32 c = 0; c < ic; c++) { for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - INT8 *in_hw8c8 = inArray_pad + c*ihiw*8 + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - INT8 *in_0 = in_hw8c8 + in_h[0]*iw_pad*8 + in_w[0]*8; - INT8 *in_1 = in_hw8c8 + in_h[1]*iw_pad*8 + in_w[1]*8; - INT8 *in_2 = in_hw8c8 + in_h[2]*iw_pad*8 + in_w[2]*8; - INT8 *in_3 = in_hw8c8 + in_h[3]*iw_pad*8 + in_w[3]*8; - INT8 *in_4 = in_hw8c8 + in_h[4]*iw_pad*8 + in_w[4]*8; - INT8 *in_5 = in_hw8c8 + in_h[5]*iw_pad*8 + in_w[5]*8; - INT8 *in_6 = in_hw8c8 + in_h[6]*iw_pad*8 + in_w[6]*8; - INT8 *in_7 = in_hw8c8 + in_h[7]*iw_pad*8 + in_w[7]*8; - INT8 *in_pack_0 = in_pack + c*fh*fw*8*8 + fh_idx*fw*8*4 + fw_idx*8*4; - INT8 *in_pack_1 = in_pack_0 + fh*fw*8*4; - - __asm__ __volatile__( - "ldr d0, [%[in_0]]\n" - "ldr x2, [%[in_2]]\n" - "ldr d1, [%[in_1]]\n" - "ldr x3, [%[in_3]]\n" - "ins v0.d[1], x2\n" - "ins v1.d[1], x3\n" - "ldr d4, [%[in_4]]\n" - "ldr x6, [%[in_6]]\n" - "trn1 v20.4s, v0.4s, v1.4s\n" - "trn2 v21.4s, v0.4s, v1.4s\n" - - "ldr d5, [%[in_5]]\n" - "ldr x7, [%[in_7]]\n" - "ins v4.d[1], x6\n" - "ins v5.d[1], x7\n" - - "str q20, [%[pack_0]]\n" - "trn1 v24.4s, v4.4s, v5.4s\n" - "trn2 v25.4s, v4.4s, v5.4s\n" - "str q21, [%[pack_1]]\n" - "str q24, [%[pack_0], #16]\n" - "str q25, [%[pack_1], #16]\n" - : - :[pack_0]"r"(in_pack_0), - [pack_1]"r"(in_pack_1), - [in_0]"r"(in_0), - [in_1]"r"(in_1), - [in_2]"r"(in_2), - [in_3]"r"(in_3), - [in_4]"r"(in_4), - [in_5]"r"(in_5), - [in_6]"r"(in_6), - [in_7]"r"(in_7) - :"memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", "v24", "v25", "x2", "x3", "x6", "x7" - ); + INT8 *in_hw8c8 = inArray_pad + c * ihiw * 8 + + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + INT8 *in_0 = in_hw8c8 + in_h[0] * iw_pad * 8 + in_w[0] * 8; + INT8 *in_1 = in_hw8c8 + in_h[1] * iw_pad * 8 + in_w[1] * 8; + INT8 *in_2 = in_hw8c8 + in_h[2] * iw_pad * 8 + in_w[2] * 8; + INT8 *in_3 = in_hw8c8 + in_h[3] * iw_pad * 8 + in_w[3] * 8; + INT8 *in_4 = in_hw8c8 + in_h[4] * iw_pad * 8 + in_w[4] * 8; + INT8 *in_5 = in_hw8c8 + in_h[5] * iw_pad * 8 + in_w[5] * 8; + INT8 *in_6 = in_hw8c8 + in_h[6] * iw_pad * 8 + in_w[6] * 8; + INT8 *in_7 = in_hw8c8 + in_h[7] * iw_pad * 8 + in_w[7] * 8; + INT8 *in_pack_0 = + in_pack + c * fh * fw * 8 * 8 + fh_idx * fw * 8 * 4 + fw_idx * 8 * 4; + INT8 *in_pack_1 = in_pack_0 + fh * fw * 8 * 4; + + __asm__ __volatile__("ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + "ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "ldr d4, [%[in_4]]\n" + "ldr x6, [%[in_6]]\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + + "ldr d5, [%[in_5]]\n" + "ldr x7, [%[in_7]]\n" + "ins v4.d[1], x6\n" + "ins v5.d[1], x7\n" + + "str q20, [%[pack_0]]\n" + "trn1 v24.4s, v4.4s, v5.4s\n" + "trn2 v25.4s, v4.4s, v5.4s\n" + "str q21, [%[pack_1]]\n" + "str q24, [%[pack_0], #16]\n" + "str q25, [%[pack_1], #16]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), + [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), + [in_3] "r"(in_3), [in_4] "r"(in_4), [in_5] "r"(in_5), + [in_6] "r"(in_6), [in_7] "r"(in_7) + : "memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", + "v24", "v25", "x2", "x3", "x6", "x7"); } } } @@ -843,22 +831,22 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale // compute for (U32 o = 0; o < oc; o++) { INT8 *in_hw0 
= in_pack; - INT8 *f_o0c0 = filterArray + o*8*fh*fw*ic*8; - I32 *out_buf = biasScaled + oc*8 + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; + INT8 *f_o0c0 = filterArray + o * 8 * fh * fw * ic * 8; + I32 *out_buf = biasScaled + oc * 8 + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; // bias F16 *b_0 = b0; I32 *b_0_s = b0_s; __asm__ __volatile__( "cbz %[out_f16], 8f\n" "eor v5.16b, v5.16b, v5.16b\n" - "ldr d1, [%[in_0]]\n" //in_0 + "ldr d1, [%[in_0]]\n" // in_0 "eor v6.16b, v6.16b, v6.16b\n" "ldr x1, [%[in_0], #8]\n" "eor v7.16b, v7.16b, v7.16b\n" "ins v1.d[1], x1\n" "eor v8.16b, v8.16b, v8.16b\n" - "ldr d0, [%[f_0]]\n" //f_0 + "ldr d0, [%[f_0]]\n" // f_0 "eor v9.16b, v9.16b, v9.16b\n" "ldr x2, [%[f_0], #8]\n" "eor v10.16b, v10.16b, v10.16b\n" @@ -878,13 +866,13 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale "8:\n" "ldp q29, q30, [%[b_0_s]]\n" "mov v5.16b, v29.16b\n" - "ldr d1, [%[in_0]]\n" //in_0 + "ldr d1, [%[in_0]]\n" // in_0 "mov v7.16b, v29.16b\n" "ldr x1, [%[in_0], #8]\n" "mov v9.16b, v29.16b\n" "ins v1.d[1], x1\n" "mov v11.16b, v29.16b\n" - "ldr d0, [%[f_0]]\n" //f_0 + "ldr d0, [%[f_0]]\n" // f_0 "mov v13.16b, v29.16b\n" "ldr x2, [%[f_0], #8]\n" "mov v15.16b, v29.16b\n" @@ -903,13 +891,13 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale "7:\n" - //give in address to x3 + // give in address to x3 "mov x3, %[in_0]\n" - //give f address to x0 + // give f address to x0 "mov x0, %[f_0]\n" - "mov x2, %[ic]\n" //ic_blk + "mov x2, %[ic]\n" // ic_blk "0:\n" "sdot v5.4s, v0.16b, v1.4b[0]\n" "ldr d3, [x3, 16]!\n" @@ -1015,7 +1003,7 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale "fadd v19.8h, v0.8h, v19.8h\n" "cbz %[conv_relu], 1f\n" - "eor v1.16b, v1.16b, v1.16b\n" //zero + "eor v1.16b, v1.16b, v1.16b\n" // zero "fmax v5.8h, v5.8h, v1.8h\n" "fmax v7.8h, v7.8h, v1.8h\n" "fmax v9.8h, v9.8h, v1.8h\n" @@ -1040,7 +1028,7 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale "ldr q0, [%[min]]\n" "ldr q30, [%[max]]\n" "cbz %[conv_relu], 2f\n" - "eor v1.16b, v1.16b, v1.16b\n" //zero + "eor v1.16b, v1.16b, v1.16b\n" // zero "smax v5.4s, v5.4s, v1.4s\n" "smax v6.4s, v6.4s, v1.4s\n" "smax v7.4s, v7.4s, v1.4s\n" @@ -1139,7 +1127,7 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale "smax v30.4s, v13.4s, v30.4s\n" "smin v0.4s, v13.4s, v0.4s\n" "str q13, [%[out_buf], 128]\n" - + "smax v30.4s, v14.4s, v30.4s\n" "smin v0.4s, v14.4s, v0.4s\n" "str q14, [%[out_buf], 144]\n" @@ -1166,21 +1154,14 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale "str q0, [%[min]]\n" "5:\n" : - :[out_0]"r"(out_o0hw0), - [out_buf]"r"(out_buf), - [in_0]"r"(in_hw0), - [f_0]"r"(f_o0c0), - [ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_0), - [b_0_s]"r"(b_0_s), - [factor]"r"(factor_v), - [max]"r"(max_i32), - [min]"r"(min_i32), - [conv_relu]"r"(conv_relu_bool), - [out_f16]"r"(out_f16_bool), - [scale_known]"r"(scale_known_bool) - :"memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v29", "v30", "x0", "x1", "x2", "x3","x17","x16" - ); + : [out_0] "r"(out_o0hw0), [out_buf] "r"(out_buf), [in_0] "r"(in_hw0), + [f_0] "r"(f_o0c0), [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_0), + [b_0_s] "r"(b_0_s), [factor] "r"(factor_v), [max] "r"(max_i32), + [min] 
"r"(min_i32), [conv_relu] "r"(conv_relu_bool), + [out_f16] "r"(out_f16_bool), [scale_known] "r"(scale_known_bool) + : "memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v29", "v30", + "x0", "x1", "x2", "x3", "x17", "x16"); b0 += 8; b0_s += 8; } @@ -1192,26 +1173,28 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale I32 hw = ohow_s; F16 *b0 = biasArray; I32 *b0_s = biasScaled; - INT8 *in_pack = ((INT8*)tmp) + ic*ih_pad*iw_pad*8; + INT8 *in_pack = ((INT8 *)tmp) + ic * ih_pad * iw_pad * 8; // pack input // NCHWc8 => NHWChw4c4 + im2col U32 in_h[4]; U32 in_w[4]; - for (U32 i=0; i<4; i++) { - in_h[i] = ((hw+i)/ow)*strideH; - in_w[i] = ((hw+i)%ow)*strideW; + for (U32 i = 0; i < 4; i++) { + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; } for (U32 c = 0; c < ic; c++) { for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - INT8 *in_hw4c8 = inArray_pad + c*ihiw*8 + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - INT8 *in_0 = in_hw4c8 + in_h[0]*iw_pad*8 + in_w[0]*8; - INT8 *in_1 = in_hw4c8 + in_h[1]*iw_pad*8 + in_w[1]*8; - INT8 *in_2 = in_hw4c8 + in_h[2]*iw_pad*8 + in_w[2]*8; - INT8 *in_3 = in_hw4c8 + in_h[3]*iw_pad*8 + in_w[3]*8; - INT8 *in_pack_0 = in_pack + c*fh*fw*4*8 + fh_idx*fw*4*4 + fw_idx*4*4; - INT8 *in_pack_1 = in_pack_0 + fh*fw*4*4; + INT8 *in_hw4c8 = inArray_pad + c * ihiw * 8 + + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + INT8 *in_0 = in_hw4c8 + in_h[0] * iw_pad * 8 + in_w[0] * 8; + INT8 *in_1 = in_hw4c8 + in_h[1] * iw_pad * 8 + in_w[1] * 8; + INT8 *in_2 = in_hw4c8 + in_h[2] * iw_pad * 8 + in_w[2] * 8; + INT8 *in_3 = in_hw4c8 + in_h[3] * iw_pad * 8 + in_w[3] * 8; + INT8 *in_pack_0 = + in_pack + c * fh * fw * 4 * 8 + fh_idx * fw * 4 * 4 + fw_idx * 4 * 4; + INT8 *in_pack_1 = in_pack_0 + fh * fw * 4 * 4; __asm__ __volatile__( "ldr d0, [%[in_0]]\n" @@ -1225,14 +1208,9 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale "str q20, [%[pack_0]]\n" "str q21, [%[pack_1]]\n" : - :[pack_0]"r"(in_pack_0), - [pack_1]"r"(in_pack_1), - [in_0]"r"(in_0), - [in_1]"r"(in_1), - [in_2]"r"(in_2), - [in_3]"r"(in_3) - :"memory", "cc", "v0", "v1", "v20", "v21", "x2", "x3" - ); + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), [in_0] "r"(in_0), + [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3) + : "memory", "cc", "v0", "v1", "v20", "v21", "x2", "x3"); } } } @@ -1240,23 +1218,23 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale // compute for (U32 o = 0; o < oc; o++) { INT8 *in_hw0 = in_pack; - INT8 *f_o0c0 = filterArray + o*8*fh*fw*ic*8; - I32 *out_buf = biasScaled + oc*8 + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; + INT8 *f_o0c0 = filterArray + o * 8 * fh * fw * ic * 8; + I32 *out_buf = biasScaled + oc * 8 + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; // bias F16 *b_0 = b0; I32 *b_0_s = b0_s; __asm__ __volatile__( "cbz %[out_f16], 8f\n" "eor v5.16b, v5.16b, v5.16b\n" - "ldr d1, [%[in_0]]\n" //in_0 + "ldr d1, [%[in_0]]\n" // in_0 "eor v6.16b, v6.16b, v6.16b\n" "ldr x1, [%[in_0], #8]\n" "eor v7.16b, v7.16b, v7.16b\n" "ins v1.d[1], x1\n" "eor v8.16b, v8.16b, v8.16b\n" - "ldr d0, [%[f_0]]\n" //f_0 - + "ldr d0, [%[f_0]]\n" // f_0 + "eor v9.16b, v9.16b, v9.16b\n" "ldr x2, [%[f_0], #8]\n" "eor v10.16b, v10.16b, v10.16b\n" @@ 
-1267,13 +1245,13 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale "8:\n" "ldp q29, q30, [%[b_0_s]]\n" - "ldr d1, [%[in_0]]\n" //in_0 + "ldr d1, [%[in_0]]\n" // in_0 "mov v5.16b, v29.16b\n" "ldr x1, [%[in_0], #8]\n" "mov v7.16b, v29.16b\n" "ins v1.d[1], x1\n" "mov v9.16b, v29.16b\n" - "ldr d0, [%[f_0]]\n" //f_0 + "ldr d0, [%[f_0]]\n" // f_0 "mov v11.16b, v29.16b\n" "ldr x2, [%[f_0], #8]\n" @@ -1285,13 +1263,13 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale "7:\n" - //give in address to x3 + // give in address to x3 "mov x3, %[in_0]\n" - //give f address to x0 + // give f address to x0 "mov x0, %[f_0]\n" - "mov x2, %[ic]\n" //ic_blk + "mov x2, %[ic]\n" // ic_blk "0:\n" "ldr d29, [x0, 16]\n" "ldr x17, [x0, 24]\n" @@ -1357,7 +1335,7 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale "fadd v11.8h, v0.8h, v11.8h\n" "cbz %[conv_relu], 1f\n" - "eor v1.16b, v1.16b, v1.16b\n" //zero + "eor v1.16b, v1.16b, v1.16b\n" // zero "fmax v5.8h, v5.8h, v1.8h\n" "fmax v7.8h, v7.8h, v1.8h\n" "fmax v9.8h, v9.8h, v1.8h\n" @@ -1374,7 +1352,7 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale "ldr q0, [%[min]]\n" "ldr q30, [%[max]]\n" "cbz %[conv_relu], 2f\n" - "eor v1.16b, v1.16b, v1.16b\n" //zero + "eor v1.16b, v1.16b, v1.16b\n" // zero "smax v5.4s, v5.4s, v1.4s\n" "smax v6.4s, v6.4s, v1.4s\n" "smax v7.4s, v7.4s, v1.4s\n" @@ -1443,21 +1421,13 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale "str q0, [%[min]]\n" "5:\n" : - :[out_0]"r"(out_o0hw0), - [out_buf]"r"(out_buf), - [in_0]"r"(in_hw0), - [f_0]"r"(f_o0c0), - [ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_0), - [b_0_s]"r"(b_0_s), - [factor]"r"(factor_v), - [max]"r"(max_i32), - [min]"r"(min_i32), - [conv_relu]"r"(conv_relu_bool), - [out_f16]"r"(out_f16_bool), - [scale_known]"r"(scale_known_bool) - :"memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v29", "x0", "x1", "x2", "x3","x17","x16" - ); + : [out_0] "r"(out_o0hw0), [out_buf] "r"(out_buf), [in_0] "r"(in_hw0), + [f_0] "r"(f_o0c0), [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_0), + [b_0_s] "r"(b_0_s), [factor] "r"(factor_v), [max] "r"(max_i32), + [min] "r"(min_i32), [conv_relu] "r"(conv_relu_bool), + [out_f16] "r"(out_f16_bool), [scale_known] "r"(scale_known_bool) + : "memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v29", "x0", "x1", "x2", "x3", "x17", "x16"); b0 += 8; b0_s += 8; } @@ -1467,21 +1437,22 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale for (I32 hw = ohow_s; hw < ohow; hw++) { F16 *b0 = biasArray; I32 *b0_s = biasScaled; - INT8 *in_pack = ((INT8*)tmp) + ic*ih_pad*iw_pad*8; + INT8 *in_pack = ((INT8 *)tmp) + ic * ih_pad * iw_pad * 8; // pack input // NCHWc8 => NHWChw1c4 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; for (U32 c = 0; c < ic; c++) { for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - INT8 *in_hw1c8 = inArray_pad + c*ihiw*8 + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - INT8 *in_0 = in_hw1c8 + in_h_0*iw_pad*8 + in_w_0*8; - INT8 *in_pack_0 = in_pack + c*fh*fw*8 + fh_idx*fw*4 + fw_idx*4; - INT8 *in_pack_1 = in_pack_0 + fh*fw*4; - - memcpy(in_pack_0, in_0, 4*bytesOf(DT_I8)); - memcpy(in_pack_1, in_0+4, 4*bytesOf(DT_I8)); + INT8 *in_hw1c8 = inArray_pad + c * ihiw * 8 
+ + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + INT8 *in_0 = in_hw1c8 + in_h_0 * iw_pad * 8 + in_w_0 * 8; + INT8 *in_pack_0 = in_pack + c * fh * fw * 8 + fh_idx * fw * 4 + fw_idx * 4; + INT8 *in_pack_1 = in_pack_0 + fh * fw * 4; + + memcpy(in_pack_0, in_0, 4 * bytesOf(DT_I8)); + memcpy(in_pack_1, in_0 + 4, 4 * bytesOf(DT_I8)); } } } @@ -1489,27 +1460,27 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale // compute for (U32 o = 0; o < oc; o++) { INT8 *in_hw = in_pack; - INT8 *f_o = filterArray + o*8*fh*fw*ic*8; - I32 *out_buf = biasScaled + oc*8 + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - + INT8 *f_o = filterArray + o * 8 * fh * fw * ic * 8; + I32 *out_buf = biasScaled + oc * 8 + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + int32x4_t res[2] = {0}; if (out_f16_bool == 0) { res[0] = vld1q_s32(b0_s); res[1] = vld1q_s32(b0_s + 4); } - for(U32 c=0; c 0) { factor = 127 * 16777216 / max; scale_o = 127.0 / max; @@ -1604,21 +1575,45 @@ EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale } *outputScale = (*inputScale) * (*filterScale) * scale_o; } - - U32 num_v = oc * ohow * 2; // Number of q-form vectors - I32 *out_buf = biasScaled + oc*8; - INT8 *out_q = (INT8*)output; + + U32 num_v = oc * ohow * 2; // Number of q-form vectors + I32 *out_buf = biasScaled + oc * 8; + INT8 *out_q = (INT8 *)output; ret = quantize_I32(num_v, out_buf, factor, scale_o, out_q); } return ret; } -template EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale, TensorDesc filterDesc, const void* filter, F16* filterScale, - ConvolutionDesc convDesc, TensorDesc biasDesc, const void* bias, U32 tmpBytes, void* tmp, TensorDesc outputDesc, - void* output, F16* outputScale, ActivationDesc activationDesc); - -template EE convolution_gemm_A55(TensorDesc inputDesc, const void* input, F16* inputScale, TensorDesc filterDesc, const void* filter, F16* filterScale, - ConvolutionDesc convDesc, TensorDesc biasDesc, const void* bias, U32 tmpBytes, void* tmp, TensorDesc outputDesc, - void* output, F16* outputScale, ActivationDesc activationDesc); +template EE convolution_gemm_A55(TensorDesc inputDesc, + const void *input, + F16 *inputScale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec activationDesc); + +template EE convolution_gemm_A55(TensorDesc inputDesc, + const void *input, + F16 *inputScale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec activationDesc); #endif diff --git a/tensor_computing/src/cpu/arm/int8/convolution_gemm_A76.cpp b/compute/tensor/src/cpu/arm/int8/convolution_gemm_A76.cpp similarity index 75% rename from tensor_computing/src/cpu/arm/int8/convolution_gemm_A76.cpp rename to compute/tensor/src/cpu/arm/int8/convolution_gemm_A76.cpp index b022870e..4fe4e040 100644 --- a/tensor_computing/src/cpu/arm/int8/convolution_gemm_A76.cpp +++ b/compute/tensor/src/cpu/arm/int8/convolution_gemm_A76.cpp @@ -1,25 +1,36 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
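// Both gemm kernels fold the final float multiply into integer arithmetic before
// calling quantize_I32: with 16777216 == 2^24, factor = 127 * 2^24 / max lets an I32
// accumulator v be requantized as (v * factor) >> 24 ~= v * 127 / max, and the sri
// sequences in quantize_I32 stitch exactly that top byte out of each 32-bit product.
// A scalar reference of the same arithmetic (function name illustrative):
static inline INT8 requant_acc(I32 v, I32 maxAbs)
{
    I32 factor = 127 * 16777216 / maxAbs;    // 127 * 2^24 / max; fits in I32
    return (INT8)(((I64)v * factor) >> 24);  // top byte of the scaled product, ~= v * 127 / max
}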
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifdef _USE_INT8 #include #include "cpu/arm/int8/convolution_gemm.h" -template -EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale, TensorDesc filterDesc, const void* filter, F16* filterScale, - ConvolutionDesc convDesc, TensorDesc biasDesc, const void* bias, U32 tmpBytes, void* tmp, TensorDesc outputDesc, - void* output, F16* outputScale, ActivationDesc activationDesc) +template +EE convolution_gemm_A76(TensorDesc inputDesc, + const void *input, + F16 *inputScale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec activationDesc) { UNUSED(biasDesc); UNUSED(tmpBytes); @@ -32,14 +43,14 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 strideH = convDesc.stride_h; - U32 strideW = convDesc.stride_w; - U32 paddingT = convDesc.padding_top; - U32 paddingB = convDesc.padding_bottom; - U32 paddingL = convDesc.padding_left; - U32 paddingR = convDesc.padding_right; - U32 dilateH = convDesc.dilatedRate_h; - U32 dilateW = convDesc.dilatedRate_w; + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; if (fdf != DF_NCHWN8C4) 
{ return NOT_MATCH; @@ -52,37 +63,38 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale scale_known_bool = 1; } - INT8* inArray = (INT8*)input; // It will be updated if there is quantization - INT8* filterArray = (INT8*)filter; - F16* outArray = (F16*)output; - F16* biasArray = (F16*)bias; - INT8* in_pad = (INT8*)tmp; + INT8 *inArray = (INT8 *)input; // It will be updated if there is quantization + INT8 *filterArray = (INT8 *)filter; + F16 *outArray = (F16 *)output; + F16 *biasArray = (F16 *)bias; + INT8 *in_pad = (INT8 *)tmp; - // both input and output are stored with C8 + // both input and output are stored with C8 oc /= 8; ic /= 8; U32 ih_pad = ih + paddingT + paddingB; U32 iw_pad = iw + paddingL + paddingR; - I32 ohow = oh*ow; - U32 ihiw = ih_pad*iw_pad; + I32 ohow = oh * ow; + U32 ihiw = ih_pad * iw_pad; - I32* biasScaled = (I32*)(in_pad + ic*ihiw*8 + 12*fh*fw*ic*8); // Initialize + I32 *biasScaled = (I32 *)(in_pad + ic * ihiw * 8 + 12 * fh * fw * ic * 8); // Initialize - //double start, end; - I32 max_i32[4] = {0}; // To record max I32 values - I32 min_i32[4] = {0}; // To record min I32 values + // double start, end; + I32 max_i32[4] = {0}; // To record max I32 values + I32 min_i32[4] = {0}; // To record min I32 values - for (U32 n = 0; n < in; n++) {// for each batch + for (U32 n = 0; n < in; n++) { // for each batch F16 scale_i = 1.0; // quantize input if necessary if (idt == DT_F16) { - //start = get_current_time_int8(); - F16* in = ((F16*)input) + n*ic*ih*iw*8; - inArray = in_pad + ic*ihiw*8 + 12*fh*fw*ic*8; // After the space for padding and packing + // start = get_current_time_int8(); + F16 *in = ((F16 *)input) + n * ic * ih * iw * 8; + inArray = in_pad + ic * ihiw * 8 + + 12 * fh * fw * ic * 8; // After the space for padding and packing - U32 numData = ic*ih*iw*8; + U32 numData = ic * ih * iw * 8; if (*inputScale > 0) { scale_i = *inputScale; } else { @@ -90,8 +102,8 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale float16x8_t max_v = temp_v; float16x8_t min_v = temp_v; - for (U32 i=8; i 0 + } else { // min > 0 scale_i = 127.0 / max; } } for (U32 i = 0; i < numData; i++) { F32 temp = in[i] * scale_i; - if (temp > 127) { - inArray[i] = 127; - } else if (temp < -127) { - inArray[i] = -127; - } else { - inArray[i] = temp; - } + inArray[i] = round_towards_zero(temp, (*inputScale) != scale_i); } *inputScale = scale_i; } else { @@ -143,87 +149,90 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale min_i32[i] = thresholdN; } } - - if (odt == DT_I8) { // Scale the bias + + if (odt == DT_I8) { // Scale the bias if (idt == DT_F16) { biasScaled += ic * ih * iw * 8 / bytesOf(DT_I32); // After the quantized input } F32 scale = (*inputScale) * (*filterScale); - for (U32 i=0; i NHWChw12c4 + im2col U32 in_h[12]; U32 in_w[12]; for (U32 i = 0; i < 12; i++) { - in_h[i] = ((hw+i)/ow)*strideH; - in_w[i] = ((hw+i)%ow)*strideW; + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; } - for (U32 c = 0; c < ic; c++) {// for each 8 channels + for (U32 c = 0; c < ic; c++) { // for each 8 channels for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - INT8 *in_hw12c8 = inArray_pad + c*ihiw*8 + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - - INT8 *in_0 = in_hw12c8 + in_h[0]*iw_pad*8 + in_w[0]*8; - INT8 *in_1 = in_hw12c8 + in_h[1]*iw_pad*8 + in_w[1]*8; - INT8 *in_2 = in_hw12c8 + in_h[2]*iw_pad*8 + in_w[2]*8; - INT8 *in_3 = in_hw12c8 + 
in_h[3]*iw_pad*8 + in_w[3]*8; - INT8 *in_4 = in_hw12c8 + in_h[4]*iw_pad*8 + in_w[4]*8; - INT8 *in_5 = in_hw12c8 + in_h[5]*iw_pad*8 + in_w[5]*8; - INT8 *in_6 = in_hw12c8 + in_h[6]*iw_pad*8 + in_w[6]*8; - INT8 *in_7 = in_hw12c8 + in_h[7]*iw_pad*8 + in_w[7]*8; - INT8 *in_8 = in_hw12c8 + in_h[8]*iw_pad*8 + in_w[8]*8; - INT8 *in_9 = in_hw12c8 + in_h[9]*iw_pad*8 + in_w[9]*8; - INT8 *in_10 = in_hw12c8 + in_h[10]*iw_pad*8 + in_w[10]*8; - INT8 *in_11 = in_hw12c8 + in_h[11]*iw_pad*8 + in_w[11]*8; - + INT8 *in_hw12c8 = inArray_pad + c * ihiw * 8 + + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + + INT8 *in_0 = in_hw12c8 + in_h[0] * iw_pad * 8 + in_w[0] * 8; + INT8 *in_1 = in_hw12c8 + in_h[1] * iw_pad * 8 + in_w[1] * 8; + INT8 *in_2 = in_hw12c8 + in_h[2] * iw_pad * 8 + in_w[2] * 8; + INT8 *in_3 = in_hw12c8 + in_h[3] * iw_pad * 8 + in_w[3] * 8; + INT8 *in_4 = in_hw12c8 + in_h[4] * iw_pad * 8 + in_w[4] * 8; + INT8 *in_5 = in_hw12c8 + in_h[5] * iw_pad * 8 + in_w[5] * 8; + INT8 *in_6 = in_hw12c8 + in_h[6] * iw_pad * 8 + in_w[6] * 8; + INT8 *in_7 = in_hw12c8 + in_h[7] * iw_pad * 8 + in_w[7] * 8; + INT8 *in_8 = in_hw12c8 + in_h[8] * iw_pad * 8 + in_w[8] * 8; + INT8 *in_9 = in_hw12c8 + in_h[9] * iw_pad * 8 + in_w[9] * 8; + INT8 *in_10 = in_hw12c8 + in_h[10] * iw_pad * 8 + in_w[10] * 8; + INT8 *in_11 = in_hw12c8 + in_h[11] * iw_pad * 8 + in_w[11] * 8; + // in_pack (tmp) is reused for each tile // NHWChw12c4 - INT8 *in_pack_0 = in_pack + c*fh*fw*12*8 + fh_idx*fw*12*4 + fw_idx*12*4; - INT8 *in_pack_1 = in_pack_0 + fh*fw*12*4; + INT8 *in_pack_0 = + in_pack + c * fh * fw * 12 * 8 + fh_idx * fw * 12 * 4 + fw_idx * 12 * 4; + INT8 *in_pack_1 = in_pack_0 + fh * fw * 12 * 4; __asm__ __volatile__( "ldr d0, [%[in_0]]\n" @@ -249,7 +258,7 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale "trn2 v21.4s, v0.4s, v1.4s\n" "ldr x10, [%[in_10]]\n" - + "ldr d9, [%[in_9]]\n" "trn1 v24.4s, v4.4s, v5.4s\n" @@ -261,63 +270,53 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale "str q24, [%[pack_0], #16]\n" "ins v9.d[1], x11\n" - + "trn1 v28.4s, v8.4s, v9.4s\n" "str q21, [%[pack_1]]\n" "trn2 v29.4s, v8.4s, v9.4s\n" "str q25, [%[pack_1], #16]\n" - + "str q28, [%[pack_0], #32]\n" "str q29, [%[pack_1], #32]\n" : - :[pack_0]"r"(in_pack_0), - [pack_1]"r"(in_pack_1), - [in_0]"r"(in_0), - [in_1]"r"(in_1), - [in_2]"r"(in_2), - [in_3]"r"(in_3), - [in_4]"r"(in_4), - [in_5]"r"(in_5), - [in_6]"r"(in_6), - [in_7]"r"(in_7), - [in_8]"r"(in_8), - [in_9]"r"(in_9), - [in_10]"r"(in_10), - [in_11]"r"(in_11) - :"memory", "cc", "v0", "v1", "v4", "v5", "v8", "v9", "v20", "v21", "v24", "v25", "v28", "v29", "x2", "x3", "x6", "x7", "x10", "x11" - ); + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), [in_0] "r"(in_0), + [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3), [in_4] "r"(in_4), + [in_5] "r"(in_5), [in_6] "r"(in_6), [in_7] "r"(in_7), [in_8] "r"(in_8), + [in_9] "r"(in_9), [in_10] "r"(in_10), [in_11] "r"(in_11) + : "memory", "cc", "v0", "v1", "v4", "v5", "v8", "v9", "v20", "v21", + "v24", "v25", "v28", "v29", "x2", "x3", "x6", "x7", "x10", "x11"); } } } // compute - for (U32 o = 0; o < oc; o++) {// 8 output channels at a time + for (U32 o = 0; o < oc; o++) { // 8 output channels at a time INT8 *in_hw0 = in_pack; - INT8 *f_o0c0 = filterArray + o*8*fh*fw*ic*8; - I32 *out_buf = biasScaled + oc*8 + n*oc*ohow*8 + o*ohow*8 + hw*8;; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; + INT8 *f_o0c0 = filterArray + o * 8 * fh * fw * ic * 8; + I32 *out_buf = biasScaled + oc 
* 8 + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; // bias F16 *b_0 = b0; I32 *b_0_s = b0_s; __asm__ __volatile__( "cbz %[out_f16], 8f\n" "eor v5.16b, v5.16b, v5.16b\n" - "ldr q1, [%[in_0]]\n" //in_0 + "ldr q1, [%[in_0]]\n" // in_0 "eor v6.16b, v6.16b, v6.16b\n" "eor v7.16b, v7.16b, v7.16b\n" "eor v8.16b, v8.16b, v8.16b\n" - "ldr q0, [%[f_0]]\n" //f_0 + "ldr q0, [%[f_0]]\n" // f_0 "eor v9.16b, v9.16b, v9.16b\n" "eor v10.16b, v10.16b, v10.16b\n" "eor v11.16b, v11.16b, v11.16b\n" - "ldr q3, [%[in_0], #16]\n" //in_1 + "ldr q3, [%[in_0], #16]\n" // in_1 "eor v12.16b, v12.16b, v12.16b\n" "eor v13.16b, v13.16b, v13.16b\n" "eor v14.16b, v14.16b, v14.16b\n" "eor v15.16b, v15.16b, v15.16b\n" "eor v16.16b, v16.16b, v16.16b\n" - + "eor v17.16b, v17.16b, v17.16b\n" "eor v18.16b, v18.16b, v18.16b\n" "eor v19.16b, v19.16b, v19.16b\n" @@ -335,15 +334,15 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale "8:\n" "ldp q29, q30, [%[b_0_s]]\n" "mov v5.16b, v29.16b\n" - "ldr q1, [%[in_0]]\n" //in_0 + "ldr q1, [%[in_0]]\n" // in_0 "mov v7.16b, v29.16b\n" "mov v9.16b, v29.16b\n" "mov v11.16b, v29.16b\n" - "ldr q0, [%[f_0]]\n" //f_0 + "ldr q0, [%[f_0]]\n" // f_0 "mov v13.16b, v29.16b\n" "mov v15.16b, v29.16b\n" "mov v17.16b, v29.16b\n" - "ldr q3, [%[in_0], #16]\n" //in_1 + "ldr q3, [%[in_0], #16]\n" // in_1 "mov v19.16b, v29.16b\n" "mov v21.16b, v29.16b\n" "mov v23.16b, v29.16b\n" @@ -364,13 +363,13 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale "mov v28.16b, v30.16b\n" "7:\n" - //give in address to x3 + // give in address to x3 "mov x3, %[in_0]\n" - //give f address to x0 + // give f address to x0 "mov x0, %[f_0]\n" - "mov x2, %[ic]\n" //ic_blk + "mov x2, %[ic]\n" // ic_blk "0:\n" "ldr q2, [x3, 32]\n" "ldr q29, [x0, 16]\n" @@ -400,7 +399,7 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale "sdot v8.4s, v29.16b, v1.4b[1]\n" "sdot v10.4s, v29.16b, v1.4b[2]\n" "sdot v12.4s, v29.16b, v1.4b[3]\n" - + "ldr q1, [x3, 48]!\n" "ldr q3, [x3, 16]\n" "sdot v22.4s, v29.16b, v2.4b[0]\n" @@ -503,7 +502,7 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale "fadd v27.8h, v0.8h, v27.8h\n" "cbz %[conv_relu], 1f\n" - "eor v1.16b, v1.16b, v1.16b\n" //zero + "eor v1.16b, v1.16b, v1.16b\n" // zero "fmax v5.8h, v5.8h, v1.8h\n" "fmax v7.8h, v7.8h, v1.8h\n" "fmax v9.8h, v9.8h, v1.8h\n" @@ -536,7 +535,7 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale "ldr q0, [%[min]]\n" "ldr q30, [%[max]]\n" "cbz %[conv_relu], 2f\n" - "eor v1.16b, v1.16b, v1.16b\n" //zero + "eor v1.16b, v1.16b, v1.16b\n" // zero "smax v5.4s, v5.4s, v1.4s\n" "smax v6.4s, v6.4s, v1.4s\n" "smax v7.4s, v7.4s, v1.4s\n" @@ -667,7 +666,7 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale "smax v30.4s, v13.4s, v30.4s\n" "smin v0.4s, v13.4s, v0.4s\n" "str q13, [%[out_buf], 128]\n" - + "smax v30.4s, v14.4s, v30.4s\n" "smin v0.4s, v14.4s, v0.4s\n" "str q14, [%[out_buf], 144]\n" @@ -719,21 +718,15 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale "5:\n" : - :[out_0]"r"(out_o0hw0), - [out_buf]"r"(out_buf), - [in_0]"r"(in_hw0), - [f_0]"r"(f_o0c0), - [ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_0), - [b_0_s]"r"(b_0_s), - [factor]"r"(factor_v), - [max]"r"(max_i32), - [min]"r"(min_i32), - [conv_relu]"r"(conv_relu_bool), - [out_f16]"r"(out_f16_bool), - [scale_known]"r"(scale_known_bool) - 
:"memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", "x1", "x2", "x3","x17","x16" - ); + : [out_0] "r"(out_o0hw0), [out_buf] "r"(out_buf), [in_0] "r"(in_hw0), + [f_0] "r"(f_o0c0), [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_0), + [b_0_s] "r"(b_0_s), [factor] "r"(factor_v), [max] "r"(max_i32), + [min] "r"(min_i32), [conv_relu] "r"(conv_relu_bool), + [out_f16] "r"(out_f16_bool), [scale_known] "r"(scale_known_bool) + : "memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", "x1", "x2", + "x3", "x17", "x16"); b0 += 8; b0_s += 8; } @@ -742,72 +735,67 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale // ohow_reminder % 12 / 8 I32 ohow_s = (ohow / 12) * 12; I32 ohow_tail = ohow - ohow_s; - + if (ohow_tail >= 8) { I32 hw = ohow_s; F16 *b0 = biasArray; I32 *b0_s = biasScaled; - INT8 *in_pack = ((INT8*)tmp) + ic*ih_pad*iw_pad*8; + INT8 *in_pack = ((INT8 *)tmp) + ic * ih_pad * iw_pad * 8; // pack input // NCHWc8 => NHWChw8c4 + im2col U32 in_h[8]; U32 in_w[8]; for (U32 i = 0; i < 8; i++) { - in_h[i] = ((hw+i)/ow)*strideH; - in_w[i] = ((hw+i)%ow)*strideW; + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; } for (U32 c = 0; c < ic; c++) { for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - INT8 *in_hw8c8 = inArray_pad + c*ihiw*8 + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - INT8 *in_0 = in_hw8c8 + in_h[0]*iw_pad*8 + in_w[0]*8; - INT8 *in_1 = in_hw8c8 + in_h[1]*iw_pad*8 + in_w[1]*8; - INT8 *in_2 = in_hw8c8 + in_h[2]*iw_pad*8 + in_w[2]*8; - INT8 *in_3 = in_hw8c8 + in_h[3]*iw_pad*8 + in_w[3]*8; - INT8 *in_4 = in_hw8c8 + in_h[4]*iw_pad*8 + in_w[4]*8; - INT8 *in_5 = in_hw8c8 + in_h[5]*iw_pad*8 + in_w[5]*8; - INT8 *in_6 = in_hw8c8 + in_h[6]*iw_pad*8 + in_w[6]*8; - INT8 *in_7 = in_hw8c8 + in_h[7]*iw_pad*8 + in_w[7]*8; - INT8 *in_pack_0 = in_pack + c*fh*fw*8*8 + fh_idx*fw*8*4 + fw_idx*8*4; - INT8 *in_pack_1 = in_pack_0 + fh*fw*8*4; - - __asm__ __volatile__( - "ldr d0, [%[in_0]]\n" - "ldr x2, [%[in_2]]\n" - "ldr d1, [%[in_1]]\n" - "ldr x3, [%[in_3]]\n" - "ins v0.d[1], x2\n" - "ins v1.d[1], x3\n" - "ldr d4, [%[in_4]]\n" - "ldr x6, [%[in_6]]\n" - "trn1 v20.4s, v0.4s, v1.4s\n" - "trn2 v21.4s, v0.4s, v1.4s\n" - - "ldr d5, [%[in_5]]\n" - "ldr x7, [%[in_7]]\n" - "ins v4.d[1], x6\n" - "ins v5.d[1], x7\n" - - "str q20, [%[pack_0]]\n" - "trn1 v24.4s, v4.4s, v5.4s\n" - "trn2 v25.4s, v4.4s, v5.4s\n" - "str q21, [%[pack_1]]\n" - "str q24, [%[pack_0], #16]\n" - "str q25, [%[pack_1], #16]\n" - : - :[pack_0]"r"(in_pack_0), - [pack_1]"r"(in_pack_1), - [in_0]"r"(in_0), - [in_1]"r"(in_1), - [in_2]"r"(in_2), - [in_3]"r"(in_3), - [in_4]"r"(in_4), - [in_5]"r"(in_5), - [in_6]"r"(in_6), - [in_7]"r"(in_7) - :"memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", "v24", "v25", "x2", "x3", "x6", "x7" - ); + INT8 *in_hw8c8 = inArray_pad + c * ihiw * 8 + + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + INT8 *in_0 = in_hw8c8 + in_h[0] * iw_pad * 8 + in_w[0] * 8; + INT8 *in_1 = in_hw8c8 + in_h[1] * iw_pad * 8 + in_w[1] * 8; + INT8 *in_2 = in_hw8c8 + in_h[2] * iw_pad * 8 + in_w[2] * 8; + INT8 *in_3 = in_hw8c8 + in_h[3] * iw_pad * 8 + in_w[3] * 8; + INT8 *in_4 = in_hw8c8 + in_h[4] * iw_pad * 8 + 
in_w[4] * 8; + INT8 *in_5 = in_hw8c8 + in_h[5] * iw_pad * 8 + in_w[5] * 8; + INT8 *in_6 = in_hw8c8 + in_h[6] * iw_pad * 8 + in_w[6] * 8; + INT8 *in_7 = in_hw8c8 + in_h[7] * iw_pad * 8 + in_w[7] * 8; + INT8 *in_pack_0 = + in_pack + c * fh * fw * 8 * 8 + fh_idx * fw * 8 * 4 + fw_idx * 8 * 4; + INT8 *in_pack_1 = in_pack_0 + fh * fw * 8 * 4; + + __asm__ __volatile__("ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + "ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "ldr d4, [%[in_4]]\n" + "ldr x6, [%[in_6]]\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + + "ldr d5, [%[in_5]]\n" + "ldr x7, [%[in_7]]\n" + "ins v4.d[1], x6\n" + "ins v5.d[1], x7\n" + + "str q20, [%[pack_0]]\n" + "trn1 v24.4s, v4.4s, v5.4s\n" + "trn2 v25.4s, v4.4s, v5.4s\n" + "str q21, [%[pack_1]]\n" + "str q24, [%[pack_0], #16]\n" + "str q25, [%[pack_1], #16]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), + [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), + [in_3] "r"(in_3), [in_4] "r"(in_4), [in_5] "r"(in_5), + [in_6] "r"(in_6), [in_7] "r"(in_7) + : "memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", + "v24", "v25", "x2", "x3", "x6", "x7"); } } } @@ -815,20 +803,20 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale // compute for (U32 o = 0; o < oc; o++) { INT8 *in_hw0 = in_pack; - INT8 *f_o0c0 = filterArray + o*8*fh*fw*ic*8; - I32 *out_buf = biasScaled + oc*8 + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; + INT8 *f_o0c0 = filterArray + o * 8 * fh * fw * ic * 8; + I32 *out_buf = biasScaled + oc * 8 + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; // bias F16 *b_0 = b0; I32 *b_0_s = b0_s; __asm__ __volatile__( "cbz %[out_f16], 8f\n" "eor v5.16b, v5.16b, v5.16b\n" - "ldr q1, [%[in_0]]\n" //in_0 + "ldr q1, [%[in_0]]\n" // in_0 "eor v6.16b, v6.16b, v6.16b\n" "eor v7.16b, v7.16b, v7.16b\n" "eor v8.16b, v8.16b, v8.16b\n" - "ldr q0, [%[f_0]]\n" //f_0 + "ldr q0, [%[f_0]]\n" // f_0 "eor v9.16b, v9.16b, v9.16b\n" "eor v10.16b, v10.16b, v10.16b\n" "eor v11.16b, v11.16b, v11.16b\n" @@ -845,8 +833,8 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale "8:\n" "ldp q29, q30, [%[b_0_s]]\n" - "ldr q1, [%[in_0]]\n" //in_0 - "ldr q0, [%[f_0]]\n" //f_0 + "ldr q1, [%[in_0]]\n" // in_0 + "ldr q0, [%[f_0]]\n" // f_0 "mov v5.16b, v29.16b\n" "mov v7.16b, v29.16b\n" "mov v9.16b, v29.16b\n" @@ -867,13 +855,13 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale "7:\n" - //give in address to x3 + // give in address to x3 "mov x3, %[in_0]\n" - //give f address to x0 + // give f address to x0 "mov x0, %[f_0]\n" - "mov x2, %[ic]\n" //ic_blk + "mov x2, %[ic]\n" // ic_blk "0:\n" "ldr q3, [x3, 16]!\n" "ldr q29, [x0, 16]\n" @@ -889,7 +877,7 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale "sdot v6.4s, v29.16b, v1.4b[0]\n" "sdot v8.4s, v29.16b, v1.4b[1]\n" - + "sdot v10.4s, v29.16b, v1.4b[2]\n" "sdot v12.4s, v29.16b, v1.4b[3]\n" @@ -967,7 +955,7 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale "fadd v19.8h, v0.8h, v19.8h\n" "cbz %[conv_relu], 1f\n" - "eor v1.16b, v1.16b, v1.16b\n" //zero + "eor v1.16b, v1.16b, v1.16b\n" // zero "fmax v5.8h, v5.8h, v1.8h\n" "fmax v7.8h, v7.8h, v1.8h\n" "fmax v9.8h, v9.8h, v1.8h\n" @@ -992,7 +980,7 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* 
inputScale "ldr q0, [%[min]]\n" "ldr q30, [%[max]]\n" "cbz %[conv_relu], 2f\n" - "eor v1.16b, v1.16b, v1.16b\n" //zero + "eor v1.16b, v1.16b, v1.16b\n" // zero "smax v5.4s, v5.4s, v1.4s\n" "smax v6.4s, v6.4s, v1.4s\n" "smax v7.4s, v7.4s, v1.4s\n" @@ -1091,7 +1079,7 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale "smax v30.4s, v13.4s, v30.4s\n" "smin v0.4s, v13.4s, v0.4s\n" "str q13, [%[out_buf], 128]\n" - + "smax v30.4s, v14.4s, v30.4s\n" "smin v0.4s, v14.4s, v0.4s\n" "str q14, [%[out_buf], 144]\n" @@ -1118,21 +1106,14 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale "str q0, [%[min]]\n" "5:\n" : - :[out_0]"r"(out_o0hw0), - [out_buf]"r"(out_buf), - [in_0]"r"(in_hw0), - [f_0]"r"(f_o0c0), - [ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_0), - [b_0_s]"r"(b_0_s), - [factor]"r"(factor_v), - [max]"r"(max_i32), - [min]"r"(min_i32), - [conv_relu]"r"(conv_relu_bool), - [out_f16]"r"(out_f16_bool), - [scale_known]"r"(scale_known_bool) - :"memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v29", "v30", "x0", "x1", "x2", "x3","x17","x16" - ); + : [out_0] "r"(out_o0hw0), [out_buf] "r"(out_buf), [in_0] "r"(in_hw0), + [f_0] "r"(f_o0c0), [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_0), + [b_0_s] "r"(b_0_s), [factor] "r"(factor_v), [max] "r"(max_i32), + [min] "r"(min_i32), [conv_relu] "r"(conv_relu_bool), + [out_f16] "r"(out_f16_bool), [scale_known] "r"(scale_known_bool) + : "memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v29", "v30", + "x0", "x1", "x2", "x3", "x17", "x16"); b0 += 8; b0_s += 8; } @@ -1144,26 +1125,28 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale I32 hw = ohow_s; F16 *b0 = biasArray; I32 *b0_s = biasScaled; - INT8 *in_pack = ((INT8*)tmp) + ic*ih_pad*iw_pad*8; + INT8 *in_pack = ((INT8 *)tmp) + ic * ih_pad * iw_pad * 8; // pack input // NCHWc8 => NHWChw4c4 + im2col U32 in_h[4]; U32 in_w[4]; - for (U32 i=0; i<4; i++) { - in_h[i] = ((hw+i)/ow)*strideH; - in_w[i] = ((hw+i)%ow)*strideW; + for (U32 i = 0; i < 4; i++) { + in_h[i] = ((hw + i) / ow) * strideH; + in_w[i] = ((hw + i) % ow) * strideW; } for (U32 c = 0; c < ic; c++) { for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - INT8 *in_hw4c8 = inArray_pad + c*ihiw*8 + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - INT8 *in_0 = in_hw4c8 + in_h[0]*iw_pad*8 + in_w[0]*8; - INT8 *in_1 = in_hw4c8 + in_h[1]*iw_pad*8 + in_w[1]*8; - INT8 *in_2 = in_hw4c8 + in_h[2]*iw_pad*8 + in_w[2]*8; - INT8 *in_3 = in_hw4c8 + in_h[3]*iw_pad*8 + in_w[3]*8; - INT8 *in_pack_0 = in_pack + c*fh*fw*4*8 + fh_idx*fw*4*4 + fw_idx*4*4; - INT8 *in_pack_1 = in_pack_0 + fh*fw*4*4; + INT8 *in_hw4c8 = inArray_pad + c * ihiw * 8 + + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + INT8 *in_0 = in_hw4c8 + in_h[0] * iw_pad * 8 + in_w[0] * 8; + INT8 *in_1 = in_hw4c8 + in_h[1] * iw_pad * 8 + in_w[1] * 8; + INT8 *in_2 = in_hw4c8 + in_h[2] * iw_pad * 8 + in_w[2] * 8; + INT8 *in_3 = in_hw4c8 + in_h[3] * iw_pad * 8 + in_w[3] * 8; + INT8 *in_pack_0 = + in_pack + c * fh * fw * 4 * 8 + fh_idx * fw * 4 * 4 + fw_idx * 4 * 4; + INT8 *in_pack_1 = in_pack_0 + fh * fw * 4 * 4; __asm__ __volatile__( "ldr d0, [%[in_0]]\n" @@ -1177,14 +1160,9 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale "str q20, [%[pack_0]]\n" "str q21, [%[pack_1]]\n" : 
- :[pack_0]"r"(in_pack_0), - [pack_1]"r"(in_pack_1), - [in_0]"r"(in_0), - [in_1]"r"(in_1), - [in_2]"r"(in_2), - [in_3]"r"(in_3) - :"memory", "cc", "v0", "v1", "v20", "v21", "x2", "x3" - ); + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), [in_0] "r"(in_0), + [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3) + : "memory", "cc", "v0", "v1", "v20", "v21", "x2", "x3"); } } } @@ -1192,21 +1170,21 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale // compute for (U32 o = 0; o < oc; o++) { INT8 *in_hw0 = in_pack; - INT8 *f_o0c0 = filterArray + o*8*fh*fw*ic*8; - I32 *out_buf = biasScaled + oc*8 + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; + INT8 *f_o0c0 = filterArray + o * 8 * fh * fw * ic * 8; + I32 *out_buf = biasScaled + oc * 8 + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; // bias F16 *b_0 = b0; I32 *b_0_s = b0_s; __asm__ __volatile__( "cbz %[out_f16], 8f\n" "eor v5.16b, v5.16b, v5.16b\n" - "ldr q1, [%[in_0]]\n" //in_0 + "ldr q1, [%[in_0]]\n" // in_0 "eor v6.16b, v6.16b, v6.16b\n" "eor v7.16b, v7.16b, v7.16b\n" "eor v8.16b, v8.16b, v8.16b\n" - "ldr q0, [%[f_0]]\n" //f_0 - + "ldr q0, [%[f_0]]\n" // f_0 + "eor v9.16b, v9.16b, v9.16b\n" "eor v10.16b, v10.16b, v10.16b\n" "eor v11.16b, v11.16b, v11.16b\n" @@ -1215,8 +1193,8 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale "8:\n" "ldp q29, q30, [%[b_0_s]]\n" - "ldr q1, [%[in_0]]\n" //in_0 - "ldr q0, [%[f_0]]\n" //f_0 + "ldr q1, [%[in_0]]\n" // in_0 + "ldr q0, [%[f_0]]\n" // f_0 "mov v5.16b, v29.16b\n" "mov v7.16b, v29.16b\n" "mov v9.16b, v29.16b\n" @@ -1229,19 +1207,19 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale "7:\n" - //give in address to x3 + // give in address to x3 "mov x3, %[in_0]\n" - //give f address to x0 + // give f address to x0 "mov x0, %[f_0]\n" - "mov x2, %[ic]\n" //ic_blk + "mov x2, %[ic]\n" // ic_blk "0:\n" "ldr q29, [x0, 16]\n" "ldr q3, [x3, 16]!\n" "sdot v5.4s, v0.16b, v1.4b[0]\n" "sdot v7.4s, v0.16b, v1.4b[1]\n" - + "sdot v9.4s, v0.16b, v1.4b[2]\n" "sdot v11.4s, v0.16b, v1.4b[3]\n" @@ -1292,7 +1270,7 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale "fadd v11.8h, v0.8h, v11.8h\n" "cbz %[conv_relu], 1f\n" - "eor v1.16b, v1.16b, v1.16b\n" //zero + "eor v1.16b, v1.16b, v1.16b\n" // zero "fmax v5.8h, v5.8h, v1.8h\n" "fmax v7.8h, v7.8h, v1.8h\n" "fmax v9.8h, v9.8h, v1.8h\n" @@ -1309,7 +1287,7 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale "ldr q0, [%[min]]\n" "ldr q30, [%[max]]\n" "cbz %[conv_relu], 2f\n" - "eor v1.16b, v1.16b, v1.16b\n" //zero + "eor v1.16b, v1.16b, v1.16b\n" // zero "smax v5.4s, v5.4s, v1.4s\n" "smax v6.4s, v6.4s, v1.4s\n" "smax v7.4s, v7.4s, v1.4s\n" @@ -1378,21 +1356,13 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale "str q0, [%[min]]\n" "5:\n" : - :[out_0]"r"(out_o0hw0), - [out_buf]"r"(out_buf), - [in_0]"r"(in_hw0), - [f_0]"r"(f_o0c0), - [ic]"r"((I64)ic*8*fh*fw), - [b_0]"r"(b_0), - [b_0_s]"r"(b_0_s), - [factor]"r"(factor_v), - [max]"r"(max_i32), - [min]"r"(min_i32), - [conv_relu]"r"(conv_relu_bool), - [out_f16]"r"(out_f16_bool), - [scale_known]"r"(scale_known_bool) - :"memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v29", "x0", "x1", "x2", "x3","x17","x16" - ); + : [out_0] "r"(out_o0hw0), [out_buf] "r"(out_buf), [in_0] "r"(in_hw0), + 
[f_0] "r"(f_o0c0), [ic] "r"((I64)ic * 8 * fh * fw), [b_0] "r"(b_0), + [b_0_s] "r"(b_0_s), [factor] "r"(factor_v), [max] "r"(max_i32), + [min] "r"(min_i32), [conv_relu] "r"(conv_relu_bool), + [out_f16] "r"(out_f16_bool), [scale_known] "r"(scale_known_bool) + : "memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v29", "x0", "x1", "x2", "x3", "x17", "x16"); b0 += 8; b0_s += 8; } @@ -1402,21 +1372,22 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale for (I32 hw = ohow_s; hw < ohow; hw++) { F16 *b0 = biasArray; I32 *b0_s = biasScaled; - INT8 *in_pack = ((INT8*)tmp) + ic*ih_pad*iw_pad*8; + INT8 *in_pack = ((INT8 *)tmp) + ic * ih_pad * iw_pad * 8; // pack input // NCHWc8 => NHWChw1c4 + im2col - U32 in_h_0 = (hw/ow)*strideH; - U32 in_w_0 = (hw%ow)*strideW; + U32 in_h_0 = (hw / ow) * strideH; + U32 in_w_0 = (hw % ow) * strideW; for (U32 c = 0; c < ic; c++) { for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { - INT8 *in_hw1c8 = inArray_pad + c*ihiw*8 + fh_idx*dilateH*iw_pad*8 + fw_idx*dilateW*8; - INT8 *in_0 = in_hw1c8 + in_h_0*iw_pad*8 + in_w_0*8; - INT8 *in_pack_0 = in_pack + c*fh*fw*8 + fh_idx*fw*4 + fw_idx*4; - INT8 *in_pack_1 = in_pack_0 + fh*fw*4; - - memcpy(in_pack_0, in_0, 4*bytesOf(DT_I8)); - memcpy(in_pack_1, in_0+4, 4*bytesOf(DT_I8)); + INT8 *in_hw1c8 = inArray_pad + c * ihiw * 8 + + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + INT8 *in_0 = in_hw1c8 + in_h_0 * iw_pad * 8 + in_w_0 * 8; + INT8 *in_pack_0 = in_pack + c * fh * fw * 8 + fh_idx * fw * 4 + fw_idx * 4; + INT8 *in_pack_1 = in_pack_0 + fh * fw * 4; + + memcpy(in_pack_0, in_0, 4 * bytesOf(DT_I8)); + memcpy(in_pack_1, in_0 + 4, 4 * bytesOf(DT_I8)); } } } @@ -1424,27 +1395,27 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale // compute for (U32 o = 0; o < oc; o++) { INT8 *in_hw = in_pack; - INT8 *f_o = filterArray + o*8*fh*fw*ic*8; - I32 *out_buf = biasScaled + oc*8 + n*oc*ohow*8 + o*ohow*8 + hw*8; - F16 *out_o0hw0 = outArray + n*oc*ohow*8 + o*ohow*8 + hw*8; - + INT8 *f_o = filterArray + o * 8 * fh * fw * ic * 8; + I32 *out_buf = biasScaled + oc * 8 + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + F16 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + int32x4_t res[2] = {0}; if (out_f16_bool == 0) { res[0] = vld1q_s32(b0_s); res[1] = vld1q_s32(b0_s + 4); } - for(U32 c=0; c 0) { factor = 127 * 16777216 / max; scale_o = 127.0 / max; @@ -1539,21 +1510,45 @@ EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale } *outputScale = (*inputScale) * (*filterScale) * scale_o; } - - U32 num_v = oc * ohow * 2; // Number of q-form vectors - I32 *out_buf = biasScaled + oc*8; - INT8 *out_q = (INT8*)output; + + U32 num_v = oc * ohow * 2; // Number of q-form vectors + I32 *out_buf = biasScaled + oc * 8; + INT8 *out_q = (INT8 *)output; ret = quantize_I32(num_v, out_buf, factor, scale_o, out_q); } return ret; } -template EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale, TensorDesc filterDesc, const void* filter, F16* filterScale, - ConvolutionDesc convDesc, TensorDesc biasDesc, const void* bias, U32 tmpBytes, void* tmp, TensorDesc outputDesc, - void* output, F16* outputScale, ActivationDesc activationDesc); - -template EE convolution_gemm_A76(TensorDesc inputDesc, const void* input, F16* inputScale, TensorDesc filterDesc, const void* filter, F16* filterScale, - ConvolutionDesc convDesc, TensorDesc biasDesc, const void* 
bias, U32 tmpBytes, void* tmp, TensorDesc outputDesc, - void* output, F16* outputScale, ActivationDesc activationDesc); +template EE convolution_gemm_A76(TensorDesc inputDesc, + const void *input, + F16 *inputScale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec activationDesc); + +template EE convolution_gemm_A76(TensorDesc inputDesc, + const void *input, + F16 *inputScale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec activationDesc); #endif diff --git a/tensor_computing/src/cpu/arm/int8/convolution_transform.cpp b/compute/tensor/src/cpu/arm/int8/convolution_transform.cpp similarity index 56% rename from tensor_computing/src/cpu/arm/int8/convolution_transform.cpp rename to compute/tensor/src/cpu/arm/int8/convolution_transform.cpp index 83e1ad2d..7bdeb659 100644 --- a/tensor_computing/src/cpu/arm/int8/convolution_transform.cpp +++ b/compute/tensor/src/cpu/arm/int8/convolution_transform.cpp @@ -1,57 +1,62 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
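For orientation, the DF_NCHWN8C4 relayout that convolution_transform_filter_kernel_int8 performs below can be read as the following scalar sketch (the standalone function name and types here are ours; it assumes fn is a multiple of 8 and fc a multiple of 4, which the callers guarantee by padding). Each innermost 32-byte group packs 8 output channels times 4 input channels for one filter position:

```cpp
#include <cstdint>

// Scalar sketch of the DF_NCHWN8C4 filter relayout below (illustrative name).
// Assumes fn % 8 == 0 and fc % 4 == 0. For each filter position hw, a 32-byte
// group holds [o8][c4]: 8 output channels, each with its 4 input channels
// contiguous, so one 16-byte register feeds sdot partial sums for 4 outputs.
static void nchw_to_nchwn8c4(const int8_t *src, int8_t *dst,
    uint32_t fn, uint32_t fc, uint32_t fh, uint32_t fw)
{
    const uint32_t fhfw = fh * fw;
    for (uint32_t o = 0; o < fn / 8; o++) {           // blocks of 8 output channels
        for (uint32_t c = 0; c < fc / 4; c++) {       // blocks of 4 input channels
            for (uint32_t hw = 0; hw < fhfw; hw++) {  // each filter position
                for (uint32_t o8 = 0; o8 < 8; o8++) {
                    for (uint32_t c4 = 0; c4 < 4; c4++) {
                        dst[o * fhfw * fc * 8 + c * fhfw * 32 + hw * 32 + o8 * 4 + c4] =
                            src[(o * 8 + o8) * fc * fhfw + (c * 4 + c4) * fhfw + hw];
                    }
                }
            }
        }
    }
}
```

The o8*4 + c4 ordering matches the sdot micro-kernels above: each 32-bit lane of a 16-byte filter register carries the 4 input channels of one output channel, so a single sdot accumulates 4 output channels at once and two registers cover the 8-channel block.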
- #ifdef _USE_INT8 #include "cpu/arm/int8/tensor_computing_int8.h" #include "cpu/arm/fp16/convolution_winograd_transform.h" #include -#include "type.h" +#include "types.h" #include "tensor_desc.h" #include "error.h" #include "tensor_computing.h" -inline EE convolution_transform_filter_kernel_int8(TensorDesc filterDesc, const void* filter, - TensorDesc *ftmDesc, void* ftm, +inline EE convolution_transform_filter_kernel_int8(TensorDesc filterDesc, + const void *filter, + TensorDesc *ftmDesc, + void *ftm, DataFormat ftmDataFormat) { - if (nullptr == filter || nullptr == ftmDesc || nullptr == ftm) + if (nullptr == filter || nullptr == ftmDesc || nullptr == ftm) { CHECK_STATUS(NULL_POINTER); + } DataType fdt; DataFormat fdf; U32 fn, fc, fh, fw; CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); if (fdf == ftmDataFormat) { *ftmDesc = filterDesc; - memcpy(ftm, filter, fn*fc*fh*fw*bytesOf(fdt)); + memcpy(ftm, filter, fn * fc * fh * fw * bytesOf(fdt)); return SUCCESS; } - if (fdf != DF_NCHW) + if (fdf != DF_NCHW) { CHECK_STATUS(NOT_SUPPORTED); + } EE ret = SUCCESS; switch (ftmDataFormat) { case DF_NCHWN8C4: { - INT8 *filterArray = (INT8*)filter; - INT8 *ftmArray = (INT8*)ftm; + INT8 *filterArray = (INT8 *)filter; + INT8 *ftmArray = (INT8 *)ftm; U32 oc = fn / 8; U32 fc_quad = fc / 4; for (U32 o = 0; o < oc; o++) { for (U32 c = 0; c < fc_quad; c++) { - for (U32 hw = 0; hw < fh*fw; hw++) { + for (U32 hw = 0; hw < fh * fw; hw++) { for (U32 o8 = 0; o8 < 8; o8++) { for (U32 c4 = 0; c4 < 4; c4++) { - ftmArray[o*fh*fw*fc*8 + c*fh*fw*32 + hw*32 + o8*4 + c4] = filterArray[(o*8+o8)*fc*fh*fw + (c*4+c4)*fh*fw + hw]; + ftmArray[o * fh * fw * fc * 8 + c * fh * fw * 32 + hw * 32 + + o8 * 4 + c4] = filterArray[(o * 8 + o8) * fc * fh * fw + + (c * 4 + c4) * fh * fw + hw]; } } } @@ -60,20 +65,20 @@ inline EE convolution_transform_filter_kernel_int8(TensorDesc filterDesc, const break; } case DF_HWNCN8C4: { - F16 *filterArray = (F16*)filter; - F16 *ftmArray = (F16*)ftm; - for (U32 o = 0; o < fn/8; o++) { - for (U32 c = 0; c < fc/4; c++) { + F16 *filterArray = (F16 *)filter; + F16 *ftmArray = (F16 *)ftm; + for (U32 o = 0; o < fn / 8; o++) { + for (U32 c = 0; c < fc / 4; c++) { // Each time deal with N2C4; 4 times we have N8C4 - U32 f_off_0 = (o*8)*fc*fh*fw + c*4*fh*fw; - U32 f_off_1 = (o*8+2)*fc*fh*fw + c*4*fh*fw; - U32 f_off_2 = (o*8+4)*fc*fh*fw + c*4*fh*fw; - U32 f_off_3 = (o*8+6)*fc*fh*fw + c*4*fh*fw; - - U32 ftm_off_0 = o*36*fc*8 + c*32; - U32 ftm_off_1 = o*36*fc*8 + c*32 + 8; - U32 ftm_off_2 = o*36*fc*8 + c*32 + 16; - U32 ftm_off_3 = o*36*fc*8 + c*32 + 24; + U32 f_off_0 = (o * 8) * fc * fh * fw + c * 4 * fh * fw; + U32 f_off_1 = (o * 8 + 2) * fc * fh * fw + c * 4 * fh * fw; + U32 f_off_2 = (o * 8 + 4) * fc * fh * fw + c * 4 * fh * fw; + U32 f_off_3 = (o * 8 + 6) * fc * fh * fw + c * 4 * fh * fw; + + U32 ftm_off_0 = o * 36 * fc * 8 + c * 32; + U32 ftm_off_1 = o * 36 * fc * 8 + c * 32 + 8; + U32 ftm_off_2 = o * 36 * fc * 8 + c * 32 + 16; + U32 ftm_off_3 = o * 36 * fc * 8 + c * 32 + 24; F16 F[9][8]; // N2C4 at a time F16 *F_ptr[9]; @@ -82,52 +87,56 @@ inline EE convolution_transform_filter_kernel_int8(TensorDesc filterDesc, const for (U32 hw = 0; hw < 9; hw++) { for (U32 oo = 0; oo < 2; oo++) { for (U32 cc = 0; cc < 4; cc++) { - F[hw][oo*4+cc] = filterArray[f_off_0 + hw + oo*fc*fh*fw + cc*fh*fw]; + F[hw][oo * 4 + cc] = + filterArray[f_off_0 + hw + oo * fc * fh * fw + cc * fh * fw]; } } F_ptr[hw] = F[hw]; } for (U32 hw = 0; hw < 36; hw++) { - Fw[hw] = ftmArray + ftm_off_0 + hw*fc*8; // Each hw fills 
N8*fc + Fw[hw] = ftmArray + ftm_off_0 + hw * fc * 8; // Each hw fills N8*fc } trans_W_4x4_3x3(Fw, F_ptr); for (U32 hw = 0; hw < 9; hw++) { for (U32 oo = 0; oo < 2; oo++) { for (U32 cc = 0; cc < 4; cc++) { - F[hw][oo*4+cc] = filterArray[f_off_1 + hw + oo*fc*fh*fw + cc*fh*fw]; + F[hw][oo * 4 + cc] = + filterArray[f_off_1 + hw + oo * fc * fh * fw + cc * fh * fw]; } } F_ptr[hw] = F[hw]; } for (U32 hw = 0; hw < 36; hw++) { - Fw[hw] = ftmArray + ftm_off_1 + hw*fc*8; // Each hw fills N8*fc + Fw[hw] = ftmArray + ftm_off_1 + hw * fc * 8; // Each hw fills N8*fc } trans_W_4x4_3x3(Fw, F_ptr); for (U32 hw = 0; hw < 9; hw++) { for (U32 oo = 0; oo < 2; oo++) { for (U32 cc = 0; cc < 4; cc++) { - F[hw][oo*4+cc] = filterArray[f_off_2 + hw + oo*fc*fh*fw + cc*fh*fw]; + F[hw][oo * 4 + cc] = + filterArray[f_off_2 + hw + oo * fc * fh * fw + cc * fh * fw]; } } F_ptr[hw] = F[hw]; } for (U32 hw = 0; hw < 36; hw++) { - Fw[hw] = ftmArray + ftm_off_2 + hw*fc*8; // Each hw fills N8*fc + Fw[hw] = ftmArray + ftm_off_2 + hw * fc * 8; // Each hw fills N8*fc } trans_W_4x4_3x3(Fw, F_ptr); for (U32 hw = 0; hw < 9; hw++) { for (U32 oo = 0; oo < 2; oo++) { for (U32 cc = 0; cc < 4; cc++) { - F[hw][oo*4+cc] = filterArray[f_off_3 + hw + oo*fc*fh*fw + cc*fh*fw]; + F[hw][oo * 4 + cc] = + filterArray[f_off_3 + hw + oo * fc * fh * fw + cc * fh * fw]; } } F_ptr[hw] = F[hw]; } for (U32 hw = 0; hw < 36; hw++) { - Fw[hw] = ftmArray + ftm_off_3 + hw*fc*8; // Each hw fills N8*fc + Fw[hw] = ftmArray + ftm_off_3 + hw * fc * 8; // Each hw fills N8*fc } trans_W_4x4_3x3(Fw, F_ptr); } @@ -145,10 +154,12 @@ inline EE convolution_transform_filter_kernel_int8(TensorDesc filterDesc, const return ret; } - -EE convolution_transform_filter_int8(TensorDesc filterDesc, const void* filter, +EE convolution_transform_filter_int8(TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, ConvolutionForwardAlgorithm algorithm, - TensorDesc *ftmDesc, void* filterTransformed) + TensorDesc *ftmDesc, + void *filterTransformed) { DataFormat ftmDataFormat; switch (algorithm) { @@ -161,7 +172,22 @@ EE convolution_transform_filter_int8(TensorDesc filterDesc, const void* filter, default: return NOT_MATCH; } - EE ret = convolution_transform_filter_kernel_int8(filterDesc, filter, ftmDesc, filterTransformed, ftmDataFormat); - return ret; + U32 channelAxis = filterDesc.nDims - 1; + TensorDesc tmpFilterDesc = filterDesc; + tmpFilterDesc.dims[channelAxis] /= convParamSpec.group; + U32 fnPadding = tmpFilterDesc.dims[channelAxis]; + if (fnPadding % 8 != 0) { + fnPadding = (fnPadding / 8 + 1) * 8; + } + U32 originalTileSize = tensorNumElements(tmpFilterDesc); + for (U32 g = 0; g < convParamSpec.group; g++) { + CHECK_STATUS(convolution_transform_filter_kernel_int8( + tmpFilterDesc, filter, ftmDesc, filterTransformed, ftmDataFormat)); + U32 newTileSize = tensorNumElements(*ftmDesc) / tmpFilterDesc.dims[channelAxis] * fnPadding; + filter = (const U8 *)filter + originalTileSize * bytesOf(filterDesc.dt); + filterTransformed = (U8 *)filterTransformed + newTileSize * bytesOf(ftmDesc->dt); + } + ftmDesc->dims[channelAxis] = filterDesc.dims[channelAxis]; + return SUCCESS; } #endif diff --git a/compute/tensor/src/cpu/arm/int8/convolution_winograd.h b/compute/tensor/src/cpu/arm/int8/convolution_winograd.h new file mode 100644 index 00000000..8e764994 --- /dev/null +++ b/compute/tensor/src/cpu/arm/int8/convolution_winograd.h @@ -0,0 +1,181 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_CONVOLUTION_WINOGRAD +#define _H_CONVOLUTION_WINOGRAD + +#ifdef _USE_INT8 +#include "sys.h" +#include "types.h" + +template +EE convolution_winograd_A55(TensorDesc inputDesc, + const void *input, + F16 *input_scale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec am); + +template +EE convolution_winograd_A76(TensorDesc inputDesc, + const void *input, + F16 *input_scale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec am); + +inline EE convolution_winograd(TensorDesc inputDesc, + const void *input, + F16 *input_scale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec am, + Arch arch) +{ + EE ret = SUCCESS; + switch (arch) { + case ARM_A55: + ret = convolution_winograd_A55(inputDesc, input, input_scale, filterDesc, filter, + filterScale, convParamSpec, biasDesc, bias, tmpBytes, tmp, outputDesc, output, + outputScale, am); + break; + case ARM_A76: + ret = convolution_winograd_A76(inputDesc, input, input_scale, filterDesc, filter, + filterScale, convParamSpec, biasDesc, bias, tmpBytes, tmp, outputDesc, output, + outputScale, am); + break; + default: + return NOT_SUPPORTED; + } + return ret; +} + +inline void apply_scale_f16(U32 numData, F16 *array, F16 scale, INT8 *qArray, bool clamp = true) +{ + for (U32 i = 0; i < numData; i++) { + F32 tmp = array[i] * scale; + qArray[i] = round_towards_zero(tmp, clamp); + } +} + +inline void quantize_wino_input(F16 *itmArray, U32 len_per_36, INT8 *inQ, F32 *inputScale) +{ + U32 numData = len_per_36; + F32 scale; + + for (U32 idx = 0; idx < 36; idx++) { + F16 *in = itmArray + idx * numData; + float16x8_t temp_v = vld1q_f16(in); + float16x8_t max_v = temp_v; + float16x8_t min_v = temp_v; + + for (U32 i = 8; i < numData; i += 8) { + temp_v = vld1q_f16(in + i); + max_v = vmaxq_f16(max_v, temp_v); + min_v = vminq_f16(min_v, temp_v); + } + + F16 max = vmaxvq_f16(max_v); + F16 min = 
vminvq_f16(min_v); + + if (max == 0 && min == 0) { + inputScale[idx] = 0.0; // We can skip this dotprod later + continue; + } + if (max > 0 && min < 0) { + F32 scale_max = 127.0 / max; + F32 scale_min = -127.0 / min; + scale = (scale_max < scale_min) ? scale_max : scale_min; + } else if (max < 0) { + scale = -127.0 / min; + } else { // min > 0 + scale = 127.0 / max; + } + + INT8 *base = inQ + idx * numData; + apply_scale_f16(numData, in, scale, base); + inputScale[idx] = scale; + } +} + +inline void quantize_wino_input_s16( + short *itmArray, U32 len_per_36, INT8 *inQ, F32 *inputScale, F16 input_scale) +{ + U32 numData = len_per_36; + F32 scale; + + for (U32 idx = 0; idx < 36; idx++) { + short *in = itmArray + idx * numData; + int16x8_t temp_v = vld1q_s16(in); + int16x8_t max_v = temp_v; + int16x8_t min_v = temp_v; + + for (U32 i = 8; i < numData; i += 8) { + temp_v = vld1q_s16(in + i); + max_v = vmaxq_s16(max_v, temp_v); + min_v = vminq_s16(min_v, temp_v); + } + + short max = vmaxvq_s16(max_v); + short min = vminvq_s16(min_v); + + if (max == 0 && min == 0) { + inputScale[idx] = 0.0; // We can skip this dotprod later + continue; + } + if (max > 0 && min < 0) { + F32 scaleMax = 127.0 / max; + F32 scaleMin = -127.0 / min; + scale = (scaleMax < scaleMin) ? scaleMax : scaleMin; + } else if (max < 0) { + scale = -127.0 / min; + } else { // min > 0 + scale = 127.0 / max; + } + + INT8 *base = inQ + idx * numData; + for (U32 i = 0; i < numData; i++) { + base[i] = round_towards_zero(scale * in[i], false); + } + inputScale[idx] = input_scale * scale; + } +} +#endif +#endif diff --git a/compute/tensor/src/cpu/arm/int8/convolution_winograd_A55.cpp b/compute/tensor/src/cpu/arm/int8/convolution_winograd_A55.cpp new file mode 100644 index 00000000..5f577370 --- /dev/null +++ b/compute/tensor/src/cpu/arm/int8/convolution_winograd_A55.cpp @@ -0,0 +1,1487 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
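Before the A55 kernel body, it is worth restating the per-index scale selection that quantize_wino_input and quantize_wino_input_s16 above both apply. A minimal scalar sketch (function names are ours, not part of the library; round-toward-zero stands in for the kernels' round_towards_zero helper):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Scalar sketch of the symmetric scale choice used per Winograd transform
// index: map the observed [min, max] range into [-127, 127] without overflow.
// An all-zero block reports scale 0 so the caller can skip its dot products.
static float choose_symmetric_scale(float max, float min)
{
    if (max == 0 && min == 0) {
        return 0.0f;  // nothing to quantize
    }
    if (max > 0 && min < 0) {
        return std::min(127.0f / max, -127.0f / min);  // tighter bound wins
    }
    if (max < 0) {
        return -127.0f / min;  // all-negative block
    }
    return 127.0f / max;  // all-positive block (min > 0)
}

static int8_t quantize_value(float v, float scale)
{
    // round toward zero with clamping, mirroring round_towards_zero
    return (int8_t)std::max(-127.0f, std::min(127.0f, std::trunc(v * scale)));
}
```

Returning 0 for an all-zero block is what lets the compute loops below skip entire dot products whenever factor_v[idx][0] == 0.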
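The A55 implementation that follows tiles the output plane for F(4x4, 3x3): each 4x4 output block consumes a 6x6 input patch, and the right/bottom padding is extended so the last tiles stay inside the padded plane. A small worked example with made-up sizes:

```cpp
#include <cstdio>

// Worked example of the tile geometry set up at the top of the kernel below;
// the concrete sizes (14x14 output, padding 1) are ours, for illustration.
int main()
{
    unsigned oh = 14, ow = 14;       // example output size (hypothetical)
    unsigned padding = 1;            // conv padding on each side
    unsigned tile_h = (oh + 3) / 4;  // ceil(14 / 4) = 4 tile rows
    unsigned tile_w = (ow + 3) / 4;  // 4 tile columns
    unsigned pad_bottom = padding + (tile_h * 4 - oh);  // 1 + (16 - 14) = 3
    unsigned pad_right = padding + (tile_w * 4 - ow);   // 1 + (16 - 14) = 3
    printf("tiles: %u x %u, pad_bottom %u, pad_right %u\n",
        tile_h, tile_w, pad_bottom, pad_right);
    return 0;
}
```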
+
+#ifdef _USE_INT8
+#include "cpu/arm/int8/convolution_winograd_transform.h"
+#include "cpu/arm/int8/convolution_winograd.h"
+
+template <typename OT>
+EE convolution_winograd_A55(TensorDesc inputDesc,
+    const void *input,
+    F16 *input_scale,
+    TensorDesc filterDesc,
+    const void *filter,
+    F16 *filterScale,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc biasDesc,
+    const void *bias,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    F16 *outputScale,
+    ActivationParamSpec am)
+{
+    UNUSED(biasDesc);
+    UNUSED(tmpBytes);
+
+    // not truly one-step. Compute hw12*(6*6)*ic at one time.
+    DataType idt, fdt, odt;
+    DataFormat idf, fdf, odf;
+    U32 in, ic, ih, iw;
+    U32 fn, fc, fh, fw;
+    U32 on, oc, oh, ow;
+    CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
+    CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw));
+    CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow));
+    U32 paddingT = convParamSpec.padding_top;
+    U32 paddingB = convParamSpec.padding_bottom;
+    U32 paddingL = convParamSpec.padding_left;
+    U32 paddingR = convParamSpec.padding_right;
+
+    if (fdf != DF_HWNCN8C4) {
+        return NOT_MATCH;
+    }
+    if (!(fh == 6 && fw == 6)) {
+        return NOT_MATCH;
+    }
+
+    // Assume IT is the same as OT
+    OT *inArray = (OT *)input;
+    INT8 *filterArray = (INT8 *)filter;
+    F16 *outArray = (F16 *)output;
+    F16 *biasArray = (F16 *)bias;
+
+    // both input and output are stored with C8
+    oc /= 8;
+    ic /= 8;
+
+    U32 tile_h = (oh + 3) / 4;
+    U32 tile_w = (ow + 3) / 4;
+    I32 tiles = tile_h * tile_w;  // num of 6x6 tiles
+    U32 pad_left = paddingL;
+    U32 pad_right = paddingR + (tile_w * 4 - ow);
+    U32 pad_w_mod_4 = tile_w * 4 - ow;
+    U32 pad_top = paddingT;
+    U32 pad_bottom = paddingB + (tile_h * 4 - oh);
+    U32 pad_h_mod_4 = tile_h * 4 - oh;
+    U32 ih_pad = ih + pad_top + pad_bottom;
+    U32 iw_pad = iw + pad_left + pad_right;
+
+    U32 ohow = oh * ow;
+    U32 ihiw = ih_pad * iw_pad;
+
+    // tmp = in_pad + itm + otm + inQ + ...
+ // in_pad: ic*ih_pad*iw_pad*8 + // itm: 6*6*ic*12*8 (int16 or fp16) + // otm: 6*6*12*8 (F16) + // inQ: 6*6*ic*12*8 (int8) + OT *inArray_pad = (OT *)tmp; + short *itmArray = (short *)(inArray_pad + ic * ihiw * 8); // will be cast to fp16 for fp16 inputs + F16 *otmArray = (F16 *)(itmArray + 6 * 6 * ic * 12 * 8); + INT8 *inQ = (INT8 *)(otmArray + 6 * 6 * 12 * 8); + if (DT_I8 == odt) { + outArray = (F16 *)(inQ + 6 * 6 * ic * 12 * 8); // After otmArray and pack + } + + // To track the range of the final outputs and prepare for quantization + F16 max[8] = {0}; + F16 min[8] = {0}; + + for (U32 n = 0; n < in; n++) { // for each batch + OT *inArray_pad_mov = inArray_pad; + OT *inArray_mov = inArray + n * ic * ih * iw * 8; + for (U32 c = 0; c < ic; c++) { + memset(inArray_pad_mov, 0, pad_top * iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += pad_top * iw_pad * 8; + for (U32 h = pad_top; h < ih_pad - pad_bottom; h++) { + memset(inArray_pad_mov, 0, pad_left * 8 * bytesOf(idt)); + inArray_pad_mov += pad_left * 8; + memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); + inArray_pad_mov += iw * 8; + inArray_mov += iw * 8; + memset(inArray_pad_mov, 0, pad_right * 8 * bytesOf(idt)); + inArray_pad_mov += pad_right * 8; + } + memset(inArray_pad_mov, 0, pad_bottom * iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += pad_bottom * iw_pad * 8; + } + + // tiles / 12 + for (I32 hw = 0; hw < tiles - 11; hw += 12) { + // in trans + // NCHWc8 => (6*6)*(C/4)*hw12*c4 + // transform hw1c8 at a time, so we need 12 times to cover hw12c8 + // pack into hw12c4 after quantizing (reuse the space of itmArray) + for (U32 c = 0; c < ic; c++) { + OT *inArray_pad_mov = inArray_pad + c * ihiw * 8; + short *Iw_ptr[36]; + short *Iw0[36]; + OT *I0[36]; + short *Iw1[36]; + OT *I1[36]; + short *Iw2[36]; + OT *I2[36]; + short *Iw3[36]; + OT *I3[36]; + short *Iw4[36]; + OT *I4[36]; + short *Iw5[36]; + OT *I5[36]; + short *Iw6[36]; + OT *I6[36]; + short *Iw7[36]; + OT *I7[36]; + short *Iw8[36]; + OT *I8[36]; + short *Iw9[36]; + OT *I9[36]; + short *Iw10[36]; + OT *I10[36]; + short *Iw11[36]; + OT *I11[36]; + + // Store transformed hw12c8 to itmArray + for (U32 i = 0; i < 36; i++) { + Iw0[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12; + Iw1[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 1 * 8; + Iw2[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 2 * 8; + Iw3[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 3 * 8; + Iw4[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 4 * 8; + Iw5[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 5 * 8; + Iw6[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 6 * 8; + Iw7[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 7 * 8; + Iw8[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 8 * 8; + Iw9[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 9 * 8; + Iw10[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 10 * 8; + Iw11[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 11 * 8; + } + + U32 h0 = (hw / tile_w) * 4; // stride is 4 + U32 w0 = (hw % tile_w) * 4; + U32 h1 = ((hw + 1) / tile_w) * 4; + U32 w1 = ((hw + 1) % tile_w) * 4; + U32 h2 = ((hw + 2) / tile_w) * 4; + U32 w2 = ((hw + 2) % tile_w) * 4; + U32 h3 = ((hw + 3) / tile_w) * 4; + U32 w3 = ((hw + 3) % tile_w) * 4; + U32 h4 = ((hw + 4) / tile_w) * 4; + U32 w4 = ((hw + 4) % tile_w) * 4; + U32 h5 = ((hw + 5) / tile_w) * 4; + U32 w5 = ((hw + 5) % tile_w) * 4; + U32 h6 = ((hw + 6) / tile_w) * 4; + U32 w6 = ((hw + 6) % tile_w) * 4; + U32 h7 = ((hw + 7) / tile_w) * 4; + U32 w7 = ((hw + 7) % tile_w) * 4; + U32 h8 = ((hw + 8) / tile_w) * 4; + U32 w8 = ((hw + 8) % tile_w) * 4; + 
U32 h9 = ((hw + 9) / tile_w) * 4; + U32 w9 = ((hw + 9) % tile_w) * 4; + U32 h10 = ((hw + 10) / tile_w) * 4; + U32 w10 = ((hw + 10) % tile_w) * 4; + U32 h11 = ((hw + 11) / tile_w) * 4; + U32 w11 = ((hw + 11) % tile_w) * 4; + + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + I0[i * 6 + j] = inArray_pad_mov + (h0 + i) * iw_pad * 8 + (w0 + j) * 8; + I1[i * 6 + j] = inArray_pad_mov + (h1 + i) * iw_pad * 8 + (w1 + j) * 8; + I2[i * 6 + j] = inArray_pad_mov + (h2 + i) * iw_pad * 8 + (w2 + j) * 8; + I3[i * 6 + j] = inArray_pad_mov + (h3 + i) * iw_pad * 8 + (w3 + j) * 8; + I4[i * 6 + j] = inArray_pad_mov + (h4 + i) * iw_pad * 8 + (w4 + j) * 8; + I5[i * 6 + j] = inArray_pad_mov + (h5 + i) * iw_pad * 8 + (w5 + j) * 8; + I6[i * 6 + j] = inArray_pad_mov + (h6 + i) * iw_pad * 8 + (w6 + j) * 8; + I7[i * 6 + j] = inArray_pad_mov + (h7 + i) * iw_pad * 8 + (w7 + j) * 8; + I8[i * 6 + j] = inArray_pad_mov + (h8 + i) * iw_pad * 8 + (w8 + j) * 8; + I9[i * 6 + j] = inArray_pad_mov + (h9 + i) * iw_pad * 8 + (w9 + j) * 8; + I10[i * 6 + j] = inArray_pad_mov + (h10 + i) * iw_pad * 8 + (w10 + j) * 8; + I11[i * 6 + j] = inArray_pad_mov + (h11 + i) * iw_pad * 8 + (w11 + j) * 8; + } + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw0[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I0); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I0); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw1[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I1); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I1); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw2[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I2); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I2); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw3[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I3); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I3); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw4[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I4); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I4); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw5[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I5); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I5); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw6[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I6); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I6); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw7[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I7); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I7); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw8[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I8); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I8); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw9[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I9); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I9); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw10[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I10); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I10); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw11[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I11); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I11); + } + } + + F32 inputScale[36]; + + if (DT_I8 == idt) { + 
quantize_wino_input_s16(itmArray, 12 * ic * 8, inQ, inputScale, *input_scale); + } else { + quantize_wino_input((F16 *)itmArray, 12 * ic * 8, inQ, inputScale); + } + + F32 factor_v[36][4]; + for (U32 i = 0; i < 36; i++) { + if (inputScale[i] == 0) { + factor_v[i][0] = 0; + continue; + } else { + factor_v[i][0] = 1.0 / inputScale[i] / (F32)filterScale[i]; + } + + factor_v[i][1] = factor_v[i][0]; + factor_v[i][2] = factor_v[i][0]; + factor_v[i][3] = factor_v[i][0]; + } + + INT8 *in_pack = (INT8 *)itmArray; // Reuse the space + + for (U32 idx = 0; idx < 36; idx++) { + if (factor_v[idx][0] == 0) { // input pixels are all 0 + continue; + } + for (U32 c = 0; c < ic; c++) { // for each 8 channels + INT8 *in_hw12c8 = inQ + idx * 12 * ic * 8 + c * 12 * 8; + + INT8 *in_0 = in_hw12c8; + INT8 *in_1 = in_hw12c8 + 1 * 8; + INT8 *in_2 = in_hw12c8 + 2 * 8; + INT8 *in_3 = in_hw12c8 + 3 * 8; + INT8 *in_4 = in_hw12c8 + 4 * 8; + INT8 *in_5 = in_hw12c8 + 5 * 8; + INT8 *in_6 = in_hw12c8 + 6 * 8; + INT8 *in_7 = in_hw12c8 + 7 * 8; + INT8 *in_8 = in_hw12c8 + 8 * 8; + INT8 *in_9 = in_hw12c8 + 9 * 8; + INT8 *in_10 = in_hw12c8 + 10 * 8; + INT8 *in_11 = in_hw12c8 + 11 * 8; + + // NHWChw12c4 + INT8 *in_pack_0 = in_pack + idx * 12 * ic * 8 + c * 12 * 8; + INT8 *in_pack_1 = in_pack_0 + 12 * 4; + + __asm__ __volatile__( + "ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + "ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "ldr d4, [%[in_4]]\n" + "ldr x6, [%[in_6]]\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + + "ldr d5, [%[in_5]]\n" + "ldr x7, [%[in_7]]\n" + "ins v4.d[1], x6\n" + "ins v5.d[1], x7\n" + "ldr d8, [%[in_8]]\n" + "ldr x10, [%[in_10]]\n" + "trn1 v24.4s, v4.4s, v5.4s\n" + "trn2 v25.4s, v4.4s, v5.4s\n" + "ldr d9, [%[in_9]]\n" + "ldr x11, [%[in_11]]\n" + "ins v8.d[1], x10\n" + "ins v9.d[1], x11\n" + + "str q20, [%[pack_0]]\n" + "trn1 v28.4s, v8.4s, v9.4s\n" + "trn2 v29.4s, v8.4s, v9.4s\n" + "str q24, [%[pack_0], #16]\n" + "str q28, [%[pack_0], #32]\n" + "str q21, [%[pack_1]]\n" + "str q25, [%[pack_1], #16]\n" + "str q29, [%[pack_1], #32]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), [in_0] "r"(in_0), + [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3), [in_4] "r"(in_4), + [in_5] "r"(in_5), [in_6] "r"(in_6), [in_7] "r"(in_7), [in_8] "r"(in_8), + [in_9] "r"(in_9), [in_10] "r"(in_10), [in_11] "r"(in_11) + : "memory", "cc", "v0", "v1", "v4", "v5", "v8", "v9", "v20", "v21", "v24", + "v25", "v28", "v29", "x2", "x3", "x6", "x7", "x10", "x11"); + } + } + + // compute + for (U32 o = 0; o < oc; o++) { // 8 output channels at a time + // bias + F16 *b_0 = biasArray + o * 8; + for (U32 idx = 0; idx < 36; idx++) { + INT8 *in_hw0 = in_pack + idx * 12 * ic * 8; + INT8 *f_o0c0 = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; + F16 *out_o0hw0 = otmArray + idx * 12 * 8; + if (factor_v[idx][0] == 0) { // input pixels are all 0 + memset(out_o0hw0, 0, 12 * 8 * sizeof(OT)); + continue; + } + F32 *fac = factor_v[idx]; + __asm__ __volatile__("eor v5.16b, v5.16b, v5.16b\n" + "ldr d1, [%[in_0]]\n" // in_0 + "eor v6.16b, v6.16b, v6.16b\n" + "ldr x1, [%[in_0], #8]\n" + "eor v7.16b, v7.16b, v7.16b\n" + "ins v1.d[1], x1\n" + "eor v8.16b, v8.16b, v8.16b\n" + "ldr d0, [%[f_0]]\n" // f_0 + "eor v9.16b, v9.16b, v9.16b\n" + "ldr x2, [%[f_0], #8]\n" + "eor v10.16b, v10.16b, v10.16b\n" + "ins v0.d[1], x2\n" + "eor v11.16b, v11.16b, v11.16b\n" + "ldr d3, [%[in_0], #16]\n" // in_1 + "eor v12.16b, v12.16b, v12.16b\n" + "ldr x3, [%[in_0], #24]\n" + "eor v13.16b, 
v13.16b, v13.16b\n" + "ins v3.d[1], x3\n" + "eor v14.16b, v14.16b, v14.16b\n" + "eor v15.16b, v15.16b, v15.16b\n" + "eor v16.16b, v16.16b, v16.16b\n" + + "eor v17.16b, v17.16b, v17.16b\n" + "eor v18.16b, v18.16b, v18.16b\n" + "eor v19.16b, v19.16b, v19.16b\n" + "eor v20.16b, v20.16b, v20.16b\n" + "eor v21.16b, v21.16b, v21.16b\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + "eor v25.16b, v25.16b, v25.16b\n" + "eor v26.16b, v26.16b, v26.16b\n" + "eor v27.16b, v27.16b, v27.16b\n" + "eor v28.16b, v28.16b, v28.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + "0:\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr d2, [x3, 32]\n" + "ldr x16, [x3, 40]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "ldr d29, [x0, 16]\n" + "ldr x17, [x0, 24]\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "ins v2.d[1], x16\n" + "ldr d30, [x3, 48]!\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + "ins v29.d[1], x17\n" + + "sdot v13.4s, v0.16b, v3.4b[0]\n" + "ldr x16, [x3, 8]\n" + "subs x2, x2, #4\n" + "sdot v15.4s, v0.16b, v3.4b[1]\n" + "sdot v17.4s, v0.16b, v3.4b[2]\n" + "ins v30.d[1], x16\n" + "sdot v19.4s, v0.16b, v3.4b[3]\n" + + "sdot v21.4s, v0.16b, v2.4b[0]\n" + "sdot v23.4s, v0.16b, v2.4b[1]\n" + "sdot v25.4s, v0.16b, v2.4b[2]\n" + "sdot v27.4s, v0.16b, v2.4b[3]\n" + + "sdot v14.4s, v29.16b, v3.4b[0]\n" + "sdot v16.4s, v29.16b, v3.4b[1]\n" + "ldr d0, [x0, 32]!\n" + "ldr x17, [x0, 8]\n" + "sdot v18.4s, v29.16b, v3.4b[2]\n" + "sdot v20.4s, v29.16b, v3.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + "ldr d3, [x3, 16]\n" + "ldr x16, [x3, 24]\n" + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + + "ins v0.d[1], x17\n" + "ins v3.d[1], x16\n" + + "sdot v22.4s, v29.16b, v2.4b[0]\n" + "mov v1.16b, v30.16b\n" + "sdot v24.4s, v29.16b, v2.4b[1]\n" + "sdot v26.4s, v29.16b, v2.4b[2]\n" + "sdot v28.4s, v29.16b, v2.4b[3]\n" + + "bne 0b\n" + "scvtf v5.4s, v5.4s\n" + "scvtf v6.4s, v6.4s\n" + "ldr d1, [%[factor]]\n" + "ldr x1, [%[factor], #8]\n" + "scvtf v7.4s, v7.4s\n" + "scvtf v8.4s, v8.4s\n" + "ins v1.d[1], x1\n" + "scvtf v9.4s, v9.4s\n" + "scvtf v10.4s, v10.4s\n" + "scvtf v11.4s, v11.4s\n" + "scvtf v12.4s, v12.4s\n" + "scvtf v13.4s, v13.4s\n" + "scvtf v14.4s, v14.4s\n" + "scvtf v15.4s, v15.4s\n" + "scvtf v16.4s, v16.4s\n" + "scvtf v17.4s, v17.4s\n" + "scvtf v18.4s, v18.4s\n" + "scvtf v19.4s, v19.4s\n" + "scvtf v20.4s, v20.4s\n" + "scvtf v21.4s, v21.4s\n" + "scvtf v22.4s, v22.4s\n" + "scvtf v23.4s, v23.4s\n" + "scvtf v24.4s, v24.4s\n" + "scvtf v25.4s, v25.4s\n" + "scvtf v26.4s, v26.4s\n" + "scvtf v27.4s, v27.4s\n" + "scvtf v28.4s, v28.4s\n" + + "fmul v5.4s, v1.4s, v5.4s\n" + "fmul v6.4s, v1.4s, v6.4s\n" + "fmul v7.4s, v1.4s, v7.4s\n" + "fmul v8.4s, v1.4s, v8.4s\n" + "fmul v9.4s, v1.4s, v9.4s\n" + "fmul v10.4s, v1.4s, v10.4s\n" + "fmul v11.4s, v1.4s, v11.4s\n" + "fmul v12.4s, v1.4s, v12.4s\n" + "fmul v13.4s, v1.4s, v13.4s\n" + "fmul v14.4s, v1.4s, v14.4s\n" + "fmul v15.4s, v1.4s, v15.4s\n" + "fmul v16.4s, v1.4s, v16.4s\n" + "fmul v17.4s, v1.4s, v17.4s\n" + "fmul v18.4s, v1.4s, v18.4s\n" + "fmul v19.4s, v1.4s, v19.4s\n" + "fmul v20.4s, v1.4s, v20.4s\n" + "fmul v21.4s, v1.4s, v21.4s\n" + "fmul v22.4s, v1.4s, v22.4s\n" + "fmul v23.4s, v1.4s, v23.4s\n" + "fmul v24.4s, v1.4s, v24.4s\n" + "fmul v25.4s, v1.4s, v25.4s\n" + "fmul v26.4s, v1.4s, v26.4s\n" + "fmul v27.4s, v1.4s, v27.4s\n" + "fmul v28.4s, v1.4s, v28.4s\n" + + "fcvtn v5.4h, v5.4s\n" + 
"fcvtn v7.4h, v7.4s\n" + "fcvtn v9.4h, v9.4s\n" + "fcvtn v11.4h, v11.4s\n" + "fcvtn v13.4h, v13.4s\n" + "fcvtn v15.4h, v15.4s\n" + "fcvtn v17.4h, v17.4s\n" + "fcvtn v19.4h, v19.4s\n" + "fcvtn v21.4h, v21.4s\n" + "fcvtn v23.4h, v23.4s\n" + "fcvtn v25.4h, v25.4s\n" + "fcvtn v27.4h, v27.4s\n" + + "fcvtn2 v5.8h, v6.4s\n" + "fcvtn2 v7.8h, v8.4s\n" + "fcvtn2 v9.8h, v10.4s\n" + "fcvtn2 v11.8h, v12.4s\n" + "fcvtn2 v13.8h, v14.4s\n" + "fcvtn2 v15.8h, v16.4s\n" + "fcvtn2 v17.8h, v18.4s\n" + "fcvtn2 v19.8h, v20.4s\n" + "fcvtn2 v21.8h, v22.4s\n" + "fcvtn2 v23.8h, v24.4s\n" + "fcvtn2 v25.8h, v26.4s\n" + "fcvtn2 v27.8h, v28.4s\n" + + "str q5, [%[out_0]]\n" + "str q7, [%[out_0], #16]\n" + "str q9, [%[out_0], #32]\n" + "str q11, [%[out_0], #48]\n" + "str q13, [%[out_0], #64]\n" + "str q15, [%[out_0], #80]\n" + "str q17, [%[out_0], #96]\n" + "str q19, [%[out_0], #112]\n" + "str q21, [%[out_0], #128]\n" + "str q23, [%[out_0], #144]\n" + "str q25, [%[out_0], #160]\n" + "str q27, [%[out_0], #176]\n" + : + : [out_0] "r"(out_o0hw0), [in_0] "r"(in_hw0), + [f_0] "r"(f_o0c0), [ic] "r"((I64)ic * 8), [factor] "r"(fac) + : "memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", + "x1", "x2", "x3", "x17", "x16"); + } + // out trans + // (6*6)*hw12*o8 => NOHWo8 + for (U32 hw12 = 0; hw12 < 12; hw12++) { + U32 h = (hw + hw12) / tile_w; + U32 w = (hw + hw12) % tile_w; + F16 *out_0 = + outArray + n * oc * ohow * 8 + o * ohow * 8 + h * 4 * ow * 8 + w * 4 * 8; + + F16 *Ow_0[36]; + F16 *O_0[16]; + + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + idx * 12 * 8 + hw12 * 8; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + } + } + trans_O(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h - 1, tile_w - 1, + max, min, am); + } + } + } + + // tiles_reminder % 12 / 8 + I32 tiles_s = (tiles / 12) * 12; + I32 tiles_tail = tiles - tiles_s; + + if (tiles_tail >= 8) { + I32 hw = tiles_s; + // in trans + // NCHWc8 => (6*6)*(C/4)*hw8*c4 + // transform hw1c8 at a time, so we need 8 times to cover hw8c8 + // pack into hw8c4 after quantizing (reuse the space of itmArray) + for (U32 c = 0; c < ic; c++) { + OT *inArray_pad_mov = inArray_pad + c * ihiw * 8; + short *Iw_ptr[36]; + short *Iw0[36]; + OT *I0[36]; + short *Iw1[36]; + OT *I1[36]; + short *Iw2[36]; + OT *I2[36]; + short *Iw3[36]; + OT *I3[36]; + short *Iw4[36]; + OT *I4[36]; + short *Iw5[36]; + OT *I5[36]; + short *Iw6[36]; + OT *I6[36]; + short *Iw7[36]; + OT *I7[36]; + + // Store transformed hw12c8 to itmArray + for (U32 i = 0; i < 36; i++) { + Iw0[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8; + Iw1[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8 + 1 * 8; + Iw2[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8 + 2 * 8; + Iw3[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8 + 3 * 8; + Iw4[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8 + 4 * 8; + Iw5[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8 + 5 * 8; + Iw6[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8 + 6 * 8; + Iw7[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8 + 7 * 8; + } + + U32 h0 = (hw / tile_w) * 4; // stride is 4 + U32 w0 = (hw % tile_w) * 4; + U32 h1 = ((hw + 1) / tile_w) * 4; + U32 w1 = ((hw + 1) % tile_w) * 4; + U32 h2 = ((hw + 2) / tile_w) * 4; + U32 w2 = ((hw + 2) % tile_w) * 4; + U32 h3 = ((hw + 3) / tile_w) * 4; + U32 w3 = ((hw + 3) % tile_w) * 4; + U32 h4 = ((hw + 4) / tile_w) * 4; + U32 w4 = 
((hw + 4) % tile_w) * 4; + U32 h5 = ((hw + 5) / tile_w) * 4; + U32 w5 = ((hw + 5) % tile_w) * 4; + U32 h6 = ((hw + 6) / tile_w) * 4; + U32 w6 = ((hw + 6) % tile_w) * 4; + U32 h7 = ((hw + 7) / tile_w) * 4; + U32 w7 = ((hw + 7) % tile_w) * 4; + + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + I0[i * 6 + j] = inArray_pad_mov + (h0 + i) * iw_pad * 8 + (w0 + j) * 8; + I1[i * 6 + j] = inArray_pad_mov + (h1 + i) * iw_pad * 8 + (w1 + j) * 8; + I2[i * 6 + j] = inArray_pad_mov + (h2 + i) * iw_pad * 8 + (w2 + j) * 8; + I3[i * 6 + j] = inArray_pad_mov + (h3 + i) * iw_pad * 8 + (w3 + j) * 8; + I4[i * 6 + j] = inArray_pad_mov + (h4 + i) * iw_pad * 8 + (w4 + j) * 8; + I5[i * 6 + j] = inArray_pad_mov + (h5 + i) * iw_pad * 8 + (w5 + j) * 8; + I6[i * 6 + j] = inArray_pad_mov + (h6 + i) * iw_pad * 8 + (w6 + j) * 8; + I7[i * 6 + j] = inArray_pad_mov + (h7 + i) * iw_pad * 8 + (w7 + j) * 8; + } + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw0[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I0); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I0); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw1[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I1); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I1); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw2[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I2); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I2); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw3[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I3); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I3); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw4[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I4); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I4); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw5[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I5); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I5); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw6[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I6); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I6); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw7[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I7); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I7); + } + } + + F32 inputScale[36]; + + if (idt == DT_I8) { + quantize_wino_input_s16(itmArray, 8 * ic * 8, inQ, inputScale, *input_scale); + } else { + quantize_wino_input((F16 *)itmArray, 8 * ic * 8, inQ, inputScale); + } + + F32 factor_v[36][4]; + for (U32 i = 0; i < 36; i++) { + if (inputScale[i] == 0) { + factor_v[i][0] = 0; + continue; + } else { + factor_v[i][0] = 1.0 / inputScale[i] / (F32)filterScale[i]; + } + factor_v[i][1] = factor_v[i][0]; + factor_v[i][2] = factor_v[i][0]; + factor_v[i][3] = factor_v[i][0]; + } + + INT8 *in_pack = (INT8 *)itmArray; // Reuse the space + + for (U32 idx = 0; idx < 36; idx++) { + if (factor_v[idx][0] == 0) { // input pixels are all 0 + continue; + } + for (U32 c = 0; c < ic; c++) { // for each 8 channels + INT8 *in_hw8c8 = inQ + idx * 8 * ic * 8 + c * 8 * 8; + + INT8 *in_0 = in_hw8c8; + INT8 *in_1 = in_hw8c8 + 1 * 8; + INT8 *in_2 = in_hw8c8 + 2 * 8; + INT8 *in_3 = in_hw8c8 + 3 * 8; + INT8 *in_4 = in_hw8c8 + 4 * 8; + INT8 *in_5 = in_hw8c8 + 5 * 8; + INT8 *in_6 = in_hw8c8 + 6 * 8; + INT8 *in_7 = in_hw8c8 + 7 * 8; + + // NHWChw8c4 + INT8 *in_pack_0 = in_pack + 
idx * 8 * ic * 8 + c * 8 * 8; + INT8 *in_pack_1 = in_pack_0 + 8 * 4; + + __asm__ __volatile__("ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + "ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "ldr d4, [%[in_4]]\n" + "ldr x6, [%[in_6]]\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + + "ldr d5, [%[in_5]]\n" + "ldr x7, [%[in_7]]\n" + "ins v4.d[1], x6\n" + "ins v5.d[1], x7\n" + + "str q20, [%[pack_0]]\n" + "trn1 v24.4s, v4.4s, v5.4s\n" + "trn2 v25.4s, v4.4s, v5.4s\n" + "str q21, [%[pack_1]]\n" + "str q24, [%[pack_0], #16]\n" + "str q25, [%[pack_1], #16]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), + [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), + [in_3] "r"(in_3), [in_4] "r"(in_4), [in_5] "r"(in_5), + [in_6] "r"(in_6), [in_7] "r"(in_7) + : "memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", + "v24", "v25", "x2", "x3", "x6", "x7"); + } + } + + // compute + for (U32 o = 0; o < oc; o++) { // 8 output channels at a time + // bias + F16 *b_0 = biasArray + o * 8; + for (U32 idx = 0; idx < 36; idx++) { + INT8 *in_hw0 = in_pack + idx * 8 * ic * 8; + INT8 *f_o0c0 = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; + F16 *out_o0hw0 = otmArray + idx * 8 * 8; + if (factor_v[idx][0] == 0) { // input pixels are all 0 + memset(out_o0hw0, 0, 8 * 8 * sizeof(OT)); + continue; + } + F32 *fac = factor_v[idx]; + __asm__ __volatile__( + // Bias should be applied after transform + "eor v5.16b, v5.16b, v5.16b\n" + "ldr d1, [%[in_0]]\n" // in_0 + "eor v6.16b, v6.16b, v6.16b\n" + "ldr x1, [%[in_0], #8]\n" + "eor v7.16b, v7.16b, v7.16b\n" + "ins v1.d[1], x1\n" + "eor v8.16b, v8.16b, v8.16b\n" + "ldr d0, [%[f_0]]\n" // f_0 + "eor v9.16b, v9.16b, v9.16b\n" + "ldr x2, [%[f_0], #8]\n" + "eor v10.16b, v10.16b, v10.16b\n" + "ins v0.d[1], x2\n" + "eor v11.16b, v11.16b, v11.16b\n" + "eor v12.16b, v12.16b, v12.16b\n" + "eor v13.16b, v13.16b, v13.16b\n" + "eor v14.16b, v14.16b, v14.16b\n" + "eor v15.16b, v15.16b, v15.16b\n" + "eor v16.16b, v16.16b, v16.16b\n" + "eor v17.16b, v17.16b, v17.16b\n" + "eor v18.16b, v18.16b, v18.16b\n" + "eor v19.16b, v19.16b, v19.16b\n" + "eor v20.16b, v20.16b, v20.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + "0:\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr d3, [x3, 16]!\n" + "ldr x16, [x3, 8]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "ldr d29, [x0, 16]\n" + "ldr x17, [x0, 24]\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "ins v3.d[1], x16\n" + "ldr d30, [x3, 16]!\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + "ins v29.d[1], x17\n" + + "sdot v13.4s, v0.16b, v3.4b[0]\n" + "ldr x16, [x3, 8]\n" + "subs x2, x2, #4\n" + "sdot v15.4s, v0.16b, v3.4b[1]\n" + "sdot v17.4s, v0.16b, v3.4b[2]\n" + "ins v30.d[1], x16\n" + "sdot v19.4s, v0.16b, v3.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + "ldr d0, [x0, 32]!\n" + "ldr x17, [x0, 8]\n" + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + + "sdot v14.4s, v29.16b, v3.4b[0]\n" + "ins v0.d[1], x17\n" + "mov v1.16b, v30.16b\n" + "sdot v16.4s, v29.16b, v3.4b[1]\n" + "sdot v18.4s, v29.16b, v3.4b[2]\n" + "sdot v20.4s, v29.16b, v3.4b[3]\n" + + "bne 0b\n" + "scvtf v5.4s, v5.4s\n" + "scvtf v6.4s, v6.4s\n" + "ldr d1, [%[factor]]\n" + "ldr x1, [%[factor], #8]\n" + "scvtf v7.4s, v7.4s\n" + "scvtf v8.4s, v8.4s\n" + "ins v1.d[1], x1\n" + "scvtf v9.4s, v9.4s\n" + "scvtf v10.4s, v10.4s\n" + "scvtf v11.4s, v11.4s\n" + "scvtf v12.4s, v12.4s\n" + 
"scvtf v13.4s, v13.4s\n" + "scvtf v14.4s, v14.4s\n" + "scvtf v15.4s, v15.4s\n" + "scvtf v16.4s, v16.4s\n" + "scvtf v17.4s, v17.4s\n" + "scvtf v18.4s, v18.4s\n" + "scvtf v19.4s, v19.4s\n" + "scvtf v20.4s, v20.4s\n" + + "fmul v5.4s, v1.4s, v5.4s\n" + "fmul v6.4s, v1.4s, v6.4s\n" + "fmul v7.4s, v1.4s, v7.4s\n" + "fmul v8.4s, v1.4s, v8.4s\n" + "fmul v9.4s, v1.4s, v9.4s\n" + "fmul v10.4s, v1.4s, v10.4s\n" + "fmul v11.4s, v1.4s, v11.4s\n" + "fmul v12.4s, v1.4s, v12.4s\n" + "fmul v13.4s, v1.4s, v13.4s\n" + "fmul v14.4s, v1.4s, v14.4s\n" + "fmul v15.4s, v1.4s, v15.4s\n" + "fmul v16.4s, v1.4s, v16.4s\n" + "fmul v17.4s, v1.4s, v17.4s\n" + "fmul v18.4s, v1.4s, v18.4s\n" + "fmul v19.4s, v1.4s, v19.4s\n" + "fmul v20.4s, v1.4s, v20.4s\n" + + "fcvtn v5.4h, v5.4s\n" + "fcvtn v7.4h, v7.4s\n" + "fcvtn v9.4h, v9.4s\n" + "fcvtn v11.4h, v11.4s\n" + "fcvtn v13.4h, v13.4s\n" + "fcvtn v15.4h, v15.4s\n" + "fcvtn v17.4h, v17.4s\n" + "fcvtn v19.4h, v19.4s\n" + + "fcvtn2 v5.8h, v6.4s\n" + "fcvtn2 v7.8h, v8.4s\n" + "fcvtn2 v9.8h, v10.4s\n" + "fcvtn2 v11.8h, v12.4s\n" + "fcvtn2 v13.8h, v14.4s\n" + "fcvtn2 v15.8h, v16.4s\n" + "fcvtn2 v17.8h, v18.4s\n" + "fcvtn2 v19.8h, v20.4s\n" + + "str q5, [%[out_0]]\n" + "str q7, [%[out_0], #16]\n" + "str q9, [%[out_0], #32]\n" + "str q11, [%[out_0], #48]\n" + "str q13, [%[out_0], #64]\n" + "str q15, [%[out_0], #80]\n" + "str q17, [%[out_0], #96]\n" + "str q19, [%[out_0], #112]\n" + : + : [out_0] "r"(out_o0hw0), [in_0] "r"(in_hw0), [f_0] "r"(f_o0c0), + [ic] "r"((I64)ic * 8), [factor] "r"(fac) + : "memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v29", + "v30", "x0", "x1", "x2", "x3", "x17", "x16"); + } + // out trans + // (6*6)*hw8*o8 => NOHWo8 + for (U32 hw8 = 0; hw8 < 8; hw8++) { + U32 h = (hw + hw8) / tile_w; + U32 w = (hw + hw8) % tile_w; + F16 *out_0 = + outArray + n * oc * ohow * 8 + o * ohow * 8 + h * 4 * ow * 8 + w * 4 * 8; + + F16 *Ow_0[36]; + F16 *O_0[16]; + + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + idx * 8 * 8 + hw8 * 8; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + } + } + trans_O(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h - 1, tile_w - 1, + max, min, am); + } + } + tiles_s += 8; + tiles_tail -= 8; + } + + if (tiles_tail >= 4) { + I32 hw = tiles_s; + // in trans + // NCHWc8 => (6*6)*(C/4)*hw4*c4 + // transform hw4c8 at a time, so we need 4 times to cover hw4c8 + // pack into hw4c4 after quantizing (reuse the space of itmArray) + for (U32 c = 0; c < ic; c++) { + OT *inArray_pad_mov = inArray_pad + c * ihiw * 8; + short *Iw_ptr[36]; + short *Iw0[36]; + OT *I0[36]; + short *Iw1[36]; + OT *I1[36]; + short *Iw2[36]; + OT *I2[36]; + short *Iw3[36]; + OT *I3[36]; + + // Store transformed hw4c8 to itmArray + for (U32 i = 0; i < 36; i++) { + Iw0[i] = itmArray + i * 4 * ic * 8 + c * 4 * 8; + Iw1[i] = itmArray + i * 4 * ic * 8 + c * 4 * 8 + 1 * 8; + Iw2[i] = itmArray + i * 4 * ic * 8 + c * 4 * 8 + 2 * 8; + Iw3[i] = itmArray + i * 4 * ic * 8 + c * 4 * 8 + 3 * 8; + } + + U32 h0 = (hw / tile_w) * 4; // stride is 4 + U32 w0 = (hw % tile_w) * 4; + U32 h1 = ((hw + 1) / tile_w) * 4; + U32 w1 = ((hw + 1) % tile_w) * 4; + U32 h2 = ((hw + 2) / tile_w) * 4; + U32 w2 = ((hw + 2) % tile_w) * 4; + U32 h3 = ((hw + 3) / tile_w) * 4; + U32 w3 = ((hw + 3) % tile_w) * 4; + + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + I0[i * 6 + j] = inArray_pad_mov + (h0 + i) * iw_pad * 8 + (w0 + j) * 
8; + I1[i * 6 + j] = inArray_pad_mov + (h1 + i) * iw_pad * 8 + (w1 + j) * 8; + I2[i * 6 + j] = inArray_pad_mov + (h2 + i) * iw_pad * 8 + (w2 + j) * 8; + I3[i * 6 + j] = inArray_pad_mov + (h3 + i) * iw_pad * 8 + (w3 + j) * 8; + } + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw0[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I0); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I0); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw1[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I1); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I1); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw2[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I2); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I2); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw3[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I3); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I3); + } + } + + F32 inputScale[36]; + + if (idt == DT_I8) { + quantize_wino_input_s16(itmArray, 4 * ic * 8, inQ, inputScale, *input_scale); + } else { + quantize_wino_input((F16 *)itmArray, 4 * ic * 8, inQ, inputScale); + } + + F32 factor_v[36][4]; + for (U32 i = 0; i < 36; i++) { + if (inputScale[i] == 0) { + factor_v[i][0] = 0; + continue; + } else { + factor_v[i][0] = 1.0 / inputScale[i] / (F32)filterScale[i]; + } + factor_v[i][1] = factor_v[i][0]; + factor_v[i][2] = factor_v[i][0]; + factor_v[i][3] = factor_v[i][0]; + } + + F16 *b0 = biasArray; + INT8 *in_pack = (INT8 *)itmArray; // Reuse the space + + for (U32 idx = 0; idx < 36; idx++) { + if (factor_v[idx][0] == 0) { // input pixels are all 0 + continue; + } + for (U32 c = 0; c < ic; c++) { // for each 8 channels + INT8 *in_hw4c8 = inQ + idx * 4 * ic * 8 + c * 4 * 8; + + INT8 *in_0 = in_hw4c8; + INT8 *in_1 = in_hw4c8 + 1 * 8; + INT8 *in_2 = in_hw4c8 + 2 * 8; + INT8 *in_3 = in_hw4c8 + 3 * 8; + + // NHWChw8c4 + INT8 *in_pack_0 = in_pack + idx * 4 * ic * 8 + c * 4 * 8; + INT8 *in_pack_1 = in_pack_0 + 4 * 4; + + __asm__ __volatile__("ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + "ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + "str q20, [%[pack_0]]\n" + "str q21, [%[pack_1]]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), + [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), + [in_3] "r"(in_3) + : "memory", "cc", "v0", "v1", "v20", "v21", "x2", "x3"); + } + } + + // compute + for (U32 o = 0; o < oc; o++) { // 8 output channels at a time + // bias + F16 *b_0 = b0 + o * 8; + for (U32 idx = 0; idx < 36; idx++) { + INT8 *in_hw0 = in_pack + idx * 4 * ic * 8; + INT8 *f_o0c0 = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; + F16 *out_o0hw0 = otmArray + idx * 4 * 8; + if (factor_v[idx][0] == 0) { + memset(out_o0hw0, 0, 4 * 8 * sizeof(OT)); + continue; + } + F32 *fac = factor_v[idx]; + __asm__ __volatile__("eor v5.16b, v5.16b, v5.16b\n" + "ldr d1, [%[in_0]]\n" // in_0 + "eor v6.16b, v6.16b, v6.16b\n" + "ldr x1, [%[in_0], #8]\n" + "eor v7.16b, v7.16b, v7.16b\n" + "ins v1.d[1], x1\n" + "eor v8.16b, v8.16b, v8.16b\n" + "ldr d0, [%[f_0]]\n" // f_0 + + "eor v9.16b, v9.16b, v9.16b\n" + "ldr x2, [%[f_0], #8]\n" + "eor v10.16b, v10.16b, v10.16b\n" + "ins v0.d[1], x2\n" + "eor v11.16b, v11.16b, v11.16b\n" + "eor v12.16b, v12.16b, v12.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, 
%[ic]\n" // ic_blk + "0:\n" + "ldr d29, [x0, 16]\n" + "ldr x17, [x0, 24]\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr d3, [x3, 16]!\n" + "ldr x16, [x3, 8]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "ins v29.d[1], x17\n" + "subs x2, x2, #4\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "ins v3.d[1], x16\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "ldr d0, [x0, 32]!\n" + "ldr x17, [x0, 8]\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "ins v0.d[1], x17\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + "mov v1.16b, v3.16b\n" + + "bne 0b\n" + + "scvtf v5.4s, v5.4s\n" + "scvtf v6.4s, v6.4s\n" + "ldr d1, [%[factor]]\n" + "ldr x1, [%[factor], #8]\n" + "scvtf v7.4s, v7.4s\n" + "scvtf v8.4s, v8.4s\n" + "ins v1.d[1], x1\n" + "scvtf v9.4s, v9.4s\n" + "scvtf v10.4s, v10.4s\n" + "scvtf v11.4s, v11.4s\n" + "scvtf v12.4s, v12.4s\n" + + "fmul v5.4s, v1.4s, v5.4s\n" + "fmul v6.4s, v1.4s, v6.4s\n" + "fmul v7.4s, v1.4s, v7.4s\n" + "fmul v8.4s, v1.4s, v8.4s\n" + "fmul v9.4s, v1.4s, v9.4s\n" + "fmul v10.4s, v1.4s, v10.4s\n" + "fmul v11.4s, v1.4s, v11.4s\n" + "fmul v12.4s, v1.4s, v12.4s\n" + + "fcvtn v5.4h, v5.4s\n" + "fcvtn v7.4h, v7.4s\n" + "fcvtn v9.4h, v9.4s\n" + "fcvtn v11.4h, v11.4s\n" + + "fcvtn2 v5.8h, v6.4s\n" + "fcvtn2 v7.8h, v8.4s\n" + "fcvtn2 v9.8h, v10.4s\n" + "fcvtn2 v11.8h, v12.4s\n" + + "str q5, [%[out_0]]\n" + "str q7, [%[out_0], #16]\n" + "str q9, [%[out_0], #32]\n" + "str q11, [%[out_0], #48]\n" + : + : [out_0] "r"(out_o0hw0), [in_0] "r"(in_hw0), + [f_0] "r"(f_o0c0), [ic] "r"((I64)ic * 8), [factor] "r"(fac) + : "memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v29", "x0", "x1", "x2", + "x3", "x17", "x16"); + } + // out trans + // (6*6)*hw4*o8 => NOHWo8 + for (U32 hw4 = 0; hw4 < 4; hw4++) { + U32 h = (hw + hw4) / tile_w; + U32 w = (hw + hw4) % tile_w; + F16 *out_0 = + outArray + n * oc * ohow * 8 + o * ohow * 8 + h * 4 * ow * 8 + w * 4 * 8; + + F16 *Ow_0[36]; + F16 *O_0[16]; + + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + idx * 4 * 8 + hw4 * 8; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + } + } + trans_O(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h - 1, tile_w - 1, + max, min, am); + } + } + tiles_s += 4; + } + + for (I32 hw = tiles_s; hw < tiles; hw++) { + // in trans + // NCHWc8 => (6*6)*(C/4)*hw1*c4 + // transform hw1c8 + // pack into hw1c4 after quantizing (reuse the space of itmArray) + for (U32 c = 0; c < ic; c++) { + OT *inArray_pad_mov = inArray_pad + c * ihiw * 8; + short *Iw_ptr[36]; + short *Iw0[36]; + OT *I0[36]; + + // Store transformed hw12c8 to itmArray + for (U32 i = 0; i < 36; i++) { + Iw0[i] = itmArray + i * ic * 8 + c * 8; + } + + U32 h0 = (hw / tile_w) * 4; // stride is 4 + U32 w0 = (hw % tile_w) * 4; + + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + I0[i * 6 + j] = inArray_pad_mov + (h0 + i) * iw_pad * 8 + (w0 + j) * 8; + } + } + + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw0[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I0); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I0); + } + } + + F32 inputScale[36]; + + if (idt == DT_I8) { + quantize_wino_input_s16(itmArray, ic * 8, inQ, inputScale, *input_scale); + } else { + quantize_wino_input((F16 *)itmArray, ic * 8, inQ, inputScale); + } + + F32 factor_v[36][4]; + for (U32 i = 0; i < 36; i++) { + if (inputScale[i] == 0) { + factor_v[i][0] = 0; + continue; + } else { + factor_v[i][0] 
= 1.0 / inputScale[i] / (F32)filterScale[i]; + } + factor_v[i][1] = factor_v[i][0]; + factor_v[i][2] = factor_v[i][0]; + factor_v[i][3] = factor_v[i][0]; + } + + F16 *b0 = biasArray; + INT8 *in_pack = (INT8 *)itmArray; // Reuse the space + + for (U32 idx = 0; idx < 36; idx++) { + if (factor_v[idx][0] == 0) { + continue; + } + for (U32 c = 0; c < ic; c++) { // for each 8 channels + INT8 *in_0 = inQ + idx * ic * 8 + c * 8; + + // NHWChw8c4 + INT8 *in_pack_0 = in_pack + idx * ic * 8 + c * 8; + INT8 *in_pack_1 = in_pack_0 + 4; + + memcpy(in_pack_0, in_0, 4 * bytesOf(DT_I8)); + memcpy(in_pack_1, in_0 + 4, 4 * bytesOf(DT_I8)); + } + } + + // compute + for (U32 o = 0; o < oc; o++) { // 8 output channels at a time + // bias + F16 *b_0 = b0 + o * 8; + for (U32 idx = 0; idx < 36; idx++) { + INT8 *in_hw = in_pack + idx * ic * 8; + INT8 *f_o = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; + F16 *out_o0hw0 = otmArray + idx * 8; + if (factor_v[idx][0] == 0) { + memset(out_o0hw0, 0, 8 * sizeof(OT)); + continue; + } + int32x4_t res[2] = {0}; + + for (U32 c = 0; c < ic; c++) { + int8x8_t in_2 = vld1_s8(in_hw); + in_hw += 8; + int8x16_t f_8o[4]; + f_8o[0] = vld1q_s8(f_o); + f_8o[1] = vld1q_s8(f_o + 16); + res[0] = vdotq_lane_s32(res[0], f_8o[0], in_2, 0); + res[1] = vdotq_lane_s32(res[1], f_8o[1], in_2, 0); + + f_8o[2] = vld1q_s8(f_o + 32); + f_8o[3] = vld1q_s8(f_o + 48); + f_o += 64; + res[0] = vdotq_lane_s32(res[0], f_8o[2], in_2, 1); + res[1] = vdotq_lane_s32(res[1], f_8o[3], in_2, 1); + } + float32x4_t fac = vld1q_f32(factor_v[idx]); + float32x4_t resf0 = vcvtq_f32_s32(res[0]); + float32x4_t resf1 = vcvtq_f32_s32(res[1]); + resf0 = vmulq_f32(resf0, fac); + resf1 = vmulq_f32(resf1, fac); + + float16x4_t resh0 = vcvt_f16_f32(resf0); + float16x4_t resh1 = vcvt_f16_f32(resf1); + + vst1_f16(out_o0hw0, resh0); + vst1_f16(out_o0hw0 + 4, resh1); + } + // out trans + // (6*6)*hw1*o8 => NOHWo8 + U32 h = hw / tile_w; + U32 w = hw % tile_w; + F16 *out_0 = + outArray + n * oc * ohow * 8 + o * ohow * 8 + h * 4 * ow * 8 + w * 4 * 8; + + F16 *Ow_0[36]; + F16 *O_0[16]; + + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + idx * 8; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + } + } + trans_O(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h - 1, tile_w - 1, max, + min, am); + } + } + } + + if (DT_I8 == odt) { + F16 max_s = max[0]; + F16 min_s = min[0]; + for (U32 i = 1; i < 8; i++) { + if (max_s < max[i]) { + max_s = max[i]; + } + if (min_s > min[i]) { + min_s = min[i]; + } + } + + if (max_s == 0 && min_s == 0) { + return NOT_SUPPORTED; + } + + F16 scale_o; + if (max_s > 0 && min_s < 0) { + F16 scale_max = 127.0 / max_s; + F16 scale_min = -127.0 / min_s; + scale_o = (scale_max < scale_min) ? 
scale_max : scale_min; + } else if (max_s > 0) { + scale_o = 127.0 / max_s; + } else { + scale_o = -127.0 / min_s; + } + *outputScale = scale_o; + + apply_scale_f16(on * oc * ohow * 8, outArray, scale_o, (INT8 *)output); + } + return SUCCESS; +} + +template EE convolution_winograd_A55<INT8>(TensorDesc inputDesc, + const void *input, + F16 *input_scale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec am); + +template EE convolution_winograd_A55<F16>(TensorDesc inputDesc, + const void *input, + F16 *input_scale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec am); +#endif diff --git a/compute/tensor/src/cpu/arm/int8/convolution_winograd_A76.cpp b/compute/tensor/src/cpu/arm/int8/convolution_winograd_A76.cpp new file mode 100644 index 00000000..9d997df0 --- /dev/null +++ b/compute/tensor/src/cpu/arm/int8/convolution_winograd_A76.cpp @@ -0,0 +1,1440 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifdef _USE_INT8 +#include "cpu/arm/int8/convolution_winograd_transform.h" +#include "cpu/arm/int8/convolution_winograd.h" + +template <typename OT> +EE convolution_winograd_A76(TensorDesc inputDesc, + const void *input, + F16 *input_scale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec am) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + + // not truly one-step. Compute hw12*(6*6)*ic at a time.
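+ // Reference sketch (comments only; nothing here is executed): these kernels
+ // implement Winograd F(4x4, 3x3), so each 6x6 input tile, read with stride 4,
+ // yields one 4x4 output tile; hence tile_h = (oh + 3) / 4 and
+ // tile_w = (ow + 3) / 4 below, with tiles consumed in groups of 12, 8, 4 and
+ // 1 for the remainder. On the int8 path, the scvtf/fmul/fcvtn sequences in
+ // the assembly vectorize the scalar dequantization
+ //     out[i] = (F16)((F32)acc[i] / (inputScale[idx] * (F32)filterScale[idx]));
+ // which is exactly the factor_v table computed before each compute loop.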
+ DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + if (fdf != DF_HWNCN8C4) { + return NOT_MATCH; + } + if (!(fh == 6 && fw == 6)) { + return NOT_MATCH; + } + + // Assume IT is the same as OT + OT *inArray = (OT *)input; + INT8 *filterArray = (INT8 *)filter; + F16 *outArray = (F16 *)output; + F16 *biasArray = (F16 *)bias; + + // both input and output are stored with C8 + oc /= 8; + ic /= 8; + + U32 tile_h = (oh + 3) / 4; + U32 tile_w = (ow + 3) / 4; + I32 tiles = tile_h * tile_w; // num of 6x6 tiles + U32 pad_left = paddingL; + U32 pad_right = paddingR + (tile_w * 4 - ow); + U32 pad_w_mod_4 = tile_w * 4 - ow; + U32 pad_top = paddingT; + U32 pad_bottom = paddingB + (tile_h * 4 - oh); + U32 pad_h_mod_4 = tile_h * 4 - oh; + U32 ih_pad = ih + pad_top + pad_bottom; + U32 iw_pad = iw + pad_left + pad_right; + + U32 ohow = oh * ow; + U32 ihiw = ih_pad * iw_pad; + + // tmp = in_pad + itm + otm + inQ + ... + // in_pad: ic*ih_pad*iw_pad*8 + // itm: 6*6*ic*12*8 (int16 or fp16) + // otm: 6*6*12*8 (F16) + // inQ: 6*6*ic*12*8 (int8) + OT *inArray_pad = (OT *)tmp; + short *itmArray = (short *)(inArray_pad + ic * ihiw * 8); // will be cast to fp16 for fp16 inputs + F16 *otmArray = (F16 *)(itmArray + 6 * 6 * ic * 12 * 8); + INT8 *inQ = (INT8 *)(otmArray + 6 * 6 * 12 * 8); + if (DT_I8 == odt) { + outArray = (F16 *)(inQ + 6 * 6 * ic * 12 * 8); // After otmArray and pack + } + + // To track the range of the final outputs and prepare for quantization + F16 max[8] = {0}; + F16 min[8] = {0}; + + for (U32 n = 0; n < in; n++) { // for each batch + OT *inArray_pad_mov = inArray_pad; + OT *inArray_mov = inArray + n * ic * ih * iw * 8; + for (U32 c = 0; c < ic; c++) { + memset(inArray_pad_mov, 0, pad_top * iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += pad_top * iw_pad * 8; + for (U32 h = pad_top; h < ih_pad - pad_bottom; h++) { + memset(inArray_pad_mov, 0, pad_left * 8 * bytesOf(idt)); + inArray_pad_mov += pad_left * 8; + memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); + inArray_pad_mov += iw * 8; + inArray_mov += iw * 8; + memset(inArray_pad_mov, 0, pad_right * 8 * bytesOf(idt)); + inArray_pad_mov += pad_right * 8; + } + memset(inArray_pad_mov, 0, pad_bottom * iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += pad_bottom * iw_pad * 8; + } + + // tiles / 12 + for (I32 hw = 0; hw < tiles - 11; hw += 12) { + // in trans + // NCHWc8 => (6*6)*(C/4)*hw12*c4 + // transform hw1c8 at a time, so we need 12 times to cover hw12c8 + // pack into hw12c4 after quantizing (reuse the space of itmArray) + for (U32 c = 0; c < ic; c++) { + OT *inArray_pad_mov = inArray_pad + c * ihiw * 8; + short *Iw_ptr[36]; + short *Iw0[36]; + OT *I0[36]; + short *Iw1[36]; + OT *I1[36]; + short *Iw2[36]; + OT *I2[36]; + short *Iw3[36]; + OT *I3[36]; + short *Iw4[36]; + OT *I4[36]; + short *Iw5[36]; + OT *I5[36]; + short *Iw6[36]; + OT *I6[36]; + short *Iw7[36]; + OT *I7[36]; + short *Iw8[36]; + OT *I8[36]; + short *Iw9[36]; + OT *I9[36]; + short *Iw10[36]; + OT *I10[36]; + short *Iw11[36]; + OT *I11[36]; + + // Store transformed hw12c8 to itmArray + for (U32 i = 0; i < 36; 
i++) { + Iw0[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12; + Iw1[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 1 * 8; + Iw2[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 2 * 8; + Iw3[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 3 * 8; + Iw4[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 4 * 8; + Iw5[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 5 * 8; + Iw6[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 6 * 8; + Iw7[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 7 * 8; + Iw8[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 8 * 8; + Iw9[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 9 * 8; + Iw10[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 10 * 8; + Iw11[i] = itmArray + i * 12 * ic * 8 + c * 8 * 12 + 11 * 8; + } + + U32 h0 = (hw / tile_w) * 4; // stride is 4 + U32 w0 = (hw % tile_w) * 4; + U32 h1 = ((hw + 1) / tile_w) * 4; + U32 w1 = ((hw + 1) % tile_w) * 4; + U32 h2 = ((hw + 2) / tile_w) * 4; + U32 w2 = ((hw + 2) % tile_w) * 4; + U32 h3 = ((hw + 3) / tile_w) * 4; + U32 w3 = ((hw + 3) % tile_w) * 4; + U32 h4 = ((hw + 4) / tile_w) * 4; + U32 w4 = ((hw + 4) % tile_w) * 4; + U32 h5 = ((hw + 5) / tile_w) * 4; + U32 w5 = ((hw + 5) % tile_w) * 4; + U32 h6 = ((hw + 6) / tile_w) * 4; + U32 w6 = ((hw + 6) % tile_w) * 4; + U32 h7 = ((hw + 7) / tile_w) * 4; + U32 w7 = ((hw + 7) % tile_w) * 4; + U32 h8 = ((hw + 8) / tile_w) * 4; + U32 w8 = ((hw + 8) % tile_w) * 4; + U32 h9 = ((hw + 9) / tile_w) * 4; + U32 w9 = ((hw + 9) % tile_w) * 4; + U32 h10 = ((hw + 10) / tile_w) * 4; + U32 w10 = ((hw + 10) % tile_w) * 4; + U32 h11 = ((hw + 11) / tile_w) * 4; + U32 w11 = ((hw + 11) % tile_w) * 4; + + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + I0[i * 6 + j] = inArray_pad_mov + (h0 + i) * iw_pad * 8 + (w0 + j) * 8; + I1[i * 6 + j] = inArray_pad_mov + (h1 + i) * iw_pad * 8 + (w1 + j) * 8; + I2[i * 6 + j] = inArray_pad_mov + (h2 + i) * iw_pad * 8 + (w2 + j) * 8; + I3[i * 6 + j] = inArray_pad_mov + (h3 + i) * iw_pad * 8 + (w3 + j) * 8; + I4[i * 6 + j] = inArray_pad_mov + (h4 + i) * iw_pad * 8 + (w4 + j) * 8; + I5[i * 6 + j] = inArray_pad_mov + (h5 + i) * iw_pad * 8 + (w5 + j) * 8; + I6[i * 6 + j] = inArray_pad_mov + (h6 + i) * iw_pad * 8 + (w6 + j) * 8; + I7[i * 6 + j] = inArray_pad_mov + (h7 + i) * iw_pad * 8 + (w7 + j) * 8; + I8[i * 6 + j] = inArray_pad_mov + (h8 + i) * iw_pad * 8 + (w8 + j) * 8; + I9[i * 6 + j] = inArray_pad_mov + (h9 + i) * iw_pad * 8 + (w9 + j) * 8; + I10[i * 6 + j] = inArray_pad_mov + (h10 + i) * iw_pad * 8 + (w10 + j) * 8; + I11[i * 6 + j] = inArray_pad_mov + (h11 + i) * iw_pad * 8 + (w11 + j) * 8; + } + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw0[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I0); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I0); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw1[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I1); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I1); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw2[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I2); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I2); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw3[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I3); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I3); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw4[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I4); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I4); + } + for (U32 i = 0; i < 
36; i++) { + Iw_ptr[i] = Iw5[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I5); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I5); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw6[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I6); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I6); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw7[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I7); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I7); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw8[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I8); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I8); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw9[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I9); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I9); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw10[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I10); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I10); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw11[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I11); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I11); + } + } + + F32 inputScale[36]; + + if (DT_I8 == idt) { + quantize_wino_input_s16(itmArray, 12 * ic * 8, inQ, inputScale, *input_scale); + } else { + quantize_wino_input((F16 *)itmArray, 12 * ic * 8, inQ, inputScale); + } + + F32 factor_v[36][4]; + for (U32 i = 0; i < 36; i++) { + if (inputScale[i] == 0) { + factor_v[i][0] = 0; + continue; + } else { + factor_v[i][0] = 1.0 / inputScale[i] / (F32)filterScale[i]; + } + + factor_v[i][1] = factor_v[i][0]; + factor_v[i][2] = factor_v[i][0]; + factor_v[i][3] = factor_v[i][0]; + } + + INT8 *in_pack = (INT8 *)itmArray; // Reuse the space + + for (U32 idx = 0; idx < 36; idx++) { + if (factor_v[idx][0] == 0) { // input pixels are all 0 + continue; + } + for (U32 c = 0; c < ic; c++) { // for each 8 channels + INT8 *in_hw12c8 = inQ + idx * 12 * ic * 8 + c * 12 * 8; + + INT8 *in_0 = in_hw12c8; + INT8 *in_1 = in_hw12c8 + 1 * 8; + INT8 *in_2 = in_hw12c8 + 2 * 8; + INT8 *in_3 = in_hw12c8 + 3 * 8; + INT8 *in_4 = in_hw12c8 + 4 * 8; + INT8 *in_5 = in_hw12c8 + 5 * 8; + INT8 *in_6 = in_hw12c8 + 6 * 8; + INT8 *in_7 = in_hw12c8 + 7 * 8; + INT8 *in_8 = in_hw12c8 + 8 * 8; + INT8 *in_9 = in_hw12c8 + 9 * 8; + INT8 *in_10 = in_hw12c8 + 10 * 8; + INT8 *in_11 = in_hw12c8 + 11 * 8; + + // NHWChw12c4 + INT8 *in_pack_0 = in_pack + idx * 12 * ic * 8 + c * 12 * 8; + INT8 *in_pack_1 = in_pack_0 + 12 * 4; + + __asm__ __volatile__( + "ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + "ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "ldr d4, [%[in_4]]\n" + "ldr x6, [%[in_6]]\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + + "ldr d5, [%[in_5]]\n" + "ldr x7, [%[in_7]]\n" + "ins v4.d[1], x6\n" + "ins v5.d[1], x7\n" + "ldr d8, [%[in_8]]\n" + "ldr x10, [%[in_10]]\n" + "trn1 v24.4s, v4.4s, v5.4s\n" + "trn2 v25.4s, v4.4s, v5.4s\n" + "ldr d9, [%[in_9]]\n" + "ldr x11, [%[in_11]]\n" + "ins v8.d[1], x10\n" + "ins v9.d[1], x11\n" + + "str q20, [%[pack_0]]\n" + "trn1 v28.4s, v8.4s, v9.4s\n" + "trn2 v29.4s, v8.4s, v9.4s\n" + "str q24, [%[pack_0], #16]\n" + "str q28, [%[pack_0], #32]\n" + "str q21, [%[pack_1]]\n" + "str q25, [%[pack_1], #16]\n" + "str q29, [%[pack_1], #32]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), 
[in_0] "r"(in_0), + [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3), [in_4] "r"(in_4), + [in_5] "r"(in_5), [in_6] "r"(in_6), [in_7] "r"(in_7), [in_8] "r"(in_8), + [in_9] "r"(in_9), [in_10] "r"(in_10), [in_11] "r"(in_11) + : "memory", "cc", "v0", "v1", "v4", "v5", "v8", "v9", "v20", "v21", "v24", + "v25", "v28", "v29", "x2", "x3", "x6", "x7", "x10", "x11"); + } + } + + // compute + for (U32 o = 0; o < oc; o++) { // 8 output channels at a time + // bias + F16 *b_0 = biasArray + o * 8; + for (U32 idx = 0; idx < 36; idx++) { + INT8 *in_hw0 = in_pack + idx * 12 * ic * 8; + INT8 *f_o0c0 = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; + F16 *out_o0hw0 = otmArray + idx * 12 * 8; + if (factor_v[idx][0] == 0) { // input pixels are all 0 + memset(out_o0hw0, 0, 12 * 8 * sizeof(OT)); + continue; + } + F32 *fac = factor_v[idx]; + __asm__ __volatile__("eor v5.16b, v5.16b, v5.16b\n" + "ldr q1, [%[in_0]]\n" // in_0 + "eor v6.16b, v6.16b, v6.16b\n" + "eor v7.16b, v7.16b, v7.16b\n" + "eor v8.16b, v8.16b, v8.16b\n" + "ldr q0, [%[f_0]]\n" // f_0 + "eor v9.16b, v9.16b, v9.16b\n" + "eor v10.16b, v10.16b, v10.16b\n" + "eor v11.16b, v11.16b, v11.16b\n" + "ldr q3, [%[in_0], #16]\n" // in_1 + "eor v12.16b, v12.16b, v12.16b\n" + "eor v13.16b, v13.16b, v13.16b\n" + "eor v14.16b, v14.16b, v14.16b\n" + "eor v15.16b, v15.16b, v15.16b\n" + "eor v16.16b, v16.16b, v16.16b\n" + + "eor v17.16b, v17.16b, v17.16b\n" + "eor v18.16b, v18.16b, v18.16b\n" + "eor v19.16b, v19.16b, v19.16b\n" + "eor v20.16b, v20.16b, v20.16b\n" + "eor v21.16b, v21.16b, v21.16b\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + "eor v25.16b, v25.16b, v25.16b\n" + "eor v26.16b, v26.16b, v26.16b\n" + "eor v27.16b, v27.16b, v27.16b\n" + "eor v28.16b, v28.16b, v28.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + "0:\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "ldr q2, [x3, 32]\n" + "ldr q29, [x0, 16]\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + + "sdot v13.4s, v0.16b, v3.4b[0]\n" + "sdot v15.4s, v0.16b, v3.4b[1]\n" + "sdot v17.4s, v0.16b, v3.4b[2]\n" + "sdot v19.4s, v0.16b, v3.4b[3]\n" + + "sdot v21.4s, v0.16b, v2.4b[0]\n" + "sdot v23.4s, v0.16b, v2.4b[1]\n" + "sdot v25.4s, v0.16b, v2.4b[2]\n" + "sdot v27.4s, v0.16b, v2.4b[3]\n" + + "sdot v14.4s, v29.16b, v3.4b[0]\n" + "sdot v16.4s, v29.16b, v3.4b[1]\n" + + "sdot v18.4s, v29.16b, v3.4b[2]\n" + "sdot v20.4s, v29.16b, v3.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + "ldr q0, [x0, 32]!\n" + "subs x2, x2, #4\n" + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + + "ldr q1, [x3, 48]!\n" + "ldr q3, [x3, 16]\n" + "sdot v22.4s, v29.16b, v2.4b[0]\n" + "sdot v24.4s, v29.16b, v2.4b[1]\n" + "sdot v26.4s, v29.16b, v2.4b[2]\n" + "sdot v28.4s, v29.16b, v2.4b[3]\n" + + "bne 0b\n" + "scvtf v5.4s, v5.4s\n" + "scvtf v6.4s, v6.4s\n" + "ldr q1, [%[factor]]\n" + "scvtf v7.4s, v7.4s\n" + "scvtf v8.4s, v8.4s\n" + "scvtf v9.4s, v9.4s\n" + "scvtf v10.4s, v10.4s\n" + "scvtf v11.4s, v11.4s\n" + "scvtf v12.4s, v12.4s\n" + "scvtf v13.4s, v13.4s\n" + "scvtf v14.4s, v14.4s\n" + "scvtf v15.4s, v15.4s\n" + "scvtf v16.4s, v16.4s\n" + "scvtf v17.4s, v17.4s\n" + "scvtf v18.4s, v18.4s\n" + "scvtf v19.4s, v19.4s\n" + "scvtf v20.4s, v20.4s\n" + "scvtf v21.4s, v21.4s\n" + "scvtf v22.4s, v22.4s\n" + "scvtf v23.4s, v23.4s\n" + "scvtf v24.4s, v24.4s\n" + "scvtf 
v25.4s, v25.4s\n" + "scvtf v26.4s, v26.4s\n" + "scvtf v27.4s, v27.4s\n" + "scvtf v28.4s, v28.4s\n" + + "fmul v5.4s, v1.4s, v5.4s\n" + "fmul v6.4s, v1.4s, v6.4s\n" + "fmul v7.4s, v1.4s, v7.4s\n" + "fmul v8.4s, v1.4s, v8.4s\n" + "fmul v9.4s, v1.4s, v9.4s\n" + "fmul v10.4s, v1.4s, v10.4s\n" + "fmul v11.4s, v1.4s, v11.4s\n" + "fmul v12.4s, v1.4s, v12.4s\n" + "fmul v13.4s, v1.4s, v13.4s\n" + "fmul v14.4s, v1.4s, v14.4s\n" + "fmul v15.4s, v1.4s, v15.4s\n" + "fmul v16.4s, v1.4s, v16.4s\n" + "fmul v17.4s, v1.4s, v17.4s\n" + "fmul v18.4s, v1.4s, v18.4s\n" + "fmul v19.4s, v1.4s, v19.4s\n" + "fmul v20.4s, v1.4s, v20.4s\n" + "fmul v21.4s, v1.4s, v21.4s\n" + "fmul v22.4s, v1.4s, v22.4s\n" + "fmul v23.4s, v1.4s, v23.4s\n" + "fmul v24.4s, v1.4s, v24.4s\n" + "fmul v25.4s, v1.4s, v25.4s\n" + "fmul v26.4s, v1.4s, v26.4s\n" + "fmul v27.4s, v1.4s, v27.4s\n" + "fmul v28.4s, v1.4s, v28.4s\n" + + "fcvtn v5.4h, v5.4s\n" + "fcvtn v7.4h, v7.4s\n" + "fcvtn v9.4h, v9.4s\n" + "fcvtn v11.4h, v11.4s\n" + "fcvtn v13.4h, v13.4s\n" + "fcvtn v15.4h, v15.4s\n" + "fcvtn v17.4h, v17.4s\n" + "fcvtn v19.4h, v19.4s\n" + "fcvtn v21.4h, v21.4s\n" + "fcvtn v23.4h, v23.4s\n" + "fcvtn v25.4h, v25.4s\n" + "fcvtn v27.4h, v27.4s\n" + + "fcvtn2 v5.8h, v6.4s\n" + "fcvtn2 v7.8h, v8.4s\n" + "fcvtn2 v9.8h, v10.4s\n" + "fcvtn2 v11.8h, v12.4s\n" + "fcvtn2 v13.8h, v14.4s\n" + "fcvtn2 v15.8h, v16.4s\n" + "fcvtn2 v17.8h, v18.4s\n" + "fcvtn2 v19.8h, v20.4s\n" + "fcvtn2 v21.8h, v22.4s\n" + "fcvtn2 v23.8h, v24.4s\n" + "fcvtn2 v25.8h, v26.4s\n" + "fcvtn2 v27.8h, v28.4s\n" + + "str q5, [%[out_0]]\n" + "str q7, [%[out_0], #16]\n" + "str q9, [%[out_0], #32]\n" + "str q11, [%[out_0], #48]\n" + "str q13, [%[out_0], #64]\n" + "str q15, [%[out_0], #80]\n" + "str q17, [%[out_0], #96]\n" + "str q19, [%[out_0], #112]\n" + "str q21, [%[out_0], #128]\n" + "str q23, [%[out_0], #144]\n" + "str q25, [%[out_0], #160]\n" + "str q27, [%[out_0], #176]\n" + : + : [out_0] "r"(out_o0hw0), [in_0] "r"(in_hw0), + [f_0] "r"(f_o0c0), [ic] "r"((I64)ic * 8), [factor] "r"(fac) + : "memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "x0", "x1", "x2", + "x3", "x17", "x16"); + } + // out trans + // (6*6)*hw12*o8 => NOHWo8 + for (U32 hw12 = 0; hw12 < 12; hw12++) { + U32 h = (hw + hw12) / tile_w; + U32 w = (hw + hw12) % tile_w; + F16 *out_0 = + outArray + n * oc * ohow * 8 + o * ohow * 8 + h * 4 * ow * 8 + w * 4 * 8; + + F16 *Ow_0[36]; + F16 *O_0[16]; + + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + idx * 12 * 8 + hw12 * 8; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + } + } + trans_O(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h - 1, tile_w - 1, + max, min, am); + } + } + } + + // tiles_reminder % 12 / 8 + I32 tiles_s = (tiles / 12) * 12; + I32 tiles_tail = tiles - tiles_s; + + if (tiles_tail >= 8) { + I32 hw = tiles_s; + // in trans + // NCHWc8 => (6*6)*(C/4)*hw8*c4 + // transform hw1c8 at a time, so we need 8 times to cover hw8c8 + // pack into hw8c4 after quantizing (reuse the space of itmArray) + for (U32 c = 0; c < ic; c++) { + OT *inArray_pad_mov = inArray_pad + c * ihiw * 8; + short *Iw_ptr[36]; + short *Iw0[36]; + OT *I0[36]; + short *Iw1[36]; + OT *I1[36]; + short *Iw2[36]; + OT *I2[36]; + short *Iw3[36]; + OT *I3[36]; + short *Iw4[36]; + OT *I4[36]; + short *Iw5[36]; + OT *I5[36]; + short *Iw6[36]; + OT *I6[36]; + 
short *Iw7[36]; + OT *I7[36]; + + // Store transformed hw12c8 to itmArray + for (U32 i = 0; i < 36; i++) { + Iw0[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8; + Iw1[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8 + 1 * 8; + Iw2[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8 + 2 * 8; + Iw3[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8 + 3 * 8; + Iw4[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8 + 4 * 8; + Iw5[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8 + 5 * 8; + Iw6[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8 + 6 * 8; + Iw7[i] = itmArray + i * 8 * ic * 8 + c * 8 * 8 + 7 * 8; + } + + U32 h0 = (hw / tile_w) * 4; // stride is 4 + U32 w0 = (hw % tile_w) * 4; + U32 h1 = ((hw + 1) / tile_w) * 4; + U32 w1 = ((hw + 1) % tile_w) * 4; + U32 h2 = ((hw + 2) / tile_w) * 4; + U32 w2 = ((hw + 2) % tile_w) * 4; + U32 h3 = ((hw + 3) / tile_w) * 4; + U32 w3 = ((hw + 3) % tile_w) * 4; + U32 h4 = ((hw + 4) / tile_w) * 4; + U32 w4 = ((hw + 4) % tile_w) * 4; + U32 h5 = ((hw + 5) / tile_w) * 4; + U32 w5 = ((hw + 5) % tile_w) * 4; + U32 h6 = ((hw + 6) / tile_w) * 4; + U32 w6 = ((hw + 6) % tile_w) * 4; + U32 h7 = ((hw + 7) / tile_w) * 4; + U32 w7 = ((hw + 7) % tile_w) * 4; + + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + I0[i * 6 + j] = inArray_pad_mov + (h0 + i) * iw_pad * 8 + (w0 + j) * 8; + I1[i * 6 + j] = inArray_pad_mov + (h1 + i) * iw_pad * 8 + (w1 + j) * 8; + I2[i * 6 + j] = inArray_pad_mov + (h2 + i) * iw_pad * 8 + (w2 + j) * 8; + I3[i * 6 + j] = inArray_pad_mov + (h3 + i) * iw_pad * 8 + (w3 + j) * 8; + I4[i * 6 + j] = inArray_pad_mov + (h4 + i) * iw_pad * 8 + (w4 + j) * 8; + I5[i * 6 + j] = inArray_pad_mov + (h5 + i) * iw_pad * 8 + (w5 + j) * 8; + I6[i * 6 + j] = inArray_pad_mov + (h6 + i) * iw_pad * 8 + (w6 + j) * 8; + I7[i * 6 + j] = inArray_pad_mov + (h7 + i) * iw_pad * 8 + (w7 + j) * 8; + } + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw0[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I0); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I0); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw1[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I1); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I1); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw2[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I2); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I2); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw3[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I3); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I3); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw4[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I4); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I4); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw5[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I5); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I5); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw6[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I6); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I6); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw7[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I7); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I7); + } + } + + F32 inputScale[36]; + + if (idt == DT_I8) { + quantize_wino_input_s16(itmArray, 8 * ic * 8, inQ, inputScale, *input_scale); + } else { + quantize_wino_input((F16 *)itmArray, 8 * ic * 8, inQ, 
inputScale); + } + + F32 factor_v[36][4]; + for (U32 i = 0; i < 36; i++) { + if (inputScale[i] == 0) { + factor_v[i][0] = 0; + continue; + } else { + factor_v[i][0] = 1.0 / inputScale[i] / (F32)filterScale[i]; + } + factor_v[i][1] = factor_v[i][0]; + factor_v[i][2] = factor_v[i][0]; + factor_v[i][3] = factor_v[i][0]; + } + + INT8 *in_pack = (INT8 *)itmArray; // Reuse the space + + for (U32 idx = 0; idx < 36; idx++) { + if (factor_v[idx][0] == 0) { // input pixels are all 0 + continue; + } + for (U32 c = 0; c < ic; c++) { // for each 8 channels + INT8 *in_hw8c8 = inQ + idx * 8 * ic * 8 + c * 8 * 8; + + INT8 *in_0 = in_hw8c8; + INT8 *in_1 = in_hw8c8 + 1 * 8; + INT8 *in_2 = in_hw8c8 + 2 * 8; + INT8 *in_3 = in_hw8c8 + 3 * 8; + INT8 *in_4 = in_hw8c8 + 4 * 8; + INT8 *in_5 = in_hw8c8 + 5 * 8; + INT8 *in_6 = in_hw8c8 + 6 * 8; + INT8 *in_7 = in_hw8c8 + 7 * 8; + + // NHWChw8c4 + INT8 *in_pack_0 = in_pack + idx * 8 * ic * 8 + c * 8 * 8; + INT8 *in_pack_1 = in_pack_0 + 8 * 4; + + __asm__ __volatile__("ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + "ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "ldr d4, [%[in_4]]\n" + "ldr x6, [%[in_6]]\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + + "ldr d5, [%[in_5]]\n" + "ldr x7, [%[in_7]]\n" + "ins v4.d[1], x6\n" + "ins v5.d[1], x7\n" + + "str q20, [%[pack_0]]\n" + "trn1 v24.4s, v4.4s, v5.4s\n" + "trn2 v25.4s, v4.4s, v5.4s\n" + "str q21, [%[pack_1]]\n" + "str q24, [%[pack_0], #16]\n" + "str q25, [%[pack_1], #16]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), + [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), + [in_3] "r"(in_3), [in_4] "r"(in_4), [in_5] "r"(in_5), + [in_6] "r"(in_6), [in_7] "r"(in_7) + : "memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", + "v24", "v25", "x2", "x3", "x6", "x7"); + } + } + + // compute + for (U32 o = 0; o < oc; o++) { // 8 output channels at a time + // bias + F16 *b_0 = biasArray + o * 8; + for (U32 idx = 0; idx < 36; idx++) { + INT8 *in_hw0 = in_pack + idx * 8 * ic * 8; + INT8 *f_o0c0 = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; + F16 *out_o0hw0 = otmArray + idx * 8 * 8; + if (factor_v[idx][0] == 0) { // input pixels are all 0 + memset(out_o0hw0, 0, 8 * 8 * sizeof(OT)); + continue; + } + F32 *fac = factor_v[idx]; + __asm__ __volatile__( + // Bias should be applied after transform + "eor v5.16b, v5.16b, v5.16b\n" + "ldr q1, [%[in_0]]\n" // in_0 + "eor v6.16b, v6.16b, v6.16b\n" + "eor v7.16b, v7.16b, v7.16b\n" + "eor v8.16b, v8.16b, v8.16b\n" + "ldr q0, [%[f_0]]\n" // f_0 + "eor v9.16b, v9.16b, v9.16b\n" + "eor v10.16b, v10.16b, v10.16b\n" + "eor v11.16b, v11.16b, v11.16b\n" + "eor v12.16b, v12.16b, v12.16b\n" + "eor v13.16b, v13.16b, v13.16b\n" + "eor v14.16b, v14.16b, v14.16b\n" + "eor v15.16b, v15.16b, v15.16b\n" + "eor v16.16b, v16.16b, v16.16b\n" + "eor v17.16b, v17.16b, v17.16b\n" + "eor v18.16b, v18.16b, v18.16b\n" + "eor v19.16b, v19.16b, v19.16b\n" + "eor v20.16b, v20.16b, v20.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + "0:\n" + "ldr q3, [x3, 16]!\n" + "ldr q29, [x0, 16]\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + + "sdot v13.4s, v0.16b, v3.4b[0]\n" + "sdot v15.4s, v0.16b, v3.4b[1]\n" + "sdot v17.4s, v0.16b, v3.4b[2]\n" + "sdot v19.4s, v0.16b, v3.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + 
"ldr q0, [x0, 32]!\n" + "subs x2, x2, #4\n" + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + + "sdot v14.4s, v29.16b, v3.4b[0]\n" + "ldr q1, [x3, 16]!\n" + "sdot v16.4s, v29.16b, v3.4b[1]\n" + "sdot v18.4s, v29.16b, v3.4b[2]\n" + "sdot v20.4s, v29.16b, v3.4b[3]\n" + + "bne 0b\n" + "scvtf v5.4s, v5.4s\n" + "scvtf v6.4s, v6.4s\n" + "ldr q1, [%[factor]]\n" + "scvtf v7.4s, v7.4s\n" + "scvtf v8.4s, v8.4s\n" + "scvtf v9.4s, v9.4s\n" + "scvtf v10.4s, v10.4s\n" + "scvtf v11.4s, v11.4s\n" + "scvtf v12.4s, v12.4s\n" + "scvtf v13.4s, v13.4s\n" + "scvtf v14.4s, v14.4s\n" + "scvtf v15.4s, v15.4s\n" + "scvtf v16.4s, v16.4s\n" + "scvtf v17.4s, v17.4s\n" + "scvtf v18.4s, v18.4s\n" + "scvtf v19.4s, v19.4s\n" + "scvtf v20.4s, v20.4s\n" + + "fmul v5.4s, v1.4s, v5.4s\n" + "fmul v6.4s, v1.4s, v6.4s\n" + "fmul v7.4s, v1.4s, v7.4s\n" + "fmul v8.4s, v1.4s, v8.4s\n" + "fmul v9.4s, v1.4s, v9.4s\n" + "fmul v10.4s, v1.4s, v10.4s\n" + "fmul v11.4s, v1.4s, v11.4s\n" + "fmul v12.4s, v1.4s, v12.4s\n" + "fmul v13.4s, v1.4s, v13.4s\n" + "fmul v14.4s, v1.4s, v14.4s\n" + "fmul v15.4s, v1.4s, v15.4s\n" + "fmul v16.4s, v1.4s, v16.4s\n" + "fmul v17.4s, v1.4s, v17.4s\n" + "fmul v18.4s, v1.4s, v18.4s\n" + "fmul v19.4s, v1.4s, v19.4s\n" + "fmul v20.4s, v1.4s, v20.4s\n" + + "fcvtn v5.4h, v5.4s\n" + "fcvtn v7.4h, v7.4s\n" + "fcvtn v9.4h, v9.4s\n" + "fcvtn v11.4h, v11.4s\n" + "fcvtn v13.4h, v13.4s\n" + "fcvtn v15.4h, v15.4s\n" + "fcvtn v17.4h, v17.4s\n" + "fcvtn v19.4h, v19.4s\n" + + "fcvtn2 v5.8h, v6.4s\n" + "fcvtn2 v7.8h, v8.4s\n" + "fcvtn2 v9.8h, v10.4s\n" + "fcvtn2 v11.8h, v12.4s\n" + "fcvtn2 v13.8h, v14.4s\n" + "fcvtn2 v15.8h, v16.4s\n" + "fcvtn2 v17.8h, v18.4s\n" + "fcvtn2 v19.8h, v20.4s\n" + + "str q5, [%[out_0]]\n" + "str q7, [%[out_0], #16]\n" + "str q9, [%[out_0], #32]\n" + "str q11, [%[out_0], #48]\n" + "str q13, [%[out_0], #64]\n" + "str q15, [%[out_0], #80]\n" + "str q17, [%[out_0], #96]\n" + "str q19, [%[out_0], #112]\n" + : + : [out_0] "r"(out_o0hw0), [in_0] "r"(in_hw0), [f_0] "r"(f_o0c0), + [ic] "r"((I64)ic * 8), [factor] "r"(fac) + : "memory", "cc", "v0", "v1", "v3", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v29", + "v30", "x0", "x1", "x2", "x3", "x17", "x16"); + } + // out trans + // (6*6)*hw8*o8 => NOHWo8 + for (U32 hw8 = 0; hw8 < 8; hw8++) { + U32 h = (hw + hw8) / tile_w; + U32 w = (hw + hw8) % tile_w; + F16 *out_0 = + outArray + n * oc * ohow * 8 + o * ohow * 8 + h * 4 * ow * 8 + w * 4 * 8; + + F16 *Ow_0[36]; + F16 *O_0[16]; + + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + idx * 8 * 8 + hw8 * 8; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + } + } + trans_O(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h - 1, tile_w - 1, + max, min, am); + } + } + tiles_s += 8; + tiles_tail -= 8; + } + + if (tiles_tail >= 4) { + I32 hw = tiles_s; + // in trans + // NCHWc8 => (6*6)*(C/4)*hw4*c4 + // transform hw4c8 at a time, so we need 4 times to cover hw4c8 + // pack into hw4c4 after quantizing (reuse the space of itmArray) + for (U32 c = 0; c < ic; c++) { + OT *inArray_pad_mov = inArray_pad + c * ihiw * 8; + short *Iw_ptr[36]; + short *Iw0[36]; + OT *I0[36]; + short *Iw1[36]; + OT *I1[36]; + short *Iw2[36]; + OT *I2[36]; + short *Iw3[36]; + OT *I3[36]; + + // Store transformed hw4c8 to itmArray + for (U32 i = 0; i < 36; i++) { + Iw0[i] = itmArray + i * 4 * ic * 8 + c * 4 * 8; + Iw1[i] = itmArray + i * 4 * ic * 8 + c * 4 * 8 + 1 * 8; + Iw2[i] 
= itmArray + i * 4 * ic * 8 + c * 4 * 8 + 2 * 8; + Iw3[i] = itmArray + i * 4 * ic * 8 + c * 4 * 8 + 3 * 8; + } + + U32 h0 = (hw / tile_w) * 4; // stride is 4 + U32 w0 = (hw % tile_w) * 4; + U32 h1 = ((hw + 1) / tile_w) * 4; + U32 w1 = ((hw + 1) % tile_w) * 4; + U32 h2 = ((hw + 2) / tile_w) * 4; + U32 w2 = ((hw + 2) % tile_w) * 4; + U32 h3 = ((hw + 3) / tile_w) * 4; + U32 w3 = ((hw + 3) % tile_w) * 4; + + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + I0[i * 6 + j] = inArray_pad_mov + (h0 + i) * iw_pad * 8 + (w0 + j) * 8; + I1[i * 6 + j] = inArray_pad_mov + (h1 + i) * iw_pad * 8 + (w1 + j) * 8; + I2[i * 6 + j] = inArray_pad_mov + (h2 + i) * iw_pad * 8 + (w2 + j) * 8; + I3[i * 6 + j] = inArray_pad_mov + (h3 + i) * iw_pad * 8 + (w3 + j) * 8; + } + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw0[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I0); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I0); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw1[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I1); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I1); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw2[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I2); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I2); + } + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw3[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I3); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I3); + } + } + + F32 inputScale[36]; + + if (idt == DT_I8) { + quantize_wino_input_s16(itmArray, 4 * ic * 8, inQ, inputScale, *input_scale); + } else { + quantize_wino_input((F16 *)itmArray, 4 * ic * 8, inQ, inputScale); + } + + F32 factor_v[36][4]; + for (U32 i = 0; i < 36; i++) { + if (inputScale[i] == 0) { + factor_v[i][0] = 0; + continue; + } else { + factor_v[i][0] = 1.0 / inputScale[i] / (F32)filterScale[i]; + } + factor_v[i][1] = factor_v[i][0]; + factor_v[i][2] = factor_v[i][0]; + factor_v[i][3] = factor_v[i][0]; + } + + F16 *b0 = biasArray; + INT8 *in_pack = (INT8 *)itmArray; // Reuse the space + + for (U32 idx = 0; idx < 36; idx++) { + if (factor_v[idx][0] == 0) { // input pixels are all 0 + continue; + } + for (U32 c = 0; c < ic; c++) { // for each 8 channels + INT8 *in_hw4c8 = inQ + idx * 4 * ic * 8 + c * 4 * 8; + + INT8 *in_0 = in_hw4c8; + INT8 *in_1 = in_hw4c8 + 1 * 8; + INT8 *in_2 = in_hw4c8 + 2 * 8; + INT8 *in_3 = in_hw4c8 + 3 * 8; + + // NHWChw8c4 + INT8 *in_pack_0 = in_pack + idx * 4 * ic * 8 + c * 4 * 8; + INT8 *in_pack_1 = in_pack_0 + 4 * 4; + + __asm__ __volatile__("ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + "ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + "str q20, [%[pack_0]]\n" + "str q21, [%[pack_1]]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), + [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), + [in_3] "r"(in_3) + : "memory", "cc", "v0", "v1", "v20", "v21", "x2", "x3"); + } + } + + // compute + for (U32 o = 0; o < oc; o++) { // 8 output channels at a time + // bias + F16 *b_0 = b0 + o * 8; + for (U32 idx = 0; idx < 36; idx++) { + INT8 *in_hw0 = in_pack + idx * 4 * ic * 8; + INT8 *f_o0c0 = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; + F16 *out_o0hw0 = otmArray + idx * 4 * 8; + if (factor_v[idx][0] == 0) { + memset(out_o0hw0, 0, 4 * 8 * sizeof(OT)); + continue; + } + F32 *fac = factor_v[idx]; + __asm__ __volatile__("eor 
v5.16b, v5.16b, v5.16b\n" + "eor v6.16b, v6.16b, v6.16b\n" + "ldr q1, [%[in_0]]\n" // in_0 + "ldr q0, [%[f_0]]\n" // f_0 + "eor v7.16b, v7.16b, v7.16b\n" + "eor v8.16b, v8.16b, v8.16b\n" + + "eor v9.16b, v9.16b, v9.16b\n" + "eor v10.16b, v10.16b, v10.16b\n" + "eor v11.16b, v11.16b, v11.16b\n" + "eor v12.16b, v12.16b, v12.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + "0:\n" + "ldr q29, [x0, 16]\n" + "ldr q3, [x3, 16]!\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + + "subs x2, x2, #4\n" + "ldr q0, [x0, 32]!\n" + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + "mov v1.16b, v3.16b\n" + "bne 0b\n" + + "scvtf v5.4s, v5.4s\n" + "scvtf v6.4s, v6.4s\n" + "ldr q1, [%[factor]]\n" + "scvtf v7.4s, v7.4s\n" + "scvtf v8.4s, v8.4s\n" + "scvtf v9.4s, v9.4s\n" + "scvtf v10.4s, v10.4s\n" + "scvtf v11.4s, v11.4s\n" + "scvtf v12.4s, v12.4s\n" + + "fmul v5.4s, v1.4s, v5.4s\n" + "fmul v6.4s, v1.4s, v6.4s\n" + "fmul v7.4s, v1.4s, v7.4s\n" + "fmul v8.4s, v1.4s, v8.4s\n" + "fmul v9.4s, v1.4s, v9.4s\n" + "fmul v10.4s, v1.4s, v10.4s\n" + "fmul v11.4s, v1.4s, v11.4s\n" + "fmul v12.4s, v1.4s, v12.4s\n" + + "fcvtn v5.4h, v5.4s\n" + "fcvtn v7.4h, v7.4s\n" + "fcvtn v9.4h, v9.4s\n" + "fcvtn v11.4h, v11.4s\n" + + "fcvtn2 v5.8h, v6.4s\n" + "fcvtn2 v7.8h, v8.4s\n" + "fcvtn2 v9.8h, v10.4s\n" + "fcvtn2 v11.8h, v12.4s\n" + + "str q5, [%[out_0]]\n" + "str q7, [%[out_0], #16]\n" + "str q9, [%[out_0], #32]\n" + "str q11, [%[out_0], #48]\n" + : + : [out_0] "r"(out_o0hw0), [in_0] "r"(in_hw0), + [f_0] "r"(f_o0c0), [ic] "r"((I64)ic * 8), [factor] "r"(fac) + : "memory", "cc", "v0", "v1", "v2", "v3", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v29", "x0", "x1", "x2", + "x3", "x17", "x16"); + } + // out trans + // (6*6)*hw4*o8 => NOHWo8 + for (U32 hw4 = 0; hw4 < 4; hw4++) { + U32 h = (hw + hw4) / tile_w; + U32 w = (hw + hw4) % tile_w; + F16 *out_0 = + outArray + n * oc * ohow * 8 + o * ohow * 8 + h * 4 * ow * 8 + w * 4 * 8; + + F16 *Ow_0[36]; + F16 *O_0[16]; + + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + idx * 4 * 8 + hw4 * 8; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + } + } + trans_O(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h - 1, tile_w - 1, + max, min, am); + } + } + tiles_s += 4; + } + + for (I32 hw = tiles_s; hw < tiles; hw++) { + // in trans + // NCHWc8 => (6*6)*(C/4)*hw1*c4 + // transform hw1c8 + // pack into hw1c4 after quantizing (reuse the space of itmArray) + for (U32 c = 0; c < ic; c++) { + OT *inArray_pad_mov = inArray_pad + c * ihiw * 8; + short *Iw_ptr[36]; + short *Iw0[36]; + OT *I0[36]; + + // Store transformed hw12c8 to itmArray + for (U32 i = 0; i < 36; i++) { + Iw0[i] = itmArray + i * ic * 8 + c * 8; + } + + U32 h0 = (hw / tile_w) * 4; // stride is 4 + U32 w0 = (hw % tile_w) * 4; + + for (U32 i = 0; i < 6; i++) { + for (U32 j = 0; j < 6; j++) { + I0[i * 6 + j] = inArray_pad_mov + (h0 + i) * iw_pad * 8 + (w0 + j) * 8; + } + } + + for (U32 i = 0; i < 36; i++) { + Iw_ptr[i] = Iw0[i]; + } + if (idt == DT_I8) { + trans_I_int8(Iw_ptr, (INT8 *const *)I0); + } else { + trans_I_4x4_3x3((F16 **)Iw_ptr, (F16 *const *)I0); + } + } + + F32 inputScale[36]; + + if (idt == DT_I8) { + quantize_wino_input_s16(itmArray, ic * 8, inQ, 
inputScale, *input_scale); + } else { + quantize_wino_input((F16 *)itmArray, ic * 8, inQ, inputScale); + } + + F32 factor_v[36][4]; + for (U32 i = 0; i < 36; i++) { + if (inputScale[i] == 0) { + factor_v[i][0] = 0; + continue; + } else { + factor_v[i][0] = 1.0 / inputScale[i] / (F32)filterScale[i]; + } + factor_v[i][1] = factor_v[i][0]; + factor_v[i][2] = factor_v[i][0]; + factor_v[i][3] = factor_v[i][0]; + } + + F16 *b0 = biasArray; + INT8 *in_pack = (INT8 *)itmArray; // Reuse the space + + for (U32 idx = 0; idx < 36; idx++) { + if (factor_v[idx][0] == 0) { + continue; + } + for (U32 c = 0; c < ic; c++) { // for each 8 channels + INT8 *in_0 = inQ + idx * ic * 8 + c * 8; + + // NHWChw8c4 + INT8 *in_pack_0 = in_pack + idx * ic * 8 + c * 8; + INT8 *in_pack_1 = in_pack_0 + 4; + + memcpy(in_pack_0, in_0, 4 * bytesOf(DT_I8)); + memcpy(in_pack_1, in_0 + 4, 4 * bytesOf(DT_I8)); + } + } + + // compute + for (U32 o = 0; o < oc; o++) { // 8 output channels at a time + // bias + F16 *b_0 = b0 + o * 8; + for (U32 idx = 0; idx < 36; idx++) { + INT8 *in_hw = in_pack + idx * ic * 8; + INT8 *f_o = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; + F16 *out_o0hw0 = otmArray + idx * 8; + if (factor_v[idx][0] == 0) { + memset(out_o0hw0, 0, 8 * sizeof(OT)); + continue; + } + int32x4_t res[2] = {0}; + + for (U32 c = 0; c < ic; c++) { + int8x8_t in_2 = vld1_s8(in_hw); + in_hw += 8; + int8x16_t f_8o[4]; + f_8o[0] = vld1q_s8(f_o); + f_8o[1] = vld1q_s8(f_o + 16); + res[0] = vdotq_lane_s32(res[0], f_8o[0], in_2, 0); + res[1] = vdotq_lane_s32(res[1], f_8o[1], in_2, 0); + + f_8o[2] = vld1q_s8(f_o + 32); + f_8o[3] = vld1q_s8(f_o + 48); + f_o += 64; + res[0] = vdotq_lane_s32(res[0], f_8o[2], in_2, 1); + res[1] = vdotq_lane_s32(res[1], f_8o[3], in_2, 1); + } + float32x4_t fac = vld1q_f32(factor_v[idx]); + float32x4_t resf0 = vcvtq_f32_s32(res[0]); + float32x4_t resf1 = vcvtq_f32_s32(res[1]); + resf0 = vmulq_f32(resf0, fac); + resf1 = vmulq_f32(resf1, fac); + + float16x4_t resh0 = vcvt_f16_f32(resf0); + float16x4_t resh1 = vcvt_f16_f32(resf1); + + vst1_f16(out_o0hw0, resh0); + vst1_f16(out_o0hw0 + 4, resh1); + } + // out trans + // (6*6)*hw1*o8 => NOHWo8 + U32 h = hw / tile_w; + U32 w = hw % tile_w; + F16 *out_0 = + outArray + n * oc * ohow * 8 + o * ohow * 8 + h * 4 * ow * 8 + w * 4 * 8; + + F16 *Ow_0[36]; + F16 *O_0[16]; + + for (U32 idx = 0; idx < 36; idx++) { + Ow_0[idx] = otmArray + idx * 8; + } + for (U32 i = 0; i < 4; ++i) { + for (U32 j = 0; j < 4; ++j) { + O_0[i * 4 + j] = out_0 + i * ow * 8 + j * 8; + } + } + trans_O(Ow_0, O_0, b_0, h, w, pad_h_mod_4, pad_w_mod_4, tile_h - 1, tile_w - 1, max, + min, am); + } + } + } + + if (DT_I8 == odt) { + F16 max_s = max[0]; + F16 min_s = min[0]; + for (U32 i = 1; i < 8; i++) { + if (max_s < max[i]) { + max_s = max[i]; + } + if (min_s > min[i]) { + min_s = min[i]; + } + } + + if (max_s == 0 && min_s == 0) { + return NOT_SUPPORTED; + } + + F16 scale_o; + if (max_s > 0 && min_s < 0) { + F16 scale_max = 127.0 / max_s; + F16 scale_min = -127.0 / min_s; + scale_o = (scale_max < scale_min) ? 
scale_max : scale_min; + } else if (max_s > 0) { + scale_o = 127.0 / max_s; + } else { + scale_o = -127.0 / min_s; + } + *outputScale = scale_o; + + apply_scale_f16(on * oc * ohow * 8, outArray, scale_o, (INT8 *)output, false); + } + return SUCCESS; +} + +template EE convolution_winograd_A76(TensorDesc inputDesc, + const void *input, + F16 *input_scale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec am); + +template EE convolution_winograd_A76(TensorDesc inputDesc, + const void *input, + F16 *input_scale, + TensorDesc filterDesc, + const void *filter, + F16 *filterScale, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + F16 *outputScale, + ActivationParamSpec am); +#endif diff --git a/tensor_computing/src/cpu/arm/int8/convolution_winograd_transform.h b/compute/tensor/src/cpu/arm/int8/convolution_winograd_transform.h similarity index 80% rename from tensor_computing/src/cpu/arm/int8/convolution_winograd_transform.h rename to compute/tensor/src/cpu/arm/int8/convolution_winograd_transform.h index b903299a..60f56183 100644 --- a/tensor_computing/src/cpu/arm/int8/convolution_winograd_transform.h +++ b/compute/tensor/src/cpu/arm/int8/convolution_winograd_transform.h @@ -1,28 +1,27 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
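For orientation (standard Winograd background, not spelled out in the patch): the trans_I and trans_O routines in the header below implement the F(4x4, 3x3) transforms V = Bt * d * B on a 6x6 input tile d and Y = At * M * A on the 6x6 element-wise product M, with

    Bt = |  4   0  -5   0   1   0 |      At = |  1   1   1   1   1   0 |
         |  0  -4  -4   1   1   0 |           |  0   1  -1   2  -2   0 |
         |  0   4  -4  -1   1   0 |           |  0   1   1   4   4   0 |
         |  0  -2  -1   2   1   0 |           |  0   1  -1   8  -8   1 |
         |  0   2  -1  -2   1   0 |
         |  0   4   0  -5   0   1 |

Every row of Bt has absolute sum at most 10, so |Bt * d * B| <= 10 * 10 * 127 < 32767 for int8 input, which is why trans_I_int8 can evaluate the input transform exactly in int16 NEON registers before quantization.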
- #ifndef _H_CONVOLUTION_WINOGRAD_TRANSFORM #define _H_CONVOLUTION_WINOGRAD_TRANSFORM #ifdef _USE_INT8 #include #include -#include "type.h" +#include "types.h" #include "error.h" #include "cpu/arm/fp16/convolution_winograd_transform.h" -inline void trans_I_int8(short *Iw[36], INT8* const I[36]) +inline void trans_I_int8(short *Iw[36], INT8 *const I[36]) { short T[6][6][8]; @@ -31,12 +30,12 @@ inline void trans_I_int8(short *Iw[36], INT8* const I[36]) int8x8_t v_minus_5 = vmov_n_s8(-5); for (U32 i = 0; i < 6; i++) { - int8x8_t v_I0 = vld1_s8(I[0*6+i]); - int8x8_t v_I1 = vld1_s8(I[1*6+i]); - int8x8_t v_I2 = vld1_s8(I[2*6+i]); - int8x8_t v_I3 = vld1_s8(I[3*6+i]); - int8x8_t v_I4 = vld1_s8(I[4*6+i]); - int8x8_t v_I5 = vld1_s8(I[5*6+i]); + int8x8_t v_I0 = vld1_s8(I[0 * 6 + i]); + int8x8_t v_I1 = vld1_s8(I[1 * 6 + i]); + int8x8_t v_I2 = vld1_s8(I[2 * 6 + i]); + int8x8_t v_I3 = vld1_s8(I[3 * 6 + i]); + int8x8_t v_I4 = vld1_s8(I[4 * 6 + i]); + int8x8_t v_I5 = vld1_s8(I[5 * 6 + i]); // Reorder to accelerate int16x8_t v_t0 = vmull_s8(v_I2, v_minus_4); @@ -108,17 +107,27 @@ inline void trans_I_int8(short *Iw[36], INT8* const I[36]) int16x8_t v_Iw4 = vsubq_s16(v_t2, v_t3); int16x8_t v_Iw5 = vmlaq_n_s16(v_t5, v_T3, -5); - vst1q_s16(Iw[i*6+0], v_Iw0); - vst1q_s16(Iw[i*6+1], v_Iw1); - vst1q_s16(Iw[i*6+2], v_Iw2); - vst1q_s16(Iw[i*6+3], v_Iw3); - vst1q_s16(Iw[i*6+4], v_Iw4); - vst1q_s16(Iw[i*6+5], v_Iw5); + vst1q_s16(Iw[i * 6 + 0], v_Iw0); + vst1q_s16(Iw[i * 6 + 1], v_Iw1); + vst1q_s16(Iw[i * 6 + 2], v_Iw2); + vst1q_s16(Iw[i * 6 + 3], v_Iw3); + vst1q_s16(Iw[i * 6 + 4], v_Iw4); + vst1q_s16(Iw[i * 6 + 5], v_Iw5); } } -inline void trans_O(F16* const Ow[36], F16 *O[16], const F16* bias, - U32 h, U32 w, U32 _pad_h_mod_4, U32 _pad_w_mod_4, U32 oh, U32 ow, F16* max, F16* min, ActivationDesc activationDesc) +inline void trans_O(F16 *const Ow[36], + F16 *O[16], + const F16 *bias, + U32 h, + U32 w, + U32 _pad_h_mod_4, + U32 _pad_w_mod_4, + U32 oh, + U32 ow, + F16 *max, + F16 *min, + ActivationParamSpec activationDesc) { F16 T[4][6][8]; // bias @@ -131,11 +140,11 @@ inline void trans_O(F16* const Ow[36], F16 *O[16], const F16* bias, for (U32 i = 0; i < 6; i++) { float16x8_t v_Ow0 = vld1q_f16(Ow[i]); - float16x8_t v_Ow1 = vld1q_f16(Ow[1*6+i]); - float16x8_t v_Ow2 = vld1q_f16(Ow[2*6+i]); - float16x8_t v_Ow3 = vld1q_f16(Ow[3*6+i]); - float16x8_t v_Ow4 = vld1q_f16(Ow[4*6+i]); - float16x8_t v_Ow5 = vld1q_f16(Ow[5*6+i]); + float16x8_t v_Ow1 = vld1q_f16(Ow[1 * 6 + i]); + float16x8_t v_Ow2 = vld1q_f16(Ow[2 * 6 + i]); + float16x8_t v_Ow3 = vld1q_f16(Ow[3 * 6 + i]); + float16x8_t v_Ow4 = vld1q_f16(Ow[4 * 6 + i]); + float16x8_t v_Ow5 = vld1q_f16(Ow[5 * 6 + i]); float16x8_t v_t0 = vaddq_f16(v_Ow1, v_Ow2); float16x8_t v_t1 = vaddq_f16(v_Ow3, v_Ow4); @@ -195,104 +204,104 @@ inline void trans_O(F16* const Ow[36], F16 *O[16], const F16* bias, temp = vmaxq_f16(vaddq_f16(v_O0, v_b), v_0); max_v = vmaxq_f16(max_v, temp); min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+0], temp); + vst1q_f16(O[i * 4 + 0], temp); temp = vmaxq_f16(vaddq_f16(v_O1, v_b), v_0); max_v = vmaxq_f16(max_v, temp); min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+1], temp); + vst1q_f16(O[i * 4 + 1], temp); temp = vmaxq_f16(vaddq_f16(v_O2, v_b), v_0); max_v = vmaxq_f16(max_v, temp); min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+2], temp); + vst1q_f16(O[i * 4 + 2], temp); temp = vmaxq_f16(vaddq_f16(v_O3, v_b), v_0); max_v = vmaxq_f16(max_v, temp); min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+3], temp); + vst1q_f16(O[i * 4 + 3], temp); } else if 
(pad_w_mod_4 == 1) { temp = vmaxq_f16(vaddq_f16(v_O0, v_b), v_0); max_v = vmaxq_f16(max_v, temp); min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+0], temp); + vst1q_f16(O[i * 4 + 0], temp); temp = vmaxq_f16(vaddq_f16(v_O1, v_b), v_0); max_v = vmaxq_f16(max_v, temp); min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+1], temp); + vst1q_f16(O[i * 4 + 1], temp); temp = vmaxq_f16(vaddq_f16(v_O2, v_b), v_0); max_v = vmaxq_f16(max_v, temp); min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+2], temp); + vst1q_f16(O[i * 4 + 2], temp); } else if (pad_w_mod_4 == 2) { temp = vmaxq_f16(vaddq_f16(v_O0, v_b), v_0); max_v = vmaxq_f16(max_v, temp); min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+0], temp); + vst1q_f16(O[i * 4 + 0], temp); temp = vmaxq_f16(vaddq_f16(v_O1, v_b), v_0); max_v = vmaxq_f16(max_v, temp); min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+1], temp); + vst1q_f16(O[i * 4 + 1], temp); } else if (pad_w_mod_4 == 3) { temp = vmaxq_f16(vaddq_f16(v_O0, v_b), v_0); max_v = vmaxq_f16(max_v, temp); min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+0], temp); + vst1q_f16(O[i * 4 + 0], temp); } } else { if (pad_w_mod_4 == 0) { temp = vaddq_f16(v_O0, v_b); max_v = vmaxq_f16(max_v, temp); min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+0], temp); + vst1q_f16(O[i * 4 + 0], temp); temp = vaddq_f16(v_O1, v_b); max_v = vmaxq_f16(max_v, temp); min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+1], temp); + vst1q_f16(O[i * 4 + 1], temp); temp = vaddq_f16(v_O2, v_b); max_v = vmaxq_f16(max_v, temp); min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+2], temp); + vst1q_f16(O[i * 4 + 2], temp); temp = vaddq_f16(v_O3, v_b); max_v = vmaxq_f16(max_v, temp); min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+3], temp); + vst1q_f16(O[i * 4 + 3], temp); } else if (pad_w_mod_4 == 1) { temp = vaddq_f16(v_O0, v_b); max_v = vmaxq_f16(max_v, temp); min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+0], temp); + vst1q_f16(O[i * 4 + 0], temp); temp = vaddq_f16(v_O1, v_b); max_v = vmaxq_f16(max_v, temp); min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+1], temp); + vst1q_f16(O[i * 4 + 1], temp); temp = vaddq_f16(v_O2, v_b); max_v = vmaxq_f16(max_v, temp); min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+2], temp); + vst1q_f16(O[i * 4 + 2], temp); } else if (pad_w_mod_4 == 2) { temp = vaddq_f16(v_O0, v_b); max_v = vmaxq_f16(max_v, temp); min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+0], temp); + vst1q_f16(O[i * 4 + 0], temp); temp = vaddq_f16(v_O1, v_b); max_v = vmaxq_f16(max_v, temp); min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+1], temp); + vst1q_f16(O[i * 4 + 1], temp); } else if (pad_w_mod_4 == 3) { temp = vaddq_f16(v_O0, v_b); max_v = vmaxq_f16(max_v, temp); min_v = vminq_f16(min_v, temp); - vst1q_f16(O[i*4+0], temp); + vst1q_f16(O[i * 4 + 0], temp); } } } diff --git a/tensor_computing/src/cpu/arm/int8/depthwise_convolution.cpp b/compute/tensor/src/cpu/arm/int8/depthwise_pointwise_convolution.cpp similarity index 51% rename from tensor_computing/src/cpu/arm/int8/depthwise_convolution.cpp rename to compute/tensor/src/cpu/arm/int8/depthwise_pointwise_convolution.cpp index d8b5ae7a..04c23b4a 100644 --- a/tensor_computing/src/cpu/arm/int8/depthwise_convolution.cpp +++ b/compute/tensor/src/cpu/arm/int8/depthwise_pointwise_convolution.cpp @@ -1,64 +1,69 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#ifdef _USE_INT8 -#include "tensor_computing_type.h" -#include "cpu/arm/int8/depthwise_convolution.h" #include "cpu/arm/int8/tensor_computing_int8.h" +#include "cpu/arm/int8/depthwise_pointwise_convolution.h" -EE depthwise_convolution_int8(TensorDesc inputDesc, INT8* input, - TensorDesc filterDesc, const INT8* filter, - ConvolutionDesc convDesc, DepthwiseConvolutionForwardAlgorithm algorithm, - TensorDesc biasDesc, const I32* bias, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, I32* output, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc, +EE depthwise_pointwise_convolution_int8(TensorDesc inputDesc, + INT8 *input, + TensorDesc dwFilterDesc, + const INT8 *dwFilter, + TensorDesc pwFilterDesc, + const INT8 *pwFilter, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc dwBiasDesc, + const I32 *dwBias, + TensorDesc pwBiasDesc, + const I32 *pwBias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + I32 *output, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, Arch arch) { - if(nullptr == input || nullptr == filter || nullptr == output || nullptr == bias || nullptr == tmp) + if (nullptr == input || nullptr == dwFilter || nullptr == output || nullptr == dwBias || + nullptr == tmp) { CHECK_STATUS(NULL_POINTER); + } DataType idt, fdt, odt; DataFormat idf, fdf, odf; U32 in, ic, ih, iw; U32 fn, fc, fh, fw; U32 on, oc, oh, ow; CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - if (!(idt == DT_I8 && fdt == DT_I8 && 
odt == DT_I32)) - CHECK_STATUS(NOT_MATCH); - if (fh != fw) + if (!(idt == DT_I8 && fdt == DT_I8 && odt == DT_I32)) { CHECK_STATUS(NOT_MATCH); - if (!(idf == DF_NCHWC8 && odf == DF_NCHWC8)) + } + if (!(idf == DF_NCHWC8 && odf == DF_NCHWC8)) { CHECK_STATUS(NOT_MATCH); - if (!(ic == fc && oc == fn)) + } + if (ic != fc) { CHECK_STATUS(NOT_MATCH); + } EE ret = SUCCESS; switch (algorithm) { case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: - ret = depthwise_pointwise_convolution_direct(inputDesc, input, - filterDesc, filter, - convDesc, - biasDesc, bias, - tmpBytes, tmp, - outputDesc, output, - depthwiseActivationDesc, - pointwiseActivationDesc, - arch); + ret = depthwise_pointwise_convolution_direct(inputDesc, input, dwFilterDesc, dwFilter, + pwFilterDesc, pwFilter, convParamSpec, dwBiasDesc, dwBias, pwBiasDesc, pwBias, + tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec, + pointwiseActivationParamSpec, arch); break; default: ret = NOT_SUPPORTED; @@ -66,4 +71,3 @@ EE depthwise_convolution_int8(TensorDesc inputDesc, INT8* input, } return ret; } -#endif diff --git a/compute/tensor/src/cpu/arm/int8/depthwise_pointwise_convolution.h b/compute/tensor/src/cpu/arm/int8/depthwise_pointwise_convolution.h new file mode 100644 index 00000000..a731826d --- /dev/null +++ b/compute/tensor/src/cpu/arm/int8/depthwise_pointwise_convolution.h @@ -0,0 +1,39 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
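All of the pointer arithmetic in these int8 kernels assumes the DF_NCHWC8 layout enforced above: channels are split into blocks of 8, and the 8 lanes of a block are stored contiguously for each pixel, so a single NEON load fetches 8 channels of one pixel. A minimal indexing sketch (hypothetical helper, not part of the patch):

    #include <cstddef>

    // Offset of element (n, c, h, w) in a DF_NCHWC8 tensor of shape N x C x H x W.
    static inline size_t nchwc8_offset(
        size_t C, size_t H, size_t W, size_t n, size_t c, size_t h, size_t w)
    {
        size_t cBlock = c / 8;  // which 8-channel block
        size_t lane = c % 8;    // position inside the block
        return (((n * (C / 8) + cBlock) * H + h) * W + w) * 8 + lane;
    }

This matches expressions such as inArray + n * ic * ihiw * 8 + c * ihiw * 8 in the implementation, where ic has already been divided by 8 and c counts blocks rather than channels.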
+ +#ifndef _H_DEPTHWISE_POINTWISE_CONVOLUTION +#define _H_DEPTHWISE_POINTWISE_CONVOLUTION + +#include "sys.h" +#include "tensor_desc.h" +#include "tensor_computing_type.h" + +EE depthwise_pointwise_convolution_direct(TensorDesc inputDesc, + INT8 *inArray, + TensorDesc dwFilterDesc, + const INT8 *dwFilterArray, + TensorDesc pwFilterDesc, + const INT8 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const I32 *dwBiasArray, + TensorDesc pwBiasDesc, + const I32 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + I32 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch); +#endif diff --git a/compute/tensor/src/cpu/arm/int8/depthwise_pointwise_convolution_direct.cpp b/compute/tensor/src/cpu/arm/int8/depthwise_pointwise_convolution_direct.cpp new file mode 100644 index 00000000..77ec8489 --- /dev/null +++ b/compute/tensor/src/cpu/arm/int8/depthwise_pointwise_convolution_direct.cpp @@ -0,0 +1,1865 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
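The direct implementation in the next file carves the caller-provided tmp area into three consecutive regions: the zero-padded int8 input, the re-quantized int8 depthwise output (which doubles as the pointwise input), and the raw int32 depthwise accumulators. A sketch of the minimum tmp size that this layout implies, assuming icBlocks = ic / 8 (the helper name is illustrative, not part of the patch):

    #include <cstddef>
    #include <cstdint>

    // inArray_pad: icBlocks * ih_pad * iw_pad * 8 int8 elements
    // pwArray    : icBlocks * oh * ow * 8 int8 elements
    // dw_out     : icBlocks * oh * ow * 8 int32 elements
    static inline size_t dwpw_direct_tmp_bytes(
        size_t icBlocks, size_t ih_pad, size_t iw_pad, size_t ohow)
    {
        return icBlocks * ih_pad * iw_pad * 8 * sizeof(int8_t) +
               icBlocks * ohow * 8 * (sizeof(int8_t) + sizeof(int32_t));
    }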
+ +#include "cpu/arm/int8/depthwise_pointwise_convolution.h" + +EE depthwise_pointwise_convolution_direct(TensorDesc inputDesc, + INT8 *inArray, + TensorDesc dwFilterDesc, + const INT8 *dwFilterArray, + TensorDesc pwFilterDesc, + const INT8 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const I32 *dwBiasArray, + TensorDesc pwBiasDesc, + const I32 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + I32 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch) +{ + UNUSED(tmpBytes); + UNUSED(arch); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + + if (dwFilterDesc.df != DF_NCHWC8 || pwFilterDesc.df != DF_NCHWN8C4) { + CHECK_STATUS(NOT_MATCH); + } + if (pwFilterArray == nullptr) { + return NOT_SUPPORTED; + } + + oc /= 8; + ic /= 8; + + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + U32 ihiw = ih * iw; + I32 ohow = oh * ow; + INT8 *pwArray = (INT8 *)tmp + ic * ih_pad * iw_pad * 8; + I32 *dw_out = (I32 *)(pwArray + ic * ohow * 8); + + for (U32 n = 0; n < in; n++) { + // copy input into a input with padding + INT8 *inArray_pad = (INT8 *)tmp; + INT8 *inArray_pad_mov = inArray_pad; + INT8 *inArray_mov = inArray + n * ic * ihiw * 8; + for (U32 c = 0; c < ic; c++) { + if (paddingT > 0) { + memset(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += paddingT * iw_pad * 8; + } + for (U32 h = paddingT; h < ih_pad - paddingB; h++) { + memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(idt)); + inArray_pad_mov += paddingL * 8; + memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); + inArray_pad_mov += iw * 8; + inArray_mov += iw * 8; + memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(idt)); + inArray_pad_mov += paddingR * 8; + } + if (paddingB > 0) { + memset(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(idt)); + inArray_pad_mov += paddingB * iw_pad * 8; + } + + // dw_conv + const I32 *b = dwBiasArray + c * 8; + INT8 *in_pad = inArray_pad + c * ih_pad * iw_pad * 8; + const INT8 *f = dwFilterArray + c * fh * fw * 8; + + // ohow / 12 + for (I32 hw = 0; hw < ohow - 11; hw += 12) { + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + U32 in_h_1 = (hw + 1) / ow * strideH; + U32 in_w_1 = (hw + 1) % ow * strideW; + U32 in_h_2 = (hw + 2) / ow * strideH; + U32 in_w_2 = (hw + 2) % ow * strideW; + U32 in_h_3 = (hw + 3) / ow * strideH; + U32 in_w_3 = (hw + 3) % ow * strideW; + U32 in_h_4 = (hw + 4) / ow * strideH; + U32 in_w_4 = (hw + 4) % ow * strideW; + U32 in_h_5 = (hw + 5) / ow * strideH; + U32 in_w_5 = (hw + 5) % ow * strideW; + U32 in_h_6 = (hw + 6) / ow * strideH; + U32 in_w_6 = (hw + 6) % ow * strideW; + U32 in_h_7 = (hw + 7) / ow * strideH; + U32 in_w_7 = (hw + 7) % ow * strideW; + U32 in_h_8 = (hw + 8) / ow * strideH; + U32 in_w_8 = (hw + 8) % ow * strideW; + 
U32 in_h_9 = (hw + 9) / ow * strideH; + U32 in_w_9 = (hw + 9) % ow * strideW; + U32 in_h_10 = (hw + 10) / ow * strideH; + U32 in_w_10 = (hw + 10) % ow * strideW; + U32 in_h_11 = (hw + 11) / ow * strideH; + U32 in_w_11 = (hw + 11) % ow * strideW; + + I32 *pw_pack_0 = dw_out + hw * ic * 8 + c * 12 * 8; + I32 *pw_pack_1 = pw_pack_0 + 48; // Second half + // TODO handle asm combined with c. No guarantee that compiler will not use vec reg in c. + __asm__ __volatile__("ldr d29, [%[b]]\n" // b_0 + "ldr x1, [%[b], #8]\n" + "ins v29.d[1], x1\n" + "ldr d30, [%[b], #16]\n" // b_1 + "ldr x2, [%[b], #24]\n" + "ins v30.d[1], x2\n" + "mov v5.16b, v29.16b\n" + "mov v7.16b, v29.16b\n" + "mov v9.16b, v29.16b\n" + "mov v11.16b, v29.16b\n" + "mov v13.16b, v29.16b\n" + "mov v15.16b, v29.16b\n" + "mov v17.16b, v29.16b\n" + "mov v19.16b, v29.16b\n" + "mov v21.16b, v29.16b\n" + "mov v23.16b, v29.16b\n" + "mov v25.16b, v29.16b\n" + "mov v27.16b, v29.16b\n" + + "mov v6.16b, v30.16b\n" + "mov v8.16b, v30.16b\n" + "mov v10.16b, v30.16b\n" + "mov v12.16b, v30.16b\n" + "mov v14.16b, v30.16b\n" + "mov v16.16b, v30.16b\n" + "mov v18.16b, v30.16b\n" + "mov v20.16b, v30.16b\n" + "mov v22.16b, v30.16b\n" + "mov v24.16b, v30.16b\n" + "mov v26.16b, v30.16b\n" + "mov v28.16b, v30.16b\n" + : + : [b] "r"(b) + : "memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "v30", "x1", "x2"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const INT8 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + INT8 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + INT8 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + INT8 *in_1 = in_idx + in_h_1 * iw_pad * 8 + in_w_1 * 8; + INT8 *in_2 = in_idx + in_h_2 * iw_pad * 8 + in_w_2 * 8; + INT8 *in_3 = in_idx + in_h_3 * iw_pad * 8 + in_w_3 * 8; + INT8 *in_4 = in_idx + in_h_4 * iw_pad * 8 + in_w_4 * 8; + INT8 *in_5 = in_idx + in_h_5 * iw_pad * 8 + in_w_5 * 8; + INT8 *in_6 = in_idx + in_h_6 * iw_pad * 8 + in_w_6 * 8; + INT8 *in_7 = in_idx + in_h_7 * iw_pad * 8 + in_w_7 * 8; + INT8 *in_8 = in_idx + in_h_8 * iw_pad * 8 + in_w_8 * 8; + INT8 *in_9 = in_idx + in_h_9 * iw_pad * 8 + in_w_9 * 8; + INT8 *in_10 = in_idx + in_h_10 * iw_pad * 8 + in_w_10 * 8; + INT8 *in_11 = in_idx + in_h_11 * iw_pad * 8 + in_w_11 * 8; + __asm__ __volatile__( + "ldr d29, [%[f0]]\n" + "ldr d0, [%[in0]]\n" + "ldr d1, [%[in1]]\n" + "ldr d2, [%[in2]]\n" + "sshll v29.8h, v29.8b, #0\n" + "ldr d30, [%[in3]]\n" + "sshll v0.8h, v0.8b, #0\n" + "sshll v1.8h, v1.8b, #0\n" + + "smlal v5.4s, v29.4h, v0.4h\n" + "sshll v2.8h, v2.8b, #0\n" + "smlal2 v6.4s, v29.8h, v0.8h\n" + "sshll v30.8h, v30.8b, #0\n" + "smlal v7.4s, v29.4h, v1.4h\n" + "ldr d0, [%[in4]]\n" + "smlal2 v8.4s, v29.8h, v1.8h\n" + "smlal v9.4s, v29.4h, v2.4h\n" + "ldr d1, [%[in5]]\n" + "smlal2 v10.4s, v29.8h, v2.8h\n" + "sshll v0.8h, v0.8b, #0\n" + "smlal v11.4s, v29.4h, v30.4h\n" + "ldr d2, [%[in6]]\n" + "smlal2 v12.4s, v29.8h, v30.8h\n" + "sshll v1.8h, v1.8b, #0\n" + + "smlal v13.4s, v29.4h, v0.4h\n" + "ldr d30, [%[in7]]\n" + "smlal2 v14.4s, v29.8h, v0.8h\n" + "sshll v2.8h, v2.8b, #0\n" + "smlal v15.4s, v29.4h, v1.4h\n" + "ldr d0, [%[in8]]\n" + "smlal2 v16.4s, v29.8h, v1.8h\n" + "sshll v30.8h, v30.8b, #0\n" + "smlal v17.4s, v29.4h, v2.4h\n" + "ldr d1, [%[in9]]\n" + "smlal2 v18.4s, v29.8h, v2.8h\n" + "sshll v0.8h, v0.8b, #0\n" + "smlal v19.4s, v29.4h, v30.4h\n" + "ldr d2, 
[%[in10]]\n" + "smlal2 v20.4s, v29.8h, v30.8h\n" + "sshll v1.8h, v1.8b, #0\n" + + "smlal v21.4s, v29.4h, v0.4h\n" + "ldr d30, [%[in11]]\n" + "smlal2 v22.4s, v29.8h, v0.8h\n" + "sshll v2.8h, v2.8b, #0\n" + "smlal v23.4s, v29.4h, v1.4h\n" + "sshll v30.8h, v30.8b, #0\n" + "smlal2 v24.4s, v29.8h, v1.8h\n" + "smlal v25.4s, v29.4h, v2.4h\n" + "smlal2 v26.4s, v29.8h, v2.8h\n" + "smlal v27.4s, v29.4h, v30.4h\n" + "smlal2 v28.4s, v29.8h, v30.8h\n" + : + : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), [in3] "r"(in_3), + [in4] "r"(in_4), [in5] "r"(in_5), [in6] "r"(in_6), [in7] "r"(in_7), + [in8] "r"(in_8), [in9] "r"(in_9), [in10] "r"(in_10), [in11] "r"(in_11), + [f0] "r"(f_0) + : "memory", "cc", "v0", "v1", "v2", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: { + break; + } + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v0.16b, v0.16b, v0.16b\n" // zero + + "smax v5.4s, v0.4s, v5.4s\n" + "smax v6.4s, v0.4s, v6.4s\n" + "smax v7.4s, v0.4s, v7.4s\n" + "smax v8.4s, v0.4s, v8.4s\n" + "smax v9.4s, v0.4s, v9.4s\n" + "smax v10.4s, v0.4s, v10.4s\n" + "smax v11.4s, v0.4s, v11.4s\n" + "smax v12.4s, v0.4s, v12.4s\n" + "smax v13.4s, v0.4s, v13.4s\n" + "smax v14.4s, v0.4s, v14.4s\n" + "smax v15.4s, v0.4s, v15.4s\n" + "smax v16.4s, v0.4s, v16.4s\n" + "smax v17.4s, v0.4s, v17.4s\n" + "smax v18.4s, v0.4s, v18.4s\n" + "smax v19.4s, v0.4s, v19.4s\n" + "smax v20.4s, v0.4s, v20.4s\n" + "smax v21.4s, v0.4s, v21.4s\n" + "smax v22.4s, v0.4s, v22.4s\n" + "smax v23.4s, v0.4s, v23.4s\n" + "smax v24.4s, v0.4s, v24.4s\n" + "smax v25.4s, v0.4s, v25.4s\n" + "smax v26.4s, v0.4s, v26.4s\n" + "smax v27.4s, v0.4s, v27.4s\n" + "smax v28.4s, v0.4s, v28.4s\n" + : + : + : "memory", "cc", "v0", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28"); + break; + } + case ACTIVATION_RELU6: { + INT8 *pw_in0 = pwArray + hw * ic * 8 + c * 12 * 8; + INT8 *pw_in1 = pw_in0 + 48; + __asm__ __volatile__("eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v30.4s, #6\n" // six + + "smax v5.4s, v0.4s, v5.4s\n" + "smax v6.4s, v0.4s, v6.4s\n" + "smax v7.4s, v0.4s, v7.4s\n" + "smax v8.4s, v0.4s, v8.4s\n" + "smax v9.4s, v0.4s, v9.4s\n" + "smax v10.4s, v0.4s, v10.4s\n" + "smax v11.4s, v0.4s, v11.4s\n" + "smax v12.4s, v0.4s, v12.4s\n" + "smax v13.4s, v0.4s, v13.4s\n" + "smax v14.4s, v0.4s, v14.4s\n" + "smax v15.4s, v0.4s, v15.4s\n" + "smax v16.4s, v0.4s, v16.4s\n" + "smax v17.4s, v0.4s, v17.4s\n" + "smax v18.4s, v0.4s, v18.4s\n" + "smax v19.4s, v0.4s, v19.4s\n" + "smax v20.4s, v0.4s, v20.4s\n" + "smax v21.4s, v0.4s, v21.4s\n" + "smax v22.4s, v0.4s, v22.4s\n" + "smax v23.4s, v0.4s, v23.4s\n" + "smax v24.4s, v0.4s, v24.4s\n" + "smax v25.4s, v0.4s, v25.4s\n" + "smax v26.4s, v0.4s, v26.4s\n" + "smax v27.4s, v0.4s, v27.4s\n" + "smax v28.4s, v0.4s, v28.4s\n" + + "smin v5.4s, v30.4s, v5.4s\n" + "smin v6.4s, v30.4s, v6.4s\n" + "smin v7.4s, v30.4s, v7.4s\n" + "smin v8.4s, v30.4s, v8.4s\n" + "smin v9.4s, v30.4s, v9.4s\n" + "smin v10.4s, v30.4s, v10.4s\n" + "smin v11.4s, v30.4s, v11.4s\n" + "smin v12.4s, v30.4s, v12.4s\n" + "smin v13.4s, v30.4s, v13.4s\n" + "smin v14.4s, v30.4s, v14.4s\n" + "smin v15.4s, v30.4s, v15.4s\n" + "smin v16.4s, v30.4s, v16.4s\n" + "smin v17.4s, v30.4s, v17.4s\n" + "smin v18.4s, v30.4s, v18.4s\n" + 
"smin v19.4s, v30.4s, v19.4s\n" + "smin v20.4s, v30.4s, v20.4s\n" + "smin v21.4s, v30.4s, v21.4s\n" + "smin v22.4s, v30.4s, v22.4s\n" + "smin v23.4s, v30.4s, v23.4s\n" + "smin v24.4s, v30.4s, v24.4s\n" + "smin v25.4s, v30.4s, v25.4s\n" + "smin v26.4s, v30.4s, v26.4s\n" + "smin v27.4s, v30.4s, v27.4s\n" + "smin v28.4s, v30.4s, v28.4s\n" + + // No need to quantize for ReLU6 + "sqshl v5.4s, v5.4s, #2\n" + "sqshl v6.4s, v6.4s, #2\n" + "sqshl v7.4s, v7.4s, #2\n" + "sqshl v8.4s, v8.4s, #2\n" + "sqshl v9.4s, v9.4s, #2\n" + "sqshl v10.4s, v10.4s, #2\n" + "sqshl v11.4s, v11.4s, #2\n" + "sqshl v12.4s, v12.4s, #2\n" + "sqshl v13.4s, v13.4s, #2\n" + "sqshl v14.4s, v14.4s, #2\n" + "sqshl v15.4s, v15.4s, #2\n" + "sqshl v16.4s, v16.4s, #2\n" + "sqshl v17.4s, v17.4s, #2\n" + "sqshl v18.4s, v18.4s, #2\n" + "sqshl v19.4s, v19.4s, #2\n" + "sqshl v20.4s, v20.4s, #2\n" + "sqshl v21.4s, v21.4s, #2\n" + "sqshl v22.4s, v22.4s, #2\n" + "sqshl v23.4s, v23.4s, #2\n" + "sqshl v24.4s, v24.4s, #2\n" + "sqshl v25.4s, v25.4s, #2\n" + "sqshl v26.4s, v26.4s, #2\n" + "sqshl v27.4s, v27.4s, #2\n" + "sqshl v28.4s, v28.4s, #2\n" + + "sqshrn v5.4h, v5.4s, #1\n" + "sqshrn v9.4h, v9.4s, #1\n" + "sqshrn2 v5.8h, v7.4s, #1\n" + "sqshrn2 v9.8h, v11.4s, #1\n" + "sqshrn v13.4h, v13.4s, #1\n" + "sqshrn v17.4h, v17.4s, #1\n" + "sqshrn2 v13.8h, v15.4s, #1\n" + "sqshrn2 v17.8h, v19.4s, #1\n" + + "sqshrn v21.4h, v21.4s, #1\n" + "sqshrn v25.4h, v25.4s, #1\n" + "sqshrn2 v21.8h, v23.4s, #1\n" + "sqshrn2 v25.8h, v27.4s, #1\n" + + "sqshrn v5.8b, v5.8h, #1\n" + "sqshrn v13.8b, v13.8h, #1\n" + "sqshrn v21.8b, v21.8h, #1\n" + + "sqshrn2 v5.16b, v9.8h, #1\n" + "sqshrn2 v13.16b, v17.8h, #1\n" + "sqshrn2 v21.16b, v25.8h, #1\n" + "str q5, [%[in0]]\n" + "str q13, [%[in0], #16]\n" + "str q21, [%[in0], #32]\n" + + "sqshrn v6.4h, v6.4s, #1\n" + "sqshrn v10.4h, v10.4s, #1\n" + "sqshrn2 v6.8h, v8.4s, #1\n" + "sqshrn2 v10.8h, v12.4s, #1\n" + + "sqshrn v14.4h, v14.4s, #1\n" + "sqshrn v18.4h, v18.4s, #1\n" + "sqshrn2 v14.8h, v16.4s, #1\n" + "sqshrn2 v18.8h, v20.4s, #1\n" + + "sqshrn v22.4h, v22.4s, #1\n" + "sqshrn v26.4h, v26.4s, #1\n" + "sqshrn2 v22.8h, v24.4s, #1\n" + "sqshrn2 v26.8h, v28.4s, #1\n" + + "sqshrn v6.8b, v6.8h, #1\n" + "sqshrn v14.8b, v14.8h, #1\n" + "sqshrn v22.8b, v22.8h, #1\n" + + "sqshrn2 v6.16b, v10.8h, #1\n" + "sqshrn2 v14.16b, v18.8h, #1\n" + "sqshrn2 v22.16b, v26.8h, #1\n" + "str q6, [%[in1]]\n" + "str q14, [%[in1], #16]\n" + "str q22, [%[in1], #32]\n" + : + : [in0] "r"(pw_in0), [in1] "r"(pw_in1) + : "memory", "cc", "v0", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v30"); + break; + } + default: + return NOT_SUPPORTED; + } + + if (depthwiseActivationParamSpec.mode != ACTIVATION_RELU6) { + __asm__ __volatile__("str q5, [%[pw0]]\n" + "str q7, [%[pw0], #16]\n" + "str q9, [%[pw0], #32]\n" + "str q11, [%[pw0], #48]\n" + "str q13, [%[pw0], #64]\n" + "str q15, [%[pw0], #80]\n" + "str q17, [%[pw0], #96]\n" + "str q19, [%[pw0], #112]\n" + "str q21, [%[pw0], #128]\n" + "str q23, [%[pw0], #144]\n" + "str q25, [%[pw0], #160]\n" + "str q27, [%[pw0], #176]\n" + + "str q6, [%[pw1]]\n" + "str q8, [%[pw1], #16]\n" + "str q10, [%[pw1], #32]\n" + "str q12, [%[pw1], #48]\n" + "str q14, [%[pw1], #64]\n" + "str q16, [%[pw1], #80]\n" + "str q18, [%[pw1], #96]\n" + "str q20, [%[pw1], #112]\n" + "str q22, [%[pw1], #128]\n" + "str q24, [%[pw1], #144]\n" + "str q26, [%[pw1], #160]\n" + "str q28, [%[pw1], #176]\n" + : + : [pw0] 
"r"(pw_pack_0), [pw1] "r"(pw_pack_1) + : "memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28"); + } + } + + // ohow_reminder % 12 / 8 + U32 ohow_s = (ohow / 12) * 12; + U32 ohow_tail = ohow - ohow_s; + + if (ohow_tail >= 8) { + U32 hw = ohow_s; + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + U32 in_h_1 = (hw + 1) / ow * strideH; + U32 in_w_1 = (hw + 1) % ow * strideW; + U32 in_h_2 = (hw + 2) / ow * strideH; + U32 in_w_2 = (hw + 2) % ow * strideW; + U32 in_h_3 = (hw + 3) / ow * strideH; + U32 in_w_3 = (hw + 3) % ow * strideW; + U32 in_h_4 = ((hw + 4) / ow) * strideH; + U32 in_w_4 = ((hw + 4) % ow) * strideW; + U32 in_h_5 = ((hw + 5) / ow) * strideH; + U32 in_w_5 = ((hw + 5) % ow) * strideW; + U32 in_h_6 = ((hw + 6) / ow) * strideH; + U32 in_w_6 = ((hw + 6) % ow) * strideW; + U32 in_h_7 = ((hw + 7) / ow) * strideH; + U32 in_w_7 = ((hw + 7) % ow) * strideW; + I32 *pw_pack_0 = dw_out + hw * ic * 8 + c * 8 * 8; + I32 *pw_pack_1 = pw_pack_0 + 32; + // TODO handle asm combined with c. No guarantee that compile will not use vec reg in c. + __asm__ __volatile__("ldr d29, [%[b]]\n" // b_0 + "ldr x1, [%[b], #8]\n" + "ins v29.d[1], x1\n" + "ldr d30, [%[b], #16]\n" // b_1 + "ldr x2, [%[b], #24]\n" + "ins v30.d[1], x2\n" + "mov v5.16b, v29.16b\n" + "mov v7.16b, v29.16b\n" + "mov v9.16b, v29.16b\n" + "mov v11.16b, v29.16b\n" + "mov v13.16b, v29.16b\n" + "mov v15.16b, v29.16b\n" + "mov v17.16b, v29.16b\n" + "mov v19.16b, v29.16b\n" + + "mov v6.16b, v30.16b\n" + "mov v8.16b, v30.16b\n" + "mov v10.16b, v30.16b\n" + "mov v12.16b, v30.16b\n" + "mov v14.16b, v30.16b\n" + "mov v16.16b, v30.16b\n" + "mov v18.16b, v30.16b\n" + "mov v20.16b, v30.16b\n" + : + : [b] "r"(b) + : "memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v29", "v30", "x1", "x2"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const INT8 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + INT8 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + INT8 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + INT8 *in_1 = in_idx + in_h_1 * iw_pad * 8 + in_w_1 * 8; + INT8 *in_2 = in_idx + in_h_2 * iw_pad * 8 + in_w_2 * 8; + INT8 *in_3 = in_idx + in_h_3 * iw_pad * 8 + in_w_3 * 8; + INT8 *in_4 = in_idx + in_h_4 * iw_pad * 8 + in_w_4 * 8; + INT8 *in_5 = in_idx + in_h_5 * iw_pad * 8 + in_w_5 * 8; + INT8 *in_6 = in_idx + in_h_6 * iw_pad * 8 + in_w_6 * 8; + INT8 *in_7 = in_idx + in_h_7 * iw_pad * 8 + in_w_7 * 8; + __asm__ __volatile__("ldr d29, [%[f0]]\n" + "ldr d0, [%[in0]]\n" + "ldr d1, [%[in1]]\n" + "ldr d2, [%[in2]]\n" + "sshll v29.8h, v29.8b, #0\n" + "ldr d30, [%[in3]]\n" + "sshll v0.8h, v0.8b, #0\n" + "sshll v1.8h, v1.8b, #0\n" + + "smlal v5.4s, v29.4h, v0.4h\n" + "sshll v2.8h, v2.8b, #0\n" + "smlal2 v6.4s, v29.8h, v0.8h\n" + "sshll v30.8h, v30.8b, #0\n" + "smlal v7.4s, v29.4h, v1.4h\n" + "ldr d0, [%[in4]]\n" + "smlal2 v8.4s, v29.8h, v1.8h\n" + "smlal v9.4s, v29.4h, v2.4h\n" + "ldr d1, [%[in5]]\n" + "smlal2 v10.4s, v29.8h, v2.8h\n" + "sshll v0.8h, v0.8b, #0\n" + "smlal v11.4s, v29.4h, v30.4h\n" + "ldr d2, [%[in6]]\n" + "smlal2 v12.4s, v29.8h, v30.8h\n" + "sshll v1.8h, v1.8b, #0\n" + + "smlal v13.4s, v29.4h, v0.4h\n" + "ldr d30, [%[in7]]\n" + "smlal2 v14.4s, v29.8h, v0.8h\n" + "sshll v2.8h, v2.8b, #0\n" + "smlal v15.4s, v29.4h, v1.4h\n" + "smlal2 v16.4s, v29.8h, 
v1.8h\n" + "sshll v30.8h, v30.8b, #0\n" + "smlal v17.4s, v29.4h, v2.4h\n" + "smlal2 v18.4s, v29.8h, v2.8h\n" + "smlal v19.4s, v29.4h, v30.4h\n" + "smlal2 v20.4s, v29.8h, v30.8h\n" + : + : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), + [in3] "r"(in_3), [in4] "r"(in_4), [in5] "r"(in_5), + [in6] "r"(in_6), [in7] "r"(in_7), [f0] "r"(f_0) + : "memory", "cc", "v0", "v1", "v2", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v29", "v30"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: { + break; + } + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v0.16b, v0.16b, v0.16b\n" // zero + + "smax v5.4s, v0.4s, v5.4s\n" + "smax v6.4s, v0.4s, v6.4s\n" + "smax v7.4s, v0.4s, v7.4s\n" + "smax v8.4s, v0.4s, v8.4s\n" + "smax v9.4s, v0.4s, v9.4s\n" + "smax v10.4s, v0.4s, v10.4s\n" + "smax v11.4s, v0.4s, v11.4s\n" + "smax v12.4s, v0.4s, v12.4s\n" + "smax v13.4s, v0.4s, v13.4s\n" + "smax v14.4s, v0.4s, v14.4s\n" + "smax v15.4s, v0.4s, v15.4s\n" + "smax v16.4s, v0.4s, v16.4s\n" + "smax v17.4s, v0.4s, v17.4s\n" + "smax v18.4s, v0.4s, v18.4s\n" + "smax v19.4s, v0.4s, v19.4s\n" + "smax v20.4s, v0.4s, v20.4s\n" + : + : + : "memory", "cc", "v0", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20"); + break; + } + case ACTIVATION_RELU6: { + INT8 *pw_in0 = pwArray + hw * ic * 8 + c * 8 * 8; + INT8 *pw_in1 = pw_in0 + 32; + __asm__ __volatile__("eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v30.4s, #6\n" // six + + "smax v5.4s, v0.4s, v5.4s\n" + "smax v6.4s, v0.4s, v6.4s\n" + "smax v7.4s, v0.4s, v7.4s\n" + "smax v8.4s, v0.4s, v8.4s\n" + "smax v9.4s, v0.4s, v9.4s\n" + "smax v10.4s, v0.4s, v10.4s\n" + "smax v11.4s, v0.4s, v11.4s\n" + "smax v12.4s, v0.4s, v12.4s\n" + "smax v13.4s, v0.4s, v13.4s\n" + "smax v14.4s, v0.4s, v14.4s\n" + "smax v15.4s, v0.4s, v15.4s\n" + "smax v16.4s, v0.4s, v16.4s\n" + "smax v17.4s, v0.4s, v17.4s\n" + "smax v18.4s, v0.4s, v18.4s\n" + "smax v19.4s, v0.4s, v19.4s\n" + "smax v20.4s, v0.4s, v20.4s\n" + + "smin v5.4s, v30.4s, v5.4s\n" + "smin v6.4s, v30.4s, v6.4s\n" + "smin v7.4s, v30.4s, v7.4s\n" + "smin v8.4s, v30.4s, v8.4s\n" + "smin v9.4s, v30.4s, v9.4s\n" + "smin v10.4s, v30.4s, v10.4s\n" + "smin v11.4s, v30.4s, v11.4s\n" + "smin v12.4s, v30.4s, v12.4s\n" + "smin v13.4s, v30.4s, v13.4s\n" + "smin v14.4s, v30.4s, v14.4s\n" + "smin v15.4s, v30.4s, v15.4s\n" + "smin v16.4s, v30.4s, v16.4s\n" + "smin v17.4s, v30.4s, v17.4s\n" + "smin v18.4s, v30.4s, v18.4s\n" + "smin v19.4s, v30.4s, v19.4s\n" + "smin v20.4s, v30.4s, v20.4s\n" + + // No need to quantize for ReLU6 + "sqshl v5.4s, v5.4s, #2\n" + "sqshl v6.4s, v6.4s, #2\n" + "sqshl v7.4s, v7.4s, #2\n" + "sqshl v8.4s, v8.4s, #2\n" + "sqshl v9.4s, v9.4s, #2\n" + "sqshl v10.4s, v10.4s, #2\n" + "sqshl v11.4s, v11.4s, #2\n" + "sqshl v12.4s, v12.4s, #2\n" + "sqshl v13.4s, v13.4s, #2\n" + "sqshl v14.4s, v14.4s, #2\n" + "sqshl v15.4s, v15.4s, #2\n" + "sqshl v16.4s, v16.4s, #2\n" + "sqshl v17.4s, v17.4s, #2\n" + "sqshl v18.4s, v18.4s, #2\n" + "sqshl v19.4s, v19.4s, #2\n" + "sqshl v20.4s, v20.4s, #2\n" + + "sqshrn v5.4h, v5.4s, #1\n" + "sqshrn v9.4h, v9.4s, #1\n" + "sqshrn2 v5.8h, v7.4s, #1\n" + "sqshrn2 v9.8h, v11.4s, #1\n" + + "sqshrn v13.4h, v13.4s, #1\n" + "sqshrn v17.4h, v17.4s, #1\n" + "sqshrn2 v13.8h, v15.4s, #1\n" + "sqshrn2 v17.8h, v19.4s, #1\n" + + "sqshrn v5.8b, v5.8h, #1\n" + "sqshrn v13.8b, v13.8h, #1\n" + + "sqshrn2 v5.16b, v9.8h, #1\n" + "sqshrn2 v13.16b, v17.8h, #1\n" + 
"str q5, [%[in0]]\n" + "str q13, [%[in0], #16]\n" + + "sqshrn v6.4h, v6.4s, #1\n" + "sqshrn v10.4h, v10.4s, #1\n" + "sqshrn2 v6.8h, v8.4s, #1\n" + "sqshrn2 v10.8h, v12.4s, #1\n" + + "sqshrn v14.4h, v14.4s, #1\n" + "sqshrn v18.4h, v18.4s, #1\n" + "sqshrn2 v14.8h, v16.4s, #1\n" + "sqshrn2 v18.8h, v20.4s, #1\n" + + "sqshrn v6.8b, v6.8h, #1\n" + "sqshrn v14.8b, v14.8h, #1\n" + + "sqshrn2 v6.16b, v10.8h, #1\n" + "sqshrn2 v14.16b, v18.8h, #1\n" + "str q6, [%[in1]]\n" + "str q14, [%[in1], #16]\n" + : + : [in0] "r"(pw_in0), [in1] "r"(pw_in1) + : "memory", "cc", "v0", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20", "v30"); + break; + } + default: + return NOT_SUPPORTED; + } + + if (depthwiseActivationParamSpec.mode != ACTIVATION_RELU6) { + __asm__ __volatile__("str q5, [%[pw0]]\n" + "str q7, [%[pw0], #16]\n" + "str q9, [%[pw0], #32]\n" + "str q11, [%[pw0], #48]\n" + "str q13, [%[pw0], #64]\n" + "str q15, [%[pw0], #80]\n" + "str q17, [%[pw0], #96]\n" + "str q19, [%[pw0], #112]\n" + + "str q6, [%[pw1]]\n" + "str q8, [%[pw1], #16]\n" + "str q10, [%[pw1], #32]\n" + "str q12, [%[pw1], #48]\n" + "str q14, [%[pw1], #64]\n" + "str q16, [%[pw1], #80]\n" + "str q18, [%[pw1], #96]\n" + "str q20, [%[pw1], #112]\n" + : + : [pw0] "r"(pw_pack_0), [pw1] "r"(pw_pack_1) + : "memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20"); + } + ohow_s += 8; + ohow_tail -= 8; + } + + if (ohow_tail >= 4) { + U32 hw = ohow_s; + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + U32 in_h_1 = (hw + 1) / ow * strideH; + U32 in_w_1 = (hw + 1) % ow * strideW; + U32 in_h_2 = (hw + 2) / ow * strideH; + U32 in_w_2 = (hw + 2) % ow * strideW; + U32 in_h_3 = (hw + 3) / ow * strideH; + U32 in_w_3 = (hw + 3) % ow * strideW; + I32 *pw_pack_0 = dw_out + hw * ic * 8 + c * 4 * 8; + I32 *pw_pack_1 = pw_pack_0 + 16; + // TODO handle asm combined with c. No guarantee that compile will not use vec reg in c. 
+ __asm__ __volatile__("ldr d29, [%[b]]\n" // b_0 + "ldr x1, [%[b], #8]\n" + "ins v29.d[1], x1\n" + "ldr d30, [%[b], #16]\n" // b_1 + "ldr x2, [%[b], #24]\n" + "ins v30.d[1], x2\n" + "mov v5.16b, v29.16b\n" + "mov v7.16b, v29.16b\n" + "mov v9.16b, v29.16b\n" + "mov v11.16b, v29.16b\n" + + "mov v6.16b, v30.16b\n" + "mov v8.16b, v30.16b\n" + "mov v10.16b, v30.16b\n" + "mov v12.16b, v30.16b\n" + : + : [b] "r"(b) + : "memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v29", "v30", "x1", "x2"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const INT8 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + INT8 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + INT8 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + INT8 *in_1 = in_idx + in_h_1 * iw_pad * 8 + in_w_1 * 8; + INT8 *in_2 = in_idx + in_h_2 * iw_pad * 8 + in_w_2 * 8; + INT8 *in_3 = in_idx + in_h_3 * iw_pad * 8 + in_w_3 * 8; + __asm__ __volatile__("ldr d29, [%[f0]]\n" + "ldr d0, [%[in0]]\n" + "ldr d1, [%[in1]]\n" + "ldr d2, [%[in2]]\n" + "sshll v29.8h, v29.8b, #0\n" + "ldr d30, [%[in3]]\n" + "sshll v0.8h, v0.8b, #0\n" + "sshll v1.8h, v1.8b, #0\n" + + "smlal v5.4s, v29.4h, v0.4h\n" + "sshll v2.8h, v2.8b, #0\n" + "smlal2 v6.4s, v29.8h, v0.8h\n" + "sshll v30.8h, v30.8b, #0\n" + "smlal v7.4s, v29.4h, v1.4h\n" + "smlal2 v8.4s, v29.8h, v1.8h\n" + "smlal v9.4s, v29.4h, v2.4h\n" + "smlal2 v10.4s, v29.8h, v2.8h\n" + "smlal v11.4s, v29.4h, v30.4h\n" + "smlal2 v12.4s, v29.8h, v30.8h\n" + : + : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), + [in3] "r"(in_3), [f0] "r"(f_0) + : "memory", "cc", "v0", "v1", "v2", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v29", "v30"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: { + break; + } + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v0.16b, v0.16b, v0.16b\n" // zero + + "smax v5.4s, v0.4s, v5.4s\n" + "smax v6.4s, v0.4s, v6.4s\n" + "smax v7.4s, v0.4s, v7.4s\n" + "smax v8.4s, v0.4s, v8.4s\n" + "smax v9.4s, v0.4s, v9.4s\n" + "smax v10.4s, v0.4s, v10.4s\n" + "smax v11.4s, v0.4s, v11.4s\n" + "smax v12.4s, v0.4s, v12.4s\n" + : + : + : "memory", "cc", "v0", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12"); + break; + } + case ACTIVATION_RELU6: { + INT8 *pw_in0 = pwArray + hw * ic * 8 + c * 4 * 8; + INT8 *pw_in1 = pw_in0 + 16; + __asm__ __volatile__("eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v30.4s, #6\n" // six + + "smax v5.4s, v0.4s, v5.4s\n" + "smax v6.4s, v0.4s, v6.4s\n" + "smax v7.4s, v0.4s, v7.4s\n" + "smax v8.4s, v0.4s, v8.4s\n" + "smax v9.4s, v0.4s, v9.4s\n" + "smax v10.4s, v0.4s, v10.4s\n" + "smax v11.4s, v0.4s, v11.4s\n" + "smax v12.4s, v0.4s, v12.4s\n" + + "smin v5.4s, v30.4s, v5.4s\n" + "smin v6.4s, v30.4s, v6.4s\n" + "smin v7.4s, v30.4s, v7.4s\n" + "smin v8.4s, v30.4s, v8.4s\n" + "smin v9.4s, v30.4s, v9.4s\n" + "smin v10.4s, v30.4s, v10.4s\n" + "smin v11.4s, v30.4s, v11.4s\n" + "smin v12.4s, v30.4s, v12.4s\n" + + // No need to quantize for ReLU6 + "sqshl v5.4s, v5.4s, #2\n" + "sqshl v6.4s, v6.4s, #2\n" + "sqshl v7.4s, v7.4s, #2\n" + "sqshl v8.4s, v8.4s, #2\n" + "sqshl v9.4s, v9.4s, #2\n" + "sqshl v10.4s, v10.4s, #2\n" + "sqshl v11.4s, v11.4s, #2\n" + "sqshl v12.4s, v12.4s, #2\n" + + "sqshrn v5.4h, v5.4s, #1\n" + "sqshrn v9.4h, v9.4s, #1\n" + "sqshrn2 v5.8h, v7.4s, #1\n" + "sqshrn2 v9.8h, v11.4s, #1\n" + + "sqshrn v5.8b, v5.8h, #1\n" + "sqshrn2 v5.16b, v9.8h, #1\n" + "str q5, [%[in0]]\n" + + "sqshrn v6.4h, v6.4s, #1\n" + "sqshrn v10.4h, 
v10.4s, #1\n" + "sqshrn2 v6.8h, v8.4s, #1\n" + "sqshrn2 v10.8h, v12.4s, #1\n" + + "sqshrn v6.8b, v6.8h, #1\n" + + "sqshrn2 v6.16b, v10.8h, #1\n" + "str q6, [%[in1]]\n" + : + : [in0] "r"(pw_in0), [in1] "r"(pw_in1) + : "memory", "cc", "v0", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v30"); + break; + } + default: + return NOT_SUPPORTED; + } + + if (depthwiseActivationParamSpec.mode != ACTIVATION_RELU6) { + __asm__ __volatile__( + "str q5, [%[pw0]]\n" + "str q7, [%[pw0], #16]\n" + "str q9, [%[pw0], #32]\n" + "str q11, [%[pw0], #48]\n" + + "str q6, [%[pw1]]\n" + "str q8, [%[pw1], #16]\n" + "str q10, [%[pw1], #32]\n" + "str q12, [%[pw1], #48]\n" + : + : [pw0] "r"(pw_pack_0), [pw1] "r"(pw_pack_1) + : "memory", "cc", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12"); + } + ohow_s += 4; + ohow_tail -= 4; + } + + // ohow_reminder % 4 + for (I32 hw = ohow_s; hw < ohow; hw++) { + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + I32 *pw_pack_0 = dw_out + hw * ic * 8 + c * 8; + I32 *pw_pack_1 = pw_pack_0 + 4; + // TODO handle asm combined with c. No guarantee that compile will not use vec reg in c. + __asm__ __volatile__("ldr d5, [%[b]]\n" // b_0 + "ldr x1, [%[b], #8]\n" + "ins v5.d[1], x1\n" + "ldr d6, [%[b], #16]\n" // b_1 + "ldr x2, [%[b], #24]\n" + "ins v6.d[1], x2\n" + : + : [b] "r"(b) + : "memory", "cc", "v5", "v6", "x1", "x2"); + + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + const INT8 *f_0 = f + fh_idx * fw * 8 + fw_idx * 8; + INT8 *in_idx = in_pad + fh_idx * dilateH * iw_pad * 8 + fw_idx * dilateW * 8; + INT8 *in_0 = in_idx + in_h_0 * iw_pad * 8 + in_w_0 * 8; + __asm__ __volatile__("ldr d29, [%[f0]]\n" + "ldr d0, [%[in0]]\n" + "sshll v29.8h, v29.8b, #0\n" + "sshll v0.8h, v0.8b, #0\n" + "smlal v5.4s, v29.4h, v0.4h\n" + "smlal2 v6.4s, v29.8h, v0.8h\n" + : + : [in0] "r"(in_0), [f0] "r"(f_0) + : "memory", "cc", "v0", "v5", "v6", "v29"); + } + } + + // activation + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: { + break; + } + case ACTIVATION_RELU: { + __asm__ __volatile__("eor v0.16b, v0.16b, v0.16b\n" // zero + + "smax v5.4s, v0.4s, v5.4s\n" + "smax v6.4s, v0.4s, v6.4s\n" + : + : + : "memory", "cc", "v0", "v5", "v6"); + break; + } + case ACTIVATION_RELU6: { + INT8 *pw_in0 = pwArray + hw * ic * 8 + c * 8; + __asm__ __volatile__("eor v0.16b, v0.16b, v0.16b\n" // zero + "movi v30.4s, #6\n" // six + + "smax v5.4s, v0.4s, v5.4s\n" + "smax v6.4s, v0.4s, v6.4s\n" + + "smin v5.4s, v30.4s, v5.4s\n" + "smin v6.4s, v30.4s, v6.4s\n" + + // No need to quantize for ReLU6 + "sqshl v5.4s, v5.4s, #2\n" + "sqshl v6.4s, v6.4s, #2\n" + + "sqshrn v5.4h, v5.4s, #1\n" + "sqshrn2 v5.8h, v6.4s, #1\n" + + "sqshrn v5.8b, v5.8h, #1\n" + "str d5, [%[in0]]\n" + : + : [in0] "r"(pw_in0) + : "memory", "cc", "v0", "v5", "v6", "v30"); + break; + } + default: + return NOT_SUPPORTED; + } + + if (depthwiseActivationParamSpec.mode != ACTIVATION_RELU6) { + __asm__ __volatile__("str q5, [%[pw0]]\n" + "str q6, [%[pw1]]\n" + : + : [pw0] "r"(pw_pack_0), [pw1] "r"(pw_pack_1) + : "memory", "cc", "v5", "v6"); + } + } + } + + I32 scale = 1; + if (depthwiseActivationParamSpec.mode != ACTIVATION_RELU6) { + // quantization + I32 factor = 16777216; // 24 bits + switch (depthwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: { + I32 max_s = dw_out[0]; + I32 min_s = dw_out[0]; + for (U32 i = 1; i < ohow * ic * 8; i++) { + I32 cur = dw_out[i]; + if (cur > max_s) { + max_s = cur; + } + if (cur < min_s) { + min_s = cur; + } + } + + if (max_s 
<= 127 && min_s >= -127) { // No need to scale + break; + } + + if (max_s == 0 && min_s == 0) { + break; + } + + if (max_s > 0 && min_s < 0) { + I32 factor_p = (factor * 127) / max_s; + I32 factor_n = (factor * -127) / min_s; + factor = (factor_p < factor_n) ? factor_p : factor_n; + } else if (max_s < 0) { + factor = (factor * -127) / min_s; + } else { // min_s > 0 + factor = (factor * 127) / max_s; + } + scale = 16777216 / factor; + break; + } + case ACTIVATION_RELU: { + I32 max_s = dw_out[0]; + for (U32 i = 1; i < ohow * ic * 8; i++) { + I32 cur = dw_out[i]; + if (cur > max_s) { + max_s = cur; + } + } + if (max_s <= 127) { // No need to scale + break; + } + + if (max_s == 0) { + break; + } + + factor = (factor * 127) / max_s; + scale = 16777216 / factor; + break; + } + default: + return NOT_SUPPORTED; + } + I32 factor_v[4]; + for (U32 i = 0; i < 4; i++) { + factor_v[i] = factor; + } + __asm__ __volatile__("ldr q0, [%[factor]]\n" + "mov x0, %[dw_out]\n" + "mov x1, %[pw_in]\n" + "mov x2, %[num]\n" + "0:\n" + "ldr q1, [x0], #16\n" + "ldr q2, [x0], #16\n" + "mul v1.4s, v0.4s, v1.4s\n" + "mul v2.4s, v0.4s, v2.4s\n" + + "shrn v1.4h, v1.4s, #16\n" + "shrn2 v1.8h, v2.4s, #16\n" + + "shrn v1.8b, v1.8h, #8\n" + "subs x2, x2, #8\n" + + "str d1, [x1], #8\n" + "bne 0b\n" + : + : [factor] "r"(factor_v), [dw_out] "r"(dw_out), + [pw_in] "r"(pwArray), [num] "r"((I64)ohow * ic * 8) + : "memory", "cc", "v0", "v1", "v2", "x0", "x1", "x2"); + } + + I32 scale_v[4]; + for (U32 i = 0; i < 4; i++) { + scale_v[i] = scale; + } + + // pw_conv + const INT8 *f_base = pwFilterArray; + + // ohow / 12 + for (I32 hw = 0; hw < ohow - 11; hw += 12) { + const I32 *b0 = pwBiasArray; + const I32 *b1 = b0 + 4; + INT8 *in_pack = pwArray + hw * ic * 8; + for (U32 o = 0; o < oc; o++) { + INT8 *in_hw0 = in_pack; + const INT8 *f_o0c0 = f_base + o * 8 * ic * 8; + I32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const I32 *b_0 = b0; + const I32 *b_1 = b1; + __asm__ __volatile__( + // Bias should be applied after scaling + "eor v5.16b, v5.16b, v5.16b\n" + "ldr d1, [%[in_0]]\n" // in_0 + "eor v6.16b, v6.16b, v6.16b\n" + "ldr x1, [%[in_0], #8]\n" + "eor v7.16b, v7.16b, v7.16b\n" + "ins v1.d[1], x1\n" + "eor v8.16b, v8.16b, v8.16b\n" + "ldr d0, [%[f_0]]\n" // f_0 + "eor v9.16b, v9.16b, v9.16b\n" + "ldr x2, [%[f_0], #8]\n" + "eor v10.16b, v10.16b, v10.16b\n" + "ins v0.d[1], x2\n" + "eor v11.16b, v11.16b, v11.16b\n" + "ldr d3, [%[in_0], #16]\n" // in_1 + "eor v12.16b, v12.16b, v12.16b\n" + "ldr x3, [%[in_0], #24]\n" + "eor v13.16b, v13.16b, v13.16b\n" + "ins v3.d[1], x3\n" + "eor v14.16b, v14.16b, v14.16b\n" + "eor v15.16b, v15.16b, v15.16b\n" + "eor v16.16b, v16.16b, v16.16b\n" + + "eor v17.16b, v17.16b, v17.16b\n" + "eor v18.16b, v18.16b, v18.16b\n" + "eor v19.16b, v19.16b, v19.16b\n" + "eor v20.16b, v20.16b, v20.16b\n" + "eor v21.16b, v21.16b, v21.16b\n" + "eor v22.16b, v22.16b, v22.16b\n" + "eor v23.16b, v23.16b, v23.16b\n" + "eor v24.16b, v24.16b, v24.16b\n" + "eor v25.16b, v25.16b, v25.16b\n" + "eor v26.16b, v26.16b, v26.16b\n" + "eor v27.16b, v27.16b, v27.16b\n" + "eor v28.16b, v28.16b, v28.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + "0:\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr d2, [x3, 32]\n" + "ldr x16, [x3, 40]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "ldr d29, [x0, 16]\n" + "ldr x17, [x0, 24]\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "ins v2.d[1], x16\n" + "ldr d30, [x3, 48]!\n" + "sdot v11.4s, 
v0.16b, v1.4b[3]\n" + "ins v29.d[1], x17\n" + + "sdot v13.4s, v0.16b, v3.4b[0]\n" + "ldr x16, [x3, 8]\n" + "subs x2, x2, #4\n" + "sdot v15.4s, v0.16b, v3.4b[1]\n" + "sdot v17.4s, v0.16b, v3.4b[2]\n" + "ins v30.d[1], x16\n" + "sdot v19.4s, v0.16b, v3.4b[3]\n" + + "sdot v21.4s, v0.16b, v2.4b[0]\n" + "sdot v23.4s, v0.16b, v2.4b[1]\n" + "sdot v25.4s, v0.16b, v2.4b[2]\n" + "sdot v27.4s, v0.16b, v2.4b[3]\n" + + "sdot v14.4s, v29.16b, v3.4b[0]\n" + "sdot v16.4s, v29.16b, v3.4b[1]\n" + "ldr d0, [x0, 32]!\n" + "ldr x17, [x0, 8]\n" + "sdot v18.4s, v29.16b, v3.4b[2]\n" + "sdot v20.4s, v29.16b, v3.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + "ldr d3, [x3, 16]\n" + "ldr x16, [x3, 24]\n" + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + + "ins v0.d[1], x17\n" + "ins v3.d[1], x16\n" + + "sdot v22.4s, v29.16b, v2.4b[0]\n" + "mov v1.16b, v30.16b\n" + "sdot v24.4s, v29.16b, v2.4b[1]\n" + "sdot v26.4s, v29.16b, v2.4b[2]\n" + "sdot v28.4s, v29.16b, v2.4b[3]\n" + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu6]\n" // No need to scale for relu6 + "ldr q3, [%[b_0]]\n" + "ldr q4, [%[b_1]]\n" + "beq 11f\n" + + "ldr q0, [%[scale]]\n" + "mul v5.4s, v0.4s, v5.4s\n" + "mul v6.4s, v0.4s, v6.4s\n" + "mul v7.4s, v0.4s, v7.4s\n" + "mul v8.4s, v0.4s, v8.4s\n" + "mul v9.4s, v0.4s, v9.4s\n" + "mul v10.4s, v0.4s, v10.4s\n" + "mul v11.4s, v0.4s, v11.4s\n" + "mul v12.4s, v0.4s, v12.4s\n" + "mul v13.4s, v0.4s, v13.4s\n" + "mul v14.4s, v0.4s, v14.4s\n" + "mul v15.4s, v0.4s, v15.4s\n" + "mul v16.4s, v0.4s, v16.4s\n" + "mul v17.4s, v0.4s, v17.4s\n" + "mul v18.4s, v0.4s, v18.4s\n" + "mul v19.4s, v0.4s, v19.4s\n" + "mul v20.4s, v0.4s, v20.4s\n" + "mul v21.4s, v0.4s, v21.4s\n" + "mul v22.4s, v0.4s, v22.4s\n" + "mul v23.4s, v0.4s, v23.4s\n" + "mul v24.4s, v0.4s, v24.4s\n" + "mul v25.4s, v0.4s, v25.4s\n" + "mul v26.4s, v0.4s, v26.4s\n" + "mul v27.4s, v0.4s, v27.4s\n" + "mul v28.4s, v0.4s, v28.4s\n" + + "add v5.4s, v3.4s, v5.4s\n" + "add v6.4s, v4.4s, v6.4s\n" + "add v7.4s, v3.4s, v7.4s\n" + "add v8.4s, v4.4s, v8.4s\n" + "add v9.4s, v3.4s, v9.4s\n" + "add v10.4s, v4.4s, v10.4s\n" + "add v11.4s, v3.4s, v11.4s\n" + "add v12.4s, v4.4s, v12.4s\n" + "add v13.4s, v3.4s, v13.4s\n" + "add v14.4s, v4.4s, v14.4s\n" + "add v15.4s, v3.4s, v15.4s\n" + "add v16.4s, v4.4s, v16.4s\n" + "add v17.4s, v3.4s, v17.4s\n" + "add v18.4s, v4.4s, v18.4s\n" + "add v19.4s, v3.4s, v19.4s\n" + "add v20.4s, v4.4s, v20.4s\n" + "add v21.4s, v3.4s, v21.4s\n" + "add v22.4s, v4.4s, v22.4s\n" + "add v23.4s, v3.4s, v23.4s\n" + "add v24.4s, v4.4s, v24.4s\n" + "add v25.4s, v3.4s, v25.4s\n" + "add v26.4s, v4.4s, v26.4s\n" + "add v27.4s, v3.4s, v27.4s\n" + "add v28.4s, v4.4s, v28.4s\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 13f\n" + "eor v1.16b, v1.16b, v1.16b\n" // zero + "smax v5.4s, v5.4s, v1.4s\n" + "smax v6.4s, v6.4s, v1.4s\n" + "smax v7.4s, v7.4s, v1.4s\n" + "smax v8.4s, v8.4s, v1.4s\n" + "smax v9.4s, v9.4s, v1.4s\n" + "smax v10.4s, v10.4s, v1.4s\n" + "smax v11.4s, v11.4s, v1.4s\n" + "smax v12.4s, v12.4s, v1.4s\n" + "smax v13.4s, v13.4s, v1.4s\n" + "smax v14.4s, v14.4s, v1.4s\n" + "smax v15.4s, v15.4s, v1.4s\n" + "smax v16.4s, v16.4s, v1.4s\n" + "smax v17.4s, v17.4s, v1.4s\n" + "smax v18.4s, v18.4s, v1.4s\n" + "smax v19.4s, v19.4s, v1.4s\n" + "smax v20.4s, v20.4s, v1.4s\n" + "smax v21.4s, v21.4s, v1.4s\n" + "smax v22.4s, v22.4s, v1.4s\n" + "smax v23.4s, v23.4s, v1.4s\n" + "smax v24.4s, v24.4s, v1.4s\n" + "smax v25.4s, v25.4s, v1.4s\n" + "smax v26.4s, v26.4s, v1.4s\n" + "smax 
v27.4s, v27.4s, v1.4s\n" + "smax v28.4s, v28.4s, v1.4s\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 13f\n" + // Apply bias + "add v5.4s, v3.4s, v5.4s\n" + "add v6.4s, v4.4s, v6.4s\n" + "add v7.4s, v3.4s, v7.4s\n" + "add v8.4s, v4.4s, v8.4s\n" + "add v9.4s, v3.4s, v9.4s\n" + "add v10.4s, v4.4s, v10.4s\n" + "add v11.4s, v3.4s, v11.4s\n" + "add v12.4s, v4.4s, v12.4s\n" + "add v13.4s, v3.4s, v13.4s\n" + "add v14.4s, v4.4s, v14.4s\n" + "add v15.4s, v3.4s, v15.4s\n" + "add v16.4s, v4.4s, v16.4s\n" + "add v17.4s, v3.4s, v17.4s\n" + "add v18.4s, v4.4s, v18.4s\n" + "add v19.4s, v3.4s, v19.4s\n" + "add v20.4s, v4.4s, v20.4s\n" + "add v21.4s, v3.4s, v21.4s\n" + "add v22.4s, v4.4s, v22.4s\n" + "add v23.4s, v3.4s, v23.4s\n" + "add v24.4s, v4.4s, v24.4s\n" + "add v25.4s, v3.4s, v25.4s\n" + "add v26.4s, v4.4s, v26.4s\n" + "add v27.4s, v3.4s, v27.4s\n" + "add v28.4s, v4.4s, v28.4s\n" + + "eor v1.16b, v0.16b, v0.16b\n" // zero + "movi v2.4s, #6\n" // six + "smax v5.4s, v5.4s, v1.4s\n" + "smax v6.4s, v6.4s, v1.4s\n" + "smax v7.4s, v7.4s, v1.4s\n" + "smax v8.4s, v8.4s, v1.4s\n" + "smax v9.4s, v9.4s, v1.4s\n" + "smax v10.4s, v10.4s, v1.4s\n" + "smax v11.4s, v11.4s, v1.4s\n" + "smax v12.4s, v12.4s, v1.4s\n" + "smax v13.4s, v13.4s, v1.4s\n" + "smax v14.4s, v14.4s, v1.4s\n" + "smax v15.4s, v15.4s, v1.4s\n" + "smax v16.4s, v16.4s, v1.4s\n" + "smax v17.4s, v17.4s, v1.4s\n" + "smax v18.4s, v18.4s, v1.4s\n" + "smax v19.4s, v19.4s, v1.4s\n" + "smax v20.4s, v20.4s, v1.4s\n" + "smax v21.4s, v21.4s, v1.4s\n" + "smax v22.4s, v22.4s, v1.4s\n" + "smax v23.4s, v23.4s, v1.4s\n" + "smax v24.4s, v24.4s, v1.4s\n" + "smax v25.4s, v25.4s, v1.4s\n" + "smax v26.4s, v26.4s, v1.4s\n" + "smax v27.4s, v27.4s, v1.4s\n" + "smax v28.4s, v28.4s, v1.4s\n" + + "smin v5.4s, v5.4s, v2.4s\n" + "smin v6.4s, v6.4s, v2.4s\n" + "smin v7.4s, v7.4s, v2.4s\n" + "smin v8.4s, v8.4s, v2.4s\n" + "smin v9.4s, v9.4s, v2.4s\n" + "smin v10.4s, v10.4s, v2.4s\n" + "smin v11.4s, v11.4s, v2.4s\n" + "smin v12.4s, v12.4s, v2.4s\n" + "smin v13.4s, v13.4s, v2.4s\n" + "smin v14.4s, v14.4s, v2.4s\n" + "smin v15.4s, v15.4s, v2.4s\n" + "smin v16.4s, v16.4s, v2.4s\n" + "smin v17.4s, v17.4s, v2.4s\n" + "smin v18.4s, v18.4s, v2.4s\n" + "smin v19.4s, v19.4s, v2.4s\n" + "smin v20.4s, v20.4s, v2.4s\n" + "smin v21.4s, v21.4s, v2.4s\n" + "smin v22.4s, v22.4s, v2.4s\n" + "smin v23.4s, v23.4s, v2.4s\n" + "smin v24.4s, v24.4s, v2.4s\n" + "smin v25.4s, v25.4s, v2.4s\n" + "smin v26.4s, v26.4s, v2.4s\n" + "smin v27.4s, v27.4s, v2.4s\n" + "smin v28.4s, v28.4s, v2.4s\n" + + "13:\n" + "str q5, [%[out_0]]\n" + "str q6, [%[out_0], #16]\n" + "str q7, [%[out_0], #32]\n" + "str q8, [%[out_0], #48]\n" + "str q9, [%[out_0], #64]\n" + "str q10, [%[out_0], #80]\n" + "str q11, [%[out_0], #96]\n" + "str q12, [%[out_0], #112]\n" + "str q13, [%[out_0], #128]\n" + "str q14, [%[out_0], #144]\n" + "str q15, [%[out_0], #160]\n" + "str q16, [%[out_0], #176]\n" + "str q17, [%[out_0], #192]\n" + "str q18, [%[out_0], #208]\n" + "str q19, [%[out_0], #224]\n" + "str q20, [%[out_0], #240]\n" + "str q21, [%[out_0], #256]\n" + "str q22, [%[out_0], #272]\n" + "str q23, [%[out_0], #288]\n" + "str q24, [%[out_0], #304]\n" + "str q25, [%[out_0], #320]\n" + "str q26, [%[out_0], #336]\n" + "str q27, [%[out_0], #352]\n" + "str q28, [%[out_0], #368]\n" + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_0), [b_1] "r"(b_1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), 
[am_relu6] "r"((I64)ACTIVATION_RELU6), + [scale] "r"(scale_v) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", + "x1", "x2", "x3", "x17", "x16"); + b0 += 8; + b1 += 8; + } + } + + // ohow_reminder % 12 / 8 + U32 ohow_s = (ohow / 12) * 12; + U32 ohow_tail = ohow - ohow_s; + + if (ohow_tail >= 8) { + U32 hw = ohow_s; + const I32 *b0 = pwBiasArray; + const I32 *b1 = b0 + 4; + INT8 *in_pack = pwArray + hw * ic * 8; + for (U32 o = 0; o < oc; o++) { + INT8 *in_hw0 = in_pack; + const INT8 *f_o0c0 = f_base + o * 8 * ic * 8; + I32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const I32 *b_0 = b0; + const I32 *b_1 = b1; + __asm__ __volatile__( + // Bias should be applied after scaling + "eor v5.16b, v5.16b, v5.16b\n" + "ldr d1, [%[in_0]]\n" // in_0 + "eor v6.16b, v6.16b, v6.16b\n" + "ldr x1, [%[in_0], #8]\n" + "eor v7.16b, v7.16b, v7.16b\n" + "ins v1.d[1], x1\n" + "eor v8.16b, v8.16b, v8.16b\n" + "ldr d0, [%[f_0]]\n" // f_0 + "eor v9.16b, v9.16b, v9.16b\n" + "ldr x2, [%[f_0], #8]\n" + "eor v10.16b, v10.16b, v10.16b\n" + "ins v0.d[1], x2\n" + "eor v11.16b, v11.16b, v11.16b\n" + "eor v12.16b, v12.16b, v12.16b\n" + "eor v13.16b, v13.16b, v13.16b\n" + "eor v14.16b, v14.16b, v14.16b\n" + "eor v15.16b, v15.16b, v15.16b\n" + "eor v16.16b, v16.16b, v16.16b\n" + "eor v17.16b, v17.16b, v17.16b\n" + "eor v18.16b, v18.16b, v18.16b\n" + "eor v19.16b, v19.16b, v19.16b\n" + "eor v20.16b, v20.16b, v20.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + "0:\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr d3, [x3, 16]!\n" + "ldr x16, [x3, 8]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "ldr d29, [x0, 16]\n" + "ldr x17, [x0, 24]\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "ins v3.d[1], x16\n" + "ldr d30, [x3, 16]!\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + "ins v29.d[1], x17\n" + + "sdot v13.4s, v0.16b, v3.4b[0]\n" + "ldr x16, [x3, 8]\n" + "subs x2, x2, #4\n" + "sdot v15.4s, v0.16b, v3.4b[1]\n" + "sdot v17.4s, v0.16b, v3.4b[2]\n" + "ins v30.d[1], x16\n" + "sdot v19.4s, v0.16b, v3.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + "ldr d0, [x0, 32]!\n" + "ldr x17, [x0, 8]\n" + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + + "sdot v14.4s, v29.16b, v3.4b[0]\n" + "ins v0.d[1], x17\n" + "mov v1.16b, v30.16b\n" + "sdot v16.4s, v29.16b, v3.4b[1]\n" + "sdot v18.4s, v29.16b, v3.4b[2]\n" + "sdot v20.4s, v29.16b, v3.4b[3]\n" + + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu6]\n" // No need to scale for relu6 + "ldr q3, [%[b_0]]\n" + "ldr q4, [%[b_1]]\n" + "beq 11f\n" + + "ldr q0, [%[scale]]\n" + "mul v5.4s, v0.4s, v5.4s\n" + "mul v6.4s, v0.4s, v6.4s\n" + "mul v7.4s, v0.4s, v7.4s\n" + "mul v8.4s, v0.4s, v8.4s\n" + "mul v9.4s, v0.4s, v9.4s\n" + "mul v10.4s, v0.4s, v10.4s\n" + "mul v11.4s, v0.4s, v11.4s\n" + "mul v12.4s, v0.4s, v12.4s\n" + "mul v13.4s, v0.4s, v13.4s\n" + "mul v14.4s, v0.4s, v14.4s\n" + "mul v15.4s, v0.4s, v15.4s\n" + "mul v16.4s, v0.4s, v16.4s\n" + "mul v17.4s, v0.4s, v17.4s\n" + "mul v18.4s, v0.4s, v18.4s\n" + "mul v19.4s, v0.4s, v19.4s\n" + "mul v20.4s, v0.4s, v20.4s\n" + + "add v5.4s, v3.4s, v5.4s\n" + "add v6.4s, v4.4s, v6.4s\n" + "add v7.4s, v3.4s, v7.4s\n" + "add v8.4s, v4.4s, v8.4s\n" + "add v9.4s, v3.4s, v9.4s\n" + "add v10.4s, v4.4s, v10.4s\n" 
+ "add v11.4s, v3.4s, v11.4s\n" + "add v12.4s, v4.4s, v12.4s\n" + "add v13.4s, v3.4s, v13.4s\n" + "add v14.4s, v4.4s, v14.4s\n" + "add v15.4s, v3.4s, v15.4s\n" + "add v16.4s, v4.4s, v16.4s\n" + "add v17.4s, v3.4s, v17.4s\n" + "add v18.4s, v4.4s, v18.4s\n" + "add v19.4s, v3.4s, v19.4s\n" + "add v20.4s, v4.4s, v20.4s\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 13f\n" + "eor v1.16b, v1.16b, v1.16b\n" // zero + "smax v5.4s, v5.4s, v1.4s\n" + "smax v6.4s, v6.4s, v1.4s\n" + "smax v7.4s, v7.4s, v1.4s\n" + "smax v8.4s, v8.4s, v1.4s\n" + "smax v9.4s, v9.4s, v1.4s\n" + "smax v10.4s, v10.4s, v1.4s\n" + "smax v11.4s, v11.4s, v1.4s\n" + "smax v12.4s, v12.4s, v1.4s\n" + "smax v13.4s, v13.4s, v1.4s\n" + "smax v14.4s, v14.4s, v1.4s\n" + "smax v15.4s, v15.4s, v1.4s\n" + "smax v16.4s, v16.4s, v1.4s\n" + "smax v17.4s, v17.4s, v1.4s\n" + "smax v18.4s, v18.4s, v1.4s\n" + "smax v19.4s, v19.4s, v1.4s\n" + "smax v20.4s, v20.4s, v1.4s\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 13f\n" + // Apply bias + "add v5.4s, v3.4s, v5.4s\n" + "add v6.4s, v4.4s, v6.4s\n" + "add v7.4s, v3.4s, v7.4s\n" + "add v8.4s, v4.4s, v8.4s\n" + "add v9.4s, v3.4s, v9.4s\n" + "add v10.4s, v4.4s, v10.4s\n" + "add v11.4s, v3.4s, v11.4s\n" + "add v12.4s, v4.4s, v12.4s\n" + "add v13.4s, v3.4s, v13.4s\n" + "add v14.4s, v4.4s, v14.4s\n" + "add v15.4s, v3.4s, v15.4s\n" + "add v16.4s, v4.4s, v16.4s\n" + "add v17.4s, v3.4s, v17.4s\n" + "add v18.4s, v4.4s, v18.4s\n" + "add v19.4s, v3.4s, v19.4s\n" + "add v20.4s, v4.4s, v20.4s\n" + + "eor v1.16b, v0.16b, v0.16b\n" // zero + "movi v2.4s, #6\n" // six + "smax v5.4s, v5.4s, v1.4s\n" + "smax v6.4s, v6.4s, v1.4s\n" + "smax v7.4s, v7.4s, v1.4s\n" + "smax v8.4s, v8.4s, v1.4s\n" + "smax v9.4s, v9.4s, v1.4s\n" + "smax v10.4s, v10.4s, v1.4s\n" + "smax v11.4s, v11.4s, v1.4s\n" + "smax v12.4s, v12.4s, v1.4s\n" + "smax v13.4s, v13.4s, v1.4s\n" + "smax v14.4s, v14.4s, v1.4s\n" + "smax v15.4s, v15.4s, v1.4s\n" + "smax v16.4s, v16.4s, v1.4s\n" + "smax v17.4s, v17.4s, v1.4s\n" + "smax v18.4s, v18.4s, v1.4s\n" + "smax v19.4s, v19.4s, v1.4s\n" + "smax v20.4s, v20.4s, v1.4s\n" + + "smin v5.4s, v5.4s, v2.4s\n" + "smin v6.4s, v6.4s, v2.4s\n" + "smin v7.4s, v7.4s, v2.4s\n" + "smin v8.4s, v8.4s, v2.4s\n" + "smin v9.4s, v9.4s, v2.4s\n" + "smin v10.4s, v10.4s, v2.4s\n" + "smin v11.4s, v11.4s, v2.4s\n" + "smin v12.4s, v12.4s, v2.4s\n" + "smin v13.4s, v13.4s, v2.4s\n" + "smin v14.4s, v14.4s, v2.4s\n" + "smin v15.4s, v15.4s, v2.4s\n" + "smin v16.4s, v16.4s, v2.4s\n" + "smin v17.4s, v17.4s, v2.4s\n" + "smin v18.4s, v18.4s, v2.4s\n" + "smin v19.4s, v19.4s, v2.4s\n" + "smin v20.4s, v20.4s, v2.4s\n" + + "13:\n" + "str q5, [%[out_0]]\n" + "str q6, [%[out_0], #16]\n" + "str q7, [%[out_0], #32]\n" + "str q8, [%[out_0], #48]\n" + "str q9, [%[out_0], #64]\n" + "str q10, [%[out_0], #80]\n" + "str q11, [%[out_0], #96]\n" + "str q12, [%[out_0], #112]\n" + "str q13, [%[out_0], #128]\n" + "str q14, [%[out_0], #144]\n" + "str q15, [%[out_0], #160]\n" + "str q16, [%[out_0], #176]\n" + "str q17, [%[out_0], #192]\n" + "str q18, [%[out_0], #208]\n" + "str q19, [%[out_0], #224]\n" + "str q20, [%[out_0], #240]\n" + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_0), [b_1] "r"(b_1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [scale] "r"(scale_v) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", 
"v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v29", "v30", "x0", "x1", "x2", "x3", "x17", "x16"); + b0 += 8; + b1 += 8; + } + ohow_s += 8; + ohow_tail -= 8; + } + + if (ohow_tail >= 4) { + U32 hw = ohow_s; + const I32 *b0 = pwBiasArray; + const I32 *b1 = b0 + 4; + INT8 *in_pack = pwArray + hw * ic * 8; + for (U32 o = 0; o < oc; o++) { + INT8 *in_hw0 = in_pack; + const INT8 *f_o0c0 = f_base + o * 8 * ic * 8; + I32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + // bias + const I32 *b_0 = b0; + const I32 *b_1 = b1; + __asm__ __volatile__( + // Bias should be applied after scaling + "eor v5.16b, v5.16b, v5.16b\n" + "ldr d1, [%[in_0]]\n" // in_0 + "eor v6.16b, v6.16b, v6.16b\n" + "ldr x1, [%[in_0], #8]\n" + "eor v7.16b, v7.16b, v7.16b\n" + "ins v1.d[1], x1\n" + "eor v8.16b, v8.16b, v8.16b\n" + "ldr d0, [%[f_0]]\n" // f_0 + "eor v9.16b, v9.16b, v9.16b\n" + "ldr x2, [%[f_0], #8]\n" + "eor v10.16b, v10.16b, v10.16b\n" + "ins v0.d[1], x2\n" + "eor v11.16b, v11.16b, v11.16b\n" + "eor v12.16b, v12.16b, v12.16b\n" + + // give in address to x3 + "mov x3, %[in_0]\n" + + // give f address to x0 + "mov x0, %[f_0]\n" + + "mov x2, %[ic]\n" // ic_blk + "0:\n" + "ldr d29, [x0, 16]\n" + "ldr x17, [x0, 24]\n" + "sdot v5.4s, v0.16b, v1.4b[0]\n" + "ldr d3, [x3, 16]!\n" + "ldr x16, [x3, 8]\n" + "sdot v7.4s, v0.16b, v1.4b[1]\n" + "ins v29.d[1], x17\n" + "subs x2, x2, #4\n" + "sdot v9.4s, v0.16b, v1.4b[2]\n" + "ins v3.d[1], x16\n" + "sdot v11.4s, v0.16b, v1.4b[3]\n" + + "sdot v6.4s, v29.16b, v1.4b[0]\n" + "ldr d0, [x0, 32]!\n" + "ldr x17, [x0, 8]\n" + "sdot v8.4s, v29.16b, v1.4b[1]\n" + "sdot v10.4s, v29.16b, v1.4b[2]\n" + "ins v0.d[1], x17\n" + "sdot v12.4s, v29.16b, v1.4b[3]\n" + "mov v1.16b, v3.16b\n" + + "bne 0b\n" + + "cmp %[pointwiseActivationMode], %[am_relu6]\n" // No need to scale for relu6 + "ldr q3, [%[b_0]]\n" + "ldr q4, [%[b_1]]\n" + "beq 11f\n" + + "ldr q0, [%[scale]]\n" + "mul v5.4s, v0.4s, v5.4s\n" + "mul v6.4s, v0.4s, v6.4s\n" + "mul v7.4s, v0.4s, v7.4s\n" + "mul v8.4s, v0.4s, v8.4s\n" + "mul v9.4s, v0.4s, v9.4s\n" + "mul v10.4s, v0.4s, v10.4s\n" + "mul v11.4s, v0.4s, v11.4s\n" + "mul v12.4s, v0.4s, v12.4s\n" + + "add v5.4s, v3.4s, v5.4s\n" + "add v6.4s, v4.4s, v6.4s\n" + "add v7.4s, v3.4s, v7.4s\n" + "add v8.4s, v4.4s, v8.4s\n" + "add v9.4s, v3.4s, v9.4s\n" + "add v10.4s, v4.4s, v10.4s\n" + "add v11.4s, v3.4s, v11.4s\n" + "add v12.4s, v4.4s, v12.4s\n" + + "cmp %[pointwiseActivationMode], %[am_relu]\n" + "bne 13f\n" + "eor v1.16b, v1.16b, v1.16b\n" // zero + "smax v5.4s, v5.4s, v1.4s\n" + "smax v6.4s, v6.4s, v1.4s\n" + "smax v7.4s, v7.4s, v1.4s\n" + "smax v8.4s, v8.4s, v1.4s\n" + "smax v9.4s, v9.4s, v1.4s\n" + "smax v10.4s, v10.4s, v1.4s\n" + "smax v11.4s, v11.4s, v1.4s\n" + "smax v12.4s, v12.4s, v1.4s\n" + + "11:\n" + "cmp %[pointwiseActivationMode], %[am_relu6]\n" + "bne 13f\n" + // Apply bias + "add v5.4s, v3.4s, v5.4s\n" + "add v6.4s, v4.4s, v6.4s\n" + "add v7.4s, v3.4s, v7.4s\n" + "add v8.4s, v4.4s, v8.4s\n" + "add v9.4s, v3.4s, v9.4s\n" + "add v10.4s, v4.4s, v10.4s\n" + "add v11.4s, v3.4s, v11.4s\n" + "add v12.4s, v4.4s, v12.4s\n" + + "eor v1.16b, v0.16b, v0.16b\n" // zero + "movi v2.4s, #0x06\n" // six + "smax v5.4s, v5.4s, v1.4s\n" + "smax v6.4s, v6.4s, v1.4s\n" + "smax v7.4s, v7.4s, v1.4s\n" + "smax v8.4s, v8.4s, v1.4s\n" + "smax v9.4s, v9.4s, v1.4s\n" + "smax v10.4s, v10.4s, v1.4s\n" + "smax v11.4s, v11.4s, v1.4s\n" + "smax v12.4s, v12.4s, v1.4s\n" + + "smin v5.4s, v5.4s, v2.4s\n" + "smin v6.4s, v6.4s, v2.4s\n" + "smin v7.4s, v7.4s, v2.4s\n" + 
"smin v8.4s, v8.4s, v2.4s\n" + "smin v9.4s, v9.4s, v2.4s\n" + "smin v10.4s, v10.4s, v2.4s\n" + "smin v11.4s, v11.4s, v2.4s\n" + "smin v12.4s, v12.4s, v2.4s\n" + + "13:\n" + "str q5, [%[out_0]]\n" + "str q6, [%[out_0], #16]\n" + "str q7, [%[out_0], #32]\n" + "str q8, [%[out_0], #48]\n" + "str q9, [%[out_0], #64]\n" + "str q10, [%[out_0], #80]\n" + "str q11, [%[out_0], #96]\n" + "str q12, [%[out_0], #112]\n" + : [out_0] "+r"(out_o0hw0), [in_0] "+r"(in_hw0), [f_0] "+r"(f_o0c0) + : [ic] "r"((I64)ic * 8), [b_0] "r"(b_0), [b_1] "r"(b_1), + [pointwiseActivationMode] "r"((I64)pointwiseActivationParamSpec.mode), + [am_relu] "r"((I64)ACTIVATION_RELU), [am_relu6] "r"((I64)ACTIVATION_RELU6), + [scale] "r"(scale_v) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v29", "v30", "x0", "x1", "x2", "x3", "x17", "x16"); + b0 += 8; + b1 += 8; + } + ohow_s += 4; + ohow_tail -= 4; + } + + for (I32 hw = ohow_s; hw < ohow; hw++) { + const I32 *b0 = pwBiasArray; + INT8 *in_pack = pwArray + hw * ic * 8; + + // compute + for (U32 o = 0; o < oc; o++) { + INT8 *in_hw = in_pack; + const INT8 *f_o = f_base + o * 8 * ic * 8; + I32 *out_o0hw0 = outArray + n * oc * ohow * 8 + o * ohow * 8 + hw * 8; + + int32x4_t res[2] = {0}; + + for (U32 c = 0; c < ic; c++) { + int8x8_t in_2 = vld1_s8(in_hw); + in_hw += 8; + int8x16_t f_8o[4]; + f_8o[0] = vld1q_s8(f_o); + f_8o[1] = vld1q_s8(f_o + 16); + res[0] = vdotq_lane_s32(res[0], f_8o[0], in_2, 0); + res[1] = vdotq_lane_s32(res[1], f_8o[1], in_2, 0); + + f_8o[2] = vld1q_s8(f_o + 32); + f_8o[3] = vld1q_s8(f_o + 48); + f_o += 64; + res[0] = vdotq_lane_s32(res[0], f_8o[2], in_2, 1); + res[1] = vdotq_lane_s32(res[1], f_8o[3], in_2, 1); + } + + if (pointwiseActivationParamSpec.mode != ACTIVATION_RELU6 && scale != 1) { // Scale + int32x4_t sc = vld1q_s32(scale_v); + res[0] = vmulq_s32(res[0], sc); + res[1] = vmulq_s32(res[1], sc); + } + + int32x4_t bias[2]; + bias[0] = vld1q_s32(b0); + bias[1] = vld1q_s32(b0 + 4); + + res[0] = vaddq_s32(res[0], bias[0]); + res[1] = vaddq_s32(res[1], bias[1]); + + switch (pointwiseActivationParamSpec.mode) { + case ACTIVATION_NULL: + break; + case ACTIVATION_RELU: { + int32x4_t z = vdupq_n_s32(0); + res[0] = vmaxq_s32(res[0], z); + res[1] = vmaxq_s32(res[1], z); + break; + } + case ACTIVATION_RELU6: { + int32x4_t z = vdupq_n_s32(0); + int32x4_t s = vdupq_n_s32(6); + res[0] = vmaxq_s32(res[0], z); + res[1] = vmaxq_s32(res[1], z); + res[0] = vminq_s32(res[0], s); + res[1] = vminq_s32(res[1], s); + break; + } + default: + return NOT_SUPPORTED; + } + vst1q_s32(out_o0hw0, res[0]); + vst1q_s32(out_o0hw0 + 4, res[1]); + b0 += 8; + } + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/arm/int8/pooling.cpp b/compute/tensor/src/cpu/arm/int8/pooling.cpp new file mode 100644 index 00000000..9a3bf24b --- /dev/null +++ b/compute/tensor/src/cpu/arm/int8/pooling.cpp @@ -0,0 +1,81 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/int8/tensor_computing_int8.h" + +EE pooling_c8_int8(const INT8 *input, + U32 stride, + int hstart, + int hend, + int wstart, + int wend, + INT8 *output, + PoolingParamSpec poolingParamSpec, + void *scale) +{ + EE ret = SUCCESS; + PoolingMode pm = poolingParamSpec.mode; + U32 kernelSizeH = poolingParamSpec.kernel_h; + U32 kernelSizeW = poolingParamSpec.kernel_w; + if (kernelSizeH * kernelSizeW > 256 && pm == POOLING_MEAN) { + ret = NOT_SUPPORTED; + } + short khkw = kernelSizeH * kernelSizeW; + short factor = 256 / khkw; + F32 *inputScale = (F32 *)scale; + F32 *outputScale = inputScale + 1; + switch (pm) { + case POOLING_MAX: { + *outputScale = *inputScale; + break; + } + case POOLING_MEAN: { + *outputScale = *inputScale * factor * khkw / 256; + break; + } + default: { + ret = NOT_SUPPORTED; + break; + } + } + int8x8_t in1, out1; + int16x8_t out_mean = {0}; + out1 = vdup_n_s8(-128); + short pool_size = (hend - hstart) * (wend - wstart); + for (int kernelH = hstart; kernelH < hend; kernelH++) { + for (int kernelW = wstart; kernelW < wend; kernelW++) { + const U32 index = (kernelH * stride + kernelW) * 8; + in1 = vld1_s8(input + index); + switch (pm) { + case POOLING_MAX: + out1 = vmax_s8(out1, in1); + break; + case POOLING_MEAN: + out_mean = vaddw_s8(out_mean, in1); + break; + default: + ret = NOT_SUPPORTED; + break; + } + } + } + if (pm == POOLING_MEAN) { + short pool_factor = factor * khkw / pool_size; + if (pool_factor > 1) { + out_mean = vmulq_n_s16(out_mean, pool_factor); + } + out1 = vshrn_n_s16(out_mean, 8); + } + vst1_s8(output, out1); + return ret; +} diff --git a/tensor_computing/src/cpu/arm/int8/quantize.cpp b/compute/tensor/src/cpu/arm/int8/quantize.cpp similarity index 82% rename from tensor_computing/src/cpu/arm/int8/quantize.cpp rename to compute/tensor/src/cpu/arm/int8/quantize.cpp index d746c87e..c0675ac5 100644 --- a/tensor_computing/src/cpu/arm/int8/quantize.cpp +++ b/compute/tensor/src/cpu/arm/int8/quantize.cpp @@ -1,24 +1,23 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include #include #include "cpu/arm/int8/tensor_computing_int8.h" #include "cpu/arm/int8/convolution_gemm.h" - -EE quantize_tensor_int32(TensorDesc dDesc, const void* data, TensorDesc* qDesc, void* qData, F32 *scale) +EE quantize_tensor_int32( + TensorDesc dDesc, const void *data, TensorDesc *qDesc, void *qData, F32 *scale) { if (nullptr == data || nullptr == qDesc || nullptr == qData || nullptr == scale) { CHECK_STATUS(NULL_POINTER); @@ -27,7 +26,7 @@ EE quantize_tensor_int32(TensorDesc dDesc, const void* data, TensorDesc* qDesc, DataFormat df; U32 n, c, h, w; if (tensorIs2d(dDesc)) { - CHECK_STATUS(tensor2dfGet(dDesc, &dt, &df, &n, &w)); + CHECK_STATUS(tensor2dGet(dDesc, &dt, &df, &n, &w)); c = 1; h = 1; } else if (tensorIs3d(dDesc)) { @@ -38,7 +37,7 @@ EE quantize_tensor_int32(TensorDesc dDesc, const void* data, TensorDesc* qDesc, } switch (dt) { case DT_I32: { - I32 *array = (I32*)data; + I32 *array = (I32 *)data; int32x4_t tmp_v = vld1q_s32(array); int32x4_t max_v = tmp_v; int32x4_t min_v = tmp_v; @@ -47,7 +46,7 @@ EE quantize_tensor_int32(TensorDesc dDesc, const void* data, TensorDesc* qDesc, CHECK_REQUIREMENT(numData >= 4); U32 i = 4; for (; i < numData - 3; i += 4) { - tmp_v = vld1q_s32(array+i); + tmp_v = vld1q_s32(array + i); max_v = vmaxq_s32(max_v, tmp_v); min_v = vminq_s32(min_v, tmp_v); } @@ -73,7 +72,7 @@ EE quantize_tensor_int32(TensorDesc dDesc, const void* data, TensorDesc* qDesc, I32 factor_max = 127 * 16777216 / max; I32 factor_min = -127 * 16777216 / min; factor = (factor_max < factor_min) ? factor_max : factor_min; - scaleO = (factor_max < factor_min) ? (127.0/max) : (-127.0/min); + scaleO = (factor_max < factor_min) ? 
(127.0 / max) : (-127.0 / min); } else if (max > 0) { factor = 127 * 16777216 / max; scaleO = 127.0 / max; @@ -81,12 +80,13 @@ EE quantize_tensor_int32(TensorDesc dDesc, const void* data, TensorDesc* qDesc, factor = -127 * 16777216 / min; scaleO = -127.0 / min; } - DEBUG_info(max << " is the max I32 value, and min values is " << min); - DEBUG_info(scaleO << " is the derived scale"); + UNI_DEBUG_LOG("%d is the max I32 value, and the min value is %d, %f is the derived " + "scale\n", + max, min, scaleO); *scale *= scaleO; U32 main = numData / 16; - INT8 *qArray = (INT8*)qData; + INT8 *qArray = (INT8 *)qData; CHECK_STATUS(quantize_I32(main * 4, array, factor, scaleO, qArray)); for (U32 i = main * 16; i < numData; i++) { qArray[i] = array[i] * scaleO; @@ -101,7 +101,7 @@ EE quantize_tensor_int32(TensorDesc dDesc, const void* data, TensorDesc* qDesc, } break; } - default:{ + default: { CHECK_STATUS(NOT_SUPPORTED); } } diff --git a/compute/tensor/src/cpu/arm/int8/tensor_computing_int8.h b/compute/tensor/src/cpu/arm/int8/tensor_computing_int8.h new file mode 100644 index 00000000..6fc9c92d --- /dev/null +++ b/compute/tensor/src/cpu/arm/int8/tensor_computing_int8.h @@ -0,0 +1,97 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
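+
+// Note on the requantization scheme shared by the kernels declared below: the
+// 16777216 constant is 2^24, so factor is a Q24 fixed-point scale
+// (factor ~= scaleO * 2^24) and requantization reduces to a widening multiply
+// followed by an arithmetic shift right of 24 -- the SHRN #16 / SHRN #8 pair
+// in the depthwise-pointwise kernel above performs exactly this shift. A
+// scalar sketch of the idea (illustrative only; quantize_I32 is the real
+// vectorized routine, and its rounding may differ):
+//
+//     INT8 requantize_q24(I32 v, I32 factor)
+//     {
+//         return (INT8)(((I64)v * factor) >> 24);
+//     }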
+ +#ifndef _H_TENSOR_COMPUTING_INT8 +#define _H_TENSOR_COMPUTING_INT8 +#ifdef _USE_INT8 +#include <vector> +#include "sys.h" +#include "types.h" +#include "error.h" +#include "cpu/arm/int8/arm_functions_int8.h" + +EE convolution_infer_forward_algorithm_int8(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionPolicy policy, + ConvolutionForwardAlgorithm *algorithm); + +EE convolution_transform_filter_bytes_int8( + TensorDesc filterDesc, ConvolutionForwardAlgorithm algorithm, U32 *bytes); + +EE convolution_transform_filter_int8(TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + void *filterTransformed); + +EE convolution_int8(TensorDesc inputDesc, + const INT8 *input, + TensorDesc filterDesc, + const INT8 *filter, + F16 *scales, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc biasDesc, + const F16 *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec activationDesc, + Arch arch); + +EE depthwise_pointwise_convolution_int8(TensorDesc inputDesc, + INT8 *input, + TensorDesc dwFilterDesc, + const INT8 *dwFilter, + TensorDesc pwFilterDesc, + const INT8 *pwFilter, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc dwBiasDesc, + const I32 *dwBias, + TensorDesc pwBiasDesc, + const I32 *pwBias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + I32 *output, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch); + +EE pooling_c8_int8(const INT8 *input, + U32 stride, + int hstart, + int hend, + int wstart, + int wend, + INT8 *output, + PoolingParamSpec poolingParamSpec, + void *scale); + +EE concat_int8(std::vector<TensorDesc> inputDesc, + std::vector<void *> input, + F32 *inputScale, + int axis, + TensorDesc outputDesc, + void *output, + F32 *outputScale); + +EE quantize_tensor_int32( + TensorDesc dDesc, const void *data, TensorDesc *qDesc, void *qData, F32 *scale); +#endif +#endif diff --git a/tensor_computing/src/cpu/arm/attention_mask.cpp b/compute/tensor/src/cpu/arm/normalization.cpp similarity index 69% rename from tensor_computing/src/cpu/arm/attention_mask.cpp rename to compute/tensor/src/cpu/arm/normalization.cpp index 38928807..a26d8bc3 100644 --- a/tensor_computing/src/cpu/arm/attention_mask.cpp +++ b/compute/tensor/src/cpu/arm/normalization.cpp @@ -1,17 +1,16 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include "cpu/arm/tensor_computing_arm.h" #ifdef _USE_FP32 #include "cpu/arm/fp32/tensor_computing_fp32.h" @@ -20,24 +19,23 @@ #include "cpu/arm/fp16/tensor_computing_fp16.h" #endif -EE attention_mask_arm(TensorDesc inputDesc, const void* input, - I32 attentionLength, bool sameLength, float mask, - TensorDesc outputDesc, void* output) +EE layer_normalization_arm( + TensorDesc inputDesc, void *input, void *alpha, void *beta, TensorDesc outputDesc, void *output) { DataType idt = inputDesc.dt; EE ret = SUCCESS; switch (idt) { #ifdef _USE_FP32 case DT_F32: { - ret = attention_mask_fp32(inputDesc, (const F32*)input, - attentionLength, sameLength, mask, outputDesc, (F32*)output); + ret = layer_normalization_fp32( + inputDesc, (F32 *)input, (F32 *)alpha, (F32 *)beta, outputDesc, (F32 *)output); break; } #endif #ifdef _USE_FP16 case DT_F16: { - ret = attention_mask_fp16(inputDesc, (const F16*)input, - attentionLength, sameLength, mask, outputDesc, (F16*)output); + ret = layer_normalization_fp16( + inputDesc, (F16 *)input, (F16 *)alpha, (F16 *)beta, outputDesc, (F16 *)output); break; } #endif @@ -45,6 +43,5 @@ EE attention_mask_arm(TensorDesc inputDesc, const void* input, ret = NOT_SUPPORTED; break; } - return ret; } diff --git a/compute/tensor/src/cpu/arm/padding.cpp b/compute/tensor/src/cpu/arm/padding.cpp new file mode 100644 index 00000000..a3a4ef79 --- /dev/null +++ b/compute/tensor/src/cpu/arm/padding.cpp @@ -0,0 +1,126 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
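+
+// Pad-mode reference, worked from the index arithmetic below for a row
+// [a b c d] with left = 2:
+//     Pad_Constant  -> 0 0 | a b c d   (zero fill)
+//     Pad_Edge      -> a a | a b c d   (edge value repeated)
+//     Pad_Reflect   -> c b | a b c d   (mirrored, edge not repeated)
+//     Pad_Symmetric -> b a | a b c d   (mirrored, edge repeated)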
+ +#include "types.h" +#include "cpu/arm/tensor_computing_arm.h" +#include + +EE padding_arm(TensorDesc inputDesc, + const void *input, + PadParamSpec padParamSpec, + TensorDesc outputDesc, + void *output) +{ + DataType idt, odt; + DataFormat idf, odf; + U32 in = 0, ic = 0, ih = 0, iw = 0, on = 0, oc = 0, oh = 0, ow = 0; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + CHECK_REQUIREMENT(in == on); + CHECK_REQUIREMENT(ic == oc); + U32 alignSize = 1; + if (idf == DF_NCHWC8) { + alignSize = 8; + } + ic /= alignSize; + oc /= alignSize; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 h = 0; h < ih; h++) { + const U8 *inPtr = + (const U8 *)input + (((n * ic + c) * ih + h) * iw) * alignSize * bytesOf(idt); + U8 *outPtr = (U8 *)output + + (((n * oc + c) * oh + (padParamSpec.top + h)) * ow) * alignSize * bytesOf(odt); + if (padParamSpec.pad_mode == Pad_Constant) { + memset(outPtr, 0, padParamSpec.left * alignSize * bytesOf(odt)); + outPtr += padParamSpec.left * alignSize * bytesOf(odt); + memcpy(outPtr, inPtr, iw * alignSize * bytesOf(idt)); + outPtr += iw * alignSize * bytesOf(odt); + memset(outPtr, 0, padParamSpec.right * alignSize * bytesOf(odt)); + } else { + for (U32 w = 0; w < padParamSpec.left; w++) { + U32 index = 0; + if (padParamSpec.pad_mode == Pad_Reflect) { + index = (padParamSpec.left - w) * alignSize * bytesOf(idt); + } else if (padParamSpec.pad_mode == Pad_Symmetric) { + index = (padParamSpec.left - w - 1) * alignSize * bytesOf(idt); + } + memcpy(outPtr, inPtr + index, alignSize * bytesOf(idt)); + outPtr += alignSize * bytesOf(idt); + } + memcpy(outPtr, inPtr, iw * alignSize * bytesOf(idt)); + outPtr += iw * alignSize * bytesOf(odt); + for (U32 w = 0; w < padParamSpec.right; w++) { + U32 index = (iw - 1) * alignSize * bytesOf(idt); + if (padParamSpec.pad_mode == Pad_Reflect) { + index = (iw - w - 2) * alignSize * bytesOf(idt); + } else if (padParamSpec.pad_mode == Pad_Symmetric) { + index = (iw - w - 1) * alignSize * bytesOf(idt); + } + memcpy(outPtr, inPtr + index, alignSize * bytesOf(idt)); + outPtr += alignSize * bytesOf(idt); + } + } + } + U8 *outPtr = (U8 *)output + (((n * oc + c) * oh) * ow) * alignSize * bytesOf(odt); + for (U32 h = 0; h < padParamSpec.top; h++) { + U32 index = h * ow * alignSize * bytesOf(odt); + if (padParamSpec.pad_mode == Pad_Constant) { + memset(outPtr + index, 0, ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Edge) { + memcpy(outPtr + index, + outPtr + (padParamSpec.top * ow * alignSize * bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Reflect) { + memcpy(outPtr + index, + outPtr + + ((padParamSpec.top + padParamSpec.top - h) * ow * alignSize * + bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Symmetric) { + memcpy(outPtr + index, + outPtr + + ((padParamSpec.top + padParamSpec.top - h - 1) * ow * alignSize * + bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else { + return NOT_SUPPORTED; + } + } + for (U32 h = 0; h < padParamSpec.bottom; h++) { + U32 index = (padParamSpec.top + ih + h) * ow * alignSize * bytesOf(odt); + if (padParamSpec.pad_mode == Pad_Constant) { + memset(outPtr + index, 0, ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Edge) { + memcpy(outPtr + index, + outPtr + ((padParamSpec.top + ih - 1) * ow * alignSize * bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else 
if (padParamSpec.pad_mode == Pad_Reflect) { + // memcpy(outPtr+index, outPtr+((padParamSpec.top+ih-2-h)*ow*alignSize*bytesOf(odt)), ow*alignSize*bytesOf(odt)); + memcpy(outPtr + index, + outPtr + + ((padParamSpec.top + ih - 1 - padParamSpec.bottom + h) * ow * + alignSize * bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Symmetric) { + memcpy(outPtr + index, + outPtr + ((padParamSpec.top + ih - 1 - h) * ow * alignSize * bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else { + return NOT_SUPPORTED; + } + } + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/arm/pooling.cpp b/compute/tensor/src/cpu/arm/pooling.cpp new file mode 100644 index 00000000..3c726e5b --- /dev/null +++ b/compute/tensor/src/cpu/arm/pooling.cpp @@ -0,0 +1,177 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
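+
+// Window clamping reference, worked from the hstart/hend arithmetic below with
+// ih = 5, kernel_h = 3, stride_h = 2, padding_top = 1:
+//     h = 0 -> hstart = max(-1, 0) = 0, hend = min(0 + 3, 5) = 3
+//     h = 1 -> hstart = 1,              hend = min(1 + 3, 5) = 4
+//     h = 2 -> hstart = 3,              hend = min(3 + 3, 5) = 5
+// poolSize = (hend - hstart) * (wend - wstart) is the divisor a mean pool
+// sees, so padded elements are never counted.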
+ +#include "cpu/arm/tensor_computing_arm.h" +#ifdef _USE_FP32 +#include "cpu/arm/fp32/tensor_computing_fp32.h" +#endif +#ifdef _USE_FP16 +#include "cpu/arm/fp16/tensor_computing_fp16.h" +#endif +#ifdef _USE_INT8 +#include "cpu/arm/int8/tensor_computing_int8.h" +#endif + +EE pooling_arm(TensorDesc inputDesc, + const void *input, + PoolingParamSpec poolingParamSpec, + void *scale, + TensorDesc outputDesc, + void *output) +{ + EE ret = SUCCESS; + if (nullptr == input || nullptr == output) { + ret = NULL_POINTER; + } + DataType idt, odt; + DataFormat idf, odf; + U32 in = 0, ic = 0, ih = 0, iw = 0, on = 0, oc = 0, oh = 0, ow = 0; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (idt != odt) { + ret = NOT_MATCH; + } + if (in != on || ic != oc) { + ret = NOT_MATCH; + } + if (idf != DF_NCHWC8 || odf != idf) { + ret = NOT_MATCH; + } + + U32 strideH = poolingParamSpec.stride_h; + U32 strideW = poolingParamSpec.stride_w; + U32 paddingT = poolingParamSpec.padding_top; + U32 paddingL = poolingParamSpec.padding_left; + U32 kernelSizeH = poolingParamSpec.kernel_h; + U32 kernelSizeW = poolingParamSpec.kernel_w; + if (paddingT >= kernelSizeH || paddingL >= kernelSizeW) { + ret = NOT_SUPPORTED; + } + + ic /= 8; + const U8 *inputPtr = (const U8 *)input; + U8 *outputPtr = (U8 *)output; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 h = 0; h < oh; h++) { + for (U32 w = 0; w < ow; w++, outputPtr += 8 * bytesOf(odt)) { + int hstart = UNI_MAX((int)h * (int)strideH - (int)paddingT, 0); + int wstart = UNI_MAX((int)w * (int)strideW - (int)paddingL, 0); + int hend = UNI_MIN(hstart + kernelSizeH, ih); + int wend = UNI_MIN(wstart + kernelSizeW, iw); + int poolSize = (hend - hstart) * (wend - wstart); + switch (idt) { +#ifdef _USE_FP32 + case DT_F32: + ret = pooling_c8_fp32((const F32 *)inputPtr, iw, hstart, hend, wstart, + wend, (F32 *)outputPtr, poolingParamSpec); + break; +#endif +#ifdef _USE_FP16 + case DT_F16: + // Global average pooling kernel can be very big. 
Accumulate to FP32 to protect accuracy + if (poolSize > 256 && poolingParamSpec.mode == POOLING_MEAN) { + ret = pooling_c8_big_fp16((const F16 *)inputPtr, iw, hstart, hend, wstart, + wend, (F16 *)outputPtr, poolSize); + } else { + ret = pooling_c8_fp16((const F16 *)inputPtr, iw, hstart, hend, wstart, + wend, (F16 *)outputPtr, poolingParamSpec); + } + break; +#endif +#ifdef _USE_INT8 + case DT_I8: + ret = pooling_c8_int8((const INT8 *)inputPtr, iw, hstart, hend, wstart, + wend, (INT8 *)outputPtr, poolingParamSpec, scale); + break; +#endif + default: + ret = NOT_SUPPORTED; + break; + } + } + } + inputPtr += ih * iw * 8 * bytesOf(idt); + } + } + return ret; +} + +EE pooling_bp_arm(TensorDesc inputDesc, + const void *input, + PoolingParamSpec poolingParamSpec, + TensorDesc outputDesc, + void *output) +{ + EE ret = SUCCESS; + if (nullptr == input || nullptr == output) { + ret = NULL_POINTER; + } + DataType idt, odt; + DataFormat idf, odf; + U32 in = 0, ic = 0, ih = 0, iw = 0, on = 0, oc = 0, oh = 0, ow = 0; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (idt != odt) { + ret = NOT_MATCH; + } + if (in != on || ic != oc) { + ret = NOT_MATCH; + } + if (idf != DF_NCHWC8 || odf != idf) { + ret = NOT_MATCH; + } + + U32 strideH = poolingParamSpec.stride_h; + U32 strideW = poolingParamSpec.stride_w; + U32 paddingT = poolingParamSpec.padding_top; + U32 paddingL = poolingParamSpec.padding_left; + U32 kernelSizeH = poolingParamSpec.kernel_h; + U32 kernelSizeW = poolingParamSpec.kernel_w; + if (paddingT >= kernelSizeH || paddingL >= kernelSizeW) { + ret = NOT_SUPPORTED; + } + + ic /= 8; + const U8 *inputPtr = (const U8 *)input; + U8 *outputPtr = (U8 *)output; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 h = 0; h < ih; h++) { + for (U32 w = 0; w < iw; w++, inputPtr += 8 * bytesOf(idt)) { + int hstart = (int)h * (int)strideH - (int)paddingT; + int wstart = (int)w * (int)strideW - (int)paddingL; + int hend = UNI_MIN(hstart + kernelSizeH, oh); + int wend = UNI_MIN(wstart + kernelSizeW, ow); + hstart = UNI_MAX(hstart, 0); + wstart = UNI_MAX(wstart, 0); + switch (idt) { +#ifdef _USE_FP32 + case DT_F32: + ret = pooling_bp_c8_fp32((const F32 *)inputPtr, hstart, hend, wstart, + wend, (F32 *)outputPtr, ow, poolingParamSpec); + break; +#endif + default: + ret = NOT_SUPPORTED; + break; + } + } + } + outputPtr += oh * ow * 8 * bytesOf(odt); + } + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/prelu.cpp b/compute/tensor/src/cpu/arm/prelu.cpp new file mode 100644 index 00000000..bb9881f0 --- /dev/null +++ b/compute/tensor/src/cpu/arm/prelu.cpp @@ -0,0 +1,50 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/arm/tensor_computing_arm.h" +#ifdef _USE_FP32 +#include "cpu/arm/fp32/tensor_computing_fp32.h" +#endif +#ifdef _USE_FP16 +#include "cpu/arm/fp16/tensor_computing_fp16.h" +#endif + +EE prelu_arm(TensorDesc inputDesc, + void *input, + void *weight, + PReLUParamSpec preluDesc, + TensorDesc outputDesc, + void *output) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = prelu_fp32( + inputDesc, (F32 *)input, (F32 *)weight, preluDesc, outputDesc, (F32 *)output); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = prelu_fp16( + inputDesc, (F16 *)input, (F16 *)weight, preluDesc, outputDesc, (F16 *)output); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/tensor_computing/src/cpu/arm/quantize.cpp b/compute/tensor/src/cpu/arm/quantize.cpp similarity index 83% rename from tensor_computing/src/cpu/arm/quantize.cpp rename to compute/tensor/src/cpu/arm/quantize.cpp index 13c1aa94..60c04e31 100644 --- a/tensor_computing/src/cpu/arm/quantize.cpp +++ b/compute/tensor/src/cpu/arm/quantize.cpp @@ -1,17 +1,16 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
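+
+// The scale parameter is read-write: quantize_tensor_int32 multiplies the
+// incoming value by the scale it derives (*scale *= scaleO), so quantization
+// scales compose across stages. A minimal usage sketch, with hypothetical
+// values:
+//
+//     F32 scale = 12.7f;  // scale already attached to the I32 tensor
+//     quantize_tensor_arm(desc, data, &qDesc, qData, &scale);
+//     // scale now holds 12.7f * scaleO for the INT8 result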
- #include "cpu/arm/tensor_computing_arm.h" #ifdef _USE_FP16 #include "cpu/arm/fp16/tensor_computing_fp16.h" @@ -20,20 +19,20 @@ #include "cpu/arm/int8/tensor_computing_int8.h" #endif - -EE quantize_tensor_arm(TensorDesc dDesc, const void* data, TensorDesc* qDesc, void* qData, void *scale) +EE quantize_tensor_arm( + TensorDesc dDesc, const void *data, TensorDesc *qDesc, void *qData, void *scale) { EE ret = SUCCESS; switch (dDesc.dt) { #ifdef _USE_FP16 case DT_F16: { - ret = quantize_tensor_fp16(dDesc, data, qDesc, qData, (F16*)scale); + ret = quantize_tensor_fp16(dDesc, data, qDesc, qData, (F16 *)scale); break; } #endif #ifdef _USE_INT8 case DT_I32: { - ret = quantize_tensor_int32(dDesc, data, qDesc, qData, (F32*)scale); + ret = quantize_tensor_int32(dDesc, data, qDesc, qData, (F32 *)scale); break; } #endif diff --git a/compute/tensor/src/cpu/arm/rnn.cpp b/compute/tensor/src/cpu/arm/rnn.cpp new file mode 100644 index 00000000..d5313f11 --- /dev/null +++ b/compute/tensor/src/cpu/arm/rnn.cpp @@ -0,0 +1,59 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "cpu/arm/tensor_computing_arm.h" +#ifdef _USE_FP32 +#include "cpu/arm/fp32/tensor_computing_fp32.h" +#endif +#ifdef _USE_FP16 +#include "cpu/arm/fp16/tensor_computing_fp16.h" +#endif + +EE rnncell_arm(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + void *state, + U32 tmpBytes, + void *tmp, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + void *output, + Arch arch) +{ + EE ret = SUCCESS; + switch (xDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = rnncell_fp32(xDesc, currentX, filterDesc, filter, biasDesc, bias, state, tmpBytes, + tmp, rnnParamSpec, batchStrideX, batchStrideH, hDesc, output, arch); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = rnncell_fp16(xDesc, currentX, filterDesc, filter, biasDesc, bias, state, tmpBytes, + tmp, rnnParamSpec, batchStrideX, batchStrideH, hDesc, output, arch); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/tensor_computing/src/cpu/arm/scale.cpp b/compute/tensor/src/cpu/arm/scale.cpp similarity index 69% rename from tensor_computing/src/cpu/arm/scale.cpp rename to compute/tensor/src/cpu/arm/scale.cpp index 52a96c5c..84ea909b 100644 --- a/tensor_computing/src/cpu/arm/scale.cpp +++ b/compute/tensor/src/cpu/arm/scale.cpp @@ -1,17 +1,16 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- #include "cpu/arm/tensor_computing_arm.h" #ifdef _USE_FP32 #include "cpu/arm/fp32/tensor_computing_fp32.h" @@ -20,33 +19,36 @@ #include "cpu/arm/fp16/tensor_computing_fp16.h" #endif -EE scale_arm(TensorDesc inputDesc, void* input, - I32 axis, void *alpha, void *beta, - TensorDesc outputDesc, void* output) +EE scale_arm(TensorDesc inputDesc, + void *input, + void *alpha, + void *beta, + ScaleParamSpec p, + TensorDesc outputDesc, + void *output) { UNUSED(outputDesc); U32 length = tensorNumElements(inputDesc); - axis = (axis + inputDesc.nDims) % inputDesc.nDims; + int axis = (p.axis + inputDesc.nDims) % inputDesc.nDims; I32 in = inputDesc.dims[inputDesc.nDims - 1]; I32 ic = inputDesc.dims[inputDesc.nDims - 1 - axis]; I32 elements_per_channel = length / (in * ic); - if (inputDesc.df == DF_NCHWC8) + if (inputDesc.df == DF_NCHWC8) { axis = inputDesc.nDims; + } EE ret = SUCCESS; switch (inputDesc.dt) { #ifdef _USE_FP32 case DT_F32: { - ret = scale_fp32((F32*)input, - axis, inputDesc.nDims, (F32*)alpha, (F32*)beta, in, ic, elements_per_channel, - (F32*)output); + ret = scale_fp32((F32 *)input, axis, inputDesc.nDims, (F32 *)alpha, (F32 *)beta, in, ic, + elements_per_channel, (F32 *)output); break; } #endif #ifdef _USE_FP16 case DT_F16: { - ret = scale_fp16((F16*)input, - axis, inputDesc.nDims, (F16*)alpha, (F16*)beta, in, ic, elements_per_channel, - (F16*)output); + ret = scale_fp16((F16 *)input, axis, inputDesc.nDims, (F16 *)alpha, (F16 *)beta, in, ic, + elements_per_channel, (F16 *)output); break; } #endif diff --git a/tensor_computing/src/cpu/arm/softmax.cpp b/compute/tensor/src/cpu/arm/softmax.cpp similarity index 75% rename from tensor_computing/src/cpu/arm/softmax.cpp rename to compute/tensor/src/cpu/arm/softmax.cpp index 9a6f06a8..88ebb474 100644 --- a/tensor_computing/src/cpu/arm/softmax.cpp +++ b/compute/tensor/src/cpu/arm/softmax.cpp @@ -1,17 +1,16 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include "cpu/arm/tensor_computing_arm.h" #ifdef _USE_FP32 #include "cpu/arm/fp32/tensor_computing_fp32.h" @@ -20,23 +19,21 @@ #include "cpu/arm/fp16/tensor_computing_fp16.h" #endif - -EE softmax_arm(TensorDesc inputDesc, const void* input, - int axis, - TensorDesc outputDesc, void* output) +EE softmax_arm( + TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output) { DataType idt = inputDesc.dt; EE ret = SUCCESS; switch (idt) { #ifdef _USE_FP32 case DT_F32: { - ret = softmax_fp32(inputDesc, (const F32*)input, axis, outputDesc, (F32*)output); + ret = softmax_fp32(inputDesc, (const F32 *)input, p.axis, outputDesc, (F32 *)output); break; } #endif #ifdef _USE_FP16 case DT_F16: { - ret = softmax_fp16(inputDesc, (const F16*)input, axis, outputDesc, (F16*)output); + ret = softmax_fp16(inputDesc, (const F16 *)input, p.axis, outputDesc, (F16 *)output); break; } #endif diff --git a/compute/tensor/src/cpu/arm/tensor_computing_arm.h b/compute/tensor/src/cpu/arm/tensor_computing_arm.h new file mode 100644 index 00000000..678ab1da --- /dev/null +++ b/compute/tensor/src/cpu/arm/tensor_computing_arm.h @@ -0,0 +1,227 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
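+
+// Every entry point declared below follows the same dispatch pattern: switch
+// on the descriptor's DataType and forward to the fp32/fp16/int8 kernel
+// compiled under the matching _USE_* guard, returning NOT_SUPPORTED
+// otherwise. Skeleton (foo_arm is a placeholder name):
+//
+//     EE foo_arm(TensorDesc desc, ...)
+//     {
+//         EE ret = SUCCESS;
+//         switch (desc.dt) {
+//     #ifdef _USE_FP32
+//             case DT_F32: ret = foo_fp32(...); break;
+//     #endif
+//     #ifdef _USE_FP16
+//             case DT_F16: ret = foo_fp16(...); break;
+//     #endif
+//             default: ret = NOT_SUPPORTED; break;
+//         }
+//         return ret;
+//     }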
+
+#ifndef _H_TENSOR_COMPUTING_ARM
+#define _H_TENSOR_COMPUTING_ARM
+
+#include <vector>
+#include "sys.h"
+#include "types.h"
+
+EE attention_arm(TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output);
+
+EE clip_arm(TensorDesc inputDesc, void *input, ClipParamSpec p, TensorDesc outputDesc, void *output);
+
+EE convolution_infer_forward_algorithm_arm(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    TensorDesc outputDesc,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionPolicy policy,
+    ConvolutionForwardAlgorithm *algorithm,
+    DataType targetDataType);
+
+EE convolution_transform_filter_bytes_arm(TensorDesc filterDesc,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionForwardAlgorithm algorithm,
+    U32 *bytes);
+
+EE convolution_transform_filter_arm(TensorDesc filterDesc,
+    const void *filter,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionForwardAlgorithm algorithm,
+    TensorDesc *ftmDesc,
+    void *filterTransformed);
+
+EE convolution_infer_forward_tmp_bytes_arm(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    TensorDesc outputDesc,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionForwardAlgorithm algorithm,
+    U32 *bytes);
+
+EE convolution_arm(TensorDesc inputDesc,
+    void *input,
+    TensorDesc filterDesc,
+    const void *filter,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionForwardAlgorithm algorithm,
+    TensorDesc scaleDesc,
+    const void *scale,
+    TensorDesc biasDesc,
+    const void *bias,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    ActivationParamSpec activationDesc,
+    Arch arch);
+
+EE deconvolution_transform_filter_arm(TensorDesc filterDesc,
+    const void *filter,
+    ConvolutionForwardAlgorithm algorithm,
+    TensorDesc *ftmDesc,
+    void *filterTransformed);
+
+EE depthwise_pointwise_convolution_infer_forward_algorithm_arm(TensorDesc inputDesc,
+    TensorDesc dwFilterDesc,
+    TensorDesc pwFilterDesc,
+    TensorDesc outputDesc,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionPolicy policy,
+    DepthwiseConvolutionForwardAlgorithm *algorithm,
+    DataType targetDataType);
+
+EE depthwise_pointwise_convolution_infer_forward_tmp_bytes_arm(TensorDesc inputDesc,
+    TensorDesc dwFilterDesc,
+    TensorDesc pwFilterDesc,
+    TensorDesc outputDesc,
+    ConvolutionParamSpec convParamSpec,
+    DepthwiseConvolutionForwardAlgorithm algorithm,
+    U32 *bytes);
+
+EE depthwise_pointwise_convolution_transform_filter_arm(TensorDesc dwFilterDesc,
+    const void *dwFilter,
+    TensorDesc pwFilterDesc,
+    const void *pwFilter,
+    ConvolutionParamSpec convParamSpec,
+    DepthwiseConvolutionForwardAlgorithm algorithm,
+    TensorDesc *dwFtmDesc,
+    void *dwFilterTransformed,
+    TensorDesc *pwFtmDesc,
+    void *pwFilterTransformed);
+
+EE depthwise_pointwise_convolution_arm(TensorDesc inputDesc,
+    void *input,
+    TensorDesc dwFilterDesc,
+    const void *dwFilter,
+    TensorDesc pwFilterDesc,
+    const void *pwFilter,
+    ConvolutionParamSpec convParamSpec,
+    DepthwiseConvolutionForwardAlgorithm algorithm,
+    TensorDesc dwBiasDesc,
+    const void *dwBias,
+    TensorDesc pwBiasDesc,
+    const void *pwBias,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    ActivationParamSpec depthwiseActivationParamSpec,
+    ActivationParamSpec pointwiseActivationParamSpec,
+    Arch arch);
+
+EE depthwise_convolution_transform_filter_arm(TensorDesc filterDesc,
+    const void *filter,
+    DepthwiseConvolutionForwardAlgorithm algorithm,
+    TensorDesc *ftmDesc,
+    void *filterTransformed);
+
+EE depthwise_convolution_infer_forward_tmp_bytes_arm(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    TensorDesc outputDesc,
+    ConvolutionParamSpec convParamSpec,
+    DepthwiseConvolutionForwardAlgorithm algorithm,
+    U32 *bytes);
+
+EE depthwise_convolution_arm(TensorDesc inputDesc,
+    void *input,
+    TensorDesc filterDesc,
+    const void *filter,
+    ConvolutionParamSpec convParamSpec,
+    DepthwiseConvolutionForwardAlgorithm algorithm,
+    TensorDesc biasDesc,
+    const void *bias,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    ActivationParamSpec depthwiseActivationParamSpec,
+    Arch arch);
+
+EE eltwise_arm(DataType dataType,
+    std::vector<void *> input,
+    std::vector<U32> inputSize,
+    U32 num,
+    U32 len,
+    void *output,
+    EltwiseMode eltwiseMode);
+
+EE rnncell_arm(TensorDesc xDesc,
+    const void *currentX,
+    const TensorDesc *filterDesc,
+    const void **filter,
+    const TensorDesc *biasDesc,
+    const void **bias,
+    void *state,
+    U32 tmpBytes,
+    void *tmp,
+    RNNParamSpec rnnParamSpec,
+    U32 batchStrideX,
+    U32 batchStrideH,
+    TensorDesc hDesc,
+    void *currentH,
+    Arch arch);
+
+EE layer_normalization_arm(
+    TensorDesc inputDesc, void *input, void *alpha, void *beta, TensorDesc outputDesc, void *output);
+
+EE pooling_arm(TensorDesc inputDesc,
+    const void *input,
+    PoolingParamSpec poolingParamSpec,
+    void *scale,
+    TensorDesc outputDesc,
+    void *output);
+
+EE pooling_bp_arm(TensorDesc inputDesc,
+    const void *input,
+    PoolingParamSpec poolingParamSpec,
+    TensorDesc outputDesc,
+    void *output);
+
+EE reshape_arm(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output);
+
+EE scale_arm(TensorDesc inputDesc,
+    void *input,
+    void *alpha,
+    void *beta,
+    ScaleParamSpec p,
+    TensorDesc outputDesc,
+    void *output);
+
+EE softmax_arm(
+    TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output);
+
+EE quantize_tensor_arm(
+    TensorDesc dDesc, const void *data, TensorDesc *qDesc, void *qData, void *scale);
+
+EE check_arm(TensorDesc inputDescA,
+    const void *inputA,
+    TensorDesc inputDescB,
+    const void *inputB,
+    CheckParamSpec p,
+    TensorDesc outputDesc,
+    void *output);
+
+EE attention_mask_arm(TensorDesc inputDesc,
+    const void *input,
+    AttentionMaskParamSpec p,
+    TensorDesc outputDesc,
+    void *output);
+
+EE prelu_arm(TensorDesc inputDesc,
+    void *input,
+    void *weight,
+    PReLUParamSpec preluDesc,
+    TensorDesc outputDesc,
+    void *output);
+#endif
diff --git a/compute/tensor/src/cpu/arm/transform_functions.h b/compute/tensor/src/cpu/arm/transform_functions.h
new file mode 100644
index 00000000..98ea6b85
--- /dev/null
+++ b/compute/tensor/src/cpu/arm/transform_functions.h
@@ -0,0 +1,148 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "types.h"
+#ifdef _USE_FP32
+#include "cpu/arm/fp32/convolution_winograd_transform.h"
+#endif
+#ifdef _USE_FP16
+#include "cpu/arm/fp16/convolution_winograd_transform.h"
+#endif
+
+template <typename T, U32 N>
+inline EE transformCNHWToNHWCNx(
+    TensorDesc inputDesc, const T *input, TensorDesc outputDesc, T *output)
+{
+    if (input == NULL || output == NULL) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    DataType fdt;
+    DataFormat fdf;
+    U32 fn, fc, fh, fw;
+    CHECK_STATUS(tensor4dGet(inputDesc, &fdt, &fdf, &fn, &fc, &fh, &fw));
+    U32 oc = fc / N;
+    U32 hwMax = fh * fw - 1;
+
+    for (U32 o = 0; o < oc; o++) {
+        for (U32 hw = 0; hw < fh * fw; hw++) {
+            for (U32 c = 0; c < fn; c++) {
+                for (U32 ox = 0; ox < N; ox++) {
+                    output[o * fh * fw * fn * N + hw * fn * N + c * N + ox] =
+                        input[c * fc * fh * fw + (o * N + ox) * fh * fw + hwMax - hw];
+                }
+            }
+        }
+    }
+    if ((fc != oc * N) && (N == 16)) {
+        for (U32 hw = 0; hw < fh * fw; hw++) {
+            for (U32 c = 0; c < fn; c++) {
+                for (U32 o8 = 0; o8 < 8; o8++) {
+                    output[(oc * 16) * fh * fw * fn + hw * fn * 8 + c * 8 + o8] =
+                        input[c * fc * fh * fw + (oc * 16 + o8) * fh * fw + hwMax - hw];
+                }
+            }
+        }
+    }
+    return SUCCESS;
+}
+
+template <typename T>
+inline EE transformCNHWToNCHWC8(
+    TensorDesc inputDesc, const T *input, TensorDesc outputDesc, T *output)
+{
+    if (input == NULL || output == NULL) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    DataType fdt;
+    DataFormat fdf;
+    U32 fn, fc, fh, fw;
+    CHECK_STATUS(tensor4dGet(inputDesc, &fdt, &fdf, &fn, &fc, &fh, &fw));
+    CHECK_REQUIREMENT(1 == fn);
+    U32 ic = fc / 8;
+    U32 hwMax = fh * fw - 1;
+    for (U32 c = 0; c < ic; c++) {
+        for (U32 hw = 0; hw < fh * fw; hw++) {
+            for (U32 c8 = 0; c8 < 8; c8++) {
+                output[c * fh * fw * 8 + hw * 8 + c8] = input[(c * 8 + c8) * fh * fw + hwMax - hw];
+            }
+        }
+    }
+    return SUCCESS;
+}
+
+template <typename T, U32 N>
+inline EE transformCNHWToHWNCNx(
+    TensorDesc inputDesc, const T *input, TensorDesc outputDesc, T *output)
+{
+    if (input == NULL || output == NULL) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    DataType fdt;
+    DataFormat fdf;
+    U32 fn, fc, fh, fw;
+    CHECK_STATUS(tensor4dGet(inputDesc, &fdt, &fdf, &fn, &fc, &fh, &fw));
+    const U32 hwMax = 8;
+    for (U32 o = 0; o < fc / N; o++) {
+        for (U32 c = 0; c < fn; c++) {
+            U32 f_off_0 = c * fc * fh * fw + (o * N) * fh * fw;
+            U32 f_off_1 = c * fc * fh * fw + (o * N + N / 2) * fh * fw;
+            U32 ftm_off_0 = o * 36 * fn * N + c * N;
+            U32 ftm_off_1 = o * 36 * fn * N + c * N + N / 2;
+            T F[9][N / 2];
+            T *F_ptr[9];
+            T *Fw[36];
+
+            for (U32 hw = 0; hw < 9; hw++) {
+                for (U32 oo = 0; oo < N / 2; oo++) {
+                    F[hw][oo] = input[f_off_0 + hwMax - hw + oo * fh * fw];
+                }
+                F_ptr[hw] = F[hw];
+            }
+            for (U32 hw = 0; hw < 36; hw++) {
+                Fw[hw] = output + ftm_off_0 + hw * fn * N;
+            }
+            trans_W_4x4_3x3(Fw, F_ptr);
+            for (U32 hw = 0; hw < 9; hw++) {
+                for (U32 oo = 0; oo < N / 2; oo++) {
+                    F[hw][oo] = input[f_off_1 + hwMax - hw + oo * fh * fw];
+                }
+                F_ptr[hw] = F[hw];
+            }
+            for (U32 hw = 0; hw < 36; hw++) {
+                Fw[hw] = output + ftm_off_1 + hw * fn * N;
+            }
+            trans_W_4x4_3x3(Fw, F_ptr);
+        }
+    }
+    U32 oc = (fc / 16) * 16;
+    if ((oc != fc) && (N == 16)) {
+        for (U32 c = 0; c < fn; c++) {
+            U32 f_off_0 = c * fc * fh * fw + oc * fh * fw;
+            U32 ftm_off_0 = oc * 36 * fn + c * 8;
+            T F[9][8];
+            T *F_ptr[9];
+            T *Fw[36];
+            for (U32 hw = 0; hw < 9; hw++) {
+
for (U32 oo = 0; oo < 8; oo++) { + F[hw][oo] = input[f_off_0 + hwMax - hw + oo * fh * fw]; + } + F_ptr[hw] = F[hw]; + } + for (U32 hw = 0; hw < 36; hw++) { + Fw[hw] = output + ftm_off_0 + hw * fn * 8; + } + trans_W_4x4_3x3(Fw, F_ptr); + } + } + return SUCCESS; +} diff --git a/tensor_computing/src/split.cpp b/compute/tensor/src/cpu/clip.cpp similarity index 58% rename from tensor_computing/src/split.cpp rename to compute/tensor/src/cpu/clip.cpp index d1cff984..b882e7fc 100644 --- a/tensor_computing/src/split.cpp +++ b/compute/tensor/src/cpu/clip.cpp @@ -1,48 +1,42 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
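The three transforms above all emit channel-blocked layouts (N lanes per group, with an 8-lane tail block). For reference, this is the address arithmetic of the NCHWC8 layout they feed; a small sketch with illustrative names, not library code:

    #include <cstddef>

    // Element (n, c, h, w) of a C8-blocked tensor laid out as [N][C/8][H][W][8].
    inline size_t nchwc8_offset(size_t C, size_t H, size_t W,
                                size_t n, size_t c, size_t h, size_t w)
    {
        size_t block = c / 8, lane = c % 8;  // channel group and lane within it
        return (((n * (C / 8) + block) * H + h) * W + w) * 8 + lane;
    }

The hwMax - hw indexing in the loops reads each kernel window back to front, which matches the 180-degree spatial flip used when a deconvolution filter is re-expressed as a plain convolution filter (see deconvolution.cpp later in this patch).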
- -#include "tensor_computing.h" +#include "cpu/tensor_computing_cpu.h" #ifdef _USE_GENERAL #include "cpu/general/tensor_computing_general.h" #endif +#ifdef _USE_X86 +#include "cpu/x86/tensor_computing_x86.h" +#endif #ifdef _USE_NEON #include "cpu/arm/tensor_computing_arm.h" #endif -#include - -EE split_infer_output_size(TensorDesc inputDesc, std::vector* outputDesc) -{ - if (nullptr == outputDesc) - CHECK_STATUS(NULL_POINTER); - for(U32 i = 0; i < (*outputDesc).size(); i++) { - (*outputDesc)[i] = inputDesc; - } - return SUCCESS; -} - -EE split(TensorDesc inputDesc, void* input, - std::vector outputDesc, std::vector* output, Arch arch) +EE clip_cpu( + TensorDesc inputDesc, void *input, ClipParamSpec p, TensorDesc outputDesc, void *output, Arch arch) { EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { + if (IS_GENERAL(arch)) { #ifdef _USE_GENERAL - ret = split_general(inputDesc, input, outputDesc, output); + ret = clip_general(inputDesc, input, p, outputDesc, output); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = clip_x86(inputDesc, input, p, outputDesc, output); #endif #ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = split_arm(inputDesc, input, outputDesc, output); + } else if (IS_ARM(arch)) { + ret = clip_arm(inputDesc, input, p, outputDesc, output); #endif } return ret; diff --git a/compute/tensor/src/cpu/concat.cpp b/compute/tensor/src/cpu/concat.cpp new file mode 100644 index 00000000..b9704e59 --- /dev/null +++ b/compute/tensor/src/cpu/concat.cpp @@ -0,0 +1,108 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include +#include "cpu/tensor_computing_cpu.h" +#if defined(_USE_NEON) && defined(_USE_INT8) +#include "cpu/arm/int8/tensor_computing_int8.h" +#endif + +static EE concat(std::vector inputDesc, + std::vector input, + int axis, + TensorDesc outputDesc, + void *output, + void *tmp) +{ + if (nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + U32 num = inputDesc.size(); + if (num < 1) { + return NOT_MATCH; + } + + int dim = outputDesc.nDims; + axis = (axis + dim) % dim; + axis = dim - 1 - axis; + U32 tileSize = bytesOf(outputDesc.dt); + for (I32 i = 0; i < axis; i++) { + tileSize *= outputDesc.dims[i]; + } + U32 loops = 1; + for (I32 i = axis + 1; i < dim; i++) { + loops *= outputDesc.dims[i]; + } + + if (outputDesc.df == DF_NCHWC8) { + if (axis < 2) { + tileSize *= 8; + loops /= 8; + } + } + + bool isC8 = DF_NCHWC8 == outputDesc.df; + + U8 *ptr = (U8 *)output; + U8 *tmpPtr = (U8 *)tmp; + for (U32 i = 0; i < loops; i++) { + for (U32 j = 0; j < num; j++) { + U8 *inPtr = (U8 *)((input)[j]); + if (nullptr == input[j] || tensorNumElements(inputDesc[j]) == 0) { + continue; + } + + if ((4 != inputDesc[j].nDims) || (1 != inputDesc[j].dims[1]) || + (1 != inputDesc[j].dims[0])) { + if (isC8 && (DF_NCHW == inputDesc[j].df)) { + TensorDesc tmpDesc = inputDesc[j]; + tmpDesc.df = DF_NCHWC8; + transformNCHWToNCHWC8(inputDesc[j], inPtr, tmpDesc, tmpPtr); + inPtr = tmpPtr; + } else if (!isC8 && (DF_NCHWC8 == inputDesc[j].df)) { + TensorDesc tmpDesc = inputDesc[j]; + tmpDesc.df = DF_NCHW; + transformToNCHW(inputDesc[j], inPtr, tmpDesc, tmpPtr); + inPtr = tmpPtr; + } + } + U32 blockSize = inputDesc[j].dims[axis] * tileSize; + U8 *srcPtr = inPtr + i * blockSize; + memcpy(ptr, srcPtr, blockSize); + ptr += blockSize; + tmpPtr += tensorNumBytes(inputDesc[j]); + } + } + return SUCCESS; +} + +EE concat_cpu(std::vector inputDesc, + std::vector input, + void *inputScale, + ConcatParamSpec p, + void *tmp, + TensorDesc outputDesc, + void *output, + void *outputScale) +{ + EE ret = NOT_SUPPORTED; + if (outputDesc.dt == DT_I8) { +#if defined(_USE_NEON) && defined(_USE_INT8) + ret = concat_int8( + inputDesc, input, (F32 *)inputScale, p.axis, outputDesc, output, (F32 *)outputScale); +#endif + } else { + ret = concat(inputDesc, input, p.axis, outputDesc, output, tmp); + } + return ret; +} diff --git a/compute/tensor/src/cpu/convolution.cpp b/compute/tensor/src/cpu/convolution.cpp new file mode 100644 index 00000000..e115eb9e --- /dev/null +++ b/compute/tensor/src/cpu/convolution.cpp @@ -0,0 +1,62 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/tensor_computing_x86.h" +#endif +#include "cpu/tensor_computing_cpu.h" + +EE convolution_cpu(TensorDesc inputDesc, + void *input, + TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc scaleDesc, + const void *scale, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec activationDesc, + Arch arch) +{ + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = convolution_general(inputDesc, input, filterDesc, filter, convParamSpec, scaleDesc, + scale, biasDesc, bias, outputDesc, output, activationDesc); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = convolution_x86(inputDesc, input, filterDesc, filter, convParamSpec, algorithm, + scaleDesc, scale, biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc, + arch); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = convolution_arm(inputDesc, input, filterDesc, filter, convParamSpec, algorithm, + scaleDesc, scale, biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc, + arch); +#endif + } + return ret; +} \ No newline at end of file diff --git a/compute/tensor/src/cpu/cpu_functions.h b/compute/tensor/src/cpu/cpu_functions.h new file mode 100644 index 00000000..0aefae95 --- /dev/null +++ b/compute/tensor/src/cpu/cpu_functions.h @@ -0,0 +1,231 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
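The copy sizes in the concat implementation earlier in this patch come from viewing the output as loops x dims[axis] x tileSize once the public axis has been mirrored into the innermost-first dims order. A standalone sketch of just that arithmetic, with illustrative names:

    void concat_copy_sizes(const unsigned *dims, int nDims, int rawAxis,
                           unsigned elemBytes, unsigned *tileSize, unsigned *loops)
    {
        int axis = (rawAxis + nDims) % nDims;  // fold negative axes
        axis = nDims - 1 - axis;               // dims[] is stored innermost-first
        *tileSize = elemBytes;
        for (int i = 0; i < axis; i++) {
            *tileSize *= dims[i];  // contiguous bytes inner to the axis
        }
        *loops = 1;
        for (int i = axis + 1; i < nDims; i++) {
            *loops *= dims[i];  // independent outer slices
        }
    }

Each input then contributes dims[axis] * tileSize contiguous bytes per outer slice, which is exactly the blockSize that concat memcpys.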
+ +#ifndef _H_CPU_FUNCTIONS +#define _H_CPU_FUNCTIONS + +#ifdef _USE_GENERAL +#include "cpu/general/general_functions.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/arm_functions.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/x86_functions.h" +#endif + +typedef void (*ArrayScaleFunction)( + DataType dt, const void *input, void *output, I32 len, F32 alpha, F32 beta); +typedef void (*ArrayAddFunction)( + DataType dt, const void *inputA, const void *inputB, void *output, I32 len); +typedef F32 (*ArraySumFunction)(DataType dt, const void *data, I32 len); +typedef F32 (*ArrayMeanFunction)(DataType dt, const void *data, I32 len); +typedef F32 (*ArrayVarFunction)(DataType dt, const void *data, I32 len, F32 mean); +typedef void (*ArrayPowerFunction)(DataType dt, void *input, void *output, I32 len, F32 power); +typedef void (*ArraySquareAndAddFunction)( + DataType dt, const void *inputA, const void *inputB, void *output, I32 len); +typedef EE (*ArrayActivationFunction)( + DataType dt, void *input, U32 len, ActivationParamSpec activationDesc, void *output); + +inline ArrayScaleFunction get_array_scale_function(Arch arch) +{ + ArrayScaleFunction func; + bool find = false; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + func = array_scale_general; + find = true; +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + func = array_scale_arm; + find = true; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + func = array_scale_x86; + find = true; +#endif + } + CHECK_REQUIREMENT(find); + return func; +} + +inline ArrayAddFunction get_array_add_function(Arch arch) +{ + ArrayAddFunction func; + bool find = false; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + func = array_add_general; + find = true; +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + func = array_add_arm; + find = true; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + func = array_add_x86; + find = true; +#endif + } + CHECK_REQUIREMENT(find); + return func; +} + +inline ArrayMeanFunction get_array_mean_function(Arch arch) +{ + ArrayMeanFunction func; + bool find = false; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + func = array_mean_general; + find = true; +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + func = array_mean_arm; + find = true; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + func = array_mean_x86; + find = true; +#endif + } + CHECK_REQUIREMENT(find); + return func; +} + +inline ArrayVarFunction get_array_var_function(Arch arch) +{ + ArrayVarFunction func; + bool find = false; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + func = array_var_general; + find = true; +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + func = array_var_arm; + find = true; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + func = array_var_x86; + find = true; +#endif + } + CHECK_REQUIREMENT(find); + return func; +} + +inline ArrayPowerFunction get_array_power_function(Arch arch) +{ + ArrayPowerFunction func; + bool find = false; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + func = array_power_general; + find = true; +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + func = array_power_arm; + find = true; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + func = array_power_x86; + find = true; +#endif + } + CHECK_REQUIREMENT(find); + return func; +} + +inline ArraySumFunction get_array_sum_function(Arch arch) +{ + ArraySumFunction func; + bool find = false; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + func = array_sum_general; + find = true; +#endif 
+#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + func = array_sum_arm; + find = true; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + func = array_sum_x86; + find = true; +#endif + } + CHECK_REQUIREMENT(find); + return func; +} + +inline ArraySquareAndAddFunction get_array_square_and_add_function(Arch arch) +{ + ArraySquareAndAddFunction func; + bool find = false; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + func = array_square_and_add_general; + find = true; +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + func = array_square_and_add_arm; + find = true; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + func = array_square_and_add_x86; + find = true; +#endif + } + CHECK_REQUIREMENT(find); + return func; +} + +inline ArrayActivationFunction get_array_activation_function(Arch arch) +{ + ArrayActivationFunction func; + bool find = false; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + func = array_activation_general; + find = true; +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + func = array_activation_arm; + find = true; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + func = array_activation_x86; + find = true; +#endif + } + CHECK_REQUIREMENT(find); + return func; +} +#endif diff --git a/compute/tensor/src/cpu/cpu_functions_template.h b/compute/tensor/src/cpu/cpu_functions_template.h new file mode 100644 index 00000000..e53260b1 --- /dev/null +++ b/compute/tensor/src/cpu/cpu_functions_template.h @@ -0,0 +1,215 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
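The get_array_*_function selectors above let an operator resolve its backend once and then call through a plain function pointer in hot loops, instead of re-running the arch ladder per batch of elements. A sketch of the intended call-site shape; the typedef mirrors the header's, and the type aliases are stand-ins so the sketch is self-contained:

    // Minimal stand-ins for the header's types.
    typedef float F32;
    typedef int I32;
    typedef int DataType;
    typedef F32 (*ArraySumFunction)(DataType dt, const void *data, I32 len);

    F32 sum_rows(ArraySumFunction array_sum, DataType dt,
                 const F32 *data, I32 rows, I32 cols)
    {
        F32 total = 0;
        for (I32 r = 0; r < rows; r++) {
            // backend chosen once, outside; one indirect call per row here
            total += array_sum(dt, data + r * cols, cols);
        }
        return total;
    }

    // e.g. ArraySumFunction f = get_array_sum_function(arch);
    //      F32 s = sum_rows(f, DT_F32, buffer, 8, 1024);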
+
+#ifndef _H_CPU_FUNCTIONS_TEMPLATE
+#define _H_CPU_FUNCTIONS_TEMPLATE
+
+#include <math.h>
+#include <string.h>
+#include "types.h"
+
+// copy input[index]~input[index+length] to output buffer
+template <typename T>
+void get_vector(T *input, int lda, T **output, int ldb, int index, int length, T *buffer)
+{
+    UNUSED(ldb);
+    int local = index % lda;
+    if (length == 1) {
+        *output = buffer;
+        (*output)[0] = input[local];
+    } else if (lda == 1) {
+        *output = input;
+    } else {
+        int remain = lda - local;
+        if (remain >= length) {
+            *output = input + local;
+        } else {
+            *output = buffer;
+            memcpy(*output, input + local, sizeof(T) * remain);
+            for (int i = 0; i < length - remain; i++) {
+                (*output)[remain + i] = input[i % lda];
+            }
+        }
+    }
+}
+
+template <typename T>
+inline void array_scale_template(const T *input, T *output, I32 len, F32 alpha, F32 beta)
+{
+    for (I32 i = 0; i < len; i++) {
+        output[i] = alpha * input[i] + beta;
+    }
+}
+
+template <typename T>
+inline void array_power_template(T *input, T *output, I32 len, F32 power)
+{
+    for (I32 i = 0; i < len; i++) {
+        output[i] = powf(input[i], power);
+    }
+}
+
+template <typename T>
+EE activation_template(ActivationParamSpec activationDesc, F32 input, T *output)
+{
+    F32 value, result = 0;
+    EE ret = SUCCESS;
+    switch (activationDesc.mode) {
+        case ACTIVATION_NULL: {
+            result = input;
+            break;
+        }
+        case ACTIVATION_RELU: {
+            value = input;
+            F32 tmp = activationDesc.value[0] * value;
+            if (value < tmp) {
+                value = tmp;
+            }
+            result = value;
+            break;
+        }
+        case ACTIVATION_RELU6: {
+            value = input;
+            if (value < 0) {
+                value = 0;
+            }
+            if (value > 6) {
+                value = 6;
+            }
+            result = value;
+            break;
+        }
+        case ACTIVATION_H_SIGMOID: {
+            value = input + 3;
+            if (value < 0) {
+                value = 0;
+            }
+            if (value > 6) {
+                value = 6;
+            }
+            result = value / 6;
+            break;
+        }
+        case ACTIVATION_H_SWISH: {
+            value = input + 3;
+            if (value < 0) {
+                value = 0;
+            }
+            if (value > 6) {
+                value = 6;
+            }
+            result = input * (value / 6);
+            break;
+        }
+        case ACTIVATION_GELU: {
+            value = input;
+            F32 two_div_PI_sqrt = sqrt(2 / 3.14159265358979323846);
+            value = two_div_PI_sqrt * (value + 0.044715 * powf(value, 3));
+            value = 1.0 - 2.0 / (exp(2.0 * value) + 1.0);
+            value = 0.5 * (1.0 + value);
+            value = input * value;
+            result = value;
+            break;
+        }
+        case ACTIVATION_TANH: {
+            value = 1.0 - 2.0 / (exp(2.0 * input) + 1.0);
+            result = value;
+            break;
+        }
+        case ACTIVATION_SIGMOID: {
+            value = 1.0 / (1.0 + exp(-1.0 * input));
+            result = value;
+            break;
+        }
+        case ACTIVATION_MISH: {
+            value = input;
+            F32 mish_threshold = 20;
+            if (value < -mish_threshold) {
+                value = exp(value);
+            } else if (!(value > mish_threshold || value < -mish_threshold)) {
+                value = log(exp(value) + 1.0);
+            }
+            value = input * tanh(value);
+            result = value;
+            break;
+        }
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    *output = result;
+    return ret;
+}
+
+template <typename T>
+F32 array_sum_template(const T *array, U32 length)
+{
+    F32 sum = 0;
+    for (U32 i = 0; i < length; i++) {
+        sum += array[i];
+    }
+    return sum;
+}
+
+// array mean
+template <typename T>
+F32 array_mean_template(const T *data, I32 len)
+{
+    if (len <= 0) {
+        return 0;
+    }
+    return array_sum_template(data, len) / len;
+}
+
+template <typename T>
+F32 array_var_template(const T *data, I32 len, F32 mean)
+{
+    F32 sum_s = 0;
+    for (I32 i = 0; i < len; i++) {
+        F32 in = data[i];
+        F32 tmp = in - mean;
+        sum_s += tmp * tmp;
+    }
+    return sum_s / len;
+}
+
+template <typename T>
+inline void array_add_template(const T *inputA, const T *inputB, T *output, I32 len)
+{
+    for (I32 i = 0; i < len; i++) {
+        output[i] = inputA[i] + inputB[i];
+    }
+}
+
+template <typename T>
+inline F32 array_sum_template(const T *data, I32 len)
+{
+    if (len <= 0) {
+        return 0;
+    }
+
+    F32 sum_s = 0;
+    for (I32 i = 0; i < len; i++) {
+        sum_s += data[i];
+    }
+    return sum_s;
+}
+
+template <typename T>
+inline void array_square_and_add_template(const T *inputA, const T *inputB, T *output, I32 len)
+{
+    for (I32 i = 0; i < len; i++) {
+        output[i] = inputA[i] + inputB[i] * inputB[i];
+    }
+}
+#endif
diff --git a/compute/tensor/src/cpu/deconvolution.cpp b/compute/tensor/src/cpu/deconvolution.cpp
new file mode 100644
index 00000000..6a30e66c
--- /dev/null
+++ b/compute/tensor/src/cpu/deconvolution.cpp
@@ -0,0 +1,681 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifdef _USE_OPENMP
+#include <omp.h>
+#endif
+#include "thread_affinity.h"
+#include "cpu/tensor_computing_cpu.h"
+#include "cpu/cpu_functions.h"
+#ifdef _USE_NEON
+#include "cpu/arm/tensor_computing_arm.h"
+#endif
+#ifdef _USE_X86
+#include "cpu/x86/tensor_computing_x86.h"
+#endif
+#include "blas_enhance.h"
+
+#if defined(_USE_X86) || defined(_USE_NEON)
+
+EE deconvolution_infer_forward_algorithm_cpu(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    TensorDesc outputDesc,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionPolicy policy,
+    ConvolutionForwardAlgorithm *algorithm,
+    DataType targetDataType,
+    Arch arch)
+{
+    if (nullptr == algorithm) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    DataType idt, fdt;
+    DataFormat idf, fdf;
+    U32 in, ic, ih, iw;
+    U32 fn, fc, fh, fw;
+    CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
+    CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw));
+
+    if (1 == fn && ic != fn) {
+        *algorithm = CONVOLUTION_ALGORITHM_GROUP_DECONV;
+        return SUCCESS;
+    }
+
+    U32 strideH = convParamSpec.stride_h;
+    U32 strideW = convParamSpec.stride_w;
+    U32 paddingT = convParamSpec.padding_top;
+    U32 paddingB = convParamSpec.padding_bottom;
+    U32 paddingL = convParamSpec.padding_left;
+    U32 paddingR = convParamSpec.padding_right;
+    if ((strideH > 1 || strideW > 1) && fh % strideH == 0 && fw % strideW == 0) {
+        *algorithm = CONVOLUTION_ALGORITHM_IM2COL_GEMM;
+        return SUCCESS;
+    }
+
+    ConvolutionParamSpec transposedCD = convParamSpec;
+    transposedCD.stride_h = 1;
+    transposedCD.stride_w = 1;
+    transposedCD.padding_top = 1;
+    transposedCD.padding_bottom = 1;
+    transposedCD.padding_left = 1;
+    transposedCD.padding_right = 1;
+    transposedCD.dilatedRate_h = 1;
+    transposedCD.dilatedRate_w = 1;
+
+    U32 tPadding = (fh - 1 - paddingT) - 1;
// Leave out padding of length 1 to activate Winograd + U32 bPadding = (fh - 1 - paddingB) - 1; + U32 lPadding = (fw - 1 - paddingL) - 1; + U32 rPadding = (fw - 1 - paddingR) - 1; + + ih = ih + (ih - 1) * (strideH - 1) + tPadding + bPadding; + iw = iw + (iw - 1) * (strideW - 1) + lPadding + rPadding; + + TensorDesc inPaddedDesc = tensor4df(idt, idf, in, ic, ih, iw); + + // Swap fn and fc + filterDesc.dims[2] = filterDesc.dims[3]; + filterDesc.dims[3] = ic; + EE ret = NOT_SUPPORTED; + if (IS_ARM(arch)) { +#ifdef _USE_NEON + ret = convolution_infer_forward_algorithm_arm( + inPaddedDesc, filterDesc, outputDesc, transposedCD, policy, algorithm, targetDataType); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = convolution_infer_forward_algorithm_x86( + inPaddedDesc, filterDesc, outputDesc, transposedCD, policy, algorithm, targetDataType); +#endif + } + return ret; +} + +EE deconvolution_transform_filter_bytes_cpu(TensorDesc filterDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes, + Arch arch) +{ + EE ret = NOT_SUPPORTED; + if (algorithm == CONVOLUTION_ALGORITHM_IM2COL_GEMM) { + *bytes = tensorNumBytes(filterDesc); + ret = SUCCESS; + } else if (algorithm == CONVOLUTION_ALGORITHM_GROUP_DECONV) { + ret = depthwise_convolution_transform_filter_bytes_cpu( + filterDesc, DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT, bytes); + } else { + if (IS_ARM(arch)) { +#ifdef _USE_NEON + ret = + convolution_transform_filter_bytes_arm(filterDesc, convParamSpec, algorithm, bytes); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = + convolution_transform_filter_bytes_x86(filterDesc, convParamSpec, algorithm, bytes); +#endif + } + } + return ret; +} + +static EE deconvolution_transform_filter_im2col_gemm_cpu(TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + TensorDesc *ftmDesc, + void *filterTransformed) +{ + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + if (convParamSpec.stride_h == convParamSpec.kernel_h && + convParamSpec.stride_w == convParamSpec.kernel_w) { + U32 filterDims[5] = {fw, fh, 8, fc / 8, fn}; + U32 ftmDims[5] = {8, fw, fh, fc / 8, fn}; + U32 filterTransformDims[5] = {0, 1, 3, 4, 2}; + CHECK_STATUS(array_transpose( + fdt, filterDims, filter, ftmDims, filterTransformed, filterTransformDims, 5)); + } else { + U32 elementSize = bytesOf(filterDesc.dt); + U32 fnAlignSize = fn / 8; + U8 *ptr = (U8 *)filterTransformed; + for (U32 i = 0; i < convParamSpec.stride_h; i++) { + for (U32 j = 0; j < convParamSpec.stride_w; j++) { + U32 fhStart = (fh - 1 - i - convParamSpec.padding_top) % convParamSpec.stride_h; + U32 fwStart = (fw - 1 - j - convParamSpec.padding_left) % convParamSpec.stride_w; + for (U32 ic = 0; ic < fnAlignSize; ic++) { + for (U32 h = fhStart; h < convParamSpec.kernel_h; h += convParamSpec.stride_h) { + for (U32 w = fwStart; w < convParamSpec.kernel_w; + w += convParamSpec.stride_w) { + for (U32 c8 = 0; c8 < 8; c8++) { + for (U32 oc = 0; oc < fc; oc++, ptr += elementSize) { + U32 srcIndex = + ((((ic * 8 + c8) * fc + oc) * fh + (fh - 1 - h)) * fw + + (fw - 1 - w)) * + elementSize; + const U8 *src = (const U8 *)filter + srcIndex; + memcpy(ptr, src, elementSize); + } + } + } + } + } + } + } + } + *ftmDesc = tensor2df(filterDesc.dt, DF_NORMAL, fn, fc * fh * fw); + return SUCCESS; +} + +EE deconvolution_transform_filter_cpu(TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + 
ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + void *filterTransformed, + Arch arch) +{ + if (algorithm == CONVOLUTION_ALGORITHM_IM2COL_GEMM) { + return deconvolution_transform_filter_im2col_gemm_cpu( + filterDesc, filter, convParamSpec, ftmDesc, filterTransformed); + } + EE ret = NOT_SUPPORTED; + if (IS_ARM(arch)) { +#ifdef _USE_NEON + ret = deconvolution_transform_filter_arm( + filterDesc, filter, algorithm, ftmDesc, filterTransformed); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = deconvolution_transform_filter_x86( + filterDesc, filter, algorithm, ftmDesc, filterTransformed); +#endif + } + return ret; +} + +EE deconvolution_infer_forward_tmp_bytes_cpu(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes, + Arch arch) +{ + if (nullptr == bytes) { + CHECK_STATUS(NULL_POINTER); + } + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw, fn, fc, fh, fw, on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + if (algorithm == CONVOLUTION_ALGORITHM_IM2COL_GEMM) { + U32 ihNum = ih + convParamSpec.kernel_h * 2 - convParamSpec.padding_top - + convParamSpec.padding_bottom; + U32 iwNum = iw + convParamSpec.kernel_w * 2 - convParamSpec.padding_left - + convParamSpec.padding_right; + U32 fhNum = (U32)ceil((float)convParamSpec.kernel_h / convParamSpec.stride_h); + U32 fwNum = (U32)ceil((float)convParamSpec.kernel_w / convParamSpec.stride_w); + TensorDesc matrixADesc = tensor2df(idt, DF_NORMAL, in * ihNum * iwNum, ic * fhNum * fwNum); + TensorDesc matrixBDesc = tensor2df(filterDesc.dt, DF_NORMAL, ic * fhNum * fwNum, + oc * convParamSpec.stride_h * convParamSpec.stride_w); + CHECK_STATUS(matrix_matrix_multiply_tmp_bytes(matrixADesc, matrixBDesc, bytes, arch)); + *bytes *= OMP_NUM_THREADS; + *bytes += tensorNumBytes(matrixADesc) + tensorNumBytes(outputDesc); + return SUCCESS; + } + + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + U32 tPadding = fh - 1 - paddingT; + U32 bPadding = fh - 1 - paddingB; + U32 lPadding = fw - 1 - paddingL; + U32 rPadding = fw - 1 - paddingR; + + ConvolutionParamSpec transposedCD = convParamSpec; + transposedCD.stride_h = 1; + transposedCD.stride_w = 1; + transposedCD.padding_top = 0; + transposedCD.padding_bottom = 0; + transposedCD.padding_left = 0; + transposedCD.padding_right = 0; + transposedCD.dilatedRate_h = 1; + transposedCD.dilatedRate_w = 1; + + ih = ih + (ih - 1) * (strideH - 1) + tPadding + bPadding; + iw = iw + (iw - 1) * (strideW - 1) + lPadding + rPadding; + TensorDesc inPaddedDesc = tensor4df(idt, idf, in, ic, ih, iw); + if (CONVOLUTION_ALGORITHM_GROUP_DECONV == algorithm) { + *bytes = tensorNumBytes(inPaddedDesc) * 2 + 32; + return SUCCESS; + } + if (DF_NCHW == filterDesc.df) { + // Swap fn and fc + filterDesc.dims[2] = filterDesc.dims[3]; + filterDesc.dims[3] = ic; + } + U32 convolution_tmp_bytes = 0; + EE ret = NOT_SUPPORTED; + if (IS_ARM(arch)) { +#ifdef _USE_NEON + ret = convolution_infer_forward_tmp_bytes_arm( + inPaddedDesc, filterDesc, outputDesc, transposedCD, algorithm, 
&convolution_tmp_bytes); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = convolution_infer_forward_tmp_bytes_x86( + inPaddedDesc, filterDesc, outputDesc, transposedCD, algorithm, &convolution_tmp_bytes); +#endif + } + *bytes = tensorNumBytes(inPaddedDesc) + convolution_tmp_bytes; + return ret; +} + +static EE deconvolution_stride_greater_one_and_kernel_divide_stride_cpu(TensorDesc inputDesc, + void *input, + TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec activationDesc, + Arch arch) +{ + DataType idt, odt; + DataFormat idf, odf; + U32 in, ic, ih, iw, on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U8 alignSize = 8; + U32 icAlignSize = ic / alignSize; + U32 inputTileSize = bytesOf(idt) * alignSize; +#ifndef _USE_OPENMP + U32 ocAlignSize = oc / alignSize; + U32 outputTileSize = bytesOf(odt) * alignSize; + ArrayAddFunction add_func = get_array_add_function(arch); + ArrayActivationFunction activation_func = get_array_activation_function(arch); +#endif + U32 iPaddingT = + (convParamSpec.kernel_h - 1 - convParamSpec.padding_top) / convParamSpec.stride_h; + U32 iPaddingB = + (convParamSpec.kernel_h - 1 - convParamSpec.padding_bottom) / convParamSpec.stride_h; + U32 iPaddingL = + (convParamSpec.kernel_w - 1 - convParamSpec.padding_left) / convParamSpec.stride_w; + U32 iPaddingR = + (convParamSpec.kernel_w - 1 - convParamSpec.padding_right) / convParamSpec.stride_w; + U32 iKernelH = convParamSpec.kernel_h / convParamSpec.stride_h; + U32 iKernelW = convParamSpec.kernel_w / convParamSpec.stride_w; + U8 *tmpInput = (U8 *)tmp; + U32 iStrideT = (convParamSpec.kernel_h - 1 - convParamSpec.padding_top) % convParamSpec.stride_h; + U32 iStrideL = + (convParamSpec.kernel_w - 1 - convParamSpec.padding_left) % convParamSpec.stride_w; + U32 iDumpH = 1; + if (iStrideT == convParamSpec.stride_h - 1) { + iDumpH = 0; + } + U32 iDumpW = 1; + if (iStrideL == convParamSpec.stride_w - 1) { + iDumpW = 0; + } + U32 ihNum = iPaddingT + ih + iPaddingB; + U32 iwNum = iPaddingL + iw + iPaddingR; + U32 mNum = 0; + for (U32 n = 0; n < in; n++) { + for (U32 hStart = 0; hStart <= ihNum - iKernelH; hStart++) { + for (U32 wStart = 0; wStart <= iwNum - iKernelW; wStart++, mNum++) { + for (U32 c = 0, k = 0; c < icAlignSize; c++) { + for (U32 i = 0; i < iKernelH; i++) { + for (U32 j = 0; j < iKernelW; j++, tmpInput += inputTileSize, k += 8) { + U32 h = hStart + i; + U32 w = wStart + j; + if (h < iPaddingT || h >= iPaddingT + ih || w < iPaddingL || + w >= iPaddingL + iw) { + memset(tmpInput, 0, inputTileSize); + } else { + U32 srcIndex = (((n * icAlignSize + c) * ih + (h - iPaddingT)) * iw + + (w - iPaddingL)) * + inputTileSize; + memcpy(tmpInput, (const U8 *)input + srcIndex, inputTileSize); + } + } + } + } + } + } + } + U32 kNum = ic * iKernelH * iKernelW; + U32 nNum = oc; + TensorDesc tmpInputDesc = tensor2df(idt, DF_NORMAL, mNum, kNum); + TensorDesc tmpFilterDesc = tensor2df(filterDesc.dt, DF_NORMAL, kNum, nNum); + TensorDesc tmpOutputDesc = tensor2df(odt, DF_NORMAL, mNum, nNum); + tmpInput = (U8 *)tmp; + U32 bufferSize = + (tmpBytes - tensorNumBytes(tmpInputDesc) - tensorNumBytes(tmpOutputDesc) * OMP_NUM_THREADS) / + OMP_NUM_THREADS; +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 
index = 0; index < convParamSpec.stride_h * convParamSpec.stride_w; index++) { + U32 i = index / convParamSpec.stride_w; + U32 j = index % convParamSpec.stride_w; +#ifdef _USE_OPENMP + // For NDK on ARMv7, OpenMP loop cannot reference more than 14 outside variables + ArrayAddFunction add_func = get_array_add_function(arch); + ArrayActivationFunction activation_func = get_array_activation_function(arch); + U32 ocAlignSize = outputDesc.dims[2] / 8; + U32 outputTileSize = bytesOf(outputDesc.dt) * 8; + U32 iPaddingT = + (convParamSpec.kernel_h - 1 - convParamSpec.padding_top) / convParamSpec.stride_h; + U32 iPaddingB = + (convParamSpec.kernel_h - 1 - convParamSpec.padding_bottom) / convParamSpec.stride_h; + U32 iPaddingL = + (convParamSpec.kernel_w - 1 - convParamSpec.padding_left) / convParamSpec.stride_w; + U32 iPaddingR = + (convParamSpec.kernel_w - 1 - convParamSpec.padding_right) / convParamSpec.stride_w; + U32 iKernelH = convParamSpec.kernel_h / convParamSpec.stride_h; + U32 iKernelW = convParamSpec.kernel_w / convParamSpec.stride_w; + U32 iStrideT = + (convParamSpec.kernel_h - 1 - convParamSpec.padding_top) % convParamSpec.stride_h; + U32 iStrideL = + (convParamSpec.kernel_w - 1 - convParamSpec.padding_left) % convParamSpec.stride_w; + U32 ihNum = iPaddingT + inputDesc.dims[1] + iPaddingB; + U32 iwNum = iPaddingL + inputDesc.dims[0] + iPaddingR; + U32 iDumpH = (iStrideT == convParamSpec.stride_h - 1) ? 0 : 1; + U32 iDumpW = (iStrideL == convParamSpec.stride_w - 1) ? 0 : 1; + U32 threadId = omp_get_thread_num(); +#else + U32 threadId = 0; +#endif + U8 *tmpOutput = (U8 *)tmpInput + tensorNumBytes(tmpInputDesc) + + (tensorNumBytes(tmpOutputDesc) + bufferSize) * threadId; + U8 *buffer = (U8 *)tmpOutput + tensorNumBytes(tmpOutputDesc); + memset(tmpOutput, 0, tensorNumBytes(tmpOutputDesc)); + const U8 *tmpFilter = (const U8 *)filter + tensorNumBytes(tmpFilterDesc) * index; + CHECK_STATUS(matrix_matrix_multiply(tmpInputDesc, tmpInput, tmpFilterDesc, tmpFilter, + bufferSize, buffer, tmpOutputDesc, tmpOutput, arch)); + U32 ihStart = 0; + U32 ihEnd = iPaddingT + inputDesc.dims[1] + iPaddingB - iKernelH - iDumpH; + U32 iwStart = 0; + U32 iwEnd = iPaddingL + inputDesc.dims[0] + iPaddingR - iKernelW - iDumpW; + if (i > iStrideT) { + ihStart += iDumpH; + ihEnd += iDumpH; + } + if (j > iStrideL) { + iwStart += iDumpW; + iwEnd += iDumpW; + } + for (U32 n = 0; n < in; n++) { + for (U32 hStart = ihStart, h = 0; hStart <= ihEnd; hStart++, h++) { + for (U32 wStart = iwStart, w = 0; wStart <= iwEnd; wStart++, w++) { + U32 srcIndex = + (((n * (ihNum - iKernelH + 1) + hStart) * (iwNum - iKernelW + 1) + wStart) * + ocAlignSize) * + outputTileSize; + add_func(outputDesc.dt, (U8 *)tmpOutput + srcIndex, bias, + (U8 *)tmpOutput + srcIndex, outputDesc.dims[2]); + CHECK_STATUS(activation_func(outputDesc.dt, (U8 *)tmpOutput + srcIndex, + outputDesc.dims[2], activationDesc, (U8 *)tmpOutput + srcIndex)); + for (U32 c = 0; c < ocAlignSize; c++) { + U32 srcIndex = + (((n * (ihNum - iKernelH + 1) + hStart) * (iwNum - iKernelW + 1) + + wStart) * + ocAlignSize + + c) * + outputTileSize; + U32 dstIndex = (((n * ocAlignSize + c) * outputDesc.dims[1] + + h * convParamSpec.stride_h + i) * + outputDesc.dims[0] + + w * convParamSpec.stride_w + j) * + outputTileSize; + memcpy((U8 *)output + dstIndex, (U8 *)tmpOutput + srcIndex, outputTileSize); + } + } + } + } + } + return SUCCESS; +} + +static EE deconvolution_stride_greater_one_and_kernel_equal_stride_cpu(TensorDesc inputDesc, + void *input, + TensorDesc filterDesc, + const void 
*filter, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec activationDesc, + Arch arch) +{ + ArrayActivationFunction activation_func = get_array_activation_function(arch); + DataType idt, odt; + DataFormat idf, odf; + U32 in, ic, ih, iw, on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 ihNum = ih + convParamSpec.padding_top + convParamSpec.padding_bottom; + U32 iwNum = iw + convParamSpec.padding_left + convParamSpec.padding_right; + U32 fh = convParamSpec.kernel_h; + U32 fw = convParamSpec.kernel_w; + U32 fhNum = fh / convParamSpec.stride_h; + U32 fwNum = fw / convParamSpec.stride_w; + + TensorDesc tmpInputDesc = tensor5df(idt, DF_NCHW, in, ic / 8, ih, iw, 8); + TensorDesc finalInputDesc = tensor5df(idt, DF_NCHW, in, ih, iw, ic / 8, 8); + U32 inputTransformDims[5] = {0, 2, 3, 1, 4}; + void *tmpInput = tmp; + tmp = (U8 *)tmp + tensorNumBytes(finalInputDesc); + tmpBytes -= tensorNumBytes(finalInputDesc); + CHECK_STATUS(array_transpose(tmpInputDesc.dt, tmpInputDesc.dims, input, finalInputDesc.dims, + tmpInput, inputTransformDims, tmpInputDesc.nDims)); + + TensorDesc matrixADesc = tensor2df(idt, DF_NORMAL, in * ihNum * iwNum, ic * fhNum * fwNum); + TensorDesc matrixCDesc = tensor2df(odt, DF_NORMAL, in * ihNum * iwNum, oc * fh * fw); + void *tmpOutput = tmp; + tmp = (U8 *)tmp + tensorNumBytes(matrixCDesc); + tmpBytes -= tensorNumBytes(matrixCDesc); + U32 biasTileSize = bytesOf(biasDesc.dt) * 8; + U8 *tmpOutputPtr = (U8 *)tmpOutput; + for (U32 n = 0; n < on * ih * iw; n++) { + U8 *biasPtr = (U8 *)bias; + for (U32 c = 0; c < oc / 8; c++, biasPtr += biasTileSize) { + for (U32 i = 0; i < oh * ow / (ih * iw); i++, tmpOutputPtr += biasTileSize) { + memcpy(tmpOutputPtr, biasPtr, biasTileSize); + } + } + } + CHECK_STATUS(matrix_matrix_multiply( + matrixADesc, tmpInput, filterDesc, filter, tmpBytes, tmp, matrixCDesc, tmpOutput, arch)); + + U32 tmpOutputDims[7] = {8, ow / iw, oh / ih, oc / 8, iw, ih, on}; + U32 finalOutputDims[7] = {8, ow / iw, iw, oh / ih, ih, oc / 8, on}; + U32 outputTransformDims[7] = {0, 3, 1, 4, 2, 5, 6}; + CHECK_STATUS(array_transpose( + odt, tmpOutputDims, tmpOutput, finalOutputDims, output, outputTransformDims, 7)); + CHECK_STATUS( + activation_func(odt, output, tensorNumElements(outputDesc), activationDesc, output)); + return SUCCESS; +} + +EE deconvolution_cpu(TensorDesc inputDesc, + void *input, + TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc scaleDesc, + const void *scale, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec activationDesc, + Arch arch) +{ + UNUSED(scaleDesc); + UNUSED(scale); + + if (algorithm == CONVOLUTION_ALGORITHM_IM2COL_GEMM) { + if (convParamSpec.stride_h == convParamSpec.kernel_h && + convParamSpec.stride_w == convParamSpec.kernel_w) { + return deconvolution_stride_greater_one_and_kernel_equal_stride_cpu(inputDesc, input, + filterDesc, filter, convParamSpec, biasDesc, bias, tmpBytes, tmp, outputDesc, + output, activationDesc, arch); + } else { + return deconvolution_stride_greater_one_and_kernel_divide_stride_cpu(inputDesc, input, + filterDesc, filter, convParamSpec, biasDesc, bias, tmpBytes, tmp, outputDesc, + output, activationDesc, arch); + } 
+ } + + if (nullptr == input || nullptr == filter || nullptr == output || nullptr == bias || + nullptr == tmp) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (!(idf == DF_NCHWC8 && odf == DF_NCHWC8)) { + CHECK_STATUS(NOT_MATCH); + } + + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + ConvolutionParamSpec transposedCD = convParamSpec; + transposedCD.stride_h = 1; + transposedCD.stride_w = 1; + transposedCD.padding_top = 0; + transposedCD.padding_bottom = 0; + transposedCD.padding_left = 0; + transposedCD.padding_right = 0; + transposedCD.dilatedRate_h = 1; + transposedCD.dilatedRate_w = 1; + + if (CONVOLUTION_ALGORITHM_WINOGRAD == algorithm) { + fh = 3; + fw = 3; + } + + U32 tPadding = fh - 1 - paddingT; + U32 bPadding = fh - 1 - paddingB; + U32 lPadding = fw - 1 - paddingL; + U32 rPadding = fw - 1 - paddingR; + + U32 stuffH = strideH - 1; + U32 stuffW = strideW - 1; + U32 ihPadded = ih + (ih - 1) * stuffH + tPadding + bPadding; + U32 iwPadded = iw + (iw - 1) * stuffW + lPadding + rPadding; + TensorDesc inPaddedDesc = tensor4df(idt, idf, in, ic, ihPadded, iwPadded); + + U8 *inPad = (U8 *)tmp; + U8 *inPadMov = inPad; + U8 *inputMov = (U8 *)input; + U32 memUnit = 8 * bytesOf(idt); + + ic /= 8; + + for (U32 c = 0; c < ic; c++) { + for (U32 h = 0; h < tPadding; h++) { + memset(inPadMov, 0, iwPadded * memUnit); + inPadMov += iwPadded * memUnit; + } + for (U32 h = 0; h < ih - 1; h++) { + memset(inPadMov, 0, lPadding * memUnit); + inPadMov += lPadding * memUnit; + for (U32 w = 0; w < iw - 1; w++) { + memcpy(inPadMov, inputMov, memUnit); + inPadMov += memUnit; + inputMov += memUnit; + memset(inPadMov, 0, stuffW * memUnit); + inPadMov += stuffW * memUnit; + } + memcpy(inPadMov, inputMov, memUnit); + inPadMov += memUnit; + inputMov += memUnit; + memset(inPadMov, 0, rPadding * memUnit); + inPadMov += rPadding * memUnit; + + // stuffH + memset(inPadMov, 0, iwPadded * stuffH * memUnit); + inPadMov += iwPadded * stuffH * memUnit; + } + memset(inPadMov, 0, lPadding * memUnit); + inPadMov += lPadding * memUnit; + for (U32 w = 0; w < iw - 1; w++) { + memcpy(inPadMov, inputMov, memUnit); + inPadMov += memUnit; + inputMov += memUnit; + memset(inPadMov, 0, stuffW * memUnit); + inPadMov += stuffW * memUnit; + } + memcpy(inPadMov, inputMov, memUnit); + inPadMov += memUnit; + inputMov += memUnit; + memset(inPadMov, 0, rPadding * memUnit); + inPadMov += rPadding * memUnit; + + for (U32 h = ihPadded - bPadding; h < ihPadded; h++) { + memset(inPadMov, 0, iwPadded * memUnit); + inPadMov += iwPadded * memUnit; + } + } + + EE ret = NOT_SUPPORTED; + if (algorithm == CONVOLUTION_ALGORITHM_GROUP_DECONV) { + TensorDesc blankTensorDesc; + ActivationParamSpec blankActivationParamSpec; + ret = depthwise_pointwise_convolution_cpu(inPaddedDesc, inPad, filterDesc, filter, + blankTensorDesc, nullptr, transposedCD, + DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT, biasDesc, bias, blankTensorDesc, + nullptr, tmpBytes - tensorNumBytes(inPaddedDesc), inPad + tensorNumBytes(inPaddedDesc), + 
outputDesc, output, activationDesc, blankActivationParamSpec, arch); + } else { + ret = convolution_cpu(inPaddedDesc, inPad, filterDesc, filter, transposedCD, algorithm, + scaleDesc, scale, biasDesc, bias, tmpBytes - tensorNumBytes(inPaddedDesc), + inPad + tensorNumBytes(inPaddedDesc), outputDesc, output, activationDesc, arch); + } + + return ret; +} + +#endif diff --git a/tensor_computing/src/cpu/general/reshape.cpp b/compute/tensor/src/cpu/depthwise_convolution.cpp similarity index 62% rename from tensor_computing/src/cpu/general/reshape.cpp rename to compute/tensor/src/cpu/depthwise_convolution.cpp index b59ecc37..24caa9b5 100644 --- a/tensor_computing/src/cpu/general/reshape.cpp +++ b/compute/tensor/src/cpu/depthwise_convolution.cpp @@ -1,30 +1,31 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
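deconvolution_cpu above realizes the transposed convolution by zero-stuffing: a stride-1 convolution is run over an input with strideH - 1 zeros inserted between rows and fh - 1 - padding zeros at each border, using a 180-degree-flipped kernel. A small sketch of the extent arithmetic (it assumes padding <= fh - 1, as the unsigned math above does):

    // Extent of one spatial axis after zero-stuffing, as built into inPaddedDesc.
    unsigned stuffed_extent(unsigned ih, unsigned strideH, unsigned fh,
                            unsigned paddingT, unsigned paddingB)
    {
        unsigned stuffH = strideH - 1;          // zeros between neighbouring rows
        unsigned tPadding = fh - 1 - paddingT;  // border zeros for the flipped kernel
        unsigned bPadding = fh - 1 - paddingB;
        return ih + (ih - 1) * stuffH + tPadding + bPadding;
    }

    // A stride-1 convolution with kernel height fh over that extent yields
    //   oh = stuffed - fh + 1 = (ih - 1) * strideH + fh - paddingT - paddingB,
    // the standard transposed-convolution output height.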
+#include "cpu/tensor_computing_cpu.h" -#include - -#include "cpu/arm/tensor_computing_arm.h" - -EE reshape_general(TensorDesc inputDesc, void* input, - TensorDesc outputDesc, void* output) +EE depthwise_convolution_transform_filter_bytes_cpu( + TensorDesc filterDesc, DepthwiseConvolutionForwardAlgorithm algorithm, U32 *bytes) { - if (nullptr == input || nullptr == output) + if (nullptr == bytes) { CHECK_STATUS(NULL_POINTER); - - CHECK_REQUIREMENT(DF_NCHWC8 != inputDesc.df); - - CHECK_REQUIREMENT(tensorNumElements(inputDesc) == tensorNumElements(outputDesc)); - memcpy(output, input, tensorNumBytes(inputDesc)); + } + switch (algorithm) { + case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: + *bytes = tensorNumBytes(filterDesc); + break; + default: + return NOT_SUPPORTED; + } + *bytes += 32; return SUCCESS; } diff --git a/compute/tensor/src/cpu/depthwise_pointwise_convolution.cpp b/compute/tensor/src/cpu/depthwise_pointwise_convolution.cpp new file mode 100644 index 00000000..14eea321 --- /dev/null +++ b/compute/tensor/src/cpu/depthwise_pointwise_convolution.cpp @@ -0,0 +1,68 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/tensor_computing_x86.h" +#endif +#include "cpu/tensor_computing_cpu.h" + +EE depthwise_pointwise_convolution_cpu(TensorDesc inputDesc, + void *input, + TensorDesc dwFilterDesc, + const void *dwFilter, + TensorDesc pwFilterDesc, + const void *pwFilter, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc dwBiasDesc, + const void *dwBias, + TensorDesc pwBiasDesc, + const void *pwBias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch) +{ + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = depthwise_pointwise_convolution_general(inputDesc, input, dwFilterDesc, dwFilter, + pwFilterDesc, pwFilter, convParamSpec, dwBiasDesc, dwBias, pwBiasDesc, pwBias, tmpBytes, + tmp, outputDesc, output, depthwiseActivationParamSpec, pointwiseActivationParamSpec); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = depthwise_pointwise_convolution_x86(inputDesc, input, dwFilterDesc, dwFilter, + pwFilterDesc, pwFilter, convParamSpec, algorithm, dwBiasDesc, dwBias, pwBiasDesc, + pwBias, tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec, + pointwiseActivationParamSpec, arch); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = depthwise_pointwise_convolution_arm(inputDesc, input, dwFilterDesc, dwFilter, + pwFilterDesc, pwFilter, convParamSpec, algorithm, dwBiasDesc, dwBias, pwBiasDesc, + pwBias, tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec, + pointwiseActivationParamSpec, arch); +#endif + } + return ret; +} \ No newline at end of file diff --git a/tensor_computing/src/cpu/general/detectionoutput.cpp b/compute/tensor/src/cpu/detectionoutput.cpp similarity index 57% rename from tensor_computing/src/cpu/general/detectionoutput.cpp rename to compute/tensor/src/cpu/detectionoutput.cpp index 24d03e45..9695c638 100644 --- a/tensor_computing/src/cpu/general/detectionoutput.cpp +++ b/compute/tensor/src/cpu/detectionoutput.cpp @@ -1,57 +1,57 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#include "type.h" -#include "tensor_desc.h" #include "error.h" -#include "tensor_computing_type.h" -#include "cpu/general/tensor_computing_general.h" +#include "cpu/tensor_computing_cpu.h" -inline EE qsort_descent(std::vector& boxes, std::vector& scores, int left, int right) +inline EE qsort_descent(std::vector &boxes, std::vector &scores, int left, int right) { - if (boxes.empty() || scores.empty()) + if (boxes.empty() || scores.empty()) { return NOT_SUPPORTED; - + } + int i = left; int j = right; - F32 temp = scores[(left+right) / 2]; + F32 temp = scores[(left + right) / 2]; - while (i <= j){ - while(scores[i] > temp) + while (i <= j) { + while (scores[i] > temp) { i++; - while(scores[j] < temp) + } + while (scores[j] < temp) { j--; - if(i<=j){ + } + if (i <= j) { std::swap(boxes[i], boxes[j]); std::swap(scores[i], scores[j]); i++; j--; } } - - if (left < j) + + if (left < j) { qsort_descent(boxes, scores, left, j); - if (i < right) + } + if (i < right) { qsort_descent(boxes, scores, i, right); - + } + return SUCCESS; } inline F32 intersectionarea(BoxRect a, BoxRect b) { - if (a.xmin > b.xmax || a.xmax < b.xmin || a.ymin > b.ymax || a.ymax < b.ymin) - { + if (a.xmin > b.xmax || a.xmax < b.xmin || a.ymin > b.ymax || a.ymax < b.ymin) { return 0.f; } F32 inter_width = std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin); @@ -60,56 +60,64 @@ inline F32 intersectionarea(BoxRect a, BoxRect b) return inter_width * inter_height; } -inline EE nms_pickedboxes(std::vector boxes, std::vector& picked, F32 nms_threshold) +inline EE nms_pickedboxes(std::vector boxes, std::vector &picked, F32 nms_threshold) { I64 n = boxes.size(); std::vector areas(n); - for(I64 i = 0; i < n; i++){ + for (I64 i = 0; i < n; i++) { BoxRect box = boxes[i]; - + F32 width = box.xmax - box.xmin; F32 height = box.ymax - box.ymin; - + areas[i] = width * height; } - for(I64 i = 0; i < n; i++){ + for (I64 i = 0; i < n; i++) { BoxRect a = boxes[i]; int keep = 1; - for(int j = 0; j < (int)picked.size(); j++){ + for (int j = 0; j < (int)picked.size(); j++) { BoxRect b = boxes[picked[j]]; - F32 inter_area = intersectionarea(a,b); + F32 inter_area = intersectionarea(a, b); F32 union_area = areas[i] + areas[picked[j]] - inter_area; - - if(inter_area / union_area > nms_threshold) + + if (inter_area / union_area > nms_threshold) { keep = 0; + } } - if(keep){ + if (keep) { picked.push_back(i); } } return SUCCESS; } -template -EE detectionoutput(std::vector input, T* output, U32 priorbox_width, U32 num_class, F32 nms_threshold, U32 nms_top_k, U32 keep_top_k, F32 confidence_threshold) +template +EE detectionoutput_kernel(std::vector input, + T *output, + U32 priorbox_width, + U32 num_class, + F32 nms_threshold, + U32 nms_top_k, + U32 keep_top_k, + F32 confidence_threshold) { - T* location = (T*)input[0]; - T* confidence = (T*)input[1]; - T* priorbox = (T*)input[2]; + T *location = (T *)input[0]; + T 
*confidence = (T *)input[1]; + T *priorbox = (T *)input[2]; U32 num_total_priorbox = priorbox_width / 4; U32 numclass = num_class; - std::vector> boxes; + std::vector> boxes; boxes.resize(num_total_priorbox); - T* variance = priorbox + priorbox_width; + T *variance = priorbox + priorbox_width; // decode priorbox - for(U32 i = 0 ; i < num_total_priorbox ; i++){ - T* loc = location + i * 4; - T* pb = priorbox + i * 4; - T* var = variance + i * 4; - + for (U32 i = 0; i < num_total_priorbox; i++) { + T *loc = location + i * 4; + T *pb = priorbox + i * 4; + T *var = variance + i * 4; + F32 pb_w = pb[2] - pb[0]; F32 pb_h = pb[3] - pb[1]; F32 pb_cx = (pb[0] + pb[2]) * 0.5f; @@ -119,15 +127,15 @@ EE detectionoutput(std::vector input, T* output, U32 priorbox_width, U32 F32 box_cy = var[1] * loc[1] * pb_h + pb_cy; F32 box_w = static_cast(exp(var[2] * loc[2]) * pb_w); F32 box_h = static_cast(exp(var[3] * loc[3]) * pb_h); - + std::vector box; box.resize(4); box[0] = box_cx - box_w * 0.5f; box[1] = box_cy - box_h * 0.5f; box[2] = box_cx + box_w * 0.5f; - box[3] = box_cy + box_h * 0.5f; + box[3] = box_cy + box_h * 0.5f; // give box to boxes - boxes[i].assign(box.begin(),box.end()); + boxes[i].assign(box.begin(), box.end()); } std::vector> allclass_boxrects; @@ -135,35 +143,33 @@ EE detectionoutput(std::vector input, T* output, U32 priorbox_width, U32 allclass_boxrects.resize(numclass); allclass_boxscores.resize(numclass); - for(U32 i = 1; i < numclass; i++){ + for (U32 i = 1; i < numclass; i++) { std::vector class_boxrects; std::vector class_boxscores; - for(U32 j = 0; j < num_total_priorbox; j++){ - + for (U32 j = 0; j < num_total_priorbox; j++) { F32 score = confidence[j * numclass + i]; - if (score > confidence_threshold) - { + if (score > confidence_threshold) { std::vector inbox; - inbox.assign(boxes[j].begin(),boxes[j].end()); - BoxRect b = { inbox[0], inbox[1], inbox[2], inbox[3], i }; + inbox.assign(boxes[j].begin(), boxes[j].end()); + BoxRect b = {inbox[0], inbox[1], inbox[2], inbox[3], i}; class_boxrects.push_back(b); class_boxscores.push_back(score); } } - //sort the boxes with scores - qsort_descent(class_boxrects, class_boxscores, 0, static_cast(class_boxscores.size()-1)); + // sort the boxes with scores + qsort_descent( + class_boxrects, class_boxscores, 0, static_cast(class_boxscores.size() - 1)); - if(nms_top_k < (U32)class_boxrects.size()){ + if (nms_top_k < (U32)class_boxrects.size()) { class_boxrects.resize(nms_top_k); class_boxscores.resize(nms_top_k); } - //apply nms + // apply nms std::vector picked; nms_pickedboxes(class_boxrects, picked, nms_threshold); - for(I64 j = 0; j < (I64)picked.size(); j++) - { + for (I64 j = 0; j < (I64)picked.size(); j++) { I64 picked_box = picked[j]; allclass_boxrects[i].push_back(class_boxrects[picked_box]); allclass_boxscores[i].push_back(class_boxscores[picked_box]); @@ -173,42 +179,43 @@ EE detectionoutput(std::vector input, T* output, U32 priorbox_width, U32 std::vector boxrects; std::vector boxscores; - for (U32 i = 1; i < numclass ; i++) - { + for (U32 i = 1; i < numclass; i++) { boxrects.insert(boxrects.end(), allclass_boxrects[i].begin(), allclass_boxrects[i].end()); - boxscores.insert(boxscores.end(), allclass_boxscores[i].begin(), allclass_boxscores[i].end()); + boxscores.insert( + boxscores.end(), allclass_boxscores[i].begin(), allclass_boxscores[i].end()); } - qsort_descent(boxrects, boxscores, 0, static_cast(boxscores.size()-1)); + qsort_descent(boxrects, boxscores, 0, static_cast(boxscores.size() - 1)); - if (keep_top_k < 
(U32)boxrects.size()) - { + if (keep_top_k < (U32)boxrects.size()) { boxrects.resize(keep_top_k); boxscores.resize(keep_top_k); } U32 num_detected = static_cast(boxrects.size()); - if (num_detected == 0) - return SUCCESS; // the first box contains the number of available boxes in the first element. output[0] = num_detected; output[1] = output[2] = output[3] = output[4] = output[5] = 0; - for(U32 i = 0; i < num_detected ; i++){ + for (U32 i = 0; i < num_detected; i++) { BoxRect b = boxrects[i]; F32 score = boxscores[i]; - output[(i+1)*6] = b.label; - output[(i+1)*6+1] = score; - output[(i+1)*6+2] = b.xmin; - output[(i+1)*6+3] = b.ymin; - output[(i+1)*6+4] = b.xmax; - output[(i+1)*6+5] = b.ymax; + output[(i + 1) * 6] = b.label; + output[(i + 1) * 6 + 1] = score; + output[(i + 1) * 6 + 2] = b.xmin; + output[(i + 1) * 6 + 3] = b.ymin; + output[(i + 1) * 6 + 4] = b.xmax; + output[(i + 1) * 6 + 5] = b.ymax; } return SUCCESS; } -EE detectionoutput_general(std::vector inputDesc, std::vector input, DetectionOutputDesc detectionoutputDesc, TensorDesc outputDesc, void* output) +EE detectionoutput_cpu(std::vector inputDesc, + std::vector input, + DetectionOutputParamSpec detectionOutputParamSpec, + TensorDesc outputDesc, + void *output) { UNUSED(outputDesc); if (nullptr == output) { @@ -219,25 +226,27 @@ EE detectionoutput_general(std::vector inputDesc, std::vector } DataType idt0 = inputDesc[0].dt; U32 ilens2 = inputDesc[2].dims[0]; - U32 numclass = detectionoutputDesc.num_class; - F32 nmsthreshold = detectionoutputDesc.nms_threshold; - U32 nmstopk = detectionoutputDesc.nms_top_k; - U32 keeptopk = detectionoutputDesc.keep_top_k; - F32 confidencethreshold = detectionoutputDesc.confidence_threshold; + U32 numclass = detectionOutputParamSpec.num_class; + F32 nmsthreshold = detectionOutputParamSpec.nms_threshold; + U32 nmstopk = detectionOutputParamSpec.nms_top_k; + U32 keeptopk = detectionOutputParamSpec.keep_top_k; + F32 confidencethreshold = detectionOutputParamSpec.confidence_threshold; EE ret = SUCCESS; switch (idt0) { #ifdef _USE_FP32 case DT_F32: - detectionoutput(input, (F32*)output, ilens2, numclass, nmsthreshold, nmstopk, keeptopk, confidencethreshold); + detectionoutput_kernel(input, (F32 *)output, ilens2, numclass, nmsthreshold, nmstopk, + keeptopk, confidencethreshold); break; #endif #ifdef _USE_FP16 case DT_F16: - detectionoutput(input, (F16*)output, ilens2, numclass, nmsthreshold, nmstopk, keeptopk, confidencethreshold); + detectionoutput_kernel(input, (F16 *)output, ilens2, numclass, nmsthreshold, nmstopk, + keeptopk, confidencethreshold); break; #endif - default: + default: ret = NOT_SUPPORTED; - } + } return ret; } diff --git a/compute/tensor/src/cpu/eltwise.cpp b/compute/tensor/src/cpu/eltwise.cpp new file mode 100644 index 00000000..def5b37a --- /dev/null +++ b/compute/tensor/src/cpu/eltwise.cpp @@ -0,0 +1,169 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
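Note on the detection output above: detectionoutput_kernel writes a flat buffer where output[0] carries the detection count, the rest of the first six-float slot is zeroed, and detection i occupies the six floats at (i + 1) * 6 as {label, score, xmin, ymin, xmax, ymax}. A small reader sketch (function name invented):

    #include <cstdio>

    // Hedged sketch: walks the detectionoutput buffer layout described above.
    void printDetections(const float *output)
    {
        int n = (int)output[0];  // output[0] = num_detected
        for (int i = 0; i < n; i++) {
            const float *d = output + (i + 1) * 6;  // {label, score, xmin, ymin, xmax, ymax}
            printf("label %d score %.3f box (%.2f, %.2f, %.2f, %.2f)\n",
                (int)d[0], d[1], d[2], d[3], d[4], d[5]);
        }
    }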
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include "cpu/tensor_computing_cpu.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/tensor_computing_x86.h" +#endif + +static std::vector calculateRelativeLocalIndex_cpu(U32 *indexes, U32 *dims, U32 nDims) +{ + std::vector relativeIndexes(nDims); + for (U32 i = 0; i < nDims; i++) { + relativeIndexes[i] = indexes[i] % dims[i]; + } + return relativeIndexes; +} + +// [1, 10, 10] + [1, 10, 10] = [1, 10, 10] +// [1, 10, 1] + [1, 1, 10] = [1, 10, 10] +// [1, 20, 10] + [10] = [1, 20, 10] + [1, 1, 10] = [1, 20, 10] +EE eltwise_cpu(std::vector inputDesc, + std::vector input_, + EltwiseParamSpec eltwiseDesc, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + Arch arch) +{ + U32 num = inputDesc.size(); + if (num <= 1 || outputDesc.nDims < 1) { + return NOT_MATCH; + } + std::vector input = input_; + U32 nchwc8Count = 0; + U32 minDims = inputDesc[0].nDims; + for (U32 i = 0; i < num; i++) { + if (inputDesc[i].df == DF_NCHWC8) { + nchwc8Count++; + } + if (inputDesc[i].nDims < minDims) { + minDims = inputDesc[i].nDims; + } + } + U8 *ptr = (U8 *)tmp; + if (nchwc8Count > 0 && nchwc8Count != num) { + for (U32 i = 0; i < num; i++) { + if (inputDesc[i].df == DF_NCHWC8) { + TensorDesc tmpDesc = inputDesc[i]; + tmpDesc.df = DF_NCHW; + transformToNCHW(inputDesc[i], input[i], tmpDesc, ptr); + inputDesc[i] = tmpDesc; + input[i] = ptr; + ptr += tensorNumBytes(inputDesc[i]); + // Output from 1D-conv + 3D tensors + if (inputDesc[i].dims[0] == 1 && minDims == 3) { + inputDesc[i] = tensor3df(inputDesc[i].dt, DF_NCHW, + inputDesc[i].dims[3], inputDesc[i].dims[2], inputDesc[i].dims[1]); + } + } + } + } + + I32 oneCount = 0; + for (int i = 0; i < ((int)outputDesc.nDims) - 1; i++) { + if (outputDesc.dims[i] == 1) { + oneCount++; + } else { + break; + } + } + TensorDesc newOutputDesc = outputDesc; + for (int i = 0; i < (int)outputDesc.nDims - oneCount; i++) { + newOutputDesc.dims[i] = outputDesc.dims[oneCount + i]; + } + newOutputDesc.nDims = outputDesc.nDims - oneCount; + + std::vector newInputDesc(num); + for (U32 i = 0; i < num; i++) { + newInputDesc[i] = inputDesc[i]; + for (int j = 0; j < (int)inputDesc[i].nDims - oneCount; j++) { + newInputDesc[i].dims[j] = inputDesc[i].dims[oneCount + j]; + } + newInputDesc[i].nDims = inputDesc[i].nDims - oneCount; + for (U32 j = newInputDesc[i].nDims; j < newOutputDesc.nDims; j++) { + newInputDesc[i].dims[j] = 1; + } + newInputDesc[i].nDims = newOutputDesc.nDims; + } + U32 size = tensorNumElements(newOutputDesc); + int lastDimSize = newOutputDesc.dims[0]; + std::vector lastDimSizes(num); + for (U32 i = 0; i < num; i++) { + lastDimSizes[i] = newInputDesc[i].dims[0]; + if (lastDimSizes[i] != lastDimSize && newInputDesc[0].df == DF_NCHWC8) { + UNI_ERROR_LOG("For NCHWc8, eltwise can only handle inputs with matching widths\n"); + } + } + for (U32 i = 1; i < newOutputDesc.nDims; i++) { + bool sameDim = true; + for (U32 j = 0; j < num; j++) { + if 
(newInputDesc[j].dims[i] != newOutputDesc.dims[i]) { + sameDim = false; + break; + } + } + if (sameDim) { + lastDimSize *= newOutputDesc.dims[i]; + for (U32 j = 0; j < num; j++) { + lastDimSizes[j] *= newInputDesc[j].dims[i]; + } + } else { + break; + } + } + + std::vector newInput(num); + EE ret = NOT_SUPPORTED; + for (U32 i = 0; i < size; i += lastDimSize) { + std::vector index = calculateLocalIndex(i, newOutputDesc.dims, newOutputDesc.nDims); + for (U32 j = 0; j < num; j++) { + std::vector relativeIndex = calculateRelativeLocalIndex_cpu( + index.data(), newInputDesc[j].dims, newInputDesc[j].nDims); + U32 globalIndex = calculateGlobalIndex( + relativeIndex.data(), newInputDesc[j].dims, newInputDesc[j].nDims); + newInput[j] = (U8 *)(input[j]) + globalIndex * bytesOf(newInputDesc[j].dt); + } + U8 *newOutput = (U8 *)output + i * bytesOf(newOutputDesc.dt); + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = eltwise_general(newOutputDesc.dt, newInput, lastDimSizes, num, lastDimSize, + newOutput, eltwiseDesc.elt_mode); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = eltwise_arm(newOutputDesc.dt, newInput, lastDimSizes, num, lastDimSize, newOutput, + eltwiseDesc.elt_mode); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = eltwise_x86(newOutputDesc.dt, newInput, lastDimSizes, num, lastDimSize, newOutput, + eltwiseDesc.elt_mode); +#endif + } + } + if (ret == SUCCESS && eltwiseDesc.activation_type != ACTIVATION_NULL) { + ActivationParamSpec p; + p.mode = eltwiseDesc.activation_type; + ret = activation_cpu(outputDesc, output, p, outputDesc, output, arch); + } + return ret; +} diff --git a/compute/tensor/src/cpu/embedding.cpp b/compute/tensor/src/cpu/embedding.cpp new file mode 100644 index 00000000..6698b5a4 --- /dev/null +++ b/compute/tensor/src/cpu/embedding.cpp @@ -0,0 +1,66 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
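Note on eltwise_cpu above: it broadcasts by right-aligning all descriptors to the output rank, padding missing axes with 1, and mapping each output coordinate to an input coordinate modulo that input's dimension, so 1-sized axes collapse to index 0. A self-contained illustration of the index math for [1, 20, 10] + [10] (dims stored innermost-first, as in TensorDesc):

    #include <cstdio>

    // Hedged sketch of the modulo broadcast used by eltwise_cpu.
    int main()
    {
        unsigned inDims[3] = {10, 1, 1};    // [10] padded up to [1, 1, 10]
        unsigned outIndex[3] = {7, 13, 0};  // an output coordinate in [1, 20, 10]
        unsigned flat = 0, stride = 1;
        for (int i = 0; i < 3; i++) {
            unsigned rel = outIndex[i] % inDims[i];  // 1-sized axes collapse to 0
            flat += rel * stride;
            stride *= inDims[i];
        }
        printf("input flat offset = %u\n", flat);  // 7: only the width index survives
        return 0;
    }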
+ +#include "cpu/tensor_computing_cpu.h" + +EE embedding_cpu(TensorDesc inputDesc, + void *input, + void *weight, + EmbedParamSpec p, + TensorDesc outputDesc, + void *output) +{ + U8 *weightPtr = (U8 *)weight; + U8 *outputPtr = (U8 *)output; + U32 len = tensorNumElements(inputDesc); + U32 elementBytes = bytesOf(outputDesc.dt); + U32 wordEmbeddingCPUBytes = elementBytes * p.num_output; + U32 transposeStride = elementBytes * p.input_dim; + EE ret = SUCCESS; + for (U32 i = 0; i < len; i++) { + U32 wordIndex = 0; + switch (inputDesc.dt) { + case DT_U32: + wordIndex = ((U32 *)input)[i]; + break; + case DT_I32: + wordIndex = ((I32 *)input)[i]; + break; + case DT_F32: + wordIndex = ((F32 *)input)[i]; + break; +#ifdef _USE_FP16 + case DT_F16: + wordIndex = ((F16 *)input)[i]; + break; +#endif + default: + ret = NOT_SUPPORTED; + break; + } + U8 *dest = outputPtr; + if (p.transpose) { + U8 *src = weightPtr + wordIndex * elementBytes; + for (U32 j = 0; j < p.num_output; j++) { + memcpy(dest, src, elementBytes); + src += transposeStride; + dest += elementBytes; + } + } else { + U8 *src = weightPtr + wordIndex * wordEmbeddingCPUBytes; + memcpy(dest, src, wordEmbeddingCPUBytes); + } + outputPtr += wordEmbeddingCPUBytes; + } + return ret; +} diff --git a/tensor_computing/src/cpu/general/attention.cpp b/compute/tensor/src/cpu/general/attention.cpp similarity index 70% rename from tensor_computing/src/cpu/general/attention.cpp rename to compute/tensor/src/cpu/general/attention.cpp index db10ffb0..dc12c890 100644 --- a/tensor_computing/src/cpu/general/attention.cpp +++ b/compute/tensor/src/cpu/general/attention.cpp @@ -1,41 +1,44 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
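Note on embedding_cpu above: it supports two weight layouts, the default [input_dim, num_output] row gather (one contiguous copy per token) and a transposed [num_output, input_dim] layout gathered one element per output channel with a stride of input_dim. A float-only sketch of the two paths (function name invented):

    #include <cstring>

    // Hedged sketch of the two gather paths in embedding_cpu, float weights only.
    void lookup(const float *weight, unsigned inputDim, unsigned numOutput,
        unsigned wordIndex, bool transpose, float *dst)
    {
        if (transpose) {
            // weight is [numOutput, inputDim]: strided gather, one element per channel
            for (unsigned j = 0; j < numOutput; j++) {
                dst[j] = weight[j * inputDim + wordIndex];
            }
        } else {
            // weight is [inputDim, numOutput]: contiguous row copy
            memcpy(dst, weight + (size_t)wordIndex * numOutput, numOutput * sizeof(float));
        }
    }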
- #include "cpu/general/tensor_computing_general.h" #include "cpu/general/general_functions.h" -template -EE attention(U32 batch, U32 numHeads, U32 fromSequenceLength, U32 toSequenceLength, const T *input, T *output) +template +EE attention( + U32 batch, U32 numHeads, U32 fromSequenceLength, U32 toSequenceLength, const T *input, T *output) { - if (nullptr == input || nullptr == output) + if (nullptr == input || nullptr == output) { CHECK_STATUS(NULL_POINTER); + } T minValue = -10000.0; - U32 count = array_sum(input, toSequenceLength); + U32 count = array_sum_template(input, toSequenceLength); U32 valid = UNI_MIN(count, fromSequenceLength); for (U32 n = 0; n < batch; n++) { for (U32 i = 0; i < numHeads; i++) { for (U32 j = 0; j < valid; j++) { for (U32 k = 0; k < toSequenceLength; k++) { - T value = input[n*toSequenceLength + k]; - U32 index = (((n * numHeads + i)*fromSequenceLength + j)*toSequenceLength + k); + T value = input[n * toSequenceLength + k]; + U32 index = + (((n * numHeads + i) * fromSequenceLength + j) * toSequenceLength + k); output[index] = (1 - value) * minValue; } } for (U32 j = valid; j < fromSequenceLength; j++) { for (U32 k = 0; k < toSequenceLength; k++) { - U32 index = (((n * numHeads + i)*fromSequenceLength + j)*toSequenceLength + k); + U32 index = + (((n * numHeads + i) * fromSequenceLength + j) * toSequenceLength + k); output[index] = minValue; } } @@ -51,19 +54,22 @@ EE attention_general(TensorDesc inputDesc, const void *input, TensorDesc outputD U32 batch, numHeads, fromSequenceLength, toSequenceLength; CHECK_REQUIREMENT(tensorIs2d(inputDesc)); CHECK_REQUIREMENT(tensorIs4d(outputDesc)); - CHECK_STATUS(tensor4dGet(outputDesc, &dt, &df, &batch, &numHeads, &fromSequenceLength, &toSequenceLength)); + CHECK_STATUS(tensor4dGet( + outputDesc, &dt, &df, &batch, &numHeads, &fromSequenceLength, &toSequenceLength)); EE ret = SUCCESS; switch (dt) { #ifdef _USE_FP16 case DT_F16: { - ret = attention(batch, numHeads, fromSequenceLength, toSequenceLength, (const F16*)input, (F16*)output); + ret = attention(batch, numHeads, fromSequenceLength, toSequenceLength, + (const F16 *)input, (F16 *)output); break; } #endif #ifdef _USE_FP32 case DT_F32: { - ret = attention(batch, numHeads, fromSequenceLength, toSequenceLength, (const F32*)input, (F32*)output); + ret = attention(batch, numHeads, fromSequenceLength, toSequenceLength, + (const F32 *)input, (F32 *)output); break; } #endif diff --git a/tensor_computing/src/cpu/general/attention_mask.cpp b/compute/tensor/src/cpu/general/attention_mask.cpp similarity index 73% rename from tensor_computing/src/cpu/general/attention_mask.cpp rename to compute/tensor/src/cpu/general/attention_mask.cpp index cf4efbb0..90a45c78 100644 --- a/tensor_computing/src/cpu/general/attention_mask.cpp +++ b/compute/tensor/src/cpu/general/attention_mask.cpp @@ -1,29 +1,32 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
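Note on attention_general above: it expands a per-key 0/1 validity vector into a [batch, numHeads, fromSeqLen, toSeqLen] additive mask, where rows below the valid count get (1 - value) * minValue per key (0 for valid keys, -10000 for padding) and all later rows are masked entirely. A scalar sketch for a single batch and head (names invented):

    // Hedged sketch of the additive-mask rule in attention_general.
    void buildMask(const float *valid01, int fromLen, int toLen, int validRows, float *mask)
    {
        const float minValue = -10000.0f;
        for (int j = 0; j < fromLen; j++) {
            for (int k = 0; k < toLen; k++) {
                // rows < validRows: 0 where the key is valid, minValue where it is padding;
                // rows >= validRows: fully masked
                mask[j * toLen + k] = (j < validRows) ? (1.0f - valid01[k]) * minValue : minValue;
            }
        }
    }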
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include #include "cpu/general/tensor_computing_general.h" - -template -EE attention_mask(TensorDesc inputDesc, const T* input, - I32 attentionLength, bool sameLength, float maskValue, - TensorDesc outputDesc, T* output) +template +static EE attention_mask(TensorDesc inputDesc, + const T *input, + I32 attentionLength, + bool sameLength, + float maskValue, + TensorDesc outputDesc, + T *output) { UNUSED(outputDesc); - if (nullptr == input || nullptr == output) + if (nullptr == input || nullptr == output) { CHECK_STATUS(NULL_POINTER); + } int qlen = inputDesc.dims[1]; int klen = inputDesc.dims[0]; int mlen = klen - qlen; @@ -49,9 +52,10 @@ EE attention_mask(TensorDesc inputDesc, const T* input, } loops = UNI_MAX(loops, 0); start = UNI_MIN(start, klen); - if (start + loops > klen) + if (start + loops > klen) { loops = UNI_MAX(klen - start, 0); - memset(&mask[i][start], 0, sizeof(T)*loops); + } + memset(&mask[i][start], 0, sizeof(T) * loops); } } I32 loops = tensorNumElements(inputDesc) / qlen / klen; @@ -66,24 +70,26 @@ EE attention_mask(TensorDesc inputDesc, const T* input, return SUCCESS; } -EE attention_mask_general(TensorDesc inputDesc, const void* input, - I32 attentionLength, bool sameLength, float mask, - TensorDesc outputDesc, void* output) +EE attention_mask_general(TensorDesc inputDesc, + const void *input, + AttentionMaskParamSpec p, + TensorDesc outputDesc, + void *output) { DataType idt = inputDesc.dt; EE ret = SUCCESS; switch (idt) { #ifdef _USE_FP32 case DT_F32: { - ret = attention_mask(inputDesc, (const F32*)input, - attentionLength, sameLength, mask, outputDesc, (F32*)output); + ret = attention_mask(inputDesc, (const F32 *)input, p.attention_length, + p.same_length, p.mask, outputDesc, (F32 *)output); break; } #endif #ifdef _USE_FP16 case DT_F16: { - ret = 
attention_mask(inputDesc, (const F16*)input, - attentionLength, sameLength, mask, outputDesc, (F16*)output); + ret = attention_mask(inputDesc, (const F16 *)input, p.attention_length, + p.same_length, p.mask, outputDesc, (F16 *)output); break; } #endif diff --git a/tensor_computing/src/cpu/general/check.cpp b/compute/tensor/src/cpu/general/check.cpp similarity index 51% rename from tensor_computing/src/cpu/general/check.cpp rename to compute/tensor/src/cpu/general/check.cpp index 3b17ab43..50ac9a82 100644 --- a/tensor_computing/src/cpu/general/check.cpp +++ b/compute/tensor/src/cpu/general/check.cpp @@ -1,44 +1,59 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
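Note on attention_mask above: per query row it fills the whole [qlen, klen] row with maskValue and then zeroes one clamped contiguous run [start, start + loops), marking the keys that row may attend to; how start and loops derive from attention_length and same_length is in the kernel itself. A sketch of just that clamping step (UNI_MAX/UNI_MIN replaced by their std equivalents):

    #include <algorithm>
    #include <cstring>
    #include <vector>

    // Hedged sketch: zero a clamped run in a row pre-filled with maskValue.
    void zeroWindow(std::vector<std::vector<float>> &mask, int i, int start, int loops, int klen)
    {
        loops = std::max(loops, 0);
        start = std::min(start, klen);
        if (start + loops > klen) {
            loops = std::max(klen - start, 0);  // keep the run inside the row
        }
        if (loops > 0) {
            memset(&mask[i][start], 0, sizeof(float) * loops);
        }
    }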
- -#include #include "cpu/general/tensor_computing_general.h" -template -EE check(TensorDesc inputDescA, const T* inputA, - TensorDesc inputDescB, const T* inputB, +template +static EE check(TensorDesc inputDescA, + const T *inputA, + TensorDesc inputDescB, + const T *inputB, CheckMode checkMode, - TensorDesc outputDesc, I32* output) + TensorDesc outputDesc, + I32 *output) { UNUSED(inputDescB); UNUSED(outputDesc); - if (nullptr == inputA || nullptr == inputB || nullptr == output) + if (nullptr == inputA || nullptr == inputB || nullptr == output) { CHECK_STATUS(NULL_POINTER); + } U32 size = tensorNumElements(inputDescA); - U32 loopOuter = inputDescA.dims[inputDescA.nDims-1]; + U32 loopOuter = inputDescA.dims[inputDescA.nDims - 1]; U32 loopInner = size / loopOuter; - + for (U32 i = 0; i < loopOuter; i++) { U32 count = 0; for (U32 j = 0; j < loopInner; j++) { U32 index = i * loopInner + j; switch (checkMode) { case CHECK_EQUAL: { - if (inputA[index] == inputB[index]) - count ++; + if (inputA[index] == inputB[index]) { + count++; + } + break; + } + case CHECK_GREATEQUAL: { + if (inputA[index] >= inputB[index]) { + count++; + } + break; + } + case CHECK_GREAT: { + if (inputA[index] > inputB[index]) { + count++; + } break; } default: @@ -46,45 +61,51 @@ EE check(TensorDesc inputDescA, const T* inputA, break; } } - switch (checkMode) { - case CHECK_EQUAL: { - if (count == loopInner) - output[i] = 1; - else - output[i] = 0; - break; - } - default: - break; + + if (count == loopInner) { + output[i] = 1; + } else { + output[i] = 0; } } return SUCCESS; } -EE check_general(TensorDesc inputDescA, const void* inputA, - TensorDesc inputDescB, const void* inputB, - CheckMode checkMode, - TensorDesc outputDesc, void* output) +EE check_general(TensorDesc inputDescA, + const void *inputA, + TensorDesc inputDescB, + const void *inputB, + CheckParamSpec p, + TensorDesc outputDesc, + void *output) { DataType idt = inputDescA.dt; EE ret = SUCCESS; switch (idt) { #ifdef _USE_FP16 case DT_F16: { - ret = check(inputDescA, (const F16*)inputA, - inputDescB, (const F16*)inputB, - checkMode, outputDesc, (I32*)output); + ret = check(inputDescA, (const F16 *)inputA, inputDescB, (const F16 *)inputB, + p.check_mode, outputDesc, (I32 *)output); break; } #endif #ifdef _USE_FP32 case DT_F32: { - ret = check(inputDescA, (const F32*)inputA, - inputDescB, (const F32*)inputB, - checkMode, outputDesc, (I32*)output); + ret = check(inputDescA, (const F32 *)inputA, inputDescB, (const F32 *)inputB, + p.check_mode, outputDesc, (I32 *)output); break; } #endif + case DT_U32: { + ret = check(inputDescA, (const U32 *)inputA, inputDescB, (const U32 *)inputB, + p.check_mode, outputDesc, (I32 *)output); + break; + } + case DT_I32: { + ret = check(inputDescA, (const I32 *)inputA, inputDescB, (const I32 *)inputB, + p.check_mode, outputDesc, (I32 *)output); + break; + } default: ret = NOT_SUPPORTED; break; diff --git a/tensor_computing/src/cpu/general/clip.cpp b/compute/tensor/src/cpu/general/clip.cpp similarity index 72% rename from tensor_computing/src/cpu/general/clip.cpp rename to compute/tensor/src/cpu/general/clip.cpp index 62c128b5..a627a24e 100644 --- a/tensor_computing/src/cpu/general/clip.cpp +++ b/compute/tensor/src/cpu/general/clip.cpp @@ -1,24 +1,24 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
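Note on check_general above: it reduces along all inner dimensions, counting elementwise comparisons that hold under the selected mode (equal, greater-equal, greater) and writing 1 only when the entire inner run passes. An integer sketch of the CHECK_EQUAL case (names invented):

    // Hedged sketch of the per-row all-match reduction in check_general.
    void checkEqualRows(const int *a, const int *b, int loopOuter, int loopInner, int *out)
    {
        for (int i = 0; i < loopOuter; i++) {
            int count = 0;
            for (int j = 0; j < loopInner; j++) {
                int idx = i * loopInner + j;
                if (a[idx] == b[idx]) {
                    count++;
                }
            }
            out[i] = (count == loopInner) ? 1 : 0;  // 1 only if every element matched
        }
    }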
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include "cpu/general/tensor_computing_general.h" -template -EE clip(T* input, T* output, U32 len, F32 min_value, F32 max_value) { - if (nullptr == input - || nullptr == output) +template +static EE clip(T *input, T *output, U32 len, F32 min_value, F32 max_value) +{ + if (nullptr == input || nullptr == output) { CHECK_STATUS(NULL_POINTER); + } for (U32 i = 0; i < len; i++) { F32 value = input[i]; @@ -29,25 +29,21 @@ EE clip(T* input, T* output, U32 len, F32 min_value, F32 max_value) { return SUCCESS; } -EE clip_general(void *minValue, void *maxValue, TensorDesc inputDesc, void* input, TensorDesc outputDesc, void *output) +EE clip_general( + TensorDesc inputDesc, void *input, ClipParamSpec p, TensorDesc outputDesc, void *output) { UNUSED(outputDesc); - - if (nullptr == minValue - || nullptr == maxValue) - CHECK_STATUS(NULL_POINTER); - EE ret = SUCCESS; switch (inputDesc.dt) { #ifdef _USE_FP32 case DT_F32: { - ret = clip((F32 *)input, (F32 *)output, tensorNumElements(inputDesc), *((F32 *)minValue), *((F32 *)maxValue)); + ret = clip((F32 *)input, (F32 *)output, tensorNumElements(inputDesc), p.min, p.max); break; } #endif #ifdef _USE_FP16 case DT_F16: { - ret = clip((F16 *)input, (F16 *)output, tensorNumElements(inputDesc), *((F32 *)minValue), *((F32 *)maxValue)); + ret = clip((F16 *)input, (F16 *)output, tensorNumElements(inputDesc), p.min, p.max); break; } #endif diff --git a/compute/tensor/src/cpu/general/convolution.cpp b/compute/tensor/src/cpu/general/convolution.cpp new file mode 100644 index 00000000..9179307e --- /dev/null +++ b/compute/tensor/src/cpu/general/convolution.cpp @@ -0,0 +1,209 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
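Note on clip_general above: it now receives its bounds by value in ClipParamSpec (the old minValue/maxValue pointer arguments and their null checks are gone), and each element is clamped to [p.min, p.max]. The equivalent scalar operation, as a sketch:

    // Hedged sketch of the per-element clamp applied by clip_general.
    void clipRange(const float *in, float *out, unsigned len, float minV, float maxV)
    {
        for (unsigned i = 0; i < len; i++) {
            float v = in[i];
            out[i] = (v < minV) ? minV : ((v > maxV) ? maxV : v);
        }
    }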
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include +#include "types.h" +#include "cpu/general/tensor_computing_general.h" +#include "cpu/general/general_functions.h" + +template +inline EE convolution(TensorDesc inputDesc, + T1 *inArray, + TensorDesc filterDesc, + const T2 *filterArray, + ConvolutionParamSpec convParamSpec, + const T3 *biasArray, + const T4 *scaleArray, + TensorDesc outputDesc, + T4 *outArray, + ActivationParamSpec activationDesc, + T1 paddingValue = 0) +{ + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 group = convParamSpec.group; + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingL = convParamSpec.padding_left; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + U32 ocGroupSize = oc / group; + CHECK_REQUIREMENT(fdf == DF_NCHW); + + // For BNN, accumulated values are always 0 or 1, which may lead to error if buf is floating point. 
+ U32 ic8 = ic / 8; + U32 oc8 = oc / 8; + for (U32 n = 0; n < in; n++) { + for (U32 o = 0; o < oc; o++) { + for (U32 h = 0; h < oh; h++) { + for (U32 w = 0; w < ow; w++) { + T3 value = 0; + U32 groupId = o / ocGroupSize; + U32 icStart = groupId * fc; + U32 icEnd = (groupId + 1) * fc; + for (U32 c = icStart, f_off = o * fc * fh * fw; c < icEnd; c++) { + for (I32 fh_idx = 0; fh_idx < (I32)fh; fh_idx++) { + for (I32 fw_idx = 0; fw_idx < (I32)fw; fw_idx++, f_off++) { + I32 ih_idx = h * strideH - paddingT + fh_idx * dilateH; + I32 iw_idx = w * strideW - paddingL + fw_idx * dilateW; + if (ih_idx >= 0 && ih_idx < (I32)ih && iw_idx >= 0 && + iw_idx < (I32)iw) { + U32 i_off; + if (idf == DF_NCHW) { + i_off = ((n * ic + c) * ih + ih_idx) * iw + iw_idx; + } else { + i_off = + (((n * ic8 + (c / 8)) * ih + ih_idx) * iw + iw_idx) * 8 + + c % 8; + } + value += inArray[i_off] * filterArray[f_off]; + } else { + value += paddingValue * filterArray[f_off]; + } + } + } + } + U32 o_off; + if (odf == DF_NCHW) { + o_off = ((n * oc + o) * oh + h) * ow + w; + } else { + o_off = (((n * oc8 + (o / 8)) * oh + h) * ow + w) * 8 + o % 8; + } + + T4 scale = 1; + if (scaleArray != nullptr) { + scale = scaleArray[o]; + } + outArray[o_off] = scale * value + biasArray[o]; + CHECK_STATUS( + activation_template(activationDesc, outArray[o_off], &outArray[o_off])); + } + } + } + } + return SUCCESS; +} + +#ifdef _USE_FP16 +void bnn_input_process(TensorDesc inputDesc, F16 *input, DataType fdt, short *output) +{ + F16 centerValue = 0.0; + if (fdt == DT_BIN01) { + centerValue = 0.5; + } + short zeroValue = 0; + if (fdt == DT_BIN11) { + zeroValue = -1; + } + U32 len = tensorNumElements(inputDesc); + for (U32 i = 0; i < len; i++) { + if (input[i] >= centerValue) { + output[i] = 1; + } else { + output[i] = zeroValue; + } + } +} + +void bnn_filter_process(TensorDesc filterDesc, BIN8 *filter, short *filterTransformed) +{ + short zeroValue = 0; + if (filterDesc.dt == DT_BIN11) { + zeroValue = -1; + } + U32 len = tensorNumElements(filterDesc); + for (U32 i = 0; i < len; i++) { + U32 bitSlot = i / 8; + U32 bitNo = 7 - (i % 8); + std::bitset<8> Q(filter[bitSlot]); + if (Q.test(bitNo)) { + filterTransformed[i] = 1; + } else { + filterTransformed[i] = zeroValue; + } + } +} +#endif + +EE convolution_general(TensorDesc inputDesc, + void *input, + TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + TensorDesc scaleDesc, + const void *scale, + TensorDesc biasDesc, + const void *bias, + TensorDesc outputDesc, + void *output, + ActivationParamSpec activationDesc) +{ + UNUSED(scaleDesc); + UNUSED(biasDesc); + + EE ret = SUCCESS; + switch (filterDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: + ret = convolution(inputDesc, (F32 *)input, filterDesc, (F32 *)filter, + convParamSpec, (F32 *)bias, (F32 *)scale, outputDesc, (F32 *)output, activationDesc); + break; +#endif +#ifdef _USE_FP16 + case DT_F16: + ret = convolution(inputDesc, (F16 *)input, filterDesc, (F16 *)filter, + convParamSpec, (F16 *)bias, (F16 *)scale, outputDesc, (F16 *)output, activationDesc); + break; +#endif +#ifdef _USE_INT8 + case DT_I8: + ret = convolution(inputDesc, (INT8 *)input, filterDesc, + (F16 *)filter, convParamSpec, (F16 *)bias, (F16 *)scale, outputDesc, (F16 *)output, + activationDesc); + break; +#endif +#ifdef _USE_FP16 + case DT_BIN01: { + std::vector inputTransformed(tensorNumElements(inputDesc)); + std::vector filterTransformed(tensorNumElements(filterDesc)); + bnn_input_process(inputDesc, (F16 *)input, filterDesc.dt, 
inputTransformed.data()); + bnn_filter_process(filterDesc, (BIN8 *)filter, filterTransformed.data()); + ret = convolution(inputDesc, inputTransformed.data(), + filterDesc, filterTransformed.data(), convParamSpec, (F16 *)bias, (F16 *)scale, + outputDesc, (F16 *)output, activationDesc, 0); + break; + } + case DT_BIN11: { + std::vector inputTransformed(tensorNumElements(inputDesc)); + std::vector filterTransformed(tensorNumElements(filterDesc)); + bnn_input_process(inputDesc, (F16 *)input, filterDesc.dt, inputTransformed.data()); + bnn_filter_process(filterDesc, (BIN8 *)filter, filterTransformed.data()); + ret = convolution(inputDesc, inputTransformed.data(), + filterDesc, filterTransformed.data(), convParamSpec, (F16 *)bias, (F16 *)scale, + outputDesc, (F16 *)output, activationDesc, -1); + break; + } +#endif + default: + return NOT_SUPPORTED; + } + return ret; +} diff --git a/compute/tensor/src/cpu/general/deconvolution.cpp b/compute/tensor/src/cpu/general/deconvolution.cpp new file mode 100644 index 00000000..7ff796fe --- /dev/null +++ b/compute/tensor/src/cpu/general/deconvolution.cpp @@ -0,0 +1,150 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
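Note on the BNN paths in convolution_general above: activations are binarized around 0.5 (DT_BIN01, values {0, 1}) or 0 (DT_BIN11, values {-1, +1}), and bit-packed filters are unpacked MSB-first, eight weights per BIN8 byte. A standalone sketch of the unpacking with a checked example:

    #include <bitset>
    #include <cstdio>

    // Hedged sketch of bnn_filter_process's MSB-first unpacking.
    // zeroValue is 0 for DT_BIN01 and -1 for DT_BIN11.
    void unpackBits(const unsigned char *packed, unsigned len, short zeroValue, short *out)
    {
        for (unsigned i = 0; i < len; i++) {
            std::bitset<8> q(packed[i / 8]);
            out[i] = q.test(7 - (i % 8)) ? 1 : zeroValue;  // bit 7 first, as in the patch
        }
    }

    int main()
    {
        unsigned char byte = 0xB0;    // 1011 0000, MSB first
        short w[8];
        unpackBits(&byte, 8, -1, w);  // DT_BIN11 convention
        for (int i = 0; i < 8; i++) {
            printf("%d ", w[i]);      // prints: 1 -1 1 1 -1 -1 -1 -1
        }
        printf("\n");
        return 0;
    }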
+ +#include "types.h" +#include "cpu/general/tensor_computing_general.h" +#include "cpu/general/general_functions.h" + +template +inline EE deconvolution(TensorDesc inputDesc, + T *inArray, + TensorDesc filterDesc, + const T *filterArray, + ConvolutionParamSpec convParamSpec, + const T *biasArray, + TensorDesc outputDesc, + T *outArray, + ActivationParamSpec activationDesc) +{ + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 group = convParamSpec.group; + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingL = convParamSpec.padding_left; + U32 ocGroupSize = oc / group; + + // initialize outputs to 0 + memset(outArray, 0, tensorNumBytes(outputDesc)); + U32 ic8 = ic / 8; + U32 oc8 = oc / 8; + for (U32 n = 0; n < in; n++) { + for (U32 o = 0; o < oc; o++) { + U32 groupId = o / ocGroupSize; + U32 icStart = groupId * fn; + U32 icEnd = (groupId + 1) * fn; + for (U32 c = icStart; c < icEnd; c++) { + for (U32 h = 0; h < ih; h++) { + for (U32 w = 0; w < iw; w++) { + U32 i_off; + if (idf == DF_NCHW) { + i_off = ((n * ic + c) * ih + h) * iw + w; + } else { + i_off = (((n * ic8 + (c / 8)) * ih + h) * iw + w) * 8 + c % 8; + } + for (I32 fh_idx = 0; fh_idx < (I32)fh; fh_idx++) { + for (I32 fw_idx = 0; fw_idx < (I32)fw; fw_idx++) { + I32 oh_idx = fh_idx + strideH * h - paddingT; + I32 ow_idx = fw_idx + strideW * w - paddingL; + if (oh_idx >= 0 && oh_idx < (I32)oh && ow_idx >= 0 && + ow_idx < (I32)ow) { + U32 o_off; + if (odf == DF_NCHW) { + o_off = ((n * oc + o) * oh + oh_idx) * ow + ow_idx; + } else { + o_off = + (((n * oc8 + (o / 8)) * oh + oh_idx) * ow + ow_idx) * 8 + + o % 8; + } + U32 f_off = + (((c - icStart) * fc + o) * fh + fh_idx) * fw + fw_idx; + outArray[o_off] += inArray[i_off] * filterArray[f_off]; + } + } + } + } + } + } + } + } + // bias + U32 ohow = oh * ow; + for (U32 i = 0; i < tensorNumElements(outputDesc); i++) { + U32 o; + if (odf == DF_NCHW) { + o = (i / ohow) % oc; + } else { + o = (i / (ohow * 8)) % oc8 * 8 + i % 8; + } + outArray[i] += biasArray[o]; + switch (activationDesc.mode) { + case ACTIVATION_NULL: { + break; + } + case ACTIVATION_RELU: { + F32 tmp = activationDesc.value[0] * outArray[i]; + if (outArray[i] < tmp) { + outArray[i] = tmp; + } + break; + } + default: + return NOT_SUPPORTED; + } + } + return SUCCESS; +} + +EE deconvolution_general(TensorDesc inputDesc, + void *input, + TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + TensorDesc scaleDesc, + const void *scale, + TensorDesc biasDesc, + const void *bias, + TensorDesc outputDesc, + void *output, + ActivationParamSpec activationDesc) +{ + UNUSED(scaleDesc); + UNUSED(scale); + UNUSED(biasDesc); + + EE ret = SUCCESS; + switch (inputDesc.dt) { +#ifdef _USE_FP16 + case DT_F16: + ret = deconvolution(inputDesc, (F16 *)input, filterDesc, (F16 *)filter, + convParamSpec, (F16 *)bias, outputDesc, (F16 *)output, activationDesc); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + ret = deconvolution(inputDesc, (F32 *)input, filterDesc, (F32 *)filter, + convParamSpec, (F32 *)bias, outputDesc, (F32 *)output, activationDesc); + break; +#endif + default: + return NOT_SUPPORTED; + } + return ret; +} diff --git 
a/compute/tensor/src/cpu/general/depthwise_convolution.cpp b/compute/tensor/src/cpu/general/depthwise_convolution.cpp new file mode 100644 index 00000000..e787ed44 --- /dev/null +++ b/compute/tensor/src/cpu/general/depthwise_convolution.cpp @@ -0,0 +1,34 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/general/tensor_computing_general.h" + +EE depthwise_convolution_general(TensorDesc inputDesc, + void *input, + TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec depthwiseActivationParamSpec) +{ + TensorDesc blankTensorDesc; + ActivationParamSpec blankActivationParamSpec; + return depthwise_pointwise_convolution_general(inputDesc, input, filterDesc, filter, + blankTensorDesc, nullptr, convParamSpec, blankTensorDesc, bias, biasDesc, nullptr, tmpBytes, + tmp, outputDesc, output, depthwiseActivationParamSpec, blankActivationParamSpec); +} diff --git a/compute/tensor/src/cpu/general/depthwise_pointwise_convolution.cpp b/compute/tensor/src/cpu/general/depthwise_pointwise_convolution.cpp new file mode 100644 index 00000000..4fcddbcd --- /dev/null +++ b/compute/tensor/src/cpu/general/depthwise_pointwise_convolution.cpp @@ -0,0 +1,191 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
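Note on depthwise_convolution_general above: it simply forwards to the fused depthwise-pointwise kernel (defined next) with a null pointwise filter. Inside the fused kernel, pwFilterArray == nullptr disables fusion, so the depthwise result goes straight to the output; otherwise it is staged in tmp (ic * oh * ow accumulators, I32-sized when the filter type is INT8) and reduced by the 1x1 convolution. A one-liner capturing that staging decision (names invented):

    // Hedged sketch of the staging choice inside the fused general kernel.
    template <typename T>
    T *stagingBuffer(T *tmp, T *output, const void *pwFilter)
    {
        return (pwFilter != nullptr) ? tmp : output;  // mirrors pwArray in the patch
    }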
+ +#include "types.h" +#include "cpu/general/tensor_computing_general.h" +#include "cpu/general/general_functions.h" + +EE depthwise_pointwise_convolution_infer_forward_tmp_bytes_general(TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + U32 *bytes) +{ + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + U32 elementSize = bytesOf(fdt); + if (fdt == DT_I8) { + elementSize = bytesOf(DT_I32); + } + *bytes = ic * oh * ow * elementSize; + return SUCCESS; +} + +template +inline EE depthwise_pointwise_convolution(TensorDesc inputDesc, + T1 *inArray, + TensorDesc dwFilterDesc, + const T2 *dwFilterArray, + TensorDesc pwFilterDesc, + const T2 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + const T3 *dwBiasArray, + const T3 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + T3 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec) +{ + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingL = convParamSpec.padding_left; + bool fuseDepthwisePointwise = (pwFilterArray == nullptr) ? 
false : true; + + T3 *pwArray; + if (fuseDepthwisePointwise) { + CHECK_REQUIREMENT(tmpBytes >= ic * oh * ow * sizeof(T3)); + pwArray = (T3 *)tmp; + } else { + pwArray = outArray; + } + U32 ic8 = ic / 8; + U32 oc8 = oc / 8; + for (U32 n = 0; n < in; n++) { + // dw conv + for (U32 c = 0, pw_off = 0; c < ic; c++) { + for (U32 h = 0; h < oh; h++) { + for (U32 w = 0; w < ow; w++, pw_off++) { + T3 value = dwBiasArray[c]; + for (U32 fh_idx = 0; fh_idx < fh; fh_idx++) { + for (U32 fw_idx = 0; fw_idx < fw; fw_idx++) { + I32 ih_idx = h * strideH - paddingT + fh_idx; + I32 iw_idx = w * strideW - paddingL + fw_idx; + if (ih_idx >= 0 && ih_idx < (I32)ih && iw_idx >= 0 && iw_idx < (I32)iw) { + U32 i_off; + if (idf == DF_NCHW) { + i_off = ((n * ic + c) * ih + ih_idx) * iw + iw_idx; + } else { + i_off = (((n * ic8 + (c / 8)) * ih + ih_idx) * iw + iw_idx) * 8 + + c % 8; + } + value += inArray[i_off] * + dwFilterArray[c * fh * fw + fh_idx * fw + fw_idx]; + } + } + } + CHECK_STATUS( + activation_template(depthwiseActivationParamSpec, value, &value)); + + if (fuseDepthwisePointwise || odf == DF_NCHW) { + pwArray[pw_off] = value; + } else { + pwArray[(((n * ic8 + (c / 8)) * oh + h) * ow + w) * 8 + c % 8] = value; + } + } + } + } + if (fuseDepthwisePointwise) { + // pw conv + for (U32 o = 0; o < oc; o++) { + for (U32 hw = 0; hw < oh * ow; hw++) { + T3 value = pwBiasArray[o]; + for (U32 c = 0; c < ic; c++) { + U32 pw_off = c * oh * ow + hw; + value += pwArray[pw_off] * pwFilterArray[o * ic + c]; + } + CHECK_STATUS( + activation_template(pointwiseActivationParamSpec, value, &value)); + U32 o_off; + if (odf == DF_NCHW) { + o_off = (n * oc + o) * oh * ow + hw; + } else { + o_off = ((n * oc8 + (o / 8)) * oh * ow + hw) * 8 + o % 8; + } + outArray[o_off] = value; + } + } + } + } + return SUCCESS; +} + +EE depthwise_pointwise_convolution_general(TensorDesc inputDesc, + void *input, + TensorDesc dwFilterDesc, + const void *dwFilter, + TensorDesc pwFilterDesc, + const void *pwFilter, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const void *dwBias, + TensorDesc pwBiasDesc, + const void *pwBias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { +#ifdef _USE_FP16 + case DT_F16: + ret = depthwise_pointwise_convolution(inputDesc, (F16 *)input, + dwFilterDesc, (F16 *)dwFilter, pwFilterDesc, (F16 *)pwFilter, convParamSpec, + (F16 *)dwBias, (F16 *)pwBias, tmpBytes, tmp, outputDesc, (F16 *)output, + depthwiseActivationParamSpec, pointwiseActivationParamSpec); + break; +#endif +#ifdef _USE_INT8 + case DT_I8: + ret = depthwise_pointwise_convolution(inputDesc, (INT8 *)input, + dwFilterDesc, (INT8 *)dwFilter, pwFilterDesc, (INT8 *)pwFilter, convParamSpec, + (I32 *)dwBias, (I32 *)pwBias, tmpBytes, tmp, outputDesc, (I32 *)output, + depthwiseActivationParamSpec, pointwiseActivationParamSpec); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + ret = depthwise_pointwise_convolution(inputDesc, (F32 *)input, + dwFilterDesc, (F32 *)dwFilter, pwFilterDesc, (F32 *)pwFilter, convParamSpec, + (F32 *)dwBias, (F32 *)pwBias, tmpBytes, tmp, outputDesc, (F32 *)output, + depthwiseActivationParamSpec, pointwiseActivationParamSpec); + break; +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/general/eltwise.cpp b/compute/tensor/src/cpu/general/eltwise.cpp new file mode 100644 index 00000000..a62a7da2 --- 
/dev/null +++ b/compute/tensor/src/cpu/general/eltwise.cpp @@ -0,0 +1,88 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/general/tensor_computing_general.h" + +template <typename T> +T getFloatScalar(void *input, int inputSize, int index) +{ + int local = index % inputSize; + return ((T *)input)[local]; +} + +template <typename T> +EE eltwise_general_kernel(std::vector<void *> input, + std::vector<U32> inputSize, + U32 num, + U32 len, + void *output, + EltwiseMode eltwiseMode) +{ + T *output_ptr = (T *)output; + for (U32 i = 0; i < len; i++) { + F32 tmp_s = getFloatScalar<T>(input[0], inputSize[0], i); + for (U32 j = 1; j < num; j++) { + F32 value_s = getFloatScalar<T>(input[j], inputSize[j], i); + switch (eltwiseMode) { + case ELTWISE_SUM: + tmp_s = value_s + tmp_s; + break; + case ELTWISE_MAX: + tmp_s = (value_s > tmp_s) ? value_s : tmp_s; + break; + case ELTWISE_PROD: + tmp_s *= value_s; + break; + case ELTWISE_SUB: + tmp_s -= value_s; + break; + case ELTWISE_DIV: + tmp_s /= value_s; + break; + default: + return NOT_SUPPORTED; + } + } + output_ptr[i] = tmp_s; + } + return SUCCESS; +} + +EE eltwise_general(DataType dataType, + std::vector<void *> input, + std::vector<U32> inputSize, + U32 num, + U32 len, + void *output, + EltwiseMode eltwiseMode) +{ + EE ret = SUCCESS; + switch (dataType) { +#ifdef _USE_FP32 + case DT_F32: { + ret = eltwise_general_kernel<F32>(input, inputSize, num, len, output, eltwiseMode); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = eltwise_general_kernel<F16>(input, inputSize, num, len, output, eltwiseMode); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/general/general_functions.h b/compute/tensor/src/cpu/general/general_functions.h new file mode 100644 index 00000000..2b886db3 --- /dev/null +++ b/compute/tensor/src/cpu/general/general_functions.h @@ -0,0 +1,274 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_GENERAL_FUNCTIONS +#define _H_GENERAL_FUNCTIONS + +#include "cpu/cpu_functions_template.h" + +template <typename T> +inline EE from_nchwc8_to_nchw(TensorDesc *desc, T *data) +{ + if (desc == nullptr || data == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + + DataType idt; + DataFormat idf; + U32 in, ic, ih, iw; + CHECK_STATUS(tensor4dGet(*desc, &idt, &idf, &in, &ic, &ih, &iw)); + if (idf != DF_NCHWC8) { + CHECK_STATUS(NOT_MATCH); + } + + *desc = tensor4df(idt, DF_NCHW, in, ic, ih, iw); + + T *tmp = (T *)malloc(tensorNumBytes(*desc)); + ic /= 8; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 hw = 0; hw < ih * iw; hw++) { + for (U32 c8 = 0; c8 < 8; c8++) { + tmp[n * ic * 8 * ih * iw + (c * 8 + c8) * ih * iw + hw] = + data[n * ic * ih * iw * 8 + c * ih * iw * 8 + hw * 8 + c8]; + } + } + } + } + memcpy(data, tmp, tensorNumBytes(*desc)); + free(tmp); + return SUCCESS; +} + +template <typename T> +inline EE from_nchw_to_nchwc8(TensorDesc *desc, T *data) +{ + if (desc == nullptr || data == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + + DataType idt; + DataFormat idf; + U32 in, ic, ih, iw; + CHECK_STATUS(tensor4dGet(*desc, &idt, &idf, &in, &ic, &ih, &iw)); + if (idf != DF_NCHW) { + CHECK_STATUS(NOT_MATCH); + } + + *desc = tensor4df(idt, DF_NCHWC8, in, ic, ih, iw); + + T *tmp = (T *)malloc(tensorNumBytes(*desc)); + ic /= 8; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 hw = 0; hw < ih * iw; hw++) { + for (U32 c8 = 0; c8 < 8; c8++) { + tmp[n * ic * ih * iw * 8 + c * ih * iw * 8 + hw * 8 + c8] = + data[n * ic * 8 * ih * iw + (c * 8 + c8) * ih * iw + hw]; + } + } + } + } + memcpy(data, tmp, tensorNumBytes(*desc)); + free(tmp); + return SUCCESS; +} + +inline F32 array_mean_general(DataType dt, const void *data, I32 len) +{ + F32 result = 0; + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + result = array_mean_template((const F16 *)data, len); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + result = array_mean_template((const F32 *)data, len); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + return result; +} + +inline F32 array_var_general(DataType dt, const void *data, I32 len, F32 mean) +{ + F32 result = 0; + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + result = array_var_template((const F16 *)data, len, mean); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + result = array_var_template((const F32 *)data, len, mean); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED);
+ break; + } + return result; +} + +inline void array_power_general(DataType dt, void *input, void *output, I32 len, F32 power) +{ + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + array_power_template((F16 *)input, (F16 *)output, len, power); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + array_power_template((F32 *)input, (F32 *)output, len, power); + break; +#endif + case DT_I32: + array_power_template((I32 *)input, (I32 *)output, len, power); + break; + case DT_U32: + array_power_template((U32 *)input, (U32 *)output, len, power); + break; + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } +} + +inline void array_add_general( + DataType dt, const void *inputA, const void *inputB, void *output, I32 len) +{ + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + array_add_template((const F16 *)inputA, (const F16 *)inputB, (F16 *)output, len); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + array_add_template((const F32 *)inputA, (const F32 *)inputB, (F32 *)output, len); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } +} + +inline void array_scale_general( + DataType dt, const void *input, void *output, I32 len, F32 alpha, F32 beta) +{ + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + array_scale_template((const F16 *)input, (F16 *)output, len, alpha, beta); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + array_scale_template((const F32 *)input, (F32 *)output, len, alpha, beta); + break; +#endif + case DT_I32: + array_scale_template((const I32 *)input, (I32 *)output, len, alpha, beta); + break; + case DT_U32: + array_scale_template((const U32 *)input, (U32 *)output, len, alpha, beta); + break; + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } +} + +inline F32 array_sum_general(DataType dt, const void *data, I32 len) +{ + F32 result = 0; + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + result = array_sum_template((const F16 *)data, len); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + result = array_sum_template((const F32 *)data, len); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + return result; +} + +inline void array_square_and_add_general( + DataType dt, const void *inputA, const void *inputB, void *output, I32 len) +{ + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + array_square_and_add_template( + (const F16 *)inputA, (const F16 *)inputB, (F16 *)output, len); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + array_square_and_add_template( + (const F32 *)inputA, (const F32 *)inputB, (F32 *)output, len); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } +} + +inline EE array_activation_general( + DataType dt, void *input, U32 len, ActivationParamSpec activationDesc, void *output) +{ + EE ret = SUCCESS; + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: { + F16 *inPtr = (F16 *)input; + F16 *outPtr = (F16 *)output; + for (U32 i = 0; i < len; i++) { + activation_template(activationDesc, inPtr[i], &outPtr[i]); + } + break; + } +#endif +#ifdef _USE_FP32 + case DT_F32: { + F32 *inPtr = (F32 *)input; + F32 *outPtr = (F32 *)output; + for (U32 i = 0; i < len; i++) { + activation_template(activationDesc, inPtr[i], &outPtr[i]); + } + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} +#endif diff --git a/tensor_computing/src/cpu/general/normalization.cpp b/compute/tensor/src/cpu/general/normalization.cpp similarity index 57% rename from tensor_computing/src/cpu/general/normalization.cpp rename to compute/tensor/src/cpu/general/normalization.cpp index 
871dbb7f..793ebd7b 100644 --- a/tensor_computing/src/cpu/general/normalization.cpp +++ b/compute/tensor/src/cpu/general/normalization.cpp @@ -1,74 +1,77 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
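The layer-normalization kernel that follows normalizes each innermost row to zero mean and unit variance (with eps = 1e-6) and then applies a learned per-element scale and shift. A scalar sketch of the same computation, assuming plain float data and hypothetical names:

    #include <cmath>

    // Normalize one row of `len` floats, matching the formula used below:
    // out[i] = alpha[i] * (in[i] - mean) / sqrt(var + eps) + beta[i]
    void layer_norm_row(
        const float *in, float *out, const float *alpha, const float *beta, int len)
    {
        float mean = 0, var = 0;
        for (int i = 0; i < len; i++) {
            mean += in[i];
        }
        mean /= len;
        for (int i = 0; i < len; i++) {
            var += (in[i] - mean) * (in[i] - mean);
        }
        var /= len;
        float inv_std = 1.0f / sqrtf(var + 1e-6f);
        for (int i = 0; i < len; i++) {
            out[i] = alpha[i] * (in[i] - mean) * inv_std + beta[i];
        }
    }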
- #include <math.h> #include "cpu/general/general_functions.h" #include "cpu/general/tensor_computing_general.h" -template<typename T> -inline EE array_norm_scale(T *input, T *output, I32 len, F32 mean, F32 var, T *alpha, T *beta) { +template <typename T> +inline EE array_norm_scale_template( + T *input, T *output, I32 len, F32 mean, F32 var, T *alpha, T *beta) +{ F32 eps = 1e-6; F32 std_value = sqrt(var + eps); - for(I32 i = 0; i < len; i++){ + for (I32 i = 0; i < len; i++) { output[i] = alpha[i] * (input[i] - mean) / std_value + beta[i]; } return SUCCESS; } -template<typename T> -inline EE layer_normalization(T *alpha, T *beta, - TensorDesc inputDesc, T* input, - TensorDesc outputDesc, T* output) +template <typename T> +inline EE layer_normalization_template( + TensorDesc inputDesc, T *input, T *alpha, T *beta, TensorDesc outputDesc, T *output) { - if (nullptr == input || nullptr == output) + if (nullptr == input || nullptr == output) { CHECK_STATUS(NULL_POINTER); - if(inputDesc.dt != outputDesc.dt || inputDesc.df != outputDesc.df) + } + if (inputDesc.dt != outputDesc.dt || inputDesc.df != outputDesc.df) { CHECK_STATUS(NOT_MATCH); + } U32 size = tensorNumElements(inputDesc); I32 size_inner = inputDesc.dims[0]; I32 size_outer = size / size_inner; - for(I32 i = 0; i < size_outer; i++) { + for (I32 i = 0; i < size_outer; i++) { T *current_input = input + i * size_inner; T *current_output = output + i * size_inner; - F32 mean = array_mean(current_input, size_inner); - F32 var = array_var(current_input, size_inner, mean); + F32 mean = array_mean_template(current_input, size_inner); + F32 var = array_var_template(current_input, size_inner, mean); - array_norm_scale(current_input, current_output, size_inner, mean, var, alpha, beta); + array_norm_scale_template( + current_input, current_output, size_inner, mean, var, alpha, beta); } - + return SUCCESS; } - -EE layer_normalization_general(void *alpha, void *beta, - TensorDesc inputDesc, void* input, - TensorDesc outputDesc, void* output) +EE layer_normalization_general( + TensorDesc inputDesc, void *input, void *alpha, void *beta, TensorDesc outputDesc, void *output) { DataType idt = inputDesc.dt; EE ret = SUCCESS; switch (idt) { #ifdef _USE_FP32 case DT_F32: { - ret = layer_normalization((F32*)alpha, (F32*)beta, inputDesc, (F32*)input, outputDesc, (F32*)output); + ret = layer_normalization_template( + inputDesc, (F32 *)input, (F32 *)alpha, (F32 *)beta, outputDesc, (F32 *)output); break; } #endif #ifdef _USE_FP16 case DT_F16: { - ret = layer_normalization((F16*)alpha, (F16*)beta, inputDesc, (F16*)input, outputDesc, (F16*)output); + ret = layer_normalization_template( + inputDesc, (F16 *)input, (F16 *)alpha, (F16 *)beta, outputDesc, (F16 *)output); break; } #endif diff --git a/compute/tensor/src/cpu/general/padding.cpp b/compute/tensor/src/cpu/general/padding.cpp new file mode 100644 index 00000000..aad8b036 --- /dev/null +++ b/compute/tensor/src/cpu/general/padding.cpp @@ -0,0 +1,126 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "types.h" +#include "cpu/general/tensor_computing_general.h" +#include <string.h> + +EE padding_general(TensorDesc inputDesc, + const void *input, + PadParamSpec padParamSpec, + TensorDesc outputDesc, + void *output) +{ + DataType idt, odt; + DataFormat idf, odf; + U32 in = 0, ic = 0, ih = 0, iw = 0, on = 0, oc = 0, oh = 0, ow = 0; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + CHECK_REQUIREMENT(in == on); + CHECK_REQUIREMENT(ic == oc); + U32 alignSize = 1; + if (idf == DF_NCHWC8) { + alignSize = 8; + } + ic /= alignSize; + oc /= alignSize; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 h = 0; h < ih; h++) { + const U8 *inPtr = + (const U8 *)input + (((n * ic + c) * ih + h) * iw) * alignSize * bytesOf(idt); + U8 *outPtr = (U8 *)output + + (((n * oc + c) * oh + (padParamSpec.top + h)) * ow) * alignSize * bytesOf(odt); + if (padParamSpec.pad_mode == Pad_Constant) { + memset(outPtr, 0, padParamSpec.left * alignSize * bytesOf(odt)); + outPtr += padParamSpec.left * alignSize * bytesOf(odt); + memcpy(outPtr, inPtr, iw * alignSize * bytesOf(idt)); + outPtr += iw * alignSize * bytesOf(odt); + memset(outPtr, 0, padParamSpec.right * alignSize * bytesOf(odt)); + } else { + for (U32 w = 0; w < padParamSpec.left; w++) { + U32 index = 0; + if (padParamSpec.pad_mode == Pad_Reflect) { + index = (padParamSpec.left - w) * alignSize * bytesOf(idt); + } else if (padParamSpec.pad_mode == Pad_Symmetric) { + index = (padParamSpec.left - w - 1) * alignSize * bytesOf(idt); + } + memcpy(outPtr, inPtr + index, alignSize * bytesOf(idt)); + outPtr += alignSize * bytesOf(idt); + } + memcpy(outPtr, inPtr, iw * alignSize * bytesOf(idt)); + outPtr += iw * alignSize * bytesOf(odt); + for (U32 w = 0; w < padParamSpec.right; w++) { + U32 index = (iw - 1) * alignSize * bytesOf(idt); + if (padParamSpec.pad_mode == Pad_Reflect) { + index = (iw - w - 2) * alignSize * bytesOf(idt); + } else if (padParamSpec.pad_mode == Pad_Symmetric) { + index = (iw - w - 1) * alignSize * bytesOf(idt); + } + memcpy(outPtr, inPtr + index, alignSize * bytesOf(idt)); + outPtr += alignSize * bytesOf(idt); + } + } + } + U8 *outPtr = (U8 *)output + (((n * oc + c) * oh) * ow) * alignSize * bytesOf(odt); + for (U32 h = 0; h < padParamSpec.top; h++) { + U32 index = h * ow * alignSize * bytesOf(odt); + if (padParamSpec.pad_mode == Pad_Constant) { + memset(outPtr + index, 0, ow * alignSize
* bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Edge) { + memcpy(outPtr + index, + outPtr + (padParamSpec.top * ow * alignSize * bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Reflect) { + memcpy(outPtr + index, + outPtr + + ((padParamSpec.top + padParamSpec.top - h) * ow * alignSize * + bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Symmetric) { + memcpy(outPtr + index, + outPtr + + ((padParamSpec.top + padParamSpec.top - h - 1) * ow * alignSize * + bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else { + return NOT_SUPPORTED; + } + } + for (U32 h = 0; h < padParamSpec.bottom; h++) { + U32 index = (padParamSpec.top + ih + h) * ow * alignSize * bytesOf(odt); + if (padParamSpec.pad_mode == Pad_Constant) { + memset(outPtr + index, 0, ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Edge) { + memcpy(outPtr + index, + outPtr + ((padParamSpec.top + ih - 1) * ow * alignSize * bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Reflect) { + // memcpy(outPtr+index, outPtr+((padParamSpec.top+ih-2-h)*ow*alignSize*bytesOf(odt)), ow*alignSize*bytesOf(odt)); + memcpy(outPtr + index, + outPtr + + ((padParamSpec.top + ih - 1 - padParamSpec.bottom + h) * ow * + alignSize * bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Symmetric) { + memcpy(outPtr + index, + outPtr + ((padParamSpec.top + ih - 1 - h) * ow * alignSize * bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else { + return NOT_SUPPORTED; + } + } + } + } + return SUCCESS; +} diff --git a/tensor_computing/src/cpu/general/pooling.cpp b/compute/tensor/src/cpu/general/pooling.cpp similarity index 64% rename from tensor_computing/src/cpu/general/pooling.cpp rename to compute/tensor/src/cpu/general/pooling.cpp index bcf5a1bb..b5aa6c4f 100644 --- a/tensor_computing/src/cpu/general/pooling.cpp +++ b/compute/tensor/src/cpu/general/pooling.cpp @@ -1,31 +1,40 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include <float.h> -#include "type.h" -#include "tensor_desc.h" #include "error.h" -#include "tensor_computing_type.h" +#include "types.h" #include "cpu/general/tensor_computing_general.h" -template<typename T> -EE pooling(T *input, T* output, - U32 in, U32 ic, U32 ih, U32 iw, - U32 strideH, U32 strideW, U32 paddingT, U32 paddingB, U32 paddingL, U32 paddingR, U32 kernelH, U32 kernelW, - PoolingMode pm, RoundMode rm, - U32 alignSize, - F32 minValue) +template <typename T> +EE pooling(T *input, + T *output, + U32 in, + U32 ic, + U32 ih, + U32 iw, + U32 strideH, + U32 strideW, + U32 paddingT, + U32 paddingB, + U32 paddingL, + U32 paddingR, + U32 kernelH, + U32 kernelW, + PoolingMode pm, + RoundMode rm, + U32 alignSize, + F32 minValue) { U32 oh = 0, ow = 0; if (rm == CEIL) { @@ -40,11 +49,11 @@ EE pooling(T *input, T* output, CHECK_REQUIREMENT(ic % alignSize == 0); ic = ic / alignSize; - for (U32 n=0; n<in; n++) { - for (U32 c=0; c<ic; c++) { - for (U32 j=0; j<alignSize; j++) { - for (U32 h=0; h<oh; h++) { - for (U32 w=0; w<ow; w++) { - int hstart = (int)h*(int)strideH - (int)paddingT; - int wstart = (int)w*(int)strideW - (int)paddingL; - int hend = hstart + kernelH; - int wend = wstart + kernelW; - hstart = (hstart < 0) ? 0 : hstart; - wstart = (wstart < 0) ? 0 : wstart; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 j = 0; j < alignSize; j++) { + for (U32 h = 0; h < oh; h++) { + for (U32 w = 0; w < ow; w++) { + int hstart = int(h * strideH - paddingT); + int wstart = int(w * strideW - paddingL); + int hend = hstart + kernelH; + int wend = wstart + kernelW; + hstart = (hstart < 0) ? 0 : hstart; + wstart = (wstart < 0) ? 0 : wstart; hend = (hend > (int)ih) ? ih : hend; wend = (wend > (int)iw) ? iw : wend; - float poolSize = (hend - hstart)*(wend - wstart); + float poolSize = (hend - hstart) * (wend - wstart); - T value; - switch(pm){ + F32 value; + switch (pm) { case POOLING_MAX: value = minValue; break; @@ -68,8 +77,8 @@ EE pooling(T *input, T* output, } for (int x = hstart; x < hend; x++) { for (int y = wstart; y < wend; y++) { - U32 in_off = ((((n*ic + c)*ih) + x)*iw + y)*alignSize + j; - switch(pm){ + U32 in_off = ((((n * ic + c) * ih) + x) * iw + y) * alignSize + j; + switch (pm) { case POOLING_MAX: value = (value > input[in_off]) ?
value : input[in_off]; break; @@ -81,7 +90,7 @@ EE pooling(T *input, T* output, } } } - switch(pm){ + switch (pm) { case POOLING_MAX: break; case POOLING_MEAN: @@ -91,7 +100,7 @@ EE pooling(T *input, T* output, return NOT_SUPPORTED; } - U32 out_off = ((((n*ic + c)*oh) + h)*ow + w)*alignSize + j; + U32 out_off = ((((n * ic + c) * oh) + h) * ow + w) * alignSize + j; output[out_off] = value; } } @@ -101,15 +110,18 @@ EE pooling(T *input, T* output, return SUCCESS; } -EE pooling_general(TensorDesc inputDesc, const void* input, PoolingDesc poolingDesc, TensorDesc outputDesc, void* output) +EE pooling_general(TensorDesc inputDesc, + const void *input, + PoolingParamSpec poolingParamSpec, + TensorDesc outputDesc, + void *output) { if (nullptr == input || nullptr == output) { CHECK_STATUS(NULL_POINTER); } DataType idt, odt; DataFormat idf, odf; - U32 in = 0, ic = 0, ih = 0, iw = 0, - on = 0, oc = 0, oh = 0, ow = 0; + U32 in = 0, ic = 0, ih = 0, iw = 0, on = 0, oc = 0, oh = 0, ow = 0; CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); @@ -120,35 +132,29 @@ EE pooling_general(TensorDesc inputDesc, const void* input, PoolingDesc poolingD CHECK_STATUS(NOT_MATCH); } - U32 strideH = poolingDesc.stride_h; - U32 strideW = poolingDesc.stride_w; - U32 paddingT = poolingDesc.padding_top; - U32 paddingB = poolingDesc.padding_bottom; - U32 paddingL = poolingDesc.padding_left; - U32 paddingR = poolingDesc.padding_right; - U32 kernelSizeH = poolingDesc.kernelSize_h; - U32 kernelSizeW = poolingDesc.kernelSize_w; + U32 strideH = poolingParamSpec.stride_h; + U32 strideW = poolingParamSpec.stride_w; + U32 paddingT = poolingParamSpec.padding_top; + U32 paddingB = poolingParamSpec.padding_bottom; + U32 paddingL = poolingParamSpec.padding_left; + U32 paddingR = poolingParamSpec.padding_right; + U32 kernelSizeH = poolingParamSpec.kernel_h; + U32 kernelSizeW = poolingParamSpec.kernel_w; EE ret = SUCCESS; switch (idt) { #ifdef _USE_FP32 case DT_F32: - ret = pooling((F32*)input, (F32*)output, - in, ic, ih, iw, - strideH, strideW, paddingT, paddingB, paddingL, paddingR, - kernelSizeH, kernelSizeW, - poolingDesc.pm, poolingDesc.rm, - 8, FLT_MIN); + ret = pooling((F32 *)input, (F32 *)output, in, ic, ih, iw, strideH, strideW, paddingT, + paddingB, paddingL, paddingR, kernelSizeH, kernelSizeW, poolingParamSpec.mode, + poolingParamSpec.rm, 8, -FLT_MAX); break; #endif #ifdef _USE_FP16 case DT_F16: - ret = pooling((F16*)input, (F16*)output, - in, ic, ih, iw, - strideH, strideW, paddingT, paddingB, paddingL, paddingR, - kernelSizeH, kernelSizeW, - poolingDesc.pm, poolingDesc.rm, - 8, UNI_F16_MIN); + ret = pooling((F16 *)input, (F16 *)output, in, ic, ih, iw, strideH, strideW, paddingT, + paddingB, paddingL, paddingR, kernelSizeH, kernelSizeW, poolingParamSpec.mode, + poolingParamSpec.rm, 8, -UNI_F16_MAX); break; #endif default: diff --git a/compute/tensor/src/cpu/general/pooling_bp.cpp b/compute/tensor/src/cpu/general/pooling_bp.cpp new file mode 100644 index 00000000..ac6a6ea4 --- /dev/null +++ b/compute/tensor/src/cpu/general/pooling_bp.cpp @@ -0,0 +1,111 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include "error.h" +#include "types.h" +#include "cpu/general/tensor_computing_general.h" + +template <typename T> +EE pooling_bp(T *input, + T *output, + U32 in, + U32 ic, + U32 ih, + U32 iw, + U32 strideH, + U32 strideW, + U32 paddingT, + U32 paddingL, + U32 kernelH, + U32 kernelW, + PoolingMode pm, + U32 oh, + U32 ow, + U32 alignSize) +{ + UNUSED(pm); + CHECK_REQUIREMENT(ic % alignSize == 0); + ic = ic / alignSize; + + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 j = 0; j < alignSize; j++) { + for (I32 h = 0; h < (I32)ih; h++) { + for (I32 w = 0; w < (I32)iw; w++) { + int hstart = int(h * strideH - paddingT); + int wstart = int(w * strideW - paddingL); + int hend = hstart + kernelH; + int wend = wstart + kernelW; + hstart = (hstart < 0) ? 0 : hstart; + wstart = (wstart < 0) ? 0 : wstart; + hend = (hend > (int)oh) ? oh : hend; + wend = (wend > (int)ow) ?
ow : wend; + float poolSize = (hend - hstart) * (wend - wstart); + for (int x = hstart; x < hend; x++) { + for (int y = wstart; y < wend; y++) { + U32 in_off = ((((n * ic + c) * ih) + h) * iw + w) * alignSize + j; + U32 out_off = ((((n * ic + c) * oh) + x) * ow + y) * alignSize + j; + output[out_off] += input[in_off] / poolSize; + } + } + } + } + } + } + } + return SUCCESS; +} + +EE pooling_bp_general(TensorDesc inputDesc, + const void *input, + PoolingParamSpec poolingParamSpec, + TensorDesc outputDesc, + void *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, odt; + DataFormat idf, odf; + U32 in = 0, ic = 0, ih = 0, iw = 0, on = 0, oc = 0, oh = 0, ow = 0; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (in != on || ic != oc) { + CHECK_STATUS(NOT_MATCH); + } + if (idf != DF_NCHWC8 || odf != idf) { + CHECK_STATUS(NOT_MATCH); + } + + U32 strideH = poolingParamSpec.stride_h; + U32 strideW = poolingParamSpec.stride_w; + U32 paddingT = poolingParamSpec.padding_top; + U32 paddingL = poolingParamSpec.padding_left; + U32 kernelSizeH = poolingParamSpec.kernel_h; + U32 kernelSizeW = poolingParamSpec.kernel_w; + + EE ret = SUCCESS; + switch (idt) { +#ifdef _USE_FP32 + case DT_F32: + ret = pooling_bp<F32>((F32 *)input, (F32 *)output, in, ic, ih, iw, strideH, strideW, + paddingT, paddingL, kernelSizeH, kernelSizeW, poolingParamSpec.mode, oh, ow, 8); + break; +#endif + default: + ret = NOT_SUPPORTED; + } + return ret; +} \ No newline at end of file diff --git a/compute/tensor/src/cpu/general/prelu.cpp b/compute/tensor/src/cpu/general/prelu.cpp new file mode 100644 index 00000000..1f6cca19 --- /dev/null +++ b/compute/tensor/src/cpu/general/prelu.cpp @@ -0,0 +1,85 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "error.h" +#include "types.h" +#include "cpu/general/tensor_computing_general.h" + +template <typename T> +static EE prelu( + T *input, T *output, T *weight, PReLUParamSpec preluDesc, U32 in, U32 ic, U32 ih, U32 iw) +{ + ic /= 8; + T slope; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 hw = 0; hw < ih * iw; hw++) { + for (U32 c8 = 0; c8 < 8; c8++) { + slope = preluDesc.propagate_down ?
weight[0] : weight[c * 8 + c8]; + U32 off = n * ic * ih * iw * 8 + c * ih * iw * 8 + hw * 8 + c8; + if (input[off] > 0) { + output[off] = input[off]; + } else { + output[off] = input[off] * slope; + } + } + } + } + } + return SUCCESS; +} + +EE prelu_general(TensorDesc inputDesc, + void *input, + void *weight, + PReLUParamSpec preluDesc, + TensorDesc outputDesc, + void *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, odt; + DataFormat idf, odf; + U32 in = 0, ic = 0, ih = 0, iw = 0, on = 0, oc = 0, oh = 0, ow = 0; + if (tensorIs4d(inputDesc) && tensorIs4d(outputDesc)) { + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + } else { + return NOT_SUPPORTED; + } + if (idf != DF_NCHWC8) { + return NOT_SUPPORTED; + } + CHECK_REQUIREMENT(in == on && ic == oc && ih == oh && iw == ow); + EE ret = SUCCESS; + switch (idt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = prelu((F32 *)input, (F32 *)output, (F32 *)weight, preluDesc, in, ic, ih, iw); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = prelu((F16 *)input, (F16 *)output, (F16 *)weight, preluDesc, in, ic, ih, iw); + break; + } +#endif + default: { + ret = NOT_SUPPORTED; + break; + } + } + return ret; +} diff --git a/compute/tensor/src/cpu/general/rnn.cpp b/compute/tensor/src/cpu/general/rnn.cpp new file mode 100644 index 00000000..36e398c0 --- /dev/null +++ b/compute/tensor/src/cpu/general/rnn.cpp @@ -0,0 +1,202 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
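The rnncell kernel below concatenates the current input with the previous hidden state, multiplies by the NKN32-packed weights to obtain four gate pre-activations (i, g, f, o), and applies the standard LSTM update; forgetBias is added to the forget gate before the sigmoid. A scalar sketch of that per-unit update, under the assumption that the pre-activations are already computed (names here are illustrative):

    #include <cmath>

    static inline float sigmoid(float x) { return 1.0f / (1.0f + expf(-x)); }

    // One LSTM unit: given pre-activations i, g, f, o and the previous cell
    // state c_prev, compute the new cell state and the cell output.
    void lstm_unit(float i, float g, float f, float o, float forgetBias,
        float c_prev, float *c_out, float *h_out)
    {
        float I = sigmoid(i);
        float G = tanhf(g);
        float F = sigmoid(f + forgetBias);  // bias the forget gate toward remembering
        float O = sigmoid(o);
        *c_out = c_prev * F + I * G;
        *h_out = O * tanhf(*c_out);
    }

The zoneout and projection branches in the kernel below then blend this result with the previous state or project it down to numOutput dimensions.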
+ +#include <math.h> +#include <string.h> + +#include "cpu/general/tensor_computing_general.h" +#include "cpu/general/general_functions.h" + +template <typename T> +static void mvm_nkn32_template(U32 fn, U32 fk, const T *filterArray, T *input, T *output) +{ + for (U32 i = 0; i < fn; i++) { + for (U32 j = 0; j < 32; j++) { + U32 n = i * 32 + j; + F32 value = 0; + for (U32 k = 0; k < fk; k++) { + value += input[k] * filterArray[(i * fk + k) * 32 + j]; + } + output[n] += value; + } + } +} + +template <typename T> +static EE rnncell(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + void *state, + U32 tmpBytes, + void *tmp, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + void *output) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + if (nullptr == currentX || nullptr == filter || nullptr == bias || nullptr == state || + nullptr == tmp || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ix; + U32 on, oh; + U32 fk, fn; + CHECK_STATUS(tensor2dGet(xDesc, &idt, &idf, &in, &ix)); + CHECK_STATUS(tensor2dGet(filterDesc[0], &fdt, &fdf, &fn, &fk)); + CHECK_STATUS(tensor2dGet(hDesc, &odt, &odf, &on, &oh)); + if (fdf != DF_NKN32) { + CHECK_STATUS(NOT_MATCH); + } + + U32 batch = in; + U32 xDim = ix; + U32 hDim = rnnParamSpec.numOutput; + I32 column = (rnnParamSpec.numProjection > 0) ? rnnParamSpec.numProjection + : rnnParamSpec.numOutput; + F32 forgetBias = rnnParamSpec.forgetBias; + ActivationMode activationMode = rnnParamSpec.activationMode; + if (activationMode != ACTIVATION_TANH) { + CHECK_STATUS(NOT_SUPPORTED); + } + + if (!(idt == fdt && idt == odt)) { + CHECK_STATUS(NOT_MATCH); + } + + const T *currentXArray = (const T *)currentX; + T *lastStateArray = (T *)state; + T *lastHArray = lastStateArray + column; + T *tmpArray = (T *)tmp; + T *currentStateArray = (T *)state; + T *currentHArray = currentStateArray + column; + T *outputArray = (T *)output; + T *xhArray = tmpArray; + T *intermediateH = xhArray + (xDim + hDim); + U32 lastStateStride = column + hDim; + U32 lastHStride = column + hDim; + U32 currentStateStride = column + hDim; + U32 currentHStride = column + hDim; + for (U32 m = 0; m < batch; m++) { + T *lastBatchH = lastHArray + m * lastHStride; + memcpy(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(T)); + memcpy(xhArray + xDim, lastBatchH, hDim * sizeof(T)); + + // MVM + memcpy(intermediateH, bias[0], column * 4 * sizeof(T)); + mvm_nkn32_template(fn / 32, fk, (const T *)filter[0], xhArray, intermediateH); + + T *out_i = intermediateH; + T *out_g = out_i + column; + T *out_f = out_i + column * 2; + T *out_o = out_i + column * 3; + T *lastBatchState = lastStateArray + m * lastStateStride; + T *currentBatchState = currentStateArray + m * currentStateStride; + T *currentBatchH = currentHArray + m * currentHStride; + T *currentOutput = outputArray + m * batchStrideH; + T *tmpState, *tmpHH, *tmpH; + if (rnnParamSpec.zoneoutCell == 0) { + tmpState = currentBatchState; + } else { + tmpState = out_i; + } + if (rnnParamSpec.numProjection > 0) { + tmpHH = out_g; + tmpH = currentOutput; + } else { + tmpHH = currentOutput; + tmpH = out_g; + } + + for (I32 h = 0; h < column; h++) { + F32 C_s = lastBatchState[h]; + F32 I_s = 1.0 / (1.0 + exp(-out_i[h])); + F32 G_s = tanh(out_g[h]); + F32 F_s = 1.0 / (1.0 + exp(-(out_f[h] + forgetBias))); + F32 O_s = 1.0 / (1.0 + exp(-out_o[h])); + C_s = C_s * F_s + I_s * G_s; + F32 value = O_s *
tanh(C_s); + tmpState[h] = C_s; + tmpHH[h] = value; + } + + if (rnnParamSpec.zoneoutCell != 0) { + array_scale_template(tmpState, tmpState, column, 1 - rnnParamSpec.zoneoutCell, 0); + array_scale_template( + lastBatchState, lastBatchState, column, rnnParamSpec.zoneoutCell, 0); + array_add_template(tmpState, lastBatchState, currentBatchState, column); + } + + if (rnnParamSpec.numProjection > 0) { + memset(tmpH, 0, sizeof(T) * hDim); + mvm_nkn32_template( + hDim / 32, rnnParamSpec.numProjection, (const T *)filter[1], tmpHH, tmpH); + } + if (rnnParamSpec.zoneoutOutput != 0) { + if (rnnParamSpec.numProjection > 0) { + array_scale_template(tmpH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + } else { + array_scale_template(tmpHH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + } + array_scale_template(lastBatchH, lastBatchH, hDim, rnnParamSpec.zoneoutOutput, 0); + array_add_template(out_f, lastBatchH, currentBatchH, hDim); + } else { + memcpy(currentBatchH, currentOutput, sizeof(T) * hDim); + } + } + return SUCCESS; +} + +EE rnncell_general(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + void *state, + U32 tmpBytes, + void *tmp, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + void *output) +{ + EE ret = SUCCESS; + switch (xDesc.dt) { +#ifdef _USE_FP16 + case DT_F16: + ret = rnncell<F16>(xDesc, currentX, filterDesc, filter, biasDesc, bias, state, tmpBytes, + tmp, rnnParamSpec, batchStrideX, batchStrideH, hDesc, output); + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + ret = rnncell<F32>(xDesc, currentX, filterDesc, filter, biasDesc, bias, state, tmpBytes, + tmp, rnnParamSpec, batchStrideX, batchStrideH, hDesc, output); + break; +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/tensor_computing/src/cpu/general/scale.cpp b/compute/tensor/src/cpu/general/scale.cpp similarity index 60% rename from tensor_computing/src/cpu/general/scale.cpp rename to compute/tensor/src/cpu/general/scale.cpp index 8f5c7ec3..b8f7ddd4 100644 --- a/tensor_computing/src/cpu/general/scale.cpp +++ b/compute/tensor/src/cpu/general/scale.cpp @@ -1,29 +1,28 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include "cpu/general/tensor_computing_general.h" -template<typename T> -EE scale_nchw(T* input, T* alpha, T* beta, U32 in, - U32 ic, U32 elements_per_channel, U32 align_size, T*output) +template <typename T> +static EE scale_nchw( + T *input, T *alpha, T *beta, U32 in, U32 ic, U32 elements_per_channel, U32 align_size, T *output) { ic = ic / align_size; for (U32 n = 0; n < in; n++) { for (U32 c = 0; c < ic; c++) { for (U32 i = 0; i < elements_per_channel; i++) { for (U32 k = 0; k < align_size; k++) { - T alphaValue = alpha[c * align_size + k]; + T alphaValue = (nullptr == alpha) ? 1 : alpha[c * align_size + k]; T betaValue = (nullptr == beta) ? 0 : beta[c * align_size + k]; U32 index = ((n * ic + c) * elements_per_channel + i) * align_size + k; output[index] = alphaValue * input[index] + betaValue; @@ -34,14 +33,14 @@ EE scale_nchw(T* input, T* alpha, T* beta, U32 in, return SUCCESS; } -template<typename T> -EE scale_nhwc(T* input, T* alpha, T* beta, U32 in, - U32 ic, U32 elements_per_channel, T*output) +template <typename T> +static EE scale_nhwc( + T *input, T *alpha, T *beta, U32 in, U32 ic, U32 elements_per_channel, T *output) { for (U32 n = 0; n < in; n++) { for (U32 i = 0; i < elements_per_channel; i++) { for (U32 c = 0; c < ic; c++) { - T alphaValue = alpha[c]; + T alphaValue = (nullptr == alpha) ? 1 : alpha[c]; T betaValue = (nullptr == beta) ?
0 : beta[c]; U32 index = ((n * elements_per_channel) + i) * ic + c; output[index] = alphaValue * input[index] + betaValue; @@ -51,52 +50,64 @@ EE scale_nhwc(T* input, T* alpha, T* beta, U32 in, return SUCCESS; } -template<typename T> -EE scale(T* input, I32 axis, I32 nDims, T* alpha, T* beta, - U32 in, U32 ic, U32 elements_per_channel, U32 align_size, T*output) +template <typename T> +static EE scale(T *input, + I32 axis, + I32 nDims, + T *alpha, + T *beta, + U32 in, + U32 ic, + U32 elements_per_channel, + U32 align_size, + T *output) { EE ret = SUCCESS; - if (axis == 1) { - ret = scale_nchw(input, alpha, beta, in, - ic, elements_per_channel, align_size, output); - } else if (axis == nDims-1) { - ret = scale_nhwc(input, alpha, beta, in, - ic, elements_per_channel, output); + if (axis == 1 || axis == 0 || ic == 1) { + ret = scale_nchw(input, alpha, beta, in, ic, elements_per_channel, align_size, output); + } else if (axis == nDims - 1) { + ret = scale_nhwc(input, alpha, beta, in, ic, elements_per_channel, output); } else { ret = NOT_SUPPORTED; } return ret; } -EE scale_general(TensorDesc inputDesc, void* input, - I32 axis, void *alpha, void *beta, - TensorDesc outputDesc, void* output) +EE scale_general(TensorDesc inputDesc, + void *input, + void *alpha, + void *beta, + ScaleParamSpec p, + TensorDesc outputDesc, + void *output) { UNUSED(outputDesc); - if (nullptr == input || nullptr == output || nullptr == alpha) + if (nullptr == input || nullptr == output) { CHECK_STATUS(NULL_POINTER); + } U32 length = tensorNumElements(inputDesc); - axis = (axis + inputDesc.nDims) % inputDesc.nDims; + int axis = (p.axis + inputDesc.nDims) % inputDesc.nDims; I32 in = inputDesc.dims[inputDesc.nDims - 1]; I32 ic = inputDesc.dims[inputDesc.nDims - 1 - axis]; I32 elements_per_channel = length / (in * ic); I32 align_size = 1; - if (inputDesc.df == DF_NCHWC8) + if (inputDesc.df == DF_NCHWC8) { align_size = 8; + } EE ret = SUCCESS; switch (inputDesc.dt) { #ifdef _USE_FP32 case DT_F32: { - ret = scale((F32*)input, axis, inputDesc.nDims, (F32*)alpha, (F32*)beta, - in, ic, elements_per_channel, align_size, (F32*)output); + ret = scale((F32 *)input, axis, inputDesc.nDims, (F32 *)alpha, (F32 *)beta, in, ic, + elements_per_channel, align_size, (F32 *)output); break; } #endif #ifdef _USE_FP16 case DT_F16: { - ret = scale((F16*)input, axis, inputDesc.nDims, (F16*)alpha, (F16*)beta, - in, ic, elements_per_channel, align_size, (F16*)output); + ret = scale((F16 *)input, axis, inputDesc.nDims, (F16 *)alpha, (F16 *)beta, in, ic, + elements_per_channel, align_size, (F16 *)output); break; } #endif diff --git a/tensor_computing/src/cpu/general/softmax.cpp b/compute/tensor/src/cpu/general/softmax.cpp similarity index 57% rename from tensor_computing/src/cpu/general/softmax.cpp rename to compute/tensor/src/cpu/general/softmax.cpp index e489fc98..2992c9c5 100644 --- a/tensor_computing/src/cpu/general/softmax.cpp +++ b/compute/tensor/src/cpu/general/softmax.cpp @@ -1,50 +1,68 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include <math.h> #include "cpu/general/tensor_computing_general.h" - -template<typename T> -F32 array_max(const T* input, U32 len, U32 stride) { +template <typename T> +static F32 array_max(const T *input, U32 len, U32 stride) +{ F32 tmp = input[0]; for (U32 i = 1; i < len; i++) { - if(input[i * stride] > tmp) + if (input[i * stride] > tmp) { tmp = input[i * stride]; + } } return tmp; } -template<typename T> -EE softmax(TensorDesc inputDesc, const T* input, - int axis, - TensorDesc outputDesc, T* output) +template <typename T> +static EE softmax(TensorDesc inputDesc, const T *input, int axis, TensorDesc outputDesc, T *output) { UNUSED(outputDesc); - if (nullptr == input || nullptr == output) + if (nullptr == input || nullptr == output) { CHECK_STATUS(NULL_POINTER); - - CHECK_REQUIREMENT(DF_NCHWC8 != inputDesc.df); + } U32 size = tensorNumElements(inputDesc); axis = (axis + inputDesc.nDims) % inputDesc.nDims; axis = inputDesc.nDims - 1 - axis; + std::vector<T> buffer; + if (inputDesc.df == DF_NCHWC8) { + if (axis == 2) { + if (inputDesc.dims[0] != 1 || inputDesc.dims[1] != 1) { + buffer = std::vector<T>(size); + TensorDesc tmpInputDesc = inputDesc; + tmpInputDesc.df = DF_NCHW; + transformToNCHW(inputDesc, input, tmpInputDesc, buffer.data()); + input = (const T *)(buffer.data()); + } + } else { + for (I32 i = (int)inputDesc.nDims; i > 0; i--) { + inputDesc.dims[i] = inputDesc.dims[i - 1]; + } + inputDesc.dims[inputDesc.nDims - 1] /= 8; + inputDesc.dims[0] = 8; + inputDesc.nDims += 1; + axis += 1; + } + } U32 loops = inputDesc.dims[axis]; - + U32 loop_inner = 1; - for (int i = 0; i < axis; i++) + for (int i = 0; i < axis; i++) { loop_inner *= inputDesc.dims[i]; + } U32 loop_outer = size / loops / loop_inner; for (U32 i = 0; i < loop_outer; i++) { @@ -54,35 +72,34 @@ EE softmax(TensorDesc inputDesc, const T* input, F32 max_value = array_max(in, loops,
loop_inner); F32 sum = 0; for (U32 i = 0; i < loops; i++) { - F32 tmp = exp(in[i*loop_inner] - max_value); + F32 tmp = exp(in[i * loop_inner] - max_value); sum += tmp; - out[i*loop_inner] = tmp; + out[i * loop_inner] = tmp; } sum = 1 / sum; for (U32 i = 0; i < loops; i++) { - out[i*loop_inner] *= sum; + out[i * loop_inner] *= sum; } } } return SUCCESS; } -EE softmax_general(TensorDesc inputDesc, const void* input, - int axis, - TensorDesc outputDesc, void* output) +EE softmax_general( + TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output) { DataType idt = inputDesc.dt; EE ret = SUCCESS; switch (idt) { #ifdef _USE_FP16 case DT_F16: { - ret = softmax(inputDesc, (const F16*)input, axis, outputDesc, (F16*)output); + ret = softmax(inputDesc, (const F16 *)input, p.axis, outputDesc, (F16 *)output); break; } #endif #ifdef _USE_FP32 case DT_F32: { - ret = softmax(inputDesc, (const F32*)input, axis, outputDesc, (F32*)output); + ret = softmax(inputDesc, (const F32 *)input, p.axis, outputDesc, (F32 *)output); break; } #endif diff --git a/compute/tensor/src/cpu/general/tensor_computing_general.h b/compute/tensor/src/cpu/general/tensor_computing_general.h new file mode 100644 index 00000000..fda8c48b --- /dev/null +++ b/compute/tensor/src/cpu/general/tensor_computing_general.h @@ -0,0 +1,165 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
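The softmax kernel above uses the usual max-shift for numerical stability: subtracting the row maximum before exponentiating keeps exp() from overflowing for large logits, and a single division by the accumulated sum then normalizes the row. A minimal strided sketch of the same idea, assuming float data:

    #include <cmath>

    // Numerically stable softmax over `len` elements spaced `stride` apart,
    // mirroring the loop structure of the kernel above.
    void softmax_row(const float *in, float *out, int len, int stride)
    {
        float maxv = in[0];
        for (int i = 1; i < len; i++) {
            maxv = (in[i * stride] > maxv) ? in[i * stride] : maxv;
        }
        float sum = 0;
        for (int i = 0; i < len; i++) {
            out[i * stride] = expf(in[i * stride] - maxv);
            sum += out[i * stride];
        }
        float inv = 1.0f / sum;  // one division, then cheap multiplies
        for (int i = 0; i < len; i++) {
            out[i * stride] *= inv;
        }
    }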
+
+#ifndef _H_TENSOR_COMPUTING_GENERAL
+#define _H_TENSOR_COMPUTING_GENERAL
+
+#include <vector>
+
+#include "error.h"
+#include "sys.h"
+#include "types.h"
+
+EE convolution_general(TensorDesc inputDesc,
+    void *input,
+    TensorDesc filterDesc,
+    const void *filter,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc scaleDesc,
+    const void *scale,
+    TensorDesc biasDesc,
+    const void *bias,
+    TensorDesc outputDesc,
+    void *output,
+    ActivationParamSpec activationDesc);
+
+EE deconvolution_general(TensorDesc inputDesc,
+    void *input,
+    TensorDesc filterDesc,
+    const void *filter,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc scaleDesc,
+    const void *scale,
+    TensorDesc biasDesc,
+    const void *bias,
+    TensorDesc outputDesc,
+    void *output,
+    ActivationParamSpec activationDesc);
+
+EE depthwise_pointwise_convolution_infer_forward_tmp_bytes_general(TensorDesc inputDesc,
+    TensorDesc dwFilterDesc,
+    TensorDesc pwFilterDesc,
+    TensorDesc outputDesc,
+    ConvolutionParamSpec convParamSpec,
+    DepthwiseConvolutionForwardAlgorithm algorithm,
+    U32 *bytes);
+
+EE depthwise_pointwise_convolution_general(TensorDesc inputDesc,
+    void *input,
+    TensorDesc dwFilterDesc,
+    const void *dwFilter,
+    TensorDesc pwFilterDesc,
+    const void *pwFilter,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc dwBiasDesc,
+    const void *dwBias,
+    TensorDesc pwBiasDesc,
+    const void *pwBias,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    ActivationParamSpec depthwiseActivationParamSpec,
+    ActivationParamSpec pointwiseActivationParamSpec);
+
+EE depthwise_convolution_general(TensorDesc inputDesc,
+    void *input,
+    TensorDesc filterDesc,
+    const void *filter,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc biasDesc,
+    const void *bias,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    ActivationParamSpec depthwiseActivationParamSpec);
+
+EE pooling_general(TensorDesc inputDesc,
+    const void *input,
+    PoolingParamSpec poolingParamSpec,
+    TensorDesc outputDesc,
+    void *output);
+
+EE pooling_bp_general(TensorDesc inputDesc,
+    const void *input,
+    PoolingParamSpec poolingParamSpec,
+    TensorDesc outputDesc,
+    void *output);
+
+EE attention_general(TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output);
+
+EE clip_general(
+    TensorDesc inputDesc, void *input, ClipParamSpec p, TensorDesc outputDesc, void *output);
+
+EE eltwise_general(DataType dataType,
+    std::vector<void *> input,
+    std::vector<U32> inputSize,
+    U32 num,
+    U32 len,
+    void *output,
+    EltwiseMode eltwiseMode);
+
+EE rnncell_general(TensorDesc xDesc,
+    const void *currentX,
+    const TensorDesc *filterDesc,
+    const void **filter,
+    const TensorDesc *biasDesc,
+    const void **bias,
+    void *state,
+    U32 tmpBytes,
+    void *tmp,
+    RNNParamSpec rnnParamSpec,
+    U32 batchStrideX,
+    U32 batchStrideH,
+    TensorDesc hDesc,
+    void *currentH);
+
+EE transpose_general(
+    TensorDesc inputDesc, const void *input, U32 *dim, TensorDesc outputDesc, void *output);
+
+EE scale_general(TensorDesc inputDesc,
+    void *input,
+    void *alpha,
+    void *beta,
+    ScaleParamSpec p,
+    TensorDesc outputDesc,
+    void *output);
+
+EE softmax_general(
+    TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output);
+
+EE check_general(TensorDesc inputDescA,
+    const void *inputA,
+    TensorDesc inputDescB,
+    const void *inputB,
+    CheckParamSpec p,
+    TensorDesc outputDesc,
+    void *output);
+
+EE layer_normalization_general(
+    TensorDesc inputDesc, void *input, void *alpha, void *beta, TensorDesc outputDesc, void *output);
+
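+// attention_mask_general and prelu_general below complete the operator set; as above,
+// the _general suffix marks the naive reference CPU implementation of each operator.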
+EE attention_mask_general(TensorDesc inputDesc, + const void *input, + AttentionMaskParamSpec p, + TensorDesc outputDesc, + void *output); + +EE prelu_general(TensorDesc inputDesc, + void *input, + void *weight, + PReLUParamSpec preluDesc, + TensorDesc outputDesc, + void *output); +#endif diff --git a/tensor_computing/src/cpu/general/transpose.cpp b/compute/tensor/src/cpu/general/transpose.cpp similarity index 76% rename from tensor_computing/src/cpu/general/transpose.cpp rename to compute/tensor/src/cpu/general/transpose.cpp index 63e4a396..3ab8115c 100644 --- a/tensor_computing/src/cpu/general/transpose.cpp +++ b/compute/tensor/src/cpu/general/transpose.cpp @@ -1,24 +1,26 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
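+// Reference transpose: for each output element, reconstruct its multi-index, permute it
+// through "dim", flatten back to an input offset, and copy a single element.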
-
 #include <string.h>
 #include "cpu/general/tensor_computing_general.h"

-EE transpose_general(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output, U32 *dim) {
-    if (nullptr == input || nullptr == output || nullptr == dim)
+EE transpose_general(
+    TensorDesc inputDesc, const void *input, U32 *dim, TensorDesc outputDesc, void *output)
+{
+    if (nullptr == input || nullptr == output || nullptr == dim) {
         CHECK_STATUS(NULL_POINTER);
+    }

     U32 inputDim = inputDesc.nDims;
     U32 outputDim = outputDesc.nDims;
@@ -37,11 +39,12 @@ EE transpose_general(TensorDesc inputDesc, void *input, TensorDesc outputDesc, v
         inputLocalIndex[inputDim - 1 - dim[outputDim - 1 - j]] = value;
     }
     U32 inputIndex = 0;
-    for (U32 j = inputDim-1; j > 0; j--) {
-        inputIndex = (inputIndex + inputLocalIndex[j]) * inputDesc.dims[j-1];
+    for (U32 j = inputDim - 1; j > 0; j--) {
+        inputIndex = (inputIndex + inputLocalIndex[j]) * inputDesc.dims[j - 1];
     }
     inputIndex += inputLocalIndex[0];
-    memcpy(output_ptr+i*bytesOf(outputDesc.dt), input_ptr+inputIndex*bytesOf(inputDesc.dt), bytesOf(inputDesc.dt));
+    memcpy(output_ptr + i * bytesOf(outputDesc.dt),
+        input_ptr + inputIndex * bytesOf(inputDesc.dt), bytesOf(inputDesc.dt));
     }

     return SUCCESS;
diff --git a/compute/tensor/src/cpu/l2normalization.cpp b/compute/tensor/src/cpu/l2normalization.cpp
new file mode 100644
index 00000000..032bfbb3
--- /dev/null
+++ b/compute/tensor/src/cpu/l2normalization.cpp
@@ -0,0 +1,57 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
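+// L2 normalization per row: y = x / sqrt(sum(x^2)). Illustration: the row [3, 4] has
+// sum of squares 25, so the normalized row is [0.6, 0.8].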
+
+#include "cpu/tensor_computing_cpu.h"
+#include "cpu/cpu_functions.h"
+
+EE l2normalization_cpu(
+    TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output, Arch arch)
+{
+    if (nullptr == input || nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    ArrayVarFunction var_func = get_array_var_function(arch);
+    ArrayScaleFunction scale_func = get_array_scale_function(arch);
+    DataType idt, odt;
+    DataFormat idf, odf;
+    U32 ic = 0, ih = 0, iw = 0, oh = 0, ow = 0;
+    if (tensorIs2d(inputDesc)) {
+        CHECK_STATUS(tensor2dGet(inputDesc, &idt, &idf, &ih, &iw));
+        ic = 1;
+        CHECK_STATUS(tensor2dGet(outputDesc, &odt, &odf, &oh, &ow));
+    } else if (tensorIs3d(inputDesc)) {
+        U32 oc = 0;
+        CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &ic, &ih, &iw));
+        CHECK_STATUS(tensor3dGet(outputDesc, &odt, &odf, &oc, &oh, &ow));
+        CHECK_REQUIREMENT(ic == oc);
+    } else if (tensorIs4d(inputDesc)) {
+        idt = inputDesc.dt;
+        ic = inputDesc.dims[0];
+        ih = inputDesc.dims[1];
+        iw = inputDesc.dims[2];
+    } else {
+        CHECK_STATUS(NOT_MATCH);
+    }
+
+    // l2norm -> x / sqrt(sum(x^2))
+    for (U32 c = 0; c < ic; c++) {
+        for (U32 h = 0; h < ih; h++) {
+            U32 index_off = (c * ih + h) * iw * bytesOf(idt);
+            const U8 *input_ptr = (const U8 *)input + index_off;
+            U8 *output_ptr = (U8 *)output + index_off;
+            F32 sum_row = var_func(idt, input_ptr, (I32)iw, 0.f) * static_cast<F32>(iw);
+            scale_func(idt, input_ptr, output_ptr, iw, 1.0 / sqrt(sum_row), 0);
+        }
+    }
+    return SUCCESS;
+}
diff --git a/compute/tensor/src/cpu/non_max_suppression.cpp b/compute/tensor/src/cpu/non_max_suppression.cpp
new file mode 100644
index 00000000..23118306
--- /dev/null
+++ b/compute/tensor/src/cpu/non_max_suppression.cpp
@@ -0,0 +1,222 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
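+// Per class: keep boxes scoring above score_threshold, sort them by score (descending),
+// then greedily drop any box whose IoU with an already kept box exceeds iou_threshold,
+// finally capping the survivors at max_output_boxes_per_class.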
+
+#include "cpu/tensor_computing_cpu.h"
+
+inline EE qsort_descent(std::vector<BoxRect> &boxes,
+    std::vector<I64> &boxindex,
+    std::vector<F32> &scores,
+    int left,
+    int right)
+{
+    if (boxes.empty() || scores.empty()) {
+        return NOT_SUPPORTED;
+    }
+
+    int i = left;
+    int j = right;
+    F32 temp = scores[(left + right) / 2];
+
+    while (i <= j) {
+        while (scores[i] > temp) {
+            i++;
+        }
+        while (scores[j] < temp) {
+            j--;
+        }
+        if (i <= j) {
+            std::swap(boxes[i], boxes[j]);
+            std::swap(scores[i], scores[j]);
+            std::swap(boxindex[i], boxindex[j]);
+            i++;
+            j--;
+        }
+    }
+
+    if (left < j) {
+        qsort_descent(boxes, boxindex, scores, left, j);
+    }
+    if (i < right) {
+        qsort_descent(boxes, boxindex, scores, i, right);
+    }
+
+    return SUCCESS;
+}
+
+inline F32 intersectionarea(BoxRect a, BoxRect b)
+{
+    if (a.xmin > b.xmax || a.xmax < b.xmin || a.ymin > b.ymax || a.ymax < b.ymin) {
+        return 0.f;
+    }
+    F32 inter_width = std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin);
+    F32 inter_height = std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin);
+
+    return inter_width * inter_height;
+}
+
+inline EE nms_pickedboxes(std::vector<BoxRect> boxes, std::vector<I64> &picked, F32 nms_threshold)
+{
+    I64 n = boxes.size();
+
+    std::vector<F32> areas(n);
+    for (I64 i = 0; i < n; i++) {
+        BoxRect box = boxes[i];
+
+        F32 width = box.xmax - box.xmin;
+        F32 height = box.ymax - box.ymin;
+
+        areas[i] = width * height;
+    }
+    for (I64 i = 0; i < n; i++) {
+        BoxRect a = boxes[i];
+        int keep = 1;
+        for (int j = 0; j < (int)picked.size(); j++) {
+            BoxRect b = boxes[picked[j]];
+            F32 inter_area = intersectionarea(a, b);
+            F32 union_area = areas[i] + areas[picked[j]] - inter_area;
+
+            if (inter_area / union_area > nms_threshold) {
+                keep = 0;
+            }
+        }
+        if (keep) {
+            picked.push_back(i);
+        }
+    }
+    return SUCCESS;
+}
+
+template <typename T>
+EE non_max_suppression_kernel(std::vector<void *> input,
+    T *output,
+    U32 spatial_dim,
+    U32 num_class,
+    U32 max_output_boxes_per_class,
+    F32 iou_threshold,
+    F32 score_threshold)
+{
+    T *box = (T *)input[0];
+    T *score = (T *)input[1];
+    // decode box
+    std::vector<std::vector<F32>> boxes;
+    boxes.resize(spatial_dim);
+    for (U32 i = 0; i < spatial_dim; i++) {
+        F32 ymin = std::min(box[i * 4], box[i * 4 + 2]);
+        F32 xmin = std::min(box[i * 4 + 1], box[i * 4 + 3]);
+        F32 ymax = std::max(box[i * 4], box[i * 4 + 2]);
+        F32 xmax = std::max(box[i * 4 + 1], box[i * 4 + 3]);
+        std::vector<F32> box_pixel;
+        box_pixel.resize(4);
+        box_pixel[0] = xmin;
+        box_pixel[1] = ymin;
+        box_pixel[2] = xmax;
+        box_pixel[3] = ymax;
+        boxes[i].assign(box_pixel.begin(), box_pixel.end());
+    }
+
+    std::vector<BoxInfo> all_boxinfo;
+    for (U32 i = 0; i < num_class; i++) {
+        std::vector<BoxRect> class_boxrects;
+        std::vector<F32> class_boxscores;
+        std::vector<I64> class_boxindex;
+        for (U32 j = 0; j < spatial_dim; j++) {
+            F32 score_pixel = score[i * spatial_dim + j];
+            if (score_pixel > score_threshold) {
+                std::vector<F32> inbox;
+                inbox.assign(boxes[j].begin(), boxes[j].end());
+                BoxRect b = {inbox[0], inbox[1], inbox[2], inbox[3], i};
+                class_boxrects.push_back(b);
+                class_boxindex.push_back(j);
+                class_boxscores.push_back(score_pixel);
+            }
+        }
+        // sort boxes and box index
+        qsort_descent(class_boxrects, class_boxindex, class_boxscores, 0,
+            static_cast<int>(class_boxscores.size() - 1));
+        std::vector<I64> picked;
+        // apply nms
+        nms_pickedboxes(class_boxrects, picked, iou_threshold);
+        std::vector<I64> boxindex;
+        for (I64 p = 0; p < (I64)picked.size(); p++) {
+            I64 picked_box = picked[p];
+            boxindex.push_back(class_boxindex[picked_box]);
+        }
+        if (max_output_boxes_per_class < (U32)boxindex.size()) {
+            boxindex.resize(max_output_boxes_per_class);
+        }
+        for (I64 j = 0; j < (I64)boxindex.size(); j++) {
+            BoxInfo bi;
+            bi.box_index = boxindex[j];
+            bi.label = i;
+            all_boxinfo.push_back(bi);
+        }
+    }
+    U32 num_detected = all_boxinfo.size();
+    // the first box contains the number of available boxes in the first element.
+    output[0] = num_detected;
+    output[1] = output[2] = 0;
+    for (U32 i = 0; i < num_detected; i++) {
+        BoxInfo bi = all_boxinfo[i];
+        // batch_index = 0
+        output[(i + 1) * 3] = 0;
+        // class_index
+        output[(i + 1) * 3 + 1] = bi.label;
+        // box_index
+        output[(i + 1) * 3 + 2] = bi.box_index;
+    }
+    return SUCCESS;
+}
+
+EE non_max_suppression_cpu(std::vector<TensorDesc> inputDesc,
+    std::vector<void *> input,
+    NonMaxSuppressionParamSpec nonMaxSuppressionParamSpec,
+    TensorDesc outputDesc,
+    void *output)
+{
+    UNUSED(outputDesc);
+    if (nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    DataType idt0, idt1;
+    DataFormat idf0, idf1;
+    U32 in0, ic0, ilens1;
+    U32 in1, ic1, ilens2;
+    // boxes
+    CHECK_STATUS(tensor3dGet(inputDesc[0], &idt0, &idf0, &in0, &ic0, &ilens1));
+    // scores
+    CHECK_STATUS(tensor3dGet(inputDesc[1], &idt1, &idf1, &in1, &ic1, &ilens2));
+    U32 spatial_dim = ic0;
+    U32 num_class = ic1;
+    CHECK_REQUIREMENT(spatial_dim == ilens2);
+    U32 max_output_boxes_per_class = nonMaxSuppressionParamSpec.max_output_boxes_per_class;
+    F32 iou_threshold = nonMaxSuppressionParamSpec.iou_threshold;
+    F32 score_threshold = nonMaxSuppressionParamSpec.score_threshold;
+    EE ret = SUCCESS;
+    switch (idt0) {
+#ifdef _USE_FP32
+        case DT_F32:
+            non_max_suppression_kernel(input, (F32 *)output, spatial_dim, num_class,
+                max_output_boxes_per_class, iou_threshold, score_threshold);
+            break;
+#endif
+#ifdef _USE_FP16
+        case DT_F16:
+            non_max_suppression_kernel(input, (F16 *)output, spatial_dim, num_class,
+                max_output_boxes_per_class, iou_threshold, score_threshold);
+            break;
+#endif
+        default:
+            ret = NOT_SUPPORTED;
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/cpu/padding.cpp b/compute/tensor/src/cpu/padding.cpp
new file mode 100644
index 00000000..a2f449b0
--- /dev/null
+++ b/compute/tensor/src/cpu/padding.cpp
@@ -0,0 +1,163 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
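+// Horizontal pad behaviour, illustrated on the row [1 2 3] with left = 2:
+// constant -> 0 0 1 2 3, edge -> 1 1 1 2 3, reflect -> 3 2 1 2 3, symmetric -> 2 1 1 2 3.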
+
+#include "types.h"
+#include "cpu/tensor_computing_cpu.h"
+#include <string.h>
+
+EE padding_infer_output_size_cpu(
+    TensorDesc inputDesc, PadParamSpec padParamSpec, TensorDesc *outputDesc)
+{
+    if (nullptr == outputDesc) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    DataType idt = DT_F32;
+    DataFormat idf = DF_NCHW;
+    U32 in = 0, ic = 0, ih = 0, iw = 0;
+    if (tensorIs3d(inputDesc)) {
+        CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &in, &ic, &ih));
+        iw = 1;
+    } else if (tensorIs4d(inputDesc)) {
+        CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
+    } else {
+        return NOT_SUPPORTED;
+    }
+    int out_n = in;
+    int out_c = ic;
+    int out_h = ih + padParamSpec.top + padParamSpec.bottom;
+    int out_w = iw + padParamSpec.left + padParamSpec.right;
+    if (tensorIs3d(inputDesc)) {
+        *outputDesc = tensor3df(idt, idf, out_n, out_c, out_h);
+    } else if (tensorIs4d(inputDesc)) {
+        *outputDesc = tensor4df(idt, idf, out_n, out_c, out_h, out_w);
+    }
+    return SUCCESS;
+}
+
+EE padding_cpu(TensorDesc inputDesc,
+    const void *input,
+    PadParamSpec padParamSpec,
+    TensorDesc outputDesc,
+    void *output)
+{
+    DataType idt, odt;
+    DataFormat idf, odf;
+    U32 in = 0, ic = 0, ih = 0, iw = 0, on = 0, oc = 0, oh = 0, ow = 0;
+    if (tensorIs3d(inputDesc)) {
+        CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &in, &ic, &ih));
+        CHECK_STATUS(tensor3dGet(outputDesc, &odt, &odf, &on, &oc, &oh));
+        iw = ow = 1;
+    } else if (tensorIs4d(inputDesc)) {
+        CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
+        CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow));
+    } else {
+        return NOT_SUPPORTED;
+    }
+    CHECK_REQUIREMENT(in == on);
+    CHECK_REQUIREMENT(ic == oc);
+    U32 alignSize = 1;
+    if (idf == DF_NCHWC8) {
+        alignSize = 8;
+    }
+    ic /= alignSize;
+    oc /= alignSize;
+    for (U32 n = 0; n < in; n++) {
+        for (U32 c = 0; c < ic; c++) {
+            for (U32 h = 0; h < ih; h++) {
+                const U8 *inPtr =
+                    (const U8 *)input + (((n * ic + c) * ih + h) * iw) * alignSize * bytesOf(idt);
+                U8 *outPtr = (U8 *)output +
+                    (((n * oc + c) * oh + (padParamSpec.top + h)) * ow) * alignSize * bytesOf(odt);
+                if (padParamSpec.pad_mode == Pad_Constant) {
+                    memset(outPtr, 0, padParamSpec.left * alignSize * bytesOf(odt));
+                    outPtr += padParamSpec.left * alignSize * bytesOf(odt);
+                    memcpy(outPtr, inPtr, iw * alignSize * bytesOf(idt));
+                    outPtr += iw * alignSize * bytesOf(odt);
+                    memset(outPtr, 0, padParamSpec.right * alignSize * bytesOf(odt));
+                } else {
+                    for (U32 w = 0; w < padParamSpec.left; w++) {
+                        U32 index = 0;
+                        if (padParamSpec.pad_mode == Pad_Reflect) {
+                            index = (padParamSpec.left - w) * alignSize * bytesOf(idt);
+                        } else if (padParamSpec.pad_mode == Pad_Symmetric) {
+                            index = (padParamSpec.left - w - 1) * alignSize * bytesOf(idt);
+                        }
+                        memcpy(outPtr, inPtr + index, alignSize * bytesOf(idt));
+                        outPtr += alignSize * bytesOf(idt);
+                    }
+                    memcpy(outPtr, inPtr, iw * alignSize * bytesOf(idt));
+                    outPtr += iw * alignSize * bytesOf(odt);
+                    for (U32 w = 0; w < padParamSpec.right; w++) {
+                        U32 index = (iw - 1) * alignSize * bytesOf(idt);
+                        if (padParamSpec.pad_mode == Pad_Reflect) {
+                            index = (iw - w - 2) * alignSize * bytesOf(idt);
+                        } else if (padParamSpec.pad_mode == Pad_Symmetric) {
+                            index = (iw - w - 1) * alignSize * bytesOf(idt);
+                        }
+                        memcpy(outPtr, inPtr + index, alignSize * bytesOf(idt));
+                        outPtr += alignSize * bytesOf(idt);
+                    }
+                }
+            }
+            U8 *outPtr = (U8 *)output + (((n * oc + c) * oh) * ow) * alignSize * bytesOf(odt);
+            for (U32 h = 0; h < padParamSpec.top; h++) {
+                U32 index = h * ow * alignSize * bytesOf(odt);
+                if
(padParamSpec.pad_mode == Pad_Constant) { + memset(outPtr + index, 0, ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Edge) { + memcpy(outPtr + index, + outPtr + (padParamSpec.top * ow * alignSize * bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Reflect) { + memcpy(outPtr + index, + outPtr + + ((padParamSpec.top + padParamSpec.top - h) * ow * alignSize * + bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Symmetric) { + memcpy(outPtr + index, + outPtr + + ((padParamSpec.top + padParamSpec.top - h - 1) * ow * alignSize * + bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else { + return NOT_SUPPORTED; + } + } + for (U32 h = 0; h < padParamSpec.bottom; h++) { + U32 index = (padParamSpec.top + ih + h) * ow * alignSize * bytesOf(odt); + if (padParamSpec.pad_mode == Pad_Constant) { + memset(outPtr + index, 0, ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Edge) { + memcpy(outPtr + index, + outPtr + ((padParamSpec.top + ih - 1) * ow * alignSize * bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Reflect) { + // memcpy(outPtr+index, outPtr+((padParamSpec.top+ih-2-h)*ow*alignSize*bytesOf(odt)), ow*alignSize*bytesOf(odt)); + memcpy(outPtr + index, + outPtr + + ((padParamSpec.top + ih - 1 - padParamSpec.bottom + h) * ow * + alignSize * bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == Pad_Symmetric) { + memcpy(outPtr + index, + outPtr + ((padParamSpec.top + ih - 1 - h) * ow * alignSize * bytesOf(odt)), + ow * alignSize * bytesOf(odt)); + } else { + return NOT_SUPPORTED; + } + } + } + } + return SUCCESS; +} diff --git a/tensor_computing/src/cpu/arm/split.cpp b/compute/tensor/src/cpu/power.cpp similarity index 60% rename from tensor_computing/src/cpu/arm/split.cpp rename to compute/tensor/src/cpu/power.cpp index 682ad152..cf08407e 100644 --- a/tensor_computing/src/cpu/arm/split.cpp +++ b/compute/tensor/src/cpu/power.cpp @@ -1,34 +1,30 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

+#include "cpu/tensor_computing_cpu.h"
+#include "cpu/cpu_functions.h"
-#include <string.h>
-#include <vector>
-
-#include "cpu/arm/tensor_computing_arm.h"
-
-EE split_arm(TensorDesc inputDesc, void* input,
-    std::vector<TensorDesc> outputDesc, std::vector<void*>* output)
+EE power_cpu(
+    TensorDesc inputDesc, void *input, PowerParamSpec p, TensorDesc outputDesc, void *output, Arch arch)
 {
-    UNUSED(inputDesc);
-    if (nullptr == input)
+    UNUSED(outputDesc);
+    ArrayScaleFunction scale_func = get_array_scale_function(arch);
+    ArrayPowerFunction power_func = get_array_power_function(arch);
+    if (nullptr == input || nullptr == output) {
         CHECK_STATUS(NULL_POINTER);
-    if(outputDesc.size() <= 1) return NOT_MATCH;
-
-    for(U32 i = 0; i < (*output).size(); i++) {
-        if (nullptr == (*output)[i])
-            CHECK_STATUS(NULL_POINTER);
-        memcpy((*output)[i], input, tensorNumBytes(outputDesc[i]));
     }
+
+    scale_func(inputDesc.dt, input, output, tensorNumElements(inputDesc), p.scale, p.shift);
+    power_func(outputDesc.dt, output, output, tensorNumElements(inputDesc), p.power);
     return SUCCESS;
 }
diff --git a/compute/tensor/src/cpu/priorbox.cpp b/compute/tensor/src/cpu/priorbox.cpp
new file mode 100644
index 00000000..f5f31780
--- /dev/null
+++ b/compute/tensor/src/cpu/priorbox.cpp
@@ -0,0 +1,206 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
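+// For every feature-map cell this emits: a square box per min_size, a box of side
+// sqrt(min_size * max_size) when max_sizes are set, and per aspect ratio ar a box with
+// w = min_size * sqrt(ar), h = min_size / sqrt(ar) (plus the flipped pair when flip is set).
+// Coordinates are normalized by the image size; the second half of the output holds the variances.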
+
+#include "cpu/tensor_computing_cpu.h"
+
+template <typename T>
+static EE priorbox_kernel(DataType idt0,
+    T *output,
+    U32 ih_layer,
+    U32 iw_layer,
+    U32 ih_img,
+    U32 iw_img,
+    std::vector<F32> minsizes,
+    std::vector<F32> maxsizes,
+    std::vector<F32> ars,
+    U32 flip,
+    U32 clip,
+    F32 *vars,
+    I32 imageW,
+    I32 imageH,
+    F32 stepW,
+    F32 stepH,
+    F32 offset,
+    Arch arch)
+{
+    U32 layer_w = iw_layer;
+    U32 layer_h = ih_layer;
+
+    int img_w, img_h;
+    if (imageH == 0 || imageW == 0) {
+        img_w = iw_img;
+        img_h = ih_img;
+    } else {
+        img_w = imageW;
+        img_h = imageH;
+    }
+    F32 stp_h, stp_w;
+    if (stepW == 0 || stepH == 0) {
+        stp_w = static_cast<F32>(ceil((img_w) / layer_w));
+        stp_h = static_cast<F32>(ceil((img_h) / layer_h));
+    } else {
+        stp_w = stepW;
+        stp_h = stepH;
+    }
+
+    U32 num_priorboxs = ars.size();
+    if (flip) {
+        num_priorboxs = num_priorboxs * 2;
+    }
+    U32 num_minsize = minsizes.size();
+    num_priorboxs = (num_priorboxs + 1) * num_minsize;
+    if (!maxsizes.empty()) {
+        U32 num_maxsize = maxsizes.size();
+        num_priorboxs = num_priorboxs + num_maxsize;
+    }
+    int dim = layer_h * layer_w * num_priorboxs * 4;
+    int idx = 0;
+    for (U32 h = 0; h < layer_h; h++) {
+        for (U32 w = 0; w < layer_w; w++) {
+            F32 center_x = (w + offset) * stp_w;
+            F32 center_y = (h + offset) * stp_h;
+            F32 box_w, box_h;
+            for (int n = 0; n < (int)minsizes.size(); n++) {
+                F32 minsize = minsizes[n];
+                box_w = box_h = minsize;
+                output[idx++] = (center_x - box_w / 2) / img_w;
+                output[idx++] = (center_y - box_h / 2) / img_h;
+                output[idx++] = (center_x + box_w / 2) / img_w;
+                output[idx++] = (center_y + box_h / 2) / img_h;
+
+                if ((int)maxsizes.size() > 0) {
+                    F32 maxsize = maxsizes[n];
+                    box_w = box_h = sqrt(minsize * maxsize);
+                    output[idx++] = (center_x - box_w / 2) / img_w;
+                    output[idx++] = (center_y - box_h / 2) / img_h;
+                    output[idx++] = (center_x + box_w / 2) / img_w;
+                    output[idx++] = (center_y + box_h / 2) / img_h;
+                }
+
+                for (int a = 0; a < (int)ars.size(); a++) {
+                    F32 ar = ars[a];
+                    box_w = minsize * sqrt(ar);
+                    box_h = minsize / sqrt(ar);
+                    output[idx++] = (center_x - box_w / 2) / img_w;
+                    output[idx++] = (center_y - box_h / 2) / img_h;
+                    output[idx++] = (center_x + box_w / 2) / img_w;
+                    output[idx++] = (center_y + box_h / 2) / img_h;
+                    if (flip) {
+                        output[idx++] = (center_x - box_h / 2) / img_w;
+                        output[idx++] = (center_y - box_w / 2) / img_h;
+                        output[idx++] = (center_x + box_h / 2) / img_w;
+                        output[idx++] = (center_y + box_w / 2) / img_h;
+                    }
+                }
+            }
+        }
+    }
+    EE ret = SUCCESS;
+    if (clip) {
+        ClipParamSpec p;
+        p.min = 0;
+        p.max = 1;
+        TensorDesc desc = tensor1d(idt0, dim);
+        ret = clip_cpu(desc, output, p, desc, output, arch);
+    }
+
+    for (int i = 0; i < dim / 4; i++) {
+        output[idx++] = vars[0];
+        output[idx++] = vars[1];
+        output[idx++] = vars[2];
+        output[idx++] = vars[3];
+    }
+    return ret;
+}
+
+EE priorbox_cpu(std::vector<TensorDesc> inputDesc,
+    PriorBoxParamSpec priorBoxParamSpec,
+    TensorDesc outputDesc,
+    void *output,
+    Arch arch)
+{
+    UNUSED(outputDesc);
+    if (nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    U32 num = inputDesc.size();
+    if (num != 2) {
+        return NOT_MATCH;
+    }
+    DataType idt0, idt1;
+    DataFormat idf0, idf1;
+    U32 in0 = 0, ic0 = 0, ih0 = 0, iw0 = 0;
+    U32 in1 = 0, ic1 = 0, ih1 = 0, iw1 = 0;
+    CHECK_STATUS(tensor4dGet(inputDesc[0], &idt0, &idf0, &in0, &ic0, &ih0, &iw0));
+    CHECK_STATUS(tensor4dGet(inputDesc[1], &idt1, &idf1, &in1, &ic1, &ih1, &iw1));
+
+    std::vector<F32> minsizes;
+    for (int i = 0; i < 2; i++) {
+        if (priorBoxParamSpec.min_sizes[i] == 0) {
+            break;
+        }
+        minsizes.push_back(priorBoxParamSpec.min_sizes[i]);
+    }
+    std::vector<F32> maxsizes;
+    for (int i = 0; i < 2; i++) {
+        if (priorBoxParamSpec.max_sizes[i] == 0) {
+            break;
+        }
+        maxsizes.push_back(priorBoxParamSpec.max_sizes[i]);
+    }
+    std::vector<F32> ars;
+    for (int i = 0; i < 2; i++) {
+        if (priorBoxParamSpec.aspect_ratios[i] == 0) {
+            break;
+        }
+        ars.push_back(priorBoxParamSpec.aspect_ratios[i]);
+    }
+    U32 flip = priorBoxParamSpec.flip;
+    U32 clip = priorBoxParamSpec.clip;
+    F32 vars[4];
+    for (int i = 0; i < 4; i++) {
+        vars[i] = priorBoxParamSpec.variances[i];
+    }
+    U32 imageH = priorBoxParamSpec.image_h;
+    U32 imageW = priorBoxParamSpec.image_w;
+    F32 stepH = priorBoxParamSpec.step_h;
+    F32 stepW = priorBoxParamSpec.step_w;
+    F32 offset = priorBoxParamSpec.offset;
+
+    EE ret = SUCCESS;
+    switch (idt0) {
+#ifdef _USE_FP32
+        case DT_F32:
+            ret = priorbox_kernel(idt0, (F32 *)output, ih0, iw0, ih1, iw1, minsizes, maxsizes,
+                ars, flip, clip, vars, imageW, imageH, stepW, stepH, offset, arch);
+            break;
+#endif
+#ifdef _USE_FP16
+        case DT_F16:
+            ret = priorbox_kernel(idt0, (F16 *)output, ih0, iw0, ih1, iw1, minsizes, maxsizes,
+                ars, flip, clip, vars, imageW, imageH, stepW, stepH, offset, arch);
+            break;
+#endif
+#ifdef _USE_INT8
+        case DT_I8: {
+            ret = priorbox_kernel(idt0, (F16 *)output, ih0, iw0, ih1, iw1, minsizes, maxsizes,
+                ars, flip, clip, vars, imageW, imageH, stepW, stepH, offset, arch);
+            break;
+        }
+#endif
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/cpu/reduction.cpp b/compute/tensor/src/cpu/reduction.cpp
new file mode 100644
index 00000000..5271d9fa
--- /dev/null
+++ b/compute/tensor/src/cpu/reduction.cpp
@@ -0,0 +1,198 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
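+// For DF_NCHWC8 inputs the descriptor is expanded with an extra innermost dimension of 8
+// (and C divided by 8), so a reduction over the channel axis becomes two ordinary passes.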
+
+#include <string.h>
+#include "cpu/tensor_computing_cpu.h"
+#include "cpu/cpu_functions.h"
+
+template <typename T>
+static EE reduction_kernel(TensorDesc inputDesc,
+    const T *input,
+    TensorDesc maskDesc,
+    const float *mask,
+    I32 axis,
+    ReductionMode reductionMode,
+    TensorDesc outputDesc,
+    T *output,
+    Arch arch)
+{
+    if (nullptr == input || nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+
+    ArraySumFunction sum_func = get_array_sum_function(arch);
+    ArrayMeanFunction mean_func = get_array_mean_function(arch);
+    ArrayVarFunction var_func = get_array_var_function(arch);
+    ArrayAddFunction add_func = get_array_add_function(arch);
+    ArraySquareAndAddFunction square_and_add_func = get_array_square_and_add_function(arch);
+    ArrayScaleFunction scale_func = get_array_scale_function(arch);
+
+    if (axis < 0) {
+        axis = inputDesc.nDims + axis;
+    }
+    axis = inputDesc.nDims - 1 - axis;
+    U32 loopInner = 1;
+    for (int i = 0; i < axis; i++) {
+        loopInner *= inputDesc.dims[i];
+    }
+    U32 loopOuter = 1;
+    for (U32 i = axis + 1; i < inputDesc.nDims; i++) {
+        loopOuter *= inputDesc.dims[i];
+    }
+    U32 len = inputDesc.dims[axis];
+    U32 maskLen = tensorNumElements(maskDesc);
+    maskLen = (maskLen > 0) ? maskLen : len;
+    U32 axisDim = maskLen / len;
+    for (U32 i = 0; i < loopOuter; i++) {
+        if (loopInner == 1) {
+            if (mask != nullptr) {
+                return NOT_SUPPORTED;
+            }
+            const T *array = input + i * len;
+            F32 tmpValue = 0;
+            switch (reductionMode) {
+                case REDUCTION_SUM:
+                    output[i] = sum_func(inputDesc.dt, array, len);
+                    break;
+                case REDUCTION_MEAN:
+                    output[i] = mean_func(inputDesc.dt, array, len);
+                    break;
+                case REDUCTION_STD_DEVIATION:
+                    tmpValue = mean_func(inputDesc.dt, array, len);
+                    tmpValue = var_func(inputDesc.dt, array, len, tmpValue);
+                    output[i] = sqrt(tmpValue);
+                    break;
+                case REDUCTION_SCALAR_PRODUCT:
+                    tmpValue = var_func(inputDesc.dt, array, len, 0);
+                    break;
+                default:
+                    return NOT_SUPPORTED;
+            }
+        } else {
+            CHECK_REQUIREMENT(REDUCTION_STD_DEVIATION != reductionMode);
+            for (U32 j = 0; j < maskLen; j += len) {
+                U32 axisIndex = j / len;
+                U32 outputIndex = (i * axisDim + axisIndex) * loopInner;
+                if (reductionMode == REDUCTION_SUM || reductionMode == REDUCTION_MEAN ||
+                    reductionMode == REDUCTION_SCALAR_PRODUCT) {
+                    memset(output + outputIndex, 0, loopInner * bytesOf(inputDesc.dt));
+                } else {
+                    return NOT_SUPPORTED;
+                }
+                U32 count = 0;
+                for (U32 k = 0; k < len; k++) {
+                    if (mask == nullptr || (mask != nullptr && mask[j + k] == 1)) {
+                        if (reductionMode == REDUCTION_SUM || reductionMode == REDUCTION_MEAN) {
+                            add_func(inputDesc.dt, output + outputIndex,
+                                &input[(i * len + k) * loopInner], output + outputIndex, loopInner);
+                            count++;
+                        } else if (reductionMode == REDUCTION_SCALAR_PRODUCT) {
+                            square_and_add_func(inputDesc.dt, output + outputIndex,
+                                &input[(i * len + k) * loopInner], output + outputIndex, loopInner);
+                        } else {
+                            return NOT_SUPPORTED;
+                        }
+                    }
+                }
+                if (reductionMode == REDUCTION_MEAN) {
+                    scale_func(inputDesc.dt, output + outputIndex, output + outputIndex, loopInner,
+                        1.0 / count, 0);
+                }
+            }
+        }
+    }
+    return SUCCESS;
+}
+
+EE reduction_cpu(TensorDesc inputDesc,
+    const void *input,
+    TensorDesc maskDesc,
+    const void *mask,
+    ReductionParamSpec p,
+    int tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    Arch arch)
+{
+    EE ret = SUCCESS;
+    ArrayScaleFunction scale_func = get_array_scale_function(arch);
+    int start = 0;
+    TensorDesc tmpDesc = inputDesc;
+    if (inputDesc.df == DF_NCHWC8) {
+        for (int i = 0; i < p.axes_num; i++) {
+            // channel dimension
+            if (p.axes[i] == 1 || p.axes[i] == -3) {
+                start = -1;
+                break;
+            }
+        }
+        for (int i = (int)inputDesc.nDims - 1; i >= 0; i--) {
+            tmpDesc.dims[i + 1] = tmpDesc.dims[i];
+        }
+        tmpDesc.dims[3] /= 8;
+        tmpDesc.dims[0] = 8;
+        tmpDesc.nDims += 1;
+    }
+    const void *tmp1 = input;
+    void *tmp2 = nullptr;
+    for (int i = start; i < p.axes_num; i++) {
+        if (p.axes_num - start == 1) {
+            tmp2 = output;
+        } else {
+            tmp2 = (char *)tmp + (i - start) % 2 * (tmpBytes / 2);
+        }
+        int axis;
+        if (i == -1) {
+            axis = 4;
+        } else {
+            axis = p.axes[i];
+        }
+
+        switch (inputDesc.dt) {
+#ifdef _USE_FP32
+            case DT_F32: {
+                ret = reduction_kernel(tmpDesc, (const F32 *)tmp1, maskDesc,
+                    (const float *)mask, axis, p.reduction_mode, outputDesc, (F32 *)tmp2, arch);
+                break;
+            }
+#endif
+#ifdef _USE_FP16
+            case DT_F16: {
+                ret = reduction_kernel(tmpDesc, (const F16 *)tmp1, maskDesc,
+                    (const float *)mask, axis, p.reduction_mode, outputDesc, (F16 *)tmp2, arch);
+                break;
+            }
+#endif
+            default:
+                ret = NOT_SUPPORTED;
+                break;
+        }
+        tmp1 = tmp2;
+        if (axis < 0) {
+            axis = tmpDesc.nDims + axis;
+        }
+        axis = tmpDesc.nDims - 1 - axis;
+        tmpDesc.dims[axis] = 1;
+    }
+
+    if (tmp2 != output) {
+        memcpy(output, tmp2, tensorNumBytes(outputDesc));
+    }
+
+    if (p.coeff != 1) {
+        scale_func(outputDesc.dt, output, output, tensorNumElements(outputDesc), p.coeff, 0);
+    }
+
+    return ret;
+}
diff --git a/compute/tensor/src/cpu/reshape.cpp b/compute/tensor/src/cpu/reshape.cpp
new file mode 100644
index 00000000..58c2641c
--- /dev/null
+++ b/compute/tensor/src/cpu/reshape.cpp
@@ -0,0 +1,120 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <string.h>
+#include "cpu/tensor_computing_cpu.h"
+
+EE reshape_infer_output_size_cpu(TensorDesc inputDesc, ReshapeParamSpec p, TensorDesc *outputDesc)
+{
+    if (nullptr == outputDesc) {
+        return NULL_POINTER;
+    }
+    I32 *shape = p.shape_dims;
+    I32 shape_size = p.shape_size;
+    int inputElementNum = tensorNumElements(inputDesc);
+    int outputElementNum = 1;
+    for (int i = 0; i < shape_size; i++) {
+        outputElementNum *= shape[i];
+    }
+    int index_range = ((int)inputDesc.nDims > shape_size) ?
shape_size : inputDesc.nDims; + if (inputElementNum > 0 && outputElementNum > 0 && inputElementNum != outputElementNum) { + for (int i = 0; i < index_range; i++) { + if ((inputElementNum / (int)inputDesc.dims[inputDesc.nDims - 1 - i]) == + (outputElementNum / shape[i])) { + shape[i] = inputDesc.dims[inputDesc.nDims - 1 - i]; + break; + } + } + } + + *outputDesc = inputDesc; + (*outputDesc).nDims = shape_size; + if (shape_size == 2) { + (*outputDesc).df = DF_NORMAL; + } + if (shape_size >= 4) { + (*outputDesc).df = DF_NCHW; + } + + U32 factor = 1; + I32 count = 0; + for (I32 i = 0; i < shape_size; i++) { + I32 value = shape[i]; + if (value == 0) { + value = inputDesc.dims[inputDesc.nDims - 1 - i]; + } + if (value == -1) { + value = 0; + count++; + } else { + factor *= value; + } + + (*outputDesc).dims[shape_size - 1 - i] = value; + } + if (count > 1) { + return NOT_SUPPORTED; + } + + for (I32 i = 0; i < shape_size; i++) { + if ((*outputDesc).dims[i] == 0) { + (*outputDesc).dims[i] = tensorNumElements(inputDesc) / factor; + } + } + + return SUCCESS; +} + +EE reshape_cpu(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + if (tensorNumElements(inputDesc) != tensorNumElements(outputDesc)) { + // Only allow the removal of padded convolution channels + CHECK_REQUIREMENT(DF_NCHWC8 == inputDesc.df); + CHECK_REQUIREMENT(tensorNumElements(inputDesc) >= tensorNumElements(outputDesc)); + inputDesc.df = DF_NCHW; + } + if (DF_NCHWC8 != inputDesc.df) { + if (output != input) { + memcpy(output, input, tensorNumBytes(outputDesc)); + } + } else { + CHECK_REQUIREMENT(input != output); + DataType idt; + DataFormat idf; + U32 in, ic, ih, iw; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + + U32 elementBytes = bytesOf(idt); + ic /= 8; + U8 *inPtr = (U8 *)input; + U8 *outPtr = (U8 *)output; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 hw = 0; hw < ih * iw; hw++) { + for (U32 c8 = 0; c8 < 8; c8++) { + memcpy(outPtr + + elementBytes * (n * ic * 8 * ih * iw + (c * 8 + c8) * ih * iw + hw), + inPtr + + elementBytes * (n * ic * ih * iw * 8 + c * ih * iw * 8 + hw * 8 + c8), + elementBytes); + } + } + } + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/rnn.cpp b/compute/tensor/src/cpu/rnn.cpp new file mode 100644 index 00000000..3a7d9465 --- /dev/null +++ b/compute/tensor/src/cpu/rnn.cpp @@ -0,0 +1,273 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <string.h>
+#include "cpu/tensor_computing_cpu.h"
+#ifdef _USE_GENERAL
+#include "cpu/general/tensor_computing_general.h"
+#endif
+#ifdef _USE_X86
+#include "cpu/x86/tensor_computing_x86.h"
+#endif
+#ifdef _USE_NEON
+#include "cpu/arm/tensor_computing_arm.h"
+#endif
+#include "blas_enhance.h"
+
+template <typename T>
+static EE rnn_transform_filter(TensorDesc filterDesc,
+    const T *filterArray,
+    RNNParamSpec rnnParamSpec,
+    TensorDesc *ftmDesc,
+    T *ftmArray)
+{
+    if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    DataType fdt;
+    DataFormat fdf;
+    U32 fn, fk, ftm_n, ftm_k;
+    CHECK_STATUS(tensor2dGet(filterDesc, &fdt, &fdf, &fn, &fk));
+    U32 alignSize = 32;
+    EE ret = SUCCESS;
+    switch (fdf) {
+        case DF_NKN32: {
+            ftm_n = fn;
+            ftm_k = fk;
+            break;
+        }
+        case DF_NK: {
+            // NK => NKN32
+            if (fn % alignSize != 0) {
+                return NOT_MATCH;
+            }
+            ftm_n = fn / alignSize;
+            ftm_k = fk;
+            for (U32 n = 0; n < ftm_n; n++) {
+                for (U32 k = 0; k < ftm_k; k++) {
+                    for (U32 n32 = 0; n32 < alignSize; n32++) {
+                        ftmArray[n * ftm_k * alignSize + k * alignSize + n32] =
+                            filterArray[(n * alignSize + n32) * ftm_k + k];
+                    }
+                }
+            }
+            break;
+        }
+        default:
+            ret = NOT_MATCH;
+            break;
+    }
+    *ftmDesc = tensor2df(fdt, DF_NKN32, fn, fk);
+    return ret;
+}
+
+static EE rnn_transform_filter_cpu_kernel(TensorDesc filterDesc,
+    const void *filterArray,
+    RNNParamSpec rnnParamSpec,
+    TensorDesc *ftmDesc,
+    void *ftmArray)
+{
+    EE ret = SUCCESS;
+    switch (filterDesc.dt) {
+#ifdef _USE_FP32
+        case DT_F32: {
+            ret = rnn_transform_filter(
+                filterDesc, (const F32 *)filterArray, rnnParamSpec, ftmDesc, (F32 *)ftmArray);
+            break;
+        }
+#endif
+#ifdef _USE_FP16
+        case DT_F16: {
+            ret = rnn_transform_filter(
+                filterDesc, (const F16 *)filterArray, rnnParamSpec, ftmDesc, (F16 *)ftmArray);
+            break;
+        }
+#endif
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
+
+EE rnn_transform_filter_cpu(const TensorDesc *filterDesc,
+    const void **filterArray,
+    RNNParamSpec rnnParamSpec,
+    TensorDesc *ftmDesc,
+    void **ftmArray)
+{
+    int num1 = rnnParamSpec.biDirection ? 2 : 1;
+    int num2 = rnnParamSpec.numProjection > 0 ? 2 : 1;
+    EE ret = SUCCESS;
+    for (int i = 0; i < num1 * num2; i++) {
+        ret = rnn_transform_filter_cpu_kernel(
+            filterDesc[i], filterArray[i], rnnParamSpec, &ftmDesc[i], ftmArray[i]);
+    }
+    return ret;
+}
+
+EE rnn_transform_filter_bytes_cpu(
+    const TensorDesc *filterDesc, RNNParamSpec rnnParamSpec, U32 *bytes)
+{
+    if (nullptr == bytes) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    int num1 = rnnParamSpec.biDirection ? 2 : 1;
+    int num2 = rnnParamSpec.numProjection > 0 ? 2 : 1;
+    for (int i = 0; i < num1 * num2; i++) {
+        bytes[i] = tensorNumBytes(filterDesc[i]);
+    }
+    return SUCCESS;
+}
+
+EE rnncell_infer_forward_tmp_bytes_cpu(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    TensorDesc outputDesc,
+    RNNParamSpec rnnParamSpec,
+    U32 *bytes,
+    Arch arch)
+{
+    UNUSED(outputDesc);
+    if (nullptr == bytes) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    DataType idt;
+    DataFormat idf;
+    U32 batch, xDim;
+    CHECK_STATUS(tensor2dGet(inputDesc, &idt, &idf, &batch, &xDim));
+    U32 hDim = rnnParamSpec.numOutput;
+    U32 column = (rnnParamSpec.numProjection > 0) ?
rnnParamSpec.numProjection + : rnnParamSpec.numOutput; + *bytes = (hDim + xDim + column * 4) * bytesOf(idt); + return SUCCESS; +} + +EE rnn_infer_forward_tmp_bytes_cpu(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + RNNParamSpec rnnParamSpec, + U32 *bytes, + Arch arch) +{ + UNUSED(filterDesc); + UNUSED(outputDesc); + if (nullptr == bytes) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt; + DataFormat idf; + U32 batch, step, xDim; + CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &batch, &step, &xDim)); + U32 hDim = rnnParamSpec.numOutput; + TensorDesc xDesc = tensor2df(idt, DF_NORMAL, batch, xDim); + CHECK_STATUS(rnncell_infer_forward_tmp_bytes_cpu( + xDesc, filterDesc, outputDesc, rnnParamSpec, bytes, arch)); + U32 column = (rnnParamSpec.numProjection > 0) ? rnnParamSpec.numProjection + : rnnParamSpec.numOutput; + *bytes += batch * (column + hDim) * bytesOf(idt); + return SUCCESS; +} + +EE rnncell_cpu(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + void *state, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + U32 tmpBytes, + void *tmp, + TensorDesc hDesc, + void *currentH, + Arch arch) +{ + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = rnncell_general(xDesc, currentX, filterDesc, filter, biasDesc, bias, state, tmpBytes, + tmp, rnnParamSpec, batchStrideX, batchStrideH, hDesc, currentH); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = rnncell_x86(xDesc, currentX, filterDesc, filter, biasDesc, bias, state, tmpBytes, tmp, + rnnParamSpec, batchStrideX, batchStrideH, hDesc, currentH, arch); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = rnncell_arm(xDesc, currentX, filterDesc, filter, biasDesc, bias, state, tmpBytes, tmp, + rnnParamSpec, batchStrideX, batchStrideH, hDesc, currentH, arch); +#endif + } + return ret; +} + +EE rnn_cpu(TensorDesc inputDesc, + const void *input, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + RNNParamSpec rnnParamSpec, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + Arch arch) +{ + UNUSED(outputDesc); + + if (nullptr == input || nullptr == filter || nullptr == bias || nullptr == tmp || + nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + DataType idt; + DataFormat idf; + U32 batch, step, xDim; + int num1 = rnnParamSpec.biDirection ? 2 : 1; + CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &batch, &step, &xDim)); + U32 hDim = rnnParamSpec.numOutput; + U32 column = (rnnParamSpec.numProjection > 0) ? 
rnnParamSpec.numProjection + : rnnParamSpec.numOutput; + + U8 *cellState = (U8 *)tmp; + U8 *tmpArray = cellState + batch * (column + hDim) * bytesOf(idt); + U32 batchStrideX = step * xDim; + U32 batchStrideH = step * hDim * num1; + TensorDesc xDesc = tensor2df(idt, DF_NORMAL, batch, xDim); + TensorDesc hDesc = tensor2df(idt, DF_NORMAL, batch, hDim); + + memset(cellState, 0, batch * (column + hDim) * bytesOf(idt)); + for (U32 t = 0; t < step; t++) { + const U8 *currentX = (const U8 *)input + t * xDim * bytesOf(idt); + U8 *currentH = (U8 *)output + t * hDim * num1 * bytesOf(idt); + CHECK_STATUS(rnncell_cpu(xDesc, currentX, filterDesc, filter, biasDesc, bias, cellState, + rnnParamSpec, batchStrideX, batchStrideH, tmpBytes, tmpArray, hDesc, currentH, arch)); + } + + if (rnnParamSpec.biDirection) { + memset(cellState, 0, batch * (column + hDim) * bytesOf(idt)); + int num2 = (rnnParamSpec.numProjection > 0) ? 2 : 1; + for (I32 t = step - 1; t >= 0; t--) { + const U8 *currentX = (const U8 *)input + t * xDim * bytesOf(idt); + U8 *currentH = (U8 *)output + t * hDim * num1 * bytesOf(idt) + hDim * bytesOf(idt); + CHECK_STATUS(rnncell_cpu(xDesc, currentX, &filterDesc[num2], &filter[num2], + &biasDesc[num2], &bias[num2], cellState, rnnParamSpec, batchStrideX, batchStrideH, + tmpBytes, tmpArray, hDesc, currentH, arch)); + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/roialign.cpp b/compute/tensor/src/cpu/roialign.cpp new file mode 100644 index 00000000..42969206 --- /dev/null +++ b/compute/tensor/src/cpu/roialign.cpp @@ -0,0 +1,170 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
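+// ROI Align sketch: ROI corners are scaled by spatial_scale; each output bin averages a
+// bin_grid_h x bin_grid_w grid of bilinearly interpolated samples from the feature map.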
+
+#include "tensor_computing_type.h"
+#include "cpu/tensor_computing_cpu.h"
+
+template <typename T>
+static F32 bilinear_interpolate(T *data, U32 w, U32 h, F32 x, F32 y)
+{
+    if (y < -1.0 || y > h || x < -1.0 || x > w) {
+        return 0;
+    }
+    if (y <= 0) {
+        y = 0;
+    }
+    if (x <= 0) {
+        x = 0;
+    }
+
+    U32 x0 = x;
+    U32 x1 = x0 + 1;
+    U32 y0 = y;
+    U32 y1 = y0 + 1;
+
+    F32 hx = x1 - x;
+    F32 lx = x - x0;
+    F32 hy = y1 - y;
+    F32 ly = y - y0;
+
+    if (x1 >= w) {
+        x1 = w - 1;
+        hx = 1.f;
+        lx = 0.f;
+    }
+    if (y1 >= h) {
+        y1 = h - 1;
+        hy = 1.f;
+        ly = 0.f;
+    }
+
+    F32 r0 = data[y0 * w + x0] * hx + data[y0 * w + x1] * lx;
+    F32 r1 = data[y1 * w + x0] * hx + data[y1 * w + x1] * lx;
+
+    F32 val = r0 * hy + r1 * ly;
+    return val;
+}
+
+template <typename T>
+static EE roialign_kernel(std::vector<void *> input,
+    T *output,
+    std::vector<TensorDesc> inputDesc,
+    U32 output_h,
+    U32 output_w,
+    U32 sampling_ratio,
+    F32 spatial_scale)
+{
+    DataType idt0, idt1;
+    DataFormat idf0, idf1;
+    U32 in0, ic0, ih0, iw0;
+    U32 ih1, iw1;
+    CHECK_STATUS(tensor4dGet(inputDesc[0], &idt0, &idf0, &in0, &ic0, &ih0, &iw0));
+    CHECK_STATUS(tensor2dGet(inputDesc[1], &idt1, &idf1, &ih1, &iw1));
+    T *feature_map = (T *)input[0];
+    T *rois = (T *)input[1];
+    CHECK_REQUIREMENT(idf0 == DF_NCHWC8 || idf0 == DF_NCHW);
+    if (inputDesc[0].df == DF_NCHWC8) {
+        T *tmp = (T *)malloc(tensorNumBytes(inputDesc[0]));
+        memcpy(tmp, feature_map, tensorNumBytes(inputDesc[0]));
+        CHECK_STATUS(transformToNCHW(inputDesc[0], tmp, inputDesc[0], feature_map));
+        free(tmp);
+    }
+
+    U32 channel = ic0;
+    U32 feature_w = iw0;
+    U32 feature_h = ih0;
+    U32 num_rois = ih1;
+    for (U32 n = 0; n < num_rois; n++) {
+        U32 idx_n = n * channel * output_w * output_h;
+        F32 roi_start_x1 = static_cast<F32>(rois[n * 4]) * spatial_scale;
+        F32 roi_start_y1 = static_cast<F32>(rois[n * 4 + 1]) * spatial_scale;
+        F32 roi_end_x2 = static_cast<F32>(rois[n * 4 + 2]) * spatial_scale;
+        F32 roi_end_y2 = static_cast<F32>(rois[n * 4 + 3]) * spatial_scale;
+
+        F32 roi_w = std::max(roi_end_x2 - roi_start_x1, 1.f);
+        F32 roi_h = std::max(roi_end_y2 - roi_start_y1, 1.f);
+
+        F32 bin_size_w = roi_w / static_cast<F32>(output_w);
+        F32 bin_size_h = roi_h / static_cast<F32>(output_h);
+
+        U32 bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_w / output_w);
+        U32 bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_h / output_h);
+
+        F32 count = bin_grid_h * bin_grid_w;
+        for (U32 c = 0; c < channel; c++) {
+            U32 idx_nc = idx_n + c * output_h * output_w;
+            T *feature_map_offset = feature_map + c * feature_h * feature_w;
+            for (U32 ph = 0; ph < output_h; ph++) {
+                for (U32 pw = 0; pw < output_w; pw++) {
+                    U32 idx = idx_nc + ph * output_w + pw;
+                    F32 output_val = 0;
+                    F32 start_x = roi_start_x1 + pw * bin_size_w;
+                    F32 start_y = roi_start_y1 + ph * bin_size_h;
+                    start_x = std::min(std::max(start_x, 0.f), (F32)feature_w);
+                    start_y = std::min(std::max(start_y, 0.f), (F32)feature_h);
+                    for (U32 by = 0; by < bin_grid_h; by++) {
+                        F32 y = start_y +
+                            static_cast<F32>(by + 0.5f) * bin_size_h / static_cast<F32>(bin_grid_h);
+                        for (U32 bx = 0; bx < bin_grid_w; bx++) {
+                            F32 x = start_x +
+                                static_cast<F32>(bx + 0.5f) * bin_size_w /
+                                    static_cast<F32>(bin_grid_w);
+                            F32 val = bilinear_interpolate(
+                                (T *)feature_map_offset, feature_w, feature_h, x, y);
+                            output_val += val;
+                        }
+                    }
+                    output_val /= count;
+                    output[idx] = output_val;
+                }
+            }
+        }
+    }
+
+    return SUCCESS;
+}
+
+EE roialign_cpu(std::vector<TensorDesc> inputDesc,
+    std::vector<void *> input,
+    RoiAlignParamSpec roiAlignParamSpec,
+    TensorDesc outputDesc,
+    void *output)
+{
+    UNUSED(outputDesc);
+    if (nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    U32 output_h = roiAlignParamSpec.output_h;
+    U32 output_w = roiAlignParamSpec.output_w;
+    U32 sampling_ratio = roiAlignParamSpec.sampling_ratio;
+    F32 spatial_scale = roiAlignParamSpec.spatial_scale;
+    EE ret = SUCCESS;
+    switch (inputDesc[0].dt) {
+#ifdef _USE_FP32
+        case DT_F32:
+            ret = roialign_kernel(
+                input, (F32 *)output, inputDesc, output_h, output_w, sampling_ratio, spatial_scale);
+            break;
+#endif
+#ifdef _USE_FP16
+        case DT_F16:
+            ret = roialign_kernel(
+                input, (F16 *)output, inputDesc, output_h, output_w, sampling_ratio, spatial_scale);
+            break;
+#endif
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
diff --git a/tensor_computing/src/cpu/arm/slice.cpp b/compute/tensor/src/cpu/slice.cpp
similarity index 73%
rename from tensor_computing/src/cpu/arm/slice.cpp
rename to compute/tensor/src/cpu/slice.cpp
index 106ea87a..72b59cef 100644
--- a/tensor_computing/src/cpu/arm/slice.cpp
+++ b/compute/tensor/src/cpu/slice.cpp
@@ -1,33 +1,36 @@
 // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.

-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
 // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+EE roialign_cpu(std::vector<TensorDesc> inputDesc,
+    std::vector<void *> input,
+    RoiAlignParamSpec roiAlignParamSpec,
+    TensorDesc outputDesc,
+    void *output)
+{
+    UNUSED(outputDesc);
+    if (nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    U32 output_h = roiAlignParamSpec.output_h;
+    U32 output_w = roiAlignParamSpec.output_w;
+    U32 sampling_ratio = roiAlignParamSpec.sampling_ratio;
+    F32 spatial_scale = roiAlignParamSpec.spatial_scale;
+    EE ret = SUCCESS;
+    switch (inputDesc[0].dt) {
+#ifdef _USE_FP32
+        case DT_F32:
+            ret = roialign_kernel<F32>(
+                input, (F32 *)output, inputDesc, output_h, output_w, sampling_ratio, spatial_scale);
+            break;
+#endif
+#ifdef _USE_FP16
+        case DT_F16:
+            ret = roialign_kernel<F16>(
+                input, (F16 *)output, inputDesc, output_h, output_w, sampling_ratio, spatial_scale);
+            break;
+#endif
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
diff --git a/tensor_computing/src/cpu/arm/slice.cpp b/compute/tensor/src/cpu/slice.cpp
similarity index 73%
rename from tensor_computing/src/cpu/arm/slice.cpp
rename to compute/tensor/src/cpu/slice.cpp
index 106ea87a..72b59cef 100644
--- a/tensor_computing/src/cpu/arm/slice.cpp
+++ b/compute/tensor/src/cpu/slice.cpp
@@ -1,33 +1,36 @@
 // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), 
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
 // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
 #include <string.h>
 #include <vector>
-#include "cpu/arm/tensor_computing_arm.h"
-
+#include "cpu/tensor_computing_cpu.h"
 
-EE slice_arm(TensorDesc inputDesc, void* input,
-    int axis,
-    std::vector<TensorDesc> outputDesc, std::vector<void*>* output)
+EE slice_cpu(TensorDesc inputDesc,
+    void *input,
+    SliceParamSpec p,
+    std::vector<TensorDesc> outputDesc,
+    std::vector<void *> *output)
 {
-    if (nullptr == input || nullptr == output)
+    if (nullptr == input || nullptr == output) {
         CHECK_STATUS(NULL_POINTER);
+    }
     U32 num = outputDesc.size();
-    if (num < 1) return NOT_MATCH;
+    if (num < 1) {
+        return NOT_MATCH;
+    }
 
     int dim = inputDesc.nDims;
-    axis = (axis + dim) % dim;
+    int axis = (p.axis + dim) % dim;
     axis = dim - 1 - axis;
     U32 tileSize = bytesOf(inputDesc.dt);
     for (I32 i = 0; i < axis; i++) {
@@ -49,9 +52,10 @@ EE slice_arm(TensorDesc inputDesc, void* input,
     for (U32 i = 0; i < loops; i++) {
         for (U32 j = 0; j < num; j++) {
             U32 blockSize = outputDesc[j].dims[axis] * tileSize;
-            if (blockSize > 0 && nullptr == (*output)[j])
+            if (blockSize > 0 && nullptr == (*output)[j]) {
                 CHECK_STATUS(NULL_POINTER);
-            U8* dstPtr = (U8*)((*output)[j]) + i * blockSize;
+            }
+            U8 *dstPtr = (U8 *)((*output)[j]) + i * blockSize;
             memcpy(dstPtr, ptr, blockSize);
             ptr += blockSize;
         }
diff --git a/tensor_computing/src/cpu/general/split.cpp b/compute/tensor/src/cpu/split.cpp
similarity index 68%
rename from tensor_computing/src/cpu/general/split.cpp
rename to compute/tensor/src/cpu/split.cpp
index 5c5f4dec..38d25cb5 100644
--- a/tensor_computing/src/cpu/general/split.cpp
+++ b/compute/tensor/src/cpu/split.cpp
@@ -1,33 +1,38 @@
 // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), 
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
 // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
 #include <string.h>
 #include <vector>
-#include "cpu/general/tensor_computing_general.h"
+#include "cpu/tensor_computing_cpu.h"
 
-EE split_general(TensorDesc inputDesc, void* input,
-    std::vector<TensorDesc> outputDesc, std::vector<void*>* output)
+EE split_cpu(TensorDesc inputDesc,
+    void *input,
+    std::vector<TensorDesc> outputDesc,
+    std::vector<void *> *output)
 {
     UNUSED(inputDesc);
-    if (nullptr == input || nullptr == output)
+    if (nullptr == input || nullptr == output) {
         CHECK_STATUS(NULL_POINTER);
-    if(outputDesc.size() <= 1) return NOT_MATCH;
+    }
+    if (outputDesc.size() <= 1) {
+        return NOT_MATCH;
+    }
 
-    for(U32 i = 0; i < (*output).size(); i++) {
-        if (nullptr == (*output)[i])
+    for (U32 i = 0; i < (*output).size(); i++) {
+        if (nullptr == (*output)[i]) {
             CHECK_STATUS(NULL_POINTER);
+        }
         memcpy((*output)[i], input, tensorNumBytes(outputDesc[i]));
     }
     return SUCCESS;
diff --git a/compute/tensor/src/cpu/tensor_computing_cpu.h b/compute/tensor/src/cpu/tensor_computing_cpu.h
new file mode 100644
index 00000000..4ce621d4
--- /dev/null
+++ b/compute/tensor/src/cpu/tensor_computing_cpu.h
@@ -0,0 +1,286 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
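+// Architecture-independent CPU entry points shared by the backend-specific
+// (ARM / x86 / general) implementations.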
+
+#ifndef _H_TENSOR_COMPUTING_CPU
+#define _H_TENSOR_COMPUTING_CPU
+
+#include "sys.h"
+#include "types.h"
+
+EE rnn_transform_filter_cpu(const TensorDesc *filterDescs,
+    const void **filterArray,
+    RNNParamSpec rnnParamSpec,
+    TensorDesc *ftmDesc,
+    void **ftmArray);
+
+EE rnn_transform_filter_bytes_cpu(
+    const TensorDesc *filterDesc, RNNParamSpec rnnParamSpec, U32 *bytes);
+
+EE rnncell_infer_forward_tmp_bytes_cpu(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    TensorDesc outputDesc,
+    RNNParamSpec rnnParamSpec,
+    U32 *bytes,
+    Arch arch);
+
+EE rnn_infer_forward_tmp_bytes_cpu(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    TensorDesc outputDesc,
+    RNNParamSpec rnnParamSpec,
+    U32 *bytes,
+    Arch arch);
+
+EE rnncell_cpu(TensorDesc xDesc,
+    const void *currentX,
+    const TensorDesc *filterDesc,
+    const void **filter,
+    const TensorDesc *biasDesc,
+    const void **bias,
+    void *state,
+    RNNParamSpec rnnParamSpec,
+    U32 batchStrideX,
+    U32 batchStrideH,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc hDesc,
+    void *currentH,
+    Arch arch);
+
+EE rnn_cpu(TensorDesc inputDesc,
+    const void *input,
+    const TensorDesc *filterDesc,
+    const void **filter,
+    const TensorDesc *biasDesc,
+    const void **bias,
+    RNNParamSpec rnnParamSpec,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    Arch arch);
+
+EE embedding_cpu(TensorDesc inputDesc,
+    void *input,
+    void *weight,
+    EmbedParamSpec p,
+    TensorDesc outputDesc,
+    void *output);
+
+EE tfslice_infer_output_size_cpu(TensorDesc inputDesc, TfSliceParamSpec p, TensorDesc *outputDesc);
+
+EE tfslice_cpu(
+    TensorDesc inputDesc, void *input, TfSliceParamSpec p, TensorDesc outputDesc, void *output);
+
+EE padding_infer_output_size_cpu(
+    TensorDesc inputDesc, PadParamSpec padParamSpec, TensorDesc *outputDesc);
+
+EE padding_cpu(TensorDesc inputDesc,
+    const void *input,
+    PadParamSpec padParamSpec,
+    TensorDesc outputDesc,
+    void *output);
+
+EE reshape_infer_output_size_cpu(TensorDesc inputDesc, ReshapeParamSpec p, TensorDesc *outputDesc);
+
+EE reshape_cpu(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output);
+
+EE depthwise_convolution_transform_filter_bytes_cpu(
+    TensorDesc filterDesc, DepthwiseConvolutionForwardAlgorithm algorithm, U32 *bytes);
+
+EE eltwise_cpu(std::vector<TensorDesc> inputDesc,
+    std::vector<void *> input,
+    EltwiseParamSpec eltwiseDesc,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    Arch arch);
+
+EE roialign_cpu(std::vector<TensorDesc> inputDesc,
+    std::vector<void *> input,
+    RoiAlignParamSpec roiAlignParamSpec,
+    TensorDesc outputDesc,
+    void *output);
+
+EE split_cpu(TensorDesc inputDesc,
+    void *input,
+    std::vector<TensorDesc> outputDesc,
+    std::vector<void *> *output);
+
+EE transpose_cpu(
+    TensorDesc inputDesc, const void *input, U32 *dim, TensorDesc outputDesc, void *output);
+
+EE reduction_cpu(TensorDesc inputDesc,
+    const void *input,
+    TensorDesc maskDesc,
+    const void *mask,
+    ReductionParamSpec p,
+    int tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    Arch arch);
+
+EE non_max_suppression_cpu(std::vector<TensorDesc> inputDesc,
+    std::vector<void *> input,
+    NonMaxSuppressionParamSpec nonMaxSuppressionParamSpec,
+    TensorDesc outputDesc,
+    void *output);
+
+EE concat_cpu(std::vector<TensorDesc> inputDesc,
+    std::vector<void *> input,
+    void *inputScale,
+    ConcatParamSpec p,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    void *outputScale);
+
+EE l2normalization_cpu(
+    TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output, Arch arch);
+
+EE power_cpu(TensorDesc inputDesc,
+    void *input,
+    PowerParamSpec p,
+    TensorDesc outputDesc,
+    void *output,
+    Arch arch);
+
+EE slice_cpu(TensorDesc inputDesc,
+    void *input,
+    SliceParamSpec p,
+    std::vector<TensorDesc> outputDesc,
+    std::vector<void *> *output);
+
+EE priorbox_cpu(std::vector<TensorDesc> inputDesc,
+    PriorBoxParamSpec priorBoxParamSpec,
+    TensorDesc outputDesc,
+    void *output,
+    Arch arch);
+
+EE clip_cpu(TensorDesc inputDesc,
+    void *input,
+    ClipParamSpec p,
+    TensorDesc outputDesc,
+    void *output,
+    Arch arch);
+
+EE detectionoutput_cpu(std::vector<TensorDesc> inputDesc,
+    std::vector<void *> input,
+    DetectionOutputParamSpec detectionOutputParamSpec,
+    TensorDesc outputDesc,
+    void *output);
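+// Deconvolution follows the usual five-stage operator flow: choose an
+// algorithm, query filter-transform bytes, transform the filter, query
+// scratch (tmp) bytes, then compute.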
+
+EE deconvolution_infer_forward_algorithm_cpu(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    TensorDesc outputDesc,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionPolicy policy,
+    ConvolutionForwardAlgorithm *algorithm,
+    DataType targetDataType,
+    Arch arch);
+
+EE deconvolution_transform_filter_bytes_cpu(TensorDesc filterDesc,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionForwardAlgorithm algorithm,
+    U32 *bytes,
+    Arch arch);
+
+EE deconvolution_transform_filter_cpu(TensorDesc filterDesc,
+    const void *filter,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionForwardAlgorithm algorithm,
+    TensorDesc *ftmDesc,
+    void *filterTransformed,
+    Arch arch);
+
+EE deconvolution_infer_forward_tmp_bytes_cpu(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    TensorDesc outputDesc,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionForwardAlgorithm algorithm,
+    U32 *bytes,
+    Arch arch);
+
+EE deconvolution_cpu(TensorDesc inputDesc,
+    void *input,
+    TensorDesc filterDesc,
+    const void *filter,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionForwardAlgorithm algorithm,
+    TensorDesc scaleDesc,
+    const void *scale,
+    TensorDesc biasDesc,
+    const void *bias,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    ActivationParamSpec activationDesc,
+    Arch arch);
+
+EE convolution_cpu(TensorDesc inputDesc,
+    void *input,
+    TensorDesc filterDesc,
+    const void *filter,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionForwardAlgorithm algorithm,
+    TensorDesc scaleDesc,
+    const void *scale,
+    TensorDesc biasDesc,
+    const void *bias,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    ActivationParamSpec activationDesc,
+    Arch arch);
+
+EE depthwise_pointwise_convolution_cpu(TensorDesc inputDesc,
+    void *input,
+    TensorDesc dwFilterDesc,
+    const void *dwFilter,
+    TensorDesc pwFilterDesc,
+    const void *pwFilter,
+    ConvolutionParamSpec convParamSpec,
+    DepthwiseConvolutionForwardAlgorithm algorithm,
+    TensorDesc dwBiasDesc,
+    const void *dwBias,
+    TensorDesc pwBiasDesc,
+    const void *pwBias,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    ActivationParamSpec depthwiseActivationParamSpec,
+    ActivationParamSpec pointwiseActivationParamSpec,
+    Arch arch);
+
+EE activation_cpu(TensorDesc inputDesc,
+    void *input,
+    ActivationParamSpec activationDesc,
+    TensorDesc outputDesc,
+    void *output,
+    Arch arch);
+
+EE yolov3detectionoutput_cpu(std::vector<TensorDesc> inputDesc,
+    std::vector<void *> input,
+    Yolov3DetectionOutputParamSpec yolov3DetectionOutputParamSpec,
+    TensorDesc outputDesc,
+    void *output,
+    Arch arch);
+
+EE argmax_cpu(
+    TensorDesc inputDesc, const void *input, ArgMaxParamSpec p, TensorDesc outputDesc, void *output);
+
+#endif
diff --git a/compute/tensor/src/cpu/tfslice.cpp b/compute/tensor/src/cpu/tfslice.cpp
new file mode 100644
index 00000000..b72cc230
--- /dev/null
+++ b/compute/tensor/src/cpu/tfslice.cpp
@@ -0,0 +1,131 @@
+// Copyright (C) 2019.
Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/tensor_computing_cpu.h" + +EE tfslice_infer_output_size_cpu(TensorDesc inputDesc, TfSliceParamSpec p, TensorDesc *outputDesc) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + int *begin = p.begin; + int *end = p.end; + int *strides = p.strides; + char *beginMask = p.begin_mask; + char *endMask = p.end_mask; + U32 dimSize = p.dim_size; + + CHECK_REQUIREMENT(dimSize == inputDesc.nDims); + *outputDesc = inputDesc; + for (U32 i = 0; i < dimSize; i++) { + int axis = dimSize - 1 - i; + int axisBegin = (beginMask[i] == 1) ? 0 : begin[i]; + int axisEnd = (endMask[i] == 1) ? inputDesc.dims[axis] : end[i]; + int num = (axisEnd - axisBegin) / strides[i]; + outputDesc->dims[axis] = num; + begin[i] = axisBegin; + end[i] = axisEnd; + } + if (inputDesc.df == DF_NCHWC8) { + int channelAxis = 1; + if (begin[channelAxis] % 8 != 0 || strides[channelAxis] != 1 || + (end[channelAxis] - begin[channelAxis]) / strides[channelAxis] % 8 != 0) { + outputDesc->df = DF_NCHW; + } + } + return SUCCESS; +} + +EE tfslice_cpu( + TensorDesc inputDesc, void *input, TfSliceParamSpec p, TensorDesc outputDesc, void *output) +{ + int *begin = p.begin; + int *end = p.end; + int *strides = p.strides; + char *beginMask = p.begin_mask; + char *endMask = p.end_mask; + U32 dimSize = p.dim_size; + for (U32 i = 0; i < dimSize; i++) { + int axis = dimSize - 1 - i; + int axisBegin = (beginMask[i] == 1) ? 0 : begin[i]; + int axisEnd = (endMask[i] == 1) ? 
inputDesc.dims[axis] : end[i];
+        begin[i] = axisBegin;
+        end[i] = axisEnd;
+    }
+
+    U32 num = tensorNumElements(outputDesc);
+    U8 *dst = (U8 *)output;
+    U32 elementSize = bytesOf(inputDesc.dt);
+    int channelAxis = inputDesc.nDims - 2;
+    if (inputDesc.df == outputDesc.df) {
+        std::vector<U32> tmpInputDims(inputDesc.nDims), tmpOutputDims(outputDesc.nDims);
+        memcpy(tmpInputDims.data(), inputDesc.dims, inputDesc.nDims * sizeof(U32));
+        memcpy(tmpOutputDims.data(), outputDesc.dims, outputDesc.nDims * sizeof(U32));
+        int startAxis = 0;
+        int elementNum = 1;
+        if (inputDesc.df == DF_NCHWC8) {
+            elementNum *= 8;
+            begin[1] /= 8;
+            tmpInputDims[channelAxis] /= 8;
+            tmpOutputDims[channelAxis] /= 8;
+            tmpInputDims.insert(tmpInputDims.begin(), 8);
+            tmpOutputDims.insert(tmpOutputDims.begin(), 8);
+            startAxis = 1;
+        }
+        for (int i = dimSize - 1; i >= 0; i--) {
+            int reverseAxis = dimSize - 1 - i;
+            if (begin[i] == 0 && end[i] == (int)inputDesc.dims[reverseAxis] && strides[i] == 1) {
+                elementNum *= (end[i] - begin[i]);
+            } else {
+                break;
+            }
+        }
+        U32 tileSize = elementSize * elementNum;
+        for (U32 i = 0; i < num; i += elementNum, dst += tileSize) {
+            std::vector<U32> localIndex =
+                calculateLocalIndex(i, tmpOutputDims.data(), tmpOutputDims.size());
+            for (U32 j = 0; j < dimSize; j++) {
+                int reverseAxis = dimSize - 1 - j;
+                localIndex[startAxis + j] =
+                    localIndex[startAxis + j] * strides[reverseAxis] + begin[reverseAxis];
+            }
+            U32 srcIndex =
+                calculateGlobalIndex(localIndex.data(), tmpInputDims.data(), tmpInputDims.size());
+            U8 *src = (U8 *)input + srcIndex * elementSize;
+            memcpy(dst, src, tileSize);
+        }
+        if (inputDesc.df == DF_NCHWC8) {
+            begin[1] *= 8;
+        }
+    } else {
+        CHECK_REQUIREMENT(inputDesc.df == DF_NCHWC8);
+        U32 tmpNDims = inputDesc.nDims + 1;
+        std::vector<U32> tmpDims(tmpNDims);
+        tmpDims[0] = 8;
+        memcpy(&(tmpDims[1]), inputDesc.dims, inputDesc.nDims * sizeof(U32));
+        for (U32 i = 0; i < num; i++, dst += elementSize) {
+            std::vector<U32> localIndex = calculateLocalIndex(i, outputDesc.dims, outputDesc.nDims);
+            for (U32 j = 0; j < dimSize; j++) {
+                int reverseAxis = dimSize - 1 - j;
+                localIndex[j] = localIndex[j] * strides[reverseAxis] + begin[reverseAxis];
+            }
+            int c8 = localIndex[channelAxis] % 8;
+            localIndex[channelAxis] /= 8;
+            localIndex.insert(localIndex.begin(), c8);
+            U32 index = calculateGlobalIndex(localIndex.data(), tmpDims.data(), tmpNDims);
+            U8 *src = (U8 *)input + index * elementSize;
+            memcpy(dst, src, elementSize);
+        }
+    }
+    return SUCCESS;
+}
diff --git a/compute/tensor/src/cpu/transpose.cpp b/compute/tensor/src/cpu/transpose.cpp
new file mode 100644
index 00000000..38007d61
--- /dev/null
+++ b/compute/tensor/src/cpu/transpose.cpp
@@ -0,0 +1,24 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/tensor_computing_cpu.h" + +EE transpose_cpu( + TensorDesc inputDesc, const void *input, U32 *dim, TensorDesc outputDesc, void *output) +{ + if (nullptr == input || nullptr == output || nullptr == dim) { + CHECK_STATUS(NULL_POINTER); + } + return array_transpose( + inputDesc.dt, inputDesc.dims, input, outputDesc.dims, output, dim, inputDesc.nDims); +} diff --git a/compute/tensor/src/cpu/x86/attention_mask.cpp b/compute/tensor/src/cpu/x86/attention_mask.cpp new file mode 100644 index 00000000..52d0a85e --- /dev/null +++ b/compute/tensor/src/cpu/x86/attention_mask.cpp @@ -0,0 +1,40 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/x86/tensor_computing_x86.h" +#ifdef _USE_FP32 +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#endif + +EE attention_mask_x86(TensorDesc inputDesc, + const void *input, + AttentionMaskParamSpec p, + TensorDesc outputDesc, + void *output) +{ + DataType idt = inputDesc.dt; + EE ret = SUCCESS; + switch (idt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = attention_mask_fp32(inputDesc, (const F32 *)input, p, outputDesc, (F32 *)output); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + + return ret; +} diff --git a/compute/tensor/src/cpu/x86/check.cpp b/compute/tensor/src/cpu/x86/check.cpp new file mode 100644 index 00000000..0cf0c5ba --- /dev/null +++ b/compute/tensor/src/cpu/x86/check.cpp @@ -0,0 +1,105 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "cpu/x86/tensor_computing_x86.h"
+#include "x86_avx2_expand.h"
+#ifdef _USE_FP32
+#include "cpu/x86/fp32/tensor_computing_fp32.h"
+#endif
+
+static EE check_u32(TensorDesc inputDescA,
+    const U32 *inputA,
+    TensorDesc inputDescB,
+    const U32 *inputB,
+    CheckMode checkMode,
+    TensorDesc outputDesc,
+    I32 *output)
+{
+    if (nullptr == inputA || nullptr == inputB || nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+
+    if (tensorNumElements(inputDescA) != tensorNumElements(inputDescB)) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+
+    U32 size = tensorNumElements(inputDescA);
+    U32 loopOuter = inputDescA.dims[inputDescA.nDims - 1];
+    if (tensorNumElements(outputDesc) != loopOuter) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+    I32 length = size / loopOuter;
+    for (U32 j = 0; j < loopOuter; j++) {
+        const U32 *arrayA = inputA + j * length;
+        const U32 *arrayB = inputB + j * length;
+        switch (checkMode) {
+            case CHECK_EQUAL: {
+                __m256i count_v = _mm256_set1_epi32(0);
+                I32 i = 0;
+                for (; i < length - 7; i += 8) {
+                    __m256i a = _mm256_loadu_si256((const __m256i *)(arrayA + i));
+                    __m256i b = _mm256_loadu_si256((const __m256i *)(arrayB + i));
+                    // cmpeq sets a lane to -1 on equality; subtracting accumulates +1 per match
+                    count_v = _mm256_sub_epi32(count_v, _mm256_cmpeq_epi32(a, b));
+                }
+                I32 count = _mm256_hadd_u32(count_v);
+                for (; i < length; i++) {
+                    if (arrayA[i] == arrayB[i]) {
+                        count++;
+                    }
+                }
+                output[j] = (count == length);
+                break;
+            }
+            default:
+                CHECK_STATUS(NOT_SUPPORTED);
+                break;
+        }
+    }
+    return SUCCESS;
+}
+
+EE check_x86(TensorDesc inputDescA,
+    const void *inputA,
+    TensorDesc inputDescB,
+    const void *inputB,
+    CheckParamSpec p,
+    TensorDesc outputDesc,
+    void *output)
+{
+    DataType idt = inputDescA.dt;
+    EE ret = SUCCESS;
+    switch (idt) {
+#ifdef _USE_FP32
+        case DT_F32: {
+            ret = check_fp32(inputDescA, (const F32 *)inputA, inputDescB, (const F32 *)inputB,
+                p.check_mode, outputDesc, (I32 *)output);
+            break;
+        }
+#endif
+        case DT_U32: {
+            ret = check_u32(inputDescA, (const U32 *)inputA, inputDescB, (const U32 *)inputB,
+                p.check_mode, outputDesc, (I32 *)output);
+            break;
+        }
+        case DT_I32: {
+            ret = check_u32(inputDescA, (const U32 *)inputA, inputDescB, (const U32 *)inputB,
+                p.check_mode, outputDesc, (I32 *)output);
+            break;
+        }
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+
+    return ret;
+}
diff --git a/blas-enhance/src/cpu/arm/fp16/mvm.cpp b/compute/tensor/src/cpu/x86/clip.cpp
similarity index 64%
rename from blas-enhance/src/cpu/arm/fp16/mvm.cpp
rename to compute/tensor/src/cpu/x86/clip.cpp
index 6944482e..fae34a1b 100644
---
a/blas-enhance/src/cpu/arm/fp16/mvm.cpp +++ b/compute/tensor/src/cpu/x86/clip.cpp @@ -1,31 +1,32 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +#include "cpu/x86/tensor_computing_x86.h" +#ifdef _USE_FP32 +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#endif -#include "error.h" -#include "cpu/arm/fp16/blas_fp16.h" -#include "cpu/arm/fp16/mvm.h" - - -EE mvm_fp16(U32 row, U32 col, bool transpose, F16* matrix, F16* vector, F16* result, Arch arch) { +EE clip_x86(TensorDesc inputDesc, void *input, ClipParamSpec p, TensorDesc outputDesc, void *output) +{ + UNUSED(outputDesc); EE ret = SUCCESS; - switch (arch) { - case ARM_A55: - mvm_A55(row, col, transpose, matrix, vector, result); - break; - case ARM_A76: - mvm_A76(row, col, transpose, matrix, vector, result); + switch (inputDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = clip_fp32((F32 *)input, (F32 *)output, tensorNumElements(inputDesc), p.min, p.max); break; + } +#endif default: ret = NOT_SUPPORTED; break; diff --git a/compute/tensor/src/cpu/x86/convolution.cpp b/compute/tensor/src/cpu/x86/convolution.cpp new file mode 100644 index 00000000..4692257a --- /dev/null +++ b/compute/tensor/src/cpu/x86/convolution.cpp @@ -0,0 +1,228 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include "cpu/x86/tensor_computing_x86.h" +#ifdef _USE_FP32 +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#endif +#include "ut_util.h" + +EE convolution_infer_forward_algorithm_x86(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionPolicy policy, + ConvolutionForwardAlgorithm *algorithm, + DataType targetDataType) +{ + UNUSED(filterDesc); + UNUSED(outputDesc); + UNUSED(convParamSpec); + UNUSED(policy); + UNUSED(targetDataType); + if (nullptr == algorithm) { + CHECK_STATUS(NULL_POINTER); + } + if (*algorithm != CONVOLUTION_ALGORITHM_NULL) { + return SUCCESS; + } + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 group = convParamSpec.group; + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + if ((idf != DF_NCHWC8) || (ic / group % 8 != 0)) { + *algorithm = CONVOLUTION_ALGORITHM_GEMM_ICNCHW; + return SUCCESS; + } + + if ((strideH == 1) && (strideW == 1) && (fh == 1) && (fw == 1)) { + *algorithm = CONVOLUTION_ALGORITHM_POINTWISE; + return SUCCESS; + } + + *algorithm = CONVOLUTION_ALGORITHM_DIRECT; + return SUCCESS; +} + +EE convolution_transform_filter_bytes_x86(TensorDesc filterDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes) +{ + if (nullptr == bytes) { + CHECK_STATUS(NULL_POINTER); + } + EE ret = SUCCESS; + + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + U32 fnAlignSize = 8; + U32 fnGroupSize = fn / convParamSpec.group; + U32 fnPadding = (fnGroupSize / fnAlignSize + ((fnGroupSize % fnAlignSize) == 0 ? 0 : 1)) * + fnAlignSize * convParamSpec.group; + U32 fcPadding = (fc / fnAlignSize + ((fc % fnAlignSize) == 0 ? 
0 : 1)) * fnAlignSize; + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: + *bytes = fnPadding * fcPadding * fh * fw; + break; + case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: + *bytes = fnPadding * fc * fh * fw; + break; + case CONVOLUTION_ALGORITHM_POINTWISE: + *bytes = fnPadding * fcPadding; + break; + default: + return NOT_SUPPORTED; + } + *bytes *= bytesOf(fdt); + *bytes += 32; + return ret; +} + +EE convolution_transform_filter_x86(TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + void *filterTransformed) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = convolution_transform_filter_fp32(filterDesc, (F32 *)filter, convParamSpec, + algorithm, ftmDesc, (F32 *)filterTransformed); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE convolution_infer_forward_tmp_bytes_x86(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = convolution_infer_forward_tmp_bytes_fp32( + inputDesc, filterDesc, outputDesc, convParamSpec, algorithm, bytes); + break; + } +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + + return ret; +} + +EE convolution_x86(TensorDesc inputDesc, + void *input, + TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc scaleDesc, + const void *scale, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec activationDesc, + Arch arch) +{ + UNUSED(scaleDesc); + UNUSED(scale); + U32 group = convParamSpec.group; + U32 batchAxis = inputDesc.nDims - 1; + U32 dataChannelAxis = inputDesc.nDims - 2; + U32 filterChannelAxis = filterDesc.nDims - 1; + U32 biasChannelAxis = 0; + CHECK_REQUIREMENT(inputDesc.dims[batchAxis] == 1); + U32 icGroupSize = inputDesc.dims[dataChannelAxis] / group; + + void *inputTransform; + if (inputDesc.df == DF_NCHWC8 && icGroupSize % 8 != 0) { + TensorDesc tmpInputDesc = inputDesc; + tmpInputDesc.df = DF_NCHW; + transformToNCHW(inputDesc, input, tmpInputDesc, tmp); + inputTransform = tmp; + tmp = (U8 *)tmp + tensorNumBytes(tmpInputDesc); + tmpBytes -= tensorNumBytes(tmpInputDesc); + inputDesc.df = DF_NCHW; + } else { + inputTransform = input; + } + + TensorDesc tmpInputDesc = inputDesc; + tmpInputDesc.dims[dataChannelAxis] /= group; + TensorDesc tmpOutputDesc = outputDesc; + tmpOutputDesc.dims[dataChannelAxis] /= group; + TensorDesc tmpFilterDesc = filterDesc; + tmpFilterDesc.dims[filterChannelAxis] /= group; + TensorDesc tmpBiasDesc = biasDesc; + tmpBiasDesc.dims[biasChannelAxis] /= group; + + TensorDesc paddingFilterDesc = tmpFilterDesc; + paddingFilterDesc.dims[filterChannelAxis] = (tmpFilterDesc.dims[filterChannelAxis] + 7) / 8 * 8; + + EE ret = SUCCESS; + for (U32 g = 0; g < group; g++) { + void *tmpInput = (U8 *)inputTransform + g * tensorNumBytes(tmpInputDesc); + const void *tmpFilter = (U8 *)filter + g * tensorNumBytes(paddingFilterDesc); + const void *tmpBias = (U8 *)bias + g * tensorNumBytes(tmpBiasDesc); + void *tmpOutput = (U8 *)output + g * tensorNumBytes(tmpOutputDesc); + switch (filterDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = convolution_fp32(tmpInputDesc, (F32 *)tmpInput, 
tmpFilterDesc, + (F32 *)tmpFilter, convParamSpec, algorithm, tmpBiasDesc, (F32 *)tmpBias, + tmpBytes, tmp, tmpOutputDesc, (F32 *)tmpOutput, activationDesc, arch); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + } + return ret; +} diff --git a/compute/tensor/src/cpu/x86/deconvolution.cpp b/compute/tensor/src/cpu/x86/deconvolution.cpp new file mode 100644 index 00000000..b144224b --- /dev/null +++ b/compute/tensor/src/cpu/x86/deconvolution.cpp @@ -0,0 +1,42 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/x86/tensor_computing_x86.h" +#ifdef _USE_FP32 +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#endif +#ifdef _USE_FP16 +#include "cpu/x86/fp16/tensor_computing_fp16.h" +#endif + +EE deconvolution_transform_filter_x86(TensorDesc filterDesc, + const void *filter, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + void *filterTransformed) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = deconvolution_transform_filter_fp32( + filterDesc, (F32 *)filter, algorithm, ftmDesc, (F32 *)filterTransformed); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/x86/depthwise_convolution.cpp b/compute/tensor/src/cpu/x86/depthwise_convolution.cpp new file mode 100644 index 00000000..2423173a --- /dev/null +++ b/compute/tensor/src/cpu/x86/depthwise_convolution.cpp @@ -0,0 +1,102 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/x86/tensor_computing_x86.h" +#ifdef _USE_FP32 +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#endif + +EE depthwise_convolution_transform_filter_x86(TensorDesc filterDesc, + const void *filter, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + void *filterTransformed) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = depthwise_convolution_transform_filter_fp32( + filterDesc, (F32 *)filter, algorithm, ftmDesc, (F32 *)filterTransformed); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE depthwise_convolution_infer_forward_tmp_bytes_x86(TensorDesc inputDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + U32 *bytes) +{ + if (nullptr == bytes) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, odt; + DataFormat idf, odf; + U32 in, ic, ih, iw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + EE ret = SUCCESS; + switch (algorithm) { + case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: + *bytes = ic * ih_pad * iw_pad; + break; + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: + *bytes = ic * ih_pad * iw_pad + ic * oh * ow; + break; + default: { + ret = NOT_MATCH; + *bytes = 0; + break; + } + } + *bytes *= bytesOf(idt); + *bytes += 32; + return ret; +} + +EE depthwise_convolution_x86(TensorDesc inputDesc, + void *input, + TensorDesc filterDesc, + const void *filter, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc biasDesc, + const void *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec depthwiseActivationParamSpec, + Arch arch) +{ + TensorDesc blankTensorDesc; + ActivationParamSpec blankActivationParamSpec; + return depthwise_pointwise_convolution_x86(inputDesc, input, filterDesc, filter, blankTensorDesc, + nullptr, convParamSpec, algorithm, blankTensorDesc, bias, biasDesc, nullptr, tmpBytes, tmp, + outputDesc, output, depthwiseActivationParamSpec, blankActivationParamSpec, arch); +} diff --git a/compute/tensor/src/cpu/x86/depthwise_pointwise_convolution.cpp b/compute/tensor/src/cpu/x86/depthwise_pointwise_convolution.cpp new file mode 100644 index 00000000..02cada77 --- /dev/null +++ b/compute/tensor/src/cpu/x86/depthwise_pointwise_convolution.cpp @@ -0,0 +1,83 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/x86/tensor_computing_x86.h" +#ifdef _USE_FP32 +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#endif + +EE depthwise_pointwise_convolution_transform_filter_x86(TensorDesc dwFilterDesc, + const void *dwFilter, + TensorDesc pwFilterDesc, + const void *pwFilter, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc *dwFtmDesc, + void *dwFilterTransformed, + TensorDesc *pwFtmDesc, + void *pwFilterTransformed) +{ + EE ret = SUCCESS; + switch (dwFilterDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = depthwise_pointwise_convolution_transform_filter_fp32(dwFilterDesc, + (F32 *)dwFilter, pwFilterDesc, (F32 *)pwFilter, algorithm, dwFtmDesc, + (F32 *)dwFilterTransformed, pwFtmDesc, (F32 *)pwFilterTransformed); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE depthwise_pointwise_convolution_x86(TensorDesc inputDesc, + void *input, + TensorDesc dwFilterDesc, + const void *dwFilter, + TensorDesc pwFilterDesc, + const void *pwFilter, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc dwBiasDesc, + const void *dwBias, + TensorDesc pwBiasDesc, + const void *pwBias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch) +{ + EE ret = SUCCESS; + switch (dwFilterDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = depthwise_pointwise_convolution_fp32(inputDesc, (F32 *)input, dwFilterDesc, + (const F32 *)dwFilter, pwFilterDesc, (const F32 *)pwFilter, convParamSpec, + algorithm, dwBiasDesc, (const F32 *)dwBias, pwBiasDesc, (const F32 *)pwBias, + tmpBytes, tmp, outputDesc, (F32 *)output, depthwiseActivationParamSpec, + pointwiseActivationParamSpec, arch); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/x86/eltwise.cpp b/compute/tensor/src/cpu/x86/eltwise.cpp new file mode 100644 index 00000000..8ead9916 --- /dev/null +++ b/compute/tensor/src/cpu/x86/eltwise.cpp @@ -0,0 +1,41 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <vector>
+#include "cpu/x86/tensor_computing_x86.h"
+#ifdef _USE_FP32
+#include "cpu/x86/fp32/tensor_computing_fp32.h"
+#endif
+
+EE eltwise_x86(DataType dataType,
+    std::vector<void *> input,
+    std::vector<U32> inputSize,
+    U32 num,
+    U32 len,
+    void *output,
+    EltwiseMode eltwiseMode)
+{
+    EE ret = SUCCESS;
+    switch (dataType) {
+#ifdef _USE_FP32
+        case DT_F32: {
+            ret = eltwise_fp32(input, inputSize, num, len, output, eltwiseMode);
+            break;
+        }
+#endif
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/cpu/x86/fp32/attention_mask.cpp b/compute/tensor/src/cpu/x86/fp32/attention_mask.cpp
new file mode 100644
index 00000000..9d683bca
--- /dev/null
+++ b/compute/tensor/src/cpu/x86/fp32/attention_mask.cpp
@@ -0,0 +1,82 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
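+// Builds a 0/1 mask over the qlen x klen attention scores, then computes
+// output = input * (1 - mask) - mask_value * mask with an AVX2 loop and a
+// scalar tail.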
+
+#include <string.h>
+#include "cpu/x86/fp32/tensor_computing_fp32.h"
+
+EE attention_mask_fp32(TensorDesc inputDesc,
+    const F32 *input,
+    AttentionMaskParamSpec p,
+    TensorDesc outputDesc,
+    F32 *output)
+{
+    UNUSED(outputDesc);
+    if (nullptr == input || nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    I32 attentionLength = p.attention_length;
+    bool sameLength = p.same_length;
+    float maskValue = p.mask;
+    int qlen = inputDesc.dims[1];
+    int klen = inputDesc.dims[0];
+    int mlen = klen - qlen;
+    I32 length = qlen * klen;
+    std::vector<F32> mask;
+    if (attentionLength < 0) {
+        mask = std::vector<F32>(length, 0);
+    } else {
+        mask = std::vector<F32>(length, 1);
+        for (int i = 0; i < qlen; i++) {
+            int start, loops;
+            if (attentionLength > 0) {
+                int end = mlen + i;
+                start = UNI_MAX(end - attentionLength, 0);
+                loops = end - start + 1;
+            } else {
+                if (sameLength) {
+                    start = i;
+                    loops = qlen + 1;
+                } else {
+                    start = 0;
+                    loops = i + qlen + 1;
+                }
+            }
+            loops = UNI_MAX(loops, 0);
+            start = UNI_MIN(start, klen);
+            if (start + loops > klen) {
+                loops = UNI_MAX(klen - start, 0);
+            }
+            memset(&mask[i * klen + start], 0, sizeof(F32) * loops);
+        }
+    }
+    I32 loops = tensorNumElements(inputDesc) / length;
+    __m256 one_v = _mm256_set1_ps(1.0f);
+    __m256 mask_value_v = _mm256_set1_ps(maskValue);
+    for (int i = 0, index = 0; i < loops; i++) {
+        int j = 0;
+        for (; j < length - 7; j += 8) {
+            __m256 in = _mm256_loadu_ps(input + index);
+            __m256 mask_v = _mm256_loadu_ps(&mask[j]);
+            __m256 tmp_v = _mm256_sub_ps(one_v, mask_v);
+            mask_v = _mm256_mul_ps(mask_value_v, mask_v);
+            tmp_v = _mm256_fmsub_ps(in, tmp_v, mask_v);
+            _mm256_storeu_ps(output + index, tmp_v);
+            index += 8;
+        }
+        for (; j < length; j++) {
+            output[index] = input[index] * (1 - mask[j]) - maskValue * mask[j];
+            index++;
+        }
+    }
+    return SUCCESS;
+}
diff --git a/compute/tensor/src/cpu/x86/fp32/check.cpp b/compute/tensor/src/cpu/x86/fp32/check.cpp
new file mode 100644
index 00000000..9140fe00
--- /dev/null
+++ b/compute/tensor/src/cpu/x86/fp32/check.cpp
@@ -0,0 +1,103 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
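+// Row-wise tensor comparison: output[j] is 1 only when every element of row j
+// satisfies the predicate (>, >=, ==); AVX2 main loop plus scalar tail.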
+ +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#include "x86_avx2_expand.h" + +EE check_fp32(TensorDesc inputDescA, + const F32 *inputA, + TensorDesc inputDescB, + const F32 *inputB, + CheckMode checkMode, + TensorDesc outputDesc, + I32 *output) +{ + if (nullptr == inputA || nullptr == inputB || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + if (tensorNumElements(inputDescA) != tensorNumElements(inputDescB)) { + CHECK_STATUS(NOT_MATCH); + } + + U32 size = tensorNumElements(inputDescA); + U32 loopOuter = inputDescA.dims[inputDescA.nDims - 1]; + I32 length = size / loopOuter; + if (tensorNumElements(outputDesc) != loopOuter) { + CHECK_STATUS(NOT_MATCH); + } + for (U32 j = 0; j < loopOuter; j++) { + const F32 *arrayA = inputA + j * length; + const F32 *arrayB = inputB + j * length; + switch (checkMode) { + case CHECK_GREAT: { + __m256i count_v = _mm256_set1_epi32(0); + I32 i = 0; + for (; i < length - 7; i += 8) { + __m256 a = _mm256_loadu_ps(arrayA + i); + __m256 b = _mm256_loadu_ps(arrayA + i); + count_v = _mm256_add_epi32( + count_v, _mm256_cvtps_epi32(_mm256_cmp_ps(a, b, _CMP_GT_OS))); + } + I32 count = _mm256_hadd_u32(count_v); + for (; i < length; i++) { + if (arrayA[i] > arrayB[i]) { + count++; + } + } + output[j] = (count == length); + break; + } + case CHECK_GREATEQUAL: { + __m256i count_v = _mm256_set1_epi32(0); + I32 i = 0; + for (; i < length - 7; i += 8) { + __m256 a = _mm256_loadu_ps(arrayA + i); + __m256 b = _mm256_loadu_ps(arrayA + i); + count_v = _mm256_add_epi32( + count_v, _mm256_cvtps_epi32(_mm256_cmp_ps(a, b, _CMP_GE_OS))); + } + I32 count = _mm256_hadd_u32(count_v); + for (; i < length; i++) { + if (arrayA[i] >= arrayB[i]) { + count++; + } + } + output[j] = (count == length); + break; + } + case CHECK_EQUAL: { + __m256i count_v = _mm256_set1_epi32(0); + I32 i = 0; + for (; i < length - 7; i += 8) { + __m256 a = _mm256_loadu_ps(arrayA + i); + __m256 b = _mm256_loadu_ps(arrayA + i); + count_v = _mm256_add_epi32( + count_v, _mm256_cvtps_epi32(_mm256_cmp_ps(a, b, _CMP_EQ_OS))); + } + I32 count = _mm256_hadd_u32(count_v); + for (; i < length; i++) { + if (arrayA[i] == arrayB[i]) { + count++; + } + } + output[j] = (count == length); + break; + } + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/clip.cpp b/compute/tensor/src/cpu/x86/fp32/clip.cpp new file mode 100644 index 00000000..cfa53653 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/clip.cpp @@ -0,0 +1,38 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/x86/fp32/tensor_computing_fp32.h" + +EE clip_fp32(F32 *input, F32 *output, I32 len, F32 minValue, F32 maxValue) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + __m256 min_v = _mm256_set1_ps(minValue); + __m256 max_v = _mm256_set1_ps(maxValue); + + I32 i = 0; + for (i = 0; i < len - 7; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 tmp_v = _mm256_min_ps(max_v, _mm256_max_ps(min_v, in)); + _mm256_storeu_ps(output + i, tmp_v); + } + for (; i < len; i++) { + F32 value = input[i]; + value = (value > minValue) ? value : minValue; + value = (value < maxValue) ? value : maxValue; + output[i] = value; + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/convolution.cpp b/compute/tensor/src/cpu/x86/fp32/convolution.cpp new file mode 100644 index 00000000..f9fb09aa --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/convolution.cpp @@ -0,0 +1,134 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "sys.h" +#include "error.h" +#include "types.h" + +#include "cpu/x86/fp32/tensor_computing_fp32.h" + +EE convolution_infer_forward_tmp_bytes_fp32(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes) +{ + if (nullptr == bytes) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + + U32 icAlignSize = 8; + U32 icPadding = (ic + icAlignSize - 1) / icAlignSize * icAlignSize; + + EE ret = SUCCESS; + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: + *bytes = icPadding * ih_pad * iw_pad; + break; + case CONVOLUTION_ALGORITHM_POINTWISE: + *bytes = oc; + break; + case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: + *bytes = 0; + break; + default: + ret = NOT_MATCH; + break; + } + + // pre data processing space for not complete NCHWC8 group convolution input + U32 icGroupSize = ic / convParamSpec.group; + if (idf == DF_NCHWC8 && icGroupSize % 8 != 0) { + *bytes += tensorNumBytes(inputDesc); + } + + *bytes *= bytesOf(idt); + *bytes += 32; + return ret; +} + +EE convolution_fp32(TensorDesc inputDesc, + F32 *input, + TensorDesc filterDesc, + const F32 *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc biasDesc, + const F32 *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *output, + ActivationParamSpec activationDesc, + Arch arch) +{ + UNUSED(arch); + if (nullptr == input || nullptr == filter || nullptr == output || nullptr == bias || + nullptr == tmp) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (!(idt == DT_F32 && fdt == DT_F32 && odt == DT_F32)) { + CHECK_STATUS(NOT_MATCH); + } + if (!(odf == DF_NCHWC8)) { + CHECK_STATUS(NOT_MATCH); + } + if (!(ic == fc && oc == fn)) { + CHECK_STATUS(NOT_MATCH); + } + + EE ret = SUCCESS; + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: + ret = convolution_direct(inputDesc, input, filterDesc, filter, convParamSpec, biasDesc, + bias, tmpBytes, tmp, outputDesc, output, activationDesc); + break; + case CONVOLUTION_ALGORITHM_POINTWISE: + ret = convolution_1x1_direct(inputDesc, input, filterDesc, filter, convParamSpec, + bias, tmpBytes, tmp, outputDesc, output, activationDesc); + break; + case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: + ret = convolution_direct_nchw(inputDesc, input, filterDesc, filter, convParamSpec, + biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/x86/fp32/convolution_1x1_direct.cpp 
b/compute/tensor/src/cpu/x86/fp32/convolution_1x1_direct.cpp new file mode 100644 index 00000000..3c3173de --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/convolution_1x1_direct.cpp @@ -0,0 +1,1749 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" + +#include "cpu/x86/fp32/tensor_computing_fp32.h" + +#define UNROLL_HW 4 +#define SIMDW 8 +#define UNROLL_OC 24 +#define UNROLL_IC_BLOCK_DIM 8 +#define BLOCK_IC_DIM 128 +#define BLOCK_OC_DIM 96 +#define BLOCK_HW_DIM 128 +#define align_addr(addr, unit) (((uintptr_t)addr + unit - 1) / unit * unit) + +typedef void (*kernel_func)( + F32 *curI, const F32 *curW, F32 *curO, const F32 *curB, U32 oStep, U32 store, U32 ic, U32 fStep); + +inline void avx2_pointwise_kernel_3x32( + F32 *curI, const F32 *curW, F32 *curO, const F32 *curB, U32 oStep, U32 store, U32 ic, U32 fStep) +{ + __asm__ __volatile__("shr $3, %%ecx \n\t" + "mov %5, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups (%3), %%ymm1 \n\t" + "vmovups (%3), %%ymm2 \n\t" + "vmovups 0x20(%3), %%ymm3 \n\t" + "vmovups 0x20(%3), %%ymm4 \n\t" + "vmovups 0x20(%3), %%ymm5 \n\t" + "vmovups 0x40(%3), %%ymm6 \n\t" + "vmovups 0x40(%3), %%ymm7 \n\t" + "vmovups 0x40(%3), %%ymm8 \n\t" + "vmovups 0x60(%3), %%ymm9 \n\t" + "vmovups 0x60(%3), %%ymm10 \n\t" + "vmovups 0x60(%3), %%ymm11 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%rax \n\t" + "vmovups (%%rax), %%ymm0 \n\t" + "vmovups 0x20(%%rax), %%ymm1 \n\t" + "vmovups 0x40(%%rax), %%ymm2 \n\t" + "add %4, %%rax \n\t" + "vmovups (%%rax), %%ymm3 \n\t" + "vmovups 0x20(%%rax), %%ymm4 \n\t" + "vmovups 0x40(%%rax), %%ymm5 \n\t" + "add %4, %%rax \n\t" + "vmovups (%%rax), %%ymm6 \n\t" + "vmovups 0x20(%%rax), %%ymm7 \n\t" + "vmovups 0x40(%%rax), %%ymm8 \n\t" + "add %4, %%rax \n\t" + "vmovups (%%rax), %%ymm9 \n\t" + "vmovups 0x20(%%rax), %%ymm10 \n\t" + "vmovups 0x40(%%rax), %%ymm11 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vbroadcastss (%1), %%ymm12 \n\t" + "vbroadcastss 0x20(%1), %%ymm13 \n\t" + "vbroadcastss 0x40(%1), %%ymm14 \n\t" + "vmovaps (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovaps 0x20(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovaps 
0x40(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovaps 0x60(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x4(%1), %%ymm12 \n\t" + "vbroadcastss 0x24(%1), %%ymm13 \n\t" + "vbroadcastss 0x44(%1), %%ymm14 \n\t" + "vmovaps 0x80(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovaps 0xA0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovaps 0xC0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovaps 0xE0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x8(%1), %%ymm12 \n\t" + "vbroadcastss 0x28(%1), %%ymm13 \n\t" + "vbroadcastss 0x48(%1), %%ymm14 \n\t" + "vmovaps 0x100(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovaps 0x120(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovaps 0x140(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovaps 0x160(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0xC(%1), %%ymm12 \n\t" + "vbroadcastss 0x2C(%1), %%ymm13 \n\t" + "vbroadcastss 0x4C(%1), %%ymm14 \n\t" + "vmovaps 0x180(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovaps 0x1A0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovaps 0x1C0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovaps 0x1E0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x10(%1), %%ymm12 \n\t" + "vbroadcastss 0x30(%1), %%ymm13 \n\t" + "vbroadcastss 0x50(%1), %%ymm14 \n\t" + "vmovaps 0x200(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovaps 0x220(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovaps 0x240(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovaps 0x260(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, 
%%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x14(%1), %%ymm12 \n\t" + "vbroadcastss 0x34(%1), %%ymm13 \n\t" + "vbroadcastss 0x54(%1), %%ymm14 \n\t" + "vmovaps 0x280(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovaps 0x2A0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovaps 0x2C0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovaps 0x2E0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x18(%1), %%ymm12 \n\t" + "vbroadcastss 0x38(%1), %%ymm13 \n\t" + "vbroadcastss 0x58(%1), %%ymm14 \n\t" + "vmovaps 0x300(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovaps 0x320(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovaps 0x340(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovaps 0x360(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x1C(%1), %%ymm12 \n\t" + "vbroadcastss 0x3C(%1), %%ymm13 \n\t" + "vbroadcastss 0x5C(%1), %%ymm14 \n\t" + "vmovaps 0x380(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovaps 0x3A0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovaps 0x3C0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovaps 0x3E0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add $0x400, %0 \n\t" + "add %7, %1 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %5 \n\t" + "je 2f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm5, %%ymm5 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm7, %%ymm7 \n\t" + "vmaxps %%ymm15, %%ymm8, %%ymm8 \n\t" + "vmaxps %%ymm15, %%ymm9, %%ymm9 \n\t" + "vmaxps %%ymm15, %%ymm10, %%ymm10 \n\t" + "vmaxps %%ymm15, %%ymm11, %%ymm11 \n\t" + + // relu6 + "and $0x4, %5 \n\t" + "je 2f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, 
%%ymm5, %%ymm5 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm7, %%ymm7 \n\t" + "vminps %%ymm12, %%ymm8, %%ymm8 \n\t" + "vminps %%ymm12, %%ymm9, %%ymm9 \n\t" + "vminps %%ymm12, %%ymm10, %%ymm10 \n\t" + "vminps %%ymm12, %%ymm11, %%ymm11 \n\t" + + "2: \n\t" + "vmovups %%ymm0, (%2) \n\t" + "vmovups %%ymm1, 0x20(%2) \n\t" + "vmovups %%ymm2, 0x40(%2) \n\t" + "add %4, %2 \n\t" + "vmovups %%ymm3, (%2) \n\t" + "vmovups %%ymm4, 0x20(%2) \n\t" + "vmovups %%ymm5, 0x40(%2) \n\t" + "add %4, %2 \n\t" + "vmovups %%ymm6, (%2) \n\t" + "vmovups %%ymm7, 0x20(%2) \n\t" + "vmovups %%ymm8, 0x40(%2) \n\t" + "add %4, %2 \n\t" + "vmovups %%ymm9, (%2) \n\t" + "vmovups %%ymm10, 0x20(%2) \n\t" + "vmovups %%ymm11, 0x40(%2) \n\t" + : + : "r"(curW), "r"(curI), "r"(curO), "r"(curB), "r"(I64(oStep)), "r"(store), + "c"(ic), "r"(I64(fStep)) + : "%eax", "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); +} + +inline void avx2_pointwise_kernel_4x24( + F32 *curI, const F32 *curW, F32 *curO, const F32 *curB, U32 oStep, U32 store, U32 ic, U32 fStep) +{ + __asm__ __volatile__("shr $3, %%ecx \n\t" + "mov %5, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups (%3), %%ymm1 \n\t" + "vmovups (%3), %%ymm2 \n\t" + "vmovups (%3), %%ymm3 \n\t" + "vmovups 0x20(%3), %%ymm4 \n\t" + "vmovups 0x20(%3), %%ymm5 \n\t" + "vmovups 0x20(%3), %%ymm6 \n\t" + "vmovups 0x20(%3), %%ymm7 \n\t" + "vmovups 0x40(%3), %%ymm8 \n\t" + "vmovups 0x40(%3), %%ymm9 \n\t" + "vmovups 0x40(%3), %%ymm10 \n\t" + "vmovups 0x40(%3), %%ymm11 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%2), %%ymm0 \n\t" + "vmovups 0x20(%2), %%ymm1 \n\t" + "vmovups 0x40(%2), %%ymm2 \n\t" + "vmovups 0x60(%2), %%ymm3 \n\t" + "vmovups (%2, %4), %%ymm4 \n\t" + "vmovups 0x20(%2, %4), %%ymm5 \n\t" + "vmovups 0x40(%2, %4), %%ymm6 \n\t" + "vmovups 0x60(%2, %4), %%ymm7 \n\t" + "vmovups (%2, %4, 2), %%ymm8 \n\t" + "vmovups 0x20(%2, %4, 2), %%ymm9 \n\t" + "vmovups 0x40(%2, %4, 2), %%ymm10 \n\t" + "vmovups 0x60(%2, %4, 2), %%ymm11 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vmovaps (%0), %%ymm12 \n\t" + "vmovaps 0x20(%0), %%ymm13 \n\t" + "vmovaps 0x40(%0), %%ymm14 \n\t" + "vbroadcastss (%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss 0x20(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss 0x40(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss 0x60(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vmovaps 0x60(%0), %%ymm12 \n\t" + "vmovaps 0x80(%0), %%ymm13 \n\t" + "vmovaps 0xA0(%0), %%ymm14 \n\t" + "vbroadcastss 0x4(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss 0x24(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss 0x44(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, 
%%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss 0x64(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vmovaps 0xC0(%0), %%ymm12 \n\t" + "vmovaps 0xE0(%0), %%ymm13 \n\t" + "vmovaps 0x100(%0), %%ymm14 \n\t" + "vbroadcastss 0x8(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss 0x28(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss 0x48(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss 0x68(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vmovaps 0x120(%0), %%ymm12 \n\t" + "vmovaps 0x140(%0), %%ymm13 \n\t" + "vmovaps 0x160(%0), %%ymm14 \n\t" + "vbroadcastss 0xC(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss 0x2C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss 0x4C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss 0x6C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vmovaps 0x180(%0), %%ymm12 \n\t" + "vmovaps 0x1A0(%0), %%ymm13 \n\t" + "vmovaps 0x1C0(%0), %%ymm14 \n\t" + "vbroadcastss 0x10(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss 0x30(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss 0x50(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss 0x70(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vmovaps 0x1E0(%0), %%ymm12 \n\t" + "vmovaps 0x200(%0), %%ymm13 \n\t" + "vmovaps 0x220(%0), %%ymm14 \n\t" + "vbroadcastss 0x14(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss 0x34(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss 0x54(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss 0x74(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, 
%%ymm14, %%ymm11 \n\t" + + "vmovaps 0x240(%0), %%ymm12 \n\t" + "vmovaps 0x260(%0), %%ymm13 \n\t" + "vmovaps 0x280(%0), %%ymm14 \n\t" + "vbroadcastss 0x18(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss 0x38(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss 0x58(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss 0x78(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vmovaps 0x2A0(%0), %%ymm12 \n\t" + "vmovaps 0x2C0(%0), %%ymm13 \n\t" + "vmovaps 0x2E0(%0), %%ymm14 \n\t" + "vbroadcastss 0x1C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss 0x3C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss 0x5C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss 0x7C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add $0x300, %0 \n\t" + "add %7, %1 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %5 \n\t" + "je 2f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm5, %%ymm5 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm7, %%ymm7 \n\t" + "vmaxps %%ymm15, %%ymm8, %%ymm8 \n\t" + "vmaxps %%ymm15, %%ymm9, %%ymm9 \n\t" + "vmaxps %%ymm15, %%ymm10, %%ymm10 \n\t" + "vmaxps %%ymm15, %%ymm11, %%ymm11 \n\t" + + // relu6 + "and $0x4, %5 \n\t" + "je 2f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm5, %%ymm5 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm7, %%ymm7 \n\t" + "vminps %%ymm12, %%ymm8, %%ymm8 \n\t" + "vminps %%ymm12, %%ymm9, %%ymm9 \n\t" + "vminps %%ymm12, %%ymm10, %%ymm10 \n\t" + "vminps %%ymm12, %%ymm11, %%ymm11 \n\t" + + "2: \n\t" + "vmovups %%ymm0, (%2) \n\t" + "vmovups %%ymm1, 0x20(%2) \n\t" + "vmovups %%ymm2, 0x40(%2) \n\t" + "vmovups %%ymm3, 0x60(%2) \n\t" + "vmovups %%ymm4, (%2, %4) \n\t" + "vmovups %%ymm5, 0x20(%2, %4) \n\t" + "vmovups %%ymm6, 0x40(%2, %4) \n\t" + "vmovups %%ymm7, 0x60(%2, %4) \n\t" + "vmovups %%ymm8, (%2, %4, 2) \n\t" + "vmovups %%ymm9, 0x20(%2, %4, 2) \n\t" + "vmovups %%ymm10, 0x40(%2, %4, 2) \n\t" + "vmovups %%ymm11, 0x60(%2, %4, 2) \n\t" + : + : "r"(curW), "r"(curI), "r"(curO), "r"(curB), "r"(I64(oStep)), "r"(store), + "c"(ic), "r"(I64(fStep)) + : "%eax", "%ymm0", 
"%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", + "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); +} + +inline void avx2_pointwise_kernel_6x16( + F32 *curI, const F32 *curW, F32 *curO, const F32 *curB, U32 oStep, U32 store, U32 ic, U32 fStep) +{ + __asm__ __volatile__("shr $3, %%ecx \n\t" + "mov %5, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups (%3), %%ymm1 \n\t" + "vmovups (%3), %%ymm2 \n\t" + "vmovups (%3), %%ymm3 \n\t" + "vmovups (%3), %%ymm4 \n\t" + "vmovups (%3), %%ymm5 \n\t" + "vmovups 0x20(%3), %%ymm6 \n\t" + "vmovups 0x20(%3), %%ymm7 \n\t" + "vmovups 0x20(%3), %%ymm8 \n\t" + "vmovups 0x20(%3), %%ymm9 \n\t" + "vmovups 0x20(%3), %%ymm10 \n\t" + "vmovups 0x20(%3), %%ymm11 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%2), %%ymm0 \n\t" + "vmovups 0x20(%2), %%ymm1 \n\t" + "vmovups 0x40(%2), %%ymm2 \n\t" + "vmovups 0x60(%2), %%ymm3 \n\t" + "vmovups 0x80(%2), %%ymm4 \n\t" + "vmovups 0xA0(%2), %%ymm5 \n\t" + "vmovups (%2, %4), %%ymm6 \n\t" + "vmovups 0x20(%2, %4), %%ymm7 \n\t" + "vmovups 0x40(%2, %4), %%ymm8 \n\t" + "vmovups 0x60(%2, %4), %%ymm9 \n\t" + "vmovups 0x80(%2, %4), %%ymm10 \n\t" + "vmovups 0xA0(%2, %4), %%ymm11 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vmovaps (%0), %%ymm12 \n\t" + "vmovaps 0x20(%0), %%ymm13 \n\t" + "vbroadcastss (%1), %%ymm15 \n\t" + "vbroadcastss 0x20(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm7 \n\t" + "vbroadcastss 0x40(%1), %%ymm15 \n\t" + "vbroadcastss 0x60(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm8 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm9 \n\t" + "vbroadcastss 0x80(%1), %%ymm15 \n\t" + "vbroadcastss 0xA0(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm5 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm11 \n\t" + + "vmovaps 0x40(%0), %%ymm12 \n\t" + "vmovaps 0x60(%0), %%ymm13 \n\t" + "vbroadcastss 0x4(%1), %%ymm15 \n\t" + "vbroadcastss 0x24(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm7 \n\t" + "vbroadcastss 0x44(%1), %%ymm15 \n\t" + "vbroadcastss 0x64(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm8 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm9 \n\t" + "vbroadcastss 0x84(%1), %%ymm15 \n\t" + "vbroadcastss 0xA4(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm5 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm11 \n\t" + + "vmovaps 0x80(%0), %%ymm12 \n\t" + "vmovaps 0xA0(%0), %%ymm13 \n\t" + "vbroadcastss 0x8(%1), %%ymm15 \n\t" + "vbroadcastss 0x28(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm7 \n\t" + "vbroadcastss 0x48(%1), %%ymm15 \n\t" + "vbroadcastss 0x68(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm8 \n\t" + 
"vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm9 \n\t" + "vbroadcastss 0x88(%1), %%ymm15 \n\t" + "vbroadcastss 0xA8(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm5 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm11 \n\t" + + "vmovaps 0xC0(%0), %%ymm12 \n\t" + "vmovaps 0xE0(%0), %%ymm13 \n\t" + "vbroadcastss 0xC(%1), %%ymm15 \n\t" + "vbroadcastss 0x2C(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm7 \n\t" + "vbroadcastss 0x4C(%1), %%ymm15 \n\t" + "vbroadcastss 0x6C(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm8 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm9 \n\t" + "vbroadcastss 0x8C(%1), %%ymm15 \n\t" + "vbroadcastss 0xAC(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm5 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm11 \n\t" + + "vmovaps 0x100(%0), %%ymm12 \n\t" + "vmovaps 0x120(%0), %%ymm13 \n\t" + "vbroadcastss 0x10(%1), %%ymm15 \n\t" + "vbroadcastss 0x30(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm7 \n\t" + "vbroadcastss 0x50(%1), %%ymm15 \n\t" + "vbroadcastss 0x70(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm8 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm9 \n\t" + "vbroadcastss 0x90(%1), %%ymm15 \n\t" + "vbroadcastss 0xB0(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm5 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm11 \n\t" + + "vmovaps 0x140(%0), %%ymm12 \n\t" + "vmovaps 0x160(%0), %%ymm13 \n\t" + "vbroadcastss 0x14(%1), %%ymm15 \n\t" + "vbroadcastss 0x34(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm7 \n\t" + "vbroadcastss 0x54(%1), %%ymm15 \n\t" + "vbroadcastss 0x74(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm8 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm9 \n\t" + "vbroadcastss 0x94(%1), %%ymm15 \n\t" + "vbroadcastss 0xB4(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm5 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm11 \n\t" + + "vmovaps 0x180(%0), %%ymm12 \n\t" + "vmovaps 0x1A0(%0), %%ymm13 \n\t" + "vbroadcastss 0x18(%1), %%ymm15 \n\t" + "vbroadcastss 0x38(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm7 \n\t" + "vbroadcastss 0x58(%1), %%ymm15 \n\t" + "vbroadcastss 0x78(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm8 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm9 
\n\t" + "vbroadcastss 0x98(%1), %%ymm15 \n\t" + "vbroadcastss 0xB8(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm5 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm11 \n\t" + + "vmovaps 0x1C0(%0), %%ymm12 \n\t" + "vmovaps 0x1E0(%0), %%ymm13 \n\t" + "vbroadcastss 0x1C(%1), %%ymm15 \n\t" + "vbroadcastss 0x3C(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm7 \n\t" + "vbroadcastss 0x5C(%1), %%ymm15 \n\t" + "vbroadcastss 0x7C(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm8 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm9 \n\t" + "vbroadcastss 0x9C(%1), %%ymm15 \n\t" + "vbroadcastss 0xBC(%1), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm5 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm11 \n\t" + + "add $0x200, %0 \n\t" + "add %7, %1 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %5 \n\t" + "je 2f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm5, %%ymm5 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm7, %%ymm7 \n\t" + "vmaxps %%ymm15, %%ymm8, %%ymm8 \n\t" + "vmaxps %%ymm15, %%ymm9, %%ymm9 \n\t" + "vmaxps %%ymm15, %%ymm10, %%ymm10 \n\t" + "vmaxps %%ymm15, %%ymm11, %%ymm11 \n\t" + + // relu6 + "and $0x4, %5 \n\t" + "je 2f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm5, %%ymm5 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm7, %%ymm7 \n\t" + "vminps %%ymm12, %%ymm8, %%ymm8 \n\t" + "vminps %%ymm12, %%ymm9, %%ymm9 \n\t" + "vminps %%ymm12, %%ymm10, %%ymm10 \n\t" + "vminps %%ymm12, %%ymm11, %%ymm11 \n\t" + + "2: \n\t" + "vmovups %%ymm0, (%2) \n\t" + "vmovups %%ymm1, 0x20(%2) \n\t" + "vmovups %%ymm2, 0x40(%2) \n\t" + "vmovups %%ymm3, 0x60(%2) \n\t" + "vmovups %%ymm4, 0x80(%2) \n\t" + "vmovups %%ymm5, 0xA0(%2) \n\t" + "vmovups %%ymm6, (%2, %4) \n\t" + "vmovups %%ymm7, 0x20(%2, %4) \n\t" + "vmovups %%ymm8, 0x40(%2, %4) \n\t" + "vmovups %%ymm9, 0x60(%2, %4) \n\t" + "vmovups %%ymm10, 0x80(%2, %4) \n\t" + "vmovups %%ymm11, 0xA0(%2, %4) \n\t" + : + : "r"(curW), "r"(curI), "r"(curO), "r"(curB), "r"(I64(oStep)), "r"(store), + "c"(ic), "r"(I64(fStep)) + : "%eax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", + "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); +} + +inline void avx2_pointwise_kernel_12x8( + F32 *curI, const F32 *curW, F32 *curO, const F32 *curB, U32 oStep, U32 store, U32 ic, U32 fStep) +{ + __asm__ __volatile__( + "shr $3, %%ecx \n\t" + "mov %4, %%ebx \n\t" + "and $0x1, %%ebx \n\t" + "jne 0f \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups (%3), %%ymm1 \n\t" + "vmovups (%3), %%ymm2 \n\t" + "vmovups (%3), %%ymm3 \n\t" + "vmovups 
(%3), %%ymm4 \n\t" + "vmovups (%3), %%ymm5 \n\t" + "vmovups (%3), %%ymm6 \n\t" + "vmovups (%3), %%ymm7 \n\t" + "vmovups (%3), %%ymm8 \n\t" + "vmovups (%3), %%ymm9 \n\t" + "vmovups (%3), %%ymm10 \n\t" + "vmovups (%3), %%ymm11 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%2), %%ymm0 \n\t" + "vmovups 0x20(%2), %%ymm1 \n\t" + "vmovups 0x40(%2), %%ymm2 \n\t" + "vmovups 0x60(%2), %%ymm3 \n\t" + "vmovups 0x80(%2), %%ymm4 \n\t" + "vmovups 0xA0(%2), %%ymm5 \n\t" + "vmovups 0xC0(%2), %%ymm6 \n\t" + "vmovups 0xE0(%2), %%ymm7 \n\t" + "vmovups 0x100(%2), %%ymm8 \n\t" + "vmovups 0x120(%2), %%ymm9 \n\t" + "vmovups 0x140(%2), %%ymm10 \n\t" + "vmovups 0x160(%2), %%ymm11 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vmovaps (%0), %%ymm12 \n\t" + "vbroadcastss (%1), %%ymm13 \n\t" + "vbroadcastss 0x20(%1), %%ymm14 \n\t" + "vbroadcastss 0x40(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vbroadcastss 0x60(%1), %%ymm13 \n\t" + "vbroadcastss 0x80(%1), %%ymm14 \n\t" + "vbroadcastss 0xA0(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm5 \n\t" + "vbroadcastss 0xC0(%1), %%ymm13 \n\t" + "vbroadcastss 0xE0(%1), %%ymm14 \n\t" + "vbroadcastss 0x100(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm8 \n\t" + "vbroadcastss 0x120(%1), %%ymm13 \n\t" + "vbroadcastss 0x140(%1), %%ymm14 \n\t" + "vbroadcastss 0x160(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm11 \n\t" + + "vmovaps 0x20(%0), %%ymm12 \n\t" + "vbroadcastss 0x4(%1), %%ymm13 \n\t" + "vbroadcastss 0x24(%1), %%ymm14 \n\t" + "vbroadcastss 0x44(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vbroadcastss 0x64(%1), %%ymm13 \n\t" + "vbroadcastss 0x84(%1), %%ymm14 \n\t" + "vbroadcastss 0xA4(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm5 \n\t" + "vbroadcastss 0xC4(%1), %%ymm13 \n\t" + "vbroadcastss 0xE4(%1), %%ymm14 \n\t" + "vbroadcastss 0x104(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm8 \n\t" + "vbroadcastss 0x124(%1), %%ymm13 \n\t" + "vbroadcastss 0x144(%1), %%ymm14 \n\t" + "vbroadcastss 0x164(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm11 \n\t" + + "vmovaps 0x40(%0), %%ymm12 \n\t" + "vbroadcastss 0x8(%1), %%ymm13 \n\t" + "vbroadcastss 0x28(%1), %%ymm14 \n\t" + "vbroadcastss 0x48(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vbroadcastss 0x68(%1), %%ymm13 \n\t" + "vbroadcastss 0x88(%1), %%ymm14 \n\t" + "vbroadcastss 0xA8(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm5 \n\t" + "vbroadcastss 0xC8(%1), %%ymm13 \n\t" + "vbroadcastss 0xE8(%1), %%ymm14 \n\t" + "vbroadcastss 0x108(%1), %%ymm15 \n\t" + "vfmadd231ps 
%%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm8 \n\t" + "vbroadcastss 0x128(%1), %%ymm13 \n\t" + "vbroadcastss 0x148(%1), %%ymm14 \n\t" + "vbroadcastss 0x168(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm11 \n\t" + + "vmovaps 0x60(%0), %%ymm12 \n\t" + "vbroadcastss 0xC(%1), %%ymm13 \n\t" + "vbroadcastss 0x2C(%1), %%ymm14 \n\t" + "vbroadcastss 0x4C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vbroadcastss 0x6C(%1), %%ymm13 \n\t" + "vbroadcastss 0x8C(%1), %%ymm14 \n\t" + "vbroadcastss 0xAC(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm5 \n\t" + "vbroadcastss 0xCC(%1), %%ymm13 \n\t" + "vbroadcastss 0xEC(%1), %%ymm14 \n\t" + "vbroadcastss 0x10C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm8 \n\t" + "vbroadcastss 0x12C(%1), %%ymm13 \n\t" + "vbroadcastss 0x14C(%1), %%ymm14 \n\t" + "vbroadcastss 0x16C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm11 \n\t" + + "vmovaps 0x80(%0), %%ymm12 \n\t" + "vbroadcastss 0x10(%1), %%ymm13 \n\t" + "vbroadcastss 0x30(%1), %%ymm14 \n\t" + "vbroadcastss 0x50(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vbroadcastss 0x70(%1), %%ymm13 \n\t" + "vbroadcastss 0x90(%1), %%ymm14 \n\t" + "vbroadcastss 0xB0(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm5 \n\t" + "vbroadcastss 0xD0(%1), %%ymm13 \n\t" + "vbroadcastss 0xF0(%1), %%ymm14 \n\t" + "vbroadcastss 0x110(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm8 \n\t" + "vbroadcastss 0x130(%1), %%ymm13 \n\t" + "vbroadcastss 0x150(%1), %%ymm14 \n\t" + "vbroadcastss 0x170(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm11 \n\t" + + "vmovaps 0xA0(%0), %%ymm12 \n\t" + "vbroadcastss 0x14(%1), %%ymm13 \n\t" + "vbroadcastss 0x34(%1), %%ymm14 \n\t" + "vbroadcastss 0x54(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vbroadcastss 0x74(%1), %%ymm13 \n\t" + "vbroadcastss 0x94(%1), %%ymm14 \n\t" + "vbroadcastss 0xB4(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm5 \n\t" + "vbroadcastss 0xD4(%1), %%ymm13 \n\t" + "vbroadcastss 0xF4(%1), %%ymm14 \n\t" + "vbroadcastss 0x114(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm8 \n\t" + "vbroadcastss 0x134(%1), %%ymm13 \n\t" + "vbroadcastss 0x154(%1), %%ymm14 \n\t" + "vbroadcastss 0x174(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, 
%%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm11 \n\t" + + "vmovaps 0xC0(%0), %%ymm12 \n\t" + "vbroadcastss 0x18(%1), %%ymm13 \n\t" + "vbroadcastss 0x38(%1), %%ymm14 \n\t" + "vbroadcastss 0x58(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vbroadcastss 0x78(%1), %%ymm13 \n\t" + "vbroadcastss 0x98(%1), %%ymm14 \n\t" + "vbroadcastss 0xB8(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm5 \n\t" + "vbroadcastss 0xD8(%1), %%ymm13 \n\t" + "vbroadcastss 0xF8(%1), %%ymm14 \n\t" + "vbroadcastss 0x118(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm8 \n\t" + "vbroadcastss 0x138(%1), %%ymm13 \n\t" + "vbroadcastss 0x158(%1), %%ymm14 \n\t" + "vbroadcastss 0x178(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm11 \n\t" + + "vmovaps 0xE0(%0), %%ymm12 \n\t" + "vbroadcastss 0x1C(%1), %%ymm13 \n\t" + "vbroadcastss 0x3C(%1), %%ymm14 \n\t" + "vbroadcastss 0x5C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vbroadcastss 0x7C(%1), %%ymm13 \n\t" + "vbroadcastss 0x9C(%1), %%ymm14 \n\t" + "vbroadcastss 0xBC(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm5 \n\t" + "vbroadcastss 0xDC(%1), %%ymm13 \n\t" + "vbroadcastss 0xFC(%1), %%ymm14 \n\t" + "vbroadcastss 0x11C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm8 \n\t" + "vbroadcastss 0x13C(%1), %%ymm13 \n\t" + "vbroadcastss 0x15C(%1), %%ymm14 \n\t" + "vbroadcastss 0x17C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm11 \n\t" + + "add $0x100, %0 \n\t" + "add %6, %1 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %4 \n\t" + "je 2f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm5, %%ymm5 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm7, %%ymm7 \n\t" + "vmaxps %%ymm15, %%ymm8, %%ymm8 \n\t" + "vmaxps %%ymm15, %%ymm9, %%ymm9 \n\t" + "vmaxps %%ymm15, %%ymm10, %%ymm10 \n\t" + "vmaxps %%ymm15, %%ymm11, %%ymm11 \n\t" + + // relu6 + "and $0x4, %4 \n\t" + "je 2f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm5, %%ymm5 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm7, %%ymm7 \n\t" + "vminps %%ymm12, %%ymm8, %%ymm8 \n\t" + "vminps %%ymm12, %%ymm9, %%ymm9 \n\t" + "vminps %%ymm12, %%ymm10, %%ymm10 \n\t" + "vminps %%ymm12, %%ymm11, %%ymm11 \n\t" + + "2: \n\t" + "vmovups %%ymm0, (%2) \n\t" + 
"vmovups %%ymm1, 0x20(%2) \n\t" + "vmovups %%ymm2, 0x40(%2) \n\t" + "vmovups %%ymm3, 0x60(%2) \n\t" + "vmovups %%ymm4, 0x80(%2) \n\t" + "vmovups %%ymm5, 0xA0(%2) \n\t" + "vmovups %%ymm6, 0xC0(%2) \n\t" + "vmovups %%ymm7, 0xE0(%2) \n\t" + "vmovups %%ymm8, 0x100(%2) \n\t" + "vmovups %%ymm9, 0x120(%2) \n\t" + "vmovups %%ymm10, 0x140(%2) \n\t" + "vmovups %%ymm11, 0x160(%2) \n\t" + : + : "r"(curW), "r"(curI), "r"(curO), "r"(curB), "r"(store), "c"(ic), "r"(I64(fStep)) + : "%eax", "%rax", "%ebx", "%r9", "%r10", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", + "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); +} + +inline void avx2_pointwise_kernel_1x32( + F32 *curI, const F32 *curW, F32 *curO, const F32 *curB, U32 oStep, U32 store, U32 ic, U32 fStep) +{ + __asm__ __volatile__("shr $3, %%ecx \n\t" + "mov %5, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups 0x20(%3), %%ymm3 \n\t" + "vmovups 0x40(%3), %%ymm6 \n\t" + "vmovups 0x60(%3), %%ymm9 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%rax \n\t" + "vmovups (%%rax), %%ymm0 \n\t" + "add %4, %%rax \n\t" + "vmovups (%%rax), %%ymm3 \n\t" + "add %4, %%rax \n\t" + "vmovups (%%rax), %%ymm6 \n\t" + "add %4, %%rax \n\t" + "vmovups (%%rax), %%ymm9 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vbroadcastss (%1), %%ymm12 \n\t" + "vmovaps (%0), %%ymm13 \n\t" + "vmovaps 0x20(%0), %%ymm14 \n\t" + "vmovaps 0x40(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovaps 0x60(%0), %%ymm14 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x4(%1), %%ymm12 \n\t" + "vmovaps 0x80(%0), %%ymm13 \n\t" + "vmovaps 0xA0(%0), %%ymm14 \n\t" + "vmovaps 0xC0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovaps 0xE0(%0), %%ymm14 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x8(%1), %%ymm12 \n\t" + "vmovaps 0x100(%0), %%ymm13 \n\t" + "vmovaps 0x120(%0), %%ymm14 \n\t" + "vmovaps 0x140(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovaps 0x160(%0), %%ymm14 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0xC(%1), %%ymm12 \n\t" + "vmovaps 0x180(%0), %%ymm13 \n\t" + "vmovaps 0x1A0(%0), %%ymm14 \n\t" + "vmovaps 0x1C0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovaps 0x1E0(%0), %%ymm14 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x10(%1), %%ymm12 \n\t" + "vmovaps 0x200(%0), %%ymm13 \n\t" + "vmovaps 0x220(%0), %%ymm14 \n\t" + "vmovaps 0x240(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovaps 0x260(%0), %%ymm14 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x14(%1), %%ymm12 \n\t" + "vmovaps 0x280(%0), %%ymm13 \n\t" + "vmovaps 0x2A0(%0), %%ymm14 \n\t" + "vmovaps 0x2C0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovaps 0x2E0(%0), %%ymm14 \n\t" + "vfmadd231ps %%ymm14, 
%%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x18(%1), %%ymm12 \n\t" + "vmovaps 0x300(%0), %%ymm13 \n\t" + "vmovaps 0x320(%0), %%ymm14 \n\t" + "vmovaps 0x340(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovaps 0x360(%0), %%ymm14 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x1C(%1), %%ymm12 \n\t" + "vmovaps 0x380(%0), %%ymm13 \n\t" + "vmovaps 0x3A0(%0), %%ymm14 \n\t" + "vmovaps 0x3C0(%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovaps 0x3E0(%0), %%ymm13 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm9 \n\t" + + "add $0x400, %0 \n\t" + "add %7, %1 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %5 \n\t" + "je 2f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm9, %%ymm9 \n\t" + + // relu6 + "and $0x4, %5 \n\t" + "je 2f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm9, %%ymm9 \n\t" + + "2: \n\t" + "vmovups %%ymm0, (%2) \n\t" + "add %4, %2 \n\t" + "vmovups %%ymm3, (%2) \n\t" + "add %4, %2 \n\t" + "vmovups %%ymm6, (%2) \n\t" + "add %4, %2 \n\t" + "vmovups %%ymm9, (%2) \n\t" + : + : "r"(curW), "r"(curI), "r"(curO), "r"(curB), "r"(I64(oStep)), "r"(store), + "c"(ic), "r"(I64(fStep)) + : "%eax", "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); +} + +inline void avx2_pointwise_kernel_1x24( + F32 *curI, const F32 *curW, F32 *curO, const F32 *curB, U32 oStep, U32 store, U32 ic, U32 fStep) +{ + __asm__ __volatile__("shr $3, %%ecx \n\t" + "mov %5, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups 0x20(%3), %%ymm4 \n\t" + "vmovups 0x40(%3), %%ymm8 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%2), %%ymm0 \n\t" + "vmovups (%2, %4), %%ymm4 \n\t" + "vmovups (%2, %4, 2), %%ymm8 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vmovaps (%0), %%ymm12 \n\t" + "vmovaps 0x20(%0), %%ymm13 \n\t" + "vmovaps 0x40(%0), %%ymm14 \n\t" + "vbroadcastss 0x0(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + + "vmovaps 0x60(%0), %%ymm12 \n\t" + "vmovaps 0x80(%0), %%ymm13 \n\t" + "vmovaps 0xA0(%0), %%ymm14 \n\t" + "vbroadcastss 0x4(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + + "vmovaps 0xC0(%0), %%ymm12 \n\t" + "vmovaps 0xE0(%0), %%ymm13 \n\t" + "vmovaps 0x100(%0), %%ymm14 \n\t" + "vbroadcastss 0x8(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + + "vmovaps 0x120(%0), %%ymm12 \n\t" + "vmovaps 0x140(%0), %%ymm13 \n\t" + "vmovaps 0x160(%0), %%ymm14 \n\t" + "vbroadcastss 0xC(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, 
%%ymm14, %%ymm8 \n\t" + + "vmovaps 0x180(%0), %%ymm12 \n\t" + "vmovaps 0x1A0(%0), %%ymm13 \n\t" + "vmovaps 0x1C0(%0), %%ymm14 \n\t" + "vbroadcastss 0x10(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + + "vmovaps 0x1E0(%0), %%ymm12 \n\t" + "vmovaps 0x200(%0), %%ymm13 \n\t" + "vmovaps 0x220(%0), %%ymm14 \n\t" + "vbroadcastss 0x14(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + + "vmovaps 0x240(%0), %%ymm12 \n\t" + "vmovaps 0x260(%0), %%ymm13 \n\t" + "vmovaps 0x280(%0), %%ymm14 \n\t" + "vbroadcastss 0x18(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + + "vmovaps 0x2A0(%0), %%ymm12 \n\t" + "vmovaps 0x2C0(%0), %%ymm13 \n\t" + "vmovaps 0x2E0(%0), %%ymm14 \n\t" + "vbroadcastss 0x1C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + + "add $0x300, %0 \n\t" + "add %7, %1 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %5 \n\t" + "je 2f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm8, %%ymm8 \n\t" + + // relu6 + "and $0x4, %5 \n\t" + "je 2f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm8, %%ymm8 \n\t" + + "2: \n\t" + "vmovups %%ymm0, (%2) \n\t" + "vmovups %%ymm4, (%2, %4) \n\t" + "vmovups %%ymm8, (%2, %4, 2) \n\t" + : + : "r"(curW), "r"(curI), "r"(curO), "r"(curB), "r"(I64(oStep)), "r"(store), + "c"(ic), "r"(I64(fStep)) + : "%eax", "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); +} + +inline void avx2_pointwise_kernel_1x16( + F32 *curI, const F32 *curW, F32 *curO, const F32 *curB, U32 oStep, U32 store, U32 ic, U32 fStep) +{ + __asm__ __volatile__("shr $3, %%ecx \n\t" + "mov %5, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%3), %%ymm0 \n\t" + "vmovups 0x20(%3), %%ymm4 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%2), %%ymm0 \n\t" + "vmovups (%2, %4), %%ymm4 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vmovaps (%0), %%ymm12 \n\t" + "vmovaps 0x20(%0), %%ymm13 \n\t" + "vbroadcastss 0x0(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vmovaps 0x40(%0), %%ymm12 \n\t" + "vmovaps 0x60(%0), %%ymm13 \n\t" + "vbroadcastss 0x4(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vmovaps 0x80(%0), %%ymm12 \n\t" + "vmovaps 0xA0(%0), %%ymm13 \n\t" + "vbroadcastss 0x8(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vmovaps 0xC0(%0), %%ymm12 \n\t" + "vmovaps 0xE0(%0), %%ymm13 \n\t" + "vbroadcastss 0xC(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vmovaps 0x100(%0), %%ymm12 \n\t" + "vmovaps 0x120(%0), %%ymm13 \n\t" + "vbroadcastss 0x10(%1), %%ymm15 \n\t" + 
"vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vmovaps 0x140(%0), %%ymm12 \n\t" + "vmovaps 0x160(%0), %%ymm13 \n\t" + "vbroadcastss 0x14(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vmovaps 0x180(%0), %%ymm12 \n\t" + "vmovaps 0x1A0(%0), %%ymm13 \n\t" + "vbroadcastss 0x18(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vmovaps 0x1C0(%0), %%ymm12 \n\t" + "vmovaps 0x1E0(%0), %%ymm13 \n\t" + "vbroadcastss 0x1C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "add $0x200, %0 \n\t" + "add %7, %1 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %5 \n\t" + "je 2f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + + // relu6 + "and $0x4, %5 \n\t" + "je 2f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + + "2: \n\t" + "vmovups %%ymm0, (%2) \n\t" + "vmovups %%ymm4, (%2, %4) \n\t" + : + : "r"(curW), "r"(curI), "r"(curO), "r"(curB), "r"(I64(oStep)), "r"(store), + "c"(ic), "r"(I64(fStep)) + : "%eax", "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); +} + +inline void avx2_pointwise_kernel_1x8( + F32 *curI, const F32 *curW, F32 *curO, const F32 *curB, U32 oStep, U32 store, U32 ic, U32 fStep) +{ + __asm__ __volatile__( + "shr $3, %%ecx \n\t" + "mov %4, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%3), %%ymm0 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%2), %%ymm0 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vmovaps (%0), %%ymm12 \n\t" + "vbroadcastss 0x0(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vmovaps 0x20(%0), %%ymm12 \n\t" + "vbroadcastss 0x4(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vmovaps 0x40(%0), %%ymm12 \n\t" + "vbroadcastss 0x8(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vmovaps 0x60(%0), %%ymm12 \n\t" + "vbroadcastss 0xC(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vmovaps 0x80(%0), %%ymm12 \n\t" + "vbroadcastss 0x10(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vmovaps 0xA0(%0), %%ymm12 \n\t" + "vbroadcastss 0x14(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vmovaps 0xC0(%0), %%ymm12 \n\t" + "vbroadcastss 0x18(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vmovaps 0xE0(%0), %%ymm12 \n\t" + "vbroadcastss 0x1C(%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "add $0x100, %0 \n\t" + "add %6, %1 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %4 \n\t" + "je 2f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + + // relu6 + "and $0x4, %4 \n\t" + "je 2f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + + "2: \n\t" + "vmovups %%ymm0, (%2) \n\t" + : + : "r"(curW), "r"(curI), "r"(curO), "r"(curB), "r"(store), "c"(ic), "r"(I64(fStep)) + : "%eax", "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", 
"%ymm5", "%ymm6", "%ymm7", + "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory", + "cc"); +} + +EE convolution_1x1_direct(TensorDesc inputDesc, + F32 *inArray, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(tmpBytes); + DataType idt, odt, fdt; + DataFormat idf, odf, fdf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (((fdf != DF_NCHWCxN24) && (fdf != DF_NCHWCxN32)) || (idf != DF_NCHWC8)) { + CHECK_STATUS(NOT_MATCH); + } + + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + F32 *ftmp = inArray; + F32 *btmp = (F32 *)align_addr(tmp, 32); + filterArray = (F32 *)align_addr(filterArray, 32); + + U32 oStep = oh * ow * SIMDW * 4; + U32 fStep = ih * iw * SIMDW * 4; + U32 store = 0, icSize = 0, ocBlocking = 0; + U32 ohow = oh * ow; + U32 icPadding = (ic + 8 - 1) / 8 * 8; + kernel_func kernel[2][4] = {{avx2_pointwise_kernel_1x8, avx2_pointwise_kernel_1x16, + avx2_pointwise_kernel_1x24, avx2_pointwise_kernel_1x32}, + {avx2_pointwise_kernel_12x8, avx2_pointwise_kernel_6x16, avx2_pointwise_kernel_4x24, + avx2_pointwise_kernel_3x32}}; + + U32 unroll_oc_array[4] = {8, 16, 24, 32}; + U32 unroll_hw_array[4] = {12, 6, 4, 3}; + U32 unroll_oc = 24, unroll_hw = 4; + + if ((oc % 24 != 0) && (oc % 32 == 0)) { + unroll_oc = 32; + unroll_hw = 3; + } + +#ifdef _USE_OPENMP + U32 alpha = (ohow + OMP_NUM_THREADS * BLOCK_HW_DIM - 1) / (OMP_NUM_THREADS * BLOCK_HW_DIM); + U32 block_hw_dim = (ohow + OMP_NUM_THREADS * alpha - 1 ) / (OMP_NUM_THREADS * alpha); +#else + U32 block_hw_dim = BLOCK_HW_DIM; +#endif + + U32 hwBlockNums = (ohow + block_hw_dim - 1 ) / block_hw_dim; + + if ((paddingT != 0) || (paddingB != 0) || (paddingL != 0) || (paddingR != 0)) { + __m256 zero = _mm256_set1_ps(0.); + switch (activationDesc.mode) { + case ACTIVATION_NULL: { + for (U32 ocb = 0; ocb < oc; ocb += 8) { + _mm256_store_ps(btmp + ocb, _mm256_loadu_ps(biasArray + ocb)); + } + break; + } + case ACTIVATION_RELU: { + for (U32 ocb = 0; ocb < oc; ocb += 8) { + _mm256_store_ps(btmp + ocb, _mm256_max_ps(zero, _mm256_loadu_ps(biasArray + ocb))); + } + break; + } + case ACTIVATION_RELU6: { + __m256 six = _mm256_set1_ps(6.); + for (U32 ocb = 0; ocb < oc; ocb += 8) { + _mm256_store_ps(btmp + ocb, _mm256_min_ps(six, _mm256_max_ps(zero, _mm256_loadu_ps(biasArray + ocb)))); + } + break; + } + default: + return NOT_SUPPORTED; + } + } + + for (U32 n = 0; n < in; ++n) { + for (U32 ocbb = 0; ocbb < oc; ocbb += ocBlocking) { + store = 0; + ocBlocking = UNI_MIN(oc - ocbb, BLOCK_OC_DIM); + for (U32 icb = 0; icb < icPadding; icb += icSize) { + icSize = UNI_MIN(icPadding - icb, BLOCK_IC_DIM); + store |= (icb > 0); + if (icb == icPadding - icSize) { + store |= U32(activationDesc.mode) << 1; + } + F32 *curI = ftmp + icb * ih * iw; + if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 hwIdx = 0; hwIdx < hwBlockNums; ++hwIdx) { + U32 hw = hwIdx * block_hw_dim; + U32 
hwSize = UNI_MIN(block_hw_dim, ohow - hw); + U32 ocSize = 0, ihwSize = 0; + for (U32 ocb = ocbb; ocb < ocbb + ocBlocking; ocb += ocSize) { + ocSize = UNI_MIN(ocbb + ocBlocking - ocb, unroll_oc); + ocSize = unroll_oc_array[(ocSize>>3)-1]; + U32 unroll_hw = unroll_hw_array[(ocSize>>3)-1]; + const F32 *curB = biasArray + ocb; + const F32 *curW = filterArray + ocb * icPadding + icb * ocSize; + F32 *curO = outArray + ocb * oh * ow; + for (U32 ihw = hw; ihw < hw + hwSize; ihw += ihwSize) { + if ((hw + hwSize - ihw) >= unroll_hw) { + ihwSize = unroll_hw; + } else { + ihwSize = 1; + } + F32 *calI = curI + ihw * SIMDW; + F32 *calO = curO + ihw * SIMDW; + kernel[ihwSize>1][(ocSize>>3)-1](calI, curW, calO, curB, oStep, store, icSize, fStep); + } + } + } + } else { +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (I32 h = 0; h < oh; ++h) { + U32 ocSize = 0, ihwSize = 0; + for (U32 ocb = ocbb; ocb < ocbb + ocBlocking; ocb += ocSize) { + ocSize = UNI_MIN(ocbb + ocBlocking - ocb, unroll_oc); + ocSize = unroll_oc_array[(ocSize>>3)-1]; + U32 unroll_hw = unroll_hw_array[(ocSize>>3)-1]; + const F32 *curB = biasArray + ocb; + const F32 *curW = filterArray + ocb * icPadding + icb * ocSize; + F32 *curO = outArray + ocb * oh * ow; + for (U32 w = 0; w < ow; w += ihwSize) { + F32 *calI = curI + ((h - paddingT) * iw + w - paddingL) * SIMDW; + F32 *calO = curO + (h * ow + w) * SIMDW; + ihwSize = 1; + if ((h < paddingT) || (h >= ih + paddingT) || (w < paddingL) || (w >= paddingL + iw)) { + for (U32 oci = 0; oci < ocSize; oci += SIMDW) { + _mm256_storeu_ps(calO + ohow * oci, _mm256_load_ps(btmp + oci + ocb)); + } + continue; + } + if ((iw - (w - paddingL)) >= unroll_hw) { + ihwSize = unroll_hw; + } + kernel[ihwSize>1][(ocSize>>3)-1](calI, curW, calO, curB, oStep, store, icSize, fStep); + } + } + } + } + } + } + inArray += ic * ih * iw; + outArray += oc * oh * ow; + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/convolution_2x2_direct.cpp b/compute/tensor/src/cpu/x86/fp32/convolution_2x2_direct.cpp new file mode 100644 index 00000000..55d24ad7 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/convolution_2x2_direct.cpp @@ -0,0 +1,1769 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
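Editor's note before the next file: every avx2_* kernel in these files decodes the same packed "store" word that the driver loops build above (bit 0 is set once icb > 0, and the activation mode is shifted into bits 1-2 on the last input-channel block). A minimal scalar sketch of that contract follows, assuming ACTIVATION_RELU == 1 and ACTIVATION_RELU6 == 2 as the kernels' "and $0x6" / "and $0x4" tests imply; the StoreBits names and the helper are illustrative, not part of the patch:

enum StoreBits : unsigned {
    STORE_ACCUMULATE = 0x1, // not the first ic block: reload partial sums instead of bias
    STORE_RELU       = 0x2, // ACTIVATION_RELU << 1
    STORE_RELU6      = 0x4  // ACTIVATION_RELU6 << 1
};

// Per-lane model of a kernel invocation's prologue and epilogue.
inline float kernel_epilogue(float dot, float bias, float partial, unsigned store)
{
    // prologue: accumulators start from bias on the first ic block,
    // from the previously stored partial sums afterwards
    float v = dot + ((store & STORE_ACCUMULATE) ? partial : bias);
    if (store & (STORE_RELU | STORE_RELU6)) { // asm: "and $0x6, store"
        v = (v > 0.0f) ? v : 0.0f;            // vmaxps against a zeroed ymm15
    }
    if (store & STORE_RELU6) {                // asm: "and $0x4, store"
        v = (v < 6.0f) ? v : 6.0f;            // vminps; 0x40C00000 is 6.0f
    }
    return v;
}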
+ +#include "sys.h" +#include "error.h" +#include "types.h" + +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#include "cpu/x86/fp32/transform_functions_fp32.h" + +#define UNROLL_W 3 +#define UNROLL_OC_DIM 8 +#define BLOCK_OC_DIM 32 +#define BLOCK_IC_DIM 32 +#define UNROLL_IC_BLOCK_DIM 8 +#define BLOCK_HW_DIM 768 +#define align_addr(addr, unit) (((uintptr_t)addr + unit - 1) / unit * unit) + +typedef void (*kernel_func)(F32 *curI, + const F32 *curW, + F32 *curO, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 store, + const F32 *curB, + U32 dw, + F32 *in_1, + F32 *in_2, + U32 ic, + U32 fStep); + +void avx2_conv_kernel_3x32(F32 *curI, + const F32 *curW, + F32 *curO, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 store, + const F32 *curB, + U32 dw, + F32 *in_1, + F32 *in_2, + U32 ic, + U32 fStep) +{ + __asm__ __volatile__("mov %3, %%ecx \n\t" + "and $0x1, %%ecx \n\t" + "jne 0f \n\t" + "vmovups (%1), %%ymm0 \n\t" + "vmovups (%1), %%ymm1 \n\t" + "vmovups (%1), %%ymm2 \n\t" + "vmovups 0x20(%1), %%ymm3 \n\t" + "vmovups 0x20(%1), %%ymm4 \n\t" + "vmovups 0x20(%1), %%ymm5 \n\t" + "vmovups 0x40(%1), %%ymm6 \n\t" + "vmovups 0x40(%1), %%ymm7 \n\t" + "vmovups 0x40(%1), %%ymm8 \n\t" + "vmovups 0x60(%1), %%ymm9 \n\t" + "vmovups 0x60(%1), %%ymm10 \n\t" + "vmovups 0x60(%1), %%ymm11 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "mov %0, %%rax \n\t" + "vmovups (%%rax), %%ymm0 \n\t" + "vmovups 0x20(%%rax), %%ymm1 \n\t" + "vmovups 0x40(%%rax), %%ymm2 \n\t" + "add %2, %%rax \n\t" + "vmovups (%%rax), %%ymm3 \n\t" + "vmovups 0x20(%%rax), %%ymm4 \n\t" + "vmovups 0x40(%%rax), %%ymm5 \n\t" + "add %2, %%rax \n\t" + "vmovups (%%rax), %%ymm6 \n\t" + "vmovups 0x20(%%rax), %%ymm7 \n\t" + "vmovups 0x40(%%rax), %%ymm8 \n\t" + "add %2, %%rax \n\t" + "vmovups (%%rax), %%ymm9 \n\t" + "vmovups 0x20(%%rax), %%ymm10 \n\t" + "vmovups 0x40(%%rax), %%ymm11 \n\t" + + ".align 16 \n\t" + "1: \n\t" + : + : "r"(curO), "r"(curB), "r"(I64(oStep)), "r"(store) + : "%ecx", "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "memory", "cc"); + + __asm__ __volatile__(".align 16 \n\t" + "2: \n\t" + + "mov %5, %%ebx \n\t" + ".align 16 \n\t" + "3: \n\t" + + "mov %4, %%ecx \n\t" + ".align 16 \n\t" + "4: \n\t" + + "vbroadcastss (%0), %%ymm12 \n\t" + "vbroadcastss (%1), %%ymm13 \n\t" + "vbroadcastss (%2), %%ymm14 \n\t" + "vmovups 0x0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x20(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0x40(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0x60(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x4(%0), %%ymm12 \n\t" + "vbroadcastss 0x4(%1), %%ymm13 \n\t" + "vbroadcastss 0x4(%2), %%ymm14 \n\t" + "vmovups 0x80(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0xA0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + 
"vmovups 0xC0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0xE0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x8(%0), %%ymm12 \n\t" + "vbroadcastss 0x8(%1), %%ymm13 \n\t" + "vbroadcastss 0x8(%2), %%ymm14 \n\t" + "vmovups 0x100(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x120(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0x140(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0x160(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0xC(%0), %%ymm12 \n\t" + "vbroadcastss 0xC(%1), %%ymm13 \n\t" + "vbroadcastss 0xC(%2), %%ymm14 \n\t" + "vmovups 0x180(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x1A0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0x1C0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0x1E0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x10(%0), %%ymm12 \n\t" + "vbroadcastss 0x10(%1), %%ymm13 \n\t" + "vbroadcastss 0x10(%2), %%ymm14 \n\t" + "vmovups 0x200(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x220(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0x240(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0x260(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x14(%0), %%ymm12 \n\t" + "vbroadcastss 0x14(%1), %%ymm13 \n\t" + "vbroadcastss 0x14(%2), %%ymm14 \n\t" + "vmovups 0x280(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x2A0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0x2C0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0x2E0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, 
%%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x18(%0), %%ymm12 \n\t" + "vbroadcastss 0x18(%1), %%ymm13 \n\t" + "vbroadcastss 0x18(%2), %%ymm14 \n\t" + "vmovups 0x300(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x320(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0x340(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0x360(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x1C(%0), %%ymm12 \n\t" + "vbroadcastss 0x1C(%1), %%ymm13 \n\t" + "vbroadcastss 0x1C(%2), %%ymm14 \n\t" + "vmovups 0x380(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x3A0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0x3C0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0x3E0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add %7, %0 \n\t" + "add %7, %1 \n\t" + "add %7, %2 \n\t" + "add $0x400, %3 \n\t" + "dec %%ecx \n\t" + "jg 4b \n\t" + + "add %6, %0 \n\t" + "add %6, %1 \n\t" + "add %6, %2 \n\t" + "dec %%ebx \n\t" + "jg 3b \n\t" + + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "dec %%eax \n\t" + "jg 2b \n\t" + + : + : "r"(curI), "r"(in_1), "r"(in_2), "r"(curW), "r"(fw), "r"(fh), + "r"(I64(iStep)), "r"(I64(dw)), "a"(ic / 8), "r"(I64(fStep)) + : "%ecx", "%ebx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); + + __asm__ __volatile__( + // relu + "and $0x6, %2 \n\t" + "je 5f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm5, %%ymm5 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm7, %%ymm7 \n\t" + "vmaxps %%ymm15, %%ymm8, %%ymm8 \n\t" + "vmaxps %%ymm15, %%ymm9, %%ymm9 \n\t" + "vmaxps %%ymm15, %%ymm10, %%ymm10 \n\t" + "vmaxps %%ymm15, %%ymm11, %%ymm11 \n\t" + + // relu6 + "and $0x4, %2 \n\t" + "je 5f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm5, %%ymm5 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm7, %%ymm7 \n\t" + "vminps %%ymm12, %%ymm8, %%ymm8 \n\t" + "vminps %%ymm12, %%ymm9, %%ymm9 \n\t" + "vminps %%ymm12, %%ymm10, %%ymm10 \n\t" + "vminps 
%%ymm12, %%ymm11, %%ymm11 \n\t" + + "5: \n\t" + "vmovups %%ymm0, (%0) \n\t" + "vmovups %%ymm1, 0x20(%0) \n\t" + "vmovups %%ymm2, 0x40(%0) \n\t" + "add %1, %0 \n\t" + "vmovups %%ymm3, (%0) \n\t" + "vmovups %%ymm4, 0x20(%0) \n\t" + "vmovups %%ymm5, 0x40(%0) \n\t" + "add %1, %0 \n\t" + "vmovups %%ymm6, (%0) \n\t" + "vmovups %%ymm7, 0x20(%0) \n\t" + "vmovups %%ymm8, 0x40(%0) \n\t" + "add %1, %0 \n\t" + "vmovups %%ymm9, (%0) \n\t" + "vmovups %%ymm10, 0x20(%0) \n\t" + "vmovups %%ymm11, 0x40(%0) \n\t" + : "+r"(curO) + : "r"(I64(oStep)), "r"(store) + : "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", + "%ymm9", "%ymm10", "%ymm11", "memory", "cc"); +} + +void avx2_conv_kernel_2x32(F32 *curI, + const F32 *curW, + F32 *curO, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 store, + const F32 *curB, + U32 dw, + F32 *in_1, + F32 *in_2, + U32 ic, + U32 fStep) +{ + __asm__ __volatile__("mov %7, %%ecx \n\t" + "and $0x1, %%ecx \n\t" + "jne 0f \n\t" + "vmovups (%8), %%ymm0 \n\t" + "vmovups (%8), %%ymm1 \n\t" + "vmovups 0x20(%8), %%ymm3 \n\t" + "vmovups 0x20(%8), %%ymm4 \n\t" + "vmovups 0x40(%8), %%ymm6 \n\t" + "vmovups 0x40(%8), %%ymm7 \n\t" + "vmovups 0x60(%8), %%ymm9 \n\t" + "vmovups 0x60(%8), %%ymm10 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "mov %1, %8 \n\t" + "vmovups (%8), %%ymm0 \n\t" + "vmovups 0x20(%8), %%ymm1 \n\t" + "add %4, %8 \n\t" + "vmovups (%8), %%ymm3 \n\t" + "vmovups 0x20(%8), %%ymm4 \n\t" + "add %4, %8 \n\t" + "vmovups (%8), %%ymm6 \n\t" + "vmovups 0x20(%8), %%ymm7 \n\t" + "add %4, %8 \n\t" + "vmovups (%8), %%ymm9 \n\t" + "vmovups 0x20(%8), %%ymm10 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "mov %5, %%ebx \n\t" + ".align 16 \n\t" + "2: \n\t" + + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "3: \n\t" + + "vbroadcastss (%0), %%ymm12 \n\t" + "vbroadcastss (%10), %%ymm13 \n\t" + "vmovups 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x20(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0x40(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vmovups 0x60(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "vbroadcastss 0x4(%0), %%ymm12 \n\t" + "vbroadcastss 0x4(%10), %%ymm13 \n\t" + "vmovups 0x80(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0xA0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0xC0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vmovups 0xE0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "vbroadcastss 0x8(%0), %%ymm12 \n\t" + "vbroadcastss 0x8(%10), %%ymm13 \n\t" + "vmovups 0x100(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x120(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0x140(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vmovups 0x160(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + 
+ "vbroadcastss 0xC(%0), %%ymm12 \n\t" + "vbroadcastss 0xC(%10), %%ymm13 \n\t" + "vmovups 0x180(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x1A0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0x1C0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vmovups 0x1E0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "vbroadcastss 0x10(%0), %%ymm12 \n\t" + "vbroadcastss 0x10(%10), %%ymm13 \n\t" + "vmovups 0x200(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x220(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0x240(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vmovups 0x260(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "vbroadcastss 0x14(%0), %%ymm12 \n\t" + "vbroadcastss 0x14(%10), %%ymm13 \n\t" + "vmovups 0x280(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x2A0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0x2C0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vmovups 0x2E0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "vbroadcastss 0x18(%0), %%ymm12 \n\t" + "vbroadcastss 0x18(%10), %%ymm13 \n\t" + "vmovups 0x300(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x320(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0x340(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vmovups 0x360(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "vbroadcastss 0x1C(%0), %%ymm12 \n\t" + "vbroadcastss 0x1C(%10), %%ymm13 \n\t" + "vmovups 0x380(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x3A0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0x3C0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vmovups 0x3E0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "add %9, %0 \n\t" + "add %9, %10 \n\t" + "add $0x400, %2 \n\t" + "dec %%ecx \n\t" + "jg 3b \n\t" + + "add %6, %0 \n\t" + "add %6, %10 \n\t" + "dec %%ebx \n\t" + "jg 2b \n\t" + + "add %12, %0 \n\t" + "add %12, %10 \n\t" + "dec %%eax \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %7 \n\t" + "je 4f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm7, 
%%ymm7 \n\t" + "vmaxps %%ymm15, %%ymm9, %%ymm9 \n\t" + "vmaxps %%ymm15, %%ymm10, %%ymm10 \n\t" + + // relu6 + "and $0x4, %7 \n\t" + "je 4f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm7, %%ymm7 \n\t" + "vminps %%ymm12, %%ymm9, %%ymm9 \n\t" + "vminps %%ymm12, %%ymm10, %%ymm10 \n\t" + + "4: \n\t" + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + "add %4, %1 \n\t" + "vmovups %%ymm3, (%1) \n\t" + "vmovups %%ymm4, 0x20(%1) \n\t" + "add %4, %1 \n\t" + "vmovups %%ymm6, (%1) \n\t" + "vmovups %%ymm7, 0x20(%1) \n\t" + "add %4, %1 \n\t" + "vmovups %%ymm9, (%1) \n\t" + "vmovups %%ymm10, 0x20(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(curW), "r"(fw), "r"(I64(oStep)), "r"(fh), + "r"(I64(iStep)), "r"(store), "r"(curB), "r"(I64(dw)), "r"(in_1), + "a"(ic / 8), "r"(I64(fStep)) + : "%ecx", "%ebx", "%ymm0", "%ymm1", "%ymm3", "%ymm4", "%ymm6", "%ymm7", + "%ymm9", "%ymm10", "%ymm12", "%ymm13", "%ymm15", "memory"); +} + +void avx2_conv_kernel_1x32(F32 *curI, + const F32 *curW, + F32 *curO, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 store, + const F32 *curB, + U32 dw, + F32 *in_1, + F32 *in_2, + U32 ic, + U32 fStep) +{ + __asm__ __volatile__( + "mov %7, %%ecx \n\t" + "and $0x1, %%ecx \n\t" + "jne 0f \n\t" + "vmovups (%8), %%ymm0 \n\t" + "vmovups 0x20(%8), %%ymm3 \n\t" + "vmovups 0x40(%8), %%ymm6 \n\t" + "vmovups 0x60(%8), %%ymm9 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "mov %1, %8 \n\t" + "vmovups (%8), %%ymm0 \n\t" + "add %4, %8 \n\t" + "vmovups (%8), %%ymm3 \n\t" + "add %4, %8 \n\t" + "vmovups (%8), %%ymm6 \n\t" + "add %4, %8 \n\t" + "vmovups (%8), %%ymm9 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "mov %5, %%ebx \n\t" + ".align 16 \n\t" + "2: \n\t" + + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "3: \n\t" + + "vbroadcastss (%0), %%ymm12 \n\t" + "vmovups 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x20(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vmovups 0x40(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovups 0x60(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x4(%0), %%ymm12 \n\t" + "vmovups 0x80(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0xA0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vmovups 0xC0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovups 0xE0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x8(%0), %%ymm12 \n\t" + "vmovups 0x100(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x120(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vmovups 0x140(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovups 0x160(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0xC(%0), %%ymm12 \n\t" + "vmovups 0x180(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x1A0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vmovups 0x1C0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovups 0x1E0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x10(%0), %%ymm12 \n\t" + 
"vmovups 0x200(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x220(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vmovups 0x240(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovups 0x260(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x14(%0), %%ymm12 \n\t" + "vmovups 0x280(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x2A0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vmovups 0x2C0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovups 0x2E0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x18(%0), %%ymm12 \n\t" + "vmovups 0x300(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x320(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vmovups 0x340(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovups 0x360(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x1C(%0), %%ymm12 \n\t" + "vmovups 0x380(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x3A0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vmovups 0x3C0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vmovups 0x3E0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + + "add %9, %0 \n\t" + "add $0x400, %2 \n\t" + "dec %%ecx \n\t" + "jg 3b \n\t" + + "add %6, %0 \n\t" + "sub $1, %%ebx \n\t" + "jg 2b \n\t" + + "add %11, %0 \n\t" + "sub $1, %%eax \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %7 \n\t" + "je 4f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm9, %%ymm9 \n\t" + + // relu6 + "and $0x4, %7 \n\t" + "je 4f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm9, %%ymm9 \n\t" + + "4: \n\t" + "vmovups %%ymm0, (%1) \n\t" + "add %4, %1 \n\t" + "vmovups %%ymm3, (%1) \n\t" + "add %4, %1 \n\t" + "vmovups %%ymm6, (%1) \n\t" + "add %4, %1 \n\t" + "vmovups %%ymm9, (%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(curW), "r"(fw), "r"(I64(oStep)), "r"(fh), "r"(I64(iStep)), + "r"(store), "r"(curB), "r"(I64(dw)), "a"(ic / 8), "r"(I64(fStep)) + : "%ecx", "%ebx", "%ymm0", "%ymm3", "%ymm6", "%ymm9", "%ymm12", "%ymm15", "memory", "cc"); +} + +void avx2_conv_kernel_3x16(F32 *curI, + const F32 *curW, + F32 *curO, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 store, + const F32 *curB, + U32 dw, + F32 *in_1, + F32 *in_2, + U32 ic, + U32 fStep) +{ + __asm__ __volatile__("mov %3, %%ecx \n\t" + "and $0x1, %%ecx \n\t" + "jne 0f \n\t" + "vmovups (%1), %%ymm0 \n\t" + "vmovups (%1), %%ymm1 \n\t" + "vmovups (%1), %%ymm2 \n\t" + "vmovups 0x20(%1), %%ymm3 \n\t" + "vmovups 0x20(%1), %%ymm4 \n\t" + "vmovups 0x20(%1), %%ymm5 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "mov %0, %%rax \n\t" + "vmovups (%%rax), %%ymm0 \n\t" + "vmovups 0x20(%%rax), %%ymm1 \n\t" + "vmovups 0x40(%%rax), %%ymm2 \n\t" + "add %2, %%rax \n\t" + "vmovups (%%rax), %%ymm3 \n\t" + "vmovups 0x20(%%rax), %%ymm4 \n\t" + "vmovups 0x40(%%rax), %%ymm5 \n\t" + + ".align 16 \n\t" + "1: \n\t" + : + : "r"(curO), "r"(curB), "r"(I64(oStep)), "r"(store) + : 
"%ecx", "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "memory", "cc"); + + __asm__ __volatile__(".align 16 \n\t" + "2: \n\t" + + "mov %5, %%ebx \n\t" + ".align 16 \n\t" + "3: \n\t" + + "mov %4, %%ecx \n\t" + ".align 16 \n\t" + "4: \n\t" + + "vbroadcastss (%0), %%ymm12 \n\t" + "vbroadcastss (%1), %%ymm13 \n\t" + "vbroadcastss (%2), %%ymm14 \n\t" + "vmovups 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x20(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "vbroadcastss 0x4(%0), %%ymm12 \n\t" + "vbroadcastss 0x4(%1), %%ymm13 \n\t" + "vbroadcastss 0x4(%2), %%ymm14 \n\t" + "vmovups 0x40(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x60(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "vbroadcastss 0x8(%0), %%ymm12 \n\t" + "vbroadcastss 0x8(%1), %%ymm13 \n\t" + "vbroadcastss 0x8(%2), %%ymm14 \n\t" + "vmovups 0x80(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0xA0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "vbroadcastss 0xC(%0), %%ymm12 \n\t" + "vbroadcastss 0xC(%1), %%ymm13 \n\t" + "vbroadcastss 0xC(%2), %%ymm14 \n\t" + "vmovups 0xC0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0xE0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "vbroadcastss 0x10(%0), %%ymm12 \n\t" + "vbroadcastss 0x10(%1), %%ymm13 \n\t" + "vbroadcastss 0x10(%2), %%ymm14 \n\t" + "vmovups 0x100(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x120(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "vbroadcastss 0x14(%0), %%ymm12 \n\t" + "vbroadcastss 0x14(%1), %%ymm13 \n\t" + "vbroadcastss 0x14(%2), %%ymm14 \n\t" + "vmovups 0x140(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x160(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "vbroadcastss 0x18(%0), %%ymm12 \n\t" + "vbroadcastss 0x18(%1), %%ymm13 \n\t" + "vbroadcastss 0x18(%2), %%ymm14 \n\t" + "vmovups 0x180(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x1A0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, 
%%ymm14, %%ymm5 \n\t" + + "vbroadcastss 0x1C(%0), %%ymm12 \n\t" + "vbroadcastss 0x1C(%1), %%ymm13 \n\t" + "vbroadcastss 0x1C(%2), %%ymm14 \n\t" + "vmovups 0x1C0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x1E0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "add %7, %0 \n\t" + "add %7, %1 \n\t" + "add %7, %2 \n\t" + "add $0x200, %3 \n\t" + "dec %%ecx \n\t" + "jg 4b \n\t" + + "add %6, %0 \n\t" + "add %6, %1 \n\t" + "add %6, %2 \n\t" + "dec %%ebx \n\t" + "jg 3b \n\t" + + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "dec %%eax \n\t" + "jg 2b \n\t" + + : + : "r"(curI), "r"(in_1), "r"(in_2), "r"(curW), "r"(fw), "r"(fh), + "r"(I64(iStep)), "r"(I64(dw)), "a"(ic / 8), "r"(I64(fStep)) + : "%ecx", "%ebx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); + + __asm__ __volatile__( + // relu + "and $0x6, %2 \n\t" + "je 5f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm5, %%ymm5 \n\t" + + // relu6 + "and $0x4, %2 \n\t" + "je 5f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm5, %%ymm5 \n\t" + + "5: \n\t" + "vmovups %%ymm0, (%0) \n\t" + "vmovups %%ymm1, 0x20(%0) \n\t" + "vmovups %%ymm2, 0x40(%0) \n\t" + "add %1, %0 \n\t" + "vmovups %%ymm3, (%0) \n\t" + "vmovups %%ymm4, 0x20(%0) \n\t" + "vmovups %%ymm5, 0x40(%0) \n\t" + : "+r"(curO) + : "r"(I64(oStep)), "r"(store) + : "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", + "%ymm9", "%ymm10", "%ymm11", "memory", "cc"); +} + +void avx2_conv_kernel_2x16(F32 *curI, + const F32 *curW, + F32 *curO, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 store, + const F32 *curB, + U32 dw, + F32 *in_1, + F32 *in_2, + U32 ic, + U32 fStep) +{ + __asm__ __volatile__("mov %7, %%ecx \n\t" + "and $0x1, %%ecx \n\t" + "jne 0f \n\t" + "vmovups (%8), %%ymm0 \n\t" + "vmovups (%8), %%ymm1 \n\t" + "vmovups 0x20(%8), %%ymm3 \n\t" + "vmovups 0x20(%8), %%ymm4 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "mov %1, %8 \n\t" + "vmovups (%8), %%ymm0 \n\t" + "vmovups 0x20(%8), %%ymm1 \n\t" + "add %4, %8 \n\t" + "vmovups (%8), %%ymm3 \n\t" + "vmovups 0x20(%8), %%ymm4 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "mov %5, %%ebx \n\t" + ".align 16 \n\t" + "2: \n\t" + + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "3: \n\t" + + "vbroadcastss (%0), %%ymm12 \n\t" + "vbroadcastss (%10), %%ymm13 \n\t" + "vmovups 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x20(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vbroadcastss 0x4(%0), %%ymm12 \n\t" + "vbroadcastss 0x4(%10), %%ymm13 \n\t" + "vmovups 0x40(%2), %%ymm15 \n\t" + "vfmadd231ps 
%%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x60(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vbroadcastss 0x8(%0), %%ymm12 \n\t" + "vbroadcastss 0x8(%10), %%ymm13 \n\t" + "vmovups 0x80(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0xA0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vbroadcastss 0xC(%0), %%ymm12 \n\t" + "vbroadcastss 0xC(%10), %%ymm13 \n\t" + "vmovups 0xC0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0xE0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vbroadcastss 0x10(%0), %%ymm12 \n\t" + "vbroadcastss 0x10(%10), %%ymm13 \n\t" + "vmovups 0x100(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x120(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vbroadcastss 0x14(%0), %%ymm12 \n\t" + "vbroadcastss 0x14(%10), %%ymm13 \n\t" + "vmovups 0x140(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x160(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vbroadcastss 0x18(%0), %%ymm12 \n\t" + "vbroadcastss 0x18(%10), %%ymm13 \n\t" + "vmovups 0x180(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x1A0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vbroadcastss 0x1C(%0), %%ymm12 \n\t" + "vbroadcastss 0x1C(%10), %%ymm13 \n\t" + "vmovups 0x1C0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x1E0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "add %9, %0 \n\t" + "add %9, %10 \n\t" + "add $0x200, %2 \n\t" + "dec %%ecx \n\t" + "jg 3b \n\t" + + "add %6, %0 \n\t" + "add %6, %10 \n\t" + "dec %%ebx \n\t" + "jg 2b \n\t" + + "add %12, %0 \n\t" + "add %12, %10 \n\t" + "dec %%eax \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %7 \n\t" + "je 4f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + + // relu6 + "and $0x4, %7 \n\t" + "je 4f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + + "4: \n\t" + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + "add %4, %1 \n\t" + "vmovups %%ymm3, (%1) \n\t" + "vmovups %%ymm4, 0x20(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(curW), "r"(fw), "r"(I64(oStep)), "r"(fh), + "r"(I64(iStep)), "r"(store), "r"(curB), "r"(I64(dw)), "r"(in_1), + "a"(ic / 8), "r"(I64(fStep)) + : "%ecx", "%ebx", "%ymm0", "%ymm1", "%ymm3", "%ymm4", "%ymm6", "%ymm7", + "%ymm9", "%ymm10", "%ymm12", "%ymm13", "%ymm15", "memory"); +} + +void 
avx2_conv_kernel_1x16(F32 *curI, + const F32 *curW, + F32 *curO, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 store, + const F32 *curB, + U32 dw, + F32 *in_1, + F32 *in_2, + U32 ic, + U32 fStep) +{ + __asm__ __volatile__( + "mov %7, %%ecx \n\t" + "and $0x1, %%ecx \n\t" + "jne 0f \n\t" + "vmovups (%8), %%ymm0 \n\t" + "vmovups 0x20(%8), %%ymm3 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "mov %1, %8 \n\t" + "vmovups (%8), %%ymm0 \n\t" + "add %4, %8 \n\t" + "vmovups (%8), %%ymm3 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "mov %5, %%ebx \n\t" + ".align 16 \n\t" + "2: \n\t" + + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "3: \n\t" + + "vbroadcastss (%0), %%ymm12 \n\t" + "vmovups 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x20(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "vbroadcastss 0x4(%0), %%ymm12 \n\t" + "vmovups 0x40(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x60(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "vbroadcastss 0x8(%0), %%ymm12 \n\t" + "vmovups 0x80(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0xA0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "vbroadcastss 0xC(%0), %%ymm12 \n\t" + "vmovups 0xC0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0xE0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "vbroadcastss 0x10(%0), %%ymm12 \n\t" + "vmovups 0x100(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x120(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "vbroadcastss 0x14(%0), %%ymm12 \n\t" + "vmovups 0x140(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x160(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "vbroadcastss 0x18(%0), %%ymm12 \n\t" + "vmovups 0x180(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x1A0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "vbroadcastss 0x1C(%0), %%ymm12 \n\t" + "vmovups 0x1C0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x1E0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "add %9, %0 \n\t" + "add $0x200, %2 \n\t" + "dec %%ecx \n\t" + "jg 3b \n\t" + + "add %6, %0 \n\t" + "sub $1, %%ebx \n\t" + "jg 2b \n\t" + + "add %11, %0 \n\t" + "sub $1, %%eax \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %7 \n\t" + "je 4f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + + // relu6 + "and $0x4, %7 \n\t" + "je 4f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + + "4: \n\t" + "vmovups %%ymm0, (%1) \n\t" + "add %4, %1 \n\t" + "vmovups %%ymm3, (%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(curW), "r"(fw), "r"(I64(oStep)), "r"(fh), "r"(I64(iStep)), + "r"(store), "r"(curB), "r"(I64(dw)), "a"(ic / 8), "r"(I64(fStep)) + : "%ecx", "%ebx", "%ymm0", "%ymm3", "%ymm6", "%ymm9", "%ymm12", "%ymm15", "memory", "cc"); +} + +void avx2_conv_kernel_3x8(F32 *curI, + const F32 *curW, + F32 *curO, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 store, + const F32 *curB, + U32 dw, + F32 *in_1, + F32 *in_2, + U32 ic, + U32 fStep) +{ + __asm__ __volatile__("mov %7, %%ecx \n\t" + "and $0x1, %%ecx \n\t" + "jne 0f \n\t" + "vmovups (%8), 
%%ymm0 \n\t" + "vmovups (%8), %%ymm1 \n\t" + "vmovups (%8), %%ymm2 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%1), %%ymm0 \n\t" + "vmovups 0x20(%1), %%ymm1 \n\t" + "vmovups 0x40(%1), %%ymm2 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "mov %5, %%ebx \n\t" + ".align 16 \n\t" + "2: \n\t" + + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "3: \n\t" + + "vbroadcastss (%0), %%ymm12 \n\t" + "vbroadcastss (%10), %%ymm13 \n\t" + "vbroadcastss (%11), %%ymm14 \n\t" + "vmovups 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "vbroadcastss 0x4(%0), %%ymm12 \n\t" + "vbroadcastss 0x4(%10), %%ymm13 \n\t" + "vbroadcastss 0x4(%11), %%ymm14 \n\t" + "vmovups 0x20(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "vbroadcastss 0x8(%0), %%ymm12 \n\t" + "vbroadcastss 0x8(%10), %%ymm13 \n\t" + "vbroadcastss 0x8(%11), %%ymm14 \n\t" + "vmovups 0x40(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "vbroadcastss 0xC(%0), %%ymm12 \n\t" + "vbroadcastss 0xC(%10), %%ymm13 \n\t" + "vbroadcastss 0xC(%11), %%ymm14 \n\t" + "vmovups 0x60(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "vbroadcastss 0x10(%0), %%ymm12 \n\t" + "vbroadcastss 0x10(%10), %%ymm13 \n\t" + "vbroadcastss 0x10(%11), %%ymm14 \n\t" + "vmovups 0x80(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "vbroadcastss 0x14(%0), %%ymm12 \n\t" + "vbroadcastss 0x14(%10), %%ymm13 \n\t" + "vbroadcastss 0x14(%11), %%ymm14 \n\t" + "vmovups 0xA0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "vbroadcastss 0x18(%0), %%ymm12 \n\t" + "vbroadcastss 0x18(%10), %%ymm13 \n\t" + "vbroadcastss 0x18(%11), %%ymm14 \n\t" + "vmovups 0xC0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "vbroadcastss 0x1C(%0), %%ymm12 \n\t" + "vbroadcastss 0x1C(%10), %%ymm13 \n\t" + "vbroadcastss 0x1C(%11), %%ymm14 \n\t" + "vmovups 0xE0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "add %9, %0 \n\t" + "add %9, %10 \n\t" + "add %9, %11 \n\t" + "add $0x100, %2 \n\t" + "dec %%ecx \n\t" + "jg 3b \n\t" + + "add %6, %0 \n\t" + "add %6, %10 \n\t" + "add %6, %11 \n\t" + "dec %%ebx \n\t" + "jg 2b \n\t" + + "add %12, %0 \n\t" + "add %12, %10 \n\t" + "add %12, %11 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %7 \n\t" + "je 4f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + + // relu6 + "and $0x4, %7 \n\t" + "je 4f \n\t" + "mov $0x40C00000, %%eax \n\t" + "vmovd %%eax, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + + "4: \n\t" + "vmovups 
%%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + "vmovups %%ymm2, 0x40(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(curW), "r"(fw), "a"(ic / 8), "r"(fh), + "r"(I64(iStep)), "r"(store), "r"(curB), "r"(I64(dw)), "r"(in_1), "r"(in_2), + "r"(I64(fStep)) + : "%ecx", "%ebx", "%ymm0", "%ymm1", "%ymm2", "%ymm12", "%ymm13", "%ymm14", + "%ymm15", "memory", "cc"); +} + +void avx2_conv_kernel_2x8(F32 *curI, + const F32 *curW, + F32 *curO, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 store, + const F32 *curB, + U32 dw, + F32 *in_1, + F32 *in_2, + U32 ic, + U32 fStep) +{ + __asm__ __volatile__("mov %7, %%ecx \n\t" + "and $0x1, %%ecx \n\t" + "jne 0f \n\t" + "vmovups (%8), %%ymm0 \n\t" + "vmovups (%8), %%ymm1 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%1), %%ymm0 \n\t" + "vmovups 0x20(%1), %%ymm1 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + ".align 16 \n\t" + "2: \n\t" + + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "3: \n\t" + + "vbroadcastss (%0), %%ymm12 \n\t" + "vbroadcastss (%10), %%ymm13 \n\t" + "vmovups 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vbroadcastss 0x4(%0), %%ymm12 \n\t" + "vbroadcastss 0x4(%10), %%ymm13 \n\t" + "vmovups 0x20(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vbroadcastss 0x8(%0), %%ymm12 \n\t" + "vbroadcastss 0x8(%10), %%ymm13 \n\t" + "vmovups 0x40(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vbroadcastss 0xC(%0), %%ymm12 \n\t" + "vbroadcastss 0xC(%10), %%ymm13 \n\t" + "vmovups 0x60(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vbroadcastss 0x10(%0), %%ymm12 \n\t" + "vbroadcastss 0x10(%10), %%ymm13 \n\t" + "vmovups 0x80(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vbroadcastss 0x14(%0), %%ymm12 \n\t" + "vbroadcastss 0x14(%10), %%ymm13 \n\t" + "vmovups 0xA0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vbroadcastss 0x18(%0), %%ymm12 \n\t" + "vbroadcastss 0x18(%10), %%ymm13 \n\t" + "vmovups 0xC0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vbroadcastss 0x1C(%0), %%ymm12 \n\t" + "vbroadcastss 0x1C(%10), %%ymm13 \n\t" + "vmovups 0xE0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "add %9, %0 \n\t" + "add %9, %10 \n\t" + "add $0x100, %2 \n\t" + "dec %%ecx \n\t" + "jg 3b \n\t" + + "add %6, %0 \n\t" + "add %6, %10 \n\t" + "dec %%ebx \n\t" + "jg 2b \n\t" + + "add %11, %0 \n\t" + "add %11, %10 \n\t" + "dec %%eax \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %7 \n\t" + "je 4f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + + // relu6 + "and $0x4, %7 \n\t" + "je 4f \n\t" + "mov $0x40C00000, %%eax \n\t" + "vmovd %%eax, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + + "4: \n\t" + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(curW), "r"(fw), "a"(ic / 8), "b"(fh), + "r"(I64(iStep)), "r"(store), "r"(curB), "r"(I64(dw)), "r"(in_1), + "r"(I64(fStep)) + : "%ecx", "%ymm0", "%ymm1", 
"%ymm12", "%ymm13", "%ymm15", "memory", "cc"); +} + +void avx2_conv_kernel_1x8(F32 *curI, + const F32 *curW, + F32 *curO, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 store, + const F32 *curB, + U32 dw, + F32 *in_1, + F32 *in_2, + U32 ic, + U32 fStep) +{ + __asm__ __volatile__("mov %7, %%ecx \n\t" + "and $0x1, %%ecx \n\t" + "jne 0f \n\t" + "vmovups (%8), %%ymm0 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%1), %%ymm0 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + ".align 16 \n\t" + "2: \n\t" + + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "3: \n\t" + + "vbroadcastss (%0), %%ymm12 \n\t" + "vmovups 0x0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vbroadcastss 0x4(%0), %%ymm12 \n\t" + "vmovups 0x20(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vbroadcastss 0x8(%0), %%ymm12 \n\t" + "vmovups 0x40(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vbroadcastss 0xC(%0), %%ymm12 \n\t" + "vmovups 0x60(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vbroadcastss 0x10(%0), %%ymm12 \n\t" + "vmovups 0x80(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vbroadcastss 0x14(%0), %%ymm12 \n\t" + "vmovups 0xA0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vbroadcastss 0x18(%0), %%ymm12 \n\t" + "vmovups 0xC0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vbroadcastss 0x1C(%0), %%ymm12 \n\t" + "vmovups 0xE0(%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "add %9, %0 \n\t" + "add $0x100, %2 \n\t" + "dec %%ecx \n\t" + "jg 3b \n\t" + + "add %6, %0 \n\t" + "dec %%ebx \n\t" + "jg 2b \n\t" + + "add %11, %0 \n\t" + "dec %%eax \n\t" + "jg 1b \n\t" + + // relu + "and $0x6, %7 \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + + // relu6 + "and $0x4, %7 \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%eax \n\t" + "vmovd %%eax, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + + "3: \n\t" + "vmovups %%ymm0, (%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(curW), "r"(fw), "r"(I64(oStep)), "b"(fh), + "r"(I64(iStep)), "r"(store), "r"(curB), "r"(I64(dw)), "a"(ic / 8), + "r"(I64(fStep)) + : "%ecx", "%ymm0", "%ymm12", "%ymm15", "memory", "cc"); +} + +EE convolution_2x2_direct(TensorDesc inputDesc, + F32 *inArray, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + + if ((fdf != DF_NCHWCxN32) || (idf != DF_NCHWC8) || (ic % 8 != 0)) { + CHECK_STATUS(NOT_MATCH); + } + + F32 *curI, *curO, *calI, *calO; + const F32 *curW, *curB, 
*calW; + F32 *ftmp = (F32 *)align_addr(tmp, 32); + filterArray = (F32 *)align_addr(filterArray, 32); + + U32 icAlignSize = 8; + U32 icPadding = (ic + icAlignSize - 1) / icAlignSize * icAlignSize; + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + + U32 oStep = oh * ow * UNROLL_OC_DIM * 4; + U32 iStep = (iw_pad - fw * dilateW + (dilateH - 1) * iw_pad) * UNROLL_IC_BLOCK_DIM * 4; + U32 fStep = ((ih_pad - fh) * iw_pad) * UNROLL_IC_BLOCK_DIM * 4; + U32 sw = strideW * UNROLL_IC_BLOCK_DIM * 4; + U32 dw = dilateW * UNROLL_IC_BLOCK_DIM * 4; + U32 wSize = 0, store = 0, ocSize = 0, icSize = 0, hwSize = 0; + I32 ih_idx = 0; + kernel_func kernel[3][3] = {{avx2_conv_kernel_1x8, avx2_conv_kernel_2x8, avx2_conv_kernel_3x8}, + {avx2_conv_kernel_1x16, avx2_conv_kernel_2x16, avx2_conv_kernel_3x16}, + {avx2_conv_kernel_1x32, avx2_conv_kernel_2x32, avx2_conv_kernel_3x32}}; + U32 ocblocks[3] = {8, 16, 32}; + + I32 ohow = oh * ow; + + for (U32 n = 0; n < in; ++n) { + if (idf == DF_NCHWC8 && paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { + ftmp = inArray; + } else { + PaddingNCHWC8(inArray, ftmp, inputDesc, convParamSpec); + } + store = 0; + for (U32 icbb = 0; icbb < icPadding; icbb += icSize) { + icSize = UNI_MIN(BLOCK_IC_DIM, icPadding - icbb); + store |= (icbb > 0); + if (icbb == icPadding - icSize) { + store |= U32(activationDesc.mode) << 1; + } + for (I32 hw = 0; hw < ohow; hw += hwSize) { + hwSize = UNI_MIN(ohow - hw, BLOCK_HW_DIM); + for (U32 ocb = 0; ocb < oc; ocb += ocSize) { + curB = biasArray + ocb; + ocSize = UNI_MIN(BLOCK_OC_DIM, oc - ocb); + ocSize = ocblocks[ocSize >> 4]; + calW = filterArray + ocb * icPadding * fh * fw + ocSize * icbb * fh * fw; + curI = ftmp + icbb * ih_pad * iw_pad; + + for (I32 ihw = hw; ihw < hw + hwSize; ihw += wSize) { + wSize = UNI_MIN(hw + hwSize - ihw, UNROLL_W); + U32 in_h_0 = ihw / ow * strideH; + U32 in_w_0 = ihw % ow * strideW; + U32 in_h_1 = (ihw + 1) / ow * strideH; + U32 in_w_1 = (ihw + 1) % ow * strideW; + U32 in_h_2 = (ihw + 2) / ow * strideH; + U32 in_w_2 = (ihw + 2) % ow * strideW; + F32 *out_ptr = outArray + (n * oc + ocb) * ohow + ihw * 8; + F32 *in_0 = curI + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F32 *in_1 = curI + in_h_1 * iw_pad * 8 + in_w_1 * 8; + F32 *in_2 = curI + in_h_2 * iw_pad * 8 + in_w_2 * 8; + + kernel[ocSize >> 4][wSize - 1](in_0, calW, out_ptr, fw, fh, oStep, iStep, + store, curB, dw, in_1, in_2, icSize, fStep); + } + } + } + } + inArray += ic * ih * iw; + outArray += oc * oh * ow; + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/convolution_direct.cpp b/compute/tensor/src/cpu/x86/fp32/convolution_direct.cpp new file mode 100644 index 00000000..ef2e949a --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/convolution_direct.cpp @@ -0,0 +1,720 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" + +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#include "cpu/x86/fp32/transform_functions_fp32.h" + +#define UNROLL_W 3 +#define UNROLL_OC_DIM 8 +#define BLOCK_OC_DIM 32 +#define BLOCK_IC_DIM 32 +#define BLOCK_HW_DIM 128 +#define UNROLL_IC_BLOCK_DIM 8 +#define align_addr(addr, unit) (((uintptr_t)addr + unit - 1) / unit * unit) + +// clang-format off +#define kernel4x3(m0, r0, r1, r2, r3, m1, m2, m3, m4) \ + "vbroadcastss "#m0"("#r0"), %%ymm12 \n\t" \ + "vbroadcastss "#m0"("#r1"), %%ymm13 \n\t" \ + "vbroadcastss "#m0"("#r2"), %%ymm14 \n\t" \ + "vmovups "#m1"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" \ + "vmovups "#m2"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" \ + "vmovups "#m3"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" \ + "vmovups "#m4"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + +#define kernel4x2(m0, r0, r1, r2, r3, m1, m2, m3, m4) \ + "vbroadcastss "#m0"("#r0"), %%ymm12 \n\t" \ + "vbroadcastss "#m0"("#r1"), %%ymm13 \n\t" \ + "vmovups "#m1"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" \ + "vmovups "#m2"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" \ + "vmovups "#m3"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" \ + "vmovups "#m4"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + +#define kernel4x1(m0, r0, r1, r2, r3, m1, m2, m3, m4) \ + "vbroadcastss "#m0"("#r0"), %%ymm12 \n\t" \ + "vmovups "#m1"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" \ + "vmovups "#m2"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" \ + "vmovups "#m3"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" \ + "vmovups "#m4"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" \ + +#define kernel2x3(m0, r0, r1, r2, r3, m1, m2, m3, m4) \ + "vbroadcastss "#m0"("#r0"), %%ymm12 \n\t" \ + "vbroadcastss "#m0"("#r1"), %%ymm13 \n\t" \ + "vbroadcastss "#m0"("#r2"), %%ymm14 \n\t" \ + "vmovups "#m1"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" \ + "vmovups "#m2"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" 
+
+#define kernel2x2(m0, r0, r1, r2, r3, m1, m2, m3, m4) \
+    "vbroadcastss "#m0"("#r0"), %%ymm12 \n\t" \
+    "vbroadcastss "#m0"("#r1"), %%ymm13 \n\t" \
+    "vmovups "#m1"("#r3"), %%ymm15 \n\t" \
+    "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" \
+    "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" \
+    "vmovups "#m2"("#r3"), %%ymm15 \n\t" \
+    "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" \
+    "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t"
+
+#define kernel2x1(m0, r0, r1, r2, r3, m1, m2, m3, m4) \
+    "vbroadcastss "#m0"("#r0"), %%ymm12 \n\t" \
+    "vmovups "#m1"("#r3"), %%ymm15 \n\t" \
+    "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" \
+    "vmovups "#m2"("#r3"), %%ymm15 \n\t" \
+    "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t"
+
+#define kernel1x3(m0, r0, r1, r2, r3, m1, m2, m3, m4) \
+    "vbroadcastss "#m0"("#r0"), %%ymm12 \n\t" \
+    "vbroadcastss "#m0"("#r1"), %%ymm13 \n\t" \
+    "vbroadcastss "#m0"("#r2"), %%ymm14 \n\t" \
+    "vmovups "#m1"("#r3"), %%ymm15 \n\t" \
+    "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" \
+    "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" \
+    "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t"
+
+#define kernel1x2(m0, r0, r1, r2, r3, m1, m2, m3, m4) \
+    "vbroadcastss "#m0"("#r0"), %%ymm12 \n\t" \
+    "vbroadcastss "#m0"("#r1"), %%ymm13 \n\t" \
+    "vmovups "#m1"("#r3"), %%ymm15 \n\t" \
+    "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" \
+    "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t"
+
+#define kernel1x1(m0, r0, r1, r2, r3, m1, m2, m3, m4) \
+    "vbroadcastss "#m0"("#r0"), %%ymm12 \n\t" \
+    "vmovups "#m1"("#r3"), %%ymm15 \n\t" \
+    "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t"
+
+#define kernel4c8(r, r0, r1, r2, r3) \
+    kernel4x##r(0x0, r0, r1, r2, r3, 0x0, 0x20, 0x40, 0x60) \
+    kernel4x##r(0x4, r0, r1, r2, r3, 0x80, 0xA0, 0xC0, 0xE0) \
+    kernel4x##r(0x8, r0, r1, r2, r3, 0x100, 0x120, 0x140, 0x160) \
+    kernel4x##r(0xC, r0, r1, r2, r3, 0x180, 0x1A0, 0x1C0, 0x1E0) \
+    kernel4x##r(0x10, r0, r1, r2, r3, 0x200, 0x220, 0x240, 0x260) \
+    kernel4x##r(0x14, r0, r1, r2, r3, 0x280, 0x2A0, 0x2C0, 0x2E0) \
+    kernel4x##r(0x18, r0, r1, r2, r3, 0x300, 0x320, 0x340, 0x360) \
+    kernel4x##r(0x1C, r0, r1, r2, r3, 0x380, 0x3A0, 0x3C0, 0x3E0)
+
+// kernel2c8/kernel1c8 must expand the matching kernel2x/kernel1x tap macros:
+// expanding kernel4x here would also write %%ymm6-%%ymm11, which the 16- and
+// 8-channel kernels neither store nor declare in their clobber lists.
+#define kernel2c8(r, r0, r1, r2, r3) \
+    kernel2x##r(0x0, r0, r1, r2, r3, 0x0, 0x20, 0, 0) \
+    kernel2x##r(0x4, r0, r1, r2, r3, 0x40, 0x60, 0, 0) \
+    kernel2x##r(0x8, r0, r1, r2, r3, 0x80, 0xA0, 0, 0) \
+    kernel2x##r(0xC, r0, r1, r2, r3, 0xC0, 0xE0, 0, 0) \
+    kernel2x##r(0x10, r0, r1, r2, r3, 0x100, 0x120, 0, 0) \
+    kernel2x##r(0x14, r0, r1, r2, r3, 0x140, 0x160, 0, 0) \
+    kernel2x##r(0x18, r0, r1, r2, r3, 0x180, 0x1A0, 0, 0) \
+    kernel2x##r(0x1C, r0, r1, r2, r3, 0x1C0, 0x1E0, 0, 0)
+
+#define kernel1c8(r, r0, r1, r2, r3) \
+    kernel1x##r(0x0, r0, r1, r2, r3, 0x0, 0, 0, 0) \
+    kernel1x##r(0x4, r0, r1, r2, r3, 0x20, 0, 0, 0) \
+    kernel1x##r(0x8, r0, r1, r2, r3, 0x40, 0, 0, 0) \
+    kernel1x##r(0xC, r0, r1, r2, r3, 0x60, 0, 0, 0) \
+    kernel1x##r(0x10, r0, r1, r2, r3, 0x80, 0, 0, 0) \
+    kernel1x##r(0x14, r0, r1, r2, r3, 0xA0, 0, 0, 0) \
+    kernel1x##r(0x18, r0, r1, r2, r3, 0xC0, 0, 0, 0) \
+    kernel1x##r(0x1C, r0, r1, r2, r3, 0xE0, 0, 0, 0)
+
+typedef void (*kernel_func)(F32 *curI, const F32 *curW, F32 *curO, U32 fw, U32 fh, U32 oStep, U32 iStep, U32 store, const F32 *curB, U32 dw, F32 *in_1, F32 *in_2);
+
+void avx2_conv_kernel_3x32c8(F32 *curI, const F32 *curW, F32 *curO, U32 fw, U32 fh, U32 oStep, U32 iStep, U32 store, const F32 *curB, U32 dw, F32 *in_1, F32 *in_2) {
+    __asm__ __volatile__("mov %7, %%ecx \n\t"
+                         "and $0x1, %%ecx \n\t"
+                         "jne 0f \n\t"
+                         "vmovups (%8), %%ymm0 \n\t"
+                         "vmovups (%8), %%ymm1
\n\t" + "vmovups (%8), %%ymm2 \n\t" + "vmovups 0x20(%8), %%ymm3 \n\t" + "vmovups 0x20(%8), %%ymm4 \n\t" + "vmovups 0x20(%8), %%ymm5 \n\t" + "vmovups 0x40(%8), %%ymm6 \n\t" + "vmovups 0x40(%8), %%ymm7 \n\t" + "vmovups 0x40(%8), %%ymm8 \n\t" + "vmovups 0x60(%8), %%ymm9 \n\t" + "vmovups 0x60(%8), %%ymm10 \n\t" + "vmovups 0x60(%8), %%ymm11 \n\t" + "jmp 1f \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %1, %%r9 \n\t" + "vmovups (%%r9), %%ymm0 \n\t" + "vmovups 0x20(%%r9), %%ymm1 \n\t" + "vmovups 0x40(%%r9), %%ymm2 \n\t" + "add %4, %%r9 \n\t" + "vmovups (%%r9), %%ymm3 \n\t" + "vmovups 0x20(%%r9), %%ymm4 \n\t" + "vmovups 0x40(%%r9), %%ymm5 \n\t" + "add %4, %%r9 \n\t" + "vmovups (%%r9), %%ymm6 \n\t" + "vmovups 0x20(%%r9), %%ymm7 \n\t" + "vmovups 0x40(%%r9), %%ymm8 \n\t" + "add %4, %%r9 \n\t" + "vmovups (%%r9), %%ymm9 \n\t" + "vmovups 0x20(%%r9), %%ymm10 \n\t" + "vmovups 0x40(%%r9), %%ymm11 \n\t" + ".align 16 \n\t" + "1: \n\t" + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "2: \n\t" + kernel4c8(3, %0, %10, %11, %2) + "add %9, %0 \n\t" + "add %9, %10 \n\t" + "add %9, %11 \n\t" + "add $0x400, %2 \n\t" + "dec %%ecx \n\t" + "jg 2b \n\t" + "add %6, %0 \n\t" + "add %6, %10 \n\t" + "add %6, %11 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + // relu + "and $0x6, %7 \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm5, %%ymm5 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm7, %%ymm7 \n\t" + "vmaxps %%ymm15, %%ymm8, %%ymm8 \n\t" + "vmaxps %%ymm15, %%ymm9, %%ymm9 \n\t" + "vmaxps %%ymm15, %%ymm10, %%ymm10 \n\t" + "vmaxps %%ymm15, %%ymm11, %%ymm11 \n\t" + // relu6 + "and $0x4, %7 \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm5, %%ymm5 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm7, %%ymm7 \n\t" + "vminps %%ymm12, %%ymm8, %%ymm8 \n\t" + "vminps %%ymm12, %%ymm9, %%ymm9 \n\t" + "vminps %%ymm12, %%ymm10, %%ymm10 \n\t" + "vminps %%ymm12, %%ymm11, %%ymm11 \n\t" + "3: \n\t" + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + "vmovups %%ymm2, 0x40(%1) \n\t" + "add %4, %1 \n\t" + "vmovups %%ymm3, (%1) \n\t" + "vmovups %%ymm4, 0x20(%1) \n\t" + "vmovups %%ymm5, 0x40(%1) \n\t" + "add %4, %1 \n\t" + "vmovups %%ymm6, (%1) \n\t" + "vmovups %%ymm7, 0x20(%1) \n\t" + "vmovups %%ymm8, 0x40(%1) \n\t" + "add %4, %1 \n\t" + "vmovups %%ymm9, (%1) \n\t" + "vmovups %%ymm10, 0x20(%1) \n\t" + "vmovups %%ymm11, 0x40(%1) \n\t" + : + : "r" (curI), "r" (curO), "r" (curW), "r" (fw), + "r" (I64(oStep)), "b" (fh), "r" (I64(iStep)), "r" (store), + "r" (curB), "r" (I64(dw)), "r" (in_1), "r" (in_2) + : "%ecx", "%r9", + "%ymm0", "%ymm1", "%ymm2", "%ymm3", + "%ymm4", "%ymm5", "%ymm6", "%ymm7", + "%ymm8", "%ymm9", "%ymm10", "%ymm11", + "%ymm12", "%ymm13", "%ymm14", "%ymm15", + "memory", "cc"); +} + +void avx2_conv_kernel_1x32c8(F32 *curI, const F32 *curW, F32 *curO, U32 fw, U32 fh, U32 oStep, U32 iStep, U32 store, const F32 *curB, U32 dw, F32 *in_1, F32 *in_2) { + __asm__ __volatile__("mov %7, %%ecx \n\t" + "and $0x1, %%ecx \n\t" + "jne 0f \n\t" + "vmovups (%8), %%ymm0 
\n\t" + "vmovups 0x20(%8), %%ymm3 \n\t" + "vmovups 0x40(%8), %%ymm6 \n\t" + "vmovups 0x60(%8), %%ymm9 \n\t" + "jmp 1f \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %1, %%r9 \n\t" + "vmovups (%%r9), %%ymm0 \n\t" + "add %4, %%r9 \n\t" + "vmovups (%%r9), %%ymm3 \n\t" + "add %4, %%r9 \n\t" + "vmovups (%%r9), %%ymm6 \n\t" + "add %4, %%r9 \n\t" + "vmovups (%%r9), %%ymm9 \n\t" + ".align 16 \n\t" + "1: \n\t" + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "2: \n\t" + kernel4c8(1, %0, 0, 0, %2) + "add %9, %0 \n\t" + "add $0x400, %2 \n\t" + "dec %%ecx \n\t" + "jg 2b \n\t" + "add %6, %0 \n\t" + "sub $1, %%ebx \n\t" + "jg 1b \n\t" + // relu + "and $0x6, %7 \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm9, %%ymm9 \n\t" + // relu6 + "and $0x4, %7 \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm9, %%ymm9 \n\t" + "3: \n\t" + "vmovups %%ymm0, (%1) \n\t" + "add %4, %1 \n\t" + "vmovups %%ymm3, (%1) \n\t" + "add %4, %1 \n\t" + "vmovups %%ymm6, (%1) \n\t" + "add %4, %1 \n\t" + "vmovups %%ymm9, (%1) \n\t" + : + : "r" (curI), "r" (curO), "r" (curW), "r" (fw), + "r" (I64(oStep)), "b" (fh), "r" (I64(iStep)), "r" (store), + "r" (curB), "r" (I64(dw)) + : "%ecx", "%r9", + "%ymm0", "%ymm3", "%ymm6", "%ymm9", "%ymm12", "%ymm15", + "memory", "cc"); +} + +void avx2_conv_kernel_3x16c8(F32 *curI, const F32 *curW, F32 *curO, U32 fw, U32 fh, U32 oStep, U32 iStep, U32 store, const F32 *curB, U32 dw, F32 *in_1, F32 *in_2) { + __asm__ __volatile__("mov %7, %%ecx \n\t" + "and $0x1, %%ecx \n\t" + "jne 0f \n\t" + "vmovups (%8), %%ymm0 \n\t" + "vmovups (%8), %%ymm1 \n\t" + "vmovups (%8), %%ymm2 \n\t" + "vmovups 0x20(%8), %%ymm3 \n\t" + "vmovups 0x20(%8), %%ymm4 \n\t" + "vmovups 0x20(%8), %%ymm5 \n\t" + "jmp 1f \n\t" + ".align 16 \n\t" + "0: \n\t" + "vmovups (%1), %%ymm0 \n\t" + "vmovups 0x20(%1), %%ymm1 \n\t" + "vmovups 0x40(%1), %%ymm2 \n\t" + "vmovups (%1, %4), %%ymm3 \n\t" + "vmovups 0x20(%1, %4), %%ymm4 \n\t" + "vmovups 0x40(%1, %4), %%ymm5 \n\t" + ".align 16 \n\t" + "1: \n\t" + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "2: \n\t" + kernel2c8(3, %0, %10, %11, %2) + "add %9, %0 \n\t" + "add %9, %10 \n\t" + "add %9, %11 \n\t" + "add $0x200, %2 \n\t" + "dec %%ecx \n\t" + "jg 2b \n\t" + "add %6, %0 \n\t" + "add %6, %10 \n\t" + "add %6, %11 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + // relu + "and $0x6, %7 \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm5, %%ymm5 \n\t" + // relu6 + "and $0x4, %7 \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm5, %%ymm5 \n\t" + "3: \n\t" + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + "vmovups %%ymm2, 0x40(%1) \n\t" + "vmovups %%ymm3, (%1, %4) \n\t" + "vmovups %%ymm4, 0x20(%1, %4) 
\n\t" + "vmovups %%ymm5, 0x40(%1, %4) \n\t" + : + : "r" (curI), "r" (curO), "r" (curW), "r" (fw), + "r" (I64(oStep)), "b" (fh), "r" (I64(iStep)), "r" (store), + "r" (curB), "r" (I64(dw)), "r" (in_1), "r" (in_2) + : "%ecx", + "%ymm0", "%ymm1", "%ymm2", "%ymm3", + "%ymm4", "%ymm5", + "%ymm12", "%ymm13", "%ymm14", "%ymm15", + "memory", "cc"); +} + +void avx2_conv_kernel_1x16c8(F32 *curI, const F32 *curW, F32 *curO, U32 fw, U32 fh, U32 oStep, U32 iStep, U32 store, const F32 *curB, U32 dw, F32 *in_1, F32 *in_2) { + __asm__ __volatile__("mov %7, %%ecx \n\t" + "and $0x1, %%ecx \n\t" + "jne 0f \n\t" + "vmovups (%8), %%ymm0 \n\t" + "vmovups 0x20(%8), %%ymm3 \n\t" + "jmp 1f \n\t" + ".align 16 \n\t" + "0: \n\t" + "vmovups (%1), %%ymm0 \n\t" + "vmovups (%1, %4), %%ymm3 \n\t" + ".align 16 \n\t" + "1: \n\t" + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "2: \n\t" + kernel2c8(1, %0, 0, 0, %2) + "add %9, %0 \n\t" + "add $0x200, %2 \n\t" + "dec %%ecx \n\t" + "jg 2b \n\t" + "add %6, %0 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + // relu + "and $0x6, %7 \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + // relu6 + "and $0x4, %7 \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "3: \n\t" + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm3, (%1, %4) \n\t" + : + : "r" (curI), "r" (curO), "r" (curW), "r" (fw), + "r" (I64(oStep)), "b" (fh), "r" (I64(iStep)), "r" (store), + "r" (curB), "r" (I64(dw)) + : "%ecx", + "%ymm0", "%ymm3", "%ymm12", "%ymm15", + "memory", "cc"); +} + +void avx2_conv_kernel_3x8c8(F32 *curI, const F32 *curW, F32 *curO, U32 fw, U32 fh, U32 oStep, U32 iStep, U32 store, const F32 *curB, U32 dw, F32 *in_1, F32 *in_2) { + __asm__ __volatile__("mov %7, %%ecx \n\t" + "and $0x1, %%ecx \n\t" + "jne 0f \n\t" + "vmovups (%8), %%ymm0 \n\t" + "vmovups (%8), %%ymm1 \n\t" + "vmovups (%8), %%ymm2 \n\t" + "jmp 1f \n\t" + ".align 16 \n\t" + "0: \n\t" + "vmovups (%1), %%ymm0 \n\t" + "vmovups 0x20(%1), %%ymm1 \n\t" + "vmovups 0x40(%1), %%ymm2 \n\t" + ".align 16 \n\t" + "1: \n\t" + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "2: \n\t" + kernel1c8(3, %0, %10, %11, %2) + "add %9, %0 \n\t" + "add %9, %10 \n\t" + "add %9, %11 \n\t" + "add $0x100, %2 \n\t" + "dec %%ecx \n\t" + "jg 2b \n\t" + "add %6, %0 \n\t" + "add %6, %10 \n\t" + "add %6, %11 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + // relu + "and $0x6, %7 \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + // relu6 + "and $0x4, %7 \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%eax \n\t" + "vmovd %%eax, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "3: \n\t" + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + "vmovups %%ymm2, 0x40(%1) \n\t" + : + : "r" (curI), "r" (curO), "r" (curW), "r" (fw), + "r" (I64(oStep)), "b" (fh), "r" (I64(iStep)), "r" (store), + "r" (curB), "r" (I64(dw)), "r" (in_1), "r" (in_2) + : "%ecx", + "%ymm0", "%ymm1", "%ymm2", + "%ymm12", "%ymm13", "%ymm14", "%ymm15", + "memory", "cc"); +} + +void avx2_conv_kernel_1x8c8(F32 *curI, const F32 *curW, F32 *curO, U32 fw, U32 fh, U32 oStep, U32 iStep, U32 store, const F32 *curB, U32 dw, F32 
*in_1, F32 *in_2) { + __asm__ __volatile__("mov %7, %%ecx \n\t" + "and $0x1, %%ecx \n\t" + "jne 0f \n\t" + "vmovups (%8), %%ymm0 \n\t" + "jmp 1f \n\t" + ".align 16 \n\t" + "0: \n\t" + "vmovups (%1), %%ymm0 \n\t" + ".align 16 \n\t" + "1: \n\t" + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "2: \n\t" + kernel1c8(1, %0, 0, 0, %2) + "add %9, %0 \n\t" + "add $0x100, %2 \n\t" + "dec %%ecx \n\t" + "jg 2b \n\t" + "add %6, %0 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + // relu + "and $0x6, %7 \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + // relu6 + "and $0x4, %7 \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%eax \n\t" + "vmovd %%eax, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "3: \n\t" + "vmovups %%ymm0, (%1) \n\t" + : + : "r" (curI), "r" (curO), "r" (curW), "r" (fw), + "r" (I64(oStep)), "b" (fh), "r" (I64(iStep)), "r" (store), + "r" (curB), "r" (I64(dw)) + : "%ecx", + "%ymm0", "%ymm12", "%ymm15", + "memory", "cc"); +} + +EE convolution_direct(TensorDesc inputDesc, + F32 *inArray, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if ((2 == fh) && (2 == fw)) { + return convolution_2x2_direct(inputDesc, inArray, filterDesc, filterArray, convParamSpec, + biasDesc, biasArray, tmpBytes, tmp, outputDesc, outArray, activationDesc); + } + + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + + if ((fdf != DF_NCHWCxN32) || (idf != DF_NCHWC8) || (ic % 8 != 0)) { + CHECK_STATUS(NOT_MATCH); + } + + F32 *ftmp = (F32 *)align_addr(tmp, 32); + filterArray = (F32 *)align_addr(filterArray, 32); + + U32 icAlignSize = 8; + U32 icPadding = (ic + icAlignSize - 1) / icAlignSize * icAlignSize; + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + I32 ohow = oh * ow; + + U32 oStep = oh * ow * UNROLL_OC_DIM * 4; + U32 iStep = (iw_pad - fw * dilateW + (dilateH - 1) * iw_pad) * UNROLL_IC_BLOCK_DIM * 4; + U32 sw = strideW * UNROLL_IC_BLOCK_DIM * 4; + U32 dw = dilateW * UNROLL_IC_BLOCK_DIM * 4; + kernel_func kernel[3][2] = {{avx2_conv_kernel_1x8c8, avx2_conv_kernel_3x8c8}, + {avx2_conv_kernel_1x16c8, avx2_conv_kernel_3x16c8}, + {avx2_conv_kernel_1x32c8, avx2_conv_kernel_3x32c8}}; + U32 ocblocks[3] = {8, 16, 32}; + +#ifdef _USE_OPENMP + U32 alpha = (ohow + OMP_NUM_THREADS * BLOCK_HW_DIM - 1) / (OMP_NUM_THREADS * BLOCK_HW_DIM); + U32 block_hw_dim = (ohow + OMP_NUM_THREADS * alpha - 1 ) / (OMP_NUM_THREADS * alpha); +#else + U32 block_hw_dim = BLOCK_HW_DIM; +#endif + + U32 icSize = 0; + U32 hwBlockNums = (ohow + block_hw_dim - 1 ) / block_hw_dim; + U32 ocBlockNums = oc / BLOCK_OC_DIM; + U32 ocbArray[4] = {0}; + U32 oc_remain = oc % 
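+    /* oc is covered by full BLOCK_OC_DIM(32)-channel blocks, and the tail
+     * oc % 32 (a multiple of 8) by extra 16- and/or 8-channel blocks whose
+     * starting offsets within the tail accumulate in ocbArray. For example,
+     * oc = 56 gives one full block plus tail blocks of 16 and 8 channels,
+     * with ocbArray = {0, 16, 24}. */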
BLOCK_OC_DIM;
+    for (U32 i = 0, j = 0; i < oc_remain; i += icSize, ++j) {
+        icSize = ocblocks[(oc_remain - i) >> 4];
+        ocbArray[j + 1] = icSize + ocbArray[j];
+        ++ocBlockNums;
+    }
+    U32 hwocBlockNums = hwBlockNums * ocBlockNums;
+
+    for (U32 n = 0; n < in; ++n) {
+        if ((paddingT == 0) && (paddingB == 0) && (paddingL == 0) && (paddingR == 0)) {
+            ftmp = inArray;
+        } else {
+            PaddingNCHWC8(inArray, ftmp, inputDesc, convParamSpec);
+        }
+#ifdef _USE_OPENMP
+#pragma omp parallel num_threads(OMP_NUM_THREADS)
+    {
+#endif
+        U32 private_icSize = icSize;
+        for (U32 icbb = 0; icbb < ic; icbb += private_icSize) {
+            private_icSize = UNI_MIN(BLOCK_IC_DIM, ic - icbb);
+#ifdef _USE_OPENMP
+#pragma omp for
+#endif
+            for (I32 bIdx = 0; bIdx < (I32)hwocBlockNums; ++bIdx) {
+                U32 hw = (bIdx / ocBlockNums) * block_hw_dim;
+                U32 hwSize = UNI_MIN(block_hw_dim, ohow - hw);
+                U32 ocIdx = bIdx % ocBlockNums;
+                U32 ocb = ocIdx * BLOCK_OC_DIM;
+                if (ocIdx >= oc / BLOCK_OC_DIM) {
+                    // tail blocks start where the full 32-channel blocks end,
+                    // offset by the accumulated widths of the preceding tail blocks
+                    ocb = oc / BLOCK_OC_DIM * BLOCK_OC_DIM + ocbArray[ocIdx - oc / BLOCK_OC_DIM];
+                }
+                U32 ocSize = UNI_MIN(BLOCK_OC_DIM, oc - ocb);
+                ocSize = ocblocks[ocSize >> 4];
+                const F32 *curB = biasArray + ocb;
+                U32 store = 0, icbSize = 0;
+                for (U32 icb = icbb; icb < icbb + private_icSize; icb += icbSize) {
+                    icbSize = UNI_MIN(icbb + private_icSize - icb, UNROLL_IC_BLOCK_DIM);
+                    const F32 *calW = filterArray + ocb * icPadding * fh * fw + ocSize * icb * fh * fw;
+                    F32 *curI = ftmp + icb * ih_pad * iw_pad;
+
+                    store |= (icb > 0);
+                    if (icb == ic - icbSize) {
+                        store |= U32(activationDesc.mode) << 1;
+                    }
+                    U32 wSize = 0;
+                    for (I32 ihw = hw; ihw < hw + hwSize; ihw += wSize) {
+                        wSize = UNI_MIN(hw + hwSize - ihw, UNROLL_W);
+                        if (wSize < 3) {
+                            wSize = 1;
+                        }
+                        U32 in_h_0 = ihw / ow * strideH;
+                        U32 in_w_0 = ihw % ow * strideW;
+                        U32 in_h_1 = (ihw + 1) / ow * strideH;
+                        U32 in_w_1 = (ihw + 1) % ow * strideW;
+                        U32 in_h_2 = (ihw + 2) / ow * strideH;
+                        U32 in_w_2 = (ihw + 2) % ow * strideW;
+                        F32 *out_ptr = outArray + (n * oc + ocb) * ohow + ihw * 8;
+                        F32 *in_0 = curI + in_h_0 * iw_pad * 8 + in_w_0 * 8;
+                        F32 *in_1 = curI + in_h_1 * iw_pad * 8 + in_w_1 * 8;
+                        F32 *in_2 = curI + in_h_2 * iw_pad * 8 + in_w_2 * 8;
+
+                        kernel[ocSize >> 4][wSize >> 1](in_0, calW, out_ptr, fw, fh, oStep, iStep,
+                            store, curB, dw, in_1, in_2);
+                    }
+                }
+            }
+        }
+#ifdef _USE_OPENMP
+    }
+#endif
+        inArray += ic * ih * iw;
+        outArray += oc * oh * ow;
+    }
+    return SUCCESS;
+}
diff --git a/compute/tensor/src/cpu/x86/fp32/convolution_direct_nchw.cpp b/compute/tensor/src/cpu/x86/fp32/convolution_direct_nchw.cpp
new file mode 100644
index 00000000..7e0009ae
--- /dev/null
+++ b/compute/tensor/src/cpu/x86/fp32/convolution_direct_nchw.cpp
@@ -0,0 +1,1861 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" + +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#include "cpu/x86/fp32/transform_functions_fp32.h" + +#define UNROLL_W 4 +#define UNROLL_OC_DIM 8 +#define BLOCK_OC_DIM 24 +#define BLOCK_IC_DIM 32 +#define BLOCK_HW_DIM 768 +#define UNROLL_IC_BLOCK_DIM 8 +#define align_addr(addr, unit) (((uintptr_t)addr + unit - 1) / unit * unit) + +#define kernel4x3(m0, r0, r1, r2, r3, m1, m2, m3, m4) \ + "vbroadcastss "#m0"("#r0"), %%ymm12 \n\t" \ + "vbroadcastss "#m0"("#r1"), %%ymm13 \n\t" \ + "vbroadcastss "#m0"("#r2"), %%ymm14 \n\t" \ + "vmovups "#m1"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" \ + "vmovups "#m2"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" \ + "vmovups "#m3"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" \ + "vmovups "#m4"("#r3"), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + +typedef void (*kernel_func)(F32 *in_0, F32 *in_1, F32 *in_2, F32 *in_3, const F32 *curW, F32 *curO, const F32 *curB, + U32 fw, U32 fh, U32 oStep, U32 hStep, U32 store, U32 dw, U32 ic, U32 iStep, U32 fwStep, U32 fhStep); + +void avx2_conv_kernel_3x32(F32 *in_0, F32 *in_1, F32 *in_2, F32 *in_3, const F32 *curW, F32 *curO, const F32 *curB, + U32 fw, U32 fh, U32 oStep, U32 hStep, U32 store, U32 dw, U32 ic, U32 iStep, U32 fwStep, U32 fhStep) { + __asm__ __volatile__("mov %3, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%1), %%ymm0 \n\t" + "vmovups (%1), %%ymm1 \n\t" + "vmovups (%1), %%ymm2 \n\t" + "vmovups 0x20(%1), %%ymm3 \n\t" + "vmovups 0x20(%1), %%ymm4 \n\t" + "vmovups 0x20(%1), %%ymm5 \n\t" + "vmovups 0x40(%1), %%ymm6 \n\t" + "vmovups 0x40(%1), %%ymm7 \n\t" + "vmovups 0x40(%1), %%ymm8 \n\t" + "vmovups 0x60(%1), %%ymm9 \n\t" + "vmovups 0x60(%1), %%ymm10 \n\t" + "vmovups 0x60(%1), %%ymm11 \n\t" + "jmp 1f \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %0, %%rax \n\t" + "vmovups (%%rax), %%ymm0 \n\t" + "vmovups 0x20(%%rax), %%ymm1 \n\t" + "vmovups 0x40(%%rax), %%ymm2 \n\t" + "add %2, %%rax \n\t" + "vmovups (%%rax), %%ymm3 \n\t" + "vmovups 0x20(%%rax), %%ymm4 \n\t" + "vmovups 0x40(%%rax), %%ymm5 \n\t" + "add %2, %%rax \n\t" + "vmovups (%%rax), %%ymm6 \n\t" + "vmovups 0x20(%%rax), %%ymm7 \n\t" + "vmovups 0x40(%%rax), %%ymm8 \n\t" + "add %2, %%rax \n\t" + "vmovups (%%rax), %%ymm9 \n\t" + "vmovups 0x20(%%rax), %%ymm10 \n\t" + "vmovups 0x40(%%rax), %%ymm11 \n\t" + ".align 16 \n\t" + "1: \n\t" + : + : "r" (curO), "r" (curB), "r" (I64(oStep)), "r" (store) + : "%eax", "%rax", + "%ymm0", "%ymm1", "%ymm2", "%ymm3", + "%ymm4", "%ymm5", "%ymm6", "%ymm7", + "%ymm8", "%ymm9", "%ymm10", "%ymm11", + "memory", "cc"); + + if ((fw == 7) && (fh > 0)) { + __asm__ __volatile__(".align 16 \n\t" + "0: \n\t" + "mov %4, %%ebx \n\t" + ".align 16 \n\t" + "1: \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x0, 0x20, 0x40, 0x60) + "add %8, %0 
\n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x80, 0xA0, 0xC0, 0xE0) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x100, 0x120, 0x140, 0x160) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x180, 0x1A0, 0x1C0, 0x1E0) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x200, 0x220, 0x240, 0x260) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x280, 0x2A0, 0x2C0, 0x2E0) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x300, 0x320, 0x340, 0x360) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + "add $0x380, %3 \n\t" + "add %7, %0 \n\t" + "add %7, %1 \n\t" + "add %7, %2 \n\t" + "add %10, %3 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %11, %3 \n\t" + "dec %%eax \n\t" + "jg 0b \n\t" + : + : "r" (in_0), "r" (in_1), "r" (in_2), + "r" (curW), "r" (fh), "r" (fw), "a" (ic), + "r" (I64(hStep)), "r" (I64(dw)), "r" (I64(iStep)), "r" (I64(fwStep)), "r" (I64(fhStep)) + : "%ecx", "%ebx", + "%ymm0", "%ymm1", "%ymm2", "%ymm3", + "%ymm4", "%ymm5", "%ymm6", "%ymm7", + "%ymm8", "%ymm9", "%ymm10", "%ymm11", + "%ymm12", "%ymm13", "%ymm14", "%ymm15", + "memory", "cc"); + } else if ((fw == 5) && (fh > 0)) { + __asm__ __volatile__(".align 16 \n\t" + "0: \n\t" + "mov %4, %%ebx \n\t" + ".align 16 \n\t" + "1: \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x0, 0x20, 0x40, 0x60) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x80, 0xA0, 0xC0, 0xE0) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x100, 0x120, 0x140, 0x160) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x180, 0x1A0, 0x1C0, 0x1E0) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x200, 0x220, 0x240, 0x260) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + "add $0x280, %3 \n\t" + "add %7, %0 \n\t" + "add %7, %1 \n\t" + "add %7, %2 \n\t" + "add %10, %3 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %11, %3 \n\t" + "dec %%eax \n\t" + "jg 0b \n\t" + : + : "r" (in_0), "r" (in_1), "r" (in_2), + "r" (curW), "r" (fh), "r" (fw), "a" (ic), + "r" (I64(hStep)), "r" (I64(dw)), "r" (I64(iStep)), "r" (I64(fwStep)), "r" (I64(fhStep)) + : "%ecx", "%ebx", + "%ymm0", "%ymm1", "%ymm2", "%ymm3", + "%ymm4", "%ymm5", "%ymm6", "%ymm7", + "%ymm8", "%ymm9", "%ymm10", "%ymm11", + "%ymm12", "%ymm13", "%ymm14", "%ymm15", + "memory", "cc"); + } else if ((fw == 3) && (fh == 3)) { + __asm__ __volatile__("add %8, %7 \n\t" + ".align 16 \n\t" + "0: \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x0, 0x20, 0x40, 0x60) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x80, 0xA0, 0xC0, 0xE0) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x100, 0x120, 0x140, 0x160) + "add %7, %0 \n\t" + "add %7, %1 \n\t" + "add %7, %2 \n\t" + "add %10, %3 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x180, 0x1A0, 0x1C0, 0x1E0) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x200, 0x220, 0x240, 0x260) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x280, 0x2A0, 0x2C0, 0x2E0) 
+ "add %7, %0 \n\t" + "add %7, %1 \n\t" + "add %7, %2 \n\t" + "add %10, %3 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x300, 0x320, 0x340, 0x360) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x380, 0x3A0, 0x3C0, 0x3E0) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x400, 0x420, 0x440, 0x460) + "add %7, %0 \n\t" + "add %7, %1 \n\t" + "add %7, %2 \n\t" + "add %10, %3 \n\t" + "add $0x480, %3 \n\t" + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %11, %3 \n\t" + "dec %%eax \n\t" + "jg 0b \n\t" + : + : "r" (in_0), "r" (in_1), "r" (in_2), + "r" (curW), "r" (fh), "r" (fw), "a" (ic), + "r" (I64(hStep)), "r" (I64(dw)), "r" (I64(iStep)), "r" (I64(fwStep)), "r" (I64(fhStep)) + : "%ecx", + "%ymm0", "%ymm1", "%ymm2", "%ymm3", + "%ymm4", "%ymm5", "%ymm6", "%ymm7", + "%ymm8", "%ymm9", "%ymm10", "%ymm11", + "%ymm12", "%ymm13", "%ymm14", "%ymm15", + "memory", "cc"); + } else if ((fh > 0) && (fw > 0)) { + __asm__ __volatile__(".align 16 \n\t" + "0: \n\t" + "mov %4, %%ebx \n\t" + ".align 16 \n\t" + "1: \n\t" + "mov %5, %%ecx \n\t" + ".align 16 \n\t" + "2: \n\t" + kernel4x3(0x0, %0, %1, %2, %3, 0x0, 0x20, 0x40, 0x60) + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + "add $0x80, %3 \n\t" + "dec %%ecx \n\t" + "jg 2b \n\t" + "add %7, %0 \n\t" + "add %7, %1 \n\t" + "add %7, %2 \n\t" + "add %10, %3 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %11, %3 \n\t" + "dec %%eax \n\t" + "jg 0b \n\t" + : + : "r" (in_0), "r" (in_1), "r" (in_2), + "r" (curW), "r" (fh), "r" (fw), "a" (ic), + "r" (I64(hStep)), "r" (I64(dw)), "r" (I64(iStep)), "r" (I64(fwStep)), "r" (I64(fhStep)) + : "%ecx", "%ebx", + "%ymm0", "%ymm1", "%ymm2", "%ymm3", + "%ymm4", "%ymm5", "%ymm6", "%ymm7", + "%ymm8", "%ymm9", "%ymm10", "%ymm11", + "%ymm12", "%ymm13", "%ymm14", "%ymm15", + "memory", "cc"); + } + + __asm__ __volatile__( + // relu + "and $0x6, %2 \n\t" + "je 0f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm5, %%ymm5 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm7, %%ymm7 \n\t" + "vmaxps %%ymm15, %%ymm8, %%ymm8 \n\t" + "vmaxps %%ymm15, %%ymm9, %%ymm9 \n\t" + "vmaxps %%ymm15, %%ymm10, %%ymm10 \n\t" + "vmaxps %%ymm15, %%ymm11, %%ymm11 \n\t" + + // relu6 + "and $0x4, %2 \n\t" + "je 0f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm5, %%ymm5 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm7, %%ymm7 \n\t" + "vminps %%ymm12, %%ymm8, %%ymm8 \n\t" + "vminps %%ymm12, %%ymm9, %%ymm9 \n\t" + "vminps %%ymm12, %%ymm10, %%ymm10 \n\t" + "vminps %%ymm12, %%ymm11, %%ymm11 \n\t" + + "0: \n\t" + "vmovups %%ymm0, (%0) \n\t" + "vmovups %%ymm1, 0x20(%0) \n\t" + "vmovups %%ymm2, 0x40(%0) \n\t" + "add %1, %0 \n\t" + "vmovups %%ymm3, (%0) \n\t" + "vmovups %%ymm4, 0x20(%0) \n\t" + "vmovups %%ymm5, 0x40(%0) \n\t" + "add %1, %0 \n\t" + "vmovups %%ymm6, (%0) \n\t" + "vmovups %%ymm7, 0x20(%0) \n\t" + "vmovups %%ymm8, 0x40(%0) \n\t" + "add %1, %0 
\n\t" + "vmovups %%ymm9, (%0) \n\t" + "vmovups %%ymm10, 0x20(%0) \n\t" + "vmovups %%ymm11, 0x40(%0) \n\t" + : + : "r"(curO), "r"(I64(oStep)), "r"(store) + : "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", + "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm15", "memory", "cc"); +} + +void avx2_conv_kernel_1x32(F32 *in_0, + F32 *in_1, + F32 *in_2, + F32 *in_3, + const F32 *curW, + F32 *curO, + const F32 *curB, + U32 fw, + U32 fh, + U32 oStep, + U32 hStep, + U32 store, + U32 dw, + U32 ic, + U32 iStep, + U32 fwStep, + U32 fhStep) +{ + __asm__ __volatile__("mov %3, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%1), %%ymm0 \n\t" + "vmovups 0x20(%1), %%ymm3 \n\t" + "vmovups 0x40(%1), %%ymm6 \n\t" + "vmovups 0x60(%1), %%ymm9 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "mov %0, %%rax \n\t" + "vmovups (%%rax), %%ymm0 \n\t" + "add %2, %%rax \n\t" + "vmovups (%%rax), %%ymm3 \n\t" + "add %2, %%rax \n\t" + "vmovups (%%rax), %%ymm6 \n\t" + "add %2, %%rax \n\t" + "vmovups (%%rax), %%ymm9 \n\t" + + ".align 16 \n\t" + "1: \n\t" + : + : "r"(curO), "r"(curB), "r"(I64(oStep)), "r"(store) + : "%eax", "%rax", "%ymm0", "%ymm3", "%ymm6", "%ymm9", "memory", "cc"); + + if ((fh == 3) && (fw == 3)) { + __asm__ __volatile__("add %8, %7 \n\t" + ".align 16 \n\t" + "0: \n\t" + "vbroadcastss (%0), %%ymm12 \n\t" + "vmovaps (%3), %%ymm11 \n\t" + "vmovaps 0x20(%3), %%ymm13 \n\t" + "vmovaps 0x40(%3), %%ymm14 \n\t" + "vmovaps 0x60(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm11, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "add %8, %0 \n\t" + "vbroadcastss (%0), %%ymm12 \n\t" + "vmovaps 0x80(%3), %%ymm11 \n\t" + "vmovaps 0xA0(%3), %%ymm13 \n\t" + "vmovaps 0xC0(%3), %%ymm14 \n\t" + "vmovaps 0xE0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm11, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "add %8, %0 \n\t" + "vbroadcastss (%0), %%ymm12 \n\t" + "vmovaps 0x100(%3), %%ymm11 \n\t" + "vmovaps 0x120(%3), %%ymm13 \n\t" + "vmovaps 0x140(%3), %%ymm14 \n\t" + "vmovaps 0x160(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm11, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "add %7, %0 \n\t" + "add %10, %3 \n\t" + "vbroadcastss (%0), %%ymm12 \n\t" + "vmovaps 0x180(%3), %%ymm11 \n\t" + "vmovaps 0x1A0(%3), %%ymm13 \n\t" + "vmovaps 0x1C0(%3), %%ymm14 \n\t" + "vmovaps 0x1E0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm11, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "add %8, %0 \n\t" + "vbroadcastss (%0), %%ymm12 \n\t" + "vmovaps 0x200(%3), %%ymm11 \n\t" + "vmovaps 0x220(%3), %%ymm13 \n\t" + "vmovaps 0x240(%3), %%ymm14 \n\t" + "vmovaps 0x260(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm11, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "add %8, %0 \n\t" + "vbroadcastss (%0), %%ymm12 \n\t" + "vmovaps 0x280(%3), %%ymm11 \n\t" + "vmovaps 0x2A0(%3), %%ymm13 \n\t" + "vmovaps 0x2C0(%3), %%ymm14 \n\t" + "vmovaps 0x2E0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm11, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps 
%%ymm14, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "add %7, %0 \n\t" + "add %10, %3 \n\t" + "vbroadcastss (%0), %%ymm12 \n\t" + "vmovaps 0x300(%3), %%ymm11 \n\t" + "vmovaps 0x320(%3), %%ymm13 \n\t" + "vmovaps 0x340(%3), %%ymm14 \n\t" + "vmovaps 0x360(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm11, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "add %8, %0 \n\t" + "vbroadcastss (%0), %%ymm12 \n\t" + "vmovaps 0x380(%3), %%ymm11 \n\t" + "vmovaps 0x3A0(%3), %%ymm13 \n\t" + "vmovaps 0x3C0(%3), %%ymm14 \n\t" + "vmovaps 0x3E0(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm11, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "add %8, %0 \n\t" + "vbroadcastss (%0), %%ymm12 \n\t" + "vmovaps 0x400(%3), %%ymm11 \n\t" + "vmovaps 0x420(%3), %%ymm13 \n\t" + "vmovaps 0x440(%3), %%ymm14 \n\t" + "vmovaps 0x460(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm11, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "add %7, %0 \n\t" + "add %10, %3 \n\t" + "add $0x480, %3 \n\t" + "add %9, %0 \n\t" + "add %11, %3 \n\t" + "dec %%eax \n\t" + "jg 0b \n\t" + : + : "r" (in_0), "r" (in_1), "r" (in_2), + "r" (curW), "r" (fh), "r" (fw), "a" (ic), + "r" (I64(hStep)), "r" (I64(dw)), "r" (I64(iStep)), "r" (I64(fwStep)), "r" (I64(fhStep)) + : + "%ymm0", "%ymm3", "%ymm6", "%ymm9", "%ymm11", + "%ymm12", "%ymm13", "%ymm14", "%ymm15", + "memory", "cc"); + } else if ((fh > 0) && (fw > 0)) { + __asm__ __volatile__(".align 16 \n\t" + "0: \n\t" + "mov %4, %%ebx \n\t" + ".align 16 \n\t" + "1: \n\t" + "mov %5, %%ecx \n\t" + ".align 16 \n\t" + "2: \n\t" + "vbroadcastss (%0), %%ymm12 \n\t" + "vmovaps (%3), %%ymm11 \n\t" + "vmovaps 0x20(%3), %%ymm13 \n\t" + "vmovaps 0x40(%3), %%ymm14 \n\t" + "vmovaps 0x60(%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm11, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "add %8, %0 \n\t" + "add $0x80, %3 \n\t" + "dec %%ecx \n\t" + "jg 2b \n\t" + "add %7, %0 \n\t" + "add %10, %3 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + "add %9, %0 \n\t" + "add %11, %3 \n\t" + "dec %%eax \n\t" + "jg 0b \n\t" + : + : "r" (in_0), "r" (in_1), "r" (in_2), + "r" (curW), "r" (fh), "r" (fw), "a" (ic), + "r" (I64(hStep)), "r" (I64(dw)), "r" (I64(iStep)), "r" (I64(fwStep)), "r" (I64(fhStep)) + : "%ecx", "%ebx", + "%ymm0", "%ymm3", "%ymm6", "%ymm9", "%ymm11", + "%ymm12", "%ymm13", "%ymm14", "%ymm15", + "memory", "cc"); + } + + __asm__ __volatile__( + // relu + "and $0x6, %2 \n\t" + "je 0f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm9, %%ymm9 \n\t" + + // relu6 + "and $0x4, %2 \n\t" + "je 0f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm9, %%ymm9 \n\t" + + "0: \n\t" + "vmovups %%ymm0, (%0) \n\t" + "add %1, %0 \n\t" + "vmovups %%ymm3, (%0) \n\t" + "add %1, %0 \n\t" + "vmovups %%ymm6, (%0) \n\t" + "add %1, %0 \n\t" + "vmovups %%ymm9, (%0) 
\n\t" + : + : "r"(curO), "r"(I64(oStep)), "r"(store) + : "%ecx", "%ymm0", "%ymm3", "%ymm6", "%ymm9", "%ymm12", "%ymm15", "memory", "cc"); +} + +void avx2_conv_kernel_4x24(F32 *in_0, + F32 *in_1, + F32 *in_2, + F32 *in_3, + const F32 *curW, + F32 *curO, + const F32 *curB, + U32 fw, + U32 fh, + U32 oStep, + U32 hStep, + U32 store, + U32 dw, + U32 ic, + U32 iStep, + U32 fwStep, + U32 fhStep) +{ + __asm__ __volatile__("mov %3, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%1), %%ymm0 \n\t" + "vmovups (%1), %%ymm1 \n\t" + "vmovups (%1), %%ymm2 \n\t" + "vmovups (%1), %%ymm3 \n\t" + "vmovups 0x20(%1), %%ymm4 \n\t" + "vmovups 0x20(%1), %%ymm5 \n\t" + "vmovups 0x20(%1), %%ymm6 \n\t" + "vmovups 0x20(%1), %%ymm7 \n\t" + "vmovups 0x40(%1), %%ymm8 \n\t" + "vmovups 0x40(%1), %%ymm9 \n\t" + "vmovups 0x40(%1), %%ymm10 \n\t" + "vmovups 0x40(%1), %%ymm11 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%0), %%ymm0 \n\t" + "vmovups 0x20(%0), %%ymm1 \n\t" + "vmovups 0x40(%0), %%ymm2 \n\t" + "vmovups 0x60(%0), %%ymm3 \n\t" + "vmovups (%0, %2), %%ymm4 \n\t" + "vmovups 0x20(%0, %2), %%ymm5 \n\t" + "vmovups 0x40(%0, %2), %%ymm6 \n\t" + "vmovups 0x60(%0, %2), %%ymm7 \n\t" + "vmovups (%0, %2, 2), %%ymm8 \n\t" + "vmovups 0x20(%0, %2, 2), %%ymm9 \n\t" + "vmovups 0x40(%0, %2, 2), %%ymm10 \n\t" + "vmovups 0x60(%0, %2, 2), %%ymm11 \n\t" + + ".align 16 \n\t" + "1: \n\t" + : + : "r"(curO), "r"(curB), "r"(I64(oStep)), "r"(store) + : "%eax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", + "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "memory", "cc"); + + if ((fw == 3) && (fh == 3)) { + __asm__ __volatile__("mov %7, %%eax \n\t" + ".align 16 \n\t" + "0: \n\t" + + "vmovaps (%4), %%ymm12 \n\t" + "vmovaps 0x20(%4), %%ymm13 \n\t" + "vmovaps 0x40(%4), %%ymm14 \n\t" + "vbroadcastss (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss (%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss (%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %9, %3 \n\t" + + "vmovaps 0x60(%4), %%ymm12 \n\t" + "vmovaps 0x80(%4), %%ymm13 \n\t" + "vmovaps 0xA0(%4), %%ymm14 \n\t" + "vbroadcastss (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss (%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss (%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %9, %3 \n\t" + + "vmovaps 0xC0(%4), %%ymm12 \n\t" + "vmovaps 0xE0(%4), %%ymm13 \n\t" + 
"vmovaps 0x100(%4), %%ymm14 \n\t" + "vbroadcastss (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss (%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss (%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %9, %3 \n\t" + + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + "add %8, %3 \n\t" + "add %11, %4 \n\t" + + "vmovaps 0x120(%4), %%ymm12 \n\t" + "vmovaps 0x140(%4), %%ymm13 \n\t" + "vmovaps 0x160(%4), %%ymm14 \n\t" + "vbroadcastss (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss (%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss (%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %9, %3 \n\t" + + "vmovaps 0x180(%4), %%ymm12 \n\t" + "vmovaps 0x1A0(%4), %%ymm13 \n\t" + "vmovaps 0x1C0(%4), %%ymm14 \n\t" + "vbroadcastss (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss (%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss (%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %9, %3 \n\t" + + "vmovaps 0x1E0(%4), %%ymm12 \n\t" + "vmovaps 0x200(%4), %%ymm13 \n\t" + "vmovaps 0x220(%4), %%ymm14 \n\t" + "vbroadcastss (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss (%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss (%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add 
%9, %2 \n\t" + "add %9, %3 \n\t" + + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + "add %8, %3 \n\t" + "add %11, %4 \n\t" + + "vmovaps 0x240(%4), %%ymm12 \n\t" + "vmovaps 0x260(%4), %%ymm13 \n\t" + "vmovaps 0x280(%4), %%ymm14 \n\t" + "vbroadcastss (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss (%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss (%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %9, %3 \n\t" + + "vmovaps 0x2A0(%4), %%ymm12 \n\t" + "vmovaps 0x2C0(%4), %%ymm13 \n\t" + "vmovaps 0x2E0(%4), %%ymm14 \n\t" + "vbroadcastss (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss (%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss (%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %9, %3 \n\t" + + "vmovaps 0x300(%4), %%ymm12 \n\t" + "vmovaps 0x320(%4), %%ymm13 \n\t" + "vmovaps 0x340(%4), %%ymm14 \n\t" + "vbroadcastss (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss (%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss (%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %9, %3 \n\t" + + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + "add %8, %3 \n\t" + + "add $0x360, %4 \n\t" + "add %10, %0 \n\t" + "add %10, %1 \n\t" + "add %10, %2 \n\t" + "add %10, %3 \n\t" + "add %12, %4 \n\t" + "dec %%eax \n\t" + "jg 0b \n\t" + : + : "r"(in_0), "r"(in_1), "r"(in_2), "r"(in_3), "r"(curW), "r"(fh), + "r"(fw), "r"(ic), "r"(I64(hStep)), "r"(I64(dw)), "r"(I64(iStep)), + "r"(I64(fwStep)), "r"(I64(fhStep)) + : "%ecx", "%ebx", "%eax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", + "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", + "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory", "cc"); + + } else if ((fh > 0) && (fw > 0)) { + __asm__ __volatile__(".align 16 \n\t" + "0: \n\t" + + "mov %5, %%ebx \n\t" + ".align 
16 \n\t" + "1: \n\t" + + "mov %6, %%ecx \n\t" + ".align 16 \n\t" + "2: \n\t" + + "vmovaps (%4), %%ymm12 \n\t" + "vmovaps 0x20(%4), %%ymm13 \n\t" + "vmovaps 0x40(%4), %%ymm14 \n\t" + "vbroadcastss (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vbroadcastss (%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm9 \n\t" + "vbroadcastss (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm10 \n\t" + "vbroadcastss (%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %9, %3 \n\t" + "add $0x60, %4 \n\t" + "dec %%ecx \n\t" + "jg 2b \n\t" + + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + "add %8, %3 \n\t" + "add %11, %4 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + + "add %10, %0 \n\t" + "add %10, %1 \n\t" + "add %10, %2 \n\t" + "add %10, %3 \n\t" + "add %12, %4 \n\t" + "dec %%eax \n\t" + "jg 0b \n\t" + : + : "r"(in_0), "r"(in_1), "r"(in_2), "r"(in_3), "r"(curW), "r"(fh), + "r"(fw), "a"(ic), "r"(I64(hStep)), "r"(I64(dw)), "r"(I64(iStep)), + "r"(I64(fwStep)), "r"(I64(fhStep)) + : "%ecx", "%ebx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", + "%ymm13", "%ymm14", "%ymm15", "memory", "cc"); + } + + __asm__ __volatile__( + // relu + "and $0x6, %2 \n\t" + "je 0f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm5, %%ymm5 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm7, %%ymm7 \n\t" + "vmaxps %%ymm15, %%ymm8, %%ymm8 \n\t" + "vmaxps %%ymm15, %%ymm9, %%ymm9 \n\t" + "vmaxps %%ymm15, %%ymm10, %%ymm10 \n\t" + "vmaxps %%ymm15, %%ymm11, %%ymm11 \n\t" + + // relu6 + "and $0x4, %2 \n\t" + "je 0f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm5, %%ymm5 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm7, %%ymm7 \n\t" + "vminps %%ymm12, %%ymm8, %%ymm8 \n\t" + "vminps %%ymm12, %%ymm9, %%ymm9 \n\t" + "vminps %%ymm12, %%ymm10, %%ymm10 \n\t" + "vminps %%ymm12, %%ymm11, %%ymm11 \n\t" + + "0: \n\t" + "vmovups %%ymm0, (%0) \n\t" + "vmovups %%ymm1, 0x20(%0) \n\t" + "vmovups %%ymm2, 0x40(%0) \n\t" + "vmovups %%ymm3, 0x60(%0) \n\t" + "vmovups %%ymm4, (%0, %1) \n\t" + "vmovups %%ymm5, 0x20(%0, %1) \n\t" + "vmovups %%ymm6, 0x40(%0, %1) \n\t" + "vmovups %%ymm7, 0x60(%0, %1) \n\t" + "vmovups %%ymm8, (%0, %1, 2) \n\t" + "vmovups %%ymm9, 0x20(%0, %1, 2) \n\t" + "vmovups %%ymm10, 0x40(%0, %1, 2) \n\t" + "vmovups %%ymm11, 0x60(%0, %1, 2) \n\t" + : + : "r"(curO), "r"(I64(oStep)), "r"(store) + : "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", + "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm15", 
"memory", "cc"); +} + +void avx2_conv_kernel_1x24(F32 *in_0, + F32 *in_1, + F32 *in_2, + F32 *in_3, + const F32 *curW, + F32 *curO, + const F32 *curB, + U32 fw, + U32 fh, + U32 oStep, + U32 hStep, + U32 store, + U32 dw, + U32 ic, + U32 iStep, + U32 fwStep, + U32 fhStep) +{ + __asm__ __volatile__("mov %3, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%1), %%ymm0 \n\t" + "vmovups 0x20(%1), %%ymm4 \n\t" + "vmovups 0x40(%1), %%ymm8 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%0), %%ymm0 \n\t" + "vmovups (%0, %2), %%ymm4 \n\t" + "vmovups (%0, %2, 2), %%ymm8 \n\t" + + ".align 16 \n\t" + "1: \n\t" + : + : "r"(curO), "r"(curB), "r"(I64(oStep)), "r"(store) + : "%eax", "%ymm0", "%ymm4", "%ymm8", "memory", "cc"); + + if ((fh > 0) && (fw > 0)) { + __asm__ __volatile__(".align 16 \n\t" + "0: \n\t" + + "mov %2, %%ebx \n\t" + ".align 16 \n\t" + "1: \n\t" + + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "2: \n\t" + + "vmovaps (%1), %%ymm12 \n\t" + "vmovaps 0x20(%1), %%ymm13 \n\t" + "vmovaps 0x40(%1), %%ymm14 \n\t" + "vbroadcastss (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + + "add %6, %0 \n\t" + "add $0x60, %1 \n\t" + "dec %%ecx \n\t" + "jg 2b \n\t" + + "add %5, %0 \n\t" + "add %8, %1 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + + "add %7, %0 \n\t" + "add %9, %1 \n\t" + "dec %%eax \n\t" + "jg 0b \n\t" + : + : "r"(in_0), "r"(curW), "r"(fh), "r"(fw), "a"(ic), "r"(I64(hStep)), + "r"(I64(dw)), "r"(I64(iStep)), "r"(I64(fwStep)), "r"(I64(fhStep)) + : "%ecx", "%ebx", "%ymm0", "%ymm4", "%ymm8", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); + } + + __asm__ __volatile__( + // relu + "and $0x6, %2 \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm8, %%ymm8 \n\t" + + // relu6 + "and $0x4, %2 \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm8, %%ymm8 \n\t" + + "3: \n\t" + "vmovups %%ymm0, (%0) \n\t" + "vmovups %%ymm4, (%0, %1) \n\t" + "vmovups %%ymm8, (%0, %1, 2) \n\t" + : + : "r"(curO), "r"(I64(oStep)), "r"(store) + : "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", + "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm15", "memory", "cc"); +} + +void avx2_conv_kernel_4x16(F32 *in_0, + F32 *in_1, + F32 *in_2, + F32 *in_3, + const F32 *curW, + F32 *curO, + const F32 *curB, + U32 fw, + U32 fh, + U32 oStep, + U32 hStep, + U32 store, + U32 dw, + U32 ic, + U32 iStep, + U32 fwStep, + U32 fhStep) +{ + __asm__ __volatile__("mov %3, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%1), %%ymm0 \n\t" + "vmovups (%1), %%ymm1 \n\t" + "vmovups (%1), %%ymm2 \n\t" + "vmovups (%1), %%ymm3 \n\t" + "vmovups 0x20(%1), %%ymm4 \n\t" + "vmovups 0x20(%1), %%ymm5 \n\t" + "vmovups 0x20(%1), %%ymm6 \n\t" + "vmovups 0x20(%1), %%ymm7 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%0), %%ymm0 \n\t" + "vmovups 0x20(%0), %%ymm1 \n\t" + "vmovups 0x40(%0), %%ymm2 \n\t" + "vmovups 0x60(%0), %%ymm3 \n\t" + "vmovups (%0, %2), %%ymm4 \n\t" + "vmovups 0x20(%0, %2), %%ymm5 \n\t" + "vmovups 0x40(%0, %2), %%ymm6 \n\t" + "vmovups 0x60(%0, %2), %%ymm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + : + : "r"(curO), "r"(curB), "r"(I64(oStep)), 
"r"(store) + : "%eax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", + "%ymm7", "memory", "cc"); + + if ((fh > 0) && (fw > 0)) { + __asm__ __volatile__(".align 16 \n\t" + "0: \n\t" + + "mov %5, %%ebx \n\t" + ".align 16 \n\t" + "1: \n\t" + + "mov %6, %%ecx \n\t" + ".align 16 \n\t" + "2: \n\t" + + "vmovaps (%4), %%ymm12 \n\t" + "vmovaps 0x20(%4), %%ymm13 \n\t" + "vbroadcastss (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vbroadcastss (%1), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm5 \n\t" + "vbroadcastss (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm6 \n\t" + "vbroadcastss (%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %9, %3 \n\t" + "add $0x40, %4 \n\t" + "dec %%ecx \n\t" + "jg 2b \n\t" + + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + "add %8, %3 \n\t" + "add %11, %4 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + + "add %10, %0 \n\t" + "add %10, %1 \n\t" + "add %10, %2 \n\t" + "add %10, %3 \n\t" + "add %12, %4 \n\t" + "dec %%eax \n\t" + "jg 0b \n\t" + : + : "r"(in_0), "r"(in_1), "r"(in_2), "r"(in_3), "r"(curW), "r"(fh), + "r"(fw), "a"(ic), "r"(I64(hStep)), "r"(I64(dw)), "r"(I64(iStep)), + "r"(I64(fwStep)), "r"(I64(fhStep)) + : "%ecx", "%ebx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm12", "%ymm13", "%ymm15", "memory", "cc"); + } + + __asm__ __volatile__( + // relu + "and $0x6, %2 \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm5, %%ymm5 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm7, %%ymm7 \n\t" + + // relu6 + "and $0x4, %2 \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm5, %%ymm5 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm7, %%ymm7 \n\t" + + "3: \n\t" + "vmovups %%ymm0, (%0) \n\t" + "vmovups %%ymm1, 0x20(%0) \n\t" + "vmovups %%ymm2, 0x40(%0) \n\t" + "vmovups %%ymm3, 0x60(%0) \n\t" + "vmovups %%ymm4, (%0, %1) \n\t" + "vmovups %%ymm5, 0x20(%0, %1) \n\t" + "vmovups %%ymm6, 0x40(%0, %1) \n\t" + "vmovups %%ymm7, 0x60(%0, %1) \n\t" + : + : "r"(curO), "r"(I64(oStep)), "r"(store) + : "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm12", + "%ymm15", "memory", "cc"); +} + +void avx2_conv_kernel_1x16(F32 *in_0, + F32 *in_1, + F32 *in_2, + F32 *in_3, + const F32 *curW, + F32 *curO, + const F32 *curB, + U32 fw, + U32 fh, + U32 oStep, + U32 hStep, + U32 store, + U32 dw, + U32 ic, + U32 iStep, + U32 fwStep, + U32 fhStep) +{ + __asm__ __volatile__("mov %3, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%1), %%ymm0 \n\t" + "vmovups 0x20(%1), %%ymm4 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%0), %%ymm0 \n\t" + "vmovups (%0, %2), %%ymm4 \n\t" + + ".align 16 \n\t" + "1: \n\t" 
+ : + : "r"(curO), "r"(curB), "r"(I64(oStep)), "r"(store) + : "%eax", "%ymm0", "%ymm4", "memory", "cc"); + + if ((fh > 0) && (fw > 0)) { + __asm__ __volatile__(".align 16 \n\t" + "0: \n\t" + + "mov %2, %%ebx \n\t" + ".align 16 \n\t" + "1: \n\t" + + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "2: \n\t" + + "vmovaps (%1), %%ymm12 \n\t" + "vmovaps 0x20(%1), %%ymm13 \n\t" + "vbroadcastss (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "add %6, %0 \n\t" + "add $0x40, %1 \n\t" + "dec %%ecx \n\t" + "jg 2b \n\t" + + "add %5, %0 \n\t" + "add %8, %1 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + + "add %7, %0 \n\t" + "add %9, %1 \n\t" + "dec %%eax \n\t" + "jg 0b \n\t" + : + : "r"(in_0), "r"(curW), "r"(fh), "r"(fw), "a"(ic), "r"(I64(hStep)), + "r"(I64(dw)), "r"(I64(iStep)), "r"(I64(fwStep)), "r"(I64(fhStep)) + : "%ecx", "%ebx", "%ymm0", "%ymm4", "%ymm8", "%ymm12", "%ymm13", + "%ymm15", "memory", "cc"); + } + + __asm__ __volatile__( + // relu + "and $0x6, %2 \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + + // relu6 + "and $0x4, %2 \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + + "3: \n\t" + "vmovups %%ymm0, (%0) \n\t" + "vmovups %%ymm4, (%0, %1) \n\t" + : + : "r"(curO), "r"(I64(oStep)), "r"(store) + : "%ecx", "%ymm0", "%ymm4", "%ymm12", "%ymm15", "memory", "cc"); +} + +void avx2_conv_kernel_4x8(F32 *in_0, + F32 *in_1, + F32 *in_2, + F32 *in_3, + const F32 *curW, + F32 *curO, + const F32 *curB, + U32 fw, + U32 fh, + U32 oStep, + U32 hStep, + U32 store, + U32 dw, + U32 ic, + U32 iStep, + U32 fwStep, + U32 fhStep) +{ + __asm__ __volatile__("mov %2, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%1), %%ymm0 \n\t" + "vmovups (%1), %%ymm1 \n\t" + "vmovups (%1), %%ymm2 \n\t" + "vmovups (%1), %%ymm3 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%0), %%ymm0 \n\t" + "vmovups 0x20(%0), %%ymm1 \n\t" + "vmovups 0x40(%0), %%ymm2 \n\t" + "vmovups 0x60(%0), %%ymm3 \n\t" + + ".align 16 \n\t" + "1: \n\t" + : + : "r"(curO), "r"(curB), "r"(store) + : "%eax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "memory", "cc"); + + if ((fh > 0) && (fw > 0)) { + __asm__ __volatile__(".align 16 \n\t" + "0: \n\t" + + "mov %5, %%ebx \n\t" + ".align 16 \n\t" + "1: \n\t" + + "mov %6, %%ecx \n\t" + ".align 16 \n\t" + "2: \n\t" + + "vmovaps (%4), %%ymm12 \n\t" + "vbroadcastss (%0), %%ymm11 \n\t" + "vbroadcastss (%1), %%ymm13 \n\t" + "vbroadcastss (%2), %%ymm14 \n\t" + "vbroadcastss (%3), %%ymm15 \n\t" + "vfmadd231ps %%ymm11, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm1 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %9, %3 \n\t" + "add $0x20, %4 \n\t" + "dec %%ecx \n\t" + "jg 2b \n\t" + + "add %8, %0 \n\t" + "add %8, %1 \n\t" + "add %8, %2 \n\t" + "add %8, %3 \n\t" + "add %11, %4 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + + "add %10, %0 \n\t" + "add %10, %1 \n\t" + "add %10, %2 \n\t" + "add %10, %3 \n\t" + "add %12, %4 \n\t" + "dec %%eax \n\t" + "jg 0b \n\t" + : + : "r"(in_0), "r"(in_1), "r"(in_2), "r"(in_3), "r"(curW), "r"(fh), + "r"(fw), "a"(ic), "r"(I64(hStep)), "r"(I64(dw)), "r"(I64(iStep)), + "r"(I64(fwStep)), "r"(I64(fhStep)) + : "%ecx", "%ebx", "%ymm0", 
"%ymm1", "%ymm2", "%ymm3", "%ymm11", + "%ymm14", "%ymm12", "%ymm13", "%ymm15", "memory", "cc"); + } + + __asm__ __volatile__( + // relu + "and $0x6, %1 \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + + // relu6 + "and $0x4, %1 \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + + "3: \n\t" + "vmovups %%ymm0, (%0) \n\t" + "vmovups %%ymm1, 0x20(%0) \n\t" + "vmovups %%ymm2, 0x40(%0) \n\t" + "vmovups %%ymm3, 0x60(%0) \n\t" + : + : "r"(curO), "r"(store) + : "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm12", "%ymm15", "memory", "cc"); +} + +void avx2_conv_kernel_1x8(F32 *in_0, + F32 *in_1, + F32 *in_2, + F32 *in_3, + const F32 *curW, + F32 *curO, + const F32 *curB, + U32 fw, + U32 fh, + U32 oStep, + U32 hStep, + U32 store, + U32 dw, + U32 ic, + U32 iStep, + U32 fwStep, + U32 fhStep) +{ + __asm__ __volatile__("mov %2, %%eax \n\t" + "and $0x1, %%eax \n\t" + "jne 0f \n\t" + "vmovups (%1), %%ymm0 \n\t" + "jmp 1f \n\t" + + ".align 16 \n\t" + "0: \n\t" + "vmovups (%0), %%ymm0 \n\t" + + ".align 16 \n\t" + "1: \n\t" + : + : "r"(curO), "r"(curB), "r"(store) + : "%eax", "%ymm0", "memory", "cc"); + + if ((fh > 0) && (fw > 0)) { + __asm__ __volatile__(".align 16 \n\t" + "0: \n\t" + + "mov %2, %%ebx \n\t" + ".align 16 \n\t" + "1: \n\t" + + "mov %3, %%ecx \n\t" + ".align 16 \n\t" + "2: \n\t" + + "vmovaps (%1), %%ymm12 \n\t" + "vbroadcastss (%0), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "add %6, %0 \n\t" + "add $0x20, %1 \n\t" + "dec %%ecx \n\t" + "jg 2b \n\t" + + "add %5, %0 \n\t" + "add %8, %1 \n\t" + "dec %%ebx \n\t" + "jg 1b \n\t" + + "add %7, %0 \n\t" + "add %9, %1 \n\t" + "dec %%eax \n\t" + "jg 0b \n\t" + : + : "r"(in_0), "r"(curW), "r"(fh), "r"(fw), "a"(ic), "r"(I64(hStep)), + "r"(I64(dw)), "r"(I64(iStep)), "r"(I64(fwStep)), "r"(I64(fhStep)) + : "%ecx", "%ebx", "%ymm0", "%ymm12", "%ymm15", "memory", "cc"); + } + + __asm__ __volatile__( + // relu + "and $0x6, %1 \n\t" + "je 0f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + + // relu6 + "and $0x4, %1 \n\t" + "je 0f \n\t" + "mov $0x40C00000, %%ecx \n\t" + "vmovd %%ecx, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + + "0: \n\t" + "vmovups %%ymm0, (%0) \n\t" + : + : "r"(curO), "r"(store) + : "%ecx", "%ymm0", "%ymm12", "%ymm15", "memory", "cc"); +} + +EE convolution_direct_nchw(TensorDesc inputDesc, + F32 *inArray, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + I32 strideH = convParamSpec.stride_h; + I32 strideW = convParamSpec.stride_w; + I32 paddingT = 
convParamSpec.padding_top; + I32 paddingB = convParamSpec.padding_bottom; + I32 paddingL = convParamSpec.padding_left; + I32 paddingR = convParamSpec.padding_right; + I32 dilateH = convParamSpec.dilatedRate_h; + I32 dilateW = convParamSpec.dilatedRate_w; + + if (((fdf != DF_NCHWCxN24) && (fdf != DF_NCHWCxN32)) || (idf != DF_NCHW)) { + CHECK_STATUS(NOT_MATCH); + } + + F32 *curI, *curO, *calI, *calO; + const F32 *curW, *curB, *calW; + F32 *ftmp = inArray; + filterArray = (F32 *)align_addr(filterArray, 32); + + U32 oStep = oh * ow * UNROLL_OC_DIM * 4; + U32 iStep = ((ih - fh) * iw) * 4; + U32 hStep = (iw - fw * dilateW + (dilateH - 1) * iw) * 4; + U32 dw = dilateW * 4; + U32 wSize = 0, store = 0, ocSize = 0, icSize = 0, hwSize = 0, icbSize = 0; + I32 ih_idx = 0; + kernel_func kernel[4][2] = {{avx2_conv_kernel_1x8, avx2_conv_kernel_4x8}, + {avx2_conv_kernel_1x16, avx2_conv_kernel_4x16}, + {avx2_conv_kernel_1x24, avx2_conv_kernel_4x24}, + {avx2_conv_kernel_1x32, avx2_conv_kernel_3x32}}; + U32 ocblocks[4] = {8, 16, 24, 32}; + U32 wblocks[4] = {4, 4, 4, 3}; + U32 unroll_w = UNROLL_W, unroll_oc = BLOCK_OC_DIM; + I32 ohow = oh * ow; + + U32 k24 = (oc + 23) / 24 * (ohow + 3) / 4; + U32 k32 = (oc + 31) / 32 * (ohow + 2) / 3; + if (k32 < k24) { + unroll_oc = 32; + } + + I32 oh_padding_t = 0; + I32 oh_padding_b = 0; + + for (U32 n = 0; n < in; ++n) { + store = 0; + for (U32 icbb = 0; icbb < ic; icbb += icSize) { + icSize = UNI_MIN(BLOCK_IC_DIM, ic - icbb); + store |= (icbb > 0); + if (icbb == ic - icSize) { + store |= U32(activationDesc.mode) << 1; + } + if ((paddingL == 0) && (paddingR == 0) && (paddingT != 0 || paddingB != 0)) { + oh_padding_t = UNI_MIN((paddingT - 1) / strideH + 1, oh); + oh_padding_b = UNI_MIN((paddingB - 1) / strideH + 1, oh - oh_padding_t); + if (((ih + paddingT - fh) / strideH + 1) >= oh) { + oh_padding_b = 0; + } + for (U32 ocb = 0; ocb < oc; ocb += ocSize) { + ocSize = UNI_MIN(unroll_oc, oc - ocb); + ocSize = ocblocks[(ocSize >> 3) - 1]; + unroll_w = wblocks[(ocSize >> 3) - 1]; + curW = filterArray + ocb * ic * fh * fw + ocSize * icbb * fh * fw; + curB = biasArray + ocb; + curI = ftmp + icbb * ih * iw; + for (I32 h = 0; h < oh_padding_t; ++h) { + I32 in_h_0 = h * strideH - paddingT; + U32 tfh = UNI_MIN(fh + in_h_0, ih); + iStep = ((ih - tfh) * iw) * 4; + for (I32 w = 0; w < ow; w += wSize) { + wSize = UNI_MIN(ow - w, unroll_w); + if (wSize < unroll_w) { + wSize = 1; + } + U32 in_w_0 = w * strideW; + U32 in_w_1 = (w + 1) * strideW; + U32 in_w_2 = (w + 2) * strideW; + U32 in_w_3 = (w + 3) * strideW; + F32 *out_ptr = outArray + (n * oc + ocb) * ohow + (h * ow + w) * 8; + F32 *in_0 = curI + in_w_0; + F32 *in_1 = curI + in_w_1; + F32 *in_2 = curI + in_w_2; + F32 *in_3 = curI + in_w_3; + kernel[(ocSize >> 3) - 1][wSize > 1](in_0, in_1, in_2, in_3, curW + (fh - tfh) * fw * ocSize, + out_ptr, curB, fw, tfh, oStep, hStep, store, dw, icSize, iStep, 0, fw * (fh - tfh) * ocSize * 4); + } + } + } + } + if ((paddingL == 0) && (paddingR == 0)) { + iStep = ((ih - fh) * iw) * 4; + for (I32 hw = oh_padding_t * ow; hw < ohow - oh_padding_b * ow; hw += hwSize) { + hwSize = UNI_MIN(BLOCK_HW_DIM, ohow - oh_padding_b * ow - hw); + for (U32 ocb = 0; ocb < oc; ocb += ocSize) { + ocSize = UNI_MIN(unroll_oc, oc - ocb); + ocSize = ocblocks[(ocSize >> 3) - 1]; + unroll_w = wblocks[(ocSize >> 3) - 1]; + curW = filterArray + ocb * ic * fh * fw + ocSize * icbb * fh * fw; + curB = biasArray + ocb; + curI = ftmp + icbb * ih * iw; + for (I32 ihw = hw; ihw < hw + hwSize; ihw += wSize) { + wSize = UNI_MIN(hw + 
hwSize - ihw, unroll_w); + if (wSize < unroll_w) { + wSize = 1; + } + U32 in_h_0 = ihw / ow * strideH - paddingT; + U32 in_w_0 = ihw % ow * strideW; + U32 in_h_1 = (ihw + 1) / ow * strideH - paddingT; + U32 in_w_1 = (ihw + 1) % ow * strideW; + U32 in_h_2 = (ihw + 2) / ow * strideH - paddingT; + U32 in_w_2 = (ihw + 2) % ow * strideW; + U32 in_h_3 = (ihw + 3) / ow * strideH - paddingT; + U32 in_w_3 = (ihw + 3) % ow * strideW; + F32 *out_ptr = outArray + (n * oc + ocb) * ohow + ihw * 8; + F32 *in_0 = curI + in_h_0 * iw + in_w_0; + F32 *in_1 = curI + in_h_1 * iw + in_w_1; + F32 *in_2 = curI + in_h_2 * iw + in_w_2; + F32 *in_3 = curI + in_h_3 * iw + in_w_3; + kernel[(ocSize >> 3) - 1][wSize > 1](in_0, in_1, in_2, in_3, curW, + out_ptr, curB, fw, fh, oStep, hStep, store, dw, icSize, iStep, 0, 0); + } + } + } + } + if ((paddingL == 0) && (paddingR == 0) && (paddingT != 0 || paddingB != 0)) { + for (U32 ocb = 0; ocb < oc; ocb += ocSize) { + ocSize = UNI_MIN(unroll_oc, oc - ocb); + ocSize = ocblocks[(ocSize >> 3) - 1]; + unroll_w = wblocks[(ocSize >> 3) - 1]; + curW = filterArray + ocb * ic * fh * fw + ocSize * icbb * fh * fw; + curB = biasArray + ocb; + curI = ftmp + icbb * ih * iw; + for (I32 h = oh - oh_padding_b; h < oh; ++h) { + I32 in_h_0 = h * strideH - paddingT; + U32 tfh = ih - in_h_0; + iStep = ((ih - tfh) * iw) * 4; + for (I32 w = 0; w < ow; w += wSize) { + wSize = UNI_MIN(ow - w, unroll_w); + if (wSize < unroll_w) { + wSize = 1; + } + U32 in_w_0 = w * strideW; + U32 in_w_1 = (w + 1) * strideW; + U32 in_w_2 = (w + 2) * strideW; + U32 in_w_3 = (w + 3) * strideW; + F32 *out_ptr = outArray + (n * oc + ocb) * ohow + (h * ow + w) * 8; + F32 *in_0 = curI + in_h_0 * iw + in_w_0; + F32 *in_1 = curI + in_h_0 * iw + in_w_1; + F32 *in_2 = curI + in_h_0 * iw + in_w_2; + F32 *in_3 = curI + in_h_0 * iw + in_w_3; + kernel[(ocSize >> 3) - 1][wSize > 1](in_0, in_1, in_2, in_3, curW, + out_ptr, curB, fw, tfh, oStep, hStep, store, dw, icSize, iStep, 0, fw * (fh - tfh) * ocSize * 4); + } + } + } + } + if ((paddingL != 0) || (paddingR != 0)) { + I32 tfw = fw, tfh = fh, wh = 0; + I32 in_h = 0, in_w = 0; + I32 ow_padding_l = UNI_MIN((paddingL - 1) / strideW + 1, ow); + I32 ow_padding_r = UNI_MIN((paddingR - 1) / strideW + 1, ow - ow_padding_l); + if (((iw + paddingL - fw) / strideW + 1) >= ow) { + ow_padding_r = 0; + } + for (I32 h = 0; h < oh; ++h) { + tfh = fh; + in_h = h * strideH - paddingT; + calW = curW; + wh = 0; + if (in_h < 0) { + tfh = UNI_MIN(fh + in_h, ih); + in_h = 0; + wh = fh - tfh; + } else if (in_h + fh >= ih) { + tfh = ih - in_h; + curW = filterArray; + } + iStep = ((ih - tfh) * iw) * 4; + for (U32 ocb = 0; ocb < oc; ocb += ocSize) { + ocSize = UNI_MIN(unroll_oc, oc - ocb); + ocSize = ocblocks[(ocSize >> 3) - 1]; + unroll_w = wblocks[(ocSize >> 3) - 1]; + curW = filterArray + ocb * ic * fh * fw + ocSize * icbb * fh * fw + + wh * fw * ocSize; + curB = biasArray + ocb; + curI = ftmp + icbb * ih * iw + in_h * iw; + curO = outArray + ocb * ohow + h * ow * 8; + I32 w = 0; + for (; w < ow_padding_l; ++w) { + I32 in_w = w * strideW - paddingL; + tfw = UNI_MIN(fw + in_w, iw); + const F32 *useW = curW + (fw - tfw) * ocSize; + hStep = (iw - tfw * dilateW + (dilateH - 1) * iw) * 4; + calO = curO + w * 8; + kernel[(ocSize >> 3) - 1][0](curI, nullptr, nullptr, nullptr, useW, + calO, curB, tfw, tfh, oStep, hStep, store, dw, icSize, iStep, + (fw - tfw) * ocSize * 4, fw * (fh - tfh) * ocSize * 4); + } + for (; w < ow - ow_padding_r; w += wSize) { + hStep = (iw - fw * dilateW + (dilateH - 1) * iw) * 4; + 
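+                        // Reset hStep for the full-width middle columns: the left-border
+                        // loop above shrank the window to tfw taps. hStep is the byte
+                        // advance from the last tap of one filter row to the first tap
+                        // of the next: (iw - fw * dilateW) pixels plus (dilateH - 1)
+                        // whole rows, 4 bytes per F32.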
wSize = UNI_MIN(ow - ow_padding_r - w, unroll_w); + if (wSize < unroll_w) { + wSize = 1; + } + F32 *in_0 = curI + w * strideW - paddingL; + F32 *in_1 = curI + (w + 1) * strideW - paddingL; + F32 *in_2 = curI + (w + 2) * strideW - paddingL; + F32 *in_3 = curI + (w + 3) * strideW - paddingL; + calO = curO + w * 8; + kernel[(ocSize >> 3) - 1][wSize > 1](in_0, in_1, in_2, in_3, curW, calO, + curB, fw, tfh, oStep, hStep, store, dw, icSize, iStep, 0, + fw * (fh - tfh) * ocSize * 4); + } + for (; w < ow; ++w) { + I32 in_w = w * strideW - paddingL; + tfw = iw - in_w; + hStep = (iw - tfw * dilateW + (dilateH - 1) * iw) * 4; + F32 *in_0 = curI + in_w; + calO = curO + w * 8; + kernel[(ocSize >> 3) - 1][0](in_0, nullptr, nullptr, nullptr, curW, + calO, curB, tfw, tfh, oStep, hStep, store, dw, icSize, iStep, + (fw - tfw) * ocSize * 4, fw * (fh - tfh) * ocSize * 4); + } + } + } + } + } + inArray += ic * ih * iw; + outArray += oc * oh * ow; + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/convolution_transform.cpp b/compute/tensor/src/cpu/x86/fp32/convolution_transform.cpp new file mode 100644 index 00000000..09601378 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/convolution_transform.cpp @@ -0,0 +1,153 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
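+// Filter repacking for the AVX2 direct convolution kernels. NCHW weights are
+// reordered into NCHWCxNx blocks (N = 24 or 32 output channels, cx input
+// channels interleaved) so each FMA step streams N consecutive floats with
+// aligned loads. A minimal scalar sketch of the idea -- editorial, assuming
+// fn is a multiple of N and ignoring the cx sub-blocking that
+// transformNCHWToNCHWCxNx layers on top:
+//
+//   for (U32 n0 = 0; n0 < fn; n0 += N)          // output-channel block
+//     for (U32 c = 0; c < fc; ++c)              // input channel
+//       for (U32 hw = 0; hw < fh * fw; ++hw)    // filter tap
+//         for (U32 nn = 0; nn < N; ++nn)        // lane within the block
+//           dst[((n0 / N * fc + c) * fh * fw + hw) * N + nn] =
+//               src[((n0 + nn) * fc + c) * fh * fw + hw];
+//
+// which is why the compute kernels advance the weight pointer by N * 4
+// bytes per input-channel tap (0x60 for N = 24).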
+ +#include +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#include "cpu/x86/fp32/transform_functions_fp32.h" + +// N is 32/24 +template +inline EE transformNCHWToNCHWCxNxWrapper( + TensorDesc filterDesc, const F32 *filterArray, TensorDesc ftmDesc, F32 *ftmArray, U32 cx) +{ + EE ret = NOT_SUPPORTED; + switch (cx) { + case 128: + ret = transformNCHWToNCHWCxNx<128, N>(filterDesc, filterArray, ftmDesc, ftmArray); + break; + case 8: + ret = transformNCHWToNCHWCxNx<8, N>(filterDesc, filterArray, ftmDesc, ftmArray); + break; + case 1: + ret = transformNCHWToNCHWCxNx<1, N>(filterDesc, filterArray, ftmDesc, ftmArray); + break; + default: + break; + } + return ret; +} + +inline EE convolution_transform_filter_kernel_fp32(TensorDesc filterDesc, + const F32 *filterArray, + TensorDesc *ftmDesc, + F32 *ftmArray, + DataFormat ftmDataFormat, + U32 cx) +{ + if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) { + CHECK_STATUS(NULL_POINTER); + } + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + if (fdf == ftmDataFormat) { + *ftmDesc = filterDesc; + memcpy(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); + return SUCCESS; + } + if (fdf != DF_NCHW) { + CHECK_STATUS(NOT_SUPPORTED); + } + EE ret = SUCCESS; + switch (ftmDataFormat) { + case DF_NCHWCxN32: { + /* + * NCHW => NCHWCxN32 + */ + *ftmDesc = tensor4df(fdt, ftmDataFormat, fn, fc, fh, fw); + transformNCHWToNCHWCxNxWrapper<32>(filterDesc, filterArray, *ftmDesc, ftmArray, cx); + break; + } + case DF_NCHWCxN24: { + *ftmDesc = tensor4df(fdt, ftmDataFormat, fn, fc, fh, fw); + transformNCHWToNCHWCxNxWrapper<24>(filterDesc, filterArray, *ftmDesc, ftmArray, cx); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE convolution_transform_filter_fp32(TensorDesc filterDesc, + const F32 *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + F32 *filterTransformed) +{ + U32 cx = 0; + DataFormat ftmDataFormat; + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: { + ftmDataFormat = DF_NCHWCxN32; + cx = 8; + break; + } + case CONVOLUTION_ALGORITHM_POINTWISE: { + if ((fn % 24 != 0) && (fn % 32 == 0)) { + ftmDataFormat = DF_NCHWCxN32; + } else { + ftmDataFormat = DF_NCHWCxN24; + } + cx = 128; + break; + } + case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: { + fn = (fn + 7) / 8 * 8 / convParamSpec.group; + if ((fn % 24 == 0) && (fn % 32 != 0)) { + ftmDataFormat = DF_NCHWCxN24; + } else { + ftmDataFormat = DF_NCHWCxN32; + } + cx = 1; + break; + } + default: + return NOT_MATCH; + } + + // align to 32 byte + filterTransformed = (F32 *)(((uintptr_t)filterTransformed + 32 - 1) / 32 * 32); + + if (algorithm == CONVOLUTION_ALGORITHM_POINTWISE) { + EE ret = convolution_transform_filter_kernel_fp32( + filterDesc, filter, ftmDesc, filterTransformed, ftmDataFormat, cx); + CHECK_STATUS(ret); + return ret; + } + + U32 channelAxis = filterDesc.nDims - 1; + TensorDesc tmpFilterDesc = filterDesc; + tmpFilterDesc.dims[channelAxis] /= convParamSpec.group; + U32 fnPadding = tmpFilterDesc.dims[channelAxis]; + if (fnPadding % 8 != 0) { + fnPadding = (fnPadding / 8 + 1) * 8; + } + U32 originalTileSize = tensorNumElements(tmpFilterDesc); + for (U32 g = 0; g < convParamSpec.group; g++) { + CHECK_STATUS(convolution_transform_filter_kernel_fp32( + tmpFilterDesc, filter, 
ftmDesc, filterTransformed, ftmDataFormat, cx)); + U32 newTileSize = tensorNumElements(*ftmDesc) / tmpFilterDesc.dims[channelAxis] * fnPadding; + filter += originalTileSize; + filterTransformed += newTileSize; + } + ftmDesc->dims[channelAxis] = filterDesc.dims[channelAxis]; + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/deconvolution_transform.cpp b/compute/tensor/src/cpu/x86/fp32/deconvolution_transform.cpp new file mode 100644 index 00000000..9a256507 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/deconvolution_transform.cpp @@ -0,0 +1,185 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/x86/fp32/tensor_computing_fp32.h" + +template +inline void transformCNHW2NCHWCxNxKernel( + U32 fc, U32 fn, U32 fh, U32 fw, U32 fnPadding, const F32 *input, F32 *output) +{ + F32 *dest; + const F32 *src; + U32 cSize = 0, cSizePadding = 0; + U32 lstep = fh * fw; + U32 hwMax = fh * fw - 1; + __m256i vindex = _mm256_set_epi32( + lstep * 7, lstep * 6, lstep * 5, lstep * 4, lstep * 3, lstep * 2, lstep, 0); + for (U32 n = 0; n < fn; n += cSize) { + cSize = UNI_MIN(fn - n, C); + cSizePadding = UNI_MIN(fnPadding - n, C); + for (U32 hw = 0; hw < fh * fw; ++hw) { + for (U32 c8 = 0; c8 < cSize; ++c8) { + src = input + (n + c8) * fc * fh * fw + hwMax - hw; + dest = output + n * fh * fw * N + hw * cSizePadding * N + c8 * N; + if (N >= 8) { + _mm256_storeu_ps(dest, _mm256_i32gather_ps(src, vindex, 4)); + } + if (N >= 16) { + _mm256_storeu_ps(dest + 8, _mm256_i32gather_ps(src + 8 * lstep, vindex, 4)); + } + if (N >= 24) { + _mm256_storeu_ps(dest + 16, _mm256_i32gather_ps(src + 16 * lstep, vindex, 4)); + } + if (N == 32) { + _mm256_storeu_ps(dest + 24, _mm256_i32gather_ps(src + 24 * lstep, vindex, 4)); + } + } + memset(dest + N, 0, ((cSizePadding - cSize) * N * 4)); + } + } +} + +// N is 32/24 +template +inline EE transformCNHW2NCHWCxNx( + TensorDesc inputDesc, const F32 *input, TensorDesc outputDesc, F32 *output) +{ + if (input == NULL || output == NULL) { + CHECK_STATUS(NULL_POINTER); + } + DataType fdt, odt; + DataFormat fdf, odf; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + U32 tail = fc % N; + U32 remain = fc - tail; + + for (U32 c = 0; c < remain; c += N) { + transformCNHW2NCHWCxNxKernel(fc, fn, fh, fw, on, input, output); + input += fh * fw * N; + output += on * 
fh * fw * N; + } + if (tail >= 16) { + transformCNHW2NCHWCxNxKernel(fc, fn, fh, fw, on, input, output); + input += fh * fw * 16; + output += on * fh * fw * 16; + tail -= 16; + } + if (tail >= 8) { + transformCNHW2NCHWCxNxKernel(fc, fn, fh, fw, on, input, output); + input += fh * fw * 8; + output += on * fh * fw * 8; + tail -= 8; + } + if (tail > 0) { + F32 *dest; + const F32 *src; + U32 cSize = 0, cSizePadding = 0; + U32 hwMax = fh * fw - 1; + F32 m[8] = {0.0f}; + for (U32 i = 0; i < tail; ++i) { + m[i] = -1.0f; + } + __m256 mask = _mm256_set_ps(m[7], m[6], m[5], m[4], m[3], m[2], m[1], m[0]); + U32 lstep = fh * fw; + __m256i vindex = _mm256_set_epi32( + lstep * 7, lstep * 6, lstep * 5, lstep * 4, lstep * 3, lstep * 2, lstep, 0); + __m256 src256 = _mm256_setzero_ps(); + + for (U32 n = 0; n < fn; n += cSize) { + cSize = UNI_MIN(fn - n, C); + cSizePadding = UNI_MIN(on - n, C); + for (U32 hw = 0; hw < fh * fw; ++hw) { + for (U32 c8 = 0; c8 < cSize; ++c8) { + src = input + (n + c8) * fc * fh * fw + hwMax - hw; + dest = output + n * fh * fw * 8 + hw * cSizePadding * 8 + c8 * 8; + _mm256_storeu_ps(dest, _mm256_mask_i32gather_ps(src256, src, vindex, mask, 4)); + } + memset(dest + 8, 0, ((cSizePadding - cSize) * 32)); + } + } + } + return SUCCESS; +} + +inline EE deconvolution_transform_filter_kernel_fp32(TensorDesc filterDesc, + const F32 *filterArray, + TensorDesc *ftmDesc, + F32 *ftmArray, + DataFormat ftmDataFormat) +{ + if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) { + CHECK_STATUS(NULL_POINTER); + } + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + ftmArray = (F32 *)(((uintptr_t)ftmArray + 32 - 1) / 32 * 32); + if (fdf == ftmDataFormat) { + *ftmDesc = filterDesc; + memcpy(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); + return SUCCESS; + } + if (fdf != DF_NCHW) { + CHECK_STATUS(NOT_SUPPORTED); + } + EE ret = SUCCESS; + switch (ftmDataFormat) { + case DF_NCHWCxN32: { + U32 fnAlignSize = 8; + U32 fnPadding = (fn + fnAlignSize - 1) / fnAlignSize * fnAlignSize; + *ftmDesc = tensor4df(fdt, ftmDataFormat, fnPadding, fc, fh, fw); + transformCNHW2NCHWCxNx<8, 32>(filterDesc, filterArray, *ftmDesc, ftmArray); + *ftmDesc = tensor4df(fdt, ftmDataFormat, fn, fc, fh, fw); + break; + } + case DF_NCHWC24: { + filterDesc = tensor4df(fdt, fdf, 1, fc, fh, fw); + *ftmDesc = tensor4df(fdt, ftmDataFormat, 1, fc, fh, fw); + transformCNHW2NCHWCxNx<1, 24>(filterDesc, filterArray, *ftmDesc, ftmArray); + *ftmDesc = tensor4df(fdt, ftmDataFormat, fn, fc, fh, fw); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE deconvolution_transform_filter_fp32(TensorDesc filterDesc, + const F32 *filter, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + F32 *filterTransformed) +{ + DataFormat ftmDataFormat; + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: + ftmDataFormat = DF_NCHWCxN32; + break; + case CONVOLUTION_ALGORITHM_GROUP_DECONV: + ftmDataFormat = DF_NCHWC24; + break; + default: + return NOT_MATCH; + } + EE ret = deconvolution_transform_filter_kernel_fp32( + filterDesc, filter, ftmDesc, filterTransformed, ftmDataFormat); + CHECK_STATUS(ret); + return ret; +} diff --git a/compute/tensor/src/cpu/x86/fp32/depthwise_convolution_direct.cpp b/compute/tensor/src/cpu/x86/fp32/depthwise_convolution_direct.cpp new file mode 100644 index 00000000..cf506a78 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/depthwise_convolution_direct.cpp @@ -0,0 +1,834 @@ +// 
Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing.h" + +#include "cpu/x86/fp32/tensor_computing_fp32.h" + +#define UNROLL_W 4 +#define SIMD_W 8 +#define UNROLL_OC_BLOCK_DIM 24 +#define align_addr(addr, unit) (((uintptr_t)addr + unit - 1) / unit * unit) + +typedef void (*kernel_func)(F32 *in0, + F32 *in1, + F32 *in2, + F32 *in3, + const F32 *curW, + F32 *curO, + const F32 *curB, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 hStep, + U32 store, + U32 dw, + U32 wStep); + +void avx2_dw_kernel_4x24(F32 *in0, + F32 *in1, + F32 *in2, + F32 *in3, + const F32 *curW, + F32 *curO, + const F32 *curB, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 hStep, + U32 store, + U32 dw, + U32 wStep) +{ + __asm__ __volatile__("vmovups (%5), %%ymm0 \n\t" + "vmovups (%5), %%ymm1 \n\t" + "vmovups (%5), %%ymm2 \n\t" + "vmovups (%5), %%ymm3 \n\t" + "vmovups 0x20(%5), %%ymm4 \n\t" + "vmovups 0x20(%5), %%ymm5 \n\t" + "vmovups 0x20(%5), %%ymm6 \n\t" + "vmovups 0x20(%5), %%ymm7 \n\t" + "vmovups 0x40(%5), %%ymm8 \n\t" + "vmovups 0x40(%5), %%ymm9 \n\t" + "vmovups 0x40(%5), %%ymm10 \n\t" + "vmovups 0x40(%5), %%ymm11 \n\t" + + "cmp $0, %%ecx \n\t" + "je 3f \n\t" + "cmp $0, %6 \n\t" + "je 3f \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %6, %%eax \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovaps (%4), %%ymm12 \n\t" + "vmovups (%0), %%ymm13 \n\t" + "vmovups (%1), %%ymm14 \n\t" + "vmovups (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vmovups (%3), %%ymm13 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + + "vmovaps 0x20(%4), %%ymm14 \n\t" + "vmovups (%0, %8), %%ymm15 \n\t" + "vmovups (%1, %8), %%ymm13 \n\t" + "vmovups (%2, %8), %%ymm12 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm4 \n\t" + "vfmadd231ps %%ymm13, %%ymm14, %%ymm5 \n\t" + "vmovups (%3, %8), %%ymm15 \n\t" + "vfmadd231ps %%ymm12, %%ymm14, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm7 \n\t" + + "vmovaps 0x40(%4), %%ymm13 \n\t" + "vmovups (%0, %8, 2), %%ymm12 \n\t" + "vmovups (%1, %8, 2), %%ymm15 \n\t" + "vmovups (%2, %8, 2), %%ymm14 \n\t" + "vfmadd231ps %%ymm12, %%ymm13, %%ymm8 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm9 \n\t" + "vmovups (%3, %8, 2), %%ymm12 \n\t" + "vfmadd231ps %%ymm14, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm12, %%ymm13, %%ymm11 \n\t" + + "add %12, %0 \n\t" + "add 
%12, %1 \n\t" + "add %12, %2 \n\t" + "add %12, %3 \n\t" + "add $0x60, %4 \n\t" + "dec %%eax \n\t" + "jg 1b \n\t" + + "add %10, %4 \n\t" + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %9, %3 \n\t" + "dec %%ecx \n\t" + "jg 0b \n\t" + + // relu + "mov %11, %%eax \n\t" + "and $0x6, %%eax \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm5, %%ymm5 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm7, %%ymm7 \n\t" + "vmaxps %%ymm15, %%ymm8, %%ymm8 \n\t" + "vmaxps %%ymm15, %%ymm9, %%ymm9 \n\t" + "vmaxps %%ymm15, %%ymm10, %%ymm10 \n\t" + "vmaxps %%ymm15, %%ymm11, %%ymm11 \n\t" + + // relu6 + "and $0x4, %%eax \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%eax \n\t" + "vmovd %%eax, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm5, %%ymm5 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm7, %%ymm7 \n\t" + "vminps %%ymm12, %%ymm8, %%ymm8 \n\t" + "vminps %%ymm12, %%ymm9, %%ymm9 \n\t" + "vminps %%ymm12, %%ymm10, %%ymm10 \n\t" + "vminps %%ymm12, %%ymm11, %%ymm11 \n\t" + + ".align 16 \n\t" + "3: \n\t" + : + : "r"(in0), "r"(in1), "r"(in2), "r"(in3), "r"(curW), "r"(curB), "r"(fw), + "c"(fh), "r"((I64)iStep), "r"((I64)hStep), "r"((I64)wStep), "r"(store), + "r"((I64)dw) + : "%eax", "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); + + __asm__ __volatile__("vmovups %%ymm0, (%0) \n\t" + "vmovups %%ymm1, 0x20(%0) \n\t" + "vmovups %%ymm2, 0x40(%0) \n\t" + "vmovups %%ymm3, 0x60(%0) \n\t" + "vmovups %%ymm4, (%0, %1) \n\t" + "vmovups %%ymm5, 0x20(%0, %1) \n\t" + "vmovups %%ymm6, 0x40(%0, %1) \n\t" + "vmovups %%ymm7, 0x60(%0, %1) \n\t" + "vmovups %%ymm8, (%0, %1, 2) \n\t" + "vmovups %%ymm9, 0x20(%0, %1, 2) \n\t" + "vmovups %%ymm10, 0x40(%0, %1, 2) \n\t" + "vmovups %%ymm11, 0x60(%0, %1, 2) \n\t" + + ".align 16 \n\t" + "1: \n\t" + : + : "r"(curO), "r"((I64)oStep) + : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", + "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", + "%ymm15", "memory", "cc"); +} + +void avx2_dw_kernel_4x16(F32 *in0, + F32 *in1, + F32 *in2, + F32 *in3, + const F32 *curW, + F32 *curO, + const F32 *curB, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 hStep, + U32 store, + U32 dw, + U32 wStep) +{ + __asm__ __volatile__("vmovups (%5), %%ymm0 \n\t" + "vmovups (%5), %%ymm1 \n\t" + "vmovups (%5), %%ymm2 \n\t" + "vmovups (%5), %%ymm3 \n\t" + "vmovups 0x20(%5), %%ymm4 \n\t" + "vmovups 0x20(%5), %%ymm5 \n\t" + "vmovups 0x20(%5), %%ymm6 \n\t" + "vmovups 0x20(%5), %%ymm7 \n\t" + + "cmp $0, %%ecx \n\t" + "je 3f \n\t" + "cmp $0, %6 \n\t" + "je 3f \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %6, %%eax \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovaps (%4), %%ymm12 \n\t" + "vmovups (%0), %%ymm13 \n\t" + "vmovups (%1), %%ymm14 \n\t" + "vmovups (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vmovups (%3), %%ymm13 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 
\n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + + "vmovaps 0x20(%4), %%ymm14 \n\t" + "vmovups (%0, %8), %%ymm15 \n\t" + "vmovups (%1, %8), %%ymm13 \n\t" + "vmovups (%2, %8), %%ymm12 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm4 \n\t" + "vfmadd231ps %%ymm13, %%ymm14, %%ymm5 \n\t" + "vmovups (%3, %8), %%ymm15 \n\t" + "vfmadd231ps %%ymm12, %%ymm14, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm7 \n\t" + + "add %12, %0 \n\t" + "add %12, %1 \n\t" + "add %12, %2 \n\t" + "add %12, %3 \n\t" + "add $0x40, %4 \n\t" + "dec %%eax \n\t" + "jg 1b \n\t" + + "add %10, %4 \n\t" + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %9, %3 \n\t" + "dec %%ecx \n\t" + "jg 0b \n\t" + + // relu + "mov %11, %%eax \n\t" + "and $0x6, %%eax \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm5, %%ymm5 \n\t" + "vmaxps %%ymm15, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm15, %%ymm7, %%ymm7 \n\t" + + // relu6 + "and $0x4, %%eax \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%eax \n\t" + "vmovd %%eax, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm5, %%ymm5 \n\t" + "vminps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vminps %%ymm12, %%ymm7, %%ymm7 \n\t" + + ".align 16 \n\t" + "3: \n\t" + : + : "r"(in0), "r"(in1), "r"(in2), "r"(in3), "r"(curW), "r"(curB), "r"(fw), + "c"(fh), "r"((I64)iStep), "r"((I64)hStep), "r"((I64)wStep), "r"(store), + "r"((I64)dw) + : "%eax", "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); + + __asm__ __volatile__("vmovups %%ymm0, (%0) \n\t" + "vmovups %%ymm1, 0x20(%0) \n\t" + "vmovups %%ymm2, 0x40(%0) \n\t" + "vmovups %%ymm3, 0x60(%0) \n\t" + "vmovups %%ymm4, (%0, %1) \n\t" + "vmovups %%ymm5, 0x20(%0, %1) \n\t" + "vmovups %%ymm6, 0x40(%0, %1) \n\t" + "vmovups %%ymm7, 0x60(%0, %1) \n\t" + + ".align 16 \n\t" + "1: \n\t" + : + : "r"(curO), "r"((I64)oStep) + : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", + "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", + "%ymm15", "memory", "cc"); +} + +void avx2_dw_kernel_4x8(F32 *in0, + F32 *in1, + F32 *in2, + F32 *in3, + const F32 *curW, + F32 *curO, + const F32 *curB, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 hStep, + U32 store, + U32 dw, + U32 wStep) +{ + __asm__ __volatile__("vmovups (%6), %%ymm0 \n\t" + "vmovups (%6), %%ymm1 \n\t" + "vmovups (%6), %%ymm2 \n\t" + "vmovups (%6), %%ymm3 \n\t" + + "cmp $0, %%ecx \n\t" + "je 3f \n\t" + "cmp $0, %6 \n\t" + "je 3f \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %7, %%eax \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovaps (%4), %%ymm12 \n\t" + "vmovups (%0), %%ymm13 \n\t" + "vmovups (%1), %%ymm14 \n\t" + "vmovups (%2), %%ymm15 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm1 \n\t" + "vmovups (%3), %%ymm13 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm2 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + + "add %13, %0 \n\t" + "add %13, %1 \n\t" + "add %13, %2 \n\t" + "add %13, %3 \n\t" + "add $0x20, %4 \n\t" + "dec %%eax \n\t" + "jg 1b 
\n\t" + + "add %9, %4 \n\t" + "add %11, %0 \n\t" + "add %11, %1 \n\t" + "add %11, %2 \n\t" + "add %11, %3 \n\t" + "dec %%ecx \n\t" + "jg 0b \n\t" + + // relu + "mov %12, %%eax \n\t" + "and $0x6, %%eax \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm1, %%ymm1 \n\t" + "vmaxps %%ymm15, %%ymm2, %%ymm2 \n\t" + "vmaxps %%ymm15, %%ymm3, %%ymm3 \n\t" + + // relu6 + "and $0x4, %%eax \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%eax \n\t" + "vmovd %%eax, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm1, %%ymm1 \n\t" + "vminps %%ymm12, %%ymm2, %%ymm2 \n\t" + "vminps %%ymm12, %%ymm3, %%ymm3 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "vmovups %%ymm0, (%5) \n\t" + "vmovups %%ymm1, 0x20(%5) \n\t" + "vmovups %%ymm2, 0x40(%5) \n\t" + "vmovups %%ymm3, 0x60(%5) \n\t" + : + : "r"(in0), "r"(in1), "r"(in2), "r"(in3), "r"(curW), "r"(curO), "r"(curB), + "r"(fw), "c"(fh), "r"((I64)wStep), "r"((I64)oStep), "r"((I64)hStep), + "r"(store), "r"((I64)dw) + : "%eax", "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); +} + +void avx2_dw_kernel_1x24(F32 *in0, + F32 *in1, + F32 *in2, + F32 *in3, + const F32 *curW, + F32 *curO, + const F32 *curB, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 hStep, + U32 store, + U32 dw, + U32 wStep) +{ + __asm__ __volatile__("vmovups (%3), %%ymm0 \n\t" + "vmovups 0x20(%3), %%ymm4 \n\t" + "vmovups 0x40(%3), %%ymm8 \n\t" + + "cmp $0, %%ecx \n\t" + "je 3f \n\t" + "cmp $0, %4 \n\t" + "je 3f \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %4, %%eax \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovaps (%1), %%ymm12 \n\t" + "vmovups (%0), %%ymm13 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vmovaps 0x20(%1), %%ymm14 \n\t" + "vmovups (%0, %8), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm4 \n\t" + + "vmovaps 0x40(%1), %%ymm13 \n\t" + "vmovups (%0, %8, 2), %%ymm12 \n\t" + "vfmadd231ps %%ymm12, %%ymm13, %%ymm8 \n\t" + + "add %11, %0 \n\t" + "add $0x60, %1 \n\t" + "dec %%eax \n\t" + "jg 1b \n\t" + + "add %6, %1 \n\t" + "add %9, %0 \n\t" + "dec %%ecx \n\t" + "jg 0b \n\t" + + // relu + "mov %10, %%eax \n\t" + "and $0x6, %%eax \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + "vmaxps %%ymm15, %%ymm8, %%ymm8 \n\t" + + // relu6 + "and $0x4, %%eax \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%eax \n\t" + "vmovd %%eax, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + "vminps %%ymm12, %%ymm8, %%ymm8 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "vmovups %%ymm0, (%2) \n\t" + "vmovups %%ymm4, (%2, %7) \n\t" + "vmovups %%ymm8, (%2, %7, 2) \n\t" + : + : "r"(in0), "r"(curW), "r"(curO), "r"(curB), "r"(fw), "c"(fh), + "r"((I64)wStep), "r"((I64)oStep), "r"((I64)iStep), "r"((I64)hStep), + "r"(store), "r"((I64)dw) + : "%eax", "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); +} + +void avx2_dw_kernel_1x16(F32 *in0, + F32 *in1, + F32 *in2, + F32 *in3, + const F32 *curW, + F32 *curO, + const F32 *curB, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 hStep, + U32 store, + U32 dw, + U32 wStep) +{ + __asm__ __volatile__("vmovups 
(%3), %%ymm0 \n\t" + "vmovups 0x20(%3), %%ymm4 \n\t" + + "cmp $0, %%ecx \n\t" + "je 3f \n\t" + "cmp $0, %4 \n\t" + "je 3f \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %4, %%eax \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovaps (%1), %%ymm12 \n\t" + "vmovups (%0), %%ymm13 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + "vmovaps 0x20(%1), %%ymm14 \n\t" + "vmovups (%0, %8), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm4 \n\t" + + "add %11, %0 \n\t" + "add $0x40, %1 \n\t" + "dec %%eax \n\t" + "jg 1b \n\t" + + "add %6, %1 \n\t" + "add %9, %0 \n\t" + "dec %%ecx \n\t" + "jg 0b \n\t" + + // relu + "mov %10, %%eax \n\t" + "and $0x6, %%eax \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm15, %%ymm4, %%ymm4 \n\t" + + // relu6 + "and $0x4, %%eax \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%eax \n\t" + "vmovd %%eax, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + "vminps %%ymm12, %%ymm4, %%ymm4 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "vmovups %%ymm0, (%2) \n\t" + "vmovups %%ymm4, (%2, %7) \n\t" + : + : "r"(in0), "r"(curW), "r"(curO), "r"(curB), "r"(fw), "c"(fh), + "r"((I64)wStep), "r"((I64)oStep), "r"((I64)iStep), "r"((I64)hStep), + "r"(store), "r"((I64)dw) + : "%eax", "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); +} + +void avx2_dw_kernel_1x8(F32 *in0, + F32 *in1, + F32 *in2, + F32 *in3, + const F32 *curW, + F32 *curO, + const F32 *curB, + U32 fw, + U32 fh, + U32 oStep, + U32 iStep, + U32 hStep, + U32 store, + U32 dw, + U32 wStep) +{ + __asm__ __volatile__("vmovups (%3), %%ymm0 \n\t" + + "cmp $0, %%ecx \n\t" + "je 3f \n\t" + "cmp $0, %4 \n\t" + "je 3f \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %4, %%eax \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovaps (%1), %%ymm12 \n\t" + "vmovups (%0), %%ymm13 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm0 \n\t" + + "add %11, %0 \n\t" + "add $0x20, %1 \n\t" + "dec %%eax \n\t" + "jg 1b \n\t" + + "add %6, %1 \n\t" + "add %9, %0 \n\t" + "dec %%ecx \n\t" + "jg 0b \n\t" + + // relu + "mov %10, %%eax \n\t" + "and $0x6, %%eax \n\t" + "je 3f \n\t" + "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" + "vmaxps %%ymm15, %%ymm0, %%ymm0 \n\t" + + // relu6 + "and $0x4, %%eax \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%eax \n\t" + "vmovd %%eax, %%xmm12 \n\t" + "vpermps %%ymm12, %%ymm15, %%ymm12 \n\t" + "vminps %%ymm12, %%ymm0, %%ymm0 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "vmovups %%ymm0, (%2) \n\t" + : + : "r"(in0), "r"(curW), "r"(curO), "r"(curB), "r"(fw), "c"(fh), + "r"((I64)wStep), "r"((I64)oStep), "r"((I64)iStep), "r"((I64)hStep), + "r"(store), "r"((I64)dw) + : "%eax", "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "%ymm14", "%ymm15", "memory", "cc"); +} + +EE depthwise_convolution_direct(TensorDesc inputDesc, + F32 *inArray, + TensorDesc dwFilterDesc, + const F32 *dwFilterArray, + TensorDesc pwFilterDesc, + const F32 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F32 *dwBiasArray, + TensorDesc pwBiasDesc, + const F32 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec) +{ + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + 
U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + I32 strideH = convParamSpec.stride_h; + I32 strideW = convParamSpec.stride_w; + I32 paddingT = convParamSpec.padding_top; + I32 paddingB = convParamSpec.padding_bottom; + I32 paddingL = convParamSpec.padding_left; + I32 paddingR = convParamSpec.padding_right; + I32 dilateH = convParamSpec.dilatedRate_h; + I32 dilateW = convParamSpec.dilatedRate_w; + + if (fdf != DF_NCHWC24 || idf != DF_NCHWC8) { + CHECK_STATUS(NOT_MATCH); + } + + F32 *curI, *curO, *calI, *calO; + const F32 *curW, *curB, *calW; + F32 *ftmp = inArray; + dwFilterArray = (F32 *)align_addr(dwFilterArray, 32); + + U32 icAlignSize = 8; + U32 icPadding = (ic + icAlignSize - 1) / icAlignSize * icAlignSize; + U32 ih_pad = ih + paddingT + paddingB; + U32 iw_pad = iw + paddingL + paddingR; + F32 *useOutArray = (F32 *)align_addr(tmp, 32); + if (pwFilterArray == nullptr) { + useOutArray = outArray; + } + + U32 oStep = oh * ow * SIMD_W * 4; + U32 ocblocking = 0; + U32 iStep = ih * iw * SIMD_W * 4; + U32 hStep = (iw - fw * dilateW + (dilateH - 1) * iw) * SIMD_W * 4; + U32 sw = strideW * SIMD_W * 4; + U32 dw = dilateW * SIMD_W * 4; + U32 wSize = 0, store = 0, ocSize = 0; + U32 ocblocks[3] = {8, 16, 24}; + + U32 ohow = oh * ow; + + F32 *curIn[4]; + U32 in_h = 0, in_w = 0, oc_idx = 0; + + kernel_func kernel[2][3] = {{avx2_dw_kernel_1x8, avx2_dw_kernel_1x16, avx2_dw_kernel_1x24}, + {avx2_dw_kernel_4x8, avx2_dw_kernel_4x16, avx2_dw_kernel_4x24}}; + + store |= U32(depthwiseActivationParamSpec.mode) << 1; + for (U32 n = 0; n < in; ++n) { + for (U32 ocb = 0; ocb < icPadding; ocb += ocSize) { + curW = dwFilterArray + ocb * fh * fw; + curB = dwBiasArray + ocb; + curI = ftmp + ocb * ih * iw; + curO = useOutArray + (n * icPadding + ocb) * oh * ow; + ocSize = UNI_MIN(UNROLL_OC_BLOCK_DIM, icPadding - ocb); + oc_idx = (ocSize >> 3) - 1; + ocSize = ocblocks[oc_idx]; + if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { + for (U32 hw = 0; hw < ohow; hw += wSize) { + wSize = UNI_MIN(ohow - hw, UNROLL_W); + if (wSize < 4) { + wSize = 1; + } + U32 in_h_0 = hw / ow * strideH; + U32 in_w_0 = hw % ow * strideW; + U32 in_h_1 = (hw + 1) / ow * strideH; + U32 in_w_1 = (hw + 1) % ow * strideW; + U32 in_h_2 = (hw + 2) / ow * strideH; + U32 in_w_2 = (hw + 2) % ow * strideW; + U32 in_h_3 = (hw + 3) / ow * strideH; + U32 in_w_3 = (hw + 3) % ow * strideW; + F32 *in_0 = curI + in_h_0 * iw_pad * 8 + in_w_0 * 8; + F32 *in_1 = curI + in_h_1 * iw_pad * 8 + in_w_1 * 8; + F32 *in_2 = curI + in_h_2 * iw_pad * 8 + in_w_2 * 8; + F32 *in_3 = curI + in_h_3 * iw_pad * 8 + in_w_3 * 8; + calO = curO + hw * 8; + + kernel[wSize >> 2][oc_idx](in_0, in_1, in_2, in_3, curW, calO, curB, fw, fh, + oStep, iStep, hStep, store, dw, 0); + } + } else { + I32 tfw = fw, tfh = fh; + I32 in_h = 0, in_w = 0; + I32 ow_padding_l = UNI_MIN((paddingL - 1) / strideW + 1, ow); + I32 ow_padding_r = UNI_MIN((paddingR - 1) / strideW + 1, ow - ow_padding_l); + if (((iw + paddingL - fw) / strideW + 1) >= ow) { + ow_padding_r = 0; + } + for (I32 h = 0; h < oh; ++h) { + tfh = fh; + in_h = h * strideH - paddingT; + calW = curW; + if (in_h < 0) { + tfh = UNI_MIN(fh + in_h, ih); + calW = curW + (fh - tfh) * fw * ocSize; + in_h = 0; + } else if (in_h + fh >= ih) { + tfh = ih - in_h; + } + I32 w = 0; + for (; w < ow_padding_l; ++w) { + I32 
in_w = w * strideW - paddingL; + tfw = UNI_MIN(fw + in_w, iw); + const F32 *useW = calW + (fw - tfw) * ocSize; + hStep = (iw - tfw * dilateW + (dilateH - 1) * iw) * SIMD_W * 4; + F32 *in_0 = curI + in_h * iw * 8; + calO = curO + (h * ow + w) * 8; + kernel[0][oc_idx](in_0, nullptr, nullptr, nullptr, useW, calO, curB, tfw, + tfh, oStep, iStep, hStep, store, dw, (fw - tfw) * ocSize * 4); + } + for (; w < ow - ow_padding_r; w += wSize) { + hStep = (iw - fw * dilateW + (dilateH - 1) * iw) * SIMD_W * 4; + wSize = UNI_MIN(ow - ow_padding_r - w, UNROLL_W); + if (wSize < 4) { + wSize = 1; + } + U32 in_w_0 = w * strideW - paddingL; + U32 in_w_1 = (w + 1) * strideW - paddingL; + U32 in_w_2 = (w + 2) * strideW - paddingL; + U32 in_w_3 = (w + 3) * strideW - paddingL; + F32 *in_0 = curI + in_h * iw * 8 + in_w_0 * 8; + F32 *in_1 = curI + in_h * iw * 8 + in_w_1 * 8; + F32 *in_2 = curI + in_h * iw * 8 + in_w_2 * 8; + F32 *in_3 = curI + in_h * iw * 8 + in_w_3 * 8; + calO = curO + (h * ow + w) * 8; + + kernel[wSize >> 2][oc_idx](in_0, in_1, in_2, in_3, calW, calO, curB, fw, + tfh, oStep, iStep, hStep, store, dw, 0); + } + for (; w < ow; ++w) { + I32 in_w = w * strideW - paddingL; + tfw = iw - in_w; + hStep = (iw - tfw * dilateW + (dilateH - 1) * iw) * SIMD_W * 4; + F32 *in_0 = curI + in_h * iw * 8 + in_w * 8; + calO = curO + (h * ow + w) * 8; + kernel[0][oc_idx](in_0, nullptr, nullptr, nullptr, calW, calO, curB, tfw, + tfh, oStep, iStep, hStep, store, dw, (fw - tfw) * ocSize * 4); + } + } + } + } + } + + if (pwFilterArray != nullptr) { + TensorDesc pwInputDesc = tensor4df(odt, DF_NCHWC8, 1, ic, oh, ow); + tmpBytes -= oh * ic *oh * ow + 32; + tmp = (void *)((F32 *)tmp + oh * ic *oh * ow + 32); + ConvolutionParamSpec p = createConvolutionParamSpec( + 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, fn, Convolution_Pointwise); + convolution_1x1_direct(pwInputDesc, useOutArray, pwFilterDesc, pwFilterArray, p, + pwBiasArray, tmpBytes, tmp, outputDesc, outArray, pointwiseActivationParamSpec); + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/depthwise_convolution_transform.cpp b/compute/tensor/src/cpu/x86/fp32/depthwise_convolution_transform.cpp new file mode 100644 index 00000000..ff2d97dd --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/depthwise_convolution_transform.cpp @@ -0,0 +1,71 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
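+// Depthwise filters carry one fh x fw plane per channel, so only the channel
+// axis is blocked: DF_NCHWC24 stores 24 channels per tap (three 8-float AVX2
+// registers), matching the avx2_dw_kernel_*x24 accumulators. Simplified
+// mapping, assuming fc is a multiple of 24 (the real transform also emits
+// 16- and 8-wide tail blocks):
+//
+//   dst[(c0 * fh * fw + hw) * 24 + cc] = src[(24 * c0 + cc) * fh * fw + hw];
+//   // c0: 24-channel block index, cc: lane within the block, hw: filter tap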
+ +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#include "cpu/x86/fp32/transform_functions_fp32.h" + +inline EE depthwise_convolution_transform_filter_kernel_fp32(TensorDesc filterDesc, + const F32 *filterArray, + TensorDesc *ftmDesc, + F32 *ftmArray, + DataFormat ftmDataFormat) +{ + if (nullptr == filterArray || nullptr == ftmDesc || nullptr == ftmArray) { + CHECK_STATUS(NULL_POINTER); + } + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + ftmArray = (F32 *)(((uintptr_t)ftmArray + 32 - 1) / 32 * 32); + if (fdf == ftmDataFormat) { + *ftmDesc = filterDesc; + memcpy(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); + return SUCCESS; + } + if (fdf != DF_NCHW) { + CHECK_STATUS(NOT_SUPPORTED); + } + switch (ftmDataFormat) { + case DF_NCHWC24: { + filterDesc = tensor4df(fdt, fdf, fc, 1, fh, fw); + *ftmDesc = tensor4df(fdt, ftmDataFormat, fc, 1, fh, fw); + transformNCHWToNCHWCxNx<1, 24>(filterDesc, filterArray, *ftmDesc, ftmArray); + *ftmDesc = tensor4df(fdt, ftmDataFormat, fn, fc, fh, fw); + break; + } + default: + return NOT_SUPPORTED; + } + return SUCCESS; +} + +EE depthwise_convolution_transform_filter_fp32(TensorDesc filterDesc, + const F32 *filter, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + F32 *filterTransformed) +{ + DataFormat ftmDataFormat; + switch (algorithm) { + case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: + ftmDataFormat = DF_NCHWC24; + break; + default: + return NOT_MATCH; + } + EE ret = depthwise_convolution_transform_filter_kernel_fp32( + filterDesc, filter, ftmDesc, filterTransformed, ftmDataFormat); + CHECK_STATUS(ret); + return ret; +} diff --git a/compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution.cpp b/compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution.cpp new file mode 100644 index 00000000..84eb23e3 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution.cpp @@ -0,0 +1,69 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "types.h" +#include "cpu/x86/fp32/tensor_computing_fp32.h" + +EE depthwise_pointwise_convolution_fp32(TensorDesc inputDesc, + F32 *input, + TensorDesc dwFilterDesc, + const F32 *dwFilter, + TensorDesc pwFilterDesc, + const F32 *pwFilter, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc dwBiasDesc, + const F32 *dwBias, + TensorDesc pwBiasDesc, + const F32 *pwBias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *output, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + Arch arch) +{ + UNUSED(arch); + if (nullptr == input || nullptr == dwFilter || nullptr == output || nullptr == dwBias || + nullptr == tmp) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (!(idt == DT_F32 && fdt == DT_F32 && odt == DT_F32)) { + CHECK_STATUS(NOT_MATCH); + } + if (!(idf == DF_NCHWC8 && odf == DF_NCHWC8)) { + CHECK_STATUS(NOT_MATCH); + } + if (ic != fc) { + CHECK_STATUS(NOT_MATCH); + } + + EE ret = NOT_MATCH; + if (algorithm == DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT || + algorithm == DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT) { + ret = depthwise_convolution_direct(inputDesc, input, dwFilterDesc, dwFilter, pwFilterDesc, + pwFilter, convParamSpec, dwBiasDesc, dwBias, pwBiasDesc, pwBias, tmpBytes, tmp, + outputDesc, output, depthwiseActivationParamSpec, pointwiseActivationParamSpec); + } + return ret; +} diff --git a/compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution_transform.cpp b/compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution_transform.cpp new file mode 100644 index 00000000..5f5e7958 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution_transform.cpp @@ -0,0 +1,38 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "cpu/x86/fp32/tensor_computing_fp32.h" + +EE depthwise_pointwise_convolution_transform_filter_fp32(TensorDesc dwFilterDesc, + const F32 *dwFilter, + TensorDesc pwFilterDesc, + const F32 *pwFilter, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc *dwFtmDesc, + F32 *dwFilterTransformed, + TensorDesc *pwFtmDesc, + F32 *pwFilterTransformed) +{ + EE ret = depthwise_convolution_transform_filter_fp32(dwFilterDesc, dwFilter, + DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT, dwFtmDesc, dwFilterTransformed); + CHECK_STATUS(ret); + if (pwFilter == nullptr) { + return ret; + } + + ConvolutionParamSpec nullSpec; + ret = convolution_transform_filter_fp32(pwFilterDesc, pwFilter, nullSpec, + CONVOLUTION_ALGORITHM_POINTWISE, pwFtmDesc, pwFilterTransformed); + CHECK_STATUS(ret); + return ret; +} diff --git a/compute/tensor/src/cpu/x86/fp32/eltwise.cpp b/compute/tensor/src/cpu/x86/fp32/eltwise.cpp new file mode 100644 index 00000000..5a18f884 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/eltwise.cpp @@ -0,0 +1,82 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#include "cpu/cpu_functions.h" + +EE eltwise_fp32(std::vector input, + std::vector inputSize, + U32 num, + U32 len, + void *output, + EltwiseMode eltwiseMode) +{ + F32 buffer[8]; + U32 len_tail = len % 8; + U32 len_main = len - len_tail; + + F32 *tmp = buffer; + F32 *output_ptr = (F32 *)output; + for (U32 i = 0; i < len_main; i += 8) { + get_vector((F32 *)input[0], inputSize[0], &tmp, 8, i, 8, buffer); + __m256 tmp_v = _mm256_loadu_ps(tmp); + for (U32 j = 1; j < num; j++) { + get_vector((F32 *)input[j], inputSize[j], &tmp, 8, i, 8, buffer); + __m256 value_v = _mm256_loadu_ps(tmp); + switch (eltwiseMode) { + case ELTWISE_SUM: + tmp_v = _mm256_add_ps(value_v, tmp_v); + break; + case ELTWISE_MAX: + tmp_v = _mm256_max_ps(value_v, tmp_v); + break; + case ELTWISE_PROD: + tmp_v = _mm256_mul_ps(value_v, tmp_v); + break; + case ELTWISE_SUB: + tmp_v = _mm256_sub_ps(tmp_v, value_v); + break; + case ELTWISE_DIV: + tmp_v = _mm256_div_ps(tmp_v, value_v); + break; + default: + return NOT_SUPPORTED; + } + } + _mm256_storeu_ps(output_ptr + i, tmp_v); + } + + for (U32 i = len_main; i < len; i++) { + get_vector((F32 *)input[0], inputSize[0], &tmp, 8, i, 1, buffer); + F32 tmp_s = tmp[0]; + for (U32 j = 1; j < num; j++) { + get_vector((F32 *)input[j], inputSize[j], &tmp, 8, i, 1, buffer); + F32 value_s = tmp[0]; + switch (eltwiseMode) { + case ELTWISE_SUM: + tmp_s = value_s + tmp_s; + break; + case ELTWISE_MAX: + tmp_s = (value_s > tmp_s) ? value_s : tmp_s; + break; + case ELTWISE_PROD: + tmp_s *= value_s; + break; + default: + return NOT_SUPPORTED; + } + } + output_ptr[i] = tmp_s; + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/l2normalization.cpp b/compute/tensor/src/cpu/x86/fp32/l2normalization.cpp new file mode 100644 index 00000000..0ba99ca5 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/l2normalization.cpp @@ -0,0 +1,64 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#include "cpu/x86/tensor_computing_x86.h" +#include "cpu/x86/fp32/x86_functions_fp32.h" + +EE l2normalization_fp32(TensorDesc inputDesc, const F32 *input, TensorDesc outputDesc, F32 *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, odt; + DataFormat idf, odf; + U32 ic = 0, ih = 0, iw = 0, oh = 0, ow = 0; + if (tensorIs2d(inputDesc)) { + CHECK_STATUS(tensor2dGet(inputDesc, &idt, &idf, &ih, &iw)); + ic = 1; + CHECK_STATUS(tensor2dGet(outputDesc, &odt, &odf, &oh, &ow)); + } else if (tensorIs3d(inputDesc)) { + U32 oc = 0; + CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &ic, &ih, &iw)); + CHECK_STATUS(tensor3dGet(outputDesc, &odt, &odf, &oc, &oh, &ow)); + CHECK_REQUIREMENT(ic == oc); + } else if (tensorIs4d(inputDesc)) { + idt = inputDesc.dt; + ic = inputDesc.dims[0]; + ih = inputDesc.dims[1]; + iw = inputDesc.dims[2]; + } else { + CHECK_STATUS(NOT_MATCH); + } + + // l2norm -> x / sqrt(sum(x^2)) + for (U32 c = 0; c < ic; c++) { + for (U32 h = 0; h < ih; h++) { + U32 index_off = (c * ih + h) * iw; + F32 sum_row = array_var_f32(input + index_off, (I32)iw, 0.f) * static_cast(iw); + F32 sqrt_sum_row = sqrt(sum_row); + __m256 sqrt_sum_row_4 = _mm256_set1_ps(sqrt_sum_row); + __m256 in, out; + U32 w = 0; + for (w = 0; w < iw - 7; w += 8) { + in = _mm256_loadu_ps(input + index_off + w); + out = _mm256_div_ps(in, sqrt_sum_row_4); + _mm256_storeu_ps(output + index_off + w, out); + } + for (; w < iw; w++) { + output[index_off + w] = input[index_off + w] / sqrt_sum_row; + } + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/lstm.cpp b/compute/tensor/src/cpu/x86/fp32/lstm.cpp new file mode 100644 index 00000000..3ede5f14 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/lstm.cpp @@ -0,0 +1,318 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include +#include "cpu/x86/fp32/tensor_computing_fp32.h" + +void mvm_nkn32_with_bias(U32 fn, U32 fk, const F32 *filterArray, const F32 *input, F32 *output, const F32 *bias) +{ +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 n = 0; n < fn; ++n) { + const F32 *f = filterArray + n * fk * 32; + F32 *out = output + n * 32; + const F32 *b = bias + n * 32; + if (bias == nullptr) { + __asm__ __volatile__("vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + : + : + :"%ymm0", "%ymm1", "%ymm2", "%ymm3"); + } else { + __asm__ __volatile__("vmovups (%0), %%ymm0 \n\t" + "vmovups 0x20(%0), %%ymm1 \n\t" + "vmovups 0x40(%0), %%ymm2 \n\t" + "vmovups 0x60(%0), %%ymm3 \n\t" + : + :"r"(b) + :"%ymm0", "%ymm1", "%ymm2", "%ymm3"); + } + __asm__ __volatile__("mov %1, %%rax \n\t" + "mov %3, %%ecx \n\t" + "shr $3, %%ecx \n\t" + "je 1f \n\t" + ".align 16 \n\t" + "0: \n\t" + + "vmovups (%0), %%ymm4 \n\t" + "vmovups 0x20(%0), %%ymm5 \n\t" + "vmovups 0x40(%0), %%ymm6 \n\t" + "vmovups 0x60(%0), %%ymm7 \n\t" + "vbroadcastss 0x0(%%rax), %%ymm8 \n\t" + "vmovups 0x80(%0), %%ymm9 \n\t" + "vmovups 0xA0(%0), %%ymm10 \n\t" + "vmovups 0xC0(%0), %%ymm11 \n\t" + "vmovups 0xE0(%0), %%ymm12 \n\t" + "vbroadcastss 0x4(%%rax), %%ymm13 \n\t" + "vfmadd231ps %%ymm8, %%ymm4, %%ymm0 \n\t" + "vfmadd231ps %%ymm8, %%ymm5, %%ymm1 \n\t" + "vfmadd231ps %%ymm8, %%ymm6, %%ymm2 \n\t" + "vfmadd231ps %%ymm8, %%ymm7, %%ymm3 \n\t" + + "vmovups 0x100(%0), %%ymm4 \n\t" + "vmovups 0x120(%0), %%ymm5 \n\t" + "vmovups 0x140(%0), %%ymm6 \n\t" + "vmovups 0x160(%0), %%ymm7 \n\t" + "vbroadcastss 0x8(%%rax), %%ymm8 \n\t" + "vfmadd231ps %%ymm13, %%ymm9, %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm10, %%ymm1 \n\t" + "vfmadd231ps %%ymm13, %%ymm11, %%ymm2 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + + "vmovups 0x180(%0), %%ymm9 \n\t" + "vmovups 0x1A0(%0), %%ymm10 \n\t" + "vmovups 0x1C0(%0), %%ymm11 \n\t" + "vmovups 0x1E0(%0), %%ymm12 \n\t" + "vbroadcastss 0xC(%%rax), %%ymm13 \n\t" + "vfmadd231ps %%ymm8, %%ymm4, %%ymm0 \n\t" + "vfmadd231ps %%ymm8, %%ymm5, %%ymm1 \n\t" + "vfmadd231ps %%ymm8, %%ymm6, %%ymm2 \n\t" + "vfmadd231ps %%ymm8, %%ymm7, %%ymm3 \n\t" + + "vmovups 0x200(%0), %%ymm4 \n\t" + "vmovups 0x220(%0), %%ymm5 \n\t" + "vmovups 0x240(%0), %%ymm6 \n\t" + "vmovups 0x260(%0), %%ymm7 \n\t" + "vbroadcastss 0x10(%%rax), %%ymm8 \n\t" + "vfmadd231ps %%ymm13, %%ymm9 , %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm10, %%ymm1 \n\t" + "vfmadd231ps %%ymm13, %%ymm11, %%ymm2 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + + "vmovups 0x280(%0), %%ymm9 \n\t" + "vmovups 0x2A0(%0), %%ymm10 \n\t" + "vmovups 0x2C0(%0), %%ymm11 \n\t" + "vmovups 0x2E0(%0), %%ymm12 \n\t" + "vbroadcastss 0x14(%%rax), %%ymm13 \n\t" + "vfmadd231ps %%ymm8, %%ymm4, %%ymm0 \n\t" + "vfmadd231ps %%ymm8, %%ymm5, %%ymm1 \n\t" + "vfmadd231ps %%ymm8, %%ymm6, %%ymm2 \n\t" + "vfmadd231ps %%ymm8, %%ymm7, %%ymm3 \n\t" + + "vmovups 0x300(%0), %%ymm4 \n\t" + "vmovups 0x320(%0), %%ymm5 \n\t" + "vmovups 0x340(%0), %%ymm6 \n\t" + "vmovups 0x360(%0), %%ymm7 \n\t" + "vbroadcastss 0x18(%%rax), %%ymm8 \n\t" + "vfmadd231ps %%ymm13, %%ymm9 , %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm10, %%ymm1 \n\t" + "vfmadd231ps %%ymm13, %%ymm11, %%ymm2 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + + "vmovups 0x380(%0), %%ymm9 \n\t" + "vmovups 0x3A0(%0), %%ymm10 \n\t" + "vmovups 0x3C0(%0), %%ymm11 \n\t" + "vmovups 0x3E0(%0), %%ymm12 \n\t" + "vbroadcastss 0x1C(%%rax), %%ymm13 
\n\t" + "vfmadd231ps %%ymm8, %%ymm4, %%ymm0 \n\t" + "vfmadd231ps %%ymm8, %%ymm5, %%ymm1 \n\t" + "vfmadd231ps %%ymm8, %%ymm6, %%ymm2 \n\t" + "vfmadd231ps %%ymm8, %%ymm7, %%ymm3 \n\t" + + "vfmadd231ps %%ymm13, %%ymm9 , %%ymm0 \n\t" + "vfmadd231ps %%ymm13, %%ymm10, %%ymm1 \n\t" + "vfmadd231ps %%ymm13, %%ymm11, %%ymm2 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm3 \n\t" + + "add $0x400, %0 \n\t" + "add $0x20, %%rax \n\t" + + "sub $1, %%ecx \n\t" + "jg 0b \n\t" + ".align 16 \n\t" + "1: \n\t" + + "mov %3, %%ecx \n\t" + "and $7, %%ecx \n\t" + "je 3f \n\t" + "2: \n\t" + "vmovups (%0), %%ymm4 \n\t" + "vmovups 0x20(%0), %%ymm5 \n\t" + "vmovups 0x40(%0), %%ymm6 \n\t" + "vmovups 0x60(%0), %%ymm7 \n\t" + "vbroadcastss (%%rax), %%ymm8 \n\t" + "vfmadd231ps %%ymm8, %%ymm4, %%ymm0 \n\t" + "vfmadd231ps %%ymm8, %%ymm5, %%ymm1 \n\t" + "vfmadd231ps %%ymm8, %%ymm6, %%ymm2 \n\t" + "vfmadd231ps %%ymm8, %%ymm7, %%ymm3 \n\t" + "add $0x80, %0 \n\t" + "add $0x4, %%rax \n\t" + "sub $1, %%ecx \n\t" + "jg 2b \n\t" + + "3: \n\t" + "vmovups %%ymm0, (%2) \n\t" + "vmovups %%ymm1, 0x20(%2) \n\t" + "vmovups %%ymm2, 0x40(%2) \n\t" + "vmovups %%ymm3, 0x60(%2) \n\t" + + : "+r"(f) + : "r"(input), "r"(out), "r"(fk) + : "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", + "memory"); + } +} + +EE rnncell_fp32(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + void *state, + U32 tmpBytes, + void *tmp, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + void *output, + Arch arch) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + UNUSED(arch); + if (nullptr == currentX || nullptr == filter || nullptr == bias || nullptr == state || + nullptr == tmp || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ix; + U32 on, oh; + U32 fk, fn; + CHECK_STATUS(tensor2dGet(xDesc, &idt, &idf, &in, &ix)); + CHECK_STATUS(tensor2dGet(filterDesc[0], &fdt, &fdf, &fn, &fk)); + CHECK_STATUS(tensor2dGet(hDesc, &odt, &odf, &on, &oh)); + if (fdf != DF_NKN32) { + CHECK_STATUS(NOT_MATCH); + } + fn /= 32; + + U32 batch = in; + I32 xDim = ix; + I32 hDim = rnnParamSpec.numOutput; + I32 column = (rnnParamSpec.numProjection > 0) ? 
rnnParamSpec.numProjection + : rnnParamSpec.numOutput; + if (!(idt == DT_F32 && fdt == DT_F32 && odt == DT_F32)) { + CHECK_STATUS(NOT_MATCH); + } + if (!(4 * column == (I32)fn * 32 && (ix + oh) == fk && in == on)) { + CHECK_STATUS(NOT_MATCH); + } + F32 forgetBias = rnnParamSpec.forgetBias; + ActivationMode activationMode = rnnParamSpec.activationMode; + if (activationMode != ACTIVATION_TANH) { + CHECK_STATUS(NOT_SUPPORTED); + } + + const F32 *currentXArray = (const F32 *)currentX; + F32 *lastStateArray = (F32 *)state; + F32 *lastHArray = lastStateArray + column; + F32 *tmpArray = (F32 *)tmp; + F32 *currentStateArray = (F32 *)state; + F32 *currentHArray = currentStateArray + column; + F32 *outputArray = (F32 *)output; + F32 *xhArray = tmpArray; + F32 *intermediateH = xhArray + (xDim + hDim); + U32 lastStateStride = column + hDim; + U32 lastHStride = column + hDim; + U32 currentStateStride = column + hDim; + U32 currentHStride = column + hDim; + __m256 forgetBiasVector = _mm256_set1_ps(forgetBias); + for (U32 m = 0; m < batch; m++) { + F32 *lastBatchH = lastHArray + m * lastHStride; + memcpy(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F32)); + memcpy(xhArray + xDim, lastBatchH, hDim * sizeof(F32)); + mvm_nkn32_with_bias(fn, fk, (const F32 *)filter[0], xhArray, intermediateH, (const F32 *)bias[0]); + + F32 *out_i = intermediateH; + F32 *out_g = out_i + column; + F32 *out_f = out_i + column * 2; + F32 *out_o = out_i + column * 3; + + F32 *lastBatchState = lastStateArray + m * lastStateStride; + F32 *currentBatchState = currentStateArray + m * currentStateStride; + F32 *currentBatchH = currentHArray + m * currentHStride; + F32 *currentOutput = outputArray + m * batchStrideH; + + F32 *tmpState, *tmpHH, *tmpH; + if (rnnParamSpec.zoneoutCell == 0) { + tmpState = currentBatchState; + } else { + tmpState = out_i; + } + if (rnnParamSpec.numProjection > 0) { + tmpHH = out_g; + tmpH = currentOutput; + } else { + tmpHH = currentOutput; + tmpH = out_g; + } + + I32 h = 0; + for (; h < column - 7; h += 8) { + __m256 out_i_v = _mm256_loadu_ps(out_i + h); + __m256 out_g_v = _mm256_loadu_ps(out_g + h); + __m256 out_f_v = _mm256_loadu_ps(out_f + h); + __m256 out_o_v = _mm256_loadu_ps(out_o + h); + __m256 C_v = _mm256_loadu_ps(lastBatchState + h); + __m256 I_v = _mm256_sigmod_ps(out_i_v); + __m256 F_v = _mm256_sigmod_ps(_mm256_add_ps(out_f_v, forgetBiasVector)); + __m256 O_v = _mm256_sigmod_ps(out_o_v); + __m256 G_v = _mm256_tanh_ps(out_g_v); + C_v = _mm256_add_ps(_mm256_mul_ps(C_v, F_v), _mm256_mul_ps(I_v, G_v)); + __m256 out_hidden_v = _mm256_mul_ps(O_v, _mm256_tanh_ps(C_v)); + _mm256_storeu_ps(tmpState + h, C_v); + _mm256_storeu_ps(tmpHH + h, out_hidden_v); + } + for (; h < column; h++) { + F32 C_s = lastBatchState[h]; + F32 I_s = 1.0 / (1.0 + exp(-out_i[h])); + F32 F_s = 1.0 / (1.0 + exp(-(out_f[h] + forgetBias))); + F32 O_s = 1.0 / (1.0 + exp(-out_o[h])); + F32 G_s = tanh(out_g[h]); + C_s = C_s * F_s + I_s * G_s; + F32 value = O_s * tanh(C_s); + tmpState[h] = C_s; + tmpHH[h] = value; + } + if (rnnParamSpec.zoneoutCell != 0) { + array_scale_f32(tmpState, tmpState, column, 1 - rnnParamSpec.zoneoutCell, 0); + array_scale_f32(lastBatchState, lastBatchState, column, rnnParamSpec.zoneoutCell, 0); + array_add_f32(tmpState, lastBatchState, currentBatchState, column); + } + + if (rnnParamSpec.numProjection > 0) { + mvm_nkn32_with_bias(hDim / 32, rnnParamSpec.numProjection, (const F32 *)filter[1], tmpHH, tmpH, nullptr); + } + + if (rnnParamSpec.zoneoutOutput != 0) { + if (rnnParamSpec.numProjection > 0) 
{ + array_scale_f32(tmpH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + } else { + array_scale_f32(tmpHH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + } + array_scale_f32(lastBatchH, lastBatchH, hDim, rnnParamSpec.zoneoutOutput, 0); + array_add_f32(out_f, lastBatchH, currentBatchH, hDim); + } else { + memcpy(currentBatchH, currentOutput, sizeof(F32) * hDim); + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/normalization.cpp b/compute/tensor/src/cpu/x86/fp32/normalization.cpp new file mode 100644 index 00000000..46b08655 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/normalization.cpp @@ -0,0 +1,62 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
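Before the layer-normalization code, a scalar recap of the gate math `rnncell_fp32` above implements: `intermediateH` holds the four gate pre-activations [i, g, f, o], each `column` wide, produced by a single fused matrix-vector product over the concatenated [x, h]:

```cpp
#include <cmath>

// c' = sigmoid(f + forgetBias) * c + sigmoid(i) * tanh(g)
// h' = sigmoid(o) * tanh(c')   (before the optional projection / zoneout)
static void lstm_gates_ref(const float *i_, const float *g_, const float *f_,
    const float *o_, float *c, float *h, int column, float forgetBias)
{
    for (int k = 0; k < column; ++k) {
        float I = 1.f / (1.f + std::exp(-i_[k]));
        float F = 1.f / (1.f + std::exp(-(f_[k] + forgetBias)));
        float O = 1.f / (1.f + std::exp(-o_[k]));
        float G = std::tanh(g_[k]);
        c[k] = c[k] * F + I * G;
        h[k] = O * std::tanh(c[k]);
    }
}
```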
+
+#include <math.h>
+#include "cpu/x86/fp32/tensor_computing_fp32.h"
+
+inline void array_norm_scale_fp32(
+    F32 *input, F32 *output, I32 len, F32 mean, F32 var, F32 *alpha, F32 *beta)
+{
+    F32 eps = 1e-6;
+    F32 std_value = sqrt(var + eps);
+    __m256 mean_v = _mm256_set1_ps(mean);
+    __m256 std_v = _mm256_set1_ps(std_value);
+
+    I32 i = 0;
+    for (i = 0; i < len - 7; i += 8) {
+        __m256 in = _mm256_loadu_ps(input + i);
+        __m256 alpha_v = _mm256_loadu_ps(alpha + i);
+        __m256 beta_v = _mm256_loadu_ps(beta + i);
+
+        __m256 tmp_v = _mm256_sub_ps(in, mean_v);
+        tmp_v = _mm256_div_ps(tmp_v, std_v);
+        tmp_v = _mm256_fmadd_ps(alpha_v, tmp_v, beta_v);
+        _mm256_storeu_ps(output + i, tmp_v);
+    }
+    for (; i < len; i++) {
+        output[i] = alpha[i] * (input[i] - mean) / std_value + beta[i];
+    }
+}
+
+EE layer_normalization_fp32(
+    TensorDesc inputDesc, F32 *input, F32 *alpha, F32 *beta, TensorDesc outputDesc, F32 *output)
+{
+    UNUSED(outputDesc);
+    if (nullptr == alpha || nullptr == beta || nullptr == input || nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+
+    U32 size = tensorNumElements(inputDesc);
+    I32 size_inner = inputDesc.dims[0];
+    I32 size_outer = size / size_inner;
+    for (I32 i = 0; i < size_outer; i++) {
+        F32 *current_input = input + i * size_inner;
+        F32 *current_output = output + i * size_inner;
+        F32 mean = array_mean_f32(current_input, size_inner);
+        F32 var = array_var_f32(current_input, size_inner, mean);
+
+        array_norm_scale_fp32(current_input, current_output, size_inner, mean, var, alpha, beta);
+    }
+
+    return SUCCESS;
+}
diff --git a/compute/tensor/src/cpu/x86/fp32/pooling.cpp b/compute/tensor/src/cpu/x86/fp32/pooling.cpp
new file mode 100644
index 00000000..0a22e198
--- /dev/null
+++ b/compute/tensor/src/cpu/x86/fp32/pooling.cpp
@@ -0,0 +1,394 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
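The pooling code that follows clamps each window to the input before dispatching to the assembly kernels, so `kh`/`kw` are the clipped extents and the mean divisor `padSize = kh * kw` counts only pixels actually read (exclusive-of-padding averaging). Scalar picture for a single channel (our sketch, NCHW for readability):

```cpp
#include <algorithm>

// Mean pooling at output (oy, ox) with a square kernel, clipping the
// window at the borders exactly as pooling_fp32 below does.
static float mean_pool_at(const float *in, int ih, int iw, int oy, int ox,
    int kernel, int stride, int pad)
{
    int hstart = oy * stride - pad;
    int wstart = ox * stride - pad;
    int hend = std::min(hstart + kernel, ih);
    int wend = std::min(wstart + kernel, iw);
    hstart = std::max(hstart, 0);
    wstart = std::max(wstart, 0);
    float sum = 0.f;
    for (int h = hstart; h < hend; ++h) {
        for (int w = wstart; w < wend; ++w) {
            sum += in[h * iw + w];
        }
    }
    return sum / static_cast<float>((hend - hstart) * (wend - wstart));
}
```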
+ +#include +#include "cpu/x86/fp32/tensor_computing_fp32.h" + +#define UNROLL_W 4 + +typedef void (*pooling_max_func)(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride); +typedef void (*pooling_mean_func)( + const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 padSize); + +void pooling_max_w4(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __asm__ __volatile__("mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %%eax, %%eax \n\t" + "mov %5, %%eax \n\t" + "mov %%rax, %%r9 \n\t" + "add %%r9, %%r9 \n\t" + "mov %%rax, %%r10 \n\t" + "add %%r9, %%r10 \n\t" + "add %0, %%rax \n\t" + "add %0, %%r9 \n\t" + "add %0, %%r10 \n\t" + + "vmovups (%0), %%ymm0 \n\t" + "vmovups (%%rax), %%ymm1 \n\t" + "vmovups (%%r9), %%ymm2 \n\t" + "vmovups (%%r10), %%ymm3 \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%0), %%ymm4 \n\t" + "vmovups (%%rax), %%ymm5 \n\t" + "vmovups (%%r9), %%ymm6 \n\t" + "vmovups (%%r10), %%ymm7 \n\t" + + "vmaxps %%ymm0, %%ymm4, %%ymm0 \n\t" + "vmaxps %%ymm1, %%ymm5, %%ymm1 \n\t" + "vmaxps %%ymm2, %%ymm6, %%ymm2 \n\t" + "vmaxps %%ymm3, %%ymm7, %%ymm3 \n\t" + + "add $0x20, %0 \n\t" + "add $0x20, %%rax \n\t" + "add $0x20, %%r9 \n\t" + "add $0x20, %%r10 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "add %%rdi, %%r9 \n\t" + "add %%rdi, %%r10 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + "vmovups %%ymm2, 0x40(%1) \n\t" + "vmovups %%ymm3, 0x60(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) + : "%eax", "%rax", "%ecx", "%r10", "%r9", "%rdi", "%ymm0", "%ymm1", "%ymm2", + "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "memory", "cc"); +} + +void pooling_max_w2(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __asm__ __volatile__( + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %%eax, %%eax \n\t" + "mov %5, %%eax \n\t" + "add %0, %%rax \n\t" + + "vmovups (%0), %%ymm0 \n\t" + "vmovups (%%rax), %%ymm1 \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%0), %%ymm4 \n\t" + "vmovups (%%rax), %%ymm5 \n\t" + + "vmaxps %%ymm0, %%ymm4, %%ymm0 \n\t" + "vmaxps %%ymm1, %%ymm5, %%ymm1 \n\t" + + "add $0x20, %0 \n\t" + "add $0x20, %%rax \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) + : "%eax", "%rax", "%ecx", "%rdi", "%ymm0", "%ymm1", "%ymm4", "%ymm5", "memory", "cc"); +} + +void pooling_max_w1(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __asm__ __volatile__("mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + + "vmovups (%0), %%ymm0 \n\t" + ".align 16 \n\t" + "0: \n\t" + + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%0), %%ymm4 \n\t" + "vmaxps %%ymm0, %%ymm4, %%ymm0 \n\t" + + "add $0x20, %0 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + "add %%rdi, %0 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + + "vmovups %%ymm0, (%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) + : "%eax", "%rax", "%ecx", "%rdi", "%ymm0", "%ymm4", "memory", "cc"); +} + +void pooling_mean_w4(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 
iStep, U32 stride, F32 padSize) +{ + __asm__ __volatile__( + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %5, %%eax \n\t" + "mov %%rax, %%r9 \n\t" + "add %%r9, %%r9 \n\t" + "mov %%rax, %%r10 \n\t" + "add %%r9, %%r10 \n\t" + "add %0, %%rax \n\t" + "add %0, %%r9 \n\t" + "add %0, %%r10 \n\t" + + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%0), %%ymm4 \n\t" + "vmovups (%%rax), %%ymm5 \n\t" + "vmovups (%%r9), %%ymm6 \n\t" + "vmovups (%%r10), %%ymm7 \n\t" + + "vaddps %%ymm0, %%ymm4, %%ymm0 \n\t" + "vaddps %%ymm1, %%ymm5, %%ymm1 \n\t" + "vaddps %%ymm2, %%ymm6, %%ymm2 \n\t" + "vaddps %%ymm3, %%ymm7, %%ymm3 \n\t" + + "add $0x20, %0 \n\t" + "add $0x20, %%rax \n\t" + "add $0x20, %%r9 \n\t" + "add $0x20, %%r10 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "add %%rdi, %%r9 \n\t" + "add %%rdi, %%r10 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + + "vbroadcastss (%6), %%ymm4 \n\t" + "vdivps %%ymm4, %%ymm0, %%ymm0 \n\t" + "vdivps %%ymm4, %%ymm1, %%ymm1 \n\t" + "vdivps %%ymm4, %%ymm2, %%ymm2 \n\t" + "vdivps %%ymm4, %%ymm3, %%ymm3 \n\t" + + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + "vmovups %%ymm2, 0x40(%1) \n\t" + "vmovups %%ymm3, 0x60(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&padSize) + : "%eax", "%rax", "%ecx", "%r10", "%r9", "%rdi", "%ymm0", "%ymm1", "%ymm2", "%ymm3", + "%ymm4", "%ymm5", "%ymm6", "%ymm7", "memory", "cc"); +} + +void pooling_mean_w2(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 padSize) +{ + __asm__ __volatile__( + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %5, %%eax \n\t" + "add %0, %%rax \n\t" + + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%0), %%ymm4 \n\t" + "vmovups (%%rax), %%ymm5 \n\t" + + "vaddps %%ymm0, %%ymm4, %%ymm0 \n\t" + "vaddps %%ymm1, %%ymm5, %%ymm1 \n\t" + + "add $0x20, %0 \n\t" + "add $0x20, %%rax \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + + "vbroadcastss (%6), %%ymm4 \n\t" + "vdivps %%ymm4, %%ymm0, %%ymm0 \n\t" + "vdivps %%ymm4, %%ymm1, %%ymm1 \n\t" + + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&padSize) + : "%eax", "%rax", "%ecx", "%rdi", "%ymm0", "%ymm1", "%ymm4", "%ymm5", "memory", "cc"); +} + +void pooling_mean_w1(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 padSize) +{ + __asm__ __volatile__( + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%0), %%ymm4 \n\t" + + "vaddps %%ymm0, %%ymm4, %%ymm0 \n\t" + + "add $0x20, %0 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + "add %%rdi, %0 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + + "vbroadcastss (%6), %%ymm4 \n\t" + "vdivps %%ymm4, %%ymm0, %%ymm0 \n\t" + + "vmovups %%ymm0, (%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&padSize) + : "%eax", "%rax", "%ecx", "%rdi", "%ymm0", 
"%ymm4", "memory", "cc"); +} + +EE pooling_fp32(TensorDesc inputDesc, + const F32 *input, + PoolingParamSpec poolingParamSpec, + TensorDesc outputDesc, + F32 *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, odt; + DataFormat idf, odf; + U32 in = 0, ic = 0, ih = 0, iw = 0, on = 0, oc = 0, oh = 0, ow = 0; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (idt != odt || idt != DT_F32) { + CHECK_STATUS(NOT_MATCH); + } + if (in != on || ic != oc) { + CHECK_STATUS(NOT_MATCH); + } + if (idf != DF_NCHWC8 || odf != idf) { + CHECK_STATUS(NOT_MATCH); + } + + PoolingMode pm = poolingParamSpec.mode; + U32 strideH = poolingParamSpec.stride_h; + U32 strideW = poolingParamSpec.stride_w; + U32 paddingT = poolingParamSpec.padding_top; + U32 paddingL = poolingParamSpec.padding_left; + U32 kernelSizeH = poolingParamSpec.kernel_h; + U32 kernelSizeW = poolingParamSpec.kernel_w; + U32 wSize, kh, kw, iStep; + F32 padSize, *curO; + const F32 *curI; + if (paddingT >= kernelSizeH || paddingL >= kernelSizeW) { + CHECK_STATUS(NOT_SUPPORTED); + } + + ic /= 8; + U32 wSizes[3] = {1, 2, 4}; + pooling_max_func pooling_max[3] = {pooling_max_w1, pooling_max_w2, pooling_max_w4}; + pooling_mean_func pooling_mean[3] = {pooling_mean_w1, pooling_mean_w2, pooling_mean_w4}; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 h = 0; h < oh; h++) { + for (U32 w = 0; w < ow; w += wSize) { + wSize = UNI_MIN(ow - w - paddingL / strideW, UNROLL_W); + wSize = wSizes[wSize >> 1]; + int hstart = (int)h * (int)strideH - (int)paddingT; + int wstart = (int)w * (int)strideW - (int)paddingL; + int hend = UNI_MIN(hstart + kernelSizeH, ih); + int wend = UNI_MIN(wstart + kernelSizeW, iw); + hstart = UNI_MAX(hstart, 0); + wstart = UNI_MAX(wstart, 0); + + curI = input + (hstart * iw + wstart) * 8; + curO = output + (h * ow + w) * 8; + kh = hend - hstart; + kw = wend - wstart; + iStep = (iw - kw) * 32; + padSize = kw * kh * 1.0f; + if (kw < kernelSizeW) { + wSize = 1; + } + + switch (pm) { + case POOLING_MAX: { + pooling_max[wSize >> 1](curI, curO, kw, kh, iStep, strideW * 32); + break; + } + case POOLING_MEAN: { + pooling_mean[wSize >> 1]( + curI, curO, kw, kh, iStep, strideW * 32, padSize); + break; + } + default: + CHECK_STATUS(NOT_SUPPORTED); + } + } + } + input += ih * iw * 8; + output += oh * ow * 8; + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/scale.cpp b/compute/tensor/src/cpu/x86/fp32/scale.cpp new file mode 100644 index 00000000..673ff9d8 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/scale.cpp @@ -0,0 +1,119 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/x86/fp32/tensor_computing_fp32.h" + +EE scale_nchwc8_fp32( + F32 *input, F32 *alpha, F32 *beta, I32 in, I32 ic, I32 elements_per_channel, F32 *output) +{ + __m256 in_vec, out_vec; + __m256 one = _mm256_set1_ps(1.); + __m256 zero = _mm256_set1_ps(0.); + U32 index = 0; + for (I32 n = 0; n < in; n++) { + for (I32 c = 0; c < ic; c += 8) { + __m256 alpha_vec = (alpha == nullptr) ? one : _mm256_loadu_ps(alpha + c); + __m256 beta_vec = (beta == nullptr) ? zero : _mm256_loadu_ps(beta + c); + for (I32 i = 0; i < elements_per_channel; i++) { + in_vec = _mm256_loadu_ps(input + index); + out_vec = _mm256_fmadd_ps(alpha_vec, in_vec, beta_vec); + _mm256_storeu_ps(output + index, out_vec); + index += 8; + } + } + } + return SUCCESS; +} + +EE scale_nchw_fp32( + F32 *input, F32 *alpha, F32 *beta, I32 in, I32 ic, I32 elements_per_channel, F32 *output) +{ + __m256 one = _mm256_set1_ps(1.); + __m256 zero = _mm256_set1_ps(0.); + U32 index = 0; + for (I32 n = 0; n < in; n++) { + for (I32 c = 0; c < ic; c++) { + __m256 alpha_vec = (alpha == nullptr) ? one : _mm256_set1_ps(alpha[c]); + __m256 beta_vec = (beta == nullptr) ? zero : _mm256_set1_ps(beta[c]); + I32 i = 0; + for (; i < elements_per_channel - 7; i += 8) { + __m256 in_vec = _mm256_loadu_ps(input + index); + __m256 out_vec = _mm256_fmadd_ps(alpha_vec, in_vec, beta_vec); + _mm256_storeu_ps(output + index, out_vec); + index += 8; + } + for (; i < elements_per_channel; i++) { + float alpha_s = (alpha == nullptr) ? 1 : alpha[c]; + float beta_s = (beta == nullptr) ? 0 : beta[c]; + output[index] = alpha_s * input[index] + beta_s; + index++; + } + } + } + return SUCCESS; +} + +EE scale_nhwc_fp32( + F32 *input, F32 *alpha, F32 *beta, I32 in, I32 ic, I32 elements_per_channel, F32 *output) +{ + __m256 one = _mm256_set1_ps(1.); + __m256 zero = _mm256_set1_ps(0.); + U32 index = 0; + for (I32 n = 0; n < in; n++) { + for (I32 i = 0; i < elements_per_channel; i++) { + I32 c = 0; + for (; c < ic - 7; c += 8) { + __m256 alpha_vec = (alpha == nullptr) ? one : _mm256_loadu_ps(alpha + c); + __m256 beta_vec = (beta == nullptr) ? zero : _mm256_loadu_ps(beta + c); + __m256 in_vec = _mm256_loadu_ps(input + index); + __m256 out_vec = _mm256_fmadd_ps(alpha_vec, in_vec, beta_vec); + _mm256_storeu_ps(output + index, out_vec); + index += 8; + } + for (; c < ic; c++) { + float alpha_s = (alpha == nullptr) ? 1 : alpha[c]; + float beta_s = (beta == nullptr) ? 0 : beta[c]; + output[index] = alpha_s * input[index] + beta_s; + index++; + } + } + } + return SUCCESS; +} + +EE scale_fp32(F32 *input, + I32 axis, + I32 nDims, + F32 *alpha, + F32 *beta, + I32 in, + I32 ic, + I32 elements_per_channel, + F32 *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + EE ret = SUCCESS; + // If ic is 1, it means that weights/vectors have only one param, so we need use the calculation logic of nchw. 
+ if (axis == 1 || axis == 0 || ic == 1) { + ret = scale_nchw_fp32(input, alpha, beta, in, ic, elements_per_channel, output); + } else if (axis == nDims - 1) { + ret = scale_nhwc_fp32(input, alpha, beta, in, ic, elements_per_channel, output); + } else if (axis == nDims) { + ret = scale_nchwc8_fp32(input, alpha, beta, in, ic, elements_per_channel, output); + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + return ret; +} diff --git a/compute/tensor/src/cpu/x86/fp32/softmax.cpp b/compute/tensor/src/cpu/x86/fp32/softmax.cpp new file mode 100644 index 00000000..1ff108db --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/softmax.cpp @@ -0,0 +1,139 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
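The softmax below uses the standard max-subtraction trick: every exponent becomes <= 0, so `exp` cannot overflow, and the shift cancels out in the normalization. In scalar form:

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>

// softmax(x)_i = exp(x_i - max(x)) / sum_j exp(x_j - max(x))
static void softmax_row_ref(const float *x, float *y, size_t n)
{
    float mx = x[0];
    for (size_t i = 1; i < n; ++i) {
        mx = std::max(mx, x[i]);
    }
    float sum = 0.f;
    for (size_t i = 0; i < n; ++i) {
        y[i] = std::exp(x[i] - mx);
        sum += y[i];
    }
    for (size_t i = 0; i < n; ++i) {
        y[i] /= sum;
    }
}
```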
+ +#include +#include +#include "cpu/x86/fp32/tensor_computing_fp32.h" + +void softmax_lastAxis_fp32(const F32 *input, I32 loopOuter, I32 loops, F32 *output) +{ + for (I32 i = 0; i < loopOuter; i++) { + const F32 *inputPtr = input + i * loops; + F32 *outputPtr = output + i * loops; + + __m256 max_v, sub_v, sum_v, tmp_v; + F32 max_s, tmp_s; + max_s = array_max_f32(inputPtr, loops); + max_v = _mm256_set1_ps(max_s); + sum_v = _mm256_set1_ps(0.f); + + I32 j = 0; + F32 sum_s = 0; + for (j = 0; j < loops - 7; j += 8) { + __m256 in = _mm256_loadu_ps(inputPtr + j); + sub_v = _mm256_sub_ps(in, max_v); + tmp_v = _mm256_exp_ps(sub_v); + sum_v = _mm256_add_ps(sum_v, tmp_v); + _mm256_storeu_ps(outputPtr + j, tmp_v); + } + sum_s += _mm256_sum_ps(sum_v); + for (; j < loops; j++) { + tmp_s = exp(inputPtr[j] - max_s); + outputPtr[j] = tmp_s; + sum_s += tmp_s; + } + array_scale_f32(outputPtr, outputPtr, loops, 1.0 / sum_s, 0); + } +} + +void softmax_anyAxis_fp32(const F32 *input, I32 loopOuter, I32 loops, I32 loopInner, F32 *output) +{ + std::vector buffer(loopInner * 2); + F32 *maxBuffer = &buffer[0]; + F32 *sumBuffer = &buffer[loopInner]; + I32 k = 0; + for (I32 i = 0; i < loopOuter; i++) { + const F32 *inputPtrBase = input + i * loops * loopInner; + F32 *outputPtrBase = output + i * loops * loopInner; + + memcpy(maxBuffer, inputPtrBase, loopInner * sizeof(F32)); + memset(sumBuffer, 0, loopInner * sizeof(F32)); + for (I32 j = 1; j < loops; j++) { + const F32 *inputPtr = inputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 7; k += 8) { + __m256 in_v = _mm256_loadu_ps(inputPtr + k); + __m256 out_v = _mm256_loadu_ps(maxBuffer + k); + __m256 max_v = _mm256_max_ps(in_v, out_v); + _mm256_storeu_ps(maxBuffer + k, max_v); + } + for (; k < loopInner; k++) { + maxBuffer[k] = UNI_MAX(maxBuffer[k], inputPtr[k]); + } + } + for (I32 j = 0; j < loops; j++) { + const F32 *inputPtr = inputPtrBase + j * loopInner; + F32 *outputPtr = outputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 7; k += 8) { + __m256 in_v = _mm256_loadu_ps(inputPtr + k); + __m256 max_v = _mm256_loadu_ps(maxBuffer + k); + __m256 sub_v = _mm256_sub_ps(in_v, max_v); + __m256 exp_v = _mm256_exp_ps(sub_v); + __m256 sum_v = _mm256_loadu_ps(sumBuffer + k); + sum_v = _mm256_add_ps(sum_v, exp_v); + _mm256_storeu_ps(sumBuffer + k, sum_v); + _mm256_storeu_ps(outputPtr + k, exp_v); + } + for (; k < loopInner; k++) { + outputPtr[k] = exp(inputPtr[k] - maxBuffer[k]); + sumBuffer[k] += outputPtr[k]; + } + } + for (I32 j = 0; j < loops; j++) { + F32 *outputPtr = outputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 7; k += 8) { + __m256 out_v = _mm256_loadu_ps(outputPtr + k); + __m256 sum_v = _mm256_loadu_ps(sumBuffer + k); + out_v = _mm256_div_ps(out_v, sum_v); + _mm256_storeu_ps(outputPtr + k, out_v); + } + for (; k < loopInner; k++) { + outputPtr[k] /= sumBuffer[k]; + } + } + } +} + +EE softmax_fp32(TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output) +{ + UNUSED(outputDesc); + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + U32 size = tensorNumElements(inputDesc); + axis = (axis + inputDesc.nDims) % inputDesc.nDims; + axis = inputDesc.nDims - 1 - axis; + I32 loops = inputDesc.dims[axis]; + + I32 loopInner = 1; + for (int i = 0; i < axis; i++) { + loopInner *= inputDesc.dims[i]; + } + U32 loopOuter = size / loops / loopInner; + + if (loopInner == 1) { + if (DF_NCHWC8 == inputDesc.df && 4 == inputDesc.nDims && + (inputDesc.dims[1] != 1 || inputDesc.dims[0] != 1)) { + 
CHECK_REQUIREMENT(2 != axis); + loopInner *= 8; + loopOuter /= 8; + softmax_anyAxis_fp32(input, loopOuter, loops, loopInner, output); + } else { + softmax_lastAxis_fp32(input, loopOuter, loops, output); + } + } else { + CHECK_REQUIREMENT(DF_NCHWC8 != inputDesc.df); + softmax_anyAxis_fp32(input, loopOuter, loops, loopInner, output); + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/tensor_computing_fp32.h b/compute/tensor/src/cpu/x86/fp32/tensor_computing_fp32.h new file mode 100644 index 00000000..6ff33c5d --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/tensor_computing_fp32.h @@ -0,0 +1,259 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef CHEETAH_TENSOR_COMPUTING_FP32_H +#define CHEETAH_TENSOR_COMPUTING_FP32_H + +#include +#include "sys.h" +#include "error.h" +#include "types.h" +#include "thread_affinity.h" +#include "x86_functions_fp32.h" + +EE attention_mask_fp32(TensorDesc inputDesc, + const F32 *input, + AttentionMaskParamSpec p, + TensorDesc outputDesc, + F32 *output); + +EE convolution_transform_filter_fp32(TensorDesc filterDesc, + const F32 *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + F32 *filterTransformed); + +EE convolution_infer_forward_tmp_bytes_fp32(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes); + +EE convolution_fp32(TensorDesc inputDesc, + F32 *input, + TensorDesc filterDesc, + const F32 *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc biasDesc, + const F32 *bias, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *output, + ActivationParamSpec activationDesc, + Arch arch); + +EE convolution_direct(TensorDesc inputDesc, + F32 *inArray, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc); + +EE convolution_1x1_direct(TensorDesc inputDesc, + F32 *inArray, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc); + +EE convolution_2x2_direct(TensorDesc inputDesc, + F32 *inArray, + 
TensorDesc filterDesc,
+    const F32 *filterArray,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc biasDesc,
+    const F32 *biasArray,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    F32 *outArray,
+    ActivationParamSpec activationDesc);
+
+EE convolution_direct_nchw(TensorDesc inputDesc,
+    F32 *inArray,
+    TensorDesc filterDesc,
+    const F32 *filterArray,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc biasDesc,
+    const F32 *biasArray,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    F32 *outArray,
+    ActivationParamSpec activationDesc);
+
+EE check_fp32(TensorDesc inputDescA,
+    const F32 *inputA,
+    TensorDesc inputDescB,
+    const F32 *inputB,
+    CheckMode checkMode,
+    TensorDesc outputDesc,
+    I32 *output);
+
+EE clip_fp32(F32 *input, F32 *output, I32 len, F32 minValue, F32 maxValue);
+
+EE depthwise_pointwise_convolution_transform_filter_fp32(TensorDesc dwFilterDesc,
+    const F32 *dwFilter,
+    TensorDesc pwFilterDesc,
+    const F32 *pwFilter,
+    DepthwiseConvolutionForwardAlgorithm algorithm,
+    TensorDesc *dwFtmDesc,
+    F32 *dwFilterTransformed,
+    TensorDesc *pwFtmDesc,
+    F32 *pwFilterTransformed);
+
+EE depthwise_pointwise_convolution_fp32(TensorDesc inputDesc,
+    F32 *input,
+    TensorDesc dwFilterDesc,
+    const F32 *dwFilter,
+    TensorDesc pwFilterDesc,
+    const F32 *pwFilter,
+    ConvolutionParamSpec convParamSpec,
+    DepthwiseConvolutionForwardAlgorithm algorithm,
+    TensorDesc dwBiasDesc,
+    const F32 *dwBias,
+    TensorDesc pwBiasDesc,
+    const F32 *pwBias,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    F32 *output,
+    ActivationParamSpec depthwiseActivationParamSpec,
+    ActivationParamSpec pointwiseActivationParamSpec,
+    Arch arch);
+
+EE depthwise_convolution_infer_forward_algorithm_fp32(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    TensorDesc outputDesc,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionPolicy policy,
+    DepthwiseConvolutionForwardAlgorithm *algorithm,
+    DataType targetDataType);
+
+EE depthwise_convolution_transform_filter_bytes_fp32(
+    TensorDesc filterDesc, DepthwiseConvolutionForwardAlgorithm algorithm, U32 *bytes);
+
+EE depthwise_convolution_transform_filter_fp32(TensorDesc filterDesc,
+    const F32 *filter,
+    DepthwiseConvolutionForwardAlgorithm algorithm,
+    TensorDesc *ftmDesc,
+    F32 *filterTransformed);
+
+EE depthwise_convolution_fp32(TensorDesc inputDesc,
+    F32 *input,
+    TensorDesc filterDesc,
+    const F32 *filter,
+    ConvolutionParamSpec convParamSpec,
+    DepthwiseConvolutionForwardAlgorithm algorithm,
+    TensorDesc biasDesc,
+    const F32 *bias,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    F32 *output,
+    ActivationParamSpec depthwiseActivationParamSpec,
+    Arch arch);
+
+EE depthwise_convolution_direct(TensorDesc inputDesc,
+    F32 *inArray,
+    TensorDesc dwFilterDesc,
+    const F32 *dwFilterArray,
+    TensorDesc pwFilterDesc,
+    const F32 *pwFilterArray,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc dwBiasDesc,
+    const F32 *dwBiasArray,
+    TensorDesc pwBiasDesc,
+    const F32 *pwBiasArray,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    F32 *outArray,
+    ActivationParamSpec depthwiseActivationParamSpec,
+    ActivationParamSpec pointwiseActivationParamSpec);
+
+EE eltwise_fp32(std::vector<void *> input,
+    std::vector<U32> inputSize,
+    U32 num,
+    U32 len,
+    void *output,
+    EltwiseMode eltwiseMode);
+
+EE layer_normalization_fp32(
+    TensorDesc inputDesc, F32 *input, F32
*alpha, F32 *beta, TensorDesc outputDesc, F32 *output); + +EE l2normalization_fp32(TensorDesc inputDesc, const F32 *input, TensorDesc outputDesc, F32 *output); + +EE rnncell_fp32(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + void *state, + U32 tmpBytes, + void *tmp, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + void *output, + Arch arch); + +EE pooling_fp32(TensorDesc inputDesc, + const F32 *input, + PoolingParamSpec poolingParamSpec, + TensorDesc outputDesc, + F32 *output); + +EE scale_fp32(F32 *input, + I32 axis, + I32 nDims, + F32 *alpha, + F32 *beta, + I32 in, + I32 ic, + I32 elements_per_channel, + F32 *output); + +EE softmax_fp32( + TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output); + +EE deconvolution_transform_filter_fp32(TensorDesc filterDesc, + const F32 *filter, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + F32 *filterTransformed); + +#endif //CHEETAH_TENSOR_COMPUTING_FP32_H diff --git a/compute/tensor/src/cpu/x86/fp32/transform_functions_fp32.h b/compute/tensor/src/cpu/x86/fp32/transform_functions_fp32.h new file mode 100644 index 00000000..4e69369b --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/transform_functions_fp32.h @@ -0,0 +1,149 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
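The header that follows implements the NCHW to NCHWCxNx weight packing used by the convolution transforms: N consecutive output channels (the AVX gather lanes, stride `fc*fh*fw`) become the innermost index, grouped under blocks of C input channels. A scalar restatement of that index mapping, assuming fn, fc and oc are exact multiples of N and C to keep the sketch short (the real code also handles remainders and zero-padding):

```cpp
#include <cstddef>

// out[n0*fc*fhfw + c0*N*fhfw + (hw*C + c8)*N + n] =
//     in[(n0 + n)*fc*fhfw + (c0 + c8)*fhfw + hw]
template <size_t C, size_t N>
static void pack_nchw_to_nchwcxnx_ref(
    size_t fn, size_t fc, size_t fhfw, const float *in, float *out)
{
    for (size_t n0 = 0; n0 < fn; n0 += N) {
        for (size_t c0 = 0; c0 < fc; c0 += C) {
            for (size_t hw = 0; hw < fhfw; ++hw) {
                for (size_t c8 = 0; c8 < C; ++c8) {
                    for (size_t n = 0; n < N; ++n) {
                        out[(n0 * fc + c0 * N) * fhfw + (hw * C + c8) * N + n] =
                            in[(n0 + n) * fc * fhfw + (c0 + c8) * fhfw + hw];
                    }
                }
            }
        }
    }
}
```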
+
+#ifndef CHEETAH_X86_TRANSFORM_FUNCTIONS_FP32_H
+#define CHEETAH_X86_TRANSFORM_FUNCTIONS_FP32_H
+
+#include "types.h"
+
+template <U32 N, U32 C>
+inline void transformNCHWCxNx(U32 fc, U32 fh, U32 fw, U32 oc, const F32 *input, F32 *output)
+{
+    F32 *dest;
+    const F32 *src;
+    U32 cSize = 0, cSizePadding = 0;
+    U32 lstep = fc * fh * fw;
+    __m256i vindex = _mm256_set_epi32(
+        lstep * 7, lstep * 6, lstep * 5, lstep * 4, lstep * 3, lstep * 2, lstep, 0);
+    for (U32 c = 0; c < fc; c += cSize) {
+        cSize = UNI_MIN(fc - c, C);
+        cSizePadding = UNI_MIN(oc - c, C);
+        for (U32 hw = 0; hw < fh * fw; ++hw) {
+            for (U32 c8 = 0; c8 < cSize; ++c8) {
+                src = input + (c + c8) * fh * fw + hw;
+                dest = output + c * fh * fw * N + hw * cSizePadding * N + c8 * N;
+                if (N >= 8) {
+                    _mm256_storeu_ps(dest, _mm256_i32gather_ps(src, vindex, 4));
+                }
+                if (N >= 16) {
+                    _mm256_storeu_ps(dest + 8, _mm256_i32gather_ps(src + 8 * lstep, vindex, 4));
+                }
+                if (N >= 24) {
+                    _mm256_storeu_ps(dest + 16, _mm256_i32gather_ps(src + 16 * lstep, vindex, 4));
+                }
+                if (N == 32) {
+                    _mm256_storeu_ps(dest + 24, _mm256_i32gather_ps(src + 24 * lstep, vindex, 4));
+                }
+            }
+            memset(dest + N, 0, ((cSizePadding - cSize) * N * 4));
+        }
+    }
+}
+
+// N is 32/24
+template <U32 N, U32 C>
+inline EE transformNCHWToNCHWCxNx(
+    TensorDesc inputDesc, const F32 *input, TensorDesc outputDesc, F32 *output)
+{
+    if (input == NULL || output == NULL) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    DataType fdt, odt;
+    DataFormat fdf, odf;
+    U32 fn, fc, fh, fw;
+    U32 on, oc, oh, ow;
+    CHECK_STATUS(tensor4dGet(inputDesc, &fdt, &fdf, &fn, &fc, &fh, &fw));
+    CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow));
+
+    U32 remain = fn % N;
+    fn -= remain;
+
+    for (U32 n = 0; n < fn; n += N) {
+        transformNCHWCxNx<N, C>(fc, fh, fw, oc, input, output);
+        input += fc * fh * fw * N;
+        output += oc * fh * fw * N;
+    }
+    if (remain >= 16) {
+        transformNCHWCxNx<16, C>(fc, fh, fw, oc, input, output);
+        input += fc * fh * fw * 16;
+        output += oc * fh * fw * 16;
+        remain -= 16;
+    }
+    if (remain >= 8) {
+        transformNCHWCxNx<8, C>(fc, fh, fw, oc, input, output);
+        input += fc * fh * fw * 8;
+        output += oc * fh * fw * 8;
+        remain -= 8;
+    }
+    if (remain > 0) {
+        F32 *dest;
+        const F32 *src;
+        U32 cSize = 0, cSizePadding = 0;
+        F32 m[8] = {0.0f};
+        for (U32 i = 0; i < remain; ++i) {
+            m[i] = -1.0f;
+        }
+        __m256 mask = _mm256_set_ps(m[7], m[6], m[5], m[4], m[3], m[2], m[1], m[0]);
+        U32 lstep = fc * fh * fw;
+        __m256i vindex = _mm256_set_epi32(
+            lstep * 7, lstep * 6, lstep * 5, lstep * 4, lstep * 3, lstep * 2, lstep, 0);
+        __m256 src256 = _mm256_setzero_ps();
+        for (U32 c = 0; c < fc; c += cSize) {
+            cSize = UNI_MIN(fc - c, C);
+            cSizePadding = UNI_MIN(oc - c, C);
+            for (U32 hw = 0; hw < fh * fw; ++hw) {
+                for (U32 c8 = 0; c8 < cSize; ++c8) {
+                    src = input + (c + c8) * fh * fw + hw;
+                    dest = output + c * fh * fw * 8 + hw * cSizePadding * 8 + c8 * 8;
+                    _mm256_storeu_ps(dest, _mm256_mask_i32gather_ps(src256, src, vindex, mask, 4));
+                }
+                memset(dest + 8, 0, ((cSizePadding - cSize) * 32));
+            }
+        }
+        fn += remain;
+    }
+    return SUCCESS;
+}
+
+inline void PaddingNCHWC8(
+    F32 *data, F32 *tmp, TensorDesc inputDesc, ConvolutionParamSpec convParamSpec)
+{
+    // NCHWC8
+    DataType idt;
+    DataFormat idf;
+    U32 in, ic, ih, iw;
+    CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
+    U32 paddingT = convParamSpec.padding_top;
+    U32 paddingB = convParamSpec.padding_bottom;
+    U32 paddingL = convParamSpec.padding_left;
+    U32 paddingR = convParamSpec.padding_right;
+
+    U32 padih = paddingT + paddingB + ih;
+    U32 padiw = paddingL + paddingR + iw;
+    U32 coff, hoff;
+
+    CHECK_REQUIREMENT((idf == DF_NCHWC8) && (ic % 8 == 0));
+    for (U32 c = 0; c < ic; c += 8) {
+        coff = c * padih * padiw;
+        memset(tmp + coff, 0, padiw * paddingT * 8 * bytesOf(idt));
+        for (U32 h = 0; h < ih; ++h) {
+            hoff = (h + paddingT) * padiw;
+            memset(tmp + coff + hoff * 8, 0, paddingL * 8 * bytesOf(idt));
+            memcpy(tmp + coff + (hoff + paddingL) * 8, data + c * ih * iw + h * iw * 8,
+                iw * 8 * bytesOf(idt));
+            memset(tmp + coff + (hoff + (paddingL + iw)) * 8, 0, paddingR * 8 * bytesOf(idt));
+        }
+        memset(tmp + coff + (ih + paddingT) * padiw * 8, 0, padiw * paddingB * 8 * bytesOf(idt));
+    }
+}
+
+#endif
diff --git a/compute/tensor/src/cpu/x86/fp32/x86_functions_fp32.h b/compute/tensor/src/cpu/x86/fp32/x86_functions_fp32.h
new file mode 100644
index 00000000..2897bdb6
--- /dev/null
+++ b/compute/tensor/src/cpu/x86/fp32/x86_functions_fp32.h
@@ -0,0 +1,361 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef CHEETAH_X86_FUNCTIONS_FP32_H
+#define CHEETAH_X86_FUNCTIONS_FP32_H
+#include <math.h>
+#include "x86_avx2_expand.h"
+#include "types.h"
+
+inline EE activation_fp32(F32 *input, U32 len, ActivationParamSpec activationDesc, F32 *output)
+{
+    __m256 in, out;
+    __m256 zero = _mm256_set1_ps(0.);
+    __m256 one = _mm256_set1_ps(1.);
+    __m256 three = _mm256_set1_ps(3.);
+    __m256 six = _mm256_set1_ps(6.);
+    U32 len_main = len / 8;
+    U32 len_tail = len % 8;
+
+    F32 value;
+    switch (activationDesc.mode) {
+        case ACTIVATION_NULL: {
+            break;
+        }
+        case ACTIVATION_RELU: {
+            if (activationDesc.value[0] == 0) {
+                for (U32 i = 0; i < len_main; i++) {
+                    in = _mm256_loadu_ps(input);
+                    out = _mm256_max_ps(zero, in);
+                    _mm256_storeu_ps(output, out);
+                    input += 8;
+                    output += 8;
+                }
+                for (U32 i = 0; i < len_tail; i++) {
+                    output[i] = (input[i] < 0) ? 0 : input[i];
+                }
+            } else {
+                __m256 scale = _mm256_set1_ps(activationDesc.value[0]);
+                for (U32 i = 0; i < len_main; i++) {
+                    in = _mm256_loadu_ps(input);
+                    __m256 tmp = _mm256_mul_ps(in, scale);
+                    out = _mm256_max_ps(tmp, in);
+                    _mm256_storeu_ps(output, out);
+                    input += 8;
+                    output += 8;
+                }
+                for (U32 i = 0; i < len_tail; i++) {
+                    float tmp = activationDesc.value[0] * input[i];
+                    output[i] = (input[i] < tmp) ?
tmp : input[i]; + } + } + break; + } + case ACTIVATION_RELU6: { + for (U32 i = 0; i < len_main; i++) { + in = _mm256_loadu_ps(input); + out = _mm256_max_ps(zero, in); + out = _mm256_min_ps(six, out); + _mm256_storeu_ps(output, out); + input += 8; + output += 8; + } + for (U32 i = 0; i < len_tail; i++) { + value = (input[i] < 0) ? 0 : input[i]; + if (value > 6) { + value = 6; + } + output[i] = value; + } + break; + } + case ACTIVATION_H_SIGMOID: { + for (U32 i = 0; i < len_main; i++) { + in = _mm256_loadu_ps(input); + out = _mm256_add_ps(in, three); + out = _mm256_max_ps(out, zero); + out = _mm256_min_ps(out, six); + out = _mm256_div_ps(out, six); + _mm256_storeu_ps(output, out); + input += 8; + output += 8; + } + for (U32 i = 0; i < len_tail; i++) { + value = input[i] + 3; + value = (value < 0) ? 0 : value; + value = (value > 6) ? 6 : value; + value = value / 6; + output[i] = value; + } + break; + } + case ACTIVATION_H_SWISH: { + for (U32 i = 0; i < len_main; i++) { + in = _mm256_loadu_ps(input); + out = _mm256_add_ps(in, three); + out = _mm256_max_ps(out, zero); + out = _mm256_min_ps(out, six); + out = _mm256_div_ps(out, six); + out = _mm256_mul_ps(out, in); + _mm256_storeu_ps(output, out); + input += 8; + output += 8; + } + for (U32 i = 0; i < len_tail; i++) { + value = input[i] + 3; + value = (value < 0) ? 0 : value; + value = (value > 6) ? 6 : value; + value = input[i] * value; + value = value / 6; + output[i] = value; + } + break; + } + case ACTIVATION_GELU: { + F32 two_div_PI_sqrt = sqrt(2 / 3.14159265358979323846); + __m256 vec0 = _mm256_set1_ps(two_div_PI_sqrt); + __m256 vec1 = _mm256_set1_ps(0.044715); + __m256 vec2 = _mm256_set1_ps(0.5); + for (U32 i = 0; i < len_main; i++) { + in = _mm256_loadu_ps(input); + out = _mm256_mul_ps(in, in); + out = _mm256_mul_ps(out, in); + out = _mm256_fmadd_ps(vec1, out, in); + out = _mm256_mul_ps(vec0, out); + out = _mm256_tanh_ps(out); + out = _mm256_add_ps(one, out); + out = _mm256_mul_ps(vec2, out); + out = _mm256_mul_ps(in, out); + _mm256_storeu_ps(output, out); + input += 8; + output += 8; + } + for (U32 i = 0; i < len_tail; i++) { + value = input[i]; + value = two_div_PI_sqrt * (value + 0.044715 * powf(value, 3)); + value = 1.0 - 2.0 / (exp(2.0 * value) + 1.0); + value = 0.5 * (1.0 + value); + value = input[i] * value; + output[i] = value; + } + break; + } + case ACTIVATION_TANH: { + for (U32 i = 0; i < len_main; i++) { + in = _mm256_loadu_ps(input); + out = _mm256_tanh_ps(in); + _mm256_storeu_ps(output, out); + input += 8; + output += 8; + } + for (U32 i = 0; i < len_tail; i++) { + value = 1.0 - 2.0 / (exp(2.0 * input[i]) + 1.0); + output[i] = value; + } + break; + } + case ACTIVATION_SIGMOID: { + for (U32 i = 0; i < len_main; i++) { + in = _mm256_loadu_ps(input); + out = _mm256_sigmod_ps(in); + _mm256_storeu_ps(output, out); + input += 8; + output += 8; + } + for (U32 i = 0; i < len_tail; i++) { + value = 1.0 / (1.0 + exp(-1.0 * input[i])); + output[i] = value; + } + break; + } + case ACTIVATION_MISH: { + for (U32 i = 0; i < len_main; i++) { + in = _mm256_loadu_ps(input); + out = _mm256_mul_ps( + in, _mm256_tanh_ps(_mm256_log_ps(_mm256_add_ps(_mm256_exp_ps(in), one)))); + _mm256_storeu_ps(output, out); + input += 8; + output += 8; + } + for (U32 i = 0; i < len_tail; i++) { + value = input[i] * tanh(log(exp(input[i]) + 1.0)); + output[i] = value; + } + break; + } + default: + return NOT_SUPPORTED; + } + + return SUCCESS; +} + +inline void array_scale_f32(const F32 *input, F32 *output, I32 len, F32 alpha, F32 beta) +{ + __m256 alpha_v = 
_mm256_set1_ps(alpha); + __m256 beta_v = _mm256_set1_ps(beta); + I32 i = 0; + for (i = 0; i < len - 7; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 tmp_v = _mm256_add_ps(beta_v, _mm256_mul_ps(alpha_v, in)); + _mm256_storeu_ps(output + i, tmp_v); + } + for (; i < len; i++) { + output[i] = alpha * input[i] + beta; + } +} + +inline void array_power_f32(F32 *input, F32 *output, I32 len, F32 power) +{ + I32 i = 0; + if (power == -1) { + __m256 one_v = _mm256_set1_ps(1); + for (i = 0; i < len - 7; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 tmp_v = _mm256_div_ps(one_v, in); + _mm256_storeu_ps(output + i, tmp_v); + } + } else if (power == 0.5) { + for (i = 0; i < len - 7; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 tmp_v = _mm256_sqrt_ps(in); + _mm256_storeu_ps(output + i, tmp_v); + } + } else if (power == 1) { + if (input != output) { + memcpy(output, input, len * sizeof(F32)); + } + i = len; + } else if (power == 2) { + for (i = 0; i < len - 7; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 tmp_v = _mm256_mul_ps(in, in); + _mm256_storeu_ps(output + i, tmp_v); + } + } + for (; i < len; i++) { + output[i] = powf(input[i], power); + } +} + +inline F32 array_max_f32(const F32 *data, I32 len) +{ + F32 max_s = data[0]; + I32 i = 0; + if (len >= 16) { + __m256 max_v, tmp_v; + F32 max_nums[8]; + max_v = _mm256_loadu_ps(data); + for (i = 8; i < len - 7; i += 8) { + tmp_v = _mm256_loadu_ps(data + i); + max_v = _mm256_max_ps(tmp_v, max_v); + } + _mm256_storeu_ps(max_nums, max_v); + max_s = _mm256_hmax_ps(max_v); + } + + for (; i < len; i++) { + if (data[i] > max_s) { + max_s = data[i]; + } + } + + return max_s; +} + +// array var +inline F32 array_var_f32(const F32 *data, I32 len, F32 mean) +{ + if (len <= 0) { + return 0; + } + + I32 i = 0; + F32 sum_s = 0; + __m256 mean_v = _mm256_set1_ps(mean); + for (i = 0; i < len - 7; i += 8) { + __m256 in = _mm256_loadu_ps(data + i); + __m256 tmp_v = _mm256_sub_ps(in, mean_v); + __m256 sum_v = _mm256_mul_ps(tmp_v, tmp_v); + sum_s += _mm256_sum_ps(sum_v); + } + for (; i < len; i++) { + F32 in = data[i]; + F32 tmp = in - mean; + sum_s += tmp * tmp; + } + return sum_s / len; +} + +// array sum +inline F32 array_sum_f32(const F32 *data, I32 len) +{ + if (len <= 0) { + return 0; + } + + I32 i = 0; + F32 sum_s = 0; + __m256 sum_v = _mm256_set1_ps(0); + for (i = 0; i < len - 7; i += 8) { + __m256 in = _mm256_loadu_ps(data + i); + sum_v = _mm256_add_ps(sum_v, in); + } + sum_s += _mm256_sum_ps(sum_v); + for (; i < len; i++) { + sum_s += data[i]; + } + return sum_s; +} + +// array mean +inline F32 array_mean_f32(const F32 *data, I32 len) +{ + if (len <= 0) { + return 0; + } + return array_sum_f32(data, len) / len; +} + +inline void array_add_f32(const F32 *inputA, const F32 *inputB, F32 *output, I32 len) +{ + I32 i = 0; + for (i = 0; i < len - 7; i += 8) { + __m256 a = _mm256_loadu_ps(inputA + i); + __m256 b = _mm256_loadu_ps(inputB + i); + __m256 c = _mm256_add_ps(a, b); + _mm256_storeu_ps(output + i, c); + } + + for (; i < len; i++) { + output[i] = inputA[i] + inputB[i]; + } +} + +inline void array_square_and_add_f32(const F32 *inputA, const F32 *inputB, F32 *output, I32 len) +{ + I32 i = 0; + for (i = 0; i < len - 7; i += 8) { + __m256 a = _mm256_loadu_ps(inputA + i); + __m256 b = _mm256_loadu_ps(inputB + i); + b = _mm256_mul_ps(b, b); + __m256 c = _mm256_add_ps(a, b); + _mm256_storeu_ps(output + i, c); + } + + for (; i < len; i++) { + output[i] = inputA[i] + inputB[i] * inputB[i]; + } +} + +#endif 
//CHEETAH_X86_FUNCTIONS_FP32_H
diff --git a/compute/tensor/src/cpu/x86/normalization.cpp b/compute/tensor/src/cpu/x86/normalization.cpp
new file mode 100644
index 00000000..aaf9f160
--- /dev/null
+++ b/compute/tensor/src/cpu/x86/normalization.cpp
@@ -0,0 +1,37 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "cpu/x86/tensor_computing_x86.h"
+#ifdef _USE_FP32
+#include "cpu/x86/fp32/tensor_computing_fp32.h"
+#endif
+
+EE layer_normalization_x86(
+    TensorDesc inputDesc, void *input, void *alpha, void *beta, TensorDesc outputDesc, void *output)
+{
+    DataType idt = inputDesc.dt;
+    EE ret = SUCCESS;
+    switch (idt) {
+#ifdef _USE_FP32
+        case DT_F32: {
+            ret = layer_normalization_fp32(
+                inputDesc, (F32 *)input, (F32 *)alpha, (F32 *)beta, outputDesc, (F32 *)output);
+            break;
+        }
+#endif
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/cpu/x86/pooling.cpp b/compute/tensor/src/cpu/x86/pooling.cpp
new file mode 100644
index 00000000..82e5b73e
--- /dev/null
+++ b/compute/tensor/src/cpu/x86/pooling.cpp
@@ -0,0 +1,41 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
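+
+// pooling_x86 below follows the same dispatch shape as the other x86 entry
+// points: switch on inputDesc.dt and forward to the per-precision kernel. A
+// minimal caller sketch (inBuf, outBuf and p are hypothetical names, with p
+// assumed to be a fully initialized PoolingParamSpec):
+//     TensorDesc id = tensor4df(DT_F32, DF_NCHWC8, 1, 8, 32, 32);
+//     TensorDesc od = tensor4df(DT_F32, DF_NCHWC8, 1, 8, 16, 16);
+//     CHECK_STATUS(pooling_x86(id, inBuf, p, nullptr, od, outBuf));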
+ +#include "cpu/x86/tensor_computing_x86.h" +#ifdef _USE_FP32 +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#endif + +EE pooling_x86(TensorDesc inputDesc, + const void *input, + PoolingParamSpec poolingParamSpec, + const void *scale, + TensorDesc outputDesc, + void *output) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + UNUSED(scale); + ret = pooling_fp32( + inputDesc, (const F32 *)input, poolingParamSpec, outputDesc, (F32 *)output); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/x86/rnn.cpp b/compute/tensor/src/cpu/x86/rnn.cpp new file mode 100644 index 00000000..ccc97141 --- /dev/null +++ b/compute/tensor/src/cpu/x86/rnn.cpp @@ -0,0 +1,49 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/x86/tensor_computing_x86.h" +#ifdef _USE_FP32 +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#endif + +EE rnncell_x86(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + void *state, + U32 tmpBytes, + void *tmp, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + void *output, + Arch arch) +{ + EE ret = SUCCESS; + switch (xDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = rnncell_fp32(xDesc, currentX, filterDesc, filter, biasDesc, bias, state, tmpBytes, + tmp, rnnParamSpec, batchStrideX, batchStrideH, hDesc, output, arch); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/x86/scale.cpp b/compute/tensor/src/cpu/x86/scale.cpp new file mode 100644 index 00000000..0043ea9a --- /dev/null +++ b/compute/tensor/src/cpu/x86/scale.cpp @@ -0,0 +1,51 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/x86/tensor_computing_x86.h" +#ifdef _USE_FP32 +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#endif + +EE scale_x86(TensorDesc inputDesc, + void *input, + void *alpha, + void *beta, + ScaleParamSpec p, + TensorDesc outputDesc, + void *output) +{ + UNUSED(outputDesc); + U32 length = tensorNumElements(inputDesc); + int axis = (p.axis + inputDesc.nDims) % inputDesc.nDims; + I32 in = inputDesc.dims[inputDesc.nDims - 1]; + I32 ic = inputDesc.dims[inputDesc.nDims - 1 - axis]; + I32 elements_per_channel = length / (in * ic); + if (inputDesc.df == DF_NCHWC8) { + axis = inputDesc.nDims; + } + EE ret = SUCCESS; + switch (inputDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = scale_fp32((F32 *)input, axis, inputDesc.nDims, (F32 *)alpha, (F32 *)beta, in, ic, + elements_per_channel, (F32 *)output); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + + return ret; +} diff --git a/compute/tensor/src/cpu/x86/softmax.cpp b/compute/tensor/src/cpu/x86/softmax.cpp new file mode 100644 index 00000000..9c2a37f0 --- /dev/null +++ b/compute/tensor/src/cpu/x86/softmax.cpp @@ -0,0 +1,37 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/x86/tensor_computing_x86.h" +#ifdef _USE_FP32 +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#endif + +EE softmax_x86( + TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output) +{ + DataType idt = inputDesc.dt; + EE ret = SUCCESS; + switch (idt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = softmax_fp32(inputDesc, (const F32 *)input, p.axis, outputDesc, (F32 *)output); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + + return ret; +} diff --git a/compute/tensor/src/cpu/x86/tensor_computing_x86.h b/compute/tensor/src/cpu/x86/tensor_computing_x86.h new file mode 100644 index 00000000..d65fbc60 --- /dev/null +++ b/compute/tensor/src/cpu/x86/tensor_computing_x86.h @@ -0,0 +1,212 @@ +// Copyright (C) 2019. 
Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef CHEETAH_TENSOR_COMPUTING_X86_H
+#define CHEETAH_TENSOR_COMPUTING_X86_H
+
+#include <vector>
+#include "error.h"
+#include "sys.h"
+#include "types.h"
+
+EE attention_mask_x86(TensorDesc inputDesc,
+    const void *input,
+    AttentionMaskParamSpec p,
+    TensorDesc outputDesc,
+    void *output);
+
+EE check_x86(TensorDesc inputDescA,
+    const void *inputA,
+    TensorDesc inputDescB,
+    const void *inputB,
+    CheckParamSpec p,
+    TensorDesc outputDesc,
+    void *output);
+
+EE clip_x86(TensorDesc inputDesc, void *input, ClipParamSpec p, TensorDesc outputDesc, void *output);
+
+EE convolution_infer_forward_algorithm_x86(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    TensorDesc outputDesc,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionPolicy policy,
+    ConvolutionForwardAlgorithm *algorithm,
+    DataType targetDataType);
+
+EE convolution_transform_filter_bytes_x86(TensorDesc filterDesc,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionForwardAlgorithm algorithm,
+    U32 *bytes);
+
+EE convolution_transform_filter_x86(TensorDesc filterDesc,
+    const void *filter,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionForwardAlgorithm algorithm,
+    TensorDesc *ftmDesc,
+    void *filterTransformed);
+
+EE convolution_infer_forward_tmp_bytes_x86(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    TensorDesc outputDesc,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionForwardAlgorithm algorithm,
+    U32 *bytes);
+
+EE convolution_x86(TensorDesc inputDesc,
+    void *input,
+    TensorDesc filterDesc,
+    const void *filter,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionForwardAlgorithm algorithm,
+    TensorDesc scaleDesc,
+    const void *scale,
+    TensorDesc biasDesc,
+    const void *bias,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    ActivationParamSpec activationDesc,
+    Arch arch);
+
+EE depthwise_pointwise_convolution_transform_filter_x86(TensorDesc dwFilterDesc,
+    const void *dwFilter,
+    TensorDesc pwFilterDesc,
+    const void *pwFilter,
+    DepthwiseConvolutionForwardAlgorithm algorithm,
+    TensorDesc *dwFtmDesc,
+    void *dwFilterTransformed,
+    TensorDesc *pwFtmDesc,
+    void *pwFilterTransformed);
+
+EE depthwise_pointwise_convolution_x86(TensorDesc inputDesc,
+    void *input,
+    TensorDesc dwFilterDesc,
+    const void *dwFilter,
+    TensorDesc pwFilterDesc,
+    const void *pwFilter,
+    ConvolutionParamSpec convParamSpec,
+    DepthwiseConvolutionForwardAlgorithm algorithm,
+    TensorDesc dwBiasDesc,
+    const void *dwBias,
+    TensorDesc pwBiasDesc,
+    const void *pwBias,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    ActivationParamSpec depthwiseActivationParamSpec,
+    ActivationParamSpec pointwiseActivationParamSpec,
+    Arch arch);
+
+EE depthwise_convolution_transform_filter_bytes_x86(
+    TensorDesc filterDesc, DepthwiseConvolutionForwardAlgorithm algorithm, U32 *bytes);
+
+EE depthwise_convolution_transform_filter_x86(TensorDesc filterDesc,
+    const void *filter,
+    DepthwiseConvolutionForwardAlgorithm algorithm,
+    TensorDesc *ftmDesc,
+    void *filterTransformed);
+
+EE depthwise_convolution_infer_forward_tmp_bytes_x86(TensorDesc inputDesc,
+    TensorDesc outputDesc,
+    ConvolutionParamSpec convParamSpec,
+    DepthwiseConvolutionForwardAlgorithm algorithm,
+    U32 *bytes);
+
+EE depthwise_convolution_x86(TensorDesc inputDesc,
+    void *input,
+    TensorDesc filterDesc,
+    const void *filter,
+    ConvolutionParamSpec convParamSpec,
+    DepthwiseConvolutionForwardAlgorithm algorithm,
+    TensorDesc biasDesc,
+    const void *bias,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    ActivationParamSpec depthwiseActivationParamSpec,
+    Arch arch);
+
+EE eltwise_x86(DataType dataType,
+    std::vector<void *> input,
+    std::vector<U32> inputSize,
+    U32 num,
+    U32 len,
+    void *output,
+    EltwiseMode eltwiseMode);
+
+EE layer_normalization_x86(
+    TensorDesc inputDesc, void *input, void *alpha, void *beta, TensorDesc outputDesc, void *output);
+
+EE rnncell_x86(TensorDesc xDesc,
+    const void *currentX,
+    const TensorDesc *filterDesc,
+    const void **filter,
+    const TensorDesc *biasDesc,
+    const void **bias,
+    void *state,
+    U32 tmpBytes,
+    void *tmp,
+    RNNParamSpec rnnParamSpec,
+    U32 batchStrideX,
+    U32 batchStrideH,
+    TensorDesc hDesc,
+    void *currentH,
+    Arch arch);
+
+EE scale_x86(TensorDesc inputDesc,
+    void *input,
+    void *alpha,
+    void *beta,
+    ScaleParamSpec p,
+    TensorDesc outputDesc,
+    void *output);
+
+EE pooling_x86(TensorDesc inputDesc,
+    const void *input,
+    PoolingParamSpec poolingParamSpec,
+    const void *scale,
+    TensorDesc outputDesc,
+    void *output);
+
+EE reshape_x86(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output);
+
+EE softmax_x86(
+    TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output);
+
+EE deconvolution_transform_filter_x86(TensorDesc filterDesc,
+    const void *filter,
+    ConvolutionForwardAlgorithm algorithm,
+    TensorDesc *ftmDesc,
+    void *filterTransformed);
+
+EE deconvolution_x86(TensorDesc inputDesc,
+    void *input,
+    TensorDesc filterDesc,
+    const void *filter,
+    ConvolutionParamSpec convParamSpec,
+    ConvolutionForwardAlgorithm algorithm,
+    TensorDesc scaleDesc,
+    const void *scale,
+    TensorDesc biasDesc,
+    const void *bias,
+    U32 tmpBytes,
+    void *tmp,
+    TensorDesc outputDesc,
+    void *output,
+    ActivationParamSpec activationDesc,
+    Arch arch);
+
+#endif //CHEETAH_TENSOR_COMPUTING_X86_H
diff --git a/compute/tensor/src/cpu/x86/x86_functions.h b/compute/tensor/src/cpu/x86/x86_functions.h
new file mode 100644
index 00000000..1af44b7b
--- /dev/null
+++ b/compute/tensor/src/cpu/x86/x86_functions.h
@@ -0,0 +1,158 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_X86_FUNCTIONS +#define _H_X86_FUNCTIONS +#include "cpu/cpu_functions_template.h" +#ifdef _USE_FP32 +#include "cpu/x86/fp32/x86_functions_fp32.h" +#endif + +inline void array_add_x86(DataType dt, const void *inputA, const void *inputB, void *output, I32 len) +{ + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: + array_add_f32((const F32 *)inputA, (const F32 *)inputB, (F32 *)output, len); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } +} + +inline void array_square_and_add_x86( + DataType dt, const void *inputA, const void *inputB, void *output, I32 len) +{ + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: + array_square_and_add_f32((const F32 *)inputA, (const F32 *)inputB, (F32 *)output, len); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } +} + +// array mean +inline F32 array_mean_x86(DataType dt, const void *data, I32 len) +{ + F32 result = 0; + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: + result = array_mean_f32((const F32 *)data, len); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + return result; +} + +inline void array_power_x86(DataType dt, void *input, void *output, I32 len, F32 power) +{ + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: + array_power_f32((F32 *)input, (F32 *)output, len, power); + break; +#endif + case DT_I32: + array_power_template((I32 *)input, (I32 *)output, len, power); + break; + case DT_U32: + array_power_template((U32 *)input, (U32 *)output, len, power); + break; + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } +} + +inline F32 array_sum_x86(DataType dt, const void *data, I32 len) +{ + F32 result = 0; + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: + result = array_sum_f32((const F32 *)data, len); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + return result; +} + +inline void array_scale_x86( + DataType dt, const void *input, void *output, I32 len, F32 alpha, F32 beta) +{ + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: + array_scale_f32((const F32 *)input, (F32 *)output, len, alpha, beta); + break; +#endif + case DT_I32: + array_scale_template((const I32 *)input, (I32 *)output, len, alpha, beta); + break; + case DT_U32: + array_scale_template((const U32 *)input, (U32 *)output, len, alpha, beta); + break; + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } +} + +// array var +inline F32 array_var_x86(DataType dt, const void *data, I32 len, F32 mean) +{ + F32 result = 0; + switch (dt) { +#ifdef 
_USE_FP32 + case DT_F32: + result = array_var_f32((const F32 *)data, len, mean); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + return result; +} + +inline EE array_activation_x86( + DataType dt, void *input, U32 len, ActivationParamSpec activationDesc, void *output) +{ + EE result = SUCCESS; + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: + result = activation_fp32((F32 *)input, len, activationDesc, (F32 *)output); + break; +#endif + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + return result; +} + +#endif // _H_X86_FUNCTIONS diff --git a/compute/tensor/src/cpu/yolov3detectionoutput.cpp b/compute/tensor/src/cpu/yolov3detectionoutput.cpp new file mode 100644 index 00000000..6310f3cd --- /dev/null +++ b/compute/tensor/src/cpu/yolov3detectionoutput.cpp @@ -0,0 +1,274 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
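+
+// This file decodes raw YOLOv3 feature maps into scored boxes, sorts them by
+// confidence (qsort_descent) and greedily drops overlapping candidates
+// (nms_pickedboxes). The overlap test is plain IoU: boxes a and b collide when
+//     inter(a, b) / (area(a) + area(b) - inter(a, b)) > nms_threshold.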
+ +#include "error.h" +#include "cpu/tensor_computing_cpu.h" + +inline EE qsort_descent(std::vector &boxes, std::vector &scores, int left, int right) +{ + if (boxes.empty() || scores.empty()) { + return NOT_SUPPORTED; + } + + int i = left; + int j = right; + F32 temp = scores[(left + right) / 2]; + + while (i <= j) { + while (scores[i] > temp) { + i++; + } + while (scores[j] < temp) { + j--; + } + if (i <= j) { + std::swap(boxes[i], boxes[j]); + std::swap(scores[i], scores[j]); + i++; + j--; + } + } + + if (left < j) { + qsort_descent(boxes, scores, left, j); + } + if (i < right) { + qsort_descent(boxes, scores, i, right); + } + + return SUCCESS; +} + +inline F32 intersectionarea(BoxRect a, BoxRect b) +{ + if (a.xmin > b.xmax || a.xmax < b.xmin || a.ymin > b.ymax || a.ymax < b.ymin) { + return 0.f; + } + F32 inter_width = std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin); + F32 inter_height = std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin); + + return inter_width * inter_height; +} + +inline EE nms_pickedboxes(std::vector boxes, std::vector &picked, F32 nms_threshold) +{ + I64 n = boxes.size(); + + std::vector areas(n); + for (I64 i = 0; i < n; i++) { + BoxRect box = boxes[i]; + + F32 width = box.xmax - box.xmin; + F32 height = box.ymax - box.ymin; + + areas[i] = width * height; + } + for (I64 i = 0; i < n; i++) { + BoxRect a = boxes[i]; + int keep = 1; + for (int j = 0; j < (int)picked.size(); j++) { + BoxRect b = boxes[picked[j]]; + F32 inter_area = intersectionarea(a, b); + F32 union_area = areas[i] + areas[picked[j]] - inter_area; + + if (inter_area / union_area > nms_threshold) { + keep = 0; + } + } + if (keep) { + picked.push_back(i); + } + } + return SUCCESS; +} + +template +EE yolov3detectionoutput(std::vector input, + T *output, + std::vector inputDesc, + Yolov3DetectionOutputParamSpec yolov3DetectionOutputParamSpec, + Arch arch) +{ + U32 num_class = yolov3DetectionOutputParamSpec.num_class; + U32 num_box = yolov3DetectionOutputParamSpec.num_box; + F32 confidence_threshold = yolov3DetectionOutputParamSpec.confidence_threshold; + F32 nms_threshold = yolov3DetectionOutputParamSpec.nms_threshold; + std::vector biases; + for (int i = 0; i < 18; i++) { + if (yolov3DetectionOutputParamSpec.biases[i] == 0) { + break; + } + biases.push_back(yolov3DetectionOutputParamSpec.biases[i]); + } + std::vector anchors_scale; + for (int i = 0; i < 3; i++) { + if (yolov3DetectionOutputParamSpec.anchors_scale[i] == 0) { + break; + } + anchors_scale.push_back(yolov3DetectionOutputParamSpec.anchors_scale[i]); + } + std::vector mask; + for (int i = 0; i < (int)(yolov3DetectionOutputParamSpec.mask_group_num * 3); i++) { + mask.push_back(yolov3DetectionOutputParamSpec.mask[i]); + } + + std::vector all_boxrects; + std::vector all_boxscores; + I64 input_size = inputDesc.size(); + U32 info_per_box = 4 + 1 + num_class; + ActivationParamSpec activationdesc_sigmoid; + activationdesc_sigmoid.mode = ACTIVATION_SIGMOID; + TensorDesc tmpDesc = tensor1d(inputDesc[0].dt, 1); + for (I64 i = 0; i < input_size; i++) { + T *in = (T *)input[i]; + CHECK_REQUIREMENT(inputDesc[i].df == DF_NCHWC8 || inputDesc[i].df == DF_NCHW); + if (inputDesc[i].df == DF_NCHWC8) { + T *tmp = (T *)malloc(tensorNumBytes(inputDesc[0])); + memcpy(tmp, in, tensorNumBytes(inputDesc[0])); + CHECK_STATUS(transformToNCHW(inputDesc[0], tmp, inputDesc[0], in)); + free(tmp); + } + std::vector> allbox_boxrects; + std::vector> allbox_boxscores; + allbox_boxrects.resize(num_box); + allbox_boxscores.resize(num_box); + + U32 w = inputDesc[i].dims[0]; + 
+        U32 h = inputDesc[i].dims[1];
+        U32 net_w = (U32)(anchors_scale[i] * w);
+        U32 net_h = (U32)(anchors_scale[i] * h);
+        I64 mask_offset = i * num_box;
+        U32 hw_stride = w * h;
+        U32 idx = 0;
+
+        for (U32 b = 0; b < num_box; b++) {
+            U32 biases_index = mask[b + mask_offset];
+            F32 bias_w = biases[biases_index * 2];
+            F32 bias_h = biases[biases_index * 2 + 1];
+            idx = hw_stride * b * info_per_box;
+            for (U32 nh = 0; nh < h; nh++) {
+                for (U32 nw = 0; nw < w; nw++) {
+                    T box_score = 0;
+                    CHECK_STATUS(activation_cpu(tmpDesc, &in[idx + 4 * hw_stride],
+                        activationdesc_sigmoid, tmpDesc, &box_score, arch));
+                    U32 label = 0;
+                    T class_score_max = in[idx + 5 * hw_stride];
+                    T class_score = 0;
+                    for (U32 c = 1; c < num_class; c++) {
+                        class_score = in[idx + (5 + c) * hw_stride];
+                        if (class_score > class_score_max) {
+                            label = c;
+                            class_score_max = class_score;
+                        }
+                    }
+                    CHECK_STATUS(activation_cpu(tmpDesc, &class_score_max, activationdesc_sigmoid,
+                        tmpDesc, &class_score, arch));
+                    F32 score_conf = static_cast<F32>(box_score * class_score);
+                    T cx, cy;
+                    cx = cy = 0;
+                    if (score_conf >= confidence_threshold) {
+                        CHECK_STATUS(activation_cpu(
+                            tmpDesc, &in[idx], activationdesc_sigmoid, tmpDesc, &cx, arch));
+                        F32 box_cx = static_cast<F32>((nw + cx) / w);
+                        CHECK_STATUS(activation_cpu(tmpDesc, &in[idx + 1 * hw_stride],
+                            activationdesc_sigmoid, tmpDesc, &cy, arch));
+                        F32 box_cy = static_cast<F32>((nh + cy) / h);
+                        F32 box_w = static_cast<F32>(exp(in[idx + 2 * hw_stride]) * bias_w / net_w);
+                        F32 box_h = static_cast<F32>(exp(in[idx + 3 * hw_stride]) * bias_h / net_h);
+
+                        F32 box_xmin = box_cx - box_w * 0.5;
+                        F32 box_ymin = box_cy - box_h * 0.5;
+                        F32 box_xmax = box_cx + box_w * 0.5;
+                        F32 box_ymax = box_cy + box_h * 0.5;
+                        BoxRect box = {box_xmin, box_ymin, box_xmax, box_ymax, label};
+                        allbox_boxrects[b].push_back(box);
+                        allbox_boxscores[b].push_back(score_conf);
+                    }
+                    idx++;
+                }
+            }
+        }
+
+        for (U32 b = 0; b < num_box; b++) {
+            all_boxrects.insert(
+                all_boxrects.end(), allbox_boxrects[b].begin(), allbox_boxrects[b].end());
+            all_boxscores.insert(
+                all_boxscores.end(), allbox_boxscores[b].begin(), allbox_boxscores[b].end());
+        }
+    }
+    // sort boxes
+    qsort_descent(all_boxrects, all_boxscores, 0, static_cast<int>(all_boxscores.size() - 1));
+    // apply nms
+    std::vector<I64> picked;
+    nms_pickedboxes(all_boxrects, picked, nms_threshold);
+
+    std::vector<BoxRect> boxrects;
+    std::vector<F32> boxscores;
+    for (I64 p = 0; p < (I64)picked.size(); p++) {
+        I64 picked_box = picked[p];
+        boxrects.push_back(all_boxrects[picked_box]);
+        boxscores.push_back(all_boxscores[picked_box]);
+    }
+
+    U32 num_detected = static_cast<U32>(boxrects.size());
+    // the first box contains the number of available boxes
+    output[0] = num_detected;
+    output[1] = output[2] = output[3] = output[4] = output[5] = 0;
+    for (U32 i = 0; i < num_detected; i++) {
+        BoxRect b = boxrects[i];
+        F32 score = boxscores[i];
+
+        output[(i + 1) * 6] = b.label + 1;
+        output[(i + 1) * 6 + 1] = score;
+        output[(i + 1) * 6 + 2] = b.xmin;
+        output[(i + 1) * 6 + 3] = b.ymin;
+        output[(i + 1) * 6 + 4] = b.xmax;
+        output[(i + 1) * 6 + 5] = b.ymax;
+    }
+    return SUCCESS;
+}
+
+EE yolov3detectionoutput_cpu(std::vector<TensorDesc> inputDesc,
+    std::vector<void *> input,
+    Yolov3DetectionOutputParamSpec yolov3DetectionOutputParamSpec,
+    TensorDesc outputDesc,
+    void *output,
+    Arch arch)
+{
+    UNUSED(outputDesc);
+    if (nullptr == output) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    EE ret = SUCCESS;
+    switch (inputDesc[0].dt) {
+#ifdef _USE_FP32
+        case DT_F32: {
+            ret = yolov3detectionoutput(
+                input, (F32 *)output, inputDesc,
yolov3DetectionOutputParamSpec, arch); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = yolov3detectionoutput( + input, (F16 *)output, inputDesc, yolov3DetectionOutputParamSpec, arch); + break; + } +#endif + default: { + ret = NOT_SUPPORTED; + break; + } + } + return ret; +} diff --git a/compute/tensor/src/deconvolution.cpp b/compute/tensor/src/deconvolution.cpp new file mode 100644 index 00000000..d011de45 --- /dev/null +++ b/compute/tensor/src/deconvolution.cpp @@ -0,0 +1,278 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif +#include "cpu/tensor_computing_cpu.h" + +inline EE deconvolution_infer_output_size_cpu(TensorDesc inputDesc, + TensorDesc filterDesc, + ConvolutionParamSpec convParamSpec, + TensorDesc *outputDesc, + DataType targetDataType) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt; + DataFormat idf, fdf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + + CHECK_REQUIREMENT(1 == fn || ic == fn); + + if (fc % 8 != 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + + if (fh < 1 || fw < 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + + oh = fh + strideH * (ih - 1) - paddingT - paddingB; + ow = fw + strideW * (iw - 1) - paddingL - paddingR; + + *outputDesc = tensor4df(targetDataType, DF_NCHWC8, in, fc, oh, ow); + return SUCCESS; +} + +EE deconvolution_infer_output_size(Tensor *inputTensor, + Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + Tensor *outputTensor, + DataType targetDataType, + ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc filterDesc = filterTensor.get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef 
_USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = deconvolution_infer_output_size_mali( + inputDesc, filterDesc, convParamSpec, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + ret = deconvolution_infer_output_size_cpu( + inputDesc, filterDesc, convParamSpec, &outputDesc, targetDataType); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE deconvolution_infer_forward_algorithm(Tensor inputTensor, + Tensor filterTensor, + Tensor outputTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionPolicy policy, + ConvolutionForwardAlgorithm *algorithm, + DataType targetDataType, + ActivationParamSpec activationDesc, + ArchInfo_t archInfo) +{ + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc filterDesc = filterTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + + EE ret = NOT_SUPPORTED; + auto arch = archInfo->arch; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = SUCCESS; +#endif +#if defined(_USE_NEON) || defined(_USE_X86) + } else if (IS_X86_AVX2(arch) || IS_ARM(arch)) { + ret = deconvolution_infer_forward_algorithm_cpu(inputDesc, filterDesc, outputDesc, + convParamSpec, policy, algorithm, targetDataType, arch); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = deconvolution_infer_forward_algorithm_mali(((MaliPara_t)(archInfo->archPara))->handle, + inputDesc, filterDesc, convParamSpec, outputDesc, policy, activationDesc.mode, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } + return ret; +} + +EE deconvolution_transform_filter_bytes(Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes, + ArchInfo_t archInfo) +{ + TensorDesc filterDesc = filterTensor.get_desc(); + + EE ret = NOT_SUPPORTED; + auto arch = archInfo->arch; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + *bytes = tensorNumBytes(filterDesc); + ret = SUCCESS; +#endif +#if defined(_USE_NEON) || defined(_USE_X86) + } else if (IS_X86_AVX2(arch) || IS_ARM(arch)) { + ret = deconvolution_transform_filter_bytes_cpu( + filterDesc, convParamSpec, algorithm, bytes, arch); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = deconvolution_transform_filter_bytes_mali(filterDesc, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, + ((MaliPara_t)(archInfo->archPara))->gclmemFilterDesc, bytes); +#endif + } + return ret; +} + +EE deconvolution_transform_filter(Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + Tensor tmpTensor, + Tensor *ftmTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc filterDesc = filterTensor.get_desc(); + void *filter = get_ptr_from_tensor(filterTensor, arch); + TensorDesc ftmDesc = ftmTensor->get_desc(); + void *filterTransformed = get_ptr_from_tensor(*ftmTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + UNI_memcpy(filterTransformed, filter, tensorNumBytes(filterDesc)); + ftmDesc = filterDesc; + ret = SUCCESS; +#endif +#if defined(_USE_NEON) || defined(_USE_X86) + } else if (IS_X86_AVX2(arch) || IS_ARM(arch)) { + ret = deconvolution_transform_filter_cpu( + filterDesc, filter, convParamSpec, algorithm, &ftmDesc, filterTransformed, arch); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + void *tmp = get_ptr_from_tensor(tmpTensor, 
arch); + ret = deconvolution_transform_filter_mali(((MaliPara_t)(archInfo->archPara))->handle, + filterDesc, (GCLMem_t)filter, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, + (GCLMem_t)tmp, &ftmDesc, (GCLMem_t)filterTransformed); +#endif + } + ftmTensor->resize(ftmDesc); + return ret; +} + +EE deconvolution_infer_forward_tmp_bytes(Tensor inputTensor, + Tensor filterTensor, + Tensor outputTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + U32 *bytes, + ArchInfo_t archInfo) +{ + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc filterDesc = filterTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + + EE ret = NOT_SUPPORTED; + auto arch = archInfo->arch; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = SUCCESS; +#endif +#if defined(_USE_NEON) || defined(_USE_X86) + } else if (IS_X86_AVX2(arch) || IS_ARM(arch)) { + ret = deconvolution_infer_forward_tmp_bytes_cpu( + inputDesc, filterDesc, outputDesc, convParamSpec, algorithm, bytes, archInfo->arch); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = deconvolution_infer_forward_tmp_bytes_mali(inputDesc, filterDesc, outputDesc, + convParamSpec, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, bytes); +#endif + } + return ret; +} + +EE deconvolution(Tensor inputTensor, + Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + void *scale, + Tensor biasTensor, + Tensor tmpTensor, + Tensor outputTensor, + ActivationParamSpec activationDesc, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc filterDesc = filterTensor.get_desc(); + void *filter = get_ptr_from_tensor(filterTensor, arch); + TensorDesc biasDesc = biasTensor.get_desc(); + void *bias = get_ptr_from_tensor(biasTensor, arch); + U32 tmpBytes = tmpTensor.bytes(); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + TensorDesc scaleDesc = filterDesc; + + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = deconvolution_general(inputDesc, input, filterDesc, filter, convParamSpec, scaleDesc, + scale, biasDesc, bias, outputDesc, output, activationDesc); +#endif +#if defined(_USE_NEON) || defined(_USE_X86) + } else if (IS_X86_AVX2(arch) || IS_ARM(arch)) { + ret = deconvolution_cpu(inputDesc, input, filterDesc, filter, convParamSpec, algorithm, + scaleDesc, scale, biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc, + archInfo->arch); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = deconvolution_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (GCLMem_t)input, filterDesc, (GCLMem_t)filter, convParamSpec, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, scaleDesc, (GCLMem_t)scale, + biasDesc, (GCLMem_t)bias, tmpBytes, (GCLMem_t)tmp, outputDesc, (GCLMem_t)output, + activationDesc.mode); +#endif + } + return ret; +} diff --git a/compute/tensor/src/depth2space.cpp b/compute/tensor/src/depth2space.cpp new file mode 100644 index 00000000..a11bbdb9 --- /dev/null +++ b/compute/tensor/src/depth2space.cpp @@ -0,0 +1,79 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +EE depth2space_infer_output_size( + Tensor *inputTensor, Depth2SpaceParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + TensorDesc inputDesc = inputTensor->get_desc(); + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = depth2space_infer_output_size_mali( + inputDesc, p, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } + outputTensor->resize(outputDesc); + return ret; +} + +EE depth2space_infer_forward_tmp_bytes( + Tensor inputTensor, Depth2SpaceParamSpec p, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo) +{ + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + ret = depth2space_infer_tmpBuf_size_mali(inputDesc, p, outputDesc, bytes); +#endif + } + return ret; +} + +EE depth2space(Tensor inputTensor, + Depth2SpaceParamSpec p, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(arch)) { +#ifdef _USE_MALI + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + ret = depth2space_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (GCLMem_t)input, p, (GCLMem_t)tmp, outputDesc, (GCLMem_t)output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/depthwise_convolution.cpp b/compute/tensor/src/depthwise_convolution.cpp new file mode 100644 index 00000000..60213b13 --- /dev/null +++ b/compute/tensor/src/depthwise_convolution.cpp @@ -0,0 +1,306 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#include "cpu/tensor_computing_cpu.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/tensor_computing_x86.h" +#endif + +inline EE depthwise_convolution_infer_output_size_cpu(TensorDesc inputDesc, + TensorDesc filterDesc, + ConvolutionParamSpec convParamSpec, + TensorDesc *outputDesc, + DataType targetDataType) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt; + DataFormat idf, fdf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + if (fh < 1 || fw < 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + + U32 fhDilated = (fh - 1) * dilateH + 1; + U32 fwDilated = (fw - 1) * dilateW + 1; + + CHECK_REQUIREMENT(fdf == DF_NCHW || fdf == DF_NCHWC8); + oh = (ih + paddingT + paddingB - fhDilated) / strideH + 1; + ow = (iw + paddingL + paddingR - fwDilated) / strideW + 1; + + if (ic % 8 != 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + + *outputDesc = tensor4df(targetDataType, DF_NCHWC8, in, ic, oh, ow); + return SUCCESS; +} + +EE depthwise_convolution_infer_output_size(Tensor *inputTensor, + Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + Tensor *outputTensor, + DataType targetDataType, + ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc filterDesc = filterTensor.get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = depthwise_convolution_infer_output_size_mali( + inputDesc, filterDesc, convParamSpec, 
&outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + ret = depthwise_convolution_infer_output_size_cpu( + inputDesc, filterDesc, convParamSpec, &outputDesc, targetDataType); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE depthwise_convolution_infer_forward_algorithm(Tensor inputTensor, + Tensor filterTensor, + Tensor outputTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionPolicy policy, + DepthwiseConvolutionForwardAlgorithm *algorithm, + DataType targetDataType, + ActivationParamSpec depthwiseActivationParamSpec, + ArchInfo_t archInfo) +{ + EE ret = NOT_SUPPORTED; + auto arch = archInfo->arch; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = SUCCESS; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + *algorithm = DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT; + ret = SUCCESS; +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + *algorithm = DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT; + ret = SUCCESS; +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc filterDesc = filterTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + ret = depthwise_convolution_infer_forward_algorithm_mali( + ((MaliPara_t)(archInfo->archPara))->handle, inputDesc, filterDesc, outputDesc, + convParamSpec, policy, depthwiseActivationParamSpec.mode, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } + return ret; +} + +EE depthwise_convolution_transform_filter_bytes(Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + U32 *bytes, + ArchInfo_t archInfo) +{ + TensorDesc filterDesc = filterTensor.get_desc(); + + UNUSED(convParamSpec); + EE ret = NOT_SUPPORTED; + auto arch = archInfo->arch; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + *bytes = tensorNumBytes(filterDesc); + ret = SUCCESS; +#endif +#if defined(_USE_X86) || defined(_USE_NEON) + } else if (IS_CPU(arch)) { + ret = depthwise_convolution_transform_filter_bytes_cpu(filterDesc, algorithm, bytes); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = depthwise_convolution_transform_filter_bytes_mali(filterDesc, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, + ((MaliPara_t)(archInfo->archPara))->gclmemFilterDesc, bytes); +#endif + } + return ret; +} + +EE depthwise_convolution_transform_filter(Tensor filterTensor, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + Tensor *ftmTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc filterDesc = filterTensor.get_desc(); + void *filter = get_ptr_from_tensor(filterTensor, arch); + TensorDesc ftmDesc = ftmTensor->get_desc(); + void *filterTransformed = get_ptr_from_tensor(*ftmTensor, arch); + + UNUSED(convParamSpec); + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + UNI_memcpy(filterTransformed, filter, tensorNumBytes(filterDesc)); + ftmDesc = filterDesc; + ret = SUCCESS; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = depthwise_convolution_transform_filter_x86( + filterDesc, filter, algorithm, &ftmDesc, filterTransformed); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = depthwise_convolution_transform_filter_arm( + filterDesc, filter, algorithm, &ftmDesc, filterTransformed); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = 
depthwise_convolution_transform_filter_mali(((MaliPara_t)(archInfo->archPara))->handle,
+            filterDesc, (GCLMem_t)filter, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo,
+            &ftmDesc, (GCLMem_t)filterTransformed);
+#endif
+    }
+    ftmTensor->resize(ftmDesc);
+    return ret;
+}
+
+EE depthwise_convolution_infer_forward_tmp_bytes(Tensor inputTensor,
+    Tensor filterTensor,
+    Tensor outputTensor,
+    ConvolutionParamSpec convParamSpec,
+    DepthwiseConvolutionForwardAlgorithm algorithm,
+    U32 *bytes,
+    ArchInfo_t archInfo)
+{
+    TensorDesc inputDesc = inputTensor.get_desc();
+    TensorDesc filterDesc = filterTensor.get_desc();
+    TensorDesc outputDesc = outputTensor.get_desc();
+
+    EE ret = NOT_SUPPORTED;
+    auto arch = archInfo->arch;
+    if (IS_GENERAL(arch)) {
+#ifdef _USE_GENERAL
+        *bytes = 0;
+        ret = SUCCESS;
+#endif
+#ifdef _USE_X86
+    } else if (IS_X86_AVX2(arch)) {
+        ret = depthwise_convolution_infer_forward_tmp_bytes_x86(
+            inputDesc, outputDesc, convParamSpec, algorithm, bytes);
+#endif
+#ifdef _USE_NEON
+    } else if (IS_ARM(arch)) {
+        ret = depthwise_convolution_infer_forward_tmp_bytes_arm(
+            inputDesc, filterDesc, outputDesc, convParamSpec, algorithm, bytes);
+#endif
+#ifdef _USE_MALI
+    } else if (IS_MALI_GPU(arch)) {
+        ret = depthwise_convolution_infer_forward_tmp_bytes_mali(inputDesc, filterDesc, outputDesc,
+            convParamSpec, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, bytes);
+#endif
+    }
+    return ret;
+}
+
+EE depthwise_convolution(Tensor inputTensor,
+    Tensor filterTensor,
+    ConvolutionParamSpec convParamSpec,
+    DepthwiseConvolutionForwardAlgorithm algorithm,
+    Tensor biasTensor,
+    Tensor tmpTensor,
+    Tensor outputTensor,
+    ActivationParamSpec depthwiseActivationParamSpec,
+    ArchInfo_t archInfo)
+{
+    auto arch = archInfo->arch;
+    TensorDesc inputDesc = inputTensor.get_desc();
+    void *input = get_ptr_from_tensor(inputTensor, arch);
+    TensorDesc filterDesc = filterTensor.get_desc();
+    void *filter = get_ptr_from_tensor(filterTensor, arch);
+    TensorDesc biasDesc = biasTensor.get_desc();
+    void *bias = get_ptr_from_tensor(biasTensor, arch);
+    U32 tmpBytes = tmpTensor.bytes();
+    void *tmp = get_ptr_from_tensor(tmpTensor, arch);
+    TensorDesc outputDesc = outputTensor.get_desc();
+    void *output = get_ptr_from_tensor(outputTensor, arch);
+
+    EE ret = NOT_SUPPORTED;
+    if (IS_GENERAL(arch)) {
+#ifdef _USE_GENERAL
+        ret = depthwise_convolution_general(inputDesc, input, filterDesc, filter, convParamSpec,
+            biasDesc, bias, tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec);
+#endif
+#ifdef _USE_X86
+    } else if (IS_X86_AVX2(arch)) {
+        ret = depthwise_convolution_x86(inputDesc, input, filterDesc, filter, convParamSpec,
+            algorithm, biasDesc, bias, tmpBytes, tmp, outputDesc, output,
+            depthwiseActivationParamSpec, archInfo->arch);
+#endif
+#ifdef _USE_NEON
+    } else if (IS_ARM(arch)) {
+        ret = depthwise_convolution_arm(inputDesc, input, filterDesc, filter, convParamSpec,
+            algorithm, biasDesc, bias, tmpBytes, tmp, outputDesc, output,
+            depthwiseActivationParamSpec, archInfo->arch);
+#endif
+#ifdef _USE_MALI
+    } else if (IS_MALI_GPU(arch)) {
+        ret = depthwise_convolution_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc,
+            (GCLMem_t)input, filterDesc, (GCLMem_t)filter, convParamSpec,
+            ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, biasDesc, (GCLMem_t)bias, tmpBytes,
+            (GCLMem_t)tmp, outputDesc, (GCLMem_t)output, depthwiseActivationParamSpec.mode);
+#endif
+    }
+    return ret;
+}
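+
+// Numeric check of the output-size arithmetic in depthwise_convolution_infer_output_size_cpu
+// above (illustrative values only, not from the original sources): with ih = iw = 112,
+// fh = fw = 3, stride 2, padding 1 on each side and dilation 1,
+//   fhDilated = (3 - 1) * 1 + 1 = 3
+//   oh = (112 + 1 + 1 - 3) / 2 + 1 = 56, and likewise ow = 56.
diff --git a/compute/tensor/src/depthwise_pointwise_convolution.cpp 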
b/compute/tensor/src/depthwise_pointwise_convolution.cpp new file mode 100644 index 00000000..83617cc8 --- /dev/null +++ b/compute/tensor/src/depthwise_pointwise_convolution.cpp @@ -0,0 +1,355 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/tensor_computing_x86.h" +#endif + +inline EE depthwise_pointwise_convolution_infer_output_size_cpu(TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + ConvolutionParamSpec convParamSpec, + TensorDesc *outputDesc, + DataType targetDataType) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, fdt, fdt2; + DataFormat idf, fdf, fdf2; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 fn2, fc2, fh2, fw2; + U32 oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(pwFilterDesc, &fdt2, &fdf2, &fn2, &fc2, &fh2, &fw2)); + if (fh < 1 || fw < 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + + U32 strideH = convParamSpec.stride_h; + U32 strideW = convParamSpec.stride_w; + U32 paddingT = convParamSpec.padding_top; + U32 paddingB = convParamSpec.padding_bottom; + U32 paddingL = convParamSpec.padding_left; + U32 paddingR = convParamSpec.padding_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; + + U32 fhDilated = (fh - 1) * dilateH + 1; + U32 fwDilated = (fw - 1) * dilateW + 1; + + oh = (ih + paddingT + paddingB - fhDilated) / strideH + 1; + ow = (iw + paddingL + paddingR - fwDilated) / strideW + 1; + + if (fn2 % 8 != 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + + *outputDesc = tensor4df(targetDataType, DF_NCHWC8, in, fn2, oh, ow); + return SUCCESS; +} + +EE depthwise_pointwise_convolution_infer_output_size(Tensor *inputTensor, + Tensor dwFilterTensor, + Tensor pwFilterTensor, + ConvolutionParamSpec convParamSpec, + Tensor *outputTensor, + DataType targetDataType, + ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = 
outputTensor->get_desc(); + TensorDesc dwFilterDesc = dwFilterTensor.get_desc(); + TensorDesc pwFilterDesc = pwFilterTensor.get_desc(); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = depthwise_pointwise_convolution_infer_output_size_mali(inputDesc, dwFilterDesc, + pwFilterDesc, convParamSpec, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + ret = depthwise_pointwise_convolution_infer_output_size_cpu( + inputDesc, dwFilterDesc, pwFilterDesc, convParamSpec, &outputDesc, targetDataType); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE depthwise_pointwise_convolution_infer_forward_algorithm(Tensor inputTensor, + Tensor dwFilterTensor, + Tensor pwFilterTensor, + Tensor outputTensor, + ConvolutionParamSpec convParamSpec, + ConvolutionPolicy policy, + DepthwiseConvolutionForwardAlgorithm *algorithm, + DataType targetDataType, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + ArchInfo_t archInfo) +{ + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + TensorDesc dwFilterDesc = dwFilterTensor.get_desc(); + TensorDesc pwFilterDesc = pwFilterTensor.get_desc(); + + EE ret = NOT_SUPPORTED; + auto arch = archInfo->arch; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = SUCCESS; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + *algorithm = DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT; + ret = SUCCESS; +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = depthwise_pointwise_convolution_infer_forward_algorithm_arm(inputDesc, dwFilterDesc, + pwFilterDesc, outputDesc, convParamSpec, policy, algorithm, targetDataType); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = depthwise_pointwise_convolution_infer_forward_algorithm_mali( + ((MaliPara_t)(archInfo->archPara))->handle, inputDesc, dwFilterDesc, pwFilterDesc, + outputDesc, convParamSpec, policy, depthwiseActivationParamSpec.mode, + pointwiseActivationParamSpec.mode, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } + return ret; +} + +EE depthwise_pointwise_convolution_transform_filter_bytes(Tensor dwFilterTensor, + Tensor pwFilterTensor, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + U32 *dwBytes, + U32 *pwBytes, + ArchInfo_t archInfo) +{ + TensorDesc dwFilterDesc = dwFilterTensor.get_desc(); + TensorDesc pwFilterDesc = pwFilterTensor.get_desc(); + UNUSED(convParamSpec); + EE ret = NOT_SUPPORTED; + auto arch = archInfo->arch; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + *dwBytes = tensorNumBytes(dwFilterDesc); + *pwBytes = tensorNumBytes(pwFilterDesc); + ret = SUCCESS; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + *dwBytes = tensorNumBytes(dwFilterDesc) + 32; + *pwBytes = tensorNumBytes(pwFilterDesc) + 32; + ret = SUCCESS; +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + *dwBytes = tensorNumBytes(dwFilterDesc) + 32; + *pwBytes = tensorNumBytes(pwFilterDesc) + 32; + ret = SUCCESS; +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + GCLMemDesc_t gclmemFilterDesc = ((MaliPara_t)(archInfo->archPara))->gclmemFilterDesc; + GCLMemDesc_t gclmemDwFilterDesc = &gclmemFilterDesc[0]; + GCLMemDesc_t gclmemPwFilterDesc = 
&gclmemFilterDesc[1]; + ret = depthwise_pointwise_convolution_transform_filter_bytes_mali(dwFilterDesc, + pwFilterDesc, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, gclmemDwFilterDesc, + gclmemPwFilterDesc, dwBytes); + *pwBytes = 0; +#endif + } + return ret; +} + +EE depthwise_pointwise_convolution_transform_filter(Tensor dwFilterTensor, + Tensor pwFilterTensor, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + Tensor *dwFtm, + Tensor *pwFtm, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc dwFilterDesc = dwFilterTensor.get_desc(); + void *dwFilter = get_ptr_from_tensor(dwFilterTensor, arch); + TensorDesc pwFilterDesc = pwFilterTensor.get_desc(); + void *pwFilter = get_ptr_from_tensor(pwFilterTensor, arch); + TensorDesc dwFtmDesc = dwFtm->get_desc(); + void *dwFilterTransformed = get_ptr_from_tensor(*dwFtm, arch); + TensorDesc pwFtmDesc = pwFtm->get_desc(); + void *pwFilterTransformed = get_ptr_from_tensor(*pwFtm, arch); + UNUSED(convParamSpec); + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + UNI_memcpy(dwFilterTransformed, dwFilter, tensorNumBytes(dwFilterDesc)); + dwFtmDesc = dwFilterDesc; + UNI_memcpy(pwFilterTransformed, pwFilter, tensorNumBytes(pwFilterDesc)); + pwFtmDesc = pwFilterDesc; + ret = SUCCESS; +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = depthwise_pointwise_convolution_transform_filter_x86(dwFilterDesc, dwFilter, + pwFilterDesc, pwFilter, algorithm, &dwFtmDesc, dwFilterTransformed, &pwFtmDesc, + pwFilterTransformed); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = depthwise_pointwise_convolution_transform_filter_arm(dwFilterDesc, dwFilter, + pwFilterDesc, pwFilter, convParamSpec, algorithm, &dwFtmDesc, dwFilterTransformed, + &pwFtmDesc, pwFilterTransformed); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = depthwise_pointwise_convolution_transform_filter_mali( + ((MaliPara_t)(archInfo->archPara))->handle, dwFilterDesc, pwFilterDesc, + (GCLMem_t)dwFilter, (GCLMem_t)pwFilter, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, &dwFtmDesc, &pwFtmDesc, + (GCLMem_t)dwFilterTransformed, (GCLMem_t)pwFilterTransformed); +#endif + } + dwFtm->resize(dwFtmDesc); + pwFtm->resize(pwFtmDesc); + return ret; +} + +EE depthwise_pointwise_convolution_infer_forward_tmp_bytes(Tensor inputTensor, + Tensor dwFilterTensor, + Tensor pwFilterTensor, + Tensor outputTensor, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + U32 *bytes, + ArchInfo_t archInfo) +{ + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + TensorDesc dwFilterDesc = dwFilterTensor.get_desc(); + TensorDesc pwFilterDesc = pwFilterTensor.get_desc(); + + EE ret = NOT_SUPPORTED; + auto arch = archInfo->arch; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = depthwise_pointwise_convolution_infer_forward_tmp_bytes_general( + inputDesc, dwFilterDesc, pwFilterDesc, outputDesc, convParamSpec, algorithm, bytes); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = depthwise_convolution_infer_forward_tmp_bytes_x86( + inputDesc, outputDesc, convParamSpec, algorithm, bytes); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = depthwise_pointwise_convolution_infer_forward_tmp_bytes_arm( + inputDesc, dwFilterDesc, pwFilterDesc, outputDesc, convParamSpec, algorithm, bytes); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = 
depthwise_pointwise_convolution_infer_forward_tmp_bytes_mali(inputDesc, dwFilterDesc, + pwFilterDesc, outputDesc, convParamSpec, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, bytes); +#endif + } + return ret; +} + +EE depthwise_pointwise_convolution(Tensor inputTensor, + Tensor dwFilterTensor, + Tensor pwFilterTensor, + ConvolutionParamSpec convParamSpec, + DepthwiseConvolutionForwardAlgorithm algorithm, + Tensor dwBiasTensor, + Tensor pwBiasTensor, + Tensor tmpTensor, + Tensor outputTensor, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + U32 tmpBytes = tmpTensor.bytes(); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + TensorDesc dwFilterDesc = dwFilterTensor.get_desc(); + void *dwFilter = get_ptr_from_tensor(dwFilterTensor, arch); + TensorDesc pwFilterDesc = pwFilterTensor.get_desc(); + void *pwFilter = get_ptr_from_tensor(pwFilterTensor, arch); + TensorDesc dwBiasDesc = dwBiasTensor.get_desc(); + void *dwBias = get_ptr_from_tensor(dwBiasTensor, arch); + TensorDesc pwBiasDesc = pwBiasTensor.get_desc(); + void *pwBias = get_ptr_from_tensor(pwBiasTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = depthwise_pointwise_convolution_general(inputDesc, input, dwFilterDesc, dwFilter, + pwFilterDesc, pwFilter, convParamSpec, dwBiasDesc, dwBias, pwBiasDesc, pwBias, tmpBytes, + tmp, outputDesc, output, depthwiseActivationParamSpec, pointwiseActivationParamSpec); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = depthwise_pointwise_convolution_x86(inputDesc, input, dwFilterDesc, dwFilter, + pwFilterDesc, pwFilter, convParamSpec, algorithm, dwBiasDesc, dwBias, pwBiasDesc, + pwBias, tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec, + pointwiseActivationParamSpec, archInfo->arch); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = depthwise_pointwise_convolution_arm(inputDesc, input, dwFilterDesc, dwFilter, + pwFilterDesc, pwFilter, convParamSpec, algorithm, dwBiasDesc, dwBias, pwBiasDesc, + pwBias, tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec, + pointwiseActivationParamSpec, archInfo->arch); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = depthwise_pointwise_convolution_mali(((MaliPara_t)(archInfo->archPara))->handle, + inputDesc, (GCLMem_t)input, dwFilterDesc, pwFilterDesc, (GCLMem_t)dwFilter, + (GCLMem_t)pwFilter, convParamSpec, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, + dwBiasDesc, pwBiasDesc, (GCLMem_t)dwBias, (GCLMem_t)pwBias, tmpBytes, (GCLMem_t)tmp, + outputDesc, (GCLMem_t)output, depthwiseActivationParamSpec.mode, + pointwiseActivationParamSpec.mode); +#endif + } + return ret; +} diff --git a/compute/tensor/src/detectionoutput.cpp b/compute/tensor/src/detectionoutput.cpp new file mode 100644 index 00000000..c20344ba --- /dev/null +++ b/compute/tensor/src/detectionoutput.cpp @@ -0,0 +1,85 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "tensor_computing.h"
+#ifdef _USE_CPU
+#include "cpu/tensor_computing_cpu.h"
+#endif
+
+inline EE detectionoutput_infer_output_size_cpu(std::vector<TensorDesc> inputDesc,
+    DetectionOutputParamSpec detectionOutputParamSpec,
+    TensorDesc *outputDesc)
+{
+    if (inputDesc.size() != 3) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+    if (nullptr == outputDesc) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    DataType idt0, idt2;
+    DataFormat idf0, idf2;
+    U32 ih0, iw0;
+    U32 in2, ic2, ilens2;
+    // loc
+    CHECK_STATUS(tensor2dGet(inputDesc[0], &idt0, &idf0, &ih0, &iw0));
+    // priorbox
+    CHECK_STATUS(tensor3dGet(inputDesc[2], &idt2, &idf2, &in2, &ic2, &ilens2));
+    CHECK_REQUIREMENT(iw0 == ilens2);
+    // output size
+    U32 oh, ow;
+    // oh = one leading row that stores the number of detected boxes (1) + the maximum number of detected boxes (keep_top_k)
+    U32 num_detected_max = detectionOutputParamSpec.keep_top_k;
+    oh = 1 + num_detected_max;
+    // Each row is a 6-dimensional vector storing [label, confidence, xmin, ymin, xmax, ymax] -> 6
+    // The first row is [number of detected boxes, 0, 0, 0, 0, 0]
+    ow = 6;
+    *outputDesc = tensor2df(idt0, idf2, oh, ow);
+    return SUCCESS;
+}
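+
+// Reading the (1 + keep_top_k) x 6 output (illustrative sketch only, not part of the
+// original sources; variable names are hypothetical and an F32 output is assumed):
+//
+//   F32 *out = (F32 *)get_ptr_from_tensor(outputTensor, arch);
+//   U32 numBoxes = (U32)out[0];  // row 0 is [num_detected, 0, 0, 0, 0, 0]
+//   for (U32 i = 1; i <= numBoxes; i++) {
+//       F32 label = out[i * 6 + 0];
+//       F32 confidence = out[i * 6 + 1];
+//       F32 xmin = out[i * 6 + 2], ymin = out[i * 6 + 3];
+//       F32 xmax = out[i * 6 + 4], ymax = out[i * 6 + 5];
+//   }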
+
+EE detectionoutput_infer_output_size(std::vector<Tensor *> inputTensor,
+    DetectionOutputParamSpec detectionOutputParamSpec,
+    Tensor *outputTensor,
+    ArchInfo_t archInfo)
+{
+    UNUSED(archInfo);
+    if (outputTensor == nullptr) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    std::vector<TensorDesc> inputDesc = get_desc_from_tensor_ptrs(inputTensor);
+    TensorDesc outputDesc = outputTensor->get_desc();
+    CHECK_STATUS(
+        detectionoutput_infer_output_size_cpu(inputDesc, detectionOutputParamSpec, &outputDesc));
+    outputTensor->resize(outputDesc);
+    return SUCCESS;
+}
+
+EE detectionoutput(std::vector<Tensor> inputTensor,
+    DetectionOutputParamSpec detectionOutputParamSpec,
+    Tensor outputTensor,
+    ArchInfo_t archInfo)
+{
+    auto arch = archInfo->arch;
+    std::vector<TensorDesc> inputDesc = get_desc_from_tensors(inputTensor);
+    std::vector<void *> input = get_data_from_tensors(inputTensor, arch);
+    TensorDesc outputDesc = outputTensor.get_desc();
+    void *output = get_ptr_from_tensor(outputTensor, arch);
+
+    EE ret = NOT_SUPPORTED;
+    if (IS_CPU(arch)) {
+#ifdef _USE_CPU
+        ret = detectionoutput_cpu(inputDesc, input, detectionOutputParamSpec, outputDesc, output);
+#endif
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/eltwise.cpp b/compute/tensor/src/eltwise.cpp
new file mode 100644
index 00000000..329781e4
--- /dev/null
+++ b/compute/tensor/src/eltwise.cpp
@@ -0,0 +1,176 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "tensor_computing.h"
+#if defined(_USE_GENERAL) || defined(_USE_X86) || defined(_USE_NEON)
+#include "cpu/tensor_computing_cpu.h"
+#endif
+#ifdef _USE_MALI
+#include "gpu/mali/tensor_computing_mali.h"
+#endif
+
+// [1, 10, 10] + [1, 10, 10] = [1, 10, 10]
+// [1, 10, 1] + [1, 1, 10] = [1, 10, 10]
+// [1, 20, 10] + [10] = [1, 20, 10]
+inline EE eltwise_infer_output_size_cpu(std::vector<TensorDesc> inputDesc, TensorDesc *outputDesc)
+{
+    if (nullptr == outputDesc) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    U32 num = inputDesc.size();
+    if (num == 0) {
+        return NOT_MATCH;
+    }
+
+    if (num == 1) {
+        *outputDesc = inputDesc[0];
+        return SUCCESS;
+    }
+
+    U32 arrayDimMax = 0;
+    U32 minDims = inputDesc[0].nDims;
+    for (U32 i = 1; i < num; i++) {
+        if (inputDesc[i].nDims > inputDesc[arrayDimMax].nDims) {
+            arrayDimMax = i;
+        }
+        if (inputDesc[i].nDims < minDims) {
+            minDims = inputDesc[i].nDims;
+        }
+    }
+    U32 nchwc8Count = 0;
+    for (U32 i = 0; i < num; i++) {
+        if (inputDesc[i].df == DF_NCHWC8) {
+            nchwc8Count++;
+            // Output from 1D-conv + 3D tensors
+            if (inputDesc[i].dims[0] == 1 && minDims == 3) {
+                inputDesc[i] = tensor3df(inputDesc[i].dt, DF_NCHW,
+                    inputDesc[i].dims[3], inputDesc[i].dims[2], inputDesc[i].dims[1]);
+            }
+        }
+    }
+
+    U32 dim = inputDesc[arrayDimMax].nDims;
+    *outputDesc = inputDesc[arrayDimMax];
+
+    if (nchwc8Count > 0 && nchwc8Count != num) {
+        outputDesc->df = DF_NCHW;
+    }
+
+    for (U32 i = 0; i < dim; i++) {
+        for (U32 j = 0; j < num; j++) {
+            if (inputDesc[j].nDims > i) {
+                outputDesc->dims[i] = UNI_MAX(outputDesc->dims[i], inputDesc[j].dims[i]);
+            }
+        }
+    }
+    return SUCCESS;
+}
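+
+// Worked example of the rules above (illustrative, not from the original sources):
+// TensorDesc stores dims innermost-first, so [1, 20, 10] has dims = {10, 20, 1} and
+// [10] has dims = {10}. arrayDimMax picks the 3-dim input; the per-index UNI_MAX loop
+// then yields dims = {10, 20, 1}, i.e. the broadcast shape [1, 20, 10].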
+
+EE eltwise_infer_output_size(
+    std::vector<Tensor *> inputTensor, Tensor *outputTensor, ArchInfo_t archInfo)
+{
+    if (outputTensor == nullptr) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    std::vector<TensorDesc> inputDesc = get_desc_from_tensor_ptrs(inputTensor);
+    TensorDesc outputDesc = outputTensor->get_desc();
+    EE ret = NOT_SUPPORTED;
+    if (IS_MALI_GPU(archInfo->arch)) {
+#ifdef _USE_MALI
+        std::vector<GCLMemDesc> gclmemInputDescs;
+        for (auto p : inputTensor) {
+            gclmemInputDescs.push_back(ocl_get_desc(*p));
+        }
+        GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor);
+        ret = eltwise_infer_output_size_mali(
+            inputDesc, &outputDesc, gclmemInputDescs.data(), &gclmemOutputDesc);
+        for (U32 i = 0; i < inputTensor.size(); i++) {
+            ocl_set_desc(inputTensor[i], gclmemInputDescs[i]);
+        }
+        ocl_set_desc(outputTensor, gclmemOutputDesc);
+#endif
+    } else {
+        ret = eltwise_infer_output_size_cpu(inputDesc, &outputDesc);
+    }
+    outputTensor->resize(outputDesc);
+    return ret;
+}
+
+EE eltwise_infer_forward_tmp_bytes(
+    std::vector<Tensor> inputTensor, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo)
+{
+    std::vector<TensorDesc> inputDesc = get_desc_from_tensors(inputTensor);
+    UNUSED(outputTensor);
+
+    *bytes = 0;
+    U32 nchwc8Count = 0;
+    for (U32 i = 0; i < inputDesc.size(); i++) {
+        if (inputDesc[i].df == DF_NCHWC8) {
+            nchwc8Count++;
+            *bytes += tensorNumBytes(inputDesc[i]);
+        }
+    }
+    // No layout-conversion buffer is needed when all inputs agree on the format.
+    if (nchwc8Count == inputDesc.size() || nchwc8Count == 0) {
+        *bytes = 0;
+    }
+    return SUCCESS;
+}
+
+#ifdef _USE_INT8
+inline void eltwise_process_int8(F32 scale, U8 **tmp, TensorDesc *desc, U8 **input)
+{
+    INT8 *inQ = (INT8 *)(*input);
+    dequantize_int8_to_fp16(tensorNumElements(*desc), inQ, scale, (F16 *)*tmp);
+    desc->dt = DT_F16;
+    *input = *tmp;
+    // Advance by the byte size of the dequantized fp16 data, not the element count.
+    *tmp += tensorNumBytes(*desc);
+}
+#endif
+
+EE eltwise(std::vector<Tensor> inputTensor,
+    EltwiseParamSpec eltwiseDesc,
+    Tensor tmpTensor,
+    Tensor outputTensor,
+    ArchInfo_t archInfo)
+{
+    auto arch = archInfo->arch;
+    std::vector<TensorDesc> inputDesc = get_desc_from_tensors(inputTensor);
+    std::vector<void *> input = get_data_from_tensors(inputTensor, arch);
+    U32 tmpBytes = tmpTensor.bytes();
+    void *tmp = get_ptr_from_tensor(tmpTensor, arch);
+    TensorDesc outputDesc = outputTensor.get_desc();
+    void *output = get_ptr_from_tensor(outputTensor, arch);
+#ifdef _USE_INT8
+    if (!IS_MALI_GPU(arch)) {
+        for (U32 i = 0; i < inputTensor.size(); i++) {
+            if (inputDesc[i].dt == DT_I8) {
+                F32 scale = inputTensor[i].get_scale();
+                eltwise_process_int8(scale, (U8 **)&tmp, &inputDesc[i], (U8 **)&input[i]);
+            }
+        }
+    }
+#endif
+
+    EE ret = NOT_SUPPORTED;
+    if (IS_CPU(arch)) {
+#ifdef _USE_CPU
+        ret = eltwise_cpu(inputDesc, input, eltwiseDesc, tmpBytes, tmp, outputDesc, output, arch);
+#endif
+#ifdef _USE_MALI
+    } else if (IS_MALI_GPU(arch)) {
+        ret = eltwise_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, input,
+            eltwiseDesc, outputDesc, (GCLMem_t)output);
+#endif
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/embedding.cpp b/compute/tensor/src/embedding.cpp
new file mode 100644
index 00000000..b81a89cb
--- /dev/null
+++ b/compute/tensor/src/embedding.cpp
@@ -0,0 +1,97 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
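+
+// Embedding looks up one p.num_output-wide row of the weight matrix per input token id.
+// Minimal sketch of the semantics (assumptions: F32 weights and U32 token ids; the real
+// kernels are embedding_cpu and embedding_mali below):
+//
+//   const U32 *ids = (const U32 *)input;  // batch * step token indices
+//   F32 *dst = (F32 *)output;
+//   for (U32 i = 0; i < batch * step; i++) {
+//       memcpy(dst + i * p.num_output,
+//           (const F32 *)weight + ids[i] * p.num_output, p.num_output * sizeof(F32));
+//   }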
+ +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +EE embedding_infer_output_size(Tensor *inputTensor, + EmbedParamSpec p, + DataType outputDt, + Tensor *outputTensor, + ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = embedding_infer_output_size_mali( + inputDesc, p, outputDt, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif +#ifdef _USE_CPU + } else { + DataType dt; + DataFormat df; + U32 batch, step; + bool inputOneDim = false; + if (inputDesc.nDims == 1) { + inputOneDim = true; + inputDesc.nDims = 2; + inputDesc.dims[1] = 1; + } + CHECK_REQUIREMENT(tensorIs2d(inputDesc)); + CHECK_STATUS(tensor2dGet(inputDesc, &dt, &df, &batch, &step)); + outputDesc = tensor3df(outputDt, DF_MTK, batch, step, p.num_output); + if (inputOneDim) { + outputDesc.nDims = 2; + outputDesc.df = DF_NORMAL; + } + ret = SUCCESS; +#endif + } + outputTensor->resize(outputDesc); + return ret; +} + +EE embedding(Tensor inputTensor, + Tensor weightTensor, + EmbedParamSpec p, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + void *weight = get_ptr_from_tensor(weightTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(arch)) { +#ifdef _USE_MALI + TensorDesc weightDesc = weightTensor.get_desc(); + ret = embedding_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, + weightDesc, (GCLMem_t)weight, p, outputDesc, (GCLMem_t)output); +#endif +#ifdef _USE_CPU + } else { + ret = embedding_cpu(inputDesc, input, weight, p, outputDesc, output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/fully_connected.cpp b/compute/tensor/src/fully_connected.cpp new file mode 100644 index 00000000..0d8e0848 --- /dev/null +++ b/compute/tensor/src/fully_connected.cpp @@ -0,0 +1,451 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <string.h>
+
+#include "tensor_computing.h"
+#include "blas_enhance.h"
+#ifdef _USE_MALI
+#include "gpu/mali/tensor_computing_mali.h"
+#endif
+
+// input format: NCHW|NCHWC8|NORMAL
+// weight(filter) format: NORMAL
+// result format: NORMAL
+
+inline EE fully_connected_infer_output_size_cpu(
+    TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc *outputDesc)
+{
+    if (outputDesc == nullptr) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+
+    DataType idt, fdt;
+    DataFormat idf, fdf;
+    U32 in, ic, ih, iw;
+    U32 fh, fw;
+    if (tensorIs2d(inputDesc)) {
+        CHECK_STATUS(tensor2dGet(inputDesc, &idt, &idf, &in, &iw));
+        ic = 1;
+        ih = 1;
+    } else if (tensorIs4d(inputDesc)) {
+        CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
+        if (idf != DF_NCHW && idf != DF_NCHWC8) {
+            CHECK_STATUS(NOT_MATCH);
+        }
+    } else {
+        return NOT_MATCH;
+    }
+
+    CHECK_REQUIREMENT(tensorIs2d(filterDesc));
+    CHECK_STATUS(tensor2dGet(filterDesc, &fdt, &fdf, &fh, &fw));
+    if (fdf != DF_TRANSPOSE) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+
+    if (fw != ic * ih * iw) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+
+    *outputDesc = tensor2df(idt, DF_NORMAL, in, fh);
+    return SUCCESS;
+}
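+
+// Shape example for the check above (illustrative values only): an NCHW input of
+// in = 1, ic = 64, ih = iw = 7 flattens to fw = 64 * 7 * 7 = 3136, so a DF_TRANSPOSE
+// filter with fh = 1000, fw = 3136 is accepted and the result is a (1, 1000) NORMAL tensor.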
+
+EE fully_connected_infer_output_size(
+    Tensor *inputTensor, Tensor filterTensor, Tensor *outputTensor, ArchInfo_t archInfo)
+{
+    if (inputTensor == nullptr) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    if (outputTensor == nullptr) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    TensorDesc inputDesc = inputTensor->get_desc();
+    TensorDesc filterDesc = filterTensor.get_desc();
+    TensorDesc outputDesc = outputTensor->get_desc();
+    EE ret = NOT_SUPPORTED;
+    if (IS_MALI_GPU(archInfo->arch)) {
+#ifdef _USE_MALI
+        GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor);
+        GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor);
+        ret = fully_connected_infer_output_size_mali(
+            inputDesc, filterDesc, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc);
+        ocl_set_desc(inputTensor, gclmemInputDesc);
+        ocl_set_desc(outputTensor, gclmemOutputDesc);
+#endif
+    } else {
+        ret = fully_connected_infer_output_size_cpu(inputDesc, filterDesc, &outputDesc);
+    }
+    outputTensor->resize(outputDesc);
+    return ret;
+}
+
+EE fully_connected_infer_forward_algorithm(
+    Tensor inputTensor, Tensor filterTensor, Tensor outputTensor, ArchInfo_t archInfo)
+{
+    EE ret = NOT_SUPPORTED;
+    if (IS_MALI_GPU(archInfo->arch)) {
+#ifdef _USE_MALI
+        TensorDesc inputDesc = inputTensor.get_desc();
+        TensorDesc filterDesc = filterTensor.get_desc();
+        TensorDesc outputDesc = outputTensor.get_desc();
+        std::vector<TensorDesc> outputDescs;
+        outputDescs.push_back(outputDesc);
+        ret = fully_connected_infer_forward_algorithm_mali(
+            ((MaliPara_t)(archInfo->archPara))->handle, inputDesc, filterDesc, outputDescs,
+            ((MaliPara_t)(archInfo->archPara))->forwardRunInfo);
+#endif
+    } else {
+        UNUSED(inputTensor);
+        UNUSED(filterTensor);
+        UNUSED(outputTensor);
+    }
+    return ret;
+}
+
+EE fully_connected_infer_forward_tmp_bytes(
+    Tensor inputTensor, Tensor filterTensor, U32 *bytes, ArchInfo_t archInfo)
+{
+    TensorDesc inputDesc = inputTensor.get_desc();
+    TensorDesc filterDesc = filterTensor.get_desc();
+    // Match dt in int8 inference
+    inputDesc.dt = filterDesc.dt;
+
+    EE ret = NOT_SUPPORTED;
+    if (IS_MALI_GPU(archInfo->arch)) {
+#ifdef _USE_MALI
+        ret = fully_connected_infer_forward_tmp_bytes_mali(
+            inputDesc, filterDesc, bytes, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo);
+#endif
+    } else {
+        if (bytes == nullptr) {
+            CHECK_STATUS(NULL_POINTER);
+        }
+        DataType idt;
+        DataFormat idf;
+        U32 in, ic, ih, iw;
+        if (tensorIs2d(inputDesc)) {
+            CHECK_STATUS(tensor2dGet(inputDesc, &idt, &idf, &in, &iw));
+            ic = ih = 1;
+        } else if (tensorIs4d(inputDesc)) {
+            CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
+        } else {
+            return NOT_MATCH;
+        }
+
+        if (in != 1) {
+            // call gemm
+            TensorDesc in_desc = tensor2df(idt, DF_NORMAL, in, ic * ih * iw);
+            ret = matrix_matrix_multiply_tmp_bytes(in_desc, filterDesc, bytes, archInfo->arch);
+        } else {
+            // call gemv
+            TensorDesc in_desc = tensor1d(idt, ic * ih * iw);
+            ret = matrix_vector_multiply_tmp_bytes(filterDesc, in_desc, bytes, archInfo->arch);
+        }
+        if (DT_I8 == filterDesc.dt) {
+            if (DT_F16 == inputTensor.get_desc().dt) {
+                *bytes += tensorNumBytes(inputDesc);
+            }
+            *bytes += filterDesc.dims[0] * bytesOf(DT_I32);  // Bias
+            *bytes += in * filterDesc.dims[1] * bytesOf(DT_I32);  // Results before quantization
+        }
+    }
+    return ret;
+}
+
+EE fully_connected_transform_filter_bytes(Tensor filterTensor, U32 *bytes, ArchInfo_t archInfo)
+{
+    TensorDesc filterDesc = filterTensor.get_desc();
+
+    if (IS_MALI_GPU(archInfo->arch)) {
+#ifdef _USE_MALI
+        CHECK_STATUS(fully_connected_transform_filter_bytes_mali(filterDesc,
+            ((MaliPara_t)(archInfo->archPara))->gclmemFilterDesc, bytes,
+            ((MaliPara_t)(archInfo->archPara))->forwardRunInfo));
+#endif
+    } else {
+        if (bytes == nullptr) {
+            CHECK_STATUS(NULL_POINTER);
+        }
+        *bytes = tensorNumBytes(filterDesc) + 32;
+    }
+    return SUCCESS;
+}
+
+template <typename T>
+EE fully_connected_transform_filter_kernel(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    const void *filter,
+    TensorDesc *ftmDesc,
+    void *filterTransformed)
+{
+    if (filter == nullptr || ftmDesc == nullptr || filterTransformed == nullptr) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+
+    DataType idt, fdt;
+    DataFormat idf, fdf;
+    U32 in, ic, ih, iw;
+    U32 fh, fw;
+    if (tensorIs2d(inputDesc)) {
+        CHECK_STATUS(tensor2dGet(inputDesc, &idt, &idf, &in, &iw));
+        ic = ih = 1;
+    } else if (tensorIs4d(inputDesc)) {
+        CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
+    } else {
+        return NOT_MATCH;
+    }
+    CHECK_STATUS(tensor2dGet(filterDesc, &fdt, &fdf, &fh, &fw));
+
+    if (fw != ic * ih * iw) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+    bool need_transpose = false;
+    if (in > 1) {
+        need_transpose = true;
+    }
+
+    if (idf == DF_NCHW || idf == DF_NORMAL) {
+        if (need_transpose) {
+            T *f_ptr = (T *)filter;
+            T *ftm_ptr = (T *)filterTransformed;
+            for (U32 h = 0; h < fh; h++) {
+                for (U32 w = 0; w < fw; w++) {
+                    U32 f_index = h * fw + w;
+                    U32 ftm_index = w * fh + h;
+                    ftm_ptr[ftm_index] = f_ptr[f_index];
+                }
+            }
+        } else {
+            memcpy(filterTransformed, filter, tensorNumBytes(filterDesc));
+        }
+    } else if (idf == DF_NCHWC8) {
+        U32 align = 8;
+        U32 ic_new = ic / align;
+        T *f_ptr = (T *)filter;
+        T *ftm_ptr = (T *)filterTransformed;
+        for (U32 h = 0; h < fh; h++) {
+            for (U32 w = 0; w < fw; w++) {
+                U32 i_n = w / (ic * ih * iw);
+                U32 remain = w % (ic * ih * iw);
+                U32 i_c = remain / (ih * iw);
+                remain = remain % (ih * iw);
+                U32 i_h = remain / iw;
+                U32 i_w = remain % iw;
+                U32 i_c_outer = i_c / align;
+                U32 i_c_inner = i_c % align;
+                U32 h_new = h;
+                U32 w_new = (((i_n * ic_new + i_c_outer) * ih + i_h) * iw + i_w) * align + i_c_inner;
+                U32 ld = fw;
+                if (need_transpose) {
+                    U32 tmp = h_new;
+                    h_new = w_new;
+                    w_new = tmp;
+                    ld = fh;
+                }
+                U32 f_index = h * fw + w;
+                U32 ftm_index = h_new * ld + w_new;
+                ftm_ptr[ftm_index] = f_ptr[f_index];
+            }
+        }
+    } else {
+        return NOT_MATCH;
+    }
+
+    U32 fh_after = fh;
+    U32 fw_after = fw;
+    if (need_transpose) {
+        fh_after = fw;
+        fw_after = fh;
+    }
+    *ftmDesc = tensor2df(fdt, DF_NORMAL, fh_after, fw_after);
+    return SUCCESS;
+}
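+
+// Illustrative example of the transform above (not part of the original sources): for a
+// batched NCHW/NORMAL input (in > 1, so need_transpose is true), a 2x3 filter stored
+// row-major as
+//   {a, b, c,
+//    d, e, f}
+// becomes the 3x2 matrix {a, d, b, e, c, f}, letting the batched path run as a plain
+// GEMM; with in == 1 the filter is copied unchanged and the GEMV path is used instead.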
+
+EE fully_connected_transform_filter(
+    Tensor inputTensor, Tensor filterTensor, Tensor *ftmTensor, ArchInfo_t archInfo)
+{
+    auto arch = archInfo->arch;
+    TensorDesc inputDesc = inputTensor.get_desc();
+    TensorDesc filterDesc = filterTensor.get_desc();
+    void *filter = get_ptr_from_tensor(filterTensor, arch);
+    TensorDesc ftmDesc = ftmTensor->get_desc();
+    void *filterTransformed = get_ptr_from_tensor(*ftmTensor, arch);
+
+    EE ret = NOT_SUPPORTED;
+    if (IS_MALI_GPU(arch)) {
+#ifdef _USE_MALI
+        std::vector<GCLMem_t> filterTransVec;
+        filterTransVec.push_back((GCLMem_t)filterTransformed);
+        ret = fully_connected_transform_filter_mali(((MaliPara_t)(archInfo->archPara))->handle,
+            filterDesc, (GCLMem_t)filter, &ftmDesc, filterTransVec,
+            ((MaliPara_t)(archInfo->archPara))->forwardRunInfo);
+#endif
+    } else {
+        switch (filterDesc.dt) {
+#ifdef _USE_FP16
+            case DT_F16: {
+                ret = fully_connected_transform_filter_kernel<F16>(
+                    inputDesc, filterDesc, filter, &ftmDesc, filterTransformed);
+                break;
+            }
+#endif
+#ifdef _USE_FP32
+            case DT_F32: {
+                ret = fully_connected_transform_filter_kernel<F32>(
+                    inputDesc, filterDesc, filter, &ftmDesc, filterTransformed);
+                break;
+            }
+#endif
+            default:
+                ret = NOT_SUPPORTED;
+                break;
+        }
+    }
+    ftmTensor->resize(ftmDesc);
+    return ret;
+}
+
+EE fully_connected(Tensor inputTensor,
+    Tensor filterTensor,
+    Tensor biasTensor,
+    Tensor tmpTensor,
+    Tensor outputTensor,
+    ArchInfo_t archInfo)
+{
+    auto arch = archInfo->arch;
+    TensorDesc inputDesc = inputTensor.get_desc();
+    void *input = get_ptr_from_tensor(inputTensor, arch);
+    TensorDesc filterDesc = filterTensor.get_desc();
+    void *filter = get_ptr_from_tensor(filterTensor, arch);
+    TensorDesc biasDesc = biasTensor.get_desc();
+    void *bias = get_ptr_from_tensor(biasTensor, arch);
+    U32 tmpBytes = tmpTensor.bytes();
+    void *tmp = get_ptr_from_tensor(tmpTensor, arch);
+    TensorDesc outputDesc = outputTensor.get_desc();
+    void *output = get_ptr_from_tensor(outputTensor, arch);
+
+    EE ret = NOT_SUPPORTED;
+    if (IS_MALI_GPU(arch)) {
+#ifdef _USE_MALI
+        std::vector<GCLMem_t> filterVec;
+        std::vector<GCLMem_t> biasVec;
+        std::vector<GCLMem_t> outputVec;
+        filterVec.push_back((GCLMem_t)filter);
+        biasVec.push_back((GCLMem_t)bias);
+        outputVec.push_back((GCLMem_t)output);
+        ret = fully_connected_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc,
+            (GCLMem_t)input, filterDesc, &filterVec, biasDesc, &biasVec, tmpBytes, (GCLMem_t)tmp,
+            outputDesc, &outputVec, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo);
+#endif
+    } else {
+        if (input == nullptr || filter == nullptr || output == nullptr) {
+            CHECK_STATUS(NULL_POINTER);
+        }
+
+#ifdef _USE_INT8
+        F32 scaleI = inputTensor.get_scale();
+        if (DT_I8 == filterDesc.dt) {
+            if (DT_F16 == inputDesc.dt) {
+                F16 *inD = (F16 *)input;
+                INT8 *inQ = (INT8 *)tmp;
+                F16 scale = scaleI;
+                quantize_tensor(inputDesc, inD, &inputDesc, inQ, &scale);
+                scaleI = scale;
+                input = (U8 *)tmp;
+                tmp = (U8 *)tmp + tensorNumBytes(inputDesc);
+            }
+            if (nullptr != bias) {
+                if (DT_F16 == outputDesc.dt) {  // dequantize and then add bias
+                    bias = nullptr;
+                } else {
+                    CHECK_REQUIREMENT(DT_I8 == outputDesc.dt);
+                    biasDesc.dt = DT_I32;
+                    F16 *biasF = (F16 *)bias;
+                    I32 *biasI 
= (I32 *)tmp; + F32 scale = scaleI * filterTensor.get_scale(); + for (U32 i = 0; i < tensorNumElements(biasDesc); i++) { + biasI[i] = round(scale * biasF[i]); + } + bias = tmp; + tmp = (U8 *)tmp + tensorNumBytes(biasDesc); + } + } + outputDesc.dt = DT_I32; + output = tmp; + tmp = (U8 *)tmp + tensorNumBytes(outputDesc); + } +#endif + + U32 in, ic, ih, iw; + U32 oh, ow; + U32 fh, fw, bw; + DataType idt, fdt, odt, bdt; + DataFormat idf, fdf, odf, bdf; + if (tensorIs2d(inputDesc)) { + CHECK_STATUS(tensor2dGet(inputDesc, &idt, &idf, &in, &iw)); + ic = ih = 1; + } else if (tensorIs4d(inputDesc)) { + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + } else { + CHECK_STATUS(NOT_MATCH); + } + + CHECK_REQUIREMENT(tensorIs2d(filterDesc)); + CHECK_STATUS(tensor2dGet(filterDesc, &fdt, &fdf, &fh, &fw)); + CHECK_STATUS(tensor2dGet(outputDesc, &odt, &odf, &oh, &ow)); + + if (bias != nullptr) { + CHECK_STATUS(tensor1dGet(biasDesc, &bdt, &bdf, &bw)); + + if (bw != ow) { + CHECK_STATUS(NOT_MATCH); + } else { + U8 *outArray = (U8 *)output; + U32 size = tensorNumBytes(biasDesc); + for (U32 i = 0; i < in; i++) { + memcpy(outArray + i * size, bias, size); + } + } + } else { + memset(output, 0, tensorNumBytes(outputDesc)); + } + if (in == 1 && + fdf != targetFormat4MatrixB(fdt)) { // If weight is transformed for mmm, don't run as mvm + TensorDesc vectorDesc = tensor1d(idt, ic * ih * iw); + TensorDesc resultDesc = tensor1d(odt, ow); + ret = matrix_vector_multiply(filterDesc, filter, vectorDesc, input, tmpBytes, tmp, + resultDesc, output, archInfo->arch); + } else { + TensorDesc in_desc = tensor2df(idt, DF_NORMAL, in, ic * ih * iw); + ret = matrix_matrix_multiply(in_desc, input, filterDesc, filter, tmpBytes, tmp, + outputDesc, output, archInfo->arch); + } +#ifdef _USE_INT8 + F32 scale = scaleI * filterTensor.get_scale(); + if (DT_I8 == filterDesc.dt) { + if (DT_I8 == outputTensor.get_desc().dt) { + CHECK_STATUS(quantize_tensor(outputDesc, output, &outputDesc, + get_ptr_from_tensor(outputTensor, arch), &scale)); + outputTensor.set_scale(scale); + } else { + CHECK_REQUIREMENT(DT_F16 == outputTensor.get_desc().dt); + F16 *biasF = (F16 *)get_ptr_from_tensor(biasTensor, arch); + U32 biasLen = nullptr == biasF ? 0 : tensorNumElements(biasDesc); + dequantize_int32_to_fp16(tensorNumElements(outputDesc), (I32 *)output, scale, + (F16 *)get_ptr_from_tensor(outputTensor, arch), biasLen, biasF); + } + } +#endif + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/activation.cpp b/compute/tensor/src/gpu/mali/activation.cpp new file mode 100644 index 00000000..0aeaea05 --- /dev/null +++ b/compute/tensor/src/gpu/mali/activation.cpp @@ -0,0 +1,96 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "sys.h"
+#include "types.h"
+#include "tensor_desc.h"
+#include "error.h"
+#include "gpu/mali/tensor_computing_mali.h"
+#include "gpu/mali/fp16/activation_mali_fp16.h"
+
+EE activation_infer_output_size_mali(TensorDesc inputDesc,
+    TensorDesc *outputDesc,
+    GCLMemDesc_t gclmemInputDesc,
+    GCLMemDesc_t gclmemOutputDesc)
+{
+    /* tensorDesc records the original CPU data format */
+    /* gclmemDesc records the transformed GPU data layout */
+    if (outputDesc) {
+        *outputDesc = inputDesc;
+    }
+
+    DataType idt;
+    DataFormat idf;
+    U32 iw, ih, ic, in;
+    tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw);
+    CHECK_STATUS(infer_gclmem_desc_ncwhc4(
+        iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc));
+    if (gclmemInputDesc && gclmemOutputDesc) {
+        *gclmemOutputDesc = *gclmemInputDesc;
+    }
+    return SUCCESS;
+}
+
+inline EE activation_checkpara_mali(GCLHandle_t handle,
+    TensorDesc inputDesc,
+    GCLMem_t input,
+    TensorDesc outputDesc,
+    GCLMem_t output,
+    ActivationMode activationMode)
+{
+    if (handle == nullptr || nullptr == input || nullptr == output) {
+        return NULL_POINTER;
+    }
+    if (inputDesc.df != outputDesc.df) {
+        return NOT_SUPPORTED;
+    }
+    if (input->desc.memFormat != DF_NCWHC4) {
+        return NOT_SUPPORTED;
+    }
+    if (output->desc.memFormat != DF_NCWHC4) {
+        return NOT_SUPPORTED;
+    }
+    if (activationMode != ACTIVATION_NULL && activationMode != ACTIVATION_RELU &&
+        activationMode != ACTIVATION_RELU6 && activationMode != ACTIVATION_H_SIGMOID &&
+        activationMode != ACTIVATION_H_SWISH && activationMode != ACTIVATION_GELU &&
+        activationMode != ACTIVATION_TANH && activationMode != ACTIVATION_SIGMOID) {
+        return NOT_SUPPORTED;
+    }
+    return SUCCESS;
+}
+
+EE activation_mali(GCLHandle_t handle,
+    TensorDesc inputDesc,
+    GCLMem_t input,
+    TensorDesc outputDesc,
+    GCLMem_t output,
+    ActivationMode activationMode)
+{
+    EE ret = SUCCESS;
+    CHECK_STATUS(
+        activation_checkpara_mali(handle, inputDesc, input, outputDesc, output, activationMode));
+    switch (inputDesc.dt) {
+        case DT_F16: {
+            ret = activation_mali_fp16(handle, inputDesc, input, outputDesc, output, activationMode);
+            break;
+        }
+        case DT_I8: {
+            ret = NOT_SUPPORTED;
+            break;
+        }
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/gpu/mali/argmax.cpp b/compute/tensor/src/gpu/mali/argmax.cpp
new file mode 100644
index 00000000..7d8b6096
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/argmax.cpp
@@ -0,0 +1,117 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "sys.h"
+#include "types.h"
+#include "tensor_desc.h"
+#include "error.h"
+#include "gpu/mali/tensor_computing_mali.h"
+#include "gpu/mali/fp16/argmax_mali_fp16.h"
+
+EE argmax_infer_output_size_mali(TensorDesc inputDesc,
+    ArgMaxParamSpec p,
+    TensorDesc *outputDesc,
+    GCLMemDesc_t gclmemInputDesc,
+    GCLMemDesc_t gclmemOutputDesc)
+{
+    /* tensorDesc records the original CPU data format */
+    /* gclmemDesc records the transformed GPU data layout */
+    int axis = p.axis;
+    TensorDesc desc = inputDesc;
+    if (axis < 0) {
+        axis += inputDesc.nDims;
+    }
+    axis = inputDesc.nDims - 1 - axis;
+    for (int i = axis; i < (I32)(inputDesc.nDims) - 1; i++) {
+        desc.dims[i] = desc.dims[i + 1];
+    }
+    desc.nDims = inputDesc.nDims - 1;
+    desc.dt = DT_U32;
+    if (outputDesc) {
+        *outputDesc = desc;
+    }
+
+    if (gclmemInputDesc || gclmemOutputDesc) {
+        U32 iw, ih, ic;
+        U32 ow, oh, oc;
+        U32 inDims = inputDesc.nDims;
+        U32 onDims = desc.nDims;
+        DataType idt = inputDesc.dt;
+        DataType odt = desc.dt;
+        iw = inputDesc.dims[0];
+        ih = (inDims > 1) ? inputDesc.dims[1] : 1;
+        ic = (inDims > 2) ? inputDesc.dims[2] : 1;
+        ow = desc.dims[0];
+        oh = (onDims > 1) ? desc.dims[1] : 1;
+        oc = (onDims > 2) ? desc.dims[2] : 1;
+        // Align the reduced dimension to a multiple of 8.
+        U32 iw_align = (axis == 0) ? (iw + 7) / 8 * 8 : iw;
+        U32 ih_align = (axis == 1) ? (ih + 7) / 8 * 8 : ih;
+        U32 ic_align = (axis == 2) ? (ic + 7) / 8 * 8 : ic;
+        bool need_pad = false;
+        if (iw_align != iw || ih_align != ih || ic_align != ic) {
+            need_pad = true;
+        }
+        CHECK_STATUS(infer_gclmem_desc_nchw(iw_align, ih_align, ic_align, 0, 0, ow, oh, oc, idt,
+            odt, gclmemInputDesc, gclmemOutputDesc, need_pad));
+    }
+    return SUCCESS;
+}
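+
+// Example of the alignment above (illustrative, not from the original sources): reducing
+// over axis 0 with iw = 100 gives iw_align = (100 + 7) / 8 * 8 = 104, so the kernel can
+// always step through the reduced dimension in whole 8-element groups; need_pad then
+// requests the padded GPU allocation.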
+
+inline EE argmax_checkpara_mali(GCLHandle_t handle, GCLMem_t input, GCLMem_t tmpbuf, GCLMem_t output)
+{
+    if (handle == nullptr || input == nullptr || output == nullptr || tmpbuf == nullptr) {
+        return NULL_POINTER;
+    }
+    if (input->desc.memFormat != output->desc.memFormat || input->desc.memFormat != DF_NCHW) {
+        return NOT_SUPPORTED;
+    }
+    return SUCCESS;
+}
+
+EE argmax_infer_forward_tmp_bytes_mali(
+    TensorDesc inputDesc, ArgMaxParamSpec p, TensorDesc outputDesc, U32 *bytes)
+{
+    EE ret = SUCCESS;
+    switch (inputDesc.dt) {
+        case DT_F16: {
+            ret = argmax_infer_forward_tmp_bytes_mali_fp16(inputDesc, p.axis, outputDesc, bytes);
+            break;
+        }
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
+
+EE argmax_mali(GCLHandle_t handle,
+    TensorDesc inputDesc,
+    GCLMem_t input,
+    ArgMaxParamSpec p,
+    GCLMem_t tmpbuf,
+    TensorDesc outputDesc,
+    GCLMem_t output)
+{
+    EE ret = SUCCESS;
+    CHECK_STATUS(argmax_checkpara_mali(handle, input, tmpbuf, output));
+    switch (inputDesc.dt) {
+        case DT_F16: {
+            ret = argmax_mali_fp16(handle, inputDesc, input, p.axis, tmpbuf, outputDesc, output);
+            break;
+        }
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/gpu/mali/bilateral_slice_apply.cpp b/compute/tensor/src/gpu/mali/bilateral_slice_apply.cpp
new file mode 100644
index 00000000..95b414a3
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/bilateral_slice_apply.cpp
@@ -0,0 +1,214 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
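+
+// bilateral_slice_apply samples a low-resolution grid of per-pixel affine coefficients
+// (selected by the guide image) and applies them to the input, in the style of bilateral
+// guided upsampling. Per-pixel sketch of the has_offset == true case, where
+// coefficient_len == ic * (ic + 1) == 12 for RGB (illustrative pseudo-code only; the
+// coefficient layout is an assumption, and the real kernels are the fp16/uchar
+// implementations included below):
+//
+//   // coef: 12 floats sliced from the grid at (x, y, guide(x, y))
+//   for (int c = 0; c < 3; c++) {
+//       out(x, y, c) = coef[c * 4 + 0] * in(x, y, 0) + coef[c * 4 + 1] * in(x, y, 1)
+//           + coef[c * 4 + 2] * in(x, y, 2) + coef[c * 4 + 3];  // affine offset term
+//   }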
+ +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/bilateral_slice_apply_mali_fp16.h" +#include "gpu/mali/uchar/bilateral_slice_apply_mali_uchar.h" + +inline EE bilateral_slice_apply_checkpara_mali_common(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc guideDesc, + const GCLMem_t guide, + TensorDesc gridDesc, + const GCLMem_t grid, + BilateralSliceApplyParamSpec bilateralSliceApplyParamSpec, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (nullptr == handle || nullptr == input || nullptr == grid || nullptr == output) { + return NULL_POINTER; + } + if (bilateralSliceApplyParamSpec.mode == BSliceApply_NULL && nullptr == guide) { + return NULL_POINTER; + } + if (inputDesc.df != guideDesc.df || inputDesc.df != gridDesc.df) { + return NOT_SUPPORTED; + } + if (inputDesc.df != outputDesc.df || inputDesc.df != DF_NHWC) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[0] != guideDesc.dims[0] || inputDesc.dims[1] != guideDesc.dims[1]) { + return NOT_MATCH; + } + if (inputDesc.dims[0] != outputDesc.dims[0] || inputDesc.dims[1] != outputDesc.dims[1]) { + return NOT_MATCH; + } + if (inputDesc.dims[2] != outputDesc.dims[2]) { + return NOT_MATCH; + } + if ((gridDesc.dims[2] % bilateralSliceApplyParamSpec.coefficient_len) != 0) { + return NOT_MATCH; + } + if (bilateralSliceApplyParamSpec.has_offset == true) { + if (bilateralSliceApplyParamSpec.coefficient_len != + inputDesc.dims[2] * (inputDesc.dims[2] + 1)) { + return NOT_MATCH; + } + if (bilateralSliceApplyParamSpec.coefficient_len != 12) { + return NOT_SUPPORTED; + } + } else { + return NOT_SUPPORTED; + // if(bilateralSliceApplyParamSpec.coefficient_len != inputDesc.dims[2] * inputDesc.dims[2]) return NOT_MATCH; + // if(bilateralSliceApplyParamSpec.coefficient_len != 9) return NOT_SUPPORTED; + } + return SUCCESS; +} + +EE bilateral_slice_apply_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc guideDesc, + TensorDesc gridDesc, + BilateralSliceApplyParamSpec bilateralSliceApplyParamSpec, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemGuideDesc, + GCLMemDesc_t gclmemGridDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + UNUSED(bilateralSliceApplyParamSpec); + DataType idt, gdt, guide_dt; + DataFormat idf, gdf; + U32 guide_w, guide_h, guide_c, guide_n; + U32 iw, ih, ic, in; + U32 ow, oh, oc, on; + U32 gw, gh, gc, gn; + + if (inputDesc.df != DF_NHWC || guideDesc.df != DF_NHWC) { + return NOT_MATCH; + } + tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); + tensorSelectGet(guideDesc, &guide_dt, &gdf, &guide_n, &guide_c, &guide_h, &guide_w); + tensorSelectGet(gridDesc, &gdt, &gdf, &gn, &gc, &gh, &gw); + ow = guide_w; + oh = guide_h; + oc = ic; + on = guide_n; + if (outputDesc) { + *outputDesc = tensor4df(idt, idf, on, oc, oh, ow); + } + CHECK_STATUS(infer_gclmem_desc_nhwc( + iw, ih, ic, 0, 0, ow, oh, oc, idt, idt, gclmemInputDesc, gclmemOutputDesc)); + + if (gclmemGridDesc && gclmemGuideDesc) { + U32 s0, s1, s2; + U32 num, byteSize; + s0 = gc; + s1 = gw; + s2 = gh; + num = s0 * s1 * s2; + byteSize = s0 * s1 * s2 * bytesOf(gdt); + gclmemGridDesc->stride[0] = s0; + gclmemGridDesc->stride[1] = s1; + gclmemGridDesc->stride[2] = s2; + gclmemGridDesc->offset[0] = 0; + gclmemGridDesc->offset[1] = 0; + gclmemGridDesc->offset[2] = 0; + gclmemGridDesc->num = num; + gclmemGridDesc->byteSize = byteSize; + gclmemGridDesc->memType = GCL_MEM_BUF; + gclmemGridDesc->memFormat = DF_NHWC; + 
gclmemGridDesc->flags = CL_MEM_READ_WRITE; + gclmemGridDesc->host_ptr = NULL; + gclmemGridDesc->need_pad = false; + + if (bilateralSliceApplyParamSpec.mode == BSliceApply_NULL) { + s0 = guide_c; + s1 = guide_w; + s2 = guide_h; + num = s0 * s1 * s2; + byteSize = s0 * s1 * s2 * bytesOf(guide_dt); + gclmemGuideDesc->stride[0] = s0; + gclmemGuideDesc->stride[1] = s1; + gclmemGuideDesc->stride[2] = s2; + gclmemGuideDesc->offset[0] = 0; + gclmemGuideDesc->offset[1] = 0; + gclmemGuideDesc->offset[2] = 0; + gclmemGuideDesc->num = num; + gclmemGuideDesc->byteSize = byteSize; + gclmemGuideDesc->memType = GCL_MEM_BUF; + gclmemGuideDesc->memFormat = DF_NHWC; + gclmemGuideDesc->flags = CL_MEM_READ_WRITE; + gclmemGuideDesc->host_ptr = NULL; + gclmemGuideDesc->need_pad = false; + } + } + return SUCCESS; +} + +EE bilateral_slice_apply_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc guideDesc, + TensorDesc gridDesc, + BilateralSliceApplyParamSpec bilateralSliceApplyParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + UNUSED(inputDesc); + UNUSED(guideDesc); + UNUSED(gridDesc); + UNUSED(bilateralSliceApplyParamSpec); + UNUSED(forwardRunInfo); + + DataType dt; + U32 gc, gw; + U32 ih; + tensorSelectGet(gridDesc, &dt, NULL, NULL, &gc, NULL, &gw); + tensorSelectGet(inputDesc, NULL, NULL, NULL, NULL, &ih, NULL); + *bytes = gc * gw * ih * bytesOf(dt); + return SUCCESS; +} + +EE bilateral_slice_apply_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc guideDesc, + const GCLMem_t guide, + TensorDesc gridDesc, + const GCLMem_t grid, + BilateralSliceApplyParamSpec bilateralSliceApplyParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output) +{ + EE ret = SUCCESS; + CHECK_STATUS(bilateral_slice_apply_checkpara_mali_common(handle, inputDesc, input, guideDesc, + guide, gridDesc, grid, bilateralSliceApplyParamSpec, outputDesc, output)); + switch (inputDesc.dt) { + case DT_F16: { + ret = bilateral_slice_apply_mali_fp16(handle, inputDesc, input, guideDesc, guide, + gridDesc, grid, bilateralSliceApplyParamSpec, forwardRunInfo, tmpBytes, tmpBuf, + outputDesc, output); + break; + } + case DT_U8: { + ret = bilateral_slice_apply_mali_uchar(handle, inputDesc, input, guideDesc, guide, + gridDesc, grid, bilateralSliceApplyParamSpec, forwardRunInfo, tmpBytes, tmpBuf, + outputDesc, output); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/channel_resize.cpp b/compute/tensor/src/gpu/mali/channel_resize.cpp new file mode 100644 index 00000000..2fa5f14f --- /dev/null +++ b/compute/tensor/src/gpu/mali/channel_resize.cpp @@ -0,0 +1,84 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/channel_resize_mali_fp16.h" + +EE channel_resize_infer_output_size_mali(TensorDesc inputDesc, + ChannelResizeParamSpec p, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + if (outputDesc == nullptr || gclmemInputDesc == nullptr || gclmemOutputDesc == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt; + DataFormat idf; + U32 in, ic, ih, iw; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_REQUIREMENT(((int)ic == p.channel_before)); + if (p.group != 1) { + return NOT_SUPPORTED; + } + + *outputDesc = tensor4df(idt, idf, in, p.channel_after, ih, iw); + if (gclmemInputDesc->memFormat == DF_NCHW || gclmemInputDesc->byteSize == 0) { + CHECK_STATUS( + infer_gclmem_desc_nchw(iw, ih, ic, 0, 0, 0, 0, 0, idt, idt, gclmemInputDesc, NULL)); + } else { + CHECK_STATUS( + infer_gclmem_desc_ncwhc4(iw, ih, ic, 0, 0, 0, 0, 0, idt, idt, gclmemInputDesc, NULL)); + } + CHECK_STATUS(infer_gclmem_desc_nchw( + 0, 0, 0, 0, 0, iw, ih, p.channel_after, idt, idt, NULL, gclmemOutputDesc)); + return SUCCESS; +} + +inline EE channel_resize_checkpara_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + ChannelResizeParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (handle == nullptr || input == nullptr || output == nullptr) { + return NULL_POINTER; + } + return SUCCESS; +} + +EE channel_resize_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + ChannelResizeParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + EE ret = SUCCESS; + CHECK_STATUS(channel_resize_checkpara_mali(handle, inputDesc, input, p, outputDesc, output)); + switch (inputDesc.dt) { + case DT_F16: { + ret = channel_resize_mali_fp16(handle, inputDesc, input, p, outputDesc, output); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/check.cpp b/compute/tensor/src/gpu/mali/check.cpp new file mode 100644 index 00000000..83710dfa --- /dev/null +++ b/compute/tensor/src/gpu/mali/check.cpp @@ -0,0 +1,153 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" + +EE check_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputADesc, + GCLMemDesc_t gclmemInputBDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + /*tensorDesc record cpu org data format info*/ + /*gclmemDesc record gpu trans data format info*/ + if (outputDesc) { + (*outputDesc).dt = DT_I32; + (*outputDesc).nDims = 1; + (*outputDesc).dims[0] = inputDesc.dims[inputDesc.nDims - 1]; + } + DataType idt = inputDesc.dt; + U32 ndims = inputDesc.nDims; + U32 iw = inputDesc.dims[0]; + U32 ih = (ndims > 1) ? inputDesc.dims[1] : 1; + U32 ic = (ndims > 2) ? inputDesc.dims[2] : 1; + U32 in = (ndims > 3) ? inputDesc.dims[3] : 1; + if (in > 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + CHECK_STATUS(infer_gclmem_desc_nchw( + iw, ih, ic, 0, 0, 1, 1, 1, idt, DT_I32, gclmemInputADesc, gclmemOutputDesc)); + CHECK_STATUS( + infer_gclmem_desc_nchw(iw, ih, ic, 0, 0, 0, 0, 0, idt, idt, gclmemInputBDesc, NULL)); + return SUCCESS; +} + +inline EE check_checkpara_mali(GCLHandle_t handle, + TensorDesc inputDescA, + GCLMem_t inputA, + TensorDesc inputDescB, + GCLMem_t inputB, + CheckParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (handle == nullptr || inputA == nullptr || inputB == nullptr || output == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (inputA->desc.memFormat != output->desc.memFormat || + inputB->desc.memFormat != output->desc.memFormat || inputA->desc.memFormat != DF_NCHW) { + CHECK_STATUS(NOT_SUPPORTED); + } + if (inputDescA.dt == DT_I32 || inputDescA.dt == DT_U32) { + if (inputDescB.dt != DT_I32 && inputDescB.dt != DT_U32) { + CHECK_STATUS(NOT_MATCH); + } + } + if (outputDesc.dt != DT_I32) { + CHECK_STATUS(NOT_MATCH); + } + if (p.check_mode != CHECK_EQUAL) { + CHECK_STATUS(NOT_SUPPORTED); + } + return SUCCESS; +} + +inline EE check_core_mali(GCLHandle_t handle, + TensorDesc inputDescA, + GCLMem_t inputA, + TensorDesc inputDescB, + GCLMem_t inputB, + CheckParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + U32 ndims = inputDescA.nDims; + U32 iw = inputDescA.dims[0]; + U32 ih = (ndims > 1) ? inputDescA.dims[1] : 1; + U32 ic = (ndims > 2) ? 
inputDescA.dims[2] : 1; + if (iw == 1 && ih == 1 && ic == 1) { + U32 aw_str, ah_str, aw_off, ah_off; + U32 bw_str, bh_str, bw_off, bh_off; + U32 ow_str, oh_str, ow_off, oh_off; + get_gclmem_dim(inputA->desc, &aw_str, &ah_str, NULL, &aw_off, &ah_off); + get_gclmem_dim(inputB->desc, &bw_str, &bh_str, NULL, &bw_off, &bh_off); + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + U32 gs = 1; + U32 ls = 0; + U32 dim = 1; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "check_int_spe", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs( + kernel, aw_off, bw_off, ow_off, gs, inputA->mem, inputB->mem, output->mem)); + gcl_set_kernelVec(handle, kernel, dim, &gs, &ls, "check_int_spe"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs, &ls, "check_int_spe")); + CHECK_STATUS(gcl_print_memory(handle, inputA, "clip_inputA")); + CHECK_STATUS(gcl_print_memory(handle, inputB, "clip_inputB")); + CHECK_STATUS(gcl_print_memory(handle, output, "clip_output")); +#endif + return SUCCESS; + } + return NOT_SUPPORTED; +} + +EE check_mali(GCLHandle_t handle, + TensorDesc inputDescA, + GCLMem_t inputA, + TensorDesc inputDescB, + GCLMem_t inputB, + CheckParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + EE ret = SUCCESS; + CHECK_STATUS( + check_checkpara_mali(handle, inputDescA, inputA, inputDescB, inputB, p, outputDesc, output)); + DataType dt = inputDescA.dt; + if (dt == DT_U32) { + dt = DT_I32; + } + switch (dt) { + case DT_F16: { + ret = NOT_SUPPORTED; + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + case DT_I32: { + ret = check_core_mali( + handle, inputDescA, inputA, inputDescB, inputB, p, outputDesc, output); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/cl/activation.cl b/compute/tensor/src/gpu/mali/cl/activation.cl new file mode 100644 index 00000000..856c7fb2 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/activation.cl @@ -0,0 +1,58 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
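check_infer_output_size_mali above emits a one-dimensional DT_I32 tensor with one element per batch, taken from the slowest input dimension. A minimal C++ mirror of that shape rule (Desc here is an illustrative stand-in for TensorDesc, not from the patch):

    #include <cstdio>

    struct Desc { int nDims; int dims[4]; };  // dims[0] is the fastest axis

    // The check output holds one int per batch entry.
    Desc checkOutputDesc(const Desc &in)
    {
        Desc out = {1, {0, 0, 0, 0}};
        out.dims[0] = in.dims[in.nDims - 1];  // batch count
        return out;
    }

    int main() {
        Desc in = {4, {8, 8, 3, 2}};  // w=8, h=8, c=3, n=2
        printf("out: %d element(s)\n", checkOutputDesc(in).dims[0]);  // 2
        return 0;
    }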
+ +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, AC, H) base##AC##H +#define MANGLE_NAME(base, AC, H) MANGLE_NAME_IMPL(base, AC, H) +__kernel void MANGLE_NAME(activation_, AC, H)(const int h, + const int w, + const int cd4, + const int ce4, + const int ih_str, + const int iw_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + __global T *input, + __global T *output) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + int idz = get_global_id(2); + if (idx >= h || idy >= w) { + return; + } + + T4 val; + int in_off = (idz * iw_str + idy + iw_off) * ih_str + idx + ih_off; + val = vload4(in_off, input); + ACTIVATION_V4(val); +#if defined(USE_TANH) || defined(USE_SIGMOID) || defined(USE_HSIGMOID) || defined(USE_GELU) + if (idz == cd4 - 1) { + if (ce4 < 2) { + val.y = 0; + } + if (ce4 < 3) { + val.z = 0; + } + if (ce4 < 4) { + val.w = 0; + } + } +#endif + int out_off = (idz * ow_str + idy + ow_off) * oh_str + idx + oh_off; + vstore4(val, out_off, output); +} diff --git a/compute/tensor/src/gpu/mali/cl/argmax_x.cl b/compute/tensor/src/gpu/mali/cl/argmax_x.cl new file mode 100644 index 00000000..7e3fe903 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/argmax_x.cl @@ -0,0 +1,136 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
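The activation kernel just above works on channel blocks of four (vload4/vstore4), so the last block can carry up to three padding lanes. Activations that do not map zero to zero (the sigmoid family) would turn those padding lanes non-zero, which is why the kernel re-zeros them using ce4, the count of valid channels in the final block. A scalar C++ sketch of that masking, assuming sigmoid for concreteness:

    #include <cmath>
    #include <cstdio>

    int main() {
        float val[4] = {0.f, 0.f, 0.f, 0.f};  // one 4-lane block, like a T4
        int ce4 = 2;                          // two real channels in the block
        for (int i = 0; i < 4; ++i) {
            val[i] = 1.f / (1.f + std::exp(-val[i]));  // sigmoid(0) == 0.5
        }
        // Without masking, the two padding lanes would now hold 0.5.
        if (ce4 < 2) { val[1] = 0; }
        if (ce4 < 3) { val[2] = 0; }
        if (ce4 < 4) { val[3] = 0; }
        printf("%g %g %g %g\n", val[0], val[1], val[2], val[3]);  // 0.5 0.5 0 0
        return 0;
    }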
+ +#define get_max(val, dim) \ + { \ + dim.s0 = 0; \ + dim.s1 = 1; \ + dim.s2 = 2; \ + dim.s3 = 3; \ + if (val.s4 > val.s0) { \ + val.s0 = val.s4; \ + dim.s0 = 4; \ + } \ + if (val.s5 > val.s1) { \ + val.s1 = val.s5; \ + dim.s1 = 5; \ + } \ + if (val.s6 > val.s2) { \ + val.s2 = val.s6; \ + dim.s2 = 6; \ + } \ + if (val.s7 > val.s3) { \ + val.s3 = val.s7; \ + dim.s3 = 7; \ + } \ + if (val.s2 > val.s0) { \ + val.s0 = val.s2; \ + dim.s0 = dim.s2; \ + } \ + if (val.s3 > val.s1) { \ + val.s1 = val.s3; \ + dim.s1 = dim.s3; \ + } \ + if (val.s1 > val.s0) { \ + val.s0 = val.s1; \ + dim.s0 = dim.s1; \ + } \ + } + +#if defined(USE_INDEX) +__kernel void argmax_x_index +#else +__kernel void argmax_x +#endif + (const int iw_str, + const int ih_str, + const int iw_off, + const int ih_off, + const int ow_str, + const int oh_str, + const int ow_off, + const int oh_off, + const int len, + const int bx, + const int by, + __global const T *in, + __global const uint *ini, + __global T *outv, + __global uint *outi) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } + int bn = len >> 3; + int en = len & 7; + T8 val; + uchar4 dim; + T maxval = -65504; + uint maxIndex = 1; + const int in_off = (idz * ih_str + idy + ih_off) * iw_str + iw_off; + for (int i = idx; i < bn; i += bx) { + val = vload8(i, in + in_off); + get_max(val, dim); + if (val.s0 > maxval) { + maxval = val.s0; + maxIndex = (i << 3) + dim.s0; + } + } + + if (en != 0 && idx == bx - 1) { + int be = len - 8; + int rx = 0; + if (be < 0) { + be = 0; + rx = -be; + } + val = vload8(0, in + in_off + be); + if (rx > 0) { + val.s7 = -65504; + if (rx > 1) { + val.s6 = -65504; + } + if (rx > 2) { + val.s5 = -65504; + } + if (rx > 3) { + val.s4 = -65504; + } + if (rx > 4) { + val.s3 = -65504; + } + if (rx > 5) { + val.s2 = -65504; + } + if (rx > 6) { + val.s1 = -65504; + } + } + get_max(val, dim); + if (val.s0 > maxval) { + maxval = val.s0; + maxIndex = be + dim.s0; + } + } + int out_off = (idz * oh_str + idy + oh_off) * ow_str + idx + ow_off; +#if defined(USE_INDEX) + maxIndex = ini[maxIndex]; +#endif + if (bx > 1) { + outv[out_off] = maxval; + } + outi[out_off] = maxIndex; +} diff --git a/tensor_computing/src/gpu/mali/cl/bilateral_slice_apply_c12.cl b/compute/tensor/src/gpu/mali/cl/bilateral_slice_apply_c12.cl similarity index 61% rename from tensor_computing/src/gpu/mali/cl/bilateral_slice_apply_c12.cl rename to compute/tensor/src/gpu/mali/cl/bilateral_slice_apply_c12.cl index b43bf2a9..1f4028ea 100644 --- a/tensor_computing/src/gpu/mali/cl/bilateral_slice_apply_c12.cl +++ b/compute/tensor/src/gpu/mali/cl/bilateral_slice_apply_c12.cl @@ -11,35 +11,32 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
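argmax_x.cl below splits a row among bx work-items, each scanning eight elements per step through the get_max macro and masking stale tail lanes with -65504 (the most negative fp16 value); when bx > 1, each work-item emits a partial (value, index) pair for a second reduction pass. A sequential C++ reference of the same blocked scan for a single work-item (reference code, not from the patch):

    #include <cstdio>
    #include <utility>

    // Blocked argmax over len values, mirroring one work-item with bx = 1.
    std::pair<float, unsigned> argmaxBlocked(const float *in, int len)
    {
        float maxval = -65504.f;  // fp16 lowest, as in the kernel
        unsigned maxIndex = 0;
        for (int base = 0; base < len; base += 8) {      // 8-wide steps
            for (int j = 0; j < 8 && base + j < len; ++j) {
                if (in[base + j] > maxval) {
                    maxval = in[base + j];
                    maxIndex = base + j;
                }
            }
        }
        return {maxval, maxIndex};
    }

    int main() {
        float v[11] = {0, 3, 1, 9, 2, 8, 7, 4, 5, 10, 6};
        auto r = argmaxBlocked(v, 11);
        printf("max=%g at %u\n", r.first, r.second);  // max=10 at 9
        return 0;
    }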
- - - - #if defined(USE_HALF) -#define READ_IMAGE(image, sampler, coord) read_imageh(image, sampler, coord) -#define WRITE_IMAGE(image, coord, data) write_imageh(image, coord, data) +#define READ_IMAGE(image, sampler, coord) read_imageh(image, sampler, coord) +#define WRITE_IMAGE(image, coord, data) write_imageh(image, coord, data) #else -#define READ_IMAGE(image, sampler, coord) read_imagef(image, sampler, coord) -#define WRITE_IMAGE(image, coord, data) write_imagef(image, coord, data) +#define READ_IMAGE(image, sampler, coord) read_imagef(image, sampler, coord) +#define WRITE_IMAGE(image, coord, data) write_imagef(image, coord, data) #endif __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; /*these parameters are belong to matrix mult/add and conv*/ /*they are extract from HDR model*/ /*they may be changful for different model*/ -#define guide_cal(v, g){\ - T3 tmp;\ - tmp.x = v.x * (T)0.900616 - v.y * (T)0.1006 - v.z * (T)0.058384 + (T)0.072721;\ - tmp.y =-v.x * (T)0.079311 + v.y * (T)0.91976 - v.z * (T)0.037624 + (T)0.124359;\ - tmp.z =-v.x * (T)0.068347 - v.y * (T)0.069032 + v.z * (T)0.975032 + (T)0.129721;\ - tmp.x = (tmp.x < 0) ? 0 : tmp.x;\ - tmp.y = (tmp.y < 0) ? 0 : tmp.y;\ - tmp.z = (tmp.z < 0) ? 0 : tmp.z;\ - tmp.x = tmp.x * (T)0.003211 * 16;\ - tmp.y = tmp.y * (T)0.007948 * 16;\ - tmp.z = tmp.z * (T)0.046259 * 16;\ - g = tmp.x * (T)0.249512 + tmp.y * (T)0.274577 + tmp.z * (T)0.324276 + (T)0.078941;\ -} +#define guide_cal(v, g) \ + { \ + T3 tmp; \ + tmp.x = v.x * (T)0.900616 - v.y * (T)0.1006 - v.z * (T)0.058384 + (T)0.072721; \ + tmp.y = -v.x * (T)0.079311 + v.y * (T)0.91976 - v.z * (T)0.037624 + (T)0.124359; \ + tmp.z = -v.x * (T)0.068347 - v.y * (T)0.069032 + v.z * (T)0.975032 + (T)0.129721; \ + tmp.x = (tmp.x < 0) ? 0 : tmp.x; \ + tmp.y = (tmp.y < 0) ? 0 : tmp.y; \ + tmp.z = (tmp.z < 0) ? 
0 : tmp.z; \ + tmp.x = tmp.x * (T)0.003211 * 16; \ + tmp.y = tmp.y * (T)0.007948 * 16; \ + tmp.z = tmp.z * (T)0.046259 * 16; \ + g = tmp.x * (T)0.249512 + tmp.y * (T)0.274577 + tmp.z * (T)0.324276 + (T)0.078941; \ + } #if defined(CONV) #if defined(UCHAR) @@ -54,22 +51,38 @@ __kernel void bilateral_slice_apply_c12_uchar __kernel void bilateral_slice_apply_c12 #endif #endif - (const int w, const int wh, const int gc, const int gw, const int gh, const int gcw, const int gd, const int coe, const int bx, const int by, - const float scale_x, const float scale_y, global const T* guide, global const T* grid, -#if defined (UCHAR) - global const uchar* input, - global uchar* out){ + (const int w, + const int wh, + const int gc, + const int gw, + const int gh, + const int gcw, + const int gd, + const int coe, + const int bx, + const int by, + const float scale_x, + const float scale_y, + global const T *guide, + global const T *grid, +#if defined(UCHAR) + global const uchar *input, + global uchar *out) +{ #else - global const T* input, - global T* out){ -#endif - + global const T *input, + global T *out) +{ +#endif + const int x = get_global_id(0); const int y = get_global_id(1); - if(x >= bx || y >= by) return; + if (x >= bx || y >= by) { + return; + } int in_off = y * w + x; T3 in_val; -#if defined (UCHAR) +#if defined(UCHAR) uchar3 tmp = vload3(0, input + in_off * 3); in_val.x = tmp.x / 256.0; in_val.y = tmp.y / 256.0; @@ -80,23 +93,33 @@ __kernel void bilateral_slice_apply_c12 T gx = (x + (T)0.5) * (T)scale_x; T gz; -#if defined (CONV) +#if defined(CONV) guide_cal(in_val, gz); #else gz = guide[in_off]; -#endif +#endif gz = gz * gd; char fx = (char)floor(gx - (T)0.5); char fz = (char)floor(gz - (T)0.5); - char i = 0; - char k = 0; + char i = 0; + char k = 0; char x_ = fx; char z_ = fz; - if(fx < 0){x_ = 0; i = 1;} - if(fz < 0){z_ = 0; k = 1;} - if(fx == gw - 1) i = 1; - if(fz == gd - 1) k = 1; + if (fx < 0) { + x_ = 0; + i = 1; + } + if (fz < 0) { + z_ = 0; + k = 1; + } + if (fx == gw - 1) { + i = 1; + } + if (fz == gd - 1) { + k = 1; + } T8 g_val[3]; T4 p; @@ -111,10 +134,18 @@ __kernel void bilateral_slice_apply_c12 wz.s0 = (T)1 - fabs(fz + (T)0.5 - gz); wz.s1 = (T)1 - fabs(fz + (T)1.5 - gz); - if(wx.s0 < 0) wx.s0 = 0; - if(wx.s1 < 0) wx.s0 = 0; - if(wz.s0 < 0) wz.s0 = 0; - if(wz.s1 < 0) wz.s0 = 0; + if (wx.s0 < 0) { + wx.s0 = 0; + } + if (wx.s1 < 0) { + wx.s1 = 0; + } + if (wz.s0 < 0) { + wz.s0 = 0; + } + if (wz.s1 < 0) { + wz.s1 = 0; + } p.xy = wx.s0 * wz; p.zw = wx.s1 * wz; @@ -126,7 +157,7 @@ __kernel void bilateral_slice_apply_c12 sum[0] += g_val[0].s0123 * p.x; sum[1] += g_val[0].s4567 * p.x; sum[2] += g_val[1].s0123 * p.x; - if(k == 0) { + if (k == 0) { p.y = p.y + (T)i * p.w; g_val[2] = vload8(0, grid + grid_off + 16); sum[0] += g_val[1].s4567 * p.y; @@ -134,7 +165,7 @@ __kernel void bilateral_slice_apply_c12 sum[2] += g_val[2].s4567 * p.y; } - if(i == 0){ + if (i == 0) { grid_off += gc; p.z = p.z + (T)k * p.w; g_val[0] = vload8(0, grid + grid_off); @@ -142,18 +173,18 @@ __kernel void bilateral_slice_apply_c12 sum[0] += g_val[0].s0123 * p.z; sum[1] += g_val[0].s4567 * p.z; sum[2] += g_val[1].s0123 * p.z; - if(k == 0){ + if (k == 0) { g_val[2] = vload8(0, grid + grid_off + 16); sum[0] += g_val[1].s4567 * p.w; sum[1] += g_val[2].s0123 * p.w; sum[2] += g_val[2].s4567 * p.w; } } - + sum[0].x = sum[0].x * in_val.x + sum[0].y * in_val.y + sum[0].z * in_val.z + sum[0].w; sum[1].x = sum[1].x * in_val.x + sum[1].y * in_val.y + sum[1].z * in_val.z + sum[1].w; sum[2].x = sum[2].x * in_val.x +
sum[2].y * in_val.y + sum[2].z * in_val.z + sum[2].w; -#if defined (UCHAR) +#if defined(UCHAR) tmp.x = (uchar)(sum[0].x * 256.0); tmp.y = (uchar)(sum[1].x * 256.0); tmp.z = (uchar)(sum[2].x * 256.0); diff --git a/tensor_computing/src/gpu/mali/cl/bilateral_slice_apply_pre.cl b/compute/tensor/src/gpu/mali/cl/bilateral_slice_apply_pre.cl similarity index 72% rename from tensor_computing/src/gpu/mali/cl/bilateral_slice_apply_pre.cl rename to compute/tensor/src/gpu/mali/cl/bilateral_slice_apply_pre.cl index 89d37654..5a9ee40d 100644 --- a/tensor_computing/src/gpu/mali/cl/bilateral_slice_apply_pre.cl +++ b/compute/tensor/src/gpu/mali/cl/bilateral_slice_apply_pre.cl @@ -11,24 +11,34 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - - -__kernel void bilateral_slice_apply_pre(const int gh, const int gc, const int gcw, const int bx, const int bw, const float scale_y, global const T* grid, global T* gridTran){ - const int idx = get_global_id(0);//dep * coe / 4 - const int idw = get_global_id(1);//gw - const int idh = get_global_id(2);//H - if(idx >= bx || idw >= bw) return; +__kernel void bilateral_slice_apply_pre(const int gh, + const int gc, + const int gcw, + const int bx, + const int bw, + const float scale_y, + global const T *grid, + global T *gridTran) +{ + const int idx = get_global_id(0); // dep * coe / 4 + const int idw = get_global_id(1); // gw + const int idh = get_global_id(2); // H + if (idx >= bx || idw >= bw) { + return; + } char j = 1; T2 wy; - T gy = (idh + (T)0.5) * (T)scale_y; + T gy = (idh + (T)0.5) * (T)scale_y; char fy = floor(gy - (T)0.5); char y_ = fy; - if(fy < 0) {y_ = 0; j = 0;} - if(fy == gh - 1) j = 0; + if (fy < 0) { + y_ = 0; + j = 0; + } + if (fy == gh - 1) { + j = 0; + } wy.x = (T)1 - fabs(fy + (T)0.5 - gy); wy.y = (T)1 - fabs(fy + (T)1.5 - gy); @@ -41,5 +51,5 @@ __kernel void bilateral_slice_apply_pre(const int gh, const int gc, const int gc res = wy.x * val0 + wy.y * val1; int gridTran_off = idh * gcw + idw * gc + (idx << 2); - vstore4(res, 0, gridTran + gridTran_off); + vstore4(res, 0, gridTran + gridTran_off); } diff --git a/compute/tensor/src/gpu/mali/cl/channel_resize.cl b/compute/tensor/src/gpu/mali/cl/channel_resize.cl new file mode 100644 index 00000000..f206cb91 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/channel_resize.cl @@ -0,0 +1,230 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#if defined(USE_NCHW) +#else +#endif + +#if defined(INPUT_NCHW) && defined(OUTPUT_NCHW) +#define LOAD_VAL(ew, ec, idx, idy, idz, ih_str, iw_str, ih_off, iw_off, buf, val) \ + { \ + int off = (idz * ih_str + idy + ih_off) * iw_str + (idx << 2) + iw_off; \ + val = 0; \ + if (ew == 4) { \ + val = vload4(0, buf + off); \ + } else { \ + if (ew == 1) { \ + val.x = buf[off]; \ + } \ + if (ew == 2) { \ + val.xy = vload2(0, buf + off); \ + } \ + if (ew == 3) { \ + val.xyz = vload3(0, buf + off); \ + } \ + } \ + } +#define STORE_VAL(ew, ec, idx, idy, idz, oh_str, ow_str, oh_off, ow_off, buf, val) \ + { \ + int off = (idz * oh_str + idy + oh_off) * ow_str + (idx << 2) + ow_off; \ + if (ew == 4) { \ + vstore4(val, 0, buf + off); \ + } else { \ + if (ew == 1) { \ + buf[off] = val.x; \ + } \ + if (ew == 2) { \ + vstore2((T2)(val.x, val.y), 0, buf + off); \ + } \ + if (ew == 3) { \ + vstore3((T3)(val.x, val.y, val.z), 0, buf + off); \ + } \ + } \ + } +#elif defined(INPUT_NCHW) && defined(OUTPUT_NCWHC4) +#define LOAD_VAL(ew, ec, idx, idy, idz, ih_str, iw_str, ih_off, iw_off, buf, val) \ + { \ + int off = ((idz << 2) * ih_str + idy + ih_off) * iw_str + (idx << 2) + iw_off; \ + int str = iw_str * ih_str; \ + if (ew == 4) { \ + val[0] = vload4(0, buf + off); \ + if (ec > 1) \ + val[1] = vload4(0, buf + off + str); \ + if (ec > 2) \ + val[2] = vload4(0, buf + off + str * 2); \ + if (ec > 3) \ + val[3] = vload4(0, buf + off + str * 3); \ + } else { \ + if (ew == 1) { \ + val[0].x = buf[off]; \ + if (ec > 1) \ + val[1].x = buf[off + str]; \ + if (ec > 2) \ + val[2].x = buf[off + str * 2]; \ + if (ec > 3) \ + val[3].x = buf[off + str * 3]; \ + } \ + if (ew == 2) { \ + val[0].xy = vload2(0, buf + off); \ + if (ec > 1) \ + val[1].xy = vload2(0, buf + off + str); \ + if (ec > 2) \ + val[2].xy = vload2(0, buf + off + str * 2); \ + if (ec > 3) \ + val[3].xy = vload2(0, buf + off + str * 3); \ + } \ + if (ew == 3) { \ + val[0].xyz = vload3(0, buf + off); \ + if (ec > 1) \ + val[1].xyz = vload3(0, buf + off + str); \ + if (ec > 2) \ + val[2].xyz = vload3(0, buf + off + str * 2); \ + if (ec > 3) \ + val[3].xyz = vload3(0, buf + off + str * 3); \ + } \ + } \ + } +#define STORE_VAL(ew, ec, idx, idy, idz, oh_str, ow_str, oh_off, ow_off, buf, val) \ + { \ + int off = (idz * ow_str + (idx << 2) + ow_off) * oh_str + idy + oh_off; \ + vstore4((T4)(val[0].x, val[1].x, val[2].x, val[3].x), off, buf); \ + if (ew > 1) \ + vstore4((T4)(val[0].y, val[1].y, val[2].y, val[3].y), off + oh_str, buf); \ + if (ew > 2) \ + vstore4((T4)(val[0].z, val[1].z, val[2].z, val[3].z), off + oh_str * 2, buf); \ + if (ew > 3) \ + vstore4((T4)(val[0].w, val[1].w, val[2].w, val[3].w), off + oh_str * 3, buf); \ + } +#elif defined(INPUT_NCWHC4) && defined(OUTPUT_NCHW) +#define LOAD_VAL(ew, ec, idx, idy, idz, ih_str, iw_str, ih_off, iw_off, buf, val) \ + { \ + int off = (idz * iw_str + (idy << 2) + iw_off) * ih_str + idx + ih_off; \ + val[0] = vload4(off, buf); \ + if (ew > 1) \ + val[1] = vload4(off + ih_str, buf); \ + if (ew > 2) \ + val[2] = vload4(off + ih_str * 2, buf); \ + if (ew > 3) \ + val[3] = vload4(off + ih_str * 3, buf); \ + } +#define STORE_VAL(ew, ec, idx, idy, idz, oh_str, ow_str, oh_off, ow_off, buf, val) \ + { \ + int off = ((idz << 2) * oh_str + idx + oh_off) * ow_str + 
(idy << 2) + ow_off; \ + int str = ow_str * oh_str; \ + if (ew == 4) { \ + vstore4((T4)(val[0].x, val[1].x, val[2].x, val[3].x), 0, buf + off); \ + if (ec > 1) \ + vstore4((T4)(val[0].y, val[1].y, val[2].y, val[3].y), 0, buf + off + str); \ + if (ec > 2) \ + vstore4((T4)(val[0].z, val[1].z, val[2].z, val[3].z), 0, buf + off + str * 2); \ + if (ec > 3) \ + vstore4((T4)(val[0].w, val[1].w, val[2].w, val[3].w), 0, buf + off + str * 3); \ + } else { \ + if (ew == 1) { \ + buf[off] = val[0].x; \ + if (ec > 1) \ + buf[off + str] = val[0].y; \ + if (ec > 2) \ + buf[off + str * 2] = val[0].z; \ + if (ec > 3) \ + buf[off + str * 3] = val[0].w; \ + } \ + if (ew == 2) { \ + vstore2((T2)(val[0].x, val[1].x), 0, buf + off); \ + if (ec > 1) \ + vstore2((T2)(val[0].y, val[1].y), 0, buf + off + str); \ + if (ec > 2) \ + vstore2((T2)(val[0].z, val[1].z), 0, buf + off + str * 2); \ + if (ec > 3) \ + vstore2((T2)(val[0].w, val[1].w), 0, buf + off + str * 3); \ + } \ + if (ew == 3) { \ + vstore3((T3)(val[0].x, val[1].x, val[2].x), 0, buf + off); \ + if (ec > 1) \ + vstore3((T3)(val[0].y, val[1].y, val[2].y), 0, buf + off + str); \ + if (ec > 2) \ + vstore3((T3)(val[0].z, val[1].z, val[2].z), 0, buf + off + str * 2); \ + if (ec > 3) \ + vstore3((T3)(val[0].w, val[1].w, val[2].w), 0, buf + off + str * 3); \ + } \ + } \ + } +#else +#define LOAD_VAL(ew, ec, idx, idy, idz, ih_str, iw_str, ih_off, iw_off, buf, val) \ + { \ + int off = (idz * iw_str + idy + iw_off) * ih_str + idx + ih_off; \ + val = vload4(off, buf); \ + } +#define STORE_VAL(ew, ec, idx, idy, idz, oh_str, ow_str, oh_off, ow_off, buf, val) \ + { \ + int off = (idz * ow_str + idy + ow_off) * oh_str + idx + oh_off; \ + vstore4(val, off, buf); \ + } +#endif + +__kernel void +#if defined(INPUT_NCHW) && defined(OUTPUT_NCHW) +channel_resize_nchw +#elif defined(INPUT_NCHW) && defined(OUTPUT_NCWHC4) +channel_resize_nchw_ncwhc4 +#elif defined(INPUT_NCWHC4) && defined(OUTPUT_NCHW) +channel_resize_ncwhc4_nchw +#else +channel_resize +#endif + (const int ih_str, + const int iw_str, + const int ic_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const int in_c, + const int out_c, + const int w, + const int bx, + const int by, + const __global const T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } + + char ew = 0; + char ec = 0; +#if defined(INPUT_NCHW) && defined(OUTPUT_NCHW) + T4 val = 0; + ew = ((idx << 2) + 4 <= w) ? 4 : (w & 3); +#elif defined(INPUT_NCHW) && defined(OUTPUT_NCWHC4) + T4 val[4] = {0}; + ew = ((idx << 2) + 4 <= w) ? 4 : (w & 3); + ec = ((idz << 2) + 4 <= in_c) ? 4 : (in_c & 3); +#elif defined(INPUT_NCWHC4) && defined(OUTPUT_NCHW) + T4 val[4] = {0}; + ew = ((idy << 2) + 4 <= w) ? 4 : (w & 3); + ec = ((idz << 2) + 4 <= out_c) ? 4 : (out_c & 3); +#else + T4 val = 0; +#endif + + if (idz < ic_str) { + LOAD_VAL(ew, ec, idx, idy, idz, ih_str, iw_str, ih_off, iw_off, in, val); + } + STORE_VAL(ew, ec, idx, idy, idz, oh_str, ow_str, oh_off, ow_off, out, val); +} diff --git a/compute/tensor/src/gpu/mali/cl/check_int_spe.cl b/compute/tensor/src/gpu/mali/cl/check_int_spe.cl new file mode 100644 index 00000000..52155ed0 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/check_int_spe.cl @@ -0,0 +1,33 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
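channel_resize.cl above converts between plain NCHW and the GPU-side NCWHC4 blocking; its four LOAD/STORE macro pairs differ only in how the linear element offset is computed. The two indexing rules written out as scalar C++ (stride and offset names mirror the kernel arguments; this is an interpretive sketch, not library code):

    #include <cstdio>

    // Element (c, h, w) in a padded NCHW buffer.
    int nchwOffset(int c, int h, int w, int h_str, int w_str, int h_off, int w_off)
    {
        return (c * h_str + h + h_off) * w_str + w + w_off;
    }

    // Same element in NCWHC4: channels are grouped in blocks of 4, the block
    // index is the slowest axis, w runs before h, and c % 4 is the vector lane.
    int ncwhc4Offset(int c, int h, int w, int h_str, int w_str, int h_off, int w_off)
    {
        int block = c / 4, lane = c % 4;
        return ((block * w_str + w + w_off) * h_str + h + h_off) * 4 + lane;
    }

    int main() {
        // Element (c=5, h=1, w=2) in an unpadded 8x4x4 (c, h, w) tensor.
        printf("nchw=%d ncwhc4=%d\n", nchwOffset(5, 1, 2, 4, 4, 0, 0),
            ncwhc4Offset(5, 1, 2, 4, 4, 0, 0));
        return 0;
    }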
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +__kernel void check_int_spe(const int aw_off, + const int bw_off, + const int ow_off, + const int bx, + __global const int *inputA, + __global const int *inputB, + __global int *output) +{ + int idx = get_global_id(0); + if (idx >= bx) { + return; + } + int va = inputA[idx + aw_off]; + int vb = inputB[idx + bw_off]; + int res = 0; + if (va == vb) { + res = 1; + } + output[idx + ow_off] = res; +} diff --git a/tensor_computing/src/gpu/mali/cl/clip.cl b/compute/tensor/src/gpu/mali/cl/clip.cl similarity index 83% rename from tensor_computing/src/gpu/mali/cl/clip.cl rename to compute/tensor/src/gpu/mali/cl/clip.cl index b9cc497d..426a4b44 100644 --- a/tensor_computing/src/gpu/mali/cl/clip.cl +++ b/compute/tensor/src/gpu/mali/cl/clip.cl @@ -11,17 +11,27 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - -__kernel void clip(const int h, const int w, const int ih_str, const int iw_str, const int ih_off, const int iw_off, - const int oh_str, const int ow_str, const int oh_off, const int ow_off, const float min_value, const float max_value, __global T* input, __global T* output) { - +__kernel void clip(const int h, + const int w, + const int ih_str, + const int iw_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const float min_value, + const float max_value, + __global T *input, + __global T *output) +{ int idx = get_global_id(0); int idy = get_global_id(1); int idz = get_global_id(2); - if(idx >= h || idy >= w) return; + if (idx >= h || idy >= w) { + return; + } T4 val; int in_off = (idz * iw_str + idy + iw_off) * ih_str + idx + ih_off; diff --git a/compute/tensor/src/gpu/mali/cl/col2im.cl b/compute/tensor/src/gpu/mali/cl/col2im.cl new file mode 100644 index 00000000..0ad2e729 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/col2im.cl @@ -0,0 +1,78 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
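check_int_spe.cl below performs one integer comparison per work-item, applying independent read offsets to the two inputs and writing 1 on equality. A standalone C++ reference of the same loop (the function name and the serial loop over idx are illustrative only):

    #include <cstdio>

    // idx plays the role of get_global_id(0) in the kernel.
    void checkIntSpe(const int *a, const int *b, int *out, int n,
        int aOff, int bOff, int oOff)
    {
        for (int idx = 0; idx < n; ++idx) {
            out[idx + oOff] = (a[idx + aOff] == b[idx + bOff]) ? 1 : 0;
        }
    }

    int main() {
        int a[2] = {7, 7}, b[2] = {7, 8}, out[2] = {0, 0};
        checkIntSpe(a, b, out, 2, 0, 0, 0);
        printf("%d %d\n", out[0], out[1]);  // 1 0
        return 0;
    }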
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +__kernel void col2im(const int ih, + const int iw, + const int ic, + const int kw, + const int kh, + const int pw, + const int ph, + const int sw, + const int sh, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const int oh, + const int ow, + const int bx, + __global const T *bias, + __global const T *in, + __global T *out) +{ + const int index = get_global_id(0); + if (index >= bx) { + return; + } + const int idx = index % oh; + const int idy = (index % (ow * oh)) / oh; + const int idz = index / (ow * oh); + + const int pidx = idx + ph; + const int pidy = idy + pw; + + int sidw_i = pidy / sw; + int sidw_j = pidy % sw; + int in_wx = (sidw_i < iw) ? sidw_i : (iw - 1); + int in_wy = (sidw_i < iw) ? sidw_j : ((sidw_i - iw + 1) * sw + sidw_j); + int in_wl = (kw - in_wy + sw - 1) / sw; + if (in_wl > in_wx + 1) { + in_wl = in_wx + 1; + } + + int sidh_i = pidx / sh; + int sidh_j = pidx % sh; + int in_hx = (sidh_i < ih) ? sidh_i : (ih - 1); + int in_hy = (sidh_i < ih) ? sidh_j : ((sidh_i - ih + 1) * sh + sidh_j); + int in_hl = (kh - in_hy + sh - 1) / sh; + if (in_hl > in_hx + 1) { + in_hl = in_hx + 1; + } + + int in_off_w = ih * (in_wx + iw * kh * (in_wy + idz * kw)); + int in_str_w = ih * (iw * kh * sh - 1); + int in_off_h = in_hx + in_hy * ih * iw; + int in_str_h = ih * iw * sh - 1; + T4 sum = vload4(idz, bias); + + for (int i = 0; i < in_wl; i++) { + for (int j = 0; j < in_hl; j++) { + sum += vload4(in_off_w + in_off_h + j * in_str_h, in); + } + in_off_w += in_str_w; + } + + int out_off = (idz * ow_str + idy + ow_off) * oh_str + idx + oh_off; + vstore4(sum, out_off, out); +} diff --git a/compute/tensor/src/gpu/mali/cl/concat.cl b/compute/tensor/src/gpu/mali/cl/concat.cl new file mode 100644 index 00000000..ec7bd923 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/concat.cl @@ -0,0 +1,186 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
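col2im.cl above assigns one work-item per output element and gathers every im2col column that references it: a filter tap (r, s) contributes when (idx + ph - r) and (idy + pw - s) are non-negative multiples of the strides, and the kernel folds the resulting loops into the in_off_w / in_off_h stride arithmetic. A direct C++ reference of that gather, simplified to a single channel (reference routine, not the patch's layout):

    // cols[((r * kw + s) * ih + iy) * iw + ix] holds the im2col columns;
    // out is oh x ow. Each output pixel sums the taps that cover it.
    void col2imRef(const float *cols, float bias, float *out, int oh, int ow,
        int ih, int iw, int kh, int kw, int ph, int pw, int sh, int sw)
    {
        for (int y = 0; y < oh; ++y) {
            for (int x = 0; x < ow; ++x) {
                float sum = bias;
                for (int r = 0; r < kh; ++r) {
                    for (int s = 0; s < kw; ++s) {
                        int py = y + ph - r, px = x + pw - s;
                        if (py < 0 || px < 0 || py % sh || px % sw) continue;
                        int iy = py / sh, ix = px / sw;
                        if (iy >= ih || ix >= iw) continue;
                        sum += cols[((r * kw + s) * ih + iy) * iw + ix];
                    }
                }
                out[y * ow + x] = sum;
            }
        }
    }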
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#define MANGLE_NAME_IMPL(base, N) base##N +#define MANGLE_NAME(base, N) MANGLE_NAME_IMPL(base, N) + +#define LOAD_VAL(idx, idy, idz, h_str, w_str, h_off, w_off, val, buf) \ + { \ + int off = (idz * w_str + idy + w_off) * h_str + idx + h_off; \ + val = vload4(off, buf); \ + } + +__kernel void +#if defined(NON_ALIGN_C) + MANGLE_NAME(concat_nonalign_c_p1_, N) +#else +#if defined(AXIS_W) + MANGLE_NAME(concat_w, N) +#elif defined(AXIS_H) + MANGLE_NAME(concat_h, N) +#elif defined(AXIS_C) + MANGLE_NAME(concat_c, N) +#endif +#endif + (const int oh_str, + const int ohw_str, + const int oh_off, + const int ow_off, + const int axis_max, + const int nmax, + const int out_size, + const int bx, + const int by, + const int ih_str0, + const int iw_str0, + const int ih_off0, + const int iw_off0, + const int ic0, + __global const T *in0, +#if (N > 1) + const int ih_str1, + const int iw_str1, + const int ih_off1, + const int iw_off1, + const int ic1, + const int axis_len_0, + __global const T *in1, +#endif +#if (N > 2) + const int ih_str2, + const int iw_str2, + const int ih_off2, + const int iw_off2, + const int ic2, + const int axis_len_1, + __global const T *in2, +#endif +#if (N > 3) + const int ih_str3, + const int iw_str3, + const int ih_off3, + const int iw_off3, + const int ic3, + const int axis_len_2, + __global const T *in3, +#endif + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } +#if defined(AXIS_W) + int id_axis = idy - axis_max; +#elif defined(AXIS_H) + int id_axis = idx - axis_max; +#elif defined(AXIS_C) + int id_axis = idz - axis_max; +#endif + int idn = nmax; +#if (N > 3) + if (id_axis < 0) { + id_axis += axis_len_2; + idn = 2; + } +#endif +#if (N > 2) + if (id_axis < 0) { + id_axis += axis_len_1; + idn = 1; + } +#endif +#if (N > 1) + if (id_axis < 0) { + id_axis += axis_len_0; + idn = 0; + } +#endif + T4 val; + int in_idx = idx; + int in_idy = idy; + int in_idz = idz; + +#if defined(AXIS_W) + in_idy = id_axis; +#elif defined(AXIS_H) + in_idx = id_axis; +#elif defined(AXIS_C) + in_idz = id_axis; +#endif + +#if defined(NON_ALIGN_C) + char ec = 4; + int out_off = id_axis * ohw_str * 4 + idy * oh_str + idx; +#else + int out_off = idz * ohw_str + (idy + ow_off) * oh_str + idx + oh_off; +#endif + if (idn == 0) { + LOAD_VAL(in_idx, in_idy, in_idz, ih_str0, iw_str0, ih_off0, iw_off0, val, in0); +#if defined(NON_ALIGN_C) + if (id_axis 
* 4 + 4 > ic0) { + ec = ic0 & 3; + } +#endif + } +#if (N > 1) + if (idn == 1) { + LOAD_VAL(in_idx, in_idy, in_idz, ih_str1, iw_str1, ih_off1, iw_off1, val, in1); +#if defined(NON_ALIGN_C) + out_off += ic0 * ohw_str; + if (id_axis * 4 + 4 > ic1) { + ec = ic1 & 3; + } +#endif + } +#endif +#if (N > 2) + if (idn == 2) { + LOAD_VAL(in_idx, in_idy, in_idz, ih_str2, iw_str2, ih_off2, iw_off2, val, in2); +#if defined(NON_ALIGN_C) + out_off += (ic0 + ic1) * ohw_str; + if (id_axis * 4 + 4 > ic2) { + ec = ic2 & 3; + } +#endif + } +#endif +#if (N > 3) + if (idn == 3) { + LOAD_VAL(in_idx, in_idy, in_idz, ih_str3, iw_str3, ih_off3, iw_off3, val, in3); +#if defined(NON_ALIGN_C) + out_off += (ic0 + ic1 + ic2) * ohw_str; + if (id_axis * 4 + 4 > ic3) { + ec = ic3 & 3; + } +#endif + } +#endif + +#if defined(NON_ALIGN_C) + out[out_size + out_off] = val.x; + if (ec > 1) { + out[out_size + out_off + ohw_str] = val.y; + } + if (ec > 2) { + out[out_size + out_off + ohw_str * 2] = val.z; + } + if (ec > 3) { + out[out_size + out_off + ohw_str * 3] = val.w; + } +#else + vstore4(val, out_off, out + out_size); +#endif +} diff --git a/tensor_computing/src/gpu/mali/cl/conv_depthwise_s1.cl b/compute/tensor/src/gpu/mali/cl/conv_depthwise_s1.cl similarity index 71% rename from tensor_computing/src/gpu/mali/cl/conv_depthwise_s1.cl rename to compute/tensor/src/gpu/mali/cl/conv_depthwise_s1.cl index b42c4ce5..90fbe82e 100644 --- a/tensor_computing/src/gpu/mali/cl/conv_depthwise_s1.cl +++ b/compute/tensor/src/gpu/mali/cl/conv_depthwise_s1.cl @@ -11,50 +11,66 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
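The concat kernel above resolves which input a global index belongs to without a loop: it starts from the last input (id_axis = id - axis_max, idn = nmax) and walks backwards through the cascaded "if (id_axis < 0)" blocks, adding each earlier input's axis length back until id_axis turns non-negative. The same resolution as plain C++, with the cascade written as a loop:

    #include <cstdio>

    // Map global concat-axis index 'id' to (input idn, local offset).
    void resolve(const int *axis_len, int n, int id, int *idn, int *local)
    {
        int axis_max = 0;
        for (int i = 0; i + 1 < n; ++i) {
            axis_max += axis_len[i];  // start of the last input's range
        }
        int id_axis = id - axis_max;
        *idn = n - 1;
        for (int i = n - 2; i >= 0 && id_axis < 0; --i) {
            id_axis += axis_len[i];
            *idn = i;
        }
        *local = id_axis;
    }

    int main() {
        int len[3] = {2, 3, 4};  // three inputs concatenated along one axis
        for (int id = 0; id < 9; ++id) {
            int idn, local;
            resolve(len, 3, id, &idn, &local);
            printf("id=%d -> input %d, offset %d\n", id, idn, local);
        }
        return 0;
    }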
- - - -#include"kernel_def.h" -#define MANGLE_NAME_IMPL(base, F, ON) base ## F ## ON +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, F, ON) base##F##ON #define MANGLE_NAME(base, F, ON) MANGLE_NAME_IMPL(base, F, ON) - - #if defined(USE_NCWH) #if defined(USE_RELU) __kernel void MANGLE_NAME(conv_depthwise_s1_relu_ncwh_, F, ON) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_depthwise_s1_relu6_ncwh_, F, ON) #else __kernel void MANGLE_NAME(conv_depthwise_s1_ncwh_, F, ON) #endif #else #if defined(USE_RELU) __kernel void MANGLE_NAME(conv_depthwise_s1_relu_, F, ON) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_depthwise_s1_relu6_, F, ON) #else __kernel void MANGLE_NAME(conv_depthwise_s1_, F, ON) #endif #endif -(const int ih_str, const int ihw_str, const int ic_str, const int ih_off, const int iw_off, const int oh_str, const int ow_str, const int ohw_str, const int oh_off, const int ow_off, const int ow, const int bx , const int by, - __global const T* in, __global const T* flt, __read_only image1d_t bias, __global T* out) + (const int ih_str, + const int ihw_str, + const int ic_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ow_str, + const int ohw_str, + const int oh_off, + const int ow_off, + const int ow, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __read_only image1d_t bias, + __global T *out) { const int idx = get_global_id(0); const int idy = get_global_id(1); const int idz = get_global_id(2); - if(idx >= bx || idy >= by) return; + if (idx >= bx || idy >= by) { + return; + } - T4 in_val[IN]; - T4 flt_val; - T4 out_val[ON]; + T4 in_val[IN]; + T4 flt_val; + T4 out_val[ON]; LOADBIAS_IMAGE_ARRAY_V4(out_val, idz, bias); - int in_off = idz * ihw_str + (idy * ON + iw_off) * ih_str + idx + ih_off; + int in_off = idz * ihw_str + (idy * ON + iw_off) * ih_str + idx + ih_off; int flt_off = idz * Fsq; - for(uchar i = 0; i < F; ++i) { + for (uchar i = 0; i < F; ++i) { LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + i, ih_str, in); - for(uchar j = 0; j < F; ++j) { + for (uchar j = 0; j < F; ++j) { #if defined(BASICE_REG) in_val[LN] = vload4(in_off + i + (LN + j) * ih_str, in); -#endif +#endif flt_val = vload4(flt_off + j, flt); DEPTHWISE_CAL_CORE_S1(in_val, flt_val, out_val); UPDATE_REG(in_val); @@ -67,5 +83,5 @@ __kernel void MANGLE_NAME(conv_depthwise_s1_, F, ON) #else int out_off = (idz * ow_str + idy * ON + ow_off) * oh_str + idx + oh_off; STORE_OUTPUT_BUF_ARRAY_V4(out_val, out_off, oh_str, idy * ON, ow, out); -#endif +#endif } diff --git a/tensor_computing/src/gpu/mali/cl/conv_depthwise_s2.cl b/compute/tensor/src/gpu/mali/cl/conv_depthwise_s2.cl similarity index 67% rename from tensor_computing/src/gpu/mali/cl/conv_depthwise_s2.cl rename to compute/tensor/src/gpu/mali/cl/conv_depthwise_s2.cl index e5f368be..39ce9d9b 100644 --- a/tensor_computing/src/gpu/mali/cl/conv_depthwise_s2.cl +++ b/compute/tensor/src/gpu/mali/cl/conv_depthwise_s2.cl @@ -11,67 +11,84 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
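The stride-1 depthwise kernel above amortizes loads by computing ON adjacent outputs (along w) per work-item: for each of the F filter rows it holds a sliding register window of inputs, so neighbouring outputs reuse F - 1 of every F loaded values. A scalar C++ model of that reuse for one filter row (illustrative only; the kernel vectorizes this over four channels):

    #include <cstdio>

    // One filter row of a stride-1 depthwise conv, ON outputs per work-item.
    void dwRowS1(const float *in, const float *flt, float *acc, int F, int ON)
    {
        for (int j = 0; j < F; ++j) {          // walk the filter taps once
            for (int o = 0; o < ON; ++o) {     // each load serves ON outputs
                acc[o] += in[o + j] * flt[j];
            }
        }
    }

    int main() {
        float in[6] = {1, 2, 3, 4, 5, 6}, flt[3] = {1, 1, 1}, acc[4] = {0};
        dwRowS1(in, flt, acc, 3, 4);  // 3-tap row, 4 outputs
        printf("%g %g %g %g\n", acc[0], acc[1], acc[2], acc[3]);  // 6 9 12 15
        return 0;
    }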
- - - #include "kernel_def.h" -#define MANGLE_NAME_IMPL(base, F, ON) base ## F ## ON +#define MANGLE_NAME_IMPL(base, F, ON) base##F##ON #define MANGLE_NAME(base, F, ON) MANGLE_NAME_IMPL(base, F, ON) - #if defined(USE_NCWH) #if defined(USE_RELU) __kernel void MANGLE_NAME(conv_depthwise_s2_relu_ncwh_, F, ON) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_depthwise_s2_relu6_ncwh_, F, ON) #else __kernel void MANGLE_NAME(conv_depthwise_s2_ncwh_, F, ON) #endif #else #if defined(USE_RELU) __kernel void MANGLE_NAME(conv_depthwise_s2_relu_, F, ON) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_depthwise_s2_relu6_, F, ON) #else __kernel void MANGLE_NAME(conv_depthwise_s2_, F, ON) #endif #endif -(const int ih_str, const int ihw_str, const int ic_str, const int ih_off, const int iw_off, const int oh_str, const int ow_str, const int ohw_str, const int oh_off, const int ow_off, const int ow, const int bx, const int by, - __global const T* in, __global const T* flt, __read_only image1d_t bias, __global T* out) + (const int ih_str, + const int ihw_str, + const int ic_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ow_str, + const int ohw_str, + const int oh_off, + const int ow_off, + const int ow, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __read_only image1d_t bias, + __global T *out) { const int idx = get_global_id(0); const int idy = get_global_id(1); const int idz = get_global_id(2); - if(idx >= bx || idy >= by) return; + if (idx >= bx || idy >= by) { + return; + } - T4 in_val[IN]; - T4 flt_val; - T4 out_val[ON]; + T4 in_val[IN]; + T4 flt_val; + T4 out_val[ON]; LOADBIAS_IMAGE_ARRAY_V4(out_val, idz, bias); - int in_off = idz * ihw_str + ((idy << 1) * ON + iw_off) * ih_str + (idx << 1) + ih_off; + int in_off = idz * ihw_str + ((idy << 1) * ON + iw_off) * ih_str + (idx << 1) + ih_off; int flt_off = idz * Fsq; - for(uchar i = 0; i < F; ++i) { + for (uchar i = 0; i < F; ++i) { #if defined(BASIC_REG) LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + i, (ih_str << 1), in); - for(uchar j = 0; j < F; j += 2) { + for (uchar j = 0; j < F; j += 2) { flt_val = vload4(flt_off + j, flt); - in_val[LN] = vload4(in_off + i + ((LN << 1) + j) * ih_str , in); + in_val[LN] = vload4(in_off + i + ((LN << 1) + j) * ih_str, in); DEPTHWISE_CAL_CORE_S1(in_val, flt_val, out_val); UPDATE_REG(in_val); } LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + i + ih_str, (ih_str << 1), in); - for(uchar j = 1; j < F; j += 2) { + for (uchar j = 1; j < F; j += 2) { flt_val = vload4(flt_off + j, flt); - in_val[LN] = vload4(in_off + i + ((LN << 1) + j) * ih_str , in); + in_val[LN] = vload4(in_off + i + ((LN << 1) + j) * ih_str, in); DEPTHWISE_CAL_CORE_S1(in_val, flt_val, out_val) UPDATE_REG(in_val); } #else - LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + i, ih_str, in); - for(uchar j = 0; j < F; ++j) { - flt_val = vload4(flt_off + j, flt); - DEPTHWISE_CAL_CORE_S2(in_val, flt_val, out_val); - UPDATE_REG(in_val); - } + LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + i, ih_str, in); + for (uchar j = 0; j < F; ++j) { + flt_val = vload4(flt_off + j, flt); + DEPTHWISE_CAL_CORE_S2(in_val, flt_val, out_val); + UPDATE_REG(in_val); + } #endif - flt_off += F; + flt_off += F; } #if defined(USE_NCWH) int out_off = (idz << 2) * ohw_str + (idy * ON + ow_off) * oh_str + idx + oh_off; @@ -79,5 +96,5 @@ __kernel void MANGLE_NAME(conv_depthwise_s2_, F, ON) #else int out_off = (idz * ow_str + idy * ON + ow_off) * oh_str + idx + oh_off; STORE_OUTPUT_BUF_ARRAY_V4(out_val, out_off, oh_str, idy * ON, ow, 
out); -#endif +#endif } diff --git a/compute/tensor/src/gpu/mali/cl/conv_depthwise_trans_fltbuf.cl b/compute/tensor/src/gpu/mali/cl/conv_depthwise_trans_fltbuf.cl new file mode 100644 index 00000000..77c712bc --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_depthwise_trans_fltbuf.cl @@ -0,0 +1,88 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#define MANGLE_NAME_IMPL(base, K) base##K +#define MANGLE_NAME(base, K) MANGLE_NAME_IMPL(base, K) +#if (K == 4) +#define loadFltval(off, str, flt, val) \ { \ val.x = flt[off]; \ val.y = flt[off + str]; \ val.z = flt[off + (str << 1)]; \ val.w = flt[off + str * 3]; \ } + +#define loadFltvalEdge(off, str, flt, val, edge) \ { \ val.x = flt[off]; \ if (edge > 1) \ val.y = flt[off + str]; \ if (edge > 2) \ val.z = flt[off + (str << 1)]; \ } +#endif + +#if (K == 8) +#define loadFltval(off, str, flt, val) \ { \ val.s0 = flt[off]; \ val.s1 = flt[off + str]; \ val.s2 = flt[off + (str << 1)]; \ val.s3 = flt[off + str * 3]; \ val.s4 = flt[off + (str << 2)]; \ val.s5 = flt[off + str * 5]; \ val.s6 = flt[off + str * 6]; \ val.s7 = flt[off + str * 7]; \ } +#define loadFltvalEdge(off, str, flt, val, edge) \ { \ val.s0 = flt[off]; \ if (edge > 1) \ val.s1 = flt[off + str]; \ if (edge > 2) \ val.s2 = flt[off + (str << 1)]; \ if (edge > 3) \ val.s3 = flt[off + str * 3]; \ if (edge > 4) \ val.s4 = flt[off + (str << 2)]; \ if (edge > 5) \ val.s5 = flt[off + str * 5]; \ if (edge > 6) \ val.s6 = flt[off + str * 6]; \ } +#endif + +__kernel void MANGLE_NAME(conv_depthwise_trans_fltbuf_, K)( + const int fwh, const int fn, __global const T *fltdata, __global T *fltbuf) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int flt_off = idy * K * fwh + idx; + int ek = ((idy + 1) * K <= fn) ?
K : (fn % K); +#if (K == 4) + T4 val = 0; +#elif (K == 8) + T8 val = 0; +#endif + if (ek == K) { + loadFltval(flt_off, fwh, fltdata, val); + } else { + loadFltvalEdge(flt_off, fwh, fltdata, val, ek); + } + const int out_off = idy * fwh + idx; +#if (K == 4) + vstore4(val, out_off, fltbuf); +#elif (K == 8) + vstore8(val, out_off, fltbuf); +#endif +} diff --git a/compute/tensor/src/gpu/mali/cl/conv_direct_3d_sw1_nchw_to_ncwhc4.cl b/compute/tensor/src/gpu/mali/cl/conv_direct_3d_sw1_nchw_to_ncwhc4.cl new file mode 100644 index 00000000..40c37948 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_direct_3d_sw1_nchw_to_ncwhc4.cl @@ -0,0 +1,170 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
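+
+// Direct 3-D convolution, stride 1 along the width axis (sw1), reading
+// NCHW input and writing NCWHC4-blocked output. FWH is the spatial filter
+// size, FT the temporal (depth) filter size and ON the number of output
+// columns each work-item accumulates; the calCore* macros below select the
+// shifted input sub-vector that feeds each of those ON accumulators, and
+// the t_be/t_end clamp in the kernel body skips taps that fall into the
+// temporal padding pt.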
+ +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, FWH, FT, ON) base##FWH##FT##ON +#define MANGLE_NAME(base, FWH, FT, ON) MANGLE_NAME_IMPL(base, FWH, FT, ON) + +#if (FWH == 1) +#define calCore(A, B, C) \ + { \ + C[0] += A.s0 * B; \ + C[1] += A.s1 * B; \ + C[2] += A.s2 * B; \ + C[3] += A.s3 * B; \ + C[4] += A.s4 * B; \ + C[5] += A.s5 * B; \ + C[6] += A.s6 * B; \ + C[7] += A.s7 * B; \ + } +#elif (FWH == 3) +#define calCore(a0, a1, a2, a3, a4, a5, B, C) \ + { \ + C[0] += a0 * B; \ + C[1] += a1 * B; \ + C[2] += a2 * B; \ + C[3] += a3 * B; \ + C[4] += a4 * B; \ + C[5] += a5 * B; \ + } +#define calCore0(A, B, C) calCore(A.s0, A.s1, A.s2, A.s3, A.s4, A.s5, B, C) +#define calCore1(A, B, C) calCore(A.s1, A.s2, A.s3, A.s4, A.s5, A.s6, B, C) +#define calCore2(A, B, C) calCore(A.s2, A.s3, A.s4, A.s5, A.s6, A.s7, B, C) +#elif (FWH == 5) +#define calCore(a0, a1, a2, a3, B, C) \ + { \ + C[0] += a0 * B; \ + C[1] += a1 * B; \ + C[2] += a2 * B; \ + C[3] += a3 * B; \ + } +#define calCore0(A, B, C) calCore(A.s0, A.s1, A.s2, A.s3, B, C) +#define calCore1(A, B, C) calCore(A.s1, A.s2, A.s3, A.s4, B, C) +#define calCore2(A, B, C) calCore(A.s2, A.s3, A.s4, A.s5, B, C) +#define calCore3(A, B, C) calCore(A.s3, A.s4, A.s5, A.s6, B, C) +#define calCore4(A, B, C) calCore(A.s4, A.s5, A.s6, A.s7, B, C) +#elif (FWH == 7) +#define calCore(a0, a1, B, C) \ + { \ + C[0] += a0 * B; \ + C[1] += a1 * B; \ + } +#define calCore0(A, B, C) calCore(A.s0, A.s1, B, C) +#define calCore1(A, B, C) calCore(A.s1, A.s2, B, C) +#define calCore2(A, B, C) calCore(A.s2, A.s3, B, C) +#define calCore3(A, B, C) calCore(A.s3, A.s4, B, C) +#define calCore4(A, B, C) calCore(A.s4, A.s5, B, C) +#define calCore5(A, B, C) calCore(A.s5, A.s6, B, C) +#define calCore6(A, B, C) calCore(A.s6, A.s7, B, C) +#endif + +#if defined(USE_RELU) +__kernel void MANGLE_NAME(conv_direct_3d_sw1_nchw_to_ncwhc4_relu_, FWH, FT, ON) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_direct_3d_sw1_nchw_to_ncwhc4_relu6_, FWH, FT, ON) +#else +__kernel void MANGLE_NAME(conv_direct_3d_sw1_nchw_to_ncwhc4_, FWH, FT, ON) +#endif + (const int iw_str, + const int iwh_str, + const int ic_str, + const int iw_off, + const int ih_off, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const int ow, + const int ot, + const int it, + const int pt, + const int sh, + const int st, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __read_only image1d_t bias, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + const int idt = idz % ot; + const int idk = idz / ot; + + if (idx >= bx || idy >= by) { + return; + } + + T8 in_val; + T4 flt_val; + T4 out_val[ON]; + + LOADBIAS_IMAGE_ARRAY_V4(out_val, idk, bias); + int in_off = (idy * sh + ih_off) * iw_str + idx * ON + iw_off; + int flt_off = idk * ic_str * FWHT; + + int t_be = idt * st - pt; + int t_end = t_be + FT; + if (t_be < 0) { + t_be = 0; + flt_off += pt * FWH * FWH; + } + if (t_end > it) { + t_end = it; + } + + for (int i = 0; i < ic_str; ++i) { + for (int tt = t_be; tt < t_end; ++tt) { +#if (FWH == 1) + flt_val = vload4(flt_off, flt); + in_val = vload8(0, in + in_off + tt * iwh_str); + calCore(in_val, flt_val, out_val); + flt_off++; +#else + for (uchar j = 0; j < FWH; ++j) { + in_val = vload8(0, in + in_off + tt * iwh_str + j * iw_str); + for (uchar k = 0; k < FWH; ++k) { + flt_val = vload4(flt_off + k, flt); + if (k == 0) { + calCore0(in_val, flt_val, out_val); + } else if (k == 1) { + 
calCore1(in_val, flt_val, out_val); + } else if (k == 2) { + calCore2(in_val, flt_val, out_val); +#if (FWH >= 5) + } else if (k == 3) { + calCore3(in_val, flt_val, out_val); + } else if (k == 4) { + calCore4(in_val, flt_val, out_val); +#endif +#if (FWH == 7) + } else if (k == 5) { + calCore5(in_val, flt_val, out_val); + } else if (k == 6) { + calCore6(in_val, flt_val, out_val); +#endif + } + } + flt_off += FWH; + } +#endif + } + in_off += iwh_str * it; + } + + int xn = idx * ON; + int out_off = (idz * ow_str + xn + ow_off) * oh_str + idy + oh_off; + STORE_OUTPUT_BUF_ARRAY_V4(out_val, out_off, oh_str, xn, ow, out); +} diff --git a/compute/tensor/src/gpu/mali/cl/conv_direct_3d_sw2_nchw_to_ncwhc4.cl b/compute/tensor/src/gpu/mali/cl/conv_direct_3d_sw2_nchw_to_ncwhc4.cl new file mode 100644 index 00000000..59917a11 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_direct_3d_sw2_nchw_to_ncwhc4.cl @@ -0,0 +1,176 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
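+
+// Stride-2 variant of conv_direct_3d_sw1_nchw_to_ncwhc4: each work-item
+// still produces ON output columns, so a T16 input row is loaded per
+// filter tap and the calCore* macros take every second element
+// (A.s0, A.s2, ...) to realize the stride of 2 along the width axis.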
+ +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, FWH, FT, ON) base##FWH##FT##ON +#define MANGLE_NAME(base, FWH, FT, ON) MANGLE_NAME_IMPL(base, FWH, FT, ON) + +#if (FWH == 1) +#define calCore(A, B, C) \ + { \ + C[0] += A.s0 * B; \ + C[1] += A.s2 * B; \ + C[2] += A.s4 * B; \ + C[3] += A.s6 * B; \ + C[4] += A.s8 * B; \ + C[5] += A.sa * B; \ + C[6] += A.sc * B; \ + C[7] += A.se * B; \ + } +#elif (FWH == 3) +#define calCore(a0, a1, a2, a3, a4, a5, a6, B, C) \ + { \ + C[0] += a0 * B; \ + C[1] += a1 * B; \ + C[2] += a2 * B; \ + C[3] += a3 * B; \ + C[4] += a4 * B; \ + C[5] += a5 * B; \ + C[6] += a6 * B; \ + } +#define calCore0(A, B, C) calCore(A.s0, A.s2, A.s4, A.s6, A.s8, A.sa, A.sc, B, C) +#define calCore1(A, B, C) calCore(A.s1, A.s3, A.s5, A.s7, A.s9, A.sb, A.sd, B, C) +#define calCore2(A, B, C) calCore(A.s2, A.s4, A.s6, A.s8, A.sa, A.sc, A.se, B, C) +#elif (FWH == 5) +#define calCore(a0, a1, a2, a3, a4, a5, B, C) \ + { \ + C[0] += a0 * B; \ + C[1] += a1 * B; \ + C[2] += a2 * B; \ + C[3] += a3 * B; \ + C[4] += a4 * B; \ + C[5] += a5 * B; \ + } +#define calCore0(A, B, C) calCore(A.s0, A.s2, A.s4, A.s6, A.s8, A.sa, B, C) +#define calCore1(A, B, C) calCore(A.s1, A.s3, A.s5, A.s7, A.s9, A.sb, B, C) +#define calCore2(A, B, C) calCore(A.s2, A.s4, A.s6, A.s8, A.sa, A.sc, B, C) +#define calCore3(A, B, C) calCore(A.s3, A.s5, A.s7, A.s9, A.sb, A.sd, B, C) +#define calCore4(A, B, C) calCore(A.s4, A.s6, A.s8, A.sa, A.sc, A.se, B, C) +#elif (FWH == 7) +#define calCore(a0, a1, a2, a3, a4, B, C) \ + { \ + C[0] += a0 * B; \ + C[1] += a1 * B; \ + C[2] += a2 * B; \ + C[3] += a3 * B; \ + C[4] += a4 * B; \ + } +#define calCore0(A, B, C) calCore(A.s0, A.s2, A.s4, A.s6, A.s8, B, C) +#define calCore1(A, B, C) calCore(A.s1, A.s3, A.s5, A.s7, A.s9, B, C) +#define calCore2(A, B, C) calCore(A.s2, A.s4, A.s6, A.s8, A.sa, B, C) +#define calCore3(A, B, C) calCore(A.s3, A.s5, A.s7, A.s9, A.sb, B, C) +#define calCore4(A, B, C) calCore(A.s4, A.s6, A.s8, A.sa, A.sc, B, C) +#define calCore5(A, B, C) calCore(A.s5, A.s7, A.s9, A.sb, A.sd, B, C) +#define calCore6(A, B, C) calCore(A.s6, A.s8, A.sa, A.sc, A.se, B, C) +#endif + +#if defined(USE_RELU) +__kernel void MANGLE_NAME(conv_direct_3d_sw2_nchw_to_ncwhc4_relu_, FWH, FT, ON) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_direct_3d_sw2_nchw_to_ncwhc4_relu6_, FWH, FT, ON) +#else +__kernel void MANGLE_NAME(conv_direct_3d_sw2_nchw_to_ncwhc4_, FWH, FT, ON) +#endif + (const int iw_str, + const int iwh_str, + const int ic_str, + const int iw_off, + const int ih_off, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const int ow, + const int ot, + const int it, + const int pt, + const int sh, + const int st, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __read_only image1d_t bias, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + const int idt = idz % ot; + const int idk = idz / ot; + + if (idx >= bx || idy >= by) { + return; + } + + T16 in_val; + T4 flt_val; + T4 out_val[ON]; + + LOADBIAS_IMAGE_ARRAY_V4(out_val, idk, bias); + int in_off = (idy * sh + ih_off) * iw_str + (idx << 1) * ON + iw_off; + int flt_off = idk * ic_str * FWHT; + + int t_be = idt * st - pt; + int t_end = t_be + FT; + if (t_be < 0) { + t_be = 0; + flt_off += pt * FWH * FWH; + } + if (t_end > it) { + t_end = it; + } + + for (int i = 0; i < ic_str; ++i) { + for (int tt = t_be; tt < t_end; ++tt) { +#if (FWH == 1) + flt_val = vload4(flt_off, flt); + 
in_val = vload16(0, in + in_off + tt * iwh_str); + calCore(in_val, flt_val, out_val); + flt_off++; +#else + for (uchar j = 0; j < FWH; ++j) { + in_val = vload16(0, in + in_off + tt * iwh_str + j * iw_str); + for (uchar k = 0; k < FWH; ++k) { + flt_val = vload4(flt_off + k, flt); + if (k == 0) { + calCore0(in_val, flt_val, out_val); + } else if (k == 1) { + calCore1(in_val, flt_val, out_val); + } else if (k == 2) { + calCore2(in_val, flt_val, out_val); +#if (FWH >= 5) + } else if (k == 3) { + calCore3(in_val, flt_val, out_val); + } else if (k == 4) { + calCore4(in_val, flt_val, out_val); +#endif +#if (FWH == 7) + } else if (k == 5) { + calCore5(in_val, flt_val, out_val); + } else if (k == 6) { + calCore6(in_val, flt_val, out_val); +#endif + } + } + flt_off += FWH; + } +#endif + } + in_off += iwh_str * it; + } + + int xn = idx * ON; + int out_off = (idz * ow_str + xn + ow_off) * oh_str + idy + oh_off; + STORE_OUTPUT_BUF_ARRAY_V4(out_val, out_off, oh_str, xn, ow, out); +} diff --git a/compute/tensor/src/gpu/mali/cl/conv_direct_s1.cl b/compute/tensor/src/gpu/mali/cl/conv_direct_s1.cl new file mode 100644 index 00000000..a8ba5bf7 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_direct_s1.cl @@ -0,0 +1,345 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
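+
+// Direct convolution with stride 1 on NCWHC4-blocked buffers. With REUSE_H
+// a work-item keeps ON outputs along the height axis packed in one T8/T16
+// register; otherwise the generic path accumulates out_val[KN][ON], i.e. KN
+// blocks of 4 output channels, and can optionally fuse an element-wise add
+// (USE_ELTWISE_NCWHC4). MANGLE_NAME pastes F, ON and KN into the kernel
+// name, so e.g. F=1, ON=4, KN=2 yields conv_direct_s1_relu_142 for the
+// USE_RELU build.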
+ +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, F, ON, KN) base##F##ON##KN +#define MANGLE_NAME(base, F, ON, KN) MANGLE_NAME_IMPL(base, F, ON, KN) + +#if defined(REUSE_H) +#if (ON == 2) +#define SET_BIAS_VAL(bv, ov) \ + { \ + ov.s0 = bv.x; \ + ov.s1 = bv.y; \ + ov.s2 = bv.z; \ + ov.s3 = bv.w; \ + ov.s4 = bv.x; \ + ov.s5 = bv.y; \ + ov.s6 = bv.z; \ + ov.s7 = bv.w; \ + } + +#define calCore(iv, fv, ov) \ + { \ + ov.s0 += iv.s0 * fv.s0 + iv.s1 * fv.s1 + iv.s2 * fv.s2 + iv.s3 * fv.s3; \ + ov.s1 += iv.s0 * fv.s4 + iv.s1 * fv.s5 + iv.s2 * fv.s6 + iv.s3 * fv.s7; \ + ov.s2 += iv.s0 * fv.s8 + iv.s1 * fv.s9 + iv.s2 * fv.sa + iv.s3 * fv.sb; \ + ov.s3 += iv.s0 * fv.sc + iv.s1 * fv.sd + iv.s2 * fv.se + iv.s3 * fv.sf; \ + ov.s4 += iv.s4 * fv.s0 + iv.s5 * fv.s1 + iv.s6 * fv.s2 + iv.s7 * fv.s3; \ + ov.s5 += iv.s4 * fv.s4 + iv.s5 * fv.s5 + iv.s6 * fv.s6 + iv.s7 * fv.s7; \ + ov.s6 += iv.s4 * fv.s8 + iv.s5 * fv.s9 + iv.s6 * fv.sa + iv.s7 * fv.sb; \ + ov.s7 += iv.s4 * fv.sc + iv.s5 * fv.sd + iv.s6 * fv.se + iv.s7 * fv.sf; \ + } + +#define VLOAD_VEC(off, buf) vload8(0, buf + off); +#define VSTORE_VEC(v, off, buf) \ + { \ + ACTIVATION_V8(v); \ + vstore8(v, 0, buf + off); \ + } +#elif (ON == 4) +#define SET_BIAS_VAL(bv, ov) \ + { \ + ov.s0 = bv.x; \ + ov.s1 = bv.y; \ + ov.s2 = bv.z; \ + ov.s3 = bv.w; \ + ov.s4 = bv.x; \ + ov.s5 = bv.y; \ + ov.s6 = bv.z; \ + ov.s7 = bv.w; \ + ov.s8 = bv.x; \ + ov.s9 = bv.y; \ + ov.sa = bv.z; \ + ov.sb = bv.w; \ + ov.sc = bv.x; \ + ov.sd = bv.y; \ + ov.se = bv.z; \ + ov.sf = bv.w; \ + } +#define calCore(iv, fv, ov) \ + { \ + ov.s0 += iv.s0 * fv.s0 + iv.s1 * fv.s1 + iv.s2 * fv.s2 + iv.s3 * fv.s3; \ + ov.s1 += iv.s0 * fv.s4 + iv.s1 * fv.s5 + iv.s2 * fv.s6 + iv.s3 * fv.s7; \ + ov.s2 += iv.s0 * fv.s8 + iv.s1 * fv.s9 + iv.s2 * fv.sa + iv.s3 * fv.sb; \ + ov.s3 += iv.s0 * fv.sc + iv.s1 * fv.sd + iv.s2 * fv.se + iv.s3 * fv.sf; \ + ov.s4 += iv.s4 * fv.s0 + iv.s5 * fv.s1 + iv.s6 * fv.s2 + iv.s7 * fv.s3; \ + ov.s5 += iv.s4 * fv.s4 + iv.s5 * fv.s5 + iv.s6 * fv.s6 + iv.s7 * fv.s7; \ + ov.s6 += iv.s4 * fv.s8 + iv.s5 * fv.s9 + iv.s6 * fv.sa + iv.s7 * fv.sb; \ + ov.s7 += iv.s4 * fv.sc + iv.s5 * fv.sd + iv.s6 * fv.se + iv.s7 * fv.sf; \ + ov.s8 += iv.s8 * fv.s0 + iv.s9 * fv.s1 + iv.sa * fv.s2 + iv.sb * fv.s3; \ + ov.s9 += iv.s8 * fv.s4 + iv.s9 * fv.s5 + iv.sa * fv.s6 + iv.sb * fv.s7; \ + ov.sa += iv.s8 * fv.s8 + iv.s9 * fv.s9 + iv.sa * fv.sa + iv.sb * fv.sb; \ + ov.sb += iv.s8 * fv.sc + iv.s9 * fv.sd + iv.sa * fv.se + iv.sb * fv.sf; \ + ov.sc += iv.sc * fv.s0 + iv.sd * fv.s1 + iv.se * fv.s2 + iv.sf * fv.s3; \ + ov.sd += iv.sc * fv.s4 + iv.sd * fv.s5 + iv.se * fv.s6 + iv.sf * fv.s7; \ + ov.se += iv.sc * fv.s8 + iv.sd * fv.s9 + iv.se * fv.sa + iv.sf * fv.sb; \ + ov.sf += iv.sc * fv.sc + iv.sd * fv.sd + iv.se * fv.se + iv.sf * fv.sf; \ + } + +#define VLOAD_VEC(off, buf) vload16(0, buf + off); +#define VSTORE_VEC(v, off, buf) \ + { \ + ACTIVATION_V16(v); \ + vstore16(v, 0, buf + off); \ + } +#endif +#if defined(USE_RELU) +__kernel void MANGLE_NAME(conv_direct_s1_h_relu_, F, ON, KN) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_direct_s1_h_relu6_, F, ON, KN) +#elif defined(USE_GELU) +__kernel void MANGLE_NAME(conv_direct_s1_h_gelu_, F, ON, KN) +#else +__kernel void MANGLE_NAME(conv_direct_s1_h_, F, ON, KN) +#endif + (const int ih_str, + int ihw_str, + const int ic_str, + const int ih_off, + const int iw_off, + const int oh_str, + int ohw_str, + const int oh_off, + const int ow_off, + const int oh, + const int sw, + const int bx, + const int by, + __global const T *in, + __global 
const T *flt, + __read_only image1d_t bias, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } +#if (ON == 2) + T8 in_val; + T8 out_val[KN]; +#elif (ON == 4) + T16 in_val; + T16 out_val[KN]; +#endif + T16 flt_val; + T4 bias_val = read_imageh(bias, sampler, idz * KN); + SET_BIAS_VAL(bias_val, out_val[0]); +#if (KN > 1) + bias_val = read_imageh(bias, sampler, idz * KN + 1); + SET_BIAS_VAL(bias_val, out_val[1]); +#endif +#if (KN > 2) + bias_val = read_imageh(bias, sampler, idz * KN + 2); + SET_BIAS_VAL(bias_val, out_val[2]); + bias_val = read_imageh(bias, sampler, idz * KN + 3); + SET_BIAS_VAL(bias_val, out_val[3]); +#endif + int in_off = ((idy + iw_off) * ih_str + idx * ON + ih_off) << 2; + int flt_off = idz * ic_str * Fsq * KN; + ihw_str = ihw_str << 2; + + for (int i = 0; i < ic_str; ++i) { + in_val = VLOAD_VEC(in_off, in); +#if (KN == 1) + flt_val = vload16(flt_off, flt); + calCore(in_val, flt_val, out_val[0]); +#elif (KN == 2) + flt_val = vload16(flt_off, flt); + calCore(in_val, flt_val, out_val[0]); + flt_val = vload16(flt_off + 1, flt); + calCore(in_val, flt_val, out_val[1]); +#elif (KN == 4) + for (uchar j = 0; j < KN; ++j) { + flt_val = vload16(flt_off + j, flt); + if (j == 0) { + calCore(in_val, flt_val, out_val[0]); + } + if (j == 1) { + calCore(in_val, flt_val, out_val[1]); + } + if (j == 2) { + calCore(in_val, flt_val, out_val[2]); + } + if (j == 3) { + calCore(in_val, flt_val, out_val[3]); + } + } +#endif + flt_off += KN; + in_off += ihw_str; + } + int out_off = (idz * KN * ohw_str + (idy + ow_off) * oh_str + idx * ON + oh_off) << 2; + VSTORE_VEC(out_val[0], out_off, out); + +#if (KN > 1) + ohw_str = ohw_str << 2; + out_off += ohw_str; + VSTORE_VEC(out_val[1], out_off, out); +#endif + +#if (KN > 2) + out_off += ohw_str; + VSTORE_VEC(out_val[2], out_off, out); + out_off += ohw_str; + VSTORE_VEC(out_val[3], out_off, out); +#endif +} + +// // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // / +#else +#if defined(USE_RELU) +__kernel void MANGLE_NAME(conv_direct_s1_relu_, F, ON, KN) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_direct_s1_relu6_, F, ON, KN) +#elif defined(USE_GELU) +__kernel void MANGLE_NAME(conv_direct_s1_gelu_, F, ON, KN) +#elif defined(USE_ELTWISE_NCWHC4) +__kernel void MANGLE_NAME(conv_direct_s1_eltwise4_, F, ON, KN) +#else +__kernel void MANGLE_NAME(conv_direct_s1_, F, ON, KN) +#endif + (const int ih_str, + const int ihw_str, + const int ic_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ohw_str, + const int oh_off, + const int ow_off, + const int ow, + const int sh, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __read_only image1d_t bias, + __global T *out +#if defined(USE_ELTWISE_NCWHC4) + , + const int eh_str, + const int ehw_str, + const int eh_off, + const int ew_off, + __global const T *eltVal +#endif + ) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } + T4 in_val[IN]; + T16 flt_val; + T4 out_val[KN][ON]; + LOADBIAS_IMAGE_ARRAY_V4(out_val[0], idz * KN, bias); +#if (KN > 1) + LOADBIAS_IMAGE_ARRAY_V4(out_val[1], idz * KN + 1, bias); +#endif +#if (KN > 2) + 
LOADBIAS_IMAGE_ARRAY_V4(out_val[2], idz * KN + 2, bias);
+    LOADBIAS_IMAGE_ARRAY_V4(out_val[3], idz * KN + 3, bias);
+#endif
+
+    int in_off = (idy * ON + iw_off) * ih_str + idx * sh + ih_off;
+    int flt_off = idz * ic_str * Fsq * KN;
+
+    for (int i = 0; i < ic_str; ++i) {
+#if (F == 1)
+        LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off, ih_str, in);
+#if (KN == 1)
+        flt_val = vload16(flt_off, flt);
+        DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[0]);
+#elif (KN == 2)
+        flt_val = vload16(flt_off, flt);
+        DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[0]);
+        flt_val = vload16(flt_off + 1, flt);
+        DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[1]);
+#elif (KN == 4)
+        for (uchar j = 0; j < KN; ++j) {
+            flt_val = vload16(flt_off + j, flt);
+            if (j == 0) {
+                DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[0]);
+            }
+            if (j == 1) {
+                DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[1]);
+            }
+            if (j == 2) {
+                DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[2]);
+            }
+            if (j == 3) {
+                DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[3]);
+            }
+        }
+#endif
+        flt_off += KN;
+#else
+        for (uchar j = 0; j < F; ++j) {
+            LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + j, ih_str, in);
+            for (uchar k = 0; k < F; ++k) {
+#if defined(BASIC_REG)
+                in_val[LN] = vload4(in_off + j + (LN + k) * ih_str, in);
+#endif
+                flt_val = vload16(flt_off + k * KN, flt);
+                DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[0]);
+#if (KN > 1)
+                flt_val = vload16(flt_off + k * KN + 1, flt);
+                DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[1]);
+#endif
+#if (KN > 2)
+                flt_val = vload16(flt_off + k * KN + 2, flt);
+                DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[2]);
+                flt_val = vload16(flt_off + k * KN + 3, flt);
+                DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[3]);
+#endif
+                UPDATE_REG(in_val);
+            }
+            flt_off += F * KN;
+        }
+#endif
+        in_off += ihw_str;
+    }
+
+#if defined(USE_ELTWISE_NCWHC4)
+    int elt_off = idz * KN * ehw_str + (idy * ON + ew_off) * eh_str + idx + eh_off;
+    ADD_ELTWISE_BUF_ARRAY_V4(out_val[0], elt_off, eh_str, eltVal);
+#endif
+    int out_off = idz * KN * ohw_str + (idy * ON + ow_off) * oh_str + idx + oh_off;
+    STORE_OUTPUT_BUF_ARRAY_V4(out_val[0], out_off, oh_str, idy * ON, ow, out);
+#if (KN > 1)
+#if defined(USE_ELTWISE_NCWHC4)
+    elt_off += ehw_str;
+    ADD_ELTWISE_BUF_ARRAY_V4(out_val[1], elt_off, eh_str, eltVal);
+#endif
+    out_off += ohw_str;
+    STORE_OUTPUT_BUF_ARRAY_V4(out_val[1], out_off, oh_str, idy * ON, ow, out);
+#endif
+#if (KN > 2)
+#if defined(USE_ELTWISE_NCWHC4)
+    elt_off += ehw_str;
+    ADD_ELTWISE_BUF_ARRAY_V4(out_val[2], elt_off, eh_str, eltVal);
+    elt_off += ehw_str;
+    ADD_ELTWISE_BUF_ARRAY_V4(out_val[3], elt_off, eh_str, eltVal);
+#endif
+    out_off += ohw_str;
+    STORE_OUTPUT_BUF_ARRAY_V4(out_val[2], out_off, oh_str, idy * ON, ow, out);
+    out_off += ohw_str;
+    STORE_OUTPUT_BUF_ARRAY_V4(out_val[3], out_off, oh_str, idy * ON, ow, out);
+#endif
+}
+#endif
diff --git a/compute/tensor/src/gpu/mali/cl/conv_direct_s1_fn_spe.cl b/compute/tensor/src/gpu/mali/cl/conv_direct_s1_fn_spe.cl
new file mode 100644
index 00000000..f54507ac
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/cl/conv_direct_s1_fn_spe.cl
@@ -0,0 +1,503 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, F, ON) base##F##ON +#define MANGLE_NAME(base, F, ON) MANGLE_NAME_IMPL(base, F, ON) + +#if (ON == 3) +#define LOAD_BIAS(ov, bias) \ + { \ + ov[0] = bias[0]; \ + ov[1] = ov[0]; \ + ov[2] = ov[0]; \ + } +#define CALCORE(iv, fv, ov) \ + { \ + ov[0] += iv[0].x * fv.x + iv[0].y * fv.y + iv[0].z * fv.z + iv[0].w * fv.w; \ + ov[1] += iv[1].x * fv.x + iv[1].y * fv.y + iv[1].z * fv.z + iv[1].w * fv.w; \ + ov[2] += iv[2].x * fv.x + iv[2].y * fv.y + iv[2].z * fv.z + iv[2].w * fv.w; \ + } +#endif + +#if (ON == 4) +#define LOAD_BIAS(ov, bias) \ + { \ + ov[0] = bias[0]; \ + ov[1] = ov[0]; \ + ov[2] = ov[0]; \ + ov[3] = ov[0]; \ + } +#define CALCORE(iv, fv, ov) \ + { \ + ov[0] += iv[0].x * fv.x + iv[0].y * fv.y + iv[0].z * fv.z + iv[0].w * fv.w; \ + ov[1] += iv[1].x * fv.x + iv[1].y * fv.y + iv[1].z * fv.z + iv[1].w * fv.w; \ + ov[2] += iv[2].x * fv.x + iv[2].y * fv.y + iv[2].z * fv.z + iv[2].w * fv.w; \ + ov[3] += iv[3].x * fv.x + iv[3].y * fv.y + iv[3].z * fv.z + iv[3].w * fv.w; \ + } +#endif + +#if (ON == 5) +#define LOAD_BIAS(ov, bias) \ + { \ + ov[0] = bias[0]; \ + ov[1] = ov[0]; \ + ov[2] = ov[0]; \ + ov[3] = ov[0]; \ + ov[4] = ov[0]; \ + } +#define CALCORE(iv, fv, ov) \ + { \ + ov[0] += iv[0].x * fv.x + iv[0].y * fv.y + iv[0].z * fv.z + iv[0].w * fv.w; \ + ov[1] += iv[1].x * fv.x + iv[1].y * fv.y + iv[1].z * fv.z + iv[1].w * fv.w; \ + ov[2] += iv[2].x * fv.x + iv[2].y * fv.y + iv[2].z * fv.z + iv[2].w * fv.w; \ + ov[3] += iv[3].x * fv.x + iv[3].y * fv.y + iv[3].z * fv.z + iv[3].w * fv.w; \ + ov[4] += iv[4].x * fv.x + iv[4].y * fv.y + iv[4].z * fv.z + iv[4].w * fv.w; \ + } +#endif + +#if (ON == 6) +#define LOAD_BIAS(ov, bias) \ + { \ + ov[0] = bias[0]; \ + ov[1] = ov[0]; \ + ov[2] = ov[0]; \ + ov[3] = ov[0]; \ + ov[4] = ov[0]; \ + ov[5] = ov[0]; \ + } +#define CALCORE(iv, fv, ov) \ + { \ + ov[0] += iv[0].x * fv.x + iv[0].y * fv.y + iv[0].z * fv.z + iv[0].w * fv.w; \ + ov[1] += iv[1].x * fv.x + iv[1].y * fv.y + iv[1].z * fv.z + iv[1].w * fv.w; \ + ov[2] += iv[2].x * fv.x + iv[2].y * fv.y + iv[2].z * fv.z + iv[2].w * fv.w; \ + ov[3] += iv[3].x * fv.x + iv[3].y * fv.y + iv[3].z * fv.z + iv[3].w * fv.w; \ + ov[4] += iv[4].x * fv.x + iv[4].y * fv.y + iv[4].z * fv.z + iv[4].w * fv.w; \ + ov[5] += iv[5].x * fv.x + iv[5].y * fv.y + iv[5].z * fv.z + iv[5].w * fv.w; \ + } +#endif + +#if (ON == 7) +#define LOAD_BIAS(ov, bias) \ + { \ + ov[0] = bias[0]; \ + ov[1] = ov[0]; \ + ov[2] = ov[0]; \ + ov[3] = ov[0]; \ + 
ov[4] = ov[0]; \ + ov[5] = ov[0]; \ + ov[6] = ov[0]; \ + } +#define CALCORE(iv, fv, ov) \ + { \ + ov[0] += iv[0].x * fv.x + iv[0].y * fv.y + iv[0].z * fv.z + iv[0].w * fv.w; \ + ov[1] += iv[1].x * fv.x + iv[1].y * fv.y + iv[1].z * fv.z + iv[1].w * fv.w; \ + ov[2] += iv[2].x * fv.x + iv[2].y * fv.y + iv[2].z * fv.z + iv[2].w * fv.w; \ + ov[3] += iv[3].x * fv.x + iv[3].y * fv.y + iv[3].z * fv.z + iv[3].w * fv.w; \ + ov[4] += iv[4].x * fv.x + iv[4].y * fv.y + iv[4].z * fv.z + iv[4].w * fv.w; \ + ov[5] += iv[5].x * fv.x + iv[5].y * fv.y + iv[5].z * fv.z + iv[5].w * fv.w; \ + ov[6] += iv[6].x * fv.x + iv[6].y * fv.y + iv[6].z * fv.z + iv[6].w * fv.w; \ + } +#endif + +#if (ON == 8) +#define LOAD_BIAS(ov, bias) \ + { \ + ov[0] = bias[0]; \ + ov[1] = ov[0]; \ + ov[2] = ov[0]; \ + ov[3] = ov[0]; \ + ov[4] = ov[0]; \ + ov[5] = ov[0]; \ + ov[6] = ov[0]; \ + ov[7] = ov[0]; \ + } +#define CALCORE(iv, fv, ov) \ + { \ + ov[0] += iv[0].x * fv.x + iv[0].y * fv.y + iv[0].z * fv.z + iv[0].w * fv.w; \ + ov[1] += iv[1].x * fv.x + iv[1].y * fv.y + iv[1].z * fv.z + iv[1].w * fv.w; \ + ov[2] += iv[2].x * fv.x + iv[2].y * fv.y + iv[2].z * fv.z + iv[2].w * fv.w; \ + ov[3] += iv[3].x * fv.x + iv[3].y * fv.y + iv[3].z * fv.z + iv[3].w * fv.w; \ + ov[4] += iv[4].x * fv.x + iv[4].y * fv.y + iv[4].z * fv.z + iv[4].w * fv.w; \ + ov[5] += iv[5].x * fv.x + iv[5].y * fv.y + iv[5].z * fv.z + iv[5].w * fv.w; \ + ov[6] += iv[6].x * fv.x + iv[6].y * fv.y + iv[6].z * fv.z + iv[6].w * fv.w; \ + ov[7] += iv[7].x * fv.x + iv[7].y * fv.y + iv[7].z * fv.z + iv[7].w * fv.w; \ + } +#endif + +#if defined(USE_NCHW) +#if (ON == 3) +#define STORE_OUT(ov, off, id, ow, buf) \ + { \ + ACTIVATION_ARRAY3(ov); \ + if (id + 3 < ow) { \ + STORE_BUF_ARRAY3(ov, off, buf); \ + } else { \ + buf[off] = ov[0]; \ + if (id + 1 < ow) \ + buf[off + 1] = ov[1]; \ + if (id + 2 < ow) \ + buf[off + 2] = ov[2]; \ + } \ + } +#endif +#if (ON == 4) +#define STORE_OUT(ov, off, id, ow, buf) \ + { \ + ACTIVATION_ARRAY4(ov); \ + if (id + 4 < ow) { \ + STORE_BUF_ARRAY4(ov, off, buf); \ + } else { \ + buf[off] = ov[0]; \ + if (id + 1 < ow) \ + buf[off + 1] = ov[1]; \ + if (id + 2 < ow) \ + buf[off + 2] = ov[2]; \ + if (id + 3 < ow) \ + buf[off + 3] = ov[3]; \ + } \ + } +#endif +#if (ON == 5) +#define STORE_OUT(ov, off, id, ow, buf) \ + { \ + ACTIVATION_ARRAY5(ov); \ + if (id + 5 < ow) { \ + STORE_BUF_ARRAY5(ov, off, buf); \ + } else { \ + buf[off] = ov[0]; \ + if (id + 1 < ow) \ + buf[off + 1] = ov[1]; \ + if (id + 2 < ow) \ + buf[off + 2] = ov[2]; \ + if (id + 3 < ow) \ + buf[off + 3] = ov[3]; \ + if (id + 4 < ow) \ + buf[off + 4] = ov[4]; \ + } \ + } +#endif +#if (ON == 6) +#define STORE_OUT(ov, off, id, ow, buf) \ + { \ + ACTIVATION_ARRAY6(ov); \ + if (id + 6 < ow) { \ + STORE_BUF_ARRAY6(ov, off, buf); \ + } else { \ + buf[off] = ov[0]; \ + if (id + 1 < ow) \ + buf[off + 1] = ov[1]; \ + if (id + 2 < ow) \ + buf[off + 2] = ov[2]; \ + if (id + 3 < ow) \ + buf[off + 3] = ov[3]; \ + if (id + 4 < ow) \ + buf[off + 4] = ov[4]; \ + if (id + 5 < ow) \ + buf[off + 5] = ov[5]; \ + } \ + } +#endif +#if (ON == 7) +#define STORE_OUT(ov, off, id, ow, buf) \ + { \ + ACTIVATION_ARRAY7(ov); \ + if (id + 7 < ow) { \ + STORE_BUF_ARRAY7(ov, off, buf); \ + } else { \ + buf[off] = ov[0]; \ + if (id + 1 < ow) \ + buf[off + 1] = ov[1]; \ + if (id + 2 < ow) \ + buf[off + 2] = ov[2]; \ + if (id + 3 < ow) \ + buf[off + 3] = ov[3]; \ + if (id + 4 < ow) \ + buf[off + 4] = ov[4]; \ + if (id + 5 < ow) \ + buf[off + 5] = ov[5]; \ + if (id + 6 < ow) \ + buf[off + 6] = ov[6]; \ + } \ + } +#endif +#if 
(ON == 8) +#define STORE_OUT(ov, off, id, ow, buf) \ + { \ + ACTIVATION_ARRAY8(ov); \ + if (id + 8 < ow) { \ + STORE_BUF_ARRAY8(ov, off, buf); \ + } else { \ + buf[off] = ov[0]; \ + if (id + 1 < ow) \ + buf[off + 1] = ov[1]; \ + if (id + 2 < ow) \ + buf[off + 2] = ov[2]; \ + if (id + 3 < ow) \ + buf[off + 3] = ov[3]; \ + if (id + 4 < ow) \ + buf[off + 4] = ov[4]; \ + if (id + 5 < ow) \ + buf[off + 5] = ov[5]; \ + if (id + 6 < ow) \ + buf[off + 6] = ov[6]; \ + if (id + 7 < ow) \ + buf[off + 7] = ov[7]; \ + } \ + } +#endif +#else +#if (ON == 3) +#define STORE_OUT(ov, off, str, id, ow, buf) \ + { \ + ACTIVATION_ARRAY3(ov); \ + T4 tmp = 0; \ + tmp.x = ov[0]; \ + vstore4(tmp, off, buf); \ + if (id + 1 < ow) { \ + tmp.x = ov[1]; \ + vstore4(tmp, off + str, buf); \ + } \ + if (id + 2 < ow) { \ + tmp.x = ov[2]; \ + vstore4(tmp, off + 2 * str, buf); \ + } \ + } +#endif +#if (ON == 4) +#define STORE_OUT(ov, off, str, id, ow, buf) \ + { \ + ACTIVATION_ARRAY4(ov); \ + T4 tmp = 0; \ + tmp.x = ov[0]; \ + vstore4(tmp, off, buf); \ + if (id + 1 < ow) { \ + tmp.x = ov[1]; \ + vstore4(tmp, off + str, buf); \ + } \ + if (id + 2 < ow) { \ + tmp.x = ov[2]; \ + vstore4(tmp, off + 2 * str, buf); \ + } \ + if (id + 3 < ow) { \ + tmp.x = ov[3]; \ + vstore4(tmp, off + 3 * str, buf); \ + } \ + } +#endif +#if (ON == 5) +#define STORE_OUT(ov, off, str, id, ow, buf) \ + { \ + ACTIVATION_ARRAY5(ov); \ + T4 tmp = 0; \ + tmp.x = ov[0]; \ + vstore4(tmp, off, buf); \ + if (id + 1 < ow) { \ + tmp.x = ov[1]; \ + vstore4(tmp, off + str, buf); \ + } \ + if (id + 2 < ow) { \ + tmp.x = ov[2]; \ + vstore4(tmp, off + 2 * str, buf); \ + } \ + if (id + 3 < ow) { \ + tmp.x = ov[3]; \ + vstore4(tmp, off + 3 * str, buf); \ + } \ + if (id + 4 < ow) { \ + tmp.x = ov[4]; \ + vstore4(tmp, off + 4 * str, buf); \ + } \ + } +#endif +#if (ON == 6) +#define STORE_OUT(ov, off, str, id, ow, buf) \ + { \ + ACTIVATION_ARRAY6(ov); \ + T4 tmp = 0; \ + tmp.x = ov[0]; \ + vstore4(tmp, off, buf); \ + if (id + 1 < ow) { \ + tmp.x = ov[1]; \ + vstore4(tmp, off + str, buf); \ + } \ + if (id + 2 < ow) { \ + tmp.x = ov[2]; \ + vstore4(tmp, off + 2 * str, buf); \ + } \ + if (id + 3 < ow) { \ + tmp.x = ov[3]; \ + vstore4(tmp, off + 3 * str, buf); \ + } \ + if (id + 4 < ow) { \ + tmp.x = ov[4]; \ + vstore4(tmp, off + 4 * str, buf); \ + } \ + if (id + 5 < ow) { \ + tmp.x = ov[5]; \ + vstore4(tmp, off + 5 * str, buf); \ + } \ + } +#endif +#if (ON == 7) +#define STORE_OUT(ov, off, str, id, ow, buf) \ + { \ + ACTIVATION_ARRAY7(ov); \ + T4 tmp = 0; \ + tmp.x = ov[0]; \ + vstore4(tmp, off, buf); \ + if (id + 1 < ow) { \ + tmp.x = ov[1]; \ + vstore4(tmp, off + str, buf); \ + } \ + if (id + 2 < ow) { \ + tmp.x = ov[2]; \ + vstore4(tmp, off + 2 * str, buf); \ + } \ + if (id + 3 < ow) { \ + tmp.x = ov[3]; \ + vstore4(tmp, off + 3 * str, buf); \ + } \ + if (id + 4 < ow) { \ + tmp.x = ov[4]; \ + vstore4(tmp, off + 4 * str, buf); \ + } \ + if (id + 5 < ow) { \ + tmp.x = ov[5]; \ + vstore4(tmp, off + 5 * str, buf); \ + } \ + if (id + 6 < ow) { \ + tmp.x = ov[6]; \ + vstore4(tmp, off + 6 * str, buf); \ + } \ + } +#endif +#if (ON == 8) +#define STORE_OUT(ov, off, str, id, ow, buf) \ + { \ + ACTIVATION_ARRAY8(ov); \ + T4 tmp = 0; \ + tmp.x = ov[0]; \ + vstore4(tmp, off, buf); \ + if (id + 1 < ow) { \ + tmp.x = ov[1]; \ + vstore4(tmp, off + str, buf); \ + } \ + if (id + 2 < ow) { \ + tmp.x = ov[2]; \ + vstore4(tmp, off + 2 * str, buf); \ + } \ + if (id + 3 < ow) { \ + tmp.x = ov[3]; \ + vstore4(tmp, off + 3 * str, buf); \ + } \ + if (id + 4 < ow) { \ + tmp.x = ov[4]; \ + 
vstore4(tmp, off + 4 * str, buf); \ + } \ + if (id + 5 < ow) { \ + tmp.x = ov[5]; \ + vstore4(tmp, off + 5 * str, buf); \ + } \ + if (id + 6 < ow) { \ + tmp.x = ov[6]; \ + vstore4(tmp, off + 6 * str, buf); \ + } \ + if (id + 7 < ow) { \ + tmp.x = ov[7]; \ + vstore4(tmp, off + 7 * str, buf); \ + } \ + } +#endif +#endif + +#if defined(USE_NCHW) +#if defined(USE_RELU) +__kernel void MANGLE_NAME(conv_direct_s1_fn_spe_relu_nchw_, F, ON) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_direct_s1_fn_spe_relu6_nchw_, F, ON) +#else +__kernel void MANGLE_NAME(conv_direct_s1_fn_spe_nchw_, F, ON) +#endif +#else +#if defined(USE_RELU) +__kernel void MANGLE_NAME(conv_direct_s1_fn_spe_relu_, F, ON) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_direct_s1_fn_spe_relu6_, F, ON) +#else +__kernel void MANGLE_NAME(conv_direct_s1_fn_spe_, F, ON) +#endif +#endif + (const int ih_str, + const int ihw_str, + const int ic_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ow_str, + const int ohw_str, + const int oh_off, + const int ow_off, + const int ow, + const int sh, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __global const T *bias, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + if (idx >= bx || idy >= by) { + return; + } + T4 flt_val; + T4 in_val[IN]; + T out_val[ON]; + + LOAD_BIAS(out_val, bias); + int flt_off = 0; + int in_off = (idy * ON + iw_off) * ih_str + idx * sh + ih_off; + +#if (F == 1) + for (int i = 0; i < ic_str; ++i) { + LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off, ih_str, in); + flt_val = vload4(flt_off, flt); + CALCORE(in_val, flt_val, out_val); + flt_off += 1; + in_off += ihw_str; + } +#else + for (int i = 0; i < ic_str; ++i) { + for (uchar j = 0; j < F; j++) { + LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + j, ih_str, in); + for (uchar k = 0; k < F; k++) { + flt_val = vload4(flt_off + k, flt); + CALCORE(in_val, flt_val, out_val); + UPDATE_REG(in_val); + } + flt_off += F; + } + in_off += ihw_str; + } +#endif + +#if defined(USE_NCHW) + int out_off = (idx + oh_off) * ow_str + idy * ON + ow_off; + STORE_OUT(out_val, out_off, idy * ON, ow, out); +#else + int out_off = (idy * ON + ow_off) * oh_str + idx + oh_off; + STORE_OUT(out_val, out_off, oh_str, idy * ON, ow, out); +#endif +} diff --git a/compute/tensor/src/gpu/mali/cl/conv_direct_s1_nchw_to_ncwhc4.cl b/compute/tensor/src/gpu/mali/cl/conv_direct_s1_nchw_to_ncwhc4.cl new file mode 100644 index 00000000..55e20be3 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_direct_s1_nchw_to_ncwhc4.cl @@ -0,0 +1,156 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, F, ON) base##F##ON +#define MANGLE_NAME(base, F, ON) MANGLE_NAME_IMPL(base, F, ON) + +#if (F == 1) +#define calCore(A, B, C) \ + { \ + C[0] += A.s0 * B; \ + C[1] += A.s1 * B; \ + C[2] += A.s2 * B; \ + C[3] += A.s3 * B; \ + C[4] += A.s4 * B; \ + C[5] += A.s5 * B; \ + C[6] += A.s6 * B; \ + C[7] += A.s7 * B; \ + } +#elif (F == 3) +#define calCore(a0, a1, a2, a3, a4, a5, B, C) \ + { \ + C[0] += a0 * B; \ + C[1] += a1 * B; \ + C[2] += a2 * B; \ + C[3] += a3 * B; \ + C[4] += a4 * B; \ + C[5] += a5 * B; \ + } +#define calCore0(A, B, C) calCore(A.s0, A.s1, A.s2, A.s3, A.s4, A.s5, B, C) +#define calCore1(A, B, C) calCore(A.s1, A.s2, A.s3, A.s4, A.s5, A.s6, B, C) +#define calCore2(A, B, C) calCore(A.s2, A.s3, A.s4, A.s5, A.s6, A.s7, B, C) +#elif (F == 5) +#define calCore(a0, a1, a2, a3, B, C) \ + { \ + C[0] += a0 * B; \ + C[1] += a1 * B; \ + C[2] += a2 * B; \ + C[3] += a3 * B; \ + } +#define calCore0(A, B, C) calCore(A.s0, A.s1, A.s2, A.s3, B, C) +#define calCore1(A, B, C) calCore(A.s1, A.s2, A.s3, A.s4, B, C) +#define calCore2(A, B, C) calCore(A.s2, A.s3, A.s4, A.s5, B, C) +#define calCore3(A, B, C) calCore(A.s3, A.s4, A.s5, A.s6, B, C) +#define calCore4(A, B, C) calCore(A.s4, A.s5, A.s6, A.s7, B, C) +#elif (F == 7) +#define calCore(a0, a1, B, C) \ + { \ + C[0] += a0 * B; \ + C[1] += a1 * B; \ + } +#define calCore0(A, B, C) calCore(A.s0, A.s1, B, C) +#define calCore1(A, B, C) calCore(A.s1, A.s2, B, C) +#define calCore2(A, B, C) calCore(A.s2, A.s3, B, C) +#define calCore3(A, B, C) calCore(A.s3, A.s4, B, C) +#define calCore4(A, B, C) calCore(A.s4, A.s5, B, C) +#define calCore5(A, B, C) calCore(A.s5, A.s6, B, C) +#define calCore6(A, B, C) calCore(A.s6, A.s7, B, C) +#endif + +#if defined(USE_RELU) +__kernel void MANGLE_NAME(conv_direct_s1_nchw_to_ncwhc4_relu_, F, ON) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_direct_s1_nchw_to_ncwhc4_relu6_, F, ON) +#else +__kernel void MANGLE_NAME(conv_direct_s1_nchw_to_ncwhc4_, F, ON) +#endif + (const int iw_str, + const int iwh_str, + const int ic_str, + const int iw_off, + const int ih_off, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const int ow, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __read_only image1d_t bias, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } + + T8 in_val; + T4 flt_val; + T4 out_val[ON]; + + LOADBIAS_IMAGE_ARRAY_V4(out_val, idz, bias); + int in_off = (idy + ih_off) * iw_str + idx * ON + iw_off; + int flt_off = idz * ic_str * Fsq; + + for (int i = 0; i < ic_str; ++i) { +#if (F == 1) + flt_val = vload4(flt_off, flt); + in_val = vload8(0, in + in_off); + calCore(in_val, flt_val, out_val); + flt_off++; +#else + for (uchar j = 0; j < F; ++j) { + in_val = vload8(0, in + in_off + j * iw_str); + for (uchar k = 0; k < F; ++k) { + flt_val = vload4(flt_off + k, flt); + if (k == 0) { + calCore0(in_val, flt_val, out_val); + } + if (k == 1) { + calCore1(in_val, flt_val, out_val); + } + if (k == 2) { + calCore2(in_val, flt_val, out_val); + } +#if (F > 3) + if (k == 3) { + calCore3(in_val, flt_val, out_val); + } + if (k 
== 4) { + calCore4(in_val, flt_val, out_val); + } +#endif +#if (F > 5) + if (k == 5) { + calCore5(in_val, flt_val, out_val); + } + if (k == 6) { + calCore6(in_val, flt_val, out_val); + } +#endif + } + flt_off += F; + } +#endif + in_off += iwh_str; + } + + int xn = idx * ON; + int out_off = (idz * ow_str + xn + ow_off) * oh_str + idy + oh_off; + STORE_OUTPUT_BUF_ARRAY_V4(out_val, out_off, oh_str, xn, ow, out); +} diff --git a/tensor_computing/src/gpu/mali/cl/conv_direct_s1_spe_f1c3k1.cl b/compute/tensor/src/gpu/mali/cl/conv_direct_s1_spe_f1c3k1.cl similarity index 83% rename from tensor_computing/src/gpu/mali/cl/conv_direct_s1_spe_f1c3k1.cl rename to compute/tensor/src/gpu/mali/cl/conv_direct_s1_spe_f1c3k1.cl index cf1fbd39..f973a38e 100644 --- a/tensor_computing/src/gpu/mali/cl/conv_direct_s1_spe_f1c3k1.cl +++ b/compute/tensor/src/gpu/mali/cl/conv_direct_s1_spe_f1c3k1.cl @@ -11,19 +11,25 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - -#define MANGLE_NAME_IMPL(base, EW) base ## EW +#define MANGLE_NAME_IMPL(base, EW) base##EW #define MANGLE_NAME(base, EW) MANGLE_NAME_IMPL(base, EW) -__kernel void MANGLE_NAME(conv_direct_s1_spe_f1c3k1_, EW)(const int iw_str, const int ow_str, const int ow_off, const int oh_off, const int ow_d2, const int bx, const int by, - __global const T* in, __global const T* flt, __global T* out){ - +__kernel void MANGLE_NAME(conv_direct_s1_spe_f1c3k1_, EW)(const int iw_str, + const int ow_str, + const int ow_off, + const int oh_off, + const int ow_d2, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __global T *out) +{ const int idx = get_global_id(0); const int idy = get_global_id(1); - if(idx >= bx || idy >= by) return; + if (idx >= bx || idy >= by) { + return; + } T4 flt_val; T8 in_val; T2 out_val; @@ -31,16 +37,16 @@ __kernel void MANGLE_NAME(conv_direct_s1_spe_f1c3k1_, EW)(const int iw_str, cons out_val.x = flt_val.w; out_val.y = flt_val.w; int in_off = (idy * iw_str + (idx << 1)) * 3; - + in_val = vload8(0, in + in_off); out_val.x += in_val.s0 * flt_val.x + in_val.s1 * flt_val.y + in_val.s2 * flt_val.z; out_val.y += in_val.s3 * flt_val.x + in_val.s4 * flt_val.y + in_val.s5 * flt_val.z; - + int out_off = (idy + oh_off) * ow_str + (idx << 1) + ow_off; -#if(EW == 0) +#if (EW == 0) vstore2(out_val, 0, out + out_off); -#elif(EW == 1) - if(idx < ow_d2){ +#elif (EW == 1) + if (idx < ow_d2) { vstore2(out_val, 0, out + out_off); } else { out[out_off] = out_val.x; diff --git a/tensor_computing/src/gpu/mali/cl/conv_direct_s2.cl b/compute/tensor/src/gpu/mali/cl/conv_direct_s2.cl similarity index 80% rename from tensor_computing/src/gpu/mali/cl/conv_direct_s2.cl rename to compute/tensor/src/gpu/mali/cl/conv_direct_s2.cl index d8bfdc38..b1ab72fa 100644 --- a/tensor_computing/src/gpu/mali/cl/conv_direct_s2.cl +++ b/compute/tensor/src/gpu/mali/cl/conv_direct_s2.cl @@ -11,55 +11,67 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
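+
+// Direct convolution with stride 2: the w coordinate is doubled
+// ((idy << 1) * ON) while the sh argument sets the stride applied to the
+// h index idx. As in conv_direct_s1, out_val[KN][ON] holds KN blocks of 4
+// output channels, and the BASIC_REG path re-loads only the most recent
+// in_val register between filter taps instead of the whole input window.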
- - - - #include "kernel_def.h" -#define MANGLE_NAME_IMPL(base, F, ON, KN) base ## F ## ON ## KN +#define MANGLE_NAME_IMPL(base, F, ON, KN) base##F##ON##KN #define MANGLE_NAME(base, F, ON, KN) MANGLE_NAME_IMPL(base, F, ON, KN) - - #if defined(USE_RELU) __kernel void MANGLE_NAME(conv_direct_s2_relu_, F, ON, KN) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_direct_s2_relu6_, F, ON, KN) #else __kernel void MANGLE_NAME(conv_direct_s2_, F, ON, KN) #endif -(const int ih_str, const int ihw_str, const int ic_str, const int ih_off, const int iw_off, const int oh_str, const int ohw_str, const int oh_off, const int ow_off, -const int ow, const int bx, const int by, __global const T* in, __global const T* flt, __read_only image1d_t bias, __global T* out) + (const int ih_str, + const int ihw_str, + const int ic_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ohw_str, + const int oh_off, + const int ow_off, + const int ow, + const int sh, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __read_only image1d_t bias, + __global T *out) { - const int idx = get_global_id(0); const int idy = get_global_id(1); const int idz = get_global_id(2); - if(idx >= bx || idy >= by) return; + if (idx >= bx || idy >= by) { + return; + } - T4 in_val[IN]; + T4 in_val[IN]; T16 flt_val; - T4 out_val[KN][ON]; + T4 out_val[KN][ON]; LOADBIAS_IMAGE_ARRAY_V4(out_val[0], idz * KN, bias); -#if(KN > 1) +#if (KN > 1) LOADBIAS_IMAGE_ARRAY_V4(out_val[1], idz * KN + 1, bias); #endif -#if(KN > 2) +#if (KN > 2) LOADBIAS_IMAGE_ARRAY_V4(out_val[2], idz * KN + 2, bias); LOADBIAS_IMAGE_ARRAY_V4(out_val[3], idz * KN + 3, bias); -#endif +#endif - int in_off = ((idy << 1) * ON + iw_off) * ih_str + (idx << 1) + ih_off; + int in_off = ((idy << 1) * ON + iw_off) * ih_str + idx * sh + ih_off; int flt_off = idz * ic_str * Fsq * KN; - for(int i = 0; i < ic_str; ++i) { -#if(F == 1) + for (int i = 0; i < ic_str; ++i) { +#if (F == 1) LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off, (ih_str << 1), in); flt_val = vload16(flt_off, flt); DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[0]); -#if(KN > 1) +#if (KN > 1) flt_val = vload16(flt_off + 1, flt); DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[1]); #endif -#if(KN > 2) +#if (KN > 2) flt_val = vload16(flt_off + 2, flt); DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[2]); flt_val = vload16(flt_off + 3, flt); @@ -67,18 +79,18 @@ const int ow, const int bx, const int by, __global const T* in, __global const T #endif flt_off += KN; #else - for(uchar j = 0; j < F; ++j) { + for (uchar j = 0; j < F; ++j) { #if defined(BASIC_REG) LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + j, (ih_str << 1), in); - for(uchar k = 0; k < F; k += 2) { - in_val[LN] = vload4(in_off + j + ((LN << 1) + k) * ih_str , in); + for (uchar k = 0; k < F; k += 2) { + in_val[LN] = vload4(in_off + j + ((LN << 1) + k) * ih_str, in); flt_val = vload16(flt_off + k * KN, flt); DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[0]); -#if(KN > 1) +#if (KN > 1) flt_val = vload16(flt_off + k * KN + 1, flt); DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[1]); -#endif -#if(KN > 2) +#endif +#if (KN > 2) flt_val = vload16(flt_off + k * KN + 2, flt); DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[2]); flt_val = vload16(flt_off + k * KN + 3, flt); @@ -87,15 +99,15 @@ const int ow, const int bx, const int by, __global const T* in, __global const T UPDATE_REG(in_val); } LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + j + ih_str, (ih_str << 1), in); - for(uchar k = 1; k < F; k += 2) { - in_val[LN] = 
vload4(in_off + j + ((LN << 1) + k) * ih_str , in); + for (uchar k = 1; k < F; k += 2) { + in_val[LN] = vload4(in_off + j + ((LN << 1) + k) * ih_str, in); flt_val = vload16(flt_off + k * KN, flt); DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[0]) -#if(KN > 1) +#if (KN > 1) flt_val = vload16(flt_off + k * KN + 1, flt); DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[1]); -#endif -#if(KN > 2) +#endif +#if (KN > 2) flt_val = vload16(flt_off + k * KN + 2, flt); DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[2]); flt_val = vload16(flt_off + k * KN + 3, flt); @@ -105,14 +117,14 @@ const int ow, const int bx, const int by, __global const T* in, __global const T } #else LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + j, ih_str, in); - for(uchar k = 0; k < F; ++k) { + for (uchar k = 0; k < F; ++k) { flt_val = vload16(flt_off + k * KN, flt); DIRECT_CONV_CAL_CORE_S2(in_val, flt_val, out_val[0]); -#if(KN > 1) +#if (KN > 1) flt_val = vload16(flt_off + k * KN + 1, flt); DIRECT_CONV_CAL_CORE_S2(in_val, flt_val, out_val[1]); -#endif -#if(KN > 2) +#endif +#if (KN > 2) flt_val = vload16(flt_off + k * KN + 2, flt); DIRECT_CONV_CAL_CORE_S2(in_val, flt_val, out_val[2]); flt_val = vload16(flt_off + k * KN + 3, flt); @@ -120,7 +132,7 @@ const int ow, const int bx, const int by, __global const T* in, __global const T #endif UPDATE_REG(in_val); } -#endif +#endif flt_off += F * KN; } #endif @@ -129,11 +141,11 @@ const int ow, const int bx, const int by, __global const T* in, __global const T int out_off = idz * KN * ohw_str + (idy * ON + ow_off) * oh_str + idx + oh_off; STORE_OUTPUT_BUF_ARRAY_V4(out_val[0], out_off, oh_str, idy * ON, ow, out); -#if(KN > 1) +#if (KN > 1) out_off += ohw_str; STORE_OUTPUT_BUF_ARRAY_V4(out_val[1], out_off, oh_str, idy * ON, ow, out); #endif -#if(KN > 2) +#if (KN > 2) out_off += ohw_str; STORE_OUTPUT_BUF_ARRAY_V4(out_val[2], out_off, oh_str, idy * ON, ow, out); out_off += ohw_str; diff --git a/compute/tensor/src/gpu/mali/cl/conv_direct_s2_nchw_to_ncwhc4.cl b/compute/tensor/src/gpu/mali/cl/conv_direct_s2_nchw_to_ncwhc4.cl new file mode 100644 index 00000000..dce60fdb --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_direct_s2_nchw_to_ncwhc4.cl @@ -0,0 +1,168 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
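+
+// NCHW-to-NCWHC4 direct convolution with stride 2 (both idy and idx are
+// doubled): a T16 input row is loaded per filter tap and the calCore*
+// macros take every second element (A.s0, A.s2, ...), mirroring the
+// stride-1 kernel conv_direct_s1_nchw_to_ncwhc4 above.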
+ +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, F, ON) base##F##ON +#define MANGLE_NAME(base, F, ON) MANGLE_NAME_IMPL(base, F, ON) + +#if (F == 1) +#define calCore(A, B, C) \ + { \ + C[0] += A.s0 * B; \ + C[1] += A.s2 * B; \ + C[2] += A.s4 * B; \ + C[3] += A.s6 * B; \ + C[4] += A.s8 * B; \ + C[5] += A.sa * B; \ + C[6] += A.sc * B; \ + C[7] += A.se * B; \ + } +#elif (F == 3) +#define calCore(a0, a1, a2, a3, a4, a5, a6, B, C) \ + { \ + C[0] += a0 * B; \ + C[1] += a1 * B; \ + C[2] += a2 * B; \ + C[3] += a3 * B; \ + C[4] += a4 * B; \ + C[5] += a5 * B; \ + C[6] += a6 * B; \ + } +#define calCore0(A, B, C) calCore(A.s0, A.s2, A.s4, A.s6, A.s8, A.sa, A.sc, B, C) +#define calCore1(A, B, C) calCore(A.s1, A.s3, A.s5, A.s7, A.s9, A.sb, A.sd, B, C) +#define calCore2(A, B, C) calCore(A.s2, A.s4, A.s6, A.s8, A.sa, A.sc, A.se, B, C) +#elif (F == 5) +#define calCore(a0, a1, a2, a3, a4, a5, B, C) \ + { \ + C[0] += a0 * B; \ + C[1] += a1 * B; \ + C[2] += a2 * B; \ + C[3] += a3 * B; \ + C[4] += a4 * B; \ + C[5] += a5 * B; \ + } +#define calCore0(A, B, C) calCore(A.s0, A.s2, A.s4, A.s6, A.s8, A.sa, B, C) +#define calCore1(A, B, C) calCore(A.s1, A.s3, A.s5, A.s7, A.s9, A.sb, B, C) +#define calCore2(A, B, C) calCore(A.s2, A.s4, A.s6, A.s8, A.sa, A.sc, B, C) +#define calCore3(A, B, C) calCore(A.s3, A.s5, A.s7, A.s9, A.sb, A.sd, B, C) +#define calCore4(A, B, C) calCore(A.s4, A.s6, A.s8, A.sa, A.sc, A.se, B, C) +#elif (F == 7) +#define calCore(a0, a1, a2, a3, a4, B, C) \ + { \ + C[0] += a0 * B; \ + C[1] += a1 * B; \ + C[2] += a2 * B; \ + C[3] += a3 * B; \ + C[4] += a4 * B; \ + } +#define calCore0(A, B, C) calCore(A.s0, A.s2, A.s4, A.s6, A.s8, B, C) +#define calCore1(A, B, C) calCore(A.s1, A.s3, A.s5, A.s7, A.s9, B, C) +#define calCore2(A, B, C) calCore(A.s2, A.s4, A.s6, A.s8, A.sa, B, C) +#define calCore3(A, B, C) calCore(A.s3, A.s5, A.s7, A.s9, A.sb, B, C) +#define calCore4(A, B, C) calCore(A.s4, A.s6, A.s8, A.sa, A.sc, B, C) +#define calCore5(A, B, C) calCore(A.s5, A.s7, A.s9, A.sb, A.sd, B, C) +#define calCore6(A, B, C) calCore(A.s6, A.s8, A.sa, A.sc, A.se, B, C) +#endif + +#if defined(USE_RELU) +__kernel void MANGLE_NAME(conv_direct_s2_nchw_to_ncwhc4_relu_, F, ON) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_direct_s2_nchw_to_ncwhc4_relu6_, F, ON) +#else +__kernel void MANGLE_NAME(conv_direct_s2_nchw_to_ncwhc4_, F, ON) +#endif + (const int iw_str, + const int iwh_str, + const int ic_str, + const int iw_off, + const int ih_off, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const int ow, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __read_only image1d_t bias, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } + + T16 in_val; + T4 flt_val; + T4 out_val[ON]; + + LOADBIAS_IMAGE_ARRAY_V4(out_val, idz, bias); + int in_off = ((idy << 1) + ih_off) * iw_str + (idx << 1) * ON + iw_off; + int flt_off = idz * ic_str * Fsq; + + for (int i = 0; i < ic_str; ++i) { +#if (F == 1) + flt_val = vload4(flt_off, flt); + in_val = vload16(0, in + in_off); + calCore(in_val, flt_val, out_val); + flt_off++; +#else + for (uchar j = 0; j < F; ++j) { + in_val = vload16(0, in + in_off + j * iw_str); + for (uchar k = 0; k < F; ++k) { + flt_val = vload4(flt_off + k, flt); + if (k == 0) { + calCore0(in_val, flt_val, out_val); + } + if (k == 1) { + calCore1(in_val, flt_val, out_val); + } + if (k == 2) { + calCore2(in_val, 
flt_val, out_val);
+                }
+#if (F == 5)
+                if (k == 3) {
+                    calCore3(in_val, flt_val, out_val);
+                }
+                if (k == 4) {
+                    calCore4(in_val, flt_val, out_val);
+                }
+#endif
+#if (F == 7)
+                if (k == 3) {
+                    calCore3(in_val, flt_val, out_val);
+                }
+                if (k == 4) {
+                    calCore4(in_val, flt_val, out_val);
+                }
+                if (k == 5) {
+                    calCore5(in_val, flt_val, out_val);
+                }
+                if (k == 6) {
+                    calCore6(in_val, flt_val, out_val);
+                }
+#endif
+            }
+            flt_off += F;
+        }
+#endif
+        in_off += iwh_str;
+    }
+
+    int xn = idx * ON;
+    int out_off = (idz * ow_str + xn + ow_off) * oh_str + idy + oh_off;
+    STORE_OUTPUT_BUF_ARRAY_V4(out_val, out_off, oh_str, xn, ow, out);
+}
diff --git a/compute/tensor/src/gpu/mali/cl/conv_direct_spe_fwhs1.cl b/compute/tensor/src/gpu/mali/cl/conv_direct_spe_fwhs1.cl
new file mode 100644
index 00000000..21985ac8
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/cl/conv_direct_spe_fwhs1.cl
@@ -0,0 +1,134 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
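+
+// Specialized direct convolution for the fwh = 1, stride 1 case (as the
+// kernel name appears to encode): each work-item reduces one output
+// channel to a single value, accumulating a dot product over the input
+// channels with OC elements per vector load. The NO_BIAS variants start
+// the accumulator at zero instead of reading the bias buffer.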
+ +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, OC) base##OC +#define MANGLE_NAME(base, OC) MANGLE_NAME_IMPL(base, OC) + +#if (OC == 1) +#define calCore(ov, i_off, f_off, in, flt) \ + { \ + T iv = in[i_off]; \ + T fv = flt[f_off]; \ + ov += iv * fv; \ + } +#endif + +#if (OC == 2) +#define calCore(ov, i_off, f_off, in, flt) \ + { \ + T2 iv = vload2(i_off, in); \ + T2 fv = vload2(f_off, flt); \ + ov += iv.x * fv.x + iv.y * fv.y; \ + } +#endif + +#if (OC == 3) +#define calCore(ov, i_off, f_off, in, flt) \ + { \ + T3 iv = vload3(i_off, in); \ + T3 fv = vload3(f_off, flt); \ + ov += iv.x * fv.x + iv.y * fv.y + iv.z * fv.z; \ + } +#endif + +#if (OC == 4) +#define calCore(ov, i_off, f_off, in, flt) \ + { \ + T4 iv = vload4(i_off, in); \ + T4 fv = vload4(f_off, flt); \ + DOT_A4B4C1(iv, fv, ov); \ + } +#endif + +#if (OC == 8) +#define calCore(ov, i_off, f_off, in, flt) \ + { \ + T8 iv = vload8(i_off, in); \ + T8 fv = vload8(f_off, flt); \ + DOT_A4B4C1(iv.s0123, fv.s0123, ov); \ + DOT_A4B4C1(iv.s4567, fv.s4567, ov); \ + } +#endif + +#if (OC == 16) +#define calCore(ov, i_off, f_off, in, flt) \ + { \ + T16 iv = vload16(i_off, in); \ + T16 fv = vload16(f_off, flt); \ + DOT_A4B4C1(iv.s0123, fv.s0123, ov); \ + DOT_A4B4C1(iv.s4567, fv.s4567, ov); \ + DOT_A4B4C1(iv.s89ab, fv.s89ab, ov); \ + DOT_A4B4C1(iv.scdef, fv.scdef, ov); \ + } +#endif + +#if defined(USE_RELU) +#if defined(NO_BIAS) +__kernel MANGLE_NAME(void conv_direct_spe_fwhs1_nobias_relu_, OC) +#else +__kernel MANGLE_NAME(void conv_direct_spe_fwhs1_relu_, OC) +#endif +#elif defined(USE_RELU6) +#if defined(NO_BIAS) +__kernel MANGLE_NAME(void conv_direct_spe_fwhs1_nobias_relu6_, OC) +#else +__kernel MANGLE_NAME(void conv_direct_spe_fwhs1_relu6_, OC) +#endif +#else +#if defined(NO_BIAS) +__kernel MANGLE_NAME(void conv_direct_spe_fwhs1_nobias_, OC) +#else +__kernel MANGLE_NAME(void conv_direct_spe_fwhs1_, OC) +#endif +#endif + (const int ih_str, + const int ihw_str, + const int ic_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const int flt_str, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __global const T *bias, + __global T *out) +{ + const int idx = get_global_id(0); + if (idx >= bx) { + return; + } +#if defined(NO_BIAS) + T out_val = 0; +#else + T out_val = bias[idx]; +#endif + int in_off = iw_off * ih_str + ih_off; + int flt_off = idx; + for (int i = 0; i < ic_str; ++i) { + calCore(out_val, in_off, flt_off, in, flt); + in_off += ihw_str; + flt_off += flt_str; + } + + ACTIVATION_V1(out_val); + const int ox = idx >> 2; + const int oy = idx & 3; + int out_off = (ox * ow_str + ow_off) * oh_str + oh_off; + out[out_off * 4 + oy] = out_val; +} diff --git a/compute/tensor/src/gpu/mali/cl/conv_direct_trans_fltbuf.cl b/compute/tensor/src/gpu/mali/cl/conv_direct_trans_fltbuf.cl new file mode 100644 index 00000000..6a819e48 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_direct_trans_fltbuf.cl @@ -0,0 +1,207 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#define MANGLE_NAME_IMPL(base, C, K) base##C##K +#define MANGLE_NAME(base, C, K) MANGLE_NAME_IMPL(base, C, K) +#if (C == 1) +#define loadFltval(off, str, flt, val) \ + { \ + val = flt[off]; \ + } + +#define loadFltvalEdge(off, str, flt, val, edge) \ + {} +#endif + +#if (C == 2) +#define loadFltval(off, str, flt, val) \ + { \ + val.x = flt[off]; \ + val.y = flt[off + str]; \ + } + +#define loadFltvalEdge(off, str, flt, val, edge) \ + { \ + val.x = flt[off]; \ + } +#endif + +#if (C == 3) +#define loadFltval(off, str, flt, val) \ + { \ + val.x = flt[off]; \ + val.y = flt[off + str]; \ + val.z = flt[off + str * 2]; \ + } + +#define loadFltvalEdge(off, str, flt, val, edge) \ + { \ + val.x = flt[off]; \ + if (edge > 1) \ + val.y = flt[off + str]; \ + } +#endif + +#if (C == 4) +#define loadFltval(off, str, flt, val) \ + { \ + val.x = flt[off]; \ + val.y = flt[off + str]; \ + val.z = flt[off + str * 2]; \ + val.w = flt[off + str * 3]; \ + } + +#define loadFltvalEdge(off, str, flt, val, edge) \ + { \ + val.x = flt[off]; \ + if (edge > 1) \ + val.y = flt[off + str]; \ + if (edge > 2) \ + val.z = flt[off + str * 2]; \ + } +#endif + +#if (C == 8) +#define loadFltval(off, str, flt, val) \ + { \ + val.s0 = flt[off]; \ + val.s1 = flt[off + str]; \ + val.s2 = flt[off + str * 2]; \ + val.s3 = flt[off + str * 3]; \ + val.s4 = flt[off + str * 4]; \ + val.s5 = flt[off + str * 5]; \ + val.s6 = flt[off + str * 6]; \ + val.s7 = flt[off + str * 7]; \ + } +#define loadFltvalEdge(off, str, flt, val, edge) \ + { \ + val.s0 = flt[off]; \ + if (edge > 1) \ + val.s1 = flt[off + str]; \ + if (edge > 2) \ + val.s2 = flt[off + str * 2]; \ + if (edge > 3) \ + val.s3 = flt[off + str * 3]; \ + if (edge > 4) \ + val.s4 = flt[off + str * 4]; \ + if (edge > 5) \ + val.s5 = flt[off + str * 5]; \ + if (edge > 6) \ + val.s6 = flt[off + str * 6]; \ + } +#endif + +#if (C == 16) +#define loadFltval(off, str, flt, val) \ + { \ + val.s0 = flt[off]; \ + val.s1 = flt[off + str]; \ + val.s2 = flt[off + str * 2]; \ + val.s3 = flt[off + str * 3]; \ + val.s4 = flt[off + str * 4]; \ + val.s5 = flt[off + str * 5]; \ + val.s6 = flt[off + str * 6]; \ + val.s7 = flt[off + str * 7]; \ + val.s8 = flt[off + str * 8]; \ + val.s9 = flt[off + str * 9]; \ + val.sa = flt[off + str * 10]; \ + val.sb = flt[off + str * 11]; \ + val.sc = flt[off + str * 12]; \ + val.sd = flt[off + str * 13]; \ + val.se = flt[off + str * 14]; \ + val.sf = flt[off + str * 15]; \ + } +#define loadFltvalEdge(off, str, flt, val, edge) \ + 
{ \ + val.s0 = flt[off]; \ + if (edge > 1) \ + val.s1 = flt[off + str]; \ + if (edge > 2) \ + val.s2 = flt[off + str * 2]; \ + if (edge > 3) \ + val.s3 = flt[off + str * 3]; \ + if (edge > 4) \ + val.s4 = flt[off + str * 4]; \ + if (edge > 5) \ + val.s5 = flt[off + str * 5]; \ + if (edge > 6) \ + val.s6 = flt[off + str * 6]; \ + if (edge > 7) \ + val.s7 = flt[off + str * 7]; \ + if (edge > 8) \ + val.s8 = flt[off + str * 8]; \ + if (edge > 9) \ + val.s9 = flt[off + str * 9]; \ + if (edge > 10) \ + val.sa = flt[off + str * 10]; \ + if (edge > 11) \ + val.sb = flt[off + str * 11]; \ + if (edge > 12) \ + val.sc = flt[off + str * 12]; \ + if (edge > 13) \ + val.sd = flt[off + str * 13]; \ + if (edge > 14) \ + val.se = flt[off + str * 14]; \ + } +#endif + +__kernel void MANGLE_NAME(conv_direct_trans_fltbuf_, C, K)( + const int fwh, const int fc, const int fn, __global const T *fltdata, __global T *fltbuf) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + short ec = ((idy + 1) * C <= fc) ? C : (fc % C); + + const int flt_off = (idz * fc + idy * C) * fwh + idx; +#if (C == 1) + T val = 0; +#elif (C == 2) + T2 val = 0; +#elif (C == 3) + T3 val = 0; +#elif (C == 4) + T4 val = 0; +#elif (C == 8) + T8 val = 0; +#elif (C == 16) + T16 val = 0; +#endif + if (idz < fn) { + if (ec == C) { + loadFltval(flt_off, fwh, fltdata, val); + } else { + loadFltvalEdge(flt_off, fwh, fltdata, val, ec); + } + } + const int bc = (fc + C - 1) / C; + int out_off; +#if (K == 0) + out_off = (idy * fwh + idx) * fn + idz; +#else + out_off = (idz / K * bc + idy) * fwh * K + idx * K + (idz % K); +#endif +#if (C == 1) + fltbuf[out_off] = val; +#elif (C == 2) + vstore2(val, out_off, fltbuf); +#elif (C == 3) + vstore3(val, out_off, fltbuf); +#elif (C == 4) + vstore4(val, out_off, fltbuf); +#elif (C == 8) + vstore8(val, out_off, fltbuf); +#elif (C == 16) + vstore16(val, out_off, fltbuf); +#endif +} diff --git a/tensor_computing/src/gpu/mali/cl/conv_direct_s1.cl b/compute/tensor/src/gpu/mali/cl/conv_direct_wh_s1.cl similarity index 65% rename from tensor_computing/src/gpu/mali/cl/conv_direct_s1.cl rename to compute/tensor/src/gpu/mali/cl/conv_direct_wh_s1.cl index d830f8eb..62df1ca7 100644 --- a/tensor_computing/src/gpu/mali/cl/conv_direct_s1.cl +++ b/compute/tensor/src/gpu/mali/cl/conv_direct_wh_s1.cl @@ -11,75 +11,70 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
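With the rename to conv_direct_wh_s1 in the hunk that follows, the specialization constants change from a single square filter size F to a separate width W and height H, and the kernel name is produced by token pasting. An illustrative expansion (the numeric values are examples only; the host build passes them via -D options):

#define MANGLE_NAME_IMPL(base, W, H, ON, KN) base##W##H##ON##KN
#define MANGLE_NAME(base, W, H, ON, KN) MANGLE_NAME_IMPL(base, W, H, ON, KN)
// With W=3, H=1, ON=4, KN=2 and USE_RELU defined, the declaration
//   __kernel void MANGLE_NAME(conv_direct_wh_s1_relu_, W, H, ON, KN)
// expands to
//   __kernel void conv_direct_wh_s1_relu_3142
// so every (W, H, ON, KN) combination gets its own compiled kernel.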
- - - - #include "kernel_def.h" -#define MANGLE_NAME_IMPL(base, F, ON, KN) base ## F ## ON ## KN -#define MANGLE_NAME(base, F, ON, KN) MANGLE_NAME_IMPL(base, F, ON, KN) - - +#define MANGLE_NAME_IMPL(base, W, H, ON, KN) base##W##H##ON##KN +#define MANGLE_NAME(base, W, H, ON, KN) MANGLE_NAME_IMPL(base, W, H, ON, KN) #if defined(USE_RELU) -__kernel void MANGLE_NAME(conv_direct_s1_relu_, F, ON, KN) +__kernel void MANGLE_NAME(conv_direct_wh_s1_relu_, W, H, ON, KN) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_direct_wh_s1_relu6_, W, H, ON, KN) #else -__kernel void MANGLE_NAME(conv_direct_s1_, F, ON, KN) +__kernel void MANGLE_NAME(conv_direct_wh_s1_, W, H, ON, KN) #endif -(const int ih_str, const int ihw_str, const int ic_str, const int ih_off, const int iw_off, const int oh_str, const int ohw_str, const int oh_off, const int ow_off, -const int ow, const int bx, const int by, __global const T* in, __global const T* flt, __read_only image1d_t bias, __global T* out) + (const int ih_str, + const int ihw_str, + const int ic_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ohw_str, + const int oh_off, + const int ow_off, + const int ow, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __read_only image1d_t bias, + __global T *out) { - const int idx = get_global_id(0); const int idy = get_global_id(1); const int idz = get_global_id(2); - if(idx >= bx || idy >= by) return; + if (idx >= bx || idy >= by) { + return; + } - T4 in_val[IN]; + T4 in_val[IN]; T16 flt_val; - T4 out_val[KN][ON]; + T4 out_val[KN][ON]; LOADBIAS_IMAGE_ARRAY_V4(out_val[0], idz * KN, bias); -#if(KN > 1) +#if (KN > 1) LOADBIAS_IMAGE_ARRAY_V4(out_val[1], idz * KN + 1, bias); #endif -#if(KN > 2) +#if (KN > 2) LOADBIAS_IMAGE_ARRAY_V4(out_val[2], idz * KN + 2, bias); LOADBIAS_IMAGE_ARRAY_V4(out_val[3], idz * KN + 3, bias); -#endif +#endif - int in_off = (idy * ON + iw_off) * ih_str + idx + ih_off; + int in_off = (idy * ON + iw_off) * ih_str + idx + ih_off; int flt_off = idz * ic_str * Fsq * KN; - for(int i = 0; i < ic_str; ++i) { -#if(F == 1) - LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off, ih_str, in); - flt_val = vload16(flt_off, flt); - DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[0]); -#if(KN > 1) - flt_val = vload16(flt_off + 1, flt); - DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[1]); -#endif -#if(KN > 2) - flt_val = vload16(flt_off + 2, flt); - DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[2]); - flt_val = vload16(flt_off + 3, flt); - DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[3]); -#endif - flt_off += KN; -#else - for(uchar j = 0; j < F; ++j) { + for (int i = 0; i < ic_str; ++i) { + for (uchar j = 0; j < H; ++j) { LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + j, ih_str, in); - for(uchar k = 0; k < F; ++k) { + for (uchar k = 0; k < W; ++k) { #if defined(BASICE_REG) in_val[LN] = vload4(in_off + j + (LN + k) * ih_str, in); #endif flt_val = vload16(flt_off + k * KN, flt); DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[0]); -#if(KN > 1) +#if (KN > 1) flt_val = vload16(flt_off + k * KN + 1, flt); DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[1]); -#endif -#if(KN > 2) +#endif +#if (KN > 2) flt_val = vload16(flt_off + k * KN + 2, flt); DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[2]); flt_val = vload16(flt_off + k * KN + 3, flt); @@ -87,19 +82,18 @@ const int ow, const int bx, const int by, __global const T* in, __global const T #endif UPDATE_REG(in_val); } - flt_off += F * KN; + flt_off += W * KN; } -#endif in_off += ihw_str; } int out_off = idz * KN 
* ohw_str + (idy * ON + ow_off) * oh_str + idx + oh_off; STORE_OUTPUT_BUF_ARRAY_V4(out_val[0], out_off, oh_str, idy * ON, ow, out); -#if(KN > 1) +#if (KN > 1) out_off += ohw_str; STORE_OUTPUT_BUF_ARRAY_V4(out_val[1], out_off, oh_str, idy * ON, ow, out); #endif -#if(KN > 2) +#if (KN > 2) out_off += ohw_str; STORE_OUTPUT_BUF_ARRAY_V4(out_val[2], out_off, oh_str, idy * ON, ow, out); out_off += ohw_str; diff --git a/compute/tensor/src/gpu/mali/cl/conv_direct_wh_s2.cl b/compute/tensor/src/gpu/mali/cl/conv_direct_wh_s2.cl new file mode 100644 index 00000000..2da47d26 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_direct_wh_s2.cl @@ -0,0 +1,136 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
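The stride-2 kernel below has two code paths: the default path loads consecutive stride-1 positions and lets DIRECT_CONV_CAL_CORE_S2 pick every second one, while the BASIC_REG path (apparently aimed at tighter register budgets) walks the even-indexed and odd-indexed filter taps in two passes so that each pass reuses loads taken with stride 2*ih_str. A scalar, one-dimensional sketch of that even/odd split, with illustrative names:

// Stride-2 convolution with filter taps split by parity, as in the
// BASIC_REG path below (sketch only: one row, no channel loop).
void ref_wh_s2_row(const float *in, const float *flt, float *out, int W, int ON)
{
    for (int o = 0; o < ON; ++o) {
        float acc = 0.0f;
        for (int k = 0; k < W; k += 2) {  // even taps: even input phase
            acc += in[2 * o + k] * flt[k];
        }
        for (int k = 1; k < W; k += 2) {  // odd taps: odd input phase
            acc += in[2 * o + k] * flt[k];
        }
        out[o] = acc;
    }
}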
+ +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, W, H, ON, KN) base##W##H##ON##KN +#define MANGLE_NAME(base, W, H, ON, KN) MANGLE_NAME_IMPL(base, W, H, ON, KN) + +#if defined(USE_RELU) +__kernel void MANGLE_NAME(conv_direct_wh_s2_relu_, W, H, ON, KN) +#elif defined(USE_RELU6) +__kernel void MANGLE_NAME(conv_direct_wh_s2_relu6_, W, H, ON, KN) +#else +__kernel void MANGLE_NAME(conv_direct_wh_s2_, W, H, ON, KN) +#endif + (const int ih_str, + const int ihw_str, + const int ic_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ohw_str, + const int oh_off, + const int ow_off, + const int ow, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __read_only image1d_t bias, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } + + T4 in_val[IN]; + T16 flt_val; + T4 out_val[KN][ON]; + LOADBIAS_IMAGE_ARRAY_V4(out_val[0], idz * KN, bias); +#if (KN > 1) + LOADBIAS_IMAGE_ARRAY_V4(out_val[1], idz * KN + 1, bias); +#endif +#if (KN > 2) + LOADBIAS_IMAGE_ARRAY_V4(out_val[2], idz * KN + 2, bias); + LOADBIAS_IMAGE_ARRAY_V4(out_val[3], idz * KN + 3, bias); +#endif + + int in_off = ((idy << 1) * ON + iw_off) * ih_str + (idx << 1) + ih_off; + int flt_off = idz * ic_str * Fsq * KN; + + for (int i = 0; i < ic_str; ++i) { + for (uchar j = 0; j < H; ++j) { +#if defined(BASIC_REG) + LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + j, (ih_str << 1), in); + for (uchar k = 0; k < W; k += 2) { + in_val[LN] = vload4(in_off + j + ((LN << 1) + k) * ih_str, in); + flt_val = vload16(flt_off + k * KN, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[0]); +#if (KN > 1) + flt_val = vload16(flt_off + k * KN + 1, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[1]); +#endif +#if (KN > 2) + flt_val = vload16(flt_off + k * KN + 2, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[2]); + flt_val = vload16(flt_off + k * KN + 3, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[3]); +#endif + UPDATE_REG(in_val); + } + LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + j + ih_str, (ih_str << 1), in); + for (uchar k = 1; k < W; k += 2) { + in_val[LN] = vload4(in_off + j + ((LN << 1) + k) * ih_str, in); + flt_val = vload16(flt_off + k * KN, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[0]); +#if (KN > 1) + flt_val = vload16(flt_off + k * KN + 1, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[1]); +#endif +#if (KN > 2) + flt_val = vload16(flt_off + k * KN + 2, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[2]); + flt_val = vload16(flt_off + k * KN + 3, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[3]); +#endif + UPDATE_REG(in_val); + } +#else + LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off + j, ih_str, in); + for (uchar k = 0; k < W; ++k) { + flt_val = vload16(flt_off + k * KN, flt); + DIRECT_CONV_CAL_CORE_S2(in_val, flt_val, out_val[0]); +#if (KN > 1) + flt_val = vload16(flt_off + k * KN + 1, flt); + DIRECT_CONV_CAL_CORE_S2(in_val, flt_val, out_val[1]); +#endif +#if (KN > 2) + flt_val = vload16(flt_off + k * KN + 2, flt); + DIRECT_CONV_CAL_CORE_S2(in_val, flt_val, out_val[2]); + flt_val = vload16(flt_off + k * KN + 3, flt); + DIRECT_CONV_CAL_CORE_S2(in_val, flt_val, out_val[3]); +#endif + UPDATE_REG(in_val); + } +#endif + flt_off += W * KN; + } + in_off += ihw_str; + } + + int out_off = idz * KN * ohw_str + (idy * ON + ow_off) * oh_str + idx + oh_off; + STORE_OUTPUT_BUF_ARRAY_V4(out_val[0], out_off, oh_str,
idy * ON, ow, out); +#if (KN > 1) + out_off += ohw_str; + STORE_OUTPUT_BUF_ARRAY_V4(out_val[1], out_off, oh_str, idy * ON, ow, out); +#endif +#if (KN > 2) + out_off += ohw_str; + STORE_OUTPUT_BUF_ARRAY_V4(out_val[2], out_off, oh_str, idy * ON, ow, out); + out_off += ohw_str; + STORE_OUTPUT_BUF_ARRAY_V4(out_val[3], out_off, oh_str, idy * ON, ow, out); +#endif +} diff --git a/tensor_computing/src/gpu/mali/cl/conv_wino_gemm36_tn.cl b/compute/tensor/src/gpu/mali/cl/conv_wino_gemm36_tn.cl similarity index 74% rename from tensor_computing/src/gpu/mali/cl/conv_wino_gemm36_tn.cl rename to compute/tensor/src/gpu/mali/cl/conv_wino_gemm36_tn.cl index d1419203..aa24e28d 100644 --- a/tensor_computing/src/gpu/mali/cl/conv_wino_gemm36_tn.cl +++ b/compute/tensor/src/gpu/mali/cl/conv_wino_gemm36_tn.cl @@ -11,23 +11,29 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - -#include"kernel_def.h" -#define MANGLE_NAME_LMPL(base, LM, LN) base ## LM ## LN +#include "kernel_def.h" +#define MANGLE_NAME_LMPL(base, LM, LN) base##LM##LN #define MANGLE_NAME(base, LM, LN) MANGLE_NAME_LMPL(base, LM, LN) - -__kernel void MANGLE_NAME(conv_wino_gemm36_tn_, LM, LN) -(int M, int N, int K, int a_str, int b_str, int c_str, const int bx, const int by, __global const T* A, __global const T* B, global T* C) +__kernel void MANGLE_NAME(conv_wino_gemm36_tn_, LM, LN)(int M, + int N, + int K, + int a_str, + int b_str, + int c_str, + const int bx, + const int by, + __global const T *A, + __global const T *B, + global T *C) { - const int idx = get_global_id(0); + const int idx = get_global_id(0); const int idy = get_global_id(1); - if(idx >= bx || idy >= by) return; - const int ix = idx * LN; - const int iy = idy * LM; + if (idx >= bx || idy >= by) { + return; + } + const int ix = idx * LN; + const int iy = idy * LM; T a[LM]; T b[LN]; @@ -36,15 +42,15 @@ __kernel void MANGLE_NAME(conv_wino_gemm36_tn_, LM, LN) int a_off = iy + a_str; int b_off = ix + b_str; - for(int i = 0; i < K; i++) { + for (int i = 0; i < K; i++) { GEMM_LOAD_A(a, a_off, A); GEMM_LOAD_B(b, b_off, B); GEMM_CALCORE(a, b, c); a_off += M; b_off += N; } - + int c_off = iy * N + ix + c_str; - GEMM_MUL_C((float)(0.1111111111), c); + GEMM_MUL_C((float)(0.1111111111), 0, c); GEMM_STORE_C(c, c_off, N, C); } diff --git a/tensor_computing/src/gpu/mali/cl/conv_wino_rotate_fltbuf.cl b/compute/tensor/src/gpu/mali/cl/conv_wino_rotate_fltbuf.cl similarity index 87% rename from tensor_computing/src/gpu/mali/cl/conv_wino_rotate_fltbuf.cl rename to compute/tensor/src/gpu/mali/cl/conv_wino_rotate_fltbuf.cl index 8149bd3b..68dc362b 100644 --- a/tensor_computing/src/gpu/mali/cl/conv_wino_rotate_fltbuf.cl +++ b/compute/tensor/src/gpu/mali/cl/conv_wino_rotate_fltbuf.cl @@ -11,23 +11,21 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
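In the Winograd F(4x4, 3x3) pipeline, the 6x6 transformed tiles yield 36 independent matrix multiplies, which conv_wino_gemm36_tn above executes as a TN GEMM (A stored K x M, B stored K x N, per the a_off += M / b_off += N stepping). The constant handed to GEMM_MUL_C is 1/9, which appears to compensate scale factors folded into the filter and picture transforms (note the factor-3 rows in conv_wino_trans_fltbuf_3x3 further below). A scalar reference of one such GEMM, with illustrative names:

// One of the 36 Winograd-domain GEMMs in scalar form (sketch only).
void ref_gemm36_tn(const float *A, const float *B, float *C, int M, int N, int K)
{
    for (int m = 0; m < M; ++m) {
        for (int n = 0; n < N; ++n) {
            float acc = 0.0f;
            for (int k = 0; k < K; ++k) {
                acc += A[k * M + m] * B[k * N + n];  // TN: A is K x M
            }
            C[m * N + n] = acc * (1.0f / 9.0f);  // matches (float)(0.1111111111)
        }
    }
}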
- - - - -#define MANGLE_NAME_IMPL(base, F) base ## F +#define MANGLE_NAME_IMPL(base, F) base##F #define MANGLE_NAME(base, F) MANGLE_NAME_IMPL(base, F) -__kernel void MANGLE_NAME(conv_wino_rotate_fltbuf_, F)(const int fwhc, const int fnc, const int fn, __global const T* fltdata, __global T* fltbuf) { +__kernel void MANGLE_NAME(conv_wino_rotate_fltbuf_, F)( + const int fwhc, const int fnc, const int fn, __global const T *fltdata, __global T *fltbuf) +{ const int idx = get_global_id(0); const int idy = get_global_id(1); T val = 0; - if(idy < fn) { + if (idy < fn) { const int in_off = idy * fwhc + idx; val = fltdata[in_off]; } - + const int ox = idy; const int oy = idx / Fsq; const int oz = idx % Fsq; diff --git a/compute/tensor/src/gpu/mali/cl/conv_wino_trans_fltbuf_3x3.cl b/compute/tensor/src/gpu/mali/cl/conv_wino_trans_fltbuf_3x3.cl new file mode 100644 index 00000000..79ab36a8 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_wino_trans_fltbuf_3x3.cl @@ -0,0 +1,124 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
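conv_wino_trans_fltbuf_3x3, added below, expands every 3x3 filter into a 6x6 Winograd-domain tile by applying the same 6x3 matrix on both sides. Reading the constants off the calCore and mulReg6 macros gives G = [[3/4, 0, 0], [-1/2, -1/2, -1/2], [-1/2, 1/2, -1/2], [1/8, 1/4, 1/2], [1/8, -1/4, 1/2], [0, 0, 3]]. A scalar sketch of the transform h = G * g * G^T, for verification only and not part of the patch:

// Expand a 3x3 filter g to the 6x6 tile h = G * g * G^T, with G as read
// off the macros in the kernel below.
void ref_wino_flt_3x3(const float g[3][3], float h[6][6])
{
    static const float G[6][3] = {{0.75f, 0.0f, 0.0f}, {-0.5f, -0.5f, -0.5f},
        {-0.5f, 0.5f, -0.5f}, {0.125f, 0.25f, 0.5f}, {0.125f, -0.25f, 0.5f},
        {0.0f, 0.0f, 3.0f}};
    float t[6][3];  // t = G * g
    for (int i = 0; i < 6; ++i) {
        for (int j = 0; j < 3; ++j) {
            t[i][j] = G[i][0] * g[0][j] + G[i][1] * g[1][j] + G[i][2] * g[2][j];
        }
    }
    for (int i = 0; i < 6; ++i) {  // h = t * G^T
        for (int j = 0; j < 6; ++j) {
            h[i][j] = t[i][0] * G[j][0] + t[i][1] * G[j][1] + t[i][2] * G[j][2];
        }
    }
}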
+ +#define loadG(val, str, off, flt) \ + { \ + val[0] = flt[off]; \ + val[1] = flt[off + str]; \ + val[2] = flt[off + str * 2]; \ + } + +#define setReg6(reg0, reg1) \ + { \ + reg1[0] = reg0[0]; \ + reg1[1] = reg0[1]; \ + reg1[2] = reg0[2]; \ + reg1[3] = reg0[3]; \ + reg1[4] = reg0[4]; \ + reg1[5] = reg0[5]; \ + } + +#define addReg6(reg0, reg1) \ + { \ + reg1[0] += reg0[0]; \ + reg1[1] += reg0[1]; \ + reg1[2] += reg0[2]; \ + reg1[3] += reg0[3]; \ + reg1[4] += reg0[4]; \ + reg1[5] += reg0[5]; \ + } + +#define minReg6(reg0, reg1) \ + { \ + reg1[0] -= reg0[0]; \ + reg1[1] -= reg0[1]; \ + reg1[2] -= reg0[2]; \ + reg1[3] -= reg0[3]; \ + reg1[4] -= reg0[4]; \ + reg1[5] -= reg0[5]; \ + } + +#define mulReg6(s, reg0, reg1) \ + { \ + reg1[0] = s * reg0[0]; \ + reg1[1] = s * reg0[1]; \ + reg1[2] = s * reg0[2]; \ + reg1[3] = s * reg0[3]; \ + reg1[4] = s * reg0[4]; \ + reg1[5] = s * reg0[5]; \ + } + +#define calCore(g, t) \ + { \ + t[0] = (T)(0.75) * g[0]; \ + t[1] = (g[0] + g[1] + g[2]) * (T)(-0.5); \ + t[2] = (g[0] - g[1] + g[2]) * (T)(-0.5); \ + t[3] = ((T)(0.125) * g[0] + (T)(0.25) * g[1] + (T)(0.5) * g[2]); \ + t[4] = ((T)(0.125) * g[0] - (T)(0.25) * g[1] + (T)(0.5) * g[2]); \ + t[5] = (T)(3.0) * g[2]; \ + } + +#define storeReg6(reg, off, str, flt) \ + { \ + flt[off] = reg[0]; \ + flt[off + str] = reg[1]; \ + flt[off + str * 2] = reg[2]; \ + flt[off + str * 3] = reg[3]; \ + flt[off + str * 4] = reg[4]; \ + flt[off + str * 5] = reg[5]; \ + } + +__kernel void conv_wino_trans_fltbuf_3x3( + const int fn, const int fc, const int fnc, __global const T *fltbuf, __global T *flttran) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int in_off = idy * fn + idx; + + T g[3]; + T h0[6], h1[6], h2[6], h3[6], h4[6], h5[6], t[6], tmp[6]; + loadG(g, fnc, in_off, fltbuf); + calCore(g, tmp); + mulReg6((T)(0.75), tmp, h0); + mulReg6((T)(-0.5), tmp, t); + setReg6(t, h1); + setReg6(t, h2); + mulReg6((T)(0.125), tmp, t); + setReg6(t, h3); + setReg6(t, h4); + + loadG(g, fnc, in_off + 3 * fnc, fltbuf); + calCore(g, tmp); + mulReg6((T)(0.5), tmp, t); + minReg6(t, h1); + addReg6(t, h2); + mulReg6((T)(0.25), tmp, t); + addReg6(t, h3); + minReg6(t, h4); + + loadG(g, fnc, in_off + 6 * fnc, fltbuf); + calCore(g, tmp); + mulReg6((T)(0.5), tmp, t); + minReg6(t, h1); + minReg6(t, h2); + addReg6(t, h3); + addReg6(t, h4); + mulReg6((T)(3.0), tmp, h5); + + storeReg6(h0, in_off, fnc, flttran); + storeReg6(h1, in_off + 6 * fnc, fnc, flttran); + storeReg6(h2, in_off + 12 * fnc, fnc, flttran); + storeReg6(h3, in_off + 18 * fnc, fnc, flttran); + storeReg6(h4, in_off + 24 * fnc, fnc, flttran); + storeReg6(h5, in_off + 30 * fnc, fnc, flttran); +} diff --git a/compute/tensor/src/gpu/mali/cl/conv_wino_trans_outbuf.cl b/compute/tensor/src/gpu/mali/cl/conv_wino_trans_outbuf.cl new file mode 100644 index 00000000..ac5aa2cf --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_wino_trans_outbuf.cl @@ -0,0 +1,262 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "kernel_def.h" +#define loadR(val, str, off, in) \ + { \ + val[0] = in[off]; \ + val[1] = in[off + str]; \ + val[2] = in[off + str * 2]; \ + val[3] = in[off + str * 3]; \ + val[4] = in[off + str * 4]; \ + val[5] = in[off + str * 5]; \ + } + +#define calCore(s, t, tmp) \ + { \ + t.x = s[1] + s[2]; \ + t.y = s[3] + s[4]; \ + t.z = s[1] - s[2]; \ + t.w = s[3] - s[4]; \ + tmp[0] = s[0] + t.x + t.y; \ + tmp[1] = t.z + (T)(2.0) * t.w; \ + tmp[2] = t.x + (T)(4.0) * t.y; \ + tmp[3] = t.z + (T)(8.0) * t.w + s[5]; \ + } + +#if defined(ALIGN) +#if defined(USE_RELU) +__kernel void conv_wino_trans_outbuf_relu_align +#else +__kernel void conv_wino_trans_outbuf_align +#endif +#else +#if defined(USE_RELU) +__kernel void conv_wino_trans_outbuf_relu +#else +__kernel void conv_wino_trans_outbuf +#endif +#endif + (const int wino_h, + const int wino_w, + const int pw_str, + const int pwh_str, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const int oh, + const int ow, + __read_only image1d_t bias, + __global const T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= wino_h || idy >= wino_w) { + return; + } + + T4 r0, r1, r2, r3; + T4 r4, r5, r6, r7; + T4 r8, r9, ra, rb; + T4 rc, rd, re, rf; + T4 bias_v4 = READ_IMAGE(bias, sampler, idz); + + int in_off = (idz << 2) * pw_str + idy * wino_h + idx; + for (uchar ii = 0; ii < 4; ii++) { + r0 = r4; + r1 = r5; + r2 = r6; + r3 = r7; + + r4 = r8; + r5 = r9; + r6 = ra; + r7 = rb; + + r8 = rc; + r9 = rd; + ra = re; + rb = rf; + + T s[6]; + T4 t; + T bias_val; + if (ii == 0) { + bias_val = bias_v4.x; + } + if (ii == 1) { + bias_val = bias_v4.y; + } + if (ii == 2) { + bias_val = bias_v4.z; + } + if (ii == 3) { + bias_val = bias_v4.w; + } + + rd = (T4)bias_val; + re = (T4)bias_val; + for (uchar i = 0; i < 2; ++i) { + rc.x = rf.x; + rc.y = rf.y; + rc.z = rf.z; + rc.w = rf.w; + loadR(s, pwh_str, in_off + i * 30 * pwh_str, in); + for (uchar j = 0; j < 4; ++j) { + rf.x = rf.y; + rf.y = rf.z; + rf.z = rf.w; + rf.w = bias_val; + if (j == 0) { + rf.w += s[0] + s[1] + s[2] + s[3] + s[4]; + } + if (j == 1) { + rf.w += s[1] - s[2] + (T)2 * (s[3] - s[4]); + } + if (j == 2) { + rf.w += s[1] + s[2] + (T)4 * (s[3] + s[4]); + } + if (j == 3) { + rf.w += s[1] - s[2] + (T)8 * (s[3] - s[4]) + s[5]; + } + } + } + + for (uchar i = 0; i < 4; ++i) { + loadR(s, pwh_str, in_off + (i + 1) * 6 * pwh_str, in); + for 
(uchar j = 0; j < 4; ++j) { + t.x = t.y; + t.y = t.z; + t.z = t.w; + if (j == 0) { + t.w = s[0] + s[1] + s[2] + s[3] + s[4]; + } + if (j == 1) { + t.w = s[1] - s[2] + (T)2 * (s[3] - s[4]); + } + if (j == 2) { + t.w = s[1] + s[2] + (T)4 * (s[3] + s[4]); + } + if (j == 3) { + t.w = s[1] - s[2] + (T)8 * (s[3] - s[4]) + s[5]; + } + } + if (i == 0) { + rc += t; + rd += t; + re += t; + rf += t; + } + if (i == 1) { + rc += t; + rd -= t; + re += t; + rf -= t; + } + if (i == 2) { + rc += t; + rd += (T)2 * t; + re += (T)4 * t; + rf += (T)8 * t; + } + if (i == 3) { + rc += t; + rd -= (T)2 * t; + re += (T)4 * t; + rf -= (T)8 * t; + } + } + ACTIVATION_V4(rc); + ACTIVATION_V4(rd); + ACTIVATION_V4(re); + ACTIVATION_V4(rf); + in_off += pw_str; + } + + const int x_off = idx << 2; + const int y_off = idy << 2; + int out_off = (idz * ow_str + y_off + ow_off) * (oh_str << 2) + (x_off << 2) + (oh_off << 2); +#if defined(ALIGN) + vstore16((T16)(r0.x, r4.x, r8.x, rc.x, r1.x, r5.x, r9.x, rd.x, r2.x, r6.x, ra.x, re.x, r3.x, + r7.x, rb.x, rf.x), + 0, out + out_off); + out_off += (oh_str << 2); + vstore16((T16)(r0.y, r4.y, r8.y, rc.y, r1.y, r5.y, r9.y, rd.y, r2.y, r6.y, ra.y, re.y, r3.y, + r7.y, rb.y, rf.y), + 0, out + out_off); + out_off += (oh_str << 2); + vstore16((T16)(r0.z, r4.z, r8.z, rc.z, r1.z, r5.z, r9.z, rd.z, r2.z, r6.z, ra.z, re.z, r3.z, + r7.z, rb.z, rf.z), + 0, out + out_off); + out_off += (oh_str << 2); + vstore16((T16)(r0.w, r4.w, r8.w, rc.w, r1.w, r5.w, r9.w, rd.w, r2.w, r6.w, ra.w, re.w, r3.w, + r7.w, rb.w, rf.w), + 0, out + out_off); +#else + vstore4((T4)(r0.x, r4.x, r8.x, rc.x), 0, out + out_off); + if (x_off + 1 < oh) { + vstore4((T4)(r1.x, r5.x, r9.x, rd.x), 0, out + out_off + 4); + } + if (x_off + 2 < oh) { + vstore4((T4)(r2.x, r6.x, ra.x, re.x), 0, out + out_off + 8); + } + if (x_off + 3 < oh) { + vstore4((T4)(r3.x, r7.x, rb.x, rf.x), 0, out + out_off + 12); + } + + if (y_off + 1 < ow) { + out_off += (oh_str << 2); + vstore4((T4)(r0.y, r4.y, r8.y, rc.y), 0, out + out_off); + if (x_off + 1 < oh) { + vstore4((T4)(r1.y, r5.y, r9.y, rd.y), 0, out + out_off + 4); + } + if (x_off + 2 < oh) { + vstore4((T4)(r2.y, r6.y, ra.y, re.y), 0, out + out_off + 8); + } + if (x_off + 3 < oh) { + vstore4((T4)(r3.y, r7.y, rb.y, rf.y), 0, out + out_off + 12); + } + } + + if (y_off + 2 < ow) { + out_off += (oh_str << 2); + vstore4((T4)(r0.z, r4.z, r8.z, rc.z), 0, out + out_off); + if (x_off + 1 < oh) { + vstore4((T4)(r1.z, r5.z, r9.z, rd.z), 0, out + out_off + 4); + } + if (x_off + 2 < oh) { + vstore4((T4)(r2.z, r6.z, ra.z, re.z), 0, out + out_off + 8); + } + if (x_off + 3 < oh) { + vstore4((T4)(r3.z, r7.z, rb.z, rf.z), 0, out + out_off + 12); + } + } + + if (y_off + 3 < ow) { + out_off += (oh_str << 2); + vstore4((T4)(r0.w, r4.w, r8.w, rc.w), 0, out + out_off); + if (x_off + 1 < oh) { + vstore4((T4)(r1.w, r5.w, r9.w, rd.w), 0, out + out_off + 4); + } + if (x_off + 2 < oh) { + vstore4((T4)(r2.w, r6.w, ra.w, re.w), 0, out + out_off + 8); + } + if (x_off + 3 < oh) { + vstore4((T4)(r3.w, r7.w, rb.w, rf.w), 0, out + out_off + 12); + } + } +#endif +} diff --git a/tensor_computing/src/gpu/mali/cl/conv_wino_trans_outbuf_right.cl b/compute/tensor/src/gpu/mali/cl/conv_wino_trans_outbuf_right.cl similarity index 64% rename from tensor_computing/src/gpu/mali/cl/conv_wino_trans_outbuf_right.cl rename to compute/tensor/src/gpu/mali/cl/conv_wino_trans_outbuf_right.cl index 5fbfe37e..5c975e25 100644 --- a/tensor_computing/src/gpu/mali/cl/conv_wino_trans_outbuf_right.cl +++ 
b/compute/tensor/src/gpu/mali/cl/conv_wino_trans_outbuf_right.cl @@ -11,39 +11,49 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include"kernel_def.h" -#define loadR(val, str, off, in) {\ - val[0] = in[off];\ - val[1] = in[off + str];\ - val[2] = in[off + str * 2];\ - val[3] = in[off + str * 3];\ - val[4] = in[off + str * 4];\ - val[5] = in[off + str * 5];\ -} - -#define calCore(s, t, tmp){\ - t[0] = s[1] + s[2];\ - t[1] = s[3] + s[4];\ - t[2] = s[1] - s[2];\ - t[3] = s[3] - s[4];\ - tmp[0] = s[0] + t[0] + t[1];\ - tmp[1] = t[2] + (T)(2.0) * t[3];\ - tmp[2] = t[0] + (T)(4.0) * t[1];\ - tmp[3] = t[2] + (T)(8.0) * t[3] + s[5];\ -} +#include "kernel_def.h" +#define loadR(val, str, off, in) \ + { \ + val[0] = in[off]; \ + val[1] = in[off + str]; \ + val[2] = in[off + str * 2]; \ + val[3] = in[off + str * 3]; \ + val[4] = in[off + str * 4]; \ + val[5] = in[off + str * 5]; \ + } +#define calCore(s, t, tmp) \ + { \ + t[0] = s[1] + s[2]; \ + t[1] = s[3] + s[4]; \ + t[2] = s[1] - s[2]; \ + t[3] = s[3] - s[4]; \ + tmp[0] = s[0] + t[0] + t[1]; \ + tmp[1] = t[2] + (T)(2.0) * t[3]; \ + tmp[2] = t[0] + (T)(4.0) * t[1]; \ + tmp[3] = t[2] + (T)(8.0) * t[3] + s[5]; \ + } -__kernel void conv_wino_trans_outbuf_right - (const int iw_str, const int iwh_str, const int wino_h, const int wino_w, const int wino_h6, const int wino_hw, __global const T* in, __global T* out) { +__kernel void conv_wino_trans_outbuf_right(const int iw_str, + const int iwh_str, + const int wino_h, + const int wino_w, + const int wino_h6, + const int wino_hw, + __global const T *in, + __global T *out) +{ const int idx = get_global_id(0); const int idy = get_global_id(1); const int idz = get_global_id(2); - if(idx >= wino_hw) return; + if (idx >= wino_hw) { + return; + } int in_off = idz * iwh_str * 6 + (idy << 2) * iw_str + idx; T s[6]; T4 res[4]; - for(int ii = 0; ii < 4; ++ii) { + for (int ii = 0; ii < 4; ++ii) { loadR(s, iwh_str, in_off, in); res[0] = res[1]; res[1] = res[2]; @@ -58,8 +68,8 @@ __kernel void conv_wino_trans_outbuf_right const int idx_i = idx % wino_h; const int idx_j = idx / wino_h; const int out_off = (idy * 24 * wino_w + idx_j * 24 + idz) * wino_h + idx_i; - vstore4((T4)(res[0].x, res[1].x, res[2].x, res[3].x), out_off, out); - vstore4((T4)(res[0].y, res[1].y, res[2].y, res[3].y), out_off + wino_h6, out); + vstore4((T4)(res[0].x, res[1].x, res[2].x, res[3].x), out_off, out); + vstore4((T4)(res[0].y, res[1].y, res[2].y, res[3].y), out_off + wino_h6, out); vstore4((T4)(res[0].z, res[1].z, res[2].z, res[3].z), out_off + wino_h6 * 2, out); vstore4((T4)(res[0].w, res[1].w, res[2].w, res[3].w), out_off + wino_h6 * 3, out); } diff --git a/compute/tensor/src/gpu/mali/cl/conv_wino_trans_picbuf.cl b/compute/tensor/src/gpu/mali/cl/conv_wino_trans_picbuf.cl new file mode 100644 index 00000000..6493518c --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_wino_trans_picbuf.cl @@ -0,0 +1,137 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#define loadH(val, off, pic) \ + { \ + val[0] = pic[off]; \ + val[1] = pic[off + 4]; \ + val[2] = pic[off + 8]; \ + val[3] = pic[off + 12]; \ + val[4] = pic[off + 16]; \ + val[5] = pic[off + 20]; \ + } + +__kernel void conv_wino_trans_picbuf(const int ih_str4, + const int iw_str, + const int ih_off, + const int iw_off, + const int oh4, + const int pw_str, + const int pwh_str, + __global const T *in, + __global T *pictran) +{ + const int id = get_global_id(0); + const int idhc = id % oh4; + const int idx = idhc >> 2; + const int idc = idhc & 3; + const int idy = id / oh4; + const int idz = get_global_id(1); + + const int in_off = + (idz * iw_str + (idy << 2) + iw_off) * ih_str4 + (idx << 4) + idc + (ih_off << 2); + const int pictran_off = ((idz << 2) + idc) * pw_str + (id >> 2); + T tmp[16]; + T h0[6], h1[6], h2[6], h3[6], h4[6], h5[6]; + + loadH(h0, in_off, in); + loadH(h1, in_off + ih_str4, in); + loadH(h2, in_off + ih_str4 * 2, in); + loadH(h3, in_off + ih_str4 * 3, in); + loadH(h4, in_off + ih_str4 * 4, in); + loadH(h5, in_off + ih_str4 * 5, in); + + h1[0] = (T)(4.0) * h1[0] - (T)(5.0) * h1[2] + h1[4]; + h2[0] = (T)(4.0) * h2[0] - (T)(5.0) * h2[2] + h2[4]; + h3[0] = (T)(4.0) * h3[0] - (T)(5.0) * h3[2] + h3[4]; + h4[0] = (T)(4.0) * h4[0] - (T)(5.0) * h4[2] + h4[4]; + + tmp[0] = (T)(-4.0) * (h1[1] + h1[2]) + h1[3] + h1[4]; + tmp[1] = (T)(-4.0) * (h2[1] + h2[2]) + h2[3] + h2[4]; + tmp[2] = (T)(-4.0) * (h3[1] + h3[2]) + h3[3] + h3[4]; + tmp[3] = (T)(-4.0) * (h4[1] + h4[2]) + h4[3] + h4[4]; + + tmp[4] = (T)(4.0) * (h1[1] - h1[2]) - h1[3] + h1[4]; + tmp[5] = (T)(4.0) * (h2[1] - h2[2]) - h2[3] + h2[4]; + tmp[6] = (T)(4.0) * (h3[1] - h3[2]) - h3[3] + h3[4]; + tmp[7] = (T)(4.0) * (h4[1] - h4[2]) - h4[3] + h4[4]; + + tmp[8] = (T)(2.0) * (h1[3] - h1[1]) - h1[2] + h1[4]; + tmp[9] = (T)(2.0) * (h2[3] - h2[1]) - h2[2] + h2[4]; + tmp[10] = (T)(2.0) * (h3[3] - h3[1]) - h3[2] + h3[4]; + tmp[11] = (T)(2.0) * (h4[3] - h4[1]) - h4[2] + h4[4]; + + tmp[12] = (T)(2.0) * (h1[1] - h1[3]) - h1[2] + h1[4]; + tmp[13] = (T)(2.0) * (h2[1] - h2[3]) - h2[2] + h2[4]; + tmp[14] = (T)(2.0) * (h3[1] - h3[3]) - h3[2] + h3[4]; + tmp[15] = (T)(2.0) * (h4[1] - h4[3]) - h4[2] + h4[4]; + + h1[5] = (T)(4.0) * h1[1] - (T)(5.0) * h1[3] + h1[5]; + h2[5] = (T)(4.0) * h2[1] - (T)(5.0) * h2[3] + h2[5]; + h3[5] = (T)(4.0) * h3[1] - (T)(5.0) * h3[3] + h3[5]; + h4[5] = (T)(4.0) * h4[1] - (T)(5.0) * h4[3] + h4[5]; + + pictran[pictran_off] = + (T)(16.0) * h0[0] - (T)(20.0) * h0[2] + (T)(4.0) * 
h0[4] - (T)(5.0) * h2[0] + h4[0]; + pictran[pictran_off + pwh_str] = (T)(-4.0) * (h1[0] + h2[0]) + h3[0] + h4[0]; + pictran[pictran_off + pwh_str * 2] = (T)(4.0) * (h1[0] - h2[0]) - h3[0] + h4[0]; + pictran[pictran_off + pwh_str * 3] = (T)(2.0) * (h3[0] - h1[0]) - h2[0] + h4[0]; + pictran[pictran_off + pwh_str * 4] = (T)(2.0) * (h1[0] - h3[0]) - h2[0] + h4[0]; + pictran[pictran_off + pwh_str * 5] = + (T)(4.0) * (h1[0] + h5[0]) - (T)(5.0) * (h3[0] + h5[2]) + h5[4]; + + pictran[pictran_off + pwh_str * 6] = + (T)(-16.0) * (h0[1] + h0[2]) + (T)(4.0) * (h0[3] + h0[4]) - (T)(5.0) * tmp[1] + tmp[3]; + pictran[pictran_off + pwh_str * 7] = (T)(-4.0) * (tmp[0] + tmp[1]) + tmp[2] + tmp[3]; + pictran[pictran_off + pwh_str * 8] = (T)(4.0) * (tmp[0] - tmp[1]) - tmp[2] + tmp[3]; + pictran[pictran_off + pwh_str * 9] = (T)(2.0) * (tmp[2] - tmp[0]) - tmp[1] + tmp[3]; + pictran[pictran_off + pwh_str * 10] = (T)(2.0) * (tmp[0] - tmp[2]) - tmp[1] + tmp[3]; + pictran[pictran_off + pwh_str * 11] = + (T)(4.0) * (tmp[0] - h5[1] - h5[2]) - (T)(5.0) * tmp[2] + h5[3] + h5[4]; + + pictran[pictran_off + pwh_str * 12] = + (T)(16.0) * (h0[1] - h0[2]) + (T)(4.0) * (h0[4] - h0[3]) - (T)(5.0) * tmp[5] + tmp[7]; + pictran[pictran_off + pwh_str * 13] = (T)(-4.0) * (tmp[4] + tmp[5]) + tmp[6] + tmp[7]; + pictran[pictran_off + pwh_str * 14] = (T)(4.0) * (tmp[4] - tmp[5]) - tmp[6] + tmp[7]; + pictran[pictran_off + pwh_str * 15] = (T)(2.0) * (tmp[6] - tmp[4]) - tmp[5] + tmp[7]; + pictran[pictran_off + pwh_str * 16] = (T)(2.0) * (tmp[4] - tmp[6]) - tmp[5] + tmp[7]; + pictran[pictran_off + pwh_str * 17] = + (T)(4.0) * (tmp[4] + h5[1] - h5[2]) - (T)(5.0) * tmp[6] - h5[3] + h5[4]; + + pictran[pictran_off + pwh_str * 18] = + (T)(8.0) * (h0[3] - h0[1]) + (T)(4.0) * (h0[4] - h0[2]) - (T)(5.0) * tmp[9] + tmp[11]; + pictran[pictran_off + pwh_str * 19] = (T)(-4.0) * (tmp[8] + tmp[9]) + tmp[10] + tmp[11]; + pictran[pictran_off + pwh_str * 20] = (T)(4.0) * (tmp[8] - tmp[9]) - tmp[10] + tmp[11]; + pictran[pictran_off + pwh_str * 21] = (T)(2.0) * (tmp[10] - tmp[8]) - tmp[9] + tmp[11]; + pictran[pictran_off + pwh_str * 22] = (T)(2.0) * (tmp[8] - tmp[10]) - tmp[9] + tmp[11]; + pictran[pictran_off + pwh_str * 23] = + (T)(4.0) * tmp[8] + (T)(2.0) * (h5[3] - h5[1]) - h5[2] - (T)(5.0) * tmp[10] + h5[4]; + + pictran[pictran_off + pwh_str * 24] = + (T)(8.0) * (h0[1] - h0[3]) + (T)(4.0) * (h0[4] - h0[2]) - (T)(5.0) * tmp[13] + tmp[15]; + pictran[pictran_off + pwh_str * 25] = (T)(-4.0) * (tmp[12] + tmp[13]) + tmp[14] + tmp[15]; + pictran[pictran_off + pwh_str * 26] = (T)(4.0) * (tmp[12] - tmp[13]) - tmp[14] + tmp[15]; + pictran[pictran_off + pwh_str * 27] = (T)(2.0) * (tmp[14] - tmp[12]) - tmp[13] + tmp[15]; + pictran[pictran_off + pwh_str * 28] = (T)(2.0) * (tmp[12] - tmp[14]) - tmp[13] + tmp[15]; + pictran[pictran_off + pwh_str * 29] = + (T)(4.0) * tmp[12] + (T)(2.0) * (h5[1] - h5[3]) - h5[2] - (T)(5.0) * tmp[14] + h5[4]; + + pictran[pictran_off + pwh_str * 30] = + (T)(16.0) * h0[1] - (T)(20.0) * h0[3] + (T)(4.0) * h0[5] - (T)(5.0) * h2[5] + h4[5]; + pictran[pictran_off + pwh_str * 31] = (T)(-4.0) * (h1[5] + h2[5]) + h3[5] + h4[5]; + pictran[pictran_off + pwh_str * 32] = (T)(4.0) * (h1[5] - h2[5]) - h3[5] + h4[5]; + pictran[pictran_off + pwh_str * 33] = (T)(2.0) * (h3[5] - h1[5]) - h2[5] + h4[5]; + pictran[pictran_off + pwh_str * 34] = (T)(2.0) * (h1[5] - h3[5]) - h2[5] + h4[5]; + pictran[pictran_off + pwh_str * 35] = + (T)(4.0) * (h1[5] + h5[1]) - (T)(5.0) * (h3[5] + h5[3]) + h5[5]; +} diff --git 
a/tensor_computing/src/gpu/mali/cl/conv_wino_trans_picbuf_left.cl b/compute/tensor/src/gpu/mali/cl/conv_wino_trans_picbuf_left.cl similarity index 78% rename from tensor_computing/src/gpu/mali/cl/conv_wino_trans_picbuf_left.cl rename to compute/tensor/src/gpu/mali/cl/conv_wino_trans_picbuf_left.cl index 7f9e6299..7adc3603 100644 --- a/tensor_computing/src/gpu/mali/cl/conv_wino_trans_picbuf_left.cl +++ b/compute/tensor/src/gpu/mali/cl/conv_wino_trans_picbuf_left.cl @@ -12,19 +12,32 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include "kernel_def.h" -#define MANGLE_NAME_IMPL(base, ON) base ## ON +#define MANGLE_NAME_IMPL(base, ON) base##ON #define MANGLE_NAME(base, ON) MANGLE_NAME_IMPL(base, ON) - -__kernel void MANGLE_NAME(conv_wino_trans_picbuf_left_, ON) -(const int ih_str, const int iw_str, const int ic_str, const int oh_str, const int ow_str, const int ohw_str, const int ohwc_str, const int bx, const int by, __global const T* in, __global T* out) { +__kernel void MANGLE_NAME(conv_wino_trans_picbuf_left_, ON)(const int ih_str, + const int iw_str, + const int ic_str, + const int oh_str, + const int ow_str, + const int ohw_str, + const int ohwc_str, + const int bx, + const int by, + __global const T *in, + __global T *out) +{ const int idx = get_global_id(0); const int idy = get_global_id(1); const int idz = get_global_id(2); - if(idx >= bx || idy >= by) return; + if (idx >= bx || idy >= by) { + return; + } const int idzx = idz % ic_str; const int idzy = idz / ic_str; - if(idx * ON >= oh_str) return; + if (idx * ON >= oh_str) { + return; + } T in_val[6]; T out_val0[ON]; T out_val1[ON]; @@ -38,7 +51,7 @@ __kernel void MANGLE_NAME(conv_wino_trans_picbuf_left_, ON) LOAD_BUF_ARRAY2(in_val, in_off, in); - for(uchar i = 0; i < ON; ++i) { + for (uchar i = 0; i < ON; ++i) { T4 tmp = vload4(0, in + in_off + 2); in_val[2] = tmp.x; in_val[3] = tmp.y; @@ -51,17 +64,17 @@ __kernel void MANGLE_NAME(conv_wino_trans_picbuf_left_, ON) UPDATE_REG(out_val4); UPDATE_REG(out_val5); p[0] = -4; - p[1] = 1; - for(uchar j = 0; j < 2; ++j) { + p[1] = 1; + for (uchar j = 0; j < 2; ++j) { out_val1[UN] = out_val2[UN]; out_val2[UN] = p[0] * in_val[1] - (T)(4.0) * in_val[2] + p[1] * in_val[3] + in_val[4]; - p[0] = -p[0]; - p[1] = -p[1]; + p[0] = -p[0]; + p[1] = -p[1]; } p[0] = -2; - p[1] = 2; - for(uchar j = 0; j < 2; ++j) { + p[1] = 2; + for (uchar j = 0; j < 2; ++j) { out_val3[UN] = out_val4[UN]; out_val4[UN] = p[0] * in_val[1] - in_val[2] + p[1] * in_val[3] + in_val[4]; p[0] = -p[0]; @@ -70,7 +83,7 @@ __kernel void MANGLE_NAME(conv_wino_trans_picbuf_left_, ON) p[0] = 4; p[1] = -5; - for(uchar j = 0; j < 2; j++) { + for (uchar j = 0; j < 2; j++) { out_val0[UN] = out_val5[UN]; out_val5[UN] = p[0] * in_val[0] + p[1] * in_val[2] + in_val[4]; in_val[0] = in_val[1]; @@ -83,10 +96,10 @@ __kernel void MANGLE_NAME(conv_wino_trans_picbuf_left_, ON) in_val[1] = in_val[3]; in_off += 4; } - - int out_off = idzy * ohwc_str + idzx * ohw_str + idy * oh_str + idx * ON; - STORE_OUTPUT_BUF_ARRAY_ALIGN(out_val0, out_off, 1, out); - STORE_OUTPUT_BUF_ARRAY_ALIGN(out_val1, out_off + 6 * ohwc_str, 1, out); + + int out_off = idzy * ohwc_str + idzx * ohw_str + idy * oh_str + idx * ON; + STORE_OUTPUT_BUF_ARRAY_ALIGN(out_val0, out_off, 1, out); + STORE_OUTPUT_BUF_ARRAY_ALIGN(out_val1, out_off + 6 * ohwc_str, 1, out); STORE_OUTPUT_BUF_ARRAY_ALIGN(out_val2, out_off + 12 * ohwc_str, 1, out); STORE_OUTPUT_BUF_ARRAY_ALIGN(out_val3, out_off + 18 * ohwc_str, 1, out); 
STORE_OUTPUT_BUF_ARRAY_ALIGN(out_val4, out_off + 24 * ohwc_str, 1, out); diff --git a/tensor_computing/src/gpu/mali/cl/conv_wino_trans_picbuf_right.cl b/compute/tensor/src/gpu/mali/cl/conv_wino_trans_picbuf_right.cl similarity index 77% rename from tensor_computing/src/gpu/mali/cl/conv_wino_trans_picbuf_right.cl rename to compute/tensor/src/gpu/mali/cl/conv_wino_trans_picbuf_right.cl index 65c56270..62894fb7 100644 --- a/tensor_computing/src/gpu/mali/cl/conv_wino_trans_picbuf_right.cl +++ b/compute/tensor/src/gpu/mali/cl/conv_wino_trans_picbuf_right.cl @@ -12,14 +12,29 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include "kernel_def.h" -__kernel void conv_wino_trans_picbuf_right -(const int ih_str4, const int iw_str, const int ih_off4, const int iw_off, const int oh_str, const int ow_str, const int ohwc_str, const int oh_off4, const int bx, const int by, __global const T* in, __global T* out) { +__kernel void conv_wino_trans_picbuf_right(const int ih_str4, + const int iw_str, + const int ih_off4, + const int iw_off, + const int oh_str, + const int ow_str, + const int ohwc_str, + const int oh_off4, + const int bx, + const int by, + __global const T *in, + __global T *out) +{ int idx = get_global_id(0); const int idy = get_global_id(1); const int idz = get_global_id(2); - if(idx >= bx || idy >= by) return; + if (idx >= bx || idy >= by) { + return; + } idx = idx - oh_off4; - if((idy << 2) >= ow_str) return; + if ((idy << 2) >= ow_str) { + return; + } T out_val0[4]; T out_val1[4]; T out_val2[4]; @@ -34,12 +49,12 @@ __kernel void conv_wino_trans_picbuf_right SET_REG_ARRAY(0, out_val4); SET_REG_ARRAY(0, out_val5); - if(idx >= 0 && idx < ih_str4) { + if (idx >= 0 && idx < ih_str4) { int in_off = (idz * iw_str + (idy << 4) + iw_off) * ih_str4 + idx + ih_off4; - T in_val[6]; + T in_val[6]; in_val[0] = in[in_off]; in_val[1] = in[in_off + ih_str4]; - for(uchar i = 0; i < 4; ++i) { + for (uchar i = 0; i < 4; ++i) { in_val[2] = in[in_off + 2 * ih_str4]; in_val[3] = in[in_off + 3 * ih_str4]; in_val[4] = in[in_off + 4 * ih_str4]; @@ -53,17 +68,17 @@ __kernel void conv_wino_trans_picbuf_right UPDATE_REG(out_val5); p[0] = -4; - p[1] = 1; - for(uchar j = 0; j < 2; ++j) { + p[1] = 1; + for (uchar j = 0; j < 2; ++j) { out_val1[UN] = out_val2[UN]; out_val2[UN] = p[0] * in_val[1] - (T)(4) * in_val[2] + p[1] * in_val[3] + in_val[4]; - p[0] = -p[0]; - p[1] = -p[1]; + p[0] = -p[0]; + p[1] = -p[1]; } p[0] = -2; - p[1] = 2; - for(uchar j = 0; j < 2; ++j) { + p[1] = 2; + for (uchar j = 0; j < 2; ++j) { out_val3[UN] = out_val4[UN]; out_val4[UN] = p[0] * in_val[1] - in_val[2] + p[1] * in_val[3] + in_val[4]; p[0] = -p[0]; @@ -72,7 +87,7 @@ __kernel void conv_wino_trans_picbuf_right p[0] = 4; p[1] = -5; - for(uchar j = 0; j < 2; ++j) { + for (uchar j = 0; j < 2; ++j) { out_val0[UN] = out_val5[UN]; out_val5[UN] = p[0] * in_val[0] + p[1] * in_val[2] + in_val[4]; in_val[0] = in_val[1]; @@ -86,11 +101,11 @@ __kernel void conv_wino_trans_picbuf_right in_off += (ih_str4 << 2); } } - + idx += oh_off4; - int out_off = (((idz << 2) + (idx & 3)) * ow_str + (idy << 2)) * oh_str + (idx >> 2); - STORE_OUTPUT_BUF_ARRAY_ALIGN(out_val0, out_off, oh_str, out); - STORE_OUTPUT_BUF_ARRAY_ALIGN(out_val1, out_off + ohwc_str, oh_str, out); + int out_off = (((idz << 2) + (idx & 3)) * ow_str + (idy << 2)) * oh_str + (idx >> 2); + STORE_OUTPUT_BUF_ARRAY_ALIGN(out_val0, out_off, oh_str, out); + STORE_OUTPUT_BUF_ARRAY_ALIGN(out_val1, out_off + ohwc_str, oh_str, out);
STORE_OUTPUT_BUF_ARRAY_ALIGN(out_val2, out_off + 2 * ohwc_str, oh_str, out); STORE_OUTPUT_BUF_ARRAY_ALIGN(out_val3, out_off + 3 * ohwc_str, oh_str, out); STORE_OUTPUT_BUF_ARRAY_ALIGN(out_val4, out_off + 4 * ohwc_str, oh_str, out); diff --git a/compute/tensor/src/gpu/mali/cl/copy.cl b/compute/tensor/src/gpu/mali/cl/copy.cl new file mode 100644 index 00000000..fdd65036 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/copy.cl @@ -0,0 +1,87 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#define MANGLE_NAME_IMPL(base, DT) base##DT +#define MANGLE_NAME(base, DT) MANGLE_NAME_IMPL(base, DT) + +#if defined(USE_BLOCK_INDEX) +__kernel void MANGLE_NAME(copy_with_block_index_, DT)(const int s_len, + const int d_len, + const int s_off, + const int d_off, + const int s_str, + const int d_str, + const int bx, + __global const int *srcBlockIndex, + __global const int *dstBlockIndex, + __global const T *src, + __global T *dst) +{ +#else +__kernel void MANGLE_NAME(copy_, DT)(const int s_len, + const int d_len, + const int s_off, + const int d_off, + const int bx, + __global const T *src, + __global T *dst) +{ +#endif + int idx = get_global_id(0); + if (idx >= bx) { + return; + } + char s_ex = (((idx << 2) + 4) <= s_len) ? 4 : (s_len & 3); + char d_ex = (((idx << 2) + 4) <= d_len) ? 4 : (d_len & 3); + if ((idx << 2) >= s_len) { + s_ex = 0; + } + if ((idx << 2) >= d_len) { + d_ex = 0; + } +#if defined(USE_BLOCK_INDEX) + s_off = s_off + s_str * srcBlockIndex[0]; + d_off = d_off + d_str * dstBlockIndex[0]; +#endif + int src_off = s_off + (idx << 2); + int dst_off = d_off + (idx << 2); + + T4 val = 0; + if (s_ex == 4) { + val = vload4(0, src + src_off); + } else { + if (s_ex == 1) { + val.x = src[src_off]; + } + if (s_ex == 2) { + val.xy = vload2(0, src + src_off); + } + if (s_ex == 3) { + val.xyz = vload3(0, src + src_off); + } + } + + if (d_ex == 4) { + vstore4(val, 0, dst + dst_off); + } else { + if (d_ex == 1) { + dst[dst_off] = val.x; + } + if (d_ex == 2) { + vstore2(val.xy, 0, dst + dst_off); + } + if (d_ex == 3) { + vstore3(val.xyz, 0, dst + dst_off); + } + } +} diff --git a/compute/tensor/src/gpu/mali/cl/deconv_direct.cl b/compute/tensor/src/gpu/mali/cl/deconv_direct.cl new file mode 100644 index 00000000..039fa0d2 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/deconv_direct.cl @@ -0,0 +1,100 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + +__kernel void deconv_direct(__global const T *input, + __global const T *weights, + __global T *output, + __read_only image1d_t bias, + int iw, + int iw_str, + int iw_off, + int ih, + int ih_str, + int ih_off, + int kw, + int kh, + int kc, + int kn, + int sw, + int sh, + int pw, + int ph, + int ow, + int ow_str, + int ow_off, + int oh, + int oh_str, + int oh_off, + int ic, + int oc, + int align_h, + int align_w, + int in_channel_blocks, + int out_channel_blocks) +{ + const int oh_idx = get_global_id(0); + const int ow_idx = get_global_id(1); + const int oc_idx = get_global_id(2); + if (oh_idx >= oh || ow_idx >= ow || oc_idx >= oc) { + return; + } + + T4 out0 = read_imageh(bias, sampler, oc_idx); + + int kernel_start_x = max(0, (oh_idx + align_h) / sh); + int kernel_start_y = max(0, (ow_idx + align_w) / sw); + + int deal_kernel_width = kw - (kernel_start_y * sw + pw) + ow_idx - 1; + int deal_kernel_height = kh - (kernel_start_x * sh + ph) + oh_idx - 1; + + int kernel_0, kernel_1, kernel_2, kernel_3, kernel_y; + T4 in0; + T4 weights0, weights1, weights2, weights3; + int in_off, kernel_off; + for (int i = 0; i < in_channel_blocks; i++) { + kernel_0 = 0; + kernel_1 = kernel_0 + 1; + kernel_2 = kernel_0 + 2; + kernel_3 = kernel_0 + 3; + for (int k_y = deal_kernel_width, idx_w = kernel_start_y; k_y >= 0; k_y -= sw, idx_w++) { + int in_width0 = idx_w; + int in_height0 = kernel_start_x; + for (int k_x = deal_kernel_height; k_x >= 0; k_x -= sh) { + kernel_off = + (oc_idx * kw * kh * in_channel_blocks + i * kw * kh + k_x * kh + k_y) * 4; + weights0 = vload4(kernel_off + kernel_0, weights); + weights1 = vload4(kernel_off + kernel_1, weights); + weights2 = vload4(kernel_off + kernel_2, weights); + weights3 = vload4(kernel_off + kernel_3, weights); + + // in_off = i * ih * iw + ih * in_width0 + in_height0; + in_off = (i * iw_str + in_width0 + iw_off) * ih_str + ih_off + in_height0; + if (in_height0 < 0 || in_height0 >= ih || in_width0 < 0 || in_width0 >= iw) { + in0 = (T4)0; + } else { + in0 = vload4(in_off, input); + } + + out0 = mad(in0.x, weights0, out0); + out0 = mad(in0.y, weights1, out0); + out0 = mad(in0.z, weights2, out0); + out0 = mad(in0.w, weights3, out0); + in_height0++; + } + } + } + int out_off = (oc_idx * ow_str + ow_idx + ow_off) * oh_str + oh_idx + oh_off; + vstore4(out0, out_off, output); +} diff --git 
a/compute/tensor/src/gpu/mali/cl/deconv_direct_trans_fltbuf.cl b/compute/tensor/src/gpu/mali/cl/deconv_direct_trans_fltbuf.cl new file mode 100644 index 00000000..6f2f4b7c --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/deconv_direct_trans_fltbuf.cl @@ -0,0 +1,48 @@ +#define loadFltval(off, str, flt, val) \ + { \ + val.x = flt[off]; \ + val.y = flt[off + str]; \ + val.z = flt[off + str * 2]; \ + val.w = flt[off + str * 3]; \ + } +#define loadFltvalEdge(off, str, flt, val, edge) \ + { \ + val.x = flt[off]; \ + if (edge > 1) \ + val.y = flt[off + str]; \ + if (edge > 2) \ + val.z = flt[off + str * 2]; \ + } + +// conv filter gs[3] = {fwh, (fc+3)/4, (fn+3)/4*4}; +// deconv filter gs[3] = {fwh, (fc+3)/4*4, (fn+3)/4}; +// iohw -> nchwn4c4 + +__kernel void deconv_direct_trans_fltbuf( + const int fwh, const int fc, const int fn, __global const T *fltdata, __global T *fltbuf) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + + short ec = ((idy + 1) * 4 <= fc) ? 4 : (fc % 4); + + int flt_off = (idz * fc + idy * 4) * fwh + idx; + + T4 val = 0; + + int str = fwh; + if (idz < fn) { + if (ec == 4) { + loadFltval(flt_off, str, fltdata, val); + } else { + loadFltvalEdge(flt_off, str, fltdata, val, ec); + } + } + int bc = (fn + 4 - 1) / 4; + int out_off; + out_off = (idy * bc + idz / 4) * fwh * 4 + idx * 4 + (idz % 4); + // out_off = (idy / 4 * bc + idz) * fwh * 4 + idx * 4 + (idy % 4); + + vstore4(val, out_off, fltbuf); +} diff --git a/compute/tensor/src/gpu/mali/cl/deconv_gemm_f2s2.cl b/compute/tensor/src/gpu/mali/cl/deconv_gemm_f2s2.cl new file mode 100644 index 00000000..e9d95f59 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/deconv_gemm_f2s2.cl @@ -0,0 +1,339 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
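// Note on deconv_direct_trans_fltbuf above -- an illustrative walk-through with
// hypothetical sizes fn = 6, fc = 4 (so bc = (fn + 3) / 4 = 2): work-item
// (idx, idy, idz) = (0, 0, 5) gathers input channels 0..3 of filter 5 at
// spatial position 0 into one T4 and writes it as T4 slot
//   out_off = (idy * bc + idz / 4) * fwh * 4 + idx * 4 + (idz % 4)
//           = (0 * 2 + 1) * fwh * 4 + 0 + 1.
// Filters are thus tiled in groups of 4 (n4) with 4 input channels packed per
// T4 (c4), matching the iohw -> nchwn4c4 comment; filters beyond fn and
// channels beyond fc stay zero because val is zero-initialized.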
+ +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, ON, KN) base##ON##KN +#define MANGLE_NAME(base, ON, KN) MANGLE_NAME_IMPL(base, ON, KN) + +#if defined(REUSE_H) +#if (ON == 2) +#define SET_BIAS_VAL(bv, ov) \ + { \ + ov.s0 = bv.x; \ + ov.s1 = bv.y; \ + ov.s2 = bv.z; \ + ov.s3 = bv.w; \ + ov.s4 = bv.x; \ + ov.s5 = bv.y; \ + ov.s6 = bv.z; \ + ov.s7 = bv.w; \ + } + +#define calCore(iv, fv, ov) \ + { \ + ov.s0 += iv.s0 * fv.s0 + iv.s1 * fv.s1 + iv.s2 * fv.s2 + iv.s3 * fv.s3; \ + ov.s1 += iv.s0 * fv.s4 + iv.s1 * fv.s5 + iv.s2 * fv.s6 + iv.s3 * fv.s7; \ + ov.s2 += iv.s0 * fv.s8 + iv.s1 * fv.s9 + iv.s2 * fv.sa + iv.s3 * fv.sb; \ + ov.s3 += iv.s0 * fv.sc + iv.s1 * fv.sd + iv.s2 * fv.se + iv.s3 * fv.sf; \ + ov.s4 += iv.s4 * fv.s0 + iv.s5 * fv.s1 + iv.s6 * fv.s2 + iv.s7 * fv.s3; \ + ov.s5 += iv.s4 * fv.s4 + iv.s5 * fv.s5 + iv.s6 * fv.s6 + iv.s7 * fv.s7; \ + ov.s6 += iv.s4 * fv.s8 + iv.s5 * fv.s9 + iv.s6 * fv.sa + iv.s7 * fv.sb; \ + ov.s7 += iv.s4 * fv.sc + iv.s5 * fv.sd + iv.s6 * fv.se + iv.s7 * fv.sf; \ + } +#define VLOAD_VEC(off, buf) vload8(0, buf + off); +#define VSTORE_VEC(v0, v1, off, buf) \ + { \ + ACTIVATION_V8(v0); \ + ACTIVATION_V8(v1); \ + vstore16((T16)(v0.s0, v0.s1, v0.s2, v0.s3, v1.s0, v1.s1, v1.s2, v1.s3, v0.s4, v0.s5, \ + v0.s6, v0.s7, v1.s4, v1.s5, v1.s6, v1.s7), \ + 0, buf + off); \ + } +#elif (ON == 4) +#define SET_BIAS_VAL(bv, ov) \ + { \ + ov.s0 = bv.x; \ + ov.s1 = bv.y; \ + ov.s2 = bv.z; \ + ov.s3 = bv.w; \ + ov.s4 = bv.x; \ + ov.s5 = bv.y; \ + ov.s6 = bv.z; \ + ov.s7 = bv.w; \ + ov.s8 = bv.x; \ + ov.s9 = bv.y; \ + ov.sa = bv.z; \ + ov.sb = bv.w; \ + ov.sc = bv.x; \ + ov.sd = bv.y; \ + ov.se = bv.z; \ + ov.sf = bv.w; \ + } +#define calCore(iv, fv, ov) \ + { \ + ov.s0 += iv.s0 * fv.s0 + iv.s1 * fv.s1 + iv.s2 * fv.s2 + iv.s3 * fv.s3; \ + ov.s1 += iv.s0 * fv.s4 + iv.s1 * fv.s5 + iv.s2 * fv.s6 + iv.s3 * fv.s7; \ + ov.s2 += iv.s0 * fv.s8 + iv.s1 * fv.s9 + iv.s2 * fv.sa + iv.s3 * fv.sb; \ + ov.s3 += iv.s0 * fv.sc + iv.s1 * fv.sd + iv.s2 * fv.se + iv.s3 * fv.sf; \ + ov.s4 += iv.s4 * fv.s0 + iv.s5 * fv.s1 + iv.s6 * fv.s2 + iv.s7 * fv.s3; \ + ov.s5 += iv.s4 * fv.s4 + iv.s5 * fv.s5 + iv.s6 * fv.s6 + iv.s7 * fv.s7; \ + ov.s6 += iv.s4 * fv.s8 + iv.s5 * fv.s9 + iv.s6 * fv.sa + iv.s7 * fv.sb; \ + ov.s7 += iv.s4 * fv.sc + iv.s5 * fv.sd + iv.s6 * fv.se + iv.s7 * fv.sf; \ + ov.s8 += iv.s8 * fv.s0 + iv.s9 * fv.s1 + iv.sa * fv.s2 + iv.sb * fv.s3; \ + ov.s9 += iv.s8 * fv.s4 + iv.s9 * fv.s5 + iv.sa * fv.s6 + iv.sb * fv.s7; \ + ov.sa += iv.s8 * fv.s8 + iv.s9 * fv.s9 + iv.sa * fv.sa + iv.sb * fv.sb; \ + ov.sb += iv.s8 * fv.sc + iv.s9 * fv.sd + iv.sa * fv.se + iv.sb * fv.sf; \ + ov.sc += iv.sc * fv.s0 + iv.sd * fv.s1 + iv.se * fv.s2 + iv.sf * fv.s3; \ + ov.sd += iv.sc * fv.s4 + iv.sd * fv.s5 + iv.se * fv.s6 + iv.sf * fv.s7; \ + ov.se += iv.sc * fv.s8 + iv.sd * fv.s9 + iv.se * fv.sa + iv.sf * fv.sb; \ + ov.sf += iv.sc * fv.sc + iv.sd * fv.sd + iv.se * fv.se + iv.sf * fv.sf; \ + } + +#define VLOAD_VEC(off, buf) vload16(0, buf + off); +#define VSTORE_VEC(v0, v1, off, buf) \ + { \ + ACTIVATION_V16(v0); \ + ACTIVATION_V16(v1); \ + vstore16((T16)(v0.s0, v0.s1, v0.s2, v0.s3, v1.s0, v1.s1, v1.s2, v1.s3, v0.s4, v0.s5, \ + v0.s6, v0.s7, v1.s4, v1.s5, v1.s6, v1.s7), \ + 0, buf + off); \ + vstore16((T16)(v0.s8, v0.s9, v0.sa, v0.sb, v1.s8, v1.s9, v1.sa, v1.sb, v0.sc, v0.sd, \ + v0.se, v0.sf, v1.sc, v1.sd, v1.se, v1.sf), \ + 0, buf + off + 16); \ + } +#endif + +#if defined(USE_RELU) +__kernel void MANGLE_NAME(deconv_gemm_f2s2_h_relu_, ON, KN) +#else +__kernel void MANGLE_NAME(deconv_gemm_f2s2_h_, ON, KN) 
+#endif + (const int ih_str, + int ihw_str, + const int ic_str, + const int ih_off, + const int iw_off, + const int oh_str, + int ohw_str, + const int oh_off, + const int ow_off, + const int oh, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __read_only image1d_t bias, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } +#if (ON == 2) + T8 in_val; + T8 out_val[KN]; +#elif (ON == 4) + T16 in_val; + T16 out_val[KN]; +#endif + T16 flt_val; + T4 bias_val; + +#if (KN == 2) + bias_val = read_imageh(bias, sampler, (idz >> 1)); + SET_BIAS_VAL(bias_val, out_val[0]); + SET_BIAS_VAL(bias_val, out_val[1]); +#elif (KN == 4) + bias_val = read_imageh(bias, sampler, idz); + SET_BIAS_VAL(bias_val, out_val[0]); + SET_BIAS_VAL(bias_val, out_val[1]); + SET_BIAS_VAL(bias_val, out_val[2]); + SET_BIAS_VAL(bias_val, out_val[3]); +#endif + + int in_off = ((idy + iw_off) * ih_str + idx * ON + ih_off) << 2; + int flt_off = idz * ic_str * KN; + ihw_str = ihw_str << 2; + + for (int i = 0; i < ic_str; ++i) { + in_val = VLOAD_VEC(in_off, in); +#if (KN == 2) + flt_val = vload16(flt_off, flt); + calCore(in_val, flt_val, out_val[0]); + flt_val = vload16(flt_off + 1, flt); + calCore(in_val, flt_val, out_val[1]); +#elif (KN == 4) + for (uchar j = 0; j < KN; ++j) { + flt_val = vload16(flt_off + j, flt); + if (j == 0) { + calCore(in_val, flt_val, out_val[0]); + } + if (j == 1) { + calCore(in_val, flt_val, out_val[1]); + } + if (j == 2) { + calCore(in_val, flt_val, out_val[2]); + } + if (j == 3) { + calCore(in_val, flt_val, out_val[3]); + } + } +#endif + flt_off += KN; + in_off += ihw_str; + } + +#if (KN == 2) + int out_off = (idx << 1) * ON + oh_off; + out_off += ((idy << 1) + ow_off + (idz & 1)) * oh_str; + out_off += (idz >> 1) * ohw_str; + out_off = (out_off << 2); + VSTORE_VEC(out_val[0], out_val[1], out_off, out); +#elif (KN == 4) + int out_off = (idx << 1) * ON + oh_off; + out_off += ((idy << 1) + ow_off) * oh_str; + out_off += idz * ohw_str; + out_off = (out_off << 2); + VSTORE_VEC(out_val[0], out_val[1], out_off, out); + VSTORE_VEC(out_val[2], out_val[3], out_off + oh_str * 4, out); +#endif +} + +// // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // / +#else + +#define VSTORE_VEC(v0, v1, off, buf) \ + { \ + ACTIVATION_V4(v0); \ + ACTIVATION_V4(v1); \ + vstore8((T8)(v0.s0, v0.s1, v0.s2, v0.s3, v1.s0, v1.s1, v1.s2, v1.s3), 0, buf + off); \ + } + +#if defined(USE_RELU) +__kernel void MANGLE_NAME(deconv_gemm_f2s2_relu_, ON, KN) +#else +__kernel void MANGLE_NAME(deconv_gemm_f2s2_, ON, KN) +#endif + (const int ih_str, + const int ihw_str, + const int ic_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ohw_str, + const int oh_off, + const int ow_off, + const int ow, + const int bx, + const int by, + __global const T *in, + __global const T *flt, + __read_only image1d_t bias, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } + T4 in_val[IN]; + T16 flt_val; + T4 out_val[KN][ON]; + T4 bias_val; + +#if (KN == 2) + bias_val = read_imageh(bias, sampler, (idz >> 1)); + SET_REG_ARRAY(bias_val, out_val[0]); + SET_REG_ARRAY(bias_val, 
out_val[1]); +#elif (KN == 4) + bias_val = read_imageh(bias, sampler, idz); + SET_REG_ARRAY(bias_val, out_val[0]); + SET_REG_ARRAY(bias_val, out_val[1]); + SET_REG_ARRAY(bias_val, out_val[2]); + SET_REG_ARRAY(bias_val, out_val[3]); +#endif + + int in_off = (idy * ON + iw_off) * ih_str + idx + ih_off; + int flt_off = idz * ic_str * KN; + + for (int i = 0; i < ic_str; ++i) { + LOAD_INPUT_BUF_ARRAY_V4(in_val, in_off, ih_str, in); +#if (KN == 2) + flt_val = vload16(flt_off, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[0]); + flt_val = vload16(flt_off + 1, flt); + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[1]); +#elif (KN == 4) + for (uchar j = 0; j < KN; ++j) { + flt_val = vload16(flt_off + j, flt); + if (j == 0) { + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[0]); + } + if (j == 1) { + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[1]); + } + if (j == 2) { + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[2]); + } + if (j == 3) { + DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val[3]); + } + } +#endif + flt_off += KN; + in_off += ihw_str; + } +#if (KN == 2) + int index_y = (idy << 1) * ON + (idz & 1); + int out_off = (idx << 1) + oh_off; + out_off += (index_y + ow_off) * oh_str; + out_off += (idz >> 1) * ohw_str; + out_off = (out_off << 2); + VSTORE_VEC(out_val[0][0], out_val[1][0], out_off, out); +#if (ON > 1) + if (index_y + 2 < ow) { + VSTORE_VEC(out_val[0][1], out_val[1][1], out_off + oh_str * 8, out); + } +#endif +#if (ON > 2) + if (index_y + 4 < ow) { + VSTORE_VEC(out_val[0][2], out_val[1][2], out_off + oh_str * 16, out); + } +#endif +#if (ON > 3) + if (index_y + 6 < ow) { + VSTORE_VEC(out_val[0][3], out_val[1][3], out_off + oh_str * 24, out); + } +#endif +#elif (KN == 4) + int index_y = (idy << 1) * ON; + int out_off = (idx << 1) + oh_off; + out_off += (index_y + ow_off) * oh_str; + out_off += idz * ohw_str; + out_off = (out_off << 2); + VSTORE_VEC(out_val[0][0], out_val[1][0], out_off, out); + if (index_y + 1 < ow) { + VSTORE_VEC(out_val[2][0], out_val[3][0], out_off + oh_str * 4, out); + } +#if (ON > 1) + if (index_y + 2 < ow) { + VSTORE_VEC(out_val[0][1], out_val[1][1], out_off + oh_str * 8, out); + } + if (index_y + 3 < ow) { + VSTORE_VEC(out_val[2][1], out_val[3][1], out_off + oh_str * 12, out); + } +#endif +#if (ON > 2) + if (index_y + 4 < ow) { + VSTORE_VEC(out_val[0][2], out_val[1][2], out_off + oh_str * 16, out); + } + if (index_y + 5 < ow) { + VSTORE_VEC(out_val[2][2], out_val[3][2], out_off + oh_str * 20, out); + } +#endif +#endif +} +#endif diff --git a/compute/tensor/src/gpu/mali/cl/deconv_gemm_trans_fltbuf.cl b/compute/tensor/src/gpu/mali/cl/deconv_gemm_trans_fltbuf.cl new file mode 100644 index 00000000..ea29c034 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/deconv_gemm_trans_fltbuf.cl @@ -0,0 +1,92 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#define MANGLE_NAME_IMPL(base, C, K) base##C##K +#define MANGLE_NAME(base, C, K) MANGLE_NAME_IMPL(base, C, K) +__kernel void MANGLE_NAME(deconv_gemm_trans_fltbuf_, C, K)(const int fw, + const int fwh, + const int fwhc, + const int fc, + const int fn, + __global const T *fltdata, + __global T *fltbuf) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); // (fn + 3) / 4; + const int idx_wh = idx % fwh; // fwh + const int idx_c = idx / fwh; // (fc + 3) / 4; + uchar ec = ((idx_c + 1) * 4 <= fc) ? 4 : (fc % 4); + uchar ek = ((idy + 1) * K <= fn) ? K : (fn % K); + + T16 val = 0; + int flt_off = idy * fwhc * 4 + idx_c * fwh * 4 + idx_wh; + val.s0 = fltdata[flt_off]; + if (ec > 1) { + val.s4 = fltdata[flt_off + fwh]; + } + if (ec > 2) { + val.s8 = fltdata[flt_off + fwh * 2]; + } + if (ec > 3) { + val.sc = fltdata[flt_off + fwh * 3]; + } + + if (ek > 1) { + flt_off += fwhc; + val.s1 = fltdata[flt_off]; + if (ec > 1) { + val.s5 = fltdata[flt_off + fwh]; + } + if (ec > 2) { + val.s9 = fltdata[flt_off + fwh * 2]; + } + if (ec > 3) { + val.sd = fltdata[flt_off + fwh * 3]; + } + } + + if (ek > 2) { + flt_off += fwhc; + val.s2 = fltdata[flt_off]; + if (ec > 1) { + val.s6 = fltdata[flt_off + fwh]; + } + if (ec > 2) { + val.sa = fltdata[flt_off + fwh * 2]; + } + if (ec > 3) { + val.se = fltdata[flt_off + fwh * 3]; + } + } + + if (ek > 3) { + flt_off += fwhc; + val.s3 = fltdata[flt_off]; + if (ec > 1) { + val.s7 = fltdata[flt_off + fwh]; + } + if (ec > 2) { + val.sb = fltdata[flt_off + fwh * 2]; + } + if (ec > 3) { + val.sf = fltdata[flt_off + fwh * 3]; + } + } + + /*C = 1 C = 2 C = 4*/ + const int idx_w = idx_wh % fw; + const int idx_h = idx_wh / fw; + const int idx_tran = idx_c * fwh + idx_w * fw + idx_h; + int out_off = (idx_tran / C) * ((fn + 3) >> 2) * C + idy * C + (idx_tran % C); + vstore16(val, out_off, fltbuf); +} diff --git a/compute/tensor/src/gpu/mali/cl/depth2space.cl b/compute/tensor/src/gpu/mali/cl/depth2space.cl new file mode 100644 index 00000000..1903add6 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/depth2space.cl @@ -0,0 +1,33 @@ +__kernel void depth2space(const int iw, + const int ih, + const int iw_str, + const int ih_str, + const int iw_off, + const int ih_off, + const int ow_str, + const int oh_str, + const int ow_off, + const int oh_off, + __global const T *in, + __global uchar *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + if (idx >= ih || idy >= (iw << 2)) { + return; + } + const int ix = idx; + const int iy = idy % iw; + const int iz = idy / iw; + + const int in_off = (iz * iw_str + iy + iw_off) * ih_str + ix + ih_off; + T4 tmp = vload4(in_off, in); + uchar4 val; + val.x = tmp.x * 255.0; + val.y = tmp.y * 255.0; + val.z = tmp.z * 255.0; + val.w = tmp.w * 255.0; + + const int out_off = ((ix << 2) + iz + oh_off) * ow_str + (iy << 2) + ow_off; + vstore4(val, 0, out + out_off); +} diff --git a/compute/tensor/src/gpu/mali/cl/depth2space_nchw.cl b/compute/tensor/src/gpu/mali/cl/depth2space_nchw.cl new file mode 100644 index 00000000..3bd3dea2 --- 
/dev/null +++ b/compute/tensor/src/gpu/mali/cl/depth2space_nchw.cl @@ -0,0 +1,46 @@ +__kernel void depth2space_nchw(const int blockSize, + const int iw_str, + const int iwh_str, + const int iw_off, + const int ih_off, + const int oh_str, + const int ohw_str, + const int oh_off, + const int ow_off, + const int iw, + const int ih, + const int ic, + __global const T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + if (idx >= iw || idy >= ih) { + return; + } + const int idz = get_global_id(2); + const int bs2 = blockSize * blockSize; + const int z_group = idz / bs2; + const int z_group_lane = idz % bs2; + const int z_group_lane_x = z_group_lane % blockSize; + const int z_group_lane_y = z_group_lane / blockSize; + + const int z_off = z_group * (bs2 << 2) + z_group_lane; + int in_off = z_off * iwh_str + (idy + ih_off) * iw_str + idx + iw_off; + T4 val = 0; + val.x = in[in_off]; + if (z_off + bs2 < ic) { + val.y = in[in_off + bs2 * iwh_str]; + } + if (z_off + bs2 * 2 < ic) { + val.z = in[in_off + bs2 * 2 * iwh_str]; + } + if (z_off + bs2 * 3 < ic) { + val.w = in[in_off + bs2 * 3 * iwh_str]; + } + + int out_off = idy * blockSize + z_group_lane_y + oh_off; + out_off += (idx * blockSize + z_group_lane_x + ow_off) * oh_str; + out_off += z_group * ohw_str; + vstore4(val, out_off, out); +} diff --git a/compute/tensor/src/gpu/mali/cl/depth2space_ncwhc4_2x2.cl b/compute/tensor/src/gpu/mali/cl/depth2space_ncwhc4_2x2.cl new file mode 100644 index 00000000..cd0ff9d2 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/depth2space_ncwhc4_2x2.cl @@ -0,0 +1,62 @@ +__kernel void depth2space_ncwhc4_2x2(const int blockSize, + const int ih_str, + const int ihw_str, + const int ic_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ohw_str, + const int oh_off, + const int ow_off, + const int ih, + const int iw, + __global const T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + if (idx >= ih || idy >= iw) { + return; + } + const int idz = get_global_id(2); + const int in_off = idz * 4 * ihw_str + (idy + iw_off) * ih_str + idx + ih_off; + T4 val[4] = {0}; + T4 val_0, val_1, val_2, val_3; + + val[0] = vload4(in_off, in); + if (idz * 4 + 1 < ic_str) { + val[1] = vload4(in_off + ihw_str, in); + } + if (idz * 4 + 2 < ic_str) { + val[2] = vload4(in_off + ihw_str * 2, in); + } + if (idz * 4 + 3 < ic_str) { + val[3] = vload4(in_off + ihw_str * 3, in); + } + + val_0.x = val[0].x; + val_1.x = val[0].y; + val_2.x = val[0].z; + val_3.x = val[0].w; + + val_0.y = val[1].x; + val_1.y = val[1].y; + val_2.y = val[1].z; + val_3.y = val[1].w; + + val_0.z = val[2].x; + val_1.z = val[2].y; + val_2.z = val[2].z; + val_3.z = val[2].w; + + val_0.w = val[3].x; + val_1.w = val[3].y; + val_2.w = val[3].z; + val_3.w = val[3].w; + + const int out_off = idz * ohw_str + ((idy << 1) + ow_off) * oh_str + (idx << 1) + oh_off; + vstore4(val_0, out_off, out); + vstore4(val_2, out_off + 1, out); + vstore4(val_1, out_off + oh_str, out); + vstore4(val_3, out_off + oh_str + 1, out); +} diff --git a/compute/tensor/src/gpu/mali/cl/eltwise.cl b/compute/tensor/src/gpu/mali/cl/eltwise.cl new file mode 100644 index 00000000..a4c418f7 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/eltwise.cl @@ -0,0 +1,175 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, TP, N) base##TP##N +#define MANGLE_NAME(base, TP, N) MANGLE_NAME_IMPL(base, TP, N) + +#if defined(USE_SUM) +#define calCore(v, res) \ + { \ + res.s0 += v.s0; \ + res.s1 += v.s1; \ + res.s2 += v.s2; \ + res.s3 += v.s3; \ + } +#endif + +#if defined(USE_MAX) +#define calCore(v, res) \ + { \ + res = fmax(res, v); \ + } +#endif + +#if defined(USE_PROD) +#define calCore(v, res) \ + { \ + res.s0 *= v.s0; \ + res.s1 *= v.s1; \ + res.s2 *= v.s2; \ + res.s3 *= v.s3; \ + } +#endif + +#if defined(USE_NCHW) +#define LOAD_VAL(ew, idx, idy, idz, ih_str, iw_str, ih_off, iw_off, buf, val) \ + { \ + int off = (idz * ih_str + idy + ih_off) * iw_str + (idx << 2) + iw_off; \ + val = 0; \ + if (ew == 4) { \ + val = vload4(0, buf + off); \ + } else { \ + if (ew == 1) \ + val.x = buf[off]; \ + if (ew == 2) { \ + T2 tmp = vload2(0, buf + off); \ + val.x = tmp.x; \ + val.y = tmp.y; \ + } \ + if (ew == 3) { \ + T3 tmp = vload3(0, buf + off); \ + val.x = tmp.x; \ + val.y = tmp.y; \ + val.z = tmp.z; \ + } \ + } \ + } +#define STORE_VAL(ew, idx, idy, idz, oh_str, ow_str, oh_off, ow_off, buf, val) \ + { \ + int off = (idz * oh_str + idy + oh_off) * ow_str + (idx << 2) + ow_off; \ + if (ew == 4) { \ + vstore4(val, 0, buf + off); \ + } else { \ + if (ew == 1) \ + buf[off] = val.x; \ + if (ew == 2) { \ + vstore2((T2)(val.x, val.y), 0, buf + off); \ + } \ + if (ew == 3) { \ + vstore3((T3)(val.x, val.y, val.z), 0, buf + off); \ + } \ + } \ + } +#else +#define LOAD_VAL(ew, idx, idy, idz, ih_str, iw_str, ih_off, iw_off, buf, val) \ + { \ + int off = (idz * iw_str + idy + iw_off) * ih_str + idx + ih_off; \ + val = vload4(off, buf); \ + } +#define STORE_VAL(ew, idx, idy, idz, oh_str, ow_str, oh_off, ow_off, buf, val) \ + { \ + int off = (idz * ow_str + idy + ow_off) * oh_str + idx + oh_off; \ + vstore4(val, off, buf); \ + } +#endif + +#if (USE_NCHW) +#if (USE_RELU) +__kernel void MANGLE_NAME(eltwise_nchw_relu_, TP, N) +#else +__kernel void MANGLE_NAME(eltwise_nchw_, TP, N) +#endif +#else +#if (USE_RELU) +__kernel void MANGLE_NAME(eltwise_relu_, TP, N) +#else +__kernel void MANGLE_NAME(eltwise_, TP, N) +#endif +#endif + (const int h, + const int w, + const int c, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const int bx, + const int by, + const int ih0_str, + const int iw0_str, + const int ih0_off, + const int iw0_off, + __global const T *in0, +#if (N > 1) + const int ih1_str, + const int iw1_str, 
+ const int ih1_off, + const int iw1_off, + __global const T *in1, +#endif +#if (N > 2) + const int ih2_str, + const int iw2_str, + const int ih2_off, + const int iw2_off, + __global const T *in2, +#endif +#if (N > 3) + const int ih3_str, + const int iw3_str, + const int ih3_off, + const int iw3_off, + __global const T *in3, +#endif + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } + char ew = 0; +#if defined(USE_NCHW) + ew = ((idx << 2) + 4 <= w) ? 4 : (w & 3); +#endif + + T4 val; + T4 res; + LOAD_VAL(ew, idx, idy, idz, ih0_str, iw0_str, ih0_off, iw0_off, in0, res); +#if (N > 1) + LOAD_VAL(ew, idx, idy, idz, ih1_str, iw1_str, ih1_off, iw1_off, in1, val); + calCore(val, res); +#endif +#if (N > 2) + LOAD_VAL(ew, idx, idy, idz, ih2_str, iw2_str, ih2_off, iw2_off, in2, val); + calCore(val, res); +#endif +#if (N > 3) + LOAD_VAL(ew, idx, idy, idz, ih3_str, iw3_str, ih3_off, iw3_off, in3, val); + calCore(val, res); +#endif + ACTIVATION_V4(res); + STORE_VAL(ew, idx, idy, idz, oh_str, ow_str, oh_off, ow_off, out, res); +} diff --git a/compute/tensor/src/gpu/mali/cl/eltwise_broadcast.cl b/compute/tensor/src/gpu/mali/cl/eltwise_broadcast.cl new file mode 100644 index 00000000..993d2bb0 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/eltwise_broadcast.cl @@ -0,0 +1,89 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
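// Note on the eltwise kernel above -- the NCHW branch moves 4 elements per
// work-item and trims the tail with ew (consistent with the <= test used by
// copy.cl and fill_memory_zero_vec4.cl). With a hypothetical width w = 10 and
// the host assumed to launch bx = (w + 3) >> 2 = 3 work-items per row:
//   idx = 0, 1: (idx << 2) + 4 <= 10, so ew = 4 (full vload4/vstore4);
//   idx = 2:    ew = 10 & 3 = 2, so only the .xy lanes are loaded and stored.
// The NCWHC4 branch leaves ew at 0 and always moves whole T4 vectors, since
// channels are padded to multiples of 4 in that layout.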
+ +#define MANGLE_NAME_IMPL(base, TP, N) base##TP##N +#define MANGLE_NAME(base, TP, N) MANGLE_NAME_IMPL(base, TP, N) + +#if defined(USE_SUM) +#define calCore(in, off, v, res) \ + { \ + v = vload4(off, in); \ + res.s0 += v.s0; \ + res.s1 += v.s1; \ + res.s2 += v.s2; \ + res.s3 += v.s3; \ + } +#endif + +#if defined(USE_MAX) +#define calCore(in, off, v, res) \ + { \ + v = vload4(off, in); \ + res = fmax(res, v); \ + } +#endif + +#if defined(USE_PROD) +#define calCore(in, off, v, res) \ + { \ + v = vload4(off, in); \ + res.s0 *= v.s0; \ + res.s1 *= v.s1; \ + res.s2 *= v.s2; \ + res.s3 *= v.s3; \ + } +#endif + +__kernel void MANGLE_NAME(eltwise_broadcast_, TP, N)(const int h, + const int w, + const int c, + const int ih_str, + const int iw_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + __global const T *in0, + __global const T *in1, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= h || idy >= w) { + return; + } + + T4 val; + T4 res; + const int in_off_res = (idz * iw_str + idy + iw_off) * ih_str + idx + ih_off; + // c = h = w = 1 have bugs to fix +#if (N == 0) + const int in_off_val = 0; + // h = w = 1 +#elif (N == 1) + const int in_off_val = idz; + // h = 1 +#elif (N == 2) + const int in_off_val = idz * iw_str + idy + iw_str; + // w = 1 +#elif (N == 3) + const int in_off_val = idz * ih_str + idx + ih_str; +#endif + res = vload4(in_off_res, in0); + calCore(in1, in_off_val, val, res); + const int out_off = (idz * ow_str + idy + ow_off) * oh_str + idx + oh_off; + vstore4(res, out_off, out); +} diff --git a/compute/tensor/src/gpu/mali/cl/eltwise_spe_nchw_c.cl b/compute/tensor/src/gpu/mali/cl/eltwise_spe_nchw_c.cl new file mode 100644 index 00000000..0992daa7 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/eltwise_spe_nchw_c.cl @@ -0,0 +1,74 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
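// Note on eltwise_broadcast above -- the compile-time N encodes which dims of
// in1 collapse to 1, per its inline comments: N == 0 -> c = h = w = 1 (still
// flagged "bugs to fix" upstream), N == 1 -> h = w = 1 (one T4 per channel
// block, offset idz), N == 2 -> h = 1, N == 3 -> w = 1. For example, summing a
// per-channel vector of shape (c, 1, 1) into an NCWHC4 tensor would build this
// kernel with (assumed) build options -D N=1 -D USE_SUM.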
+ +#define MANGLE_NAME_IMPL(base, TP) base##TP +#define MANGLE_NAME(base, TP) MANGLE_NAME_IMPL(base, TP) + +#if defined(USE_SUM) +#define calCore(v, res) \ + { \ + res.s0 += v.s0; \ + res.s1 += v.s1; \ + res.s2 += v.s2; \ + res.s3 += v.s3; \ + } +#endif + +#if defined(USE_MAX) +#define calCore(v, res) \ + { \ + res = fmax(res, v); \ + } +#endif + +#if defined(USE_PROD) +#define calCore(v, res) \ + { \ + res.s0 *= v.s0; \ + res.s1 *= v.s1; \ + res.s2 *= v.s2; \ + res.s3 *= v.s3; \ + } +#endif + +__kernel void MANGLE_NAME(eltwise_spe_nchw_c_, TP)(const int h, + const int w, + const int c, + const int ih_str, + const int iw_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + __global const T *in, + __global const T *ine, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= h || idy >= w) { + return; + } + + T4 val; + T4 res; + const int in_off = (idz * iw_str + idy + iw_off) * ih_str + idx + ih_off; + res = vload4(in_off, in); + val = vload4(idz, ine); + calCore(val, res); + const int out_off = (idz * ow_str + idy + ow_off) * oh_str + idx + oh_off; + vstore4(res, out_off, out); +} diff --git a/tensor_computing/src/gpu/mali/cl/embedding.cl b/compute/tensor/src/gpu/mali/cl/embedding.cl similarity index 77% rename from tensor_computing/src/gpu/mali/cl/embedding.cl rename to compute/tensor/src/gpu/mali/cl/embedding.cl index 8f302066..9ae2db32 100644 --- a/tensor_computing/src/gpu/mali/cl/embedding.cl +++ b/compute/tensor/src/gpu/mali/cl/embedding.cl @@ -11,35 +11,43 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - -__kernel void embedding(const int step, const int on, const int on_d4, const int oh_str, const int oh_off, const int ow_off, __global const unsigned int* input, __global const T* weight, __global T* output) { - +__kernel void embedding(const int step, + const int on, + const int on_d4, + const int oh_str, + const int oh_off, + const int ow_off, + __global const unsigned int *input, + __global const T *weight, + __global T *output) +{ int idx = get_global_id(0); int idy = get_global_id(1); - if(idx >= on_d4 || idy >= step) return; + if (idx >= on_d4 || idy >= step) { + return; + } T4 val = 0; unsigned int index = input[idy]; const int wei_off = index * on + (idx << 2); uchar rn = ((idx << 2) + 4 <= on) ? 
0 : (on & 3); - if(rn == 0) { + if (rn == 0) { val = vload4(0, weight + wei_off); } else { - if(rn == 1) val.x = weight[wei_off]; - if(rn == 2) { + if (rn == 1) { + val.x = weight[wei_off]; + } + if (rn == 2) { T2 tmp = vload2(0, weight + wei_off); val.x = tmp.x; val.y = tmp.y; } - if(rn == 3) { + if (rn == 3) { T3 tmp = vload3(0, weight + wei_off); val.x = tmp.x; val.y = tmp.y; val.z = tmp.z; } } - const int out_off = (idx + ow_off) * oh_str + idy + oh_off; + const int out_off = (idx + ow_off) * oh_str + idy + oh_off; vstore4(val, out_off, output); } diff --git a/tensor_computing/src/gpu/mali/cl/fc_p1.cl b/compute/tensor/src/gpu/mali/cl/fc_p1.cl similarity index 55% rename from tensor_computing/src/gpu/mali/cl/fc_p1.cl rename to compute/tensor/src/gpu/mali/cl/fc_p1.cl index c2c7f565..8b6cf12f 100644 --- a/tensor_computing/src/gpu/mali/cl/fc_p1.cl +++ b/compute/tensor/src/gpu/mali/cl/fc_p1.cl @@ -11,41 +11,56 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - -#define calCore(iv, fv, res){\ - res.x += iv.x * fv.s0 + iv.y * fv.s1 + iv.z * fv.s2 + iv.w * fv.s3;\ - res.y += iv.x * fv.s4 + iv.y * fv.s5 + iv.z * fv.s6 + iv.w * fv.s7;\ - res.z += iv.x * fv.s8 + iv.y * fv.s9 + iv.z * fv.sa + iv.w * fv.sb;\ - res.w += iv.x * fv.sc + iv.y * fv.sd + iv.z * fv.se + iv.w * fv.sf;\ -} -__kernel void fc_p1(const int item_y, const int ih_str, const int iw_str, const int ih_off, const int iw_off, const int ihy_str, const int ihw_str, - const int fh, const int fw, const int fc, const int fn, const int fhy_str, const int fhw_str, const int fwc_str, __global const T* flt, __global const T* in, __global T* out){ +#define calCore(iv, fv, res) \ + { \ + res.x += iv.x * fv.s0 + iv.y * fv.s1 + iv.z * fv.s2 + iv.w * fv.s3; \ + res.y += iv.x * fv.s4 + iv.y * fv.s5 + iv.z * fv.s6 + iv.w * fv.s7; \ + res.z += iv.x * fv.s8 + iv.y * fv.s9 + iv.z * fv.sa + iv.w * fv.sb; \ + res.w += iv.x * fv.sc + iv.y * fv.sd + iv.z * fv.se + iv.w * fv.sf; \ + } +__kernel void fc_p1(const int item_y, + const int ih_str, + const int iw_str, + const int ih_off, + const int iw_off, + const int ihy_str, + const int ihw_str, + const int fh, + const int fw, + const int fc, + const int fn, + const int fhy_str, + const int fhw_str, + const int fwc_str, + __global const T *flt, + __global const T *in, + __global T *out) +{ const int idx = get_global_id(0); const int idy = get_global_id(1); const int idz = get_global_id(2); - if(idx >= fh || idy >= item_y) return; + if (idx >= fh || idy >= item_y) { + return; + } - T4 in_val; + T4 in_val; T16 flt_val; - T4 sum = 0; - int in_off = (idy + iw_off) * ih_str + idx + ih_off; + T4 sum = 0; + int in_off = (idy + iw_off) * ih_str + idx + ih_off; int flt_off = (idz * fwc_str + idy) * fh + idx; - for(int i = 0; i < fc; i++){ + for (int i = 0; i < fc; i++) { int k = 0; - for(int j = idy; j < fw; j += item_y){ - in_val = vload4 (in_off + k * ihy_str, in); + for (int j = idy; j < fw; j += item_y) { + in_val = vload4(in_off + k * ihy_str, in); flt_val = vload16(flt_off + k * fhy_str, flt); calCore(in_val, flt_val, sum); k++; } - in_off += ihw_str; + in_off += ihw_str; flt_off += fhw_str; } - const int out_off = (idy * fh + idx) * fn + idz; + const int out_off = (idy * fh + idx) * fn + idz; vstore4(sum, out_off, out); } diff --git a/tensor_computing/src/gpu/mali/cl/fc_p2.cl b/compute/tensor/src/gpu/mali/cl/fc_p2.cl 
similarity index 59% rename from tensor_computing/src/gpu/mali/cl/fc_p2.cl rename to compute/tensor/src/gpu/mali/cl/fc_p2.cl index cc5944d1..e79209e7 100644 --- a/tensor_computing/src/gpu/mali/cl/fc_p2.cl +++ b/compute/tensor/src/gpu/mali/cl/fc_p2.cl @@ -11,33 +11,40 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - #if defined(USE_HALF) -#define READ_IMAGE(image, sampler, coord) read_imageh(image, sampler, coord) -#define WRITE_IMAGE(image, coord, data) write_imageh(image, coord, data) +#define READ_IMAGE(image, sampler, coord) read_imageh(image, sampler, coord) +#define WRITE_IMAGE(image, coord, data) write_imageh(image, coord, data) #else -#define READ_IMAGE(image, sampler, coord) read_imagef(image, sampler, coord) -#define WRITE_IMAGE(image, coord, data) write_imagef(image, coord, data) +#define READ_IMAGE(image, sampler, coord) read_imagef(image, sampler, coord) +#define WRITE_IMAGE(image, coord, data) write_imagef(image, coord, data) #endif __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; -#define calCore(iv, fv, res) {\ - res.x += iv.x * fv.s0 + iv.y * fv.s1 + iv.z * fv.s2 + iv.w * fv.s3;\ - res.y += iv.x * fv.s4 + iv.y * fv.s5 + iv.z * fv.s6 + iv.w * fv.s7;\ - res.z += iv.x * fv.s8 + iv.y * fv.s9 + iv.z * fv.sa + iv.w * fv.sb;\ - res.w += iv.x * fv.sc + iv.y * fv.sd + iv.z * fv.se + iv.w * fv.sf;\ -} -__kernel void fc_p2(const int loop, const int len, const int oh_str, const int ow_str, const int oh_off, const int ow_off, - __global const T* in, __global const T* bias, __global T* out) { +#define calCore(iv, fv, res) \ + { \ + res.x += iv.x * fv.s0 + iv.y * fv.s1 + iv.z * fv.s2 + iv.w * fv.s3; \ + res.y += iv.x * fv.s4 + iv.y * fv.s5 + iv.z * fv.s6 + iv.w * fv.s7; \ + res.z += iv.x * fv.s8 + iv.y * fv.s9 + iv.z * fv.sa + iv.w * fv.sb; \ + res.w += iv.x * fv.sc + iv.y * fv.sd + iv.z * fv.se + iv.w * fv.sf; \ + } +__kernel void fc_p2(const int loop, + const int len, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + __global const T *in, + __global const T *bias, + __global T *out) +{ const int idx = get_global_id(0); - if(idx >= len) return; + if (idx >= len) { + return; + } T4 sum = vload4(idx, bias); T4 val; - for(int i = 0; i < loop; i++) { + for (int i = 0; i < loop; i++) { val = vload4(idx + i * len, in); sum.x += val.x; sum.y += val.y; diff --git a/compute/tensor/src/gpu/mali/cl/fc_trans_fltbuf.cl b/compute/tensor/src/gpu/mali/cl/fc_trans_fltbuf.cl new file mode 100644 index 00000000..2d996aef --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/fc_trans_fltbuf.cl @@ -0,0 +1,100 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#define MANGLE_NAME_IMPL(base, C, K) base##C##K +#define MANGLE_NAME(base, C, K) MANGLE_NAME_IMPL(base, C, K) +#if (C == 4) +#define loadFltval(off, str, flt, val) \ + { \ + val.x = flt[off]; \ + val.y = flt[off + str]; \ + val.z = flt[off + (str << 1)]; \ + val.w = flt[off + str * 3]; \ + } + +#define loadFltvalEdge(off, str, flt, val, edge) \ + { \ + val.x = flt[off]; \ + if (edge > 1) \ + val.y = flt[off + str]; \ + if (edge > 2) \ + val.z = flt[off + (str << 1)]; \ + } +#endif + +#if (C == 8) +#define loadFltval(off, str, flt, val) \ + { \ + val.s0 = flt[off]; \ + val.s1 = flt[off + str]; \ + val.s2 = flt[off + (str << 1)]; \ + val.s3 = flt[off + str * 3]; \ + val.s4 = flt[off + (str << 2)]; \ + val.s5 = flt[off + str * 5]; \ + val.s6 = flt[off + str * 6]; \ + val.s7 = flt[off + str * 7]; \ + } +#define loadFltvalEdge(off, str, flt, val, edge) \ + { \ + val.s0 = flt[off]; \ + if (edge > 1) \ + val.s1 = flt[off + str]; \ + if (edge > 2) \ + val.s2 = flt[off + (str << 1)]; \ + if (edge > 3) \ + val.s3 = flt[off + str * 3]; \ + if (edge > 4) \ + val.s4 = flt[off + (str << 2)]; \ + if (edge > 5) \ + val.s5 = flt[off + str * 5]; \ + if (edge > 6) \ + val.s6 = flt[off + str * 6]; \ + } +#endif + +__kernel void MANGLE_NAME(fc_trans_fltbuf_, C, K)(const int fw, + const int fh, + const int fwh, + const int fc, + const int fn, + __global const T *fltdata, + __global T *fltbuf) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + const int bc = (fc + C - 1) / C; + const int idc = idz % bc; + const int idn = idz / bc; + short ec = ((idc + 1) * C <= fc) ? C : (fc % C); + + const int flt_off = ((idn * fc + idc * C) * fh + idy) * fw + idx; +#if (C == 4) + T4 val = 0; +#elif (C == 8) + T8 val = 0; +#endif + if (idn < fn) { + if (ec == C) { + loadFltval(flt_off, fwh, fltdata, val); + } else { + loadFltvalEdge(flt_off, fwh, fltdata, val, ec); + } + } + const int out_off = ((idn / K * bc + idc) * fh + idx) * fw * K + idy * K + (idn % K); +#if (C == 4) + vstore4(val, out_off, fltbuf); +#elif (C == 8) + vstore8(val, out_off, fltbuf); +#endif +} diff --git a/uni/include/sys.h b/compute/tensor/src/gpu/mali/cl/fill_memory_zero.cl similarity index 67% rename from uni/include/sys.h rename to compute/tensor/src/gpu/mali/cl/fill_memory_zero.cl index 3f376930..1d032e00 100644 --- a/uni/include/sys.h +++ b/compute/tensor/src/gpu/mali/cl/fill_memory_zero.cl @@ -1,35 +1,23 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#ifndef _H_SYS -#define _H_SYS - -#ifdef __cplusplus -extern "C" { -#endif - - typedef enum { - CPU_GENERAL = 1, - MALI = 2, - ARM_V7 = 3, - ARM_V8 = 4, - ARM_A55 = 5, - ARM_A76 = 6, - } Arch; - -#ifdef __cplusplus +#define MANGLE_NAME_IMPL(base, DT) base##DT +#define MANGLE_NAME(base, DT) MANGLE_NAME_IMPL(base, DT) +__kernel void MANGLE_NAME(fill_memory_zero_, DT)(const int len, __global T *data) +{ + int idx = get_global_id(0); + if (idx >= len) { + return; + } + data[idx] = 0; } -#endif - -#endif diff --git a/compute/tensor/src/gpu/mali/cl/fill_memory_zero_vec4.cl b/compute/tensor/src/gpu/mali/cl/fill_memory_zero_vec4.cl new file mode 100644 index 00000000..e0f25587 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/fill_memory_zero_vec4.cl @@ -0,0 +1,38 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#define MANGLE_NAME_IMPL(base, DT) base##DT +#define MANGLE_NAME(base, DT) MANGLE_NAME_IMPL(base, DT) +__kernel void MANGLE_NAME(fill_memory_zero_vec4_, DT)( + const int len, const int offset, const int bx, __global T *data) +{ + int idx = get_global_id(0); + if (idx >= bx) { + return; + } + char el = ((idx << 2) + 4 <= len) ? 4 : (len & 3); + const int off = offset + (idx << 2); + if (el == 4) { + vstore4((T4)0, 0, data + off); + } else { + if (el == 1) { + data[off] = 0; + } + if (el == 2) { + vstore2((T2)0, 0, data + off); + } + if (el == 3) { + vstore3((T3)0, 0, data + off); + } + } +} diff --git a/tensor_computing/src/gpu/mali/cl/gemm_nt.cl b/compute/tensor/src/gpu/mali/cl/gemm_nt.cl similarity index 56% rename from tensor_computing/src/gpu/mali/cl/gemm_nt.cl rename to compute/tensor/src/gpu/mali/cl/gemm_nt.cl index 08c2c5e7..66b9964d 100644 --- a/tensor_computing/src/gpu/mali/cl/gemm_nt.cl +++ b/compute/tensor/src/gpu/mali/cl/gemm_nt.cl @@ -11,62 +11,86 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - -#include"kernel_def.h" -#define MANGLE_NAME_LMPL(base, LM, LN, LK) base ## LM ## LN ## LK +#include "kernel_def.h" +#define MANGLE_NAME_LMPL(base, LM, LN, LK) base##LM##LN##LK #define MANGLE_NAME(base, LM, LN, LK) MANGLE_NAME_LMPL(base, LM, LN, LK) #if defined(NO_BIAS) -__kernel void MANGLE_NAME(gemm_nt_nobias_, LM, LN, LK) +__kernel void MANGLE_NAME(gemm_nt_nobias_, LM, LN, LK)(const int KA, + const int KB, + const int K, + const int ow_str, + const int A_str, + const int B_str, + const int C_str, + const int A_off, + const int B_off, + const int ow, + const int oh, + const int bx, + const int by, + __global const T *A, + __global const T *B, + __global T *C) #else -__kernel void MANGLE_NAME(gemm_nt_, LM, LN, LK) -#endif -#if defined(NO_BIAS) -(const int KA, const int KB, const int K, const int ow_str, const int A_str, const int B_str, const int C_str, const int bx, const int by, __global const T* A, __global const T* B, __global T* C) -#else -(const int KA, const int KB, const int K, const int ow_str, const int A_str, const int B_str, const int C_str, const int bx, const int by, __global const T* A, __global const T* B, __global const T* bias, __global T* C) +__kernel void MANGLE_NAME(gemm_nt_, LM, LN, LK)(const int KA, + const int KB, + const int K, + const int ow_str, + const int A_str, + const int B_str, + const int C_str, + const int A_off, + const int B_off, + const int ow, + const int oh, + const int bx, + const int by, + __global const T *A, + __global const T *B, + __global const T *bias, + __global T *C) #endif { - const int idx = get_global_id(0); + const int idx = get_global_id(0); const int idy = get_global_id(1); const int idz = get_global_id(2); - if(idx >= bx || idy >= by) return; - const int ix = idx * LN; - const int iy = idy * LM; - const int L = K >> LK; + if (idx >= bx || idy >= by) { + return; + } + const int ix = idx * LN; + const int iy = idy * LM; + const int L = K >> LK; const int VN = 1 << LK; T c[LM][LN]; -#if(LK == 0) +#if (LK == 0) T a[LM]; T b[LN]; -#elif(LK == 1) +#elif (LK == 1) T2 a[LM]; T2 
b[LN]; -#elif(LK == 2) +#elif (LK == 2) T4 a[LM]; T4 b[LN]; -#elif(LK == 3) +#elif (LK == 3) T8 a[LM]; T8 b[LN]; -#elif(LK == 4) +#elif (LK == 4) T16 a[LM]; T16 b[LN]; #endif #if defined(NO_BIAS) - GEMM_SET_C_ZERO(c); + GEMM_SET_C_ZERO(c); #else GEMM_LOAD_A(a, iy, bias); GEMM_SET_C_BIAS(a, c); -#endif +#endif - int a_off = iy * KA + idz * A_str; - int b_off = ix * KB + idz * B_str; - for(int i = 0; i < L; ++i) { + int a_off = iy * KA + idz * A_str + A_off; + int b_off = ix * KB + idz * B_str + B_off; + for (int i = 0; i < L; ++i) { GEMM_NT_LOAD_A(a, a_off, KA, A); GEMM_NT_LOAD_B(b, b_off, KB, B); GEMM_CALCORE(a, b, c); @@ -74,5 +98,13 @@ __kernel void MANGLE_NAME(gemm_nt_, LM, LN, LK) b_off += VN; } int c_off = iy * ow_str + ix + idz * C_str; + int ex = ix + LN - ow; + int ey = iy + LM - oh; + if (ex > 0) { + GEMM_SET_C_EDGE_ZERO_W(c, ex); + } + if (ey > 0) { + GEMM_SET_C_EDGE_ZERO_H(c, ey); + } GEMM_STORE_C(c, c_off, ow_str, C); } diff --git a/compute/tensor/src/gpu/mali/cl/gemm_tn.cl b/compute/tensor/src/gpu/mali/cl/gemm_tn.cl new file mode 100644 index 00000000..60f5b2fa --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/gemm_tn.cl @@ -0,0 +1,269 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
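// Note on the gemm kernels -- each work-item computes an LM x LN tile of C.
// With a hypothetical specialization LM = 4, LN = 8, LK = 2, gemm_nt mangles to
// gemm_nt_482: a[4] and b[8] become T4 registers, the K loop runs K >> LK times
// consuming VN = 1 << LK = 4 elements per step, and tile rows/columns that
// overrun (oh, ow) are zeroed via GEMM_SET_C_EDGE_ZERO_H/W before GEMM_STORE_C.
// The gemm_tn variants below advance A by M and B by N per k step (both
// operands read K-major), so they take no LK parameter.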
+ +#include "kernel_def.h" +#define MANGLE_NAME_LMPL(base, LM, LN) base##LM##LN +#define MANGLE_NAME(base, LM, LN) MANGLE_NAME_LMPL(base, LM, LN) + +#if defined(USE_NCWHC4) +#if defined(USE_RELU) +__kernel void MANGLE_NAME(gemm_tn_relu_ncwhc4_, LM, LN) +#elif defined(USE_GELU) +__kernel void MANGLE_NAME(gemm_tn_gelu_ncwhc4_, LM, LN) +#elif defined(USE_ELTWISE_NCHW) +__kernel void MANGLE_NAME(gemm_tn_eltwise1_ncwhc4_, LM, LN) +#elif defined(USE_ELTWISE_NCWHC4) +__kernel void MANGLE_NAME(gemm_tn_eltwise4_ncwhc4_, LM, LN) +#else +__kernel void MANGLE_NAME(gemm_tn_ncwhc4_, LM, LN) +#endif + (const int M, + const int N, + const int K, + const int oh, + const int ow, + const int oc, + const int oh_str, + const int ow_str, + const int ohw_str, + const int oh_off, + const int ow_off, + const int bx, + const int by, + __global const T *A, + __global const T *B, + __global const T *bias, + __global T *C +#if defined(USE_ELTWISE_NCHW) + , + const int ew_str, + const int ew_off, + const int eh_off, + __global const T *eltVal +#endif +#if defined(USE_ELTWISE_NCWHC4) + , + const int eh_str, + const int ew_str, + const int ehw_str, + const int eh_off, + const int ew_off, + __global const T *eltVal +#endif + ) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + if (idx >= bx || idy >= by) { + return; + } + const int ix = idx * LN; + const int iy = idy * LM; + + T a[LM]; + T b[LN]; + T c[LM][LN]; + int a_off = iy; + int b_off = ix; + GEMM_LOAD_A(a, iy, bias); + GEMM_SET_C_BIAS(a, c); +#if defined(USE_ELTWISE_NCHW) + int c_off = (iy + eh_off) * ew_str + ix + ew_off; + ADD_ELTWISE_NCHW(c, c_off, ew_str, eltVal); +#endif + + for (int i = 0; i < K; ++i) { + GEMM_LOAD_A(a, a_off, A); + GEMM_LOAD_B(b, b_off, B); + GEMM_CALCORE(a, b, c); + a_off += M; + b_off += N; + } + + /*LM = 4 or LM = 8*/ + int c_base = (iy >> 2) * ohw_str; +#if defined(USE_ELTWISE_NCWHC4) + int e_base = (iy >> 2) * ehw_str; +#endif + for (uchar i = 0; i < LN; ++i) { + int oxh = (ix + i) % oh; + int oxw = (ix + i) / oh; + if (oxw >= ow) { + break; + } + int c_off = c_base + (oxw + ow_off) * oh_str + oxh + oh_off; + T4 tmp; +#if defined(USE_ELTWISE_NCWHC4) + int e_off = e_base + (oxw + ew_off) * eh_str + oxh + eh_off; + tmp = vload4(e_off, eltVal); + tmp.x += c[0][0]; + tmp.y += c[1][0]; + tmp.z += c[2][0]; + tmp.w += c[3][0]; +#else + tmp.x = c[0][0]; + tmp.y = c[1][0]; + tmp.z = c[2][0]; + tmp.w = c[3][0]; + ACTIVATION_V4(tmp); +#endif + vstore4(tmp, c_off, C); + UPDATE_REG(c[0]); + UPDATE_REG(c[1]); + UPDATE_REG(c[2]); + UPDATE_REG(c[3]); +#if (LM == 8) + if (iy + 4 >= oc) { + continue; + } + c_off += ohw_str; +#if defined(USE_ELTWISE_NCWHC4) + e_off += ohw_str; + tmp = vload4(e_off, eltVal); + tmp.x += c[4][0]; + tmp.y += c[5][0]; + tmp.z += c[6][0]; + tmp.w += c[7][0]; +#else + tmp.x = c[4][0]; + tmp.y = c[5][0]; + tmp.z = c[6][0]; + tmp.w = c[7][0]; + ACTIVATION_V4(tmp); +#endif + vstore4(tmp, c_off, C); + UPDATE_REG(c[4]); + UPDATE_REG(c[5]); + UPDATE_REG(c[6]); + UPDATE_REG(c[7]); +#endif + } +} + +#elif defined(NO_BIAS) +__kernel void MANGLE_NAME(gemm_tn_nobias_, LM, LN)(const int M, + const int N, + const int K, + const int ow_str, + const int A_str, + const int B_str, + const int C_str, + const int A_off, + const int B_off, + const int ow, + const int oh, + const int bx, + const int by, + float alp, + float bet, + __global const T *A, + __global const T *B, + __global T *C) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= bx || idy 
>= by) { + return; + } + const int ix = idx * LN; + const int iy = idy * LM; + + T a[LM]; + T b[LN]; + T c[LM][LN]; + int a_off = iy + A_off; + int b_off = ix + B_off; + a_off += idz * A_str; + b_off += idz * B_str; + GEMM_SET_C_ZERO(c); + + for (int i = 0; i < K; ++i) { + GEMM_LOAD_A(a, a_off, A); + GEMM_LOAD_B(b, b_off, B); + GEMM_CALCORE(a, b, c); + a_off += M; + b_off += N; + } + + int c_off = iy * ow_str + ix; + c_off += idz * C_str; + int ex = ix + LN - ow; + int ey = iy + LM - oh; + GEMM_MUL_C(alp, bet, c); + if (ex > 0) { + GEMM_SET_C_EDGE_ZERO_W(c, ex); + } + if (ey > 0) { + GEMM_SET_C_EDGE_ZERO_H(c, ey); + } + GEMM_STORE_C(c, c_off, ow_str, C); +} + +#else +#if defined(USE_RELU) +__kernel void MANGLE_NAME(gemm_tn_relu_, LM, LN) +#elif defined(USE_GELU) +__kernel void MANGLE_NAME(gemm_tn_gelu_, LM, LN) +#else +__kernel void MANGLE_NAME(gemm_tn_, LM, LN) +#endif + (const int M, + const int N, + const int K, + const int ow_str, + const int ow, + const int oh, + const int bx, + const int by, + __global const T *A, + __global const T *B, + __global const T *bias, + __global T *C) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + if (idx >= bx || idy >= by) { + return; + } + const int ix = idx * LN; + const int iy = idy * LM; + + T a[LM]; + T b[LN]; + T c[LM][LN]; + int a_off = iy; + int b_off = ix; + GEMM_LOAD_A(a, iy, bias); + GEMM_SET_C_BIAS(a, c); + + for (int i = 0; i < K; ++i) { + GEMM_LOAD_A(a, a_off, A); + GEMM_LOAD_B(b, b_off, B); + GEMM_CALCORE(a, b, c); + a_off += M; + b_off += N; + } + + int c_off = iy * ow_str + ix; + int ex = ix + LN - ow; + int ey = iy + LM - oh; + if (ex > 0) { + GEMM_SET_C_EDGE_ZERO_W(c, ex); + } + if (ey > 0) { + GEMM_SET_C_EDGE_ZERO_H(c, ey); + } + GEMM_STORE_C(c, c_off, ow_str, C); +} +#endif diff --git a/compute/tensor/src/gpu/mali/cl/kernel_def.h b/compute/tensor/src/gpu/mali/cl/kernel_def.h new file mode 100644 index 00000000..8d9ce80e --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/kernel_def.h @@ -0,0 +1,3424 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
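// Note on the helpers below -- LOAD_BUF_ARRAY5..7 and ADD_BUF_ARRAY5..7 fetch a
// full T8 with vload8 and use only the first 5..7 lanes, so buffers are assumed
// to stay readable for at least 8 elements past `off` (GPU buffers in this
// library are padded accordingly). READ_IMAGE/WRITE_IMAGE resolve to
// read_imageh/write_imageh under USE_HALF and to the imagef forms otherwise,
// letting the same kernels build for fp16 and fp32.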
+ +#ifndef _KERNEL_DEF +#define _KERNEL_DEF + +/* + * READ IMAGE + */ +#if defined(USE_HALF) +#define READ_IMAGE(image, sampler, coord) read_imageh(image, sampler, coord) +#define WRITE_IMAGE(image, coord, data) write_imageh(image, coord, data) +#else +#define READ_IMAGE(image, sampler, coord) read_imagef(image, sampler, coord) +#define WRITE_IMAGE(image, coord, data) write_imagef(image, coord, data) +#endif + +__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + +#if defined(USE_V1) +#define READ_BUF(v, off, buf) \ + { \ + v = buf[off]; \ + } +#elif defined(USE_V2) +#define READ_BUF(v, off, buf) \ + { \ + v = vload2(0, buf + off); \ + } +#elif defined(USE_V3) +#define READ_BUF(v, off, buf) \ + { \ + v = vload3(0, buf + off); \ + } +#elif defined(USE_V4) +#define READ_BUF(v, off, buf) \ + { \ + v = vload4(0, buf + off); \ + } +#elif defined(USE_V8) +#define READ_BUF(v, off, buf) \ + { \ + v = vload8(0, buf + off); \ + } +#elif defined(USE_V16) +#define READ_BUF(v, off, buf) \ + { \ + v = vload16(0, buf + off); \ + } +#endif + +/* + * load data from buffer to reg array + */ +#define LOAD_BUF_ARRAY1(v, off, buf) \ + { \ + v[0] = buf[off]; \ + } + +#define LOAD_BUF_ARRAY2(v, off, buf) \ + { \ + T2 tmp = vload2(0, buf + off); \ + v[0] = tmp.x; \ + v[1] = tmp.y; \ + } + +#define LOAD_BUF_ARRAY3(v, off, buf) \ + { \ + T3 tmp = vload3(0, buf + off); \ + v[0] = tmp.x; \ + v[1] = tmp.y; \ + v[2] = tmp.z; \ + } + +#define LOAD_BUF_ARRAY4(v, off, buf) \ + { \ + T4 tmp = vload4(0, buf + off); \ + v[0] = tmp.x; \ + v[1] = tmp.y; \ + v[2] = tmp.z; \ + v[3] = tmp.w; \ + } + +#define LOAD_BUF_ARRAY5(v, off, buf) \ + { \ + T8 tmp = vload8(0, buf + off); \ + v[0] = tmp.s0; \ + v[1] = tmp.s1; \ + v[2] = tmp.s2; \ + v[3] = tmp.s3; \ + v[4] = tmp.s4; \ + } + +#define LOAD_BUF_ARRAY6(v, off, buf) \ + { \ + T8 tmp = vload8(0, buf + off); \ + v[0] = tmp.s0; \ + v[1] = tmp.s1; \ + v[2] = tmp.s2; \ + v[3] = tmp.s3; \ + v[4] = tmp.s4; \ + v[5] = tmp.s5; \ + } + +#define LOAD_BUF_ARRAY7(v, off, buf) \ + { \ + T8 tmp = vload8(0, buf + off); \ + v[0] = tmp.s0; \ + v[1] = tmp.s1; \ + v[2] = tmp.s2; \ + v[3] = tmp.s3; \ + v[4] = tmp.s4; \ + v[5] = tmp.s5; \ + v[6] = tmp.s6; \ + } + +#define LOAD_BUF_ARRAY8(v, off, buf) \ + { \ + T8 tmp = vload8(0, buf + off); \ + v[0] = tmp.s0; \ + v[1] = tmp.s1; \ + v[2] = tmp.s2; \ + v[3] = tmp.s3; \ + v[4] = tmp.s4; \ + v[5] = tmp.s5; \ + v[6] = tmp.s6; \ + v[7] = tmp.s7; \ + } + +#define ADD_BUF_ARRAY1(v, off, buf) \ + { \ + v[0] += buf[off]; \ + } + +#define ADD_BUF_ARRAY2(v, off, buf) \ + { \ + T2 tmp = vload2(0, buf + off); \ + v[0] += tmp.x; \ + v[1] += tmp.y; \ + } + +#define ADD_BUF_ARRAY3(v, off, buf) \ + { \ + T3 tmp = vload3(0, buf + off); \ + v[0] += tmp.x; \ + v[1] += tmp.y; \ + v[2] += tmp.z; \ + } + +#define ADD_BUF_ARRAY4(v, off, buf) \ + { \ + T4 tmp = vload4(0, buf + off); \ + v[0] += tmp.x; \ + v[1] += tmp.y; \ + v[2] += tmp.z; \ + v[3] += tmp.w; \ + } + +#define ADD_BUF_ARRAY5(v, off, buf) \ + { \ + T8 tmp = vload8(0, buf + off); \ + v[0] += tmp.s0; \ + v[1] += tmp.s1; \ + v[2] += tmp.s2; \ + v[3] += tmp.s3; \ + v[4] += tmp.s4; \ + } + +#define ADD_BUF_ARRAY6(v, off, buf) \ + { \ + T8 tmp = vload8(0, buf + off); \ + v[0] += tmp.s0; \ + v[1] += tmp.s1; \ + v[2] += tmp.s2; \ + v[3] += tmp.s3; \ + v[4] += tmp.s4; \ + v[5] += tmp.s5; \ + } + +#define ADD_BUF_ARRAY7(v, off, buf) \ + { \ + T8 tmp = vload8(0, buf + off); \ + v[0] += tmp.s0; \ + v[1] += tmp.s1; \ + v[2] += tmp.s2; \ + v[3] += tmp.s3; \ + v[4] += tmp.s4; \ + 
v[5] += tmp.s5; \ + v[6] += tmp.s6; \ + } + +#define ADD_BUF_ARRAY8(v, off, buf) \ + { \ + T8 tmp = vload8(0, buf + off); \ + v[0] += tmp.s0; \ + v[1] += tmp.s1; \ + v[2] += tmp.s2; \ + v[3] += tmp.s3; \ + v[4] += tmp.s4; \ + v[5] += tmp.s5; \ + v[6] += tmp.s6; \ + v[7] += tmp.s7; \ + } +/* + * set reg array to normal val + */ +#define SET_REG_ARRAY1(v, reg) \ + { \ + reg[0] = v; \ + } + +#define SET_REG_ARRAY2(v, reg) \ + { \ + reg[0] = v; \ + reg[1] = v; \ + } + +#define SET_REG_ARRAY3(v, reg) \ + { \ + reg[0] = v; \ + reg[1] = v; \ + reg[2] = v; \ + } + +#define SET_REG_ARRAY4(v, reg) \ + { \ + reg[0] = v; \ + reg[1] = v; \ + reg[2] = v; \ + reg[3] = v; \ + } +#define SET_REG_ARRAY5(v, reg) \ + { \ + reg[0] = v; \ + reg[1] = v; \ + reg[2] = v; \ + reg[3] = v; \ + reg[4] = v; \ + } + +#define SET_REG_ARRAY6(v, reg) \ + { \ + reg[0] = v; \ + reg[1] = v; \ + reg[2] = v; \ + reg[3] = v; \ + reg[4] = v; \ + reg[5] = v; \ + } + +#define SET_REG_ARRAY7(v, reg) \ + { \ + reg[0] = v; \ + reg[1] = v; \ + reg[2] = v; \ + reg[3] = v; \ + reg[4] = v; \ + reg[5] = v; \ + reg[6] = v; \ + } + +#define SET_REG_ARRAY8(v, reg) \ + { \ + reg[0] = v; \ + reg[1] = v; \ + reg[2] = v; \ + reg[3] = v; \ + reg[4] = v; \ + reg[5] = v; \ + reg[6] = v; \ + reg[7] = v; \ + } + +#define MUL_REG_NORMAL_ARRAY1(a, b, reg) \ + { \ + reg[0] = a * reg[0] + b; \ + } + +#define MUL_REG_NORMAL_ARRAY2(a, b, reg) \ + { \ + reg[0] = a * reg[0] + b; \ + reg[1] = a * reg[1] + b; \ + } + +#define MUL_REG_NORMAL_ARRAY3(a, b, reg) \ + { \ + reg[0] = a * reg[0] + b; \ + reg[1] = a * reg[1] + b; \ + reg[2] = a * reg[2] + b; \ + } + +#define MUL_REG_NORMAL_ARRAY4(a, b, reg) \ + { \ + reg[0] = a * reg[0] + b; \ + reg[1] = a * reg[1] + b; \ + reg[2] = a * reg[2] + b; \ + reg[3] = a * reg[3] + b; \ + } + +#define MUL_REG_NORMAL_ARRAY5(a, b, reg) \ + { \ + reg[0] = a * reg[0] + b; \ + reg[1] = a * reg[1] + b; \ + reg[2] = a * reg[2] + b; \ + reg[3] = a * reg[3] + b; \ + reg[4] = a * reg[4] + b; \ + } + +#define MUL_REG_NORMAL_ARRAY6(a, b, reg) \ + { \ + reg[0] = a * reg[0] + b; \ + reg[1] = a * reg[1] + b; \ + reg[2] = a * reg[2] + b; \ + reg[3] = a * reg[3] + b; \ + reg[4] = a * reg[4] + b; \ + reg[5] = a * reg[5] + b; \ + } + +#define MUL_REG_NORMAL_ARRAY7(a, b, reg) \ + { \ + reg[0] = a * reg[0] + b; \ + reg[1] = a * reg[1] + b; \ + reg[2] = a * reg[2] + b; \ + reg[3] = a * reg[3] + b; \ + reg[4] = a * reg[4] + b; \ + reg[5] = a * reg[5] + b; \ + reg[6] = a * reg[6] + b; \ + } + +#define MUL_REG_NORMAL_ARRAY8(a, b, reg) \ + { \ + reg[0] = a * reg[0] + b; \ + reg[1] = a * reg[1] + b; \ + reg[2] = a * reg[2] + b; \ + reg[3] = a * reg[3] + b; \ + reg[4] = a * reg[4] + b; \ + reg[5] = a * reg[5] + b; \ + reg[6] = a * reg[6] + b; \ + reg[7] = a * reg[7] + b; \ + } + +#define ADD_REG_ARRAY4(reg0, reg1) \ + { \ + reg1[0] += reg0[0]; \ + reg1[1] += reg0[1]; \ + reg1[2] += reg0[2]; \ + reg1[3] += reg0[3]; \ + } + +#define MINUS_REG_ARRAY4(reg0, reg1) \ + { \ + reg1[0] -= reg0[0]; \ + reg1[1] -= reg0[1]; \ + reg1[2] -= reg0[2]; \ + reg1[3] -= reg0[3]; \ + } + +/* + * DOT + */ +#define DOT_A4B16C4(a, b, c) \ + { \ + c.x += (a.x * b.s0 + a.y * b.s1 + a.z * b.s2 + a.w * b.s3); \ + c.y += (a.x * b.s4 + a.y * b.s5 + a.z * b.s6 + a.w * b.s7); \ + c.z += (a.x * b.s8 + a.y * b.s9 + a.z * b.sa + a.w * b.sb); \ + c.w += (a.x * b.sc + a.y * b.sd + a.z * b.se + a.w * b.sf); \ + } + +#define DOT_A4B4C1(a, b, c) \ + { \ + c += (a.x * b.s0 + a.y * b.s1 + a.z * b.s2 + a.w * b.s3); \ + } + +#define DOT_A4B4C4(a, b, c) \ + { \ + c.x += a.x * b.x; \ + c.y += a.y * 
b.y; \ + c.z += a.z * b.z; \ + c.w += a.w * b.w; \ + } + +#define DOT_A2B2C1(a, b, c) \ + { \ + c += (a.s0 * b.s0 + a.s1 * b.s1); \ + } + +#define DOT_A8B8C1(a, b, c) \ + { \ + c += (a.s0 * b.s0 + a.s1 * b.s1 + a.s2 * b.s2 + a.s3 * b.s3); \ + c += (a.s4 * b.s4 + a.s5 * b.s5 + a.s6 * b.s6 + a.s7 * b.s7); \ + } + +#define DOT_A16B16C1(a, b, c) \ + { \ + c += (a.s0 * b.s0 + a.s1 * b.s1 + a.s2 * b.s2 + a.s3 * b.s3); \ + c += (a.s4 * b.s4 + a.s5 * b.s5 + a.s6 * b.s6 + a.s7 * b.s7); \ + c += (a.s8 * b.s8 + a.s9 * b.s9 + a.sa * b.sa + a.sb * b.sb); \ + c += (a.sc * b.sc + a.sd * b.sd + a.se * b.se + a.sf * b.sf); \ + } + +#define DOT_A_NORMAL_B1C1_ARRAY(a, b, c) \ + { \ + c[0] += a * b[0]; \ + } + +#define DOT_A_NORMAL_B2C2_ARRAY(a, b, c) \ + { \ + c[0] += a * b[0]; \ + c[1] += a * b[1]; \ + } + +#define DOT_A_NORMAL_B3C3_ARRAY(a, b, c) \ + { \ + c[0] += a * b[0]; \ + c[1] += a * b[1]; \ + c[2] += a * b[2]; \ + } + +#define DOT_A_NORMAL_B4C4_ARRAY(a, b, c) \ + { \ + c[0] += a * b[0]; \ + c[1] += a * b[1]; \ + c[2] += a * b[2]; \ + c[3] += a * b[3]; \ + } + +#define DOT_A_NORMAL_B5C5_ARRAY(a, b, c) \ + { \ + c[0] += a * b[0]; \ + c[1] += a * b[1]; \ + c[2] += a * b[2]; \ + c[3] += a * b[3]; \ + c[4] += a * b[4]; \ + } + +#define DOT_A_NORMAL_B6C6_ARRAY(a, b, c) \ + { \ + c[0] += a * b[0]; \ + c[1] += a * b[1]; \ + c[2] += a * b[2]; \ + c[3] += a * b[3]; \ + c[4] += a * b[4]; \ + c[5] += a * b[5]; \ + } + +#define DOT_A_NORMAL_B7C7_ARRAY(a, b, c) \ + { \ + c[0] += a * b[0]; \ + c[1] += a * b[1]; \ + c[2] += a * b[2]; \ + c[3] += a * b[3]; \ + c[4] += a * b[4]; \ + c[5] += a * b[5]; \ + c[6] += a * b[6]; \ + } + +#define DOT_A_NORMAL_B8C8_ARRAY(a, b, c) \ + { \ + c[0] += a * b[0]; \ + c[1] += a * b[1]; \ + c[2] += a * b[2]; \ + c[3] += a * b[3]; \ + c[4] += a * b[4]; \ + c[5] += a * b[5]; \ + c[6] += a * b[6]; \ + c[7] += a * b[7]; \ + } + +#if defined(USE_V2) +#define DOT_VEC(a, b, c) DOT_A2B2C1(a, b, c) +#elif defined(USE_V4) +#define DOT_VEC(a, b, c) DOT_A4B4C1(a, b, c) +#elif defined(USE_V8) +#define DOT_VEC(a, b, c) DOT_A8B8C1(a, b, c) +#elif defined(USE_V16) +#define DOT_VEC(a, b, c) DOT_A16B16C1(a, b, c) +#else +#define DOT_A_VEC_B1C1_ARRAY(a, b, c) DOT_A_NORMAL_B1C1_ARRAY(a, b, c) +#define DOT_A_VEC_B2C2_ARRAY(a, b, c) DOT_A_NORMAL_B2C2_ARRAY(a, b, c) +#define DOT_A_VEC_B3C3_ARRAY(a, b, c) DOT_A_NORMAL_B3C3_ARRAY(a, b, c) +#define DOT_A_VEC_B4C4_ARRAY(a, b, c) DOT_A_NORMAL_B4C4_ARRAY(a, b, c) +#define DOT_A_VEC_B5C5_ARRAY(a, b, c) DOT_A_NORMAL_B5C5_ARRAY(a, b, c) +#define DOT_A_VEC_B6C6_ARRAY(a, b, c) DOT_A_NORMAL_B6C6_ARRAY(a, b, c) +#define DOT_A_VEC_B7C7_ARRAY(a, b, c) DOT_A_NORMAL_B7C7_ARRAY(a, b, c) +#define DOT_A_VEC_B8C8_ARRAY(a, b, c) DOT_A_NORMAL_B8C8_ARRAY(a, b, c) +#endif + +#if defined(USE_V2) || defined(USE_V4) || defined(USE_V8) || defined(USE_V16) +#define DOT_A_VEC_B1C1_ARRAY(a, b, c) \ + { \ + DOT_VEC(a, b[0], c[0]); \ + } + +#define DOT_A_VEC_B2C2_ARRAY(a, b, c) \ + { \ + DOT_VEC(a, b[0], c[0]); \ + DOT_VEC(a, b[1], c[1]); \ + } + +#define DOT_A_VEC_B3C3_ARRAY(a, b, c) \ + { \ + DOT_VEC(a, b[0], c[0]); \ + DOT_VEC(a, b[1], c[1]); \ + DOT_VEC(a, b[2], c[2]); \ + } + +#define DOT_A_VEC_B4C4_ARRAY(a, b, c) \ + { \ + DOT_VEC(a, b[0], c[0]); \ + DOT_VEC(a, b[1], c[1]); \ + DOT_VEC(a, b[2], c[2]); \ + DOT_VEC(a, b[3], c[3]); \ + } + +#define DOT_A_VEC_B5C5_ARRAY(a, b, c) \ + { \ + DOT_VEC(a, b[0], c[0]); \ + DOT_VEC(a, b[1], c[1]); \ + DOT_VEC(a, b[2], c[2]); \ + DOT_VEC(a, b[3], c[3]); \ + DOT_VEC(a, b[4], c[4]); \ + } + +#define DOT_A_VEC_B6C6_ARRAY(a, b, c) \ + { \ + DOT_VEC(a, b[0], c[0]); \
+ DOT_VEC(a, b[1], c[1]); \ + DOT_VEC(a, b[2], c[2]); \ + DOT_VEC(a, b[3], c[3]); \ + DOT_VEC(a, b[4], c[4]); \ + DOT_VEC(a, b[5], c[5]); \ + } + +#define DOT_A_VEC_B7C7_ARRAY(a, b, c) \ + { \ + DOT_VEC(a, b[0], c[0]); \ + DOT_VEC(a, b[1], c[1]); \ + DOT_VEC(a, b[2], c[2]); \ + DOT_VEC(a, b[3], c[3]); \ + DOT_VEC(a, b[4], c[4]); \ + DOT_VEC(a, b[5], c[5]); \ + DOT_VEC(a, b[6], c[6]); \ + } + +#define DOT_A_VEC_B8C8_ARRAY(a, b, c) \ + { \ + DOT_VEC(a, b[0], c[0]); \ + DOT_VEC(a, b[1], c[1]); \ + DOT_VEC(a, b[2], c[2]); \ + DOT_VEC(a, b[3], c[3]); \ + DOT_VEC(a, b[4], c[4]); \ + DOT_VEC(a, b[5], c[5]); \ + DOT_VEC(a, b[6], c[6]); \ + DOT_VEC(a, b[7], c[7]); \ + } +#endif +/* + * ACTIVATION + */ +#if defined(USE_RELU) +#define ACTIVATION_V4(v) \ + { \ + v.x = fmax(v.x, 0); \ + v.y = fmax(v.y, 0); \ + v.z = fmax(v.z, 0); \ + v.w = fmax(v.w, 0); \ + } + +#define ACTIVATION_V8(v) \ + { \ + v.s0 = fmax(v.s0, 0); \ + v.s1 = fmax(v.s1, 0); \ + v.s2 = fmax(v.s2, 0); \ + v.s3 = fmax(v.s3, 0); \ + v.s4 = fmax(v.s4, 0); \ + v.s5 = fmax(v.s5, 0); \ + v.s6 = fmax(v.s6, 0); \ + v.s7 = fmax(v.s7, 0); \ + } + +#define ACTIVATION_V16(v) \ + { \ + v.s0 = fmax(v.s0, 0); \ + v.s1 = fmax(v.s1, 0); \ + v.s2 = fmax(v.s2, 0); \ + v.s3 = fmax(v.s3, 0); \ + v.s4 = fmax(v.s4, 0); \ + v.s5 = fmax(v.s5, 0); \ + v.s6 = fmax(v.s6, 0); \ + v.s7 = fmax(v.s7, 0); \ + v.s8 = fmax(v.s8, 0); \ + v.s9 = fmax(v.s9, 0); \ + v.sa = fmax(v.sa, 0); \ + v.sb = fmax(v.sb, 0); \ + v.sc = fmax(v.sc, 0); \ + v.sd = fmax(v.sd, 0); \ + v.se = fmax(v.se, 0); \ + v.sf = fmax(v.sf, 0); \ + } + +#define ACTIVATION_V1(v) \ + { \ + v = fmax(v, 0); \ + } + +#define ACTIVATION_ARRAY1(v) \ + { \ + v[0] = fmax(v[0], 0); \ + } + +#define ACTIVATION_ARRAY2(v) \ + { \ + v[0] = fmax(v[0], 0); \ + v[1] = fmax(v[1], 0); \ + } + +#define ACTIVATION_ARRAY3(v) \ + { \ + v[0] = fmax(v[0], 0); \ + v[1] = fmax(v[1], 0); \ + v[2] = fmax(v[2], 0); \ + } + +#define ACTIVATION_ARRAY4(v) \ + { \ + v[0] = fmax(v[0], 0); \ + v[1] = fmax(v[1], 0); \ + v[2] = fmax(v[2], 0); \ + v[3] = fmax(v[3], 0); \ + } + +#define ACTIVATION_ARRAY5(v) \ + { \ + v[0] = fmax(v[0], 0); \ + v[1] = fmax(v[1], 0); \ + v[2] = fmax(v[2], 0); \ + v[3] = fmax(v[3], 0); \ + v[4] = fmax(v[4], 0); \ + } + +#define ACTIVATION_ARRAY6(v) \ + { \ + v[0] = fmax(v[0], 0); \ + v[1] = fmax(v[1], 0); \ + v[2] = fmax(v[2], 0); \ + v[3] = fmax(v[3], 0); \ + v[4] = fmax(v[4], 0); \ + v[5] = fmax(v[5], 0); \ + } + +#define ACTIVATION_ARRAY7(v) \ + { \ + v[0] = fmax(v[0], 0); \ + v[1] = fmax(v[1], 0); \ + v[2] = fmax(v[2], 0); \ + v[3] = fmax(v[3], 0); \ + v[4] = fmax(v[4], 0); \ + v[5] = fmax(v[5], 0); \ + v[6] = fmax(v[6], 0); \ + } + +#define ACTIVATION_ARRAY8(v) \ + { \ + v[0] = fmax(v[0], 0); \ + v[1] = fmax(v[1], 0); \ + v[2] = fmax(v[2], 0); \ + v[3] = fmax(v[3], 0); \ + v[4] = fmax(v[4], 0); \ + v[5] = fmax(v[5], 0); \ + v[6] = fmax(v[6], 0); \ + v[7] = fmax(v[7], 0); \ + } +#elif defined(USE_RELU6) +#define ACTIVATION_V4(v) \ + { \ + v.x = clamp(v.x, (T)0, (T)6); \ + v.y = clamp(v.y, (T)0, (T)6); \ + v.z = clamp(v.z, (T)0, (T)6); \ + v.w = clamp(v.w, (T)0, (T)6); \ + } + +#define ACTIVATION_V8(v) \ + { \ + v.s0 = clamp(v.s0, (T)0, (T)6); \ + v.s1 = clamp(v.s1, (T)0, (T)6); \ + v.s2 = clamp(v.s2, (T)0, (T)6); \ + v.s3 = clamp(v.s3, (T)0, (T)6); \ + v.s4 = clamp(v.s4, (T)0, (T)6); \ + v.s5 = clamp(v.s5, (T)0, (T)6); \ + v.s6 = clamp(v.s6, (T)0, (T)6); \ + v.s7 = clamp(v.s7, (T)0, (T)6); \ + } + +#define ACTIVATION_V16(v) \ + { \ + v.s0 = clamp(v.s0, (T)0, (T)6); \ + v.s1 = clamp(v.s1, (T)0, (T)6); \ + 
v.s2 = clamp(v.s2, (T)0, (T)6); \ + v.s3 = clamp(v.s3, (T)0, (T)6); \ + v.s4 = clamp(v.s4, (T)0, (T)6); \ + v.s5 = clamp(v.s5, (T)0, (T)6); \ + v.s6 = clamp(v.s6, (T)0, (T)6); \ + v.s7 = clamp(v.s7, (T)0, (T)6); \ + v.s8 = clamp(v.s8, (T)0, (T)6); \ + v.s9 = clamp(v.s9, (T)0, (T)6); \ + v.sa = clamp(v.sa, (T)0, (T)6); \ + v.sb = clamp(v.sb, (T)0, (T)6); \ + v.sc = clamp(v.sc, (T)0, (T)6); \ + v.sd = clamp(v.sd, (T)0, (T)6); \ + v.se = clamp(v.se, (T)0, (T)6); \ + v.sf = clamp(v.sf, (T)0, (T)6); \ + } + +#define ACTIVATION_V1(v) \ + { \ + v = clamp(v, (T)0, (T)6); \ + } + +#define ACTIVATION_ARRAY1(v) \ + { \ + v[0] = clamp(v[0], (T)0, (T)6); \ + } + +#define ACTIVATION_ARRAY2(v) \ + { \ + v[0] = clamp(v[0], (T)0, (T)6); \ + v[1] = clamp(v[1], (T)0, (T)6); \ + } + +#define ACTIVATION_ARRAY3(v) \ + { \ + v[0] = clamp(v[0], (T)0, (T)6); \ + v[1] = clamp(v[1], (T)0, (T)6); \ + v[2] = clamp(v[2], (T)0, (T)6); \ + } + +#define ACTIVATION_ARRAY4(v) \ + { \ + v[0] = clamp(v[0], (T)0, (T)6); \ + v[1] = clamp(v[1], (T)0, (T)6); \ + v[2] = clamp(v[2], (T)0, (T)6); \ + v[3] = clamp(v[3], (T)0, (T)6); \ + } + +#define ACTIVATION_ARRAY5(v) \ + { \ + v[0] = clamp(v[0], (T)0, (T)6); \ + v[1] = clamp(v[1], (T)0, (T)6); \ + v[2] = clamp(v[2], (T)0, (T)6); \ + v[3] = clamp(v[3], (T)0, (T)6); \ + v[4] = clamp(v[4], (T)0, (T)6); \ + } + +#define ACTIVATION_ARRAY6(v) \ + { \ + v[0] = clamp(v[0], (T)0, (T)6); \ + v[1] = clamp(v[1], (T)0, (T)6); \ + v[2] = clamp(v[2], (T)0, (T)6); \ + v[3] = clamp(v[3], (T)0, (T)6); \ + v[4] = clamp(v[4], (T)0, (T)6); \ + v[5] = clamp(v[5], (T)0, (T)6); \ + } + +#define ACTIVATION_ARRAY7(v) \ + { \ + v[0] = clamp(v[0], (T)0, (T)6); \ + v[1] = clamp(v[1], (T)0, (T)6); \ + v[2] = clamp(v[2], (T)0, (T)6); \ + v[3] = clamp(v[3], (T)0, (T)6); \ + v[4] = clamp(v[4], (T)0, (T)6); \ + v[5] = clamp(v[5], (T)0, (T)6); \ + v[6] = clamp(v[6], (T)0, (T)6); \ + } + +#define ACTIVATION_ARRAY8(v) \ + { \ + v[0] = clamp(v[0], (T)0, (T)6); \ + v[1] = clamp(v[1], (T)0, (T)6); \ + v[2] = clamp(v[2], (T)0, (T)6); \ + v[3] = clamp(v[3], (T)0, (T)6); \ + v[4] = clamp(v[4], (T)0, (T)6); \ + v[5] = clamp(v[5], (T)0, (T)6); \ + v[6] = clamp(v[6], (T)0, (T)6); \ + v[7] = clamp(v[7], (T)0, (T)6); \ + } +#elif defined(USE_GELU) +/* tanh-based GELU: 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3))); 0.797885 approximates sqrt(2 / pi), and tanh(z) is evaluated as 1 - 2 / (exp(2 * z) + 1) */ +#define ACTIVATION_V4(v) \ + { \ + T4 tmp = v; \ + v.s0 = 0.797885 * (v.s0 + 0.044715 * pown(v.s0, 3)); \ + v.s1 = 0.797885 * (v.s1 + 0.044715 * pown(v.s1, 3)); \ + v.s2 = 0.797885 * (v.s2 + 0.044715 * pown(v.s2, 3)); \ + v.s3 = 0.797885 * (v.s3 + 0.044715 * pown(v.s3, 3)); \ + v.s0 = 1.0 - 2.0 / (exp(2.0 * v.s0) + 1.0); \ + v.s1 = 1.0 - 2.0 / (exp(2.0 * v.s1) + 1.0); \ + v.s2 = 1.0 - 2.0 / (exp(2.0 * v.s2) + 1.0); \ + v.s3 = 1.0 - 2.0 / (exp(2.0 * v.s3) + 1.0); \ + v.s0 = (v.s0 + (T)1.0) * (T)0.5; \ + v.s1 = (v.s1 + (T)1.0) * (T)0.5; \ + v.s2 = (v.s2 + (T)1.0) * (T)0.5; \ + v.s3 = (v.s3 + (T)1.0) * (T)0.5; \ + v.s0 = v.s0 * tmp.s0; \ + v.s1 = v.s1 * tmp.s1; \ + v.s2 = v.s2 * tmp.s2; \ + v.s3 = v.s3 * tmp.s3; \ + } + +#define ACTIVATION_V8(v) \ + { \ + T8 tmp = v; \ + v.s0 = 0.797885 * (v.s0 + 0.044715 * pown(v.s0, 3)); \ + v.s1 = 0.797885 * (v.s1 + 0.044715 * pown(v.s1, 3)); \ + v.s2 = 0.797885 * (v.s2 + 0.044715 * pown(v.s2, 3)); \ + v.s3 = 0.797885 * (v.s3 + 0.044715 * pown(v.s3, 3)); \ + v.s4 = 0.797885 * (v.s4 + 0.044715 * pown(v.s4, 3)); \ + v.s5 = 0.797885 * (v.s5 + 0.044715 * pown(v.s5, 3)); \ + v.s6 = 0.797885 * (v.s6 + 0.044715 * pown(v.s6, 3)); \ + v.s7 = 0.797885 * (v.s7 + 0.044715 * pown(v.s7, 3)); \ + v.s0 = 1.0 - 2.0 / (exp(2.0 * v.s0) + 1.0); \ + v.s1 = 1.0 - 2.0 /
(exp(2.0 * v.s1) + 1.0); \ + v.s2 = 1.0 - 2.0 / (exp(2.0 * v.s2) + 1.0); \ + v.s3 = 1.0 - 2.0 / (exp(2.0 * v.s3) + 1.0); \ + v.s4 = 1.0 - 2.0 / (exp(2.0 * v.s4) + 1.0); \ + v.s5 = 1.0 - 2.0 / (exp(2.0 * v.s5) + 1.0); \ + v.s6 = 1.0 - 2.0 / (exp(2.0 * v.s6) + 1.0); \ + v.s7 = 1.0 - 2.0 / (exp(2.0 * v.s7) + 1.0); \ + v.s0 = (v.s0 + (T)1.0) * (T)0.5; \ + v.s1 = (v.s1 + (T)1.0) * (T)0.5; \ + v.s2 = (v.s2 + (T)1.0) * (T)0.5; \ + v.s3 = (v.s3 + (T)1.0) * (T)0.5; \ + v.s4 = (v.s4 + (T)1.0) * (T)0.5; \ + v.s5 = (v.s5 + (T)1.0) * (T)0.5; \ + v.s6 = (v.s6 + (T)1.0) * (T)0.5; \ + v.s7 = (v.s7 + (T)1.0) * (T)0.5; \ + v.s0 = v.s0 * tmp.s0; \ + v.s1 = v.s1 * tmp.s1; \ + v.s2 = v.s2 * tmp.s2; \ + v.s3 = v.s3 * tmp.s3; \ + v.s4 = v.s4 * tmp.s4; \ + v.s5 = v.s5 * tmp.s5; \ + v.s6 = v.s6 * tmp.s6; \ + v.s7 = v.s7 * tmp.s7; \ + } + +#define ACTIVATION_V16(v) \ + { \ + T16 tmp = v; \ + v.s0 = 0.797885 * (v.s0 + 0.044715 * pown(v.s0, 3)); \ + v.s1 = 0.797885 * (v.s1 + 0.044715 * pown(v.s1, 3)); \ + v.s2 = 0.797885 * (v.s2 + 0.044715 * pown(v.s2, 3)); \ + v.s3 = 0.797885 * (v.s3 + 0.044715 * pown(v.s3, 3)); \ + v.s4 = 0.797885 * (v.s4 + 0.044715 * pown(v.s4, 3)); \ + v.s5 = 0.797885 * (v.s5 + 0.044715 * pown(v.s5, 3)); \ + v.s6 = 0.797885 * (v.s6 + 0.044715 * pown(v.s6, 3)); \ + v.s7 = 0.797885 * (v.s7 + 0.044715 * pown(v.s7, 3)); \ + v.s8 = 0.797885 * (v.s8 + 0.044715 * pown(v.s8, 3)); \ + v.s9 = 0.797885 * (v.s9 + 0.044715 * pown(v.s9, 3)); \ + v.sa = 0.797885 * (v.sa + 0.044715 * pown(v.sa, 3)); \ + v.sb = 0.797885 * (v.sb + 0.044715 * pown(v.sb, 3)); \ + v.sc = 0.797885 * (v.sc + 0.044715 * pown(v.sc, 3)); \ + v.sd = 0.797885 * (v.sd + 0.044715 * pown(v.sd, 3)); \ + v.se = 0.797885 * (v.se + 0.044715 * pown(v.se, 3)); \ + v.sf = 0.797885 * (v.sf + 0.044715 * pown(v.sf, 3)); \ + v.s0 = 1.0 - 2.0 / (exp(2.0 * v.s0) + 1.0); \ + v.s1 = 1.0 - 2.0 / (exp(2.0 * v.s1) + 1.0); \ + v.s2 = 1.0 - 2.0 / (exp(2.0 * v.s2) + 1.0); \ + v.s3 = 1.0 - 2.0 / (exp(2.0 * v.s3) + 1.0); \ + v.s4 = 1.0 - 2.0 / (exp(2.0 * v.s4) + 1.0); \ + v.s5 = 1.0 - 2.0 / (exp(2.0 * v.s5) + 1.0); \ + v.s6 = 1.0 - 2.0 / (exp(2.0 * v.s6) + 1.0); \ + v.s7 = 1.0 - 2.0 / (exp(2.0 * v.s7) + 1.0); \ + v.s8 = 1.0 - 2.0 / (exp(2.0 * v.s8) + 1.0); \ + v.s9 = 1.0 - 2.0 / (exp(2.0 * v.s9) + 1.0); \ + v.sa = 1.0 - 2.0 / (exp(2.0 * v.sa) + 1.0); \ + v.sb = 1.0 - 2.0 / (exp(2.0 * v.sb) + 1.0); \ + v.sc = 1.0 - 2.0 / (exp(2.0 * v.sc) + 1.0); \ + v.sd = 1.0 - 2.0 / (exp(2.0 * v.sd) + 1.0); \ + v.se = 1.0 - 2.0 / (exp(2.0 * v.se) + 1.0); \ + v.sf = 1.0 - 2.0 / (exp(2.0 * v.sf) + 1.0); \ + v.s0 = (v.s0 + (T)1.0) * (T)0.5; \ + v.s1 = (v.s1 + (T)1.0) * (T)0.5; \ + v.s2 = (v.s2 + (T)1.0) * (T)0.5; \ + v.s3 = (v.s3 + (T)1.0) * (T)0.5; \ + v.s4 = (v.s4 + (T)1.0) * (T)0.5; \ + v.s5 = (v.s5 + (T)1.0) * (T)0.5; \ + v.s6 = (v.s6 + (T)1.0) * (T)0.5; \ + v.s7 = (v.s7 + (T)1.0) * (T)0.5; \ + v.s8 = (v.s8 + (T)1.0) * (T)0.5; \ + v.s9 = (v.s9 + (T)1.0) * (T)0.5; \ + v.sa = (v.sa + (T)1.0) * (T)0.5; \ + v.sb = (v.sb + (T)1.0) * (T)0.5; \ + v.sc = (v.sc + (T)1.0) * (T)0.5; \ + v.sd = (v.sd + (T)1.0) * (T)0.5; \ + v.se = (v.se + (T)1.0) * (T)0.5; \ + v.sf = (v.sf + (T)1.0) * (T)0.5; \ + v.s0 = v.s0 * tmp.s0; \ + v.s1 = v.s1 * tmp.s1; \ + v.s2 = v.s2 * tmp.s2; \ + v.s3 = v.s3 * tmp.s3; \ + v.s4 = v.s4 * tmp.s4; \ + v.s5 = v.s5 * tmp.s5; \ + v.s6 = v.s6 * tmp.s6; \ + v.s7 = v.s7 * tmp.s7; \ + v.s8 = v.s8 * tmp.s8; \ + v.s9 = v.s9 * tmp.s9; \ + v.sa = v.sa * tmp.sa; \ + v.sb = v.sb * tmp.sb; \ + v.sc = v.sc * tmp.sc; \ + v.sd = v.sd * tmp.sd; \ + v.se = v.se * tmp.se; \ + v.sf = v.sf * 
tmp.sf; \ + } + +#define ACTIVATION_ARRAY1(v) \ + { \ + T tmp = v[0]; \ + v[0] = 0.797885 * (v[0] + 0.044715 * pown(v[0], 3)); \ + v[0] = 1.0 - 2.0 / (exp(2.0 * v[0]) + 1.0); \ + v[0] = (v[0] + (T)1.0) * (T)0.5; \ + v[0] = v[0] * tmp; \ + } + +#define ACTIVATION_ARRAY2(v) \ + { \ + T tmp[2]; \ + tmp[0] = v[0]; \ + tmp[1] = v[1]; \ + v[0] = 0.797885 * (v[0] + 0.044715 * pown(v[0], 3)); \ + v[1] = 0.797885 * (v[1] + 0.044715 * pown(v[1], 3)); \ + v[0] = 1.0 - 2.0 / (exp(2.0 * v[0]) + 1.0); \ + v[1] = 1.0 - 2.0 / (exp(2.0 * v[1]) + 1.0); \ + v[0] = (v[0] + (T)1.0) * (T)0.5; \ + v[1] = (v[1] + (T)1.0) * (T)0.5; \ + v[0] = v[0] * tmp[0]; \ + v[1] = v[1] * tmp[1]; \ + } + +#define ACTIVATION_ARRAY3(v) \ + { \ + T tmp[3]; \ + tmp[0] = v[0]; \ + tmp[1] = v[1]; \ + tmp[2] = v[2]; \ + v[0] = 0.797885 * (v[0] + 0.044715 * pown(v[0], 3)); \ + v[1] = 0.797885 * (v[1] + 0.044715 * pown(v[1], 3)); \ + v[2] = 0.797885 * (v[2] + 0.044715 * pown(v[2], 3)); \ + v[0] = 1.0 - 2.0 / (exp(2.0 * v[0]) + 1.0); \ + v[1] = 1.0 - 2.0 / (exp(2.0 * v[1]) + 1.0); \ + v[2] = 1.0 - 2.0 / (exp(2.0 * v[2]) + 1.0); \ + v[0] = (v[0] + (T)1.0) * (T)0.5; \ + v[1] = (v[1] + (T)1.0) * (T)0.5; \ + v[2] = (v[2] + (T)1.0) * (T)0.5; \ + v[0] = v[0] * tmp[0]; \ + v[1] = v[1] * tmp[1]; \ + v[2] = v[2] * tmp[2]; \ + } + +#define ACTIVATION_ARRAY4(v) \ + { \ + T tmp[4]; \ + tmp[0] = v[0]; \ + tmp[1] = v[1]; \ + tmp[2] = v[2]; \ + tmp[3] = v[3]; \ + v[0] = 0.797885 * (v[0] + 0.044715 * pown(v[0], 3)); \ + v[1] = 0.797885 * (v[1] + 0.044715 * pown(v[1], 3)); \ + v[2] = 0.797885 * (v[2] + 0.044715 * pown(v[2], 3)); \ + v[3] = 0.797885 * (v[3] + 0.044715 * pown(v[3], 3)); \ + v[0] = 1.0 - 2.0 / (exp(2.0 * v[0]) + 1.0); \ + v[1] = 1.0 - 2.0 / (exp(2.0 * v[1]) + 1.0); \ + v[2] = 1.0 - 2.0 / (exp(2.0 * v[2]) + 1.0); \ + v[3] = 1.0 - 2.0 / (exp(2.0 * v[3]) + 1.0); \ + v[0] = (v[0] + (T)1.0) * (T)0.5; \ + v[1] = (v[1] + (T)1.0) * (T)0.5; \ + v[2] = (v[2] + (T)1.0) * (T)0.5; \ + v[3] = (v[3] + (T)1.0) * (T)0.5; \ + v[0] = v[0] * tmp[0]; \ + v[1] = v[1] * tmp[1]; \ + v[2] = v[2] * tmp[2]; \ + v[3] = v[3] * tmp[3]; \ + } + +#define ACTIVATION_ARRAY5(v) \ + { \ + T tmp[5]; \ + tmp[0] = v[0]; \ + tmp[1] = v[1]; \ + tmp[2] = v[2]; \ + tmp[3] = v[3]; \ + tmp[4] = v[4]; \ + v[0] = 0.797885 * (v[0] + 0.044715 * pown(v[0], 3)); \ + v[1] = 0.797885 * (v[1] + 0.044715 * pown(v[1], 3)); \ + v[2] = 0.797885 * (v[2] + 0.044715 * pown(v[2], 3)); \ + v[3] = 0.797885 * (v[3] + 0.044715 * pown(v[3], 3)); \ + v[4] = 0.797885 * (v[4] + 0.044715 * pown(v[4], 3)); \ + v[0] = 1.0 - 2.0 / (exp(2.0 * v[0]) + 1.0); \ + v[1] = 1.0 - 2.0 / (exp(2.0 * v[1]) + 1.0); \ + v[2] = 1.0 - 2.0 / (exp(2.0 * v[2]) + 1.0); \ + v[3] = 1.0 - 2.0 / (exp(2.0 * v[3]) + 1.0); \ + v[4] = 1.0 - 2.0 / (exp(2.0 * v[4]) + 1.0); \ + v[0] = (v[0] + (T)1.0) * (T)0.5; \ + v[1] = (v[1] + (T)1.0) * (T)0.5; \ + v[2] = (v[2] + (T)1.0) * (T)0.5; \ + v[3] = (v[3] + (T)1.0) * (T)0.5; \ + v[4] = (v[4] + (T)1.0) * (T)0.5; \ + v[0] = v[0] * tmp[0]; \ + v[1] = v[1] * tmp[1]; \ + v[2] = v[2] * tmp[2]; \ + v[3] = v[3] * tmp[3]; \ + v[4] = v[4] * tmp[4]; \ + } + +#define ACTIVATION_ARRAY6(v) \ + { \ + T tmp[6]; \ + tmp[0] = v[0]; \ + tmp[1] = v[1]; \ + tmp[2] = v[2]; \ + tmp[3] = v[3]; \ + tmp[4] = v[4]; \ + tmp[5] = v[5]; \ + v[0] = 0.797885 * (v[0] + 0.044715 * pown(v[0], 3)); \ + v[1] = 0.797885 * (v[1] + 0.044715 * pown(v[1], 3)); \ + v[2] = 0.797885 * (v[2] + 0.044715 * pown(v[2], 3)); \ + v[3] = 0.797885 * (v[3] + 0.044715 * pown(v[3], 3)); \ + v[4] = 0.797885 * (v[4] + 0.044715 * pown(v[4], 3)); \ + v[5] = 
0.797885 * (v[5] + 0.044715 * pown(v[5], 3)); \ + v[0] = 1.0 - 2.0 / (exp(2.0 * v[0]) + 1.0); \ + v[1] = 1.0 - 2.0 / (exp(2.0 * v[1]) + 1.0); \ + v[2] = 1.0 - 2.0 / (exp(2.0 * v[2]) + 1.0); \ + v[3] = 1.0 - 2.0 / (exp(2.0 * v[3]) + 1.0); \ + v[4] = 1.0 - 2.0 / (exp(2.0 * v[4]) + 1.0); \ + v[5] = 1.0 - 2.0 / (exp(2.0 * v[5]) + 1.0); \ + v[0] = (v[0] + (T)1.0) * (T)0.5; \ + v[1] = (v[1] + (T)1.0) * (T)0.5; \ + v[2] = (v[2] + (T)1.0) * (T)0.5; \ + v[3] = (v[3] + (T)1.0) * (T)0.5; \ + v[4] = (v[4] + (T)1.0) * (T)0.5; \ + v[5] = (v[5] + (T)1.0) * (T)0.5; \ + v[0] = v[0] * tmp[0]; \ + v[1] = v[1] * tmp[1]; \ + v[2] = v[2] * tmp[2]; \ + v[3] = v[3] * tmp[3]; \ + v[4] = v[4] * tmp[4]; \ + v[5] = v[5] * tmp[5]; \ + } + +#define ACTIVATION_ARRAY7(v) \ + { \ + T tmp[7]; \ + tmp[0] = v[0]; \ + tmp[1] = v[1]; \ + tmp[2] = v[2]; \ + tmp[3] = v[3]; \ + tmp[4] = v[4]; \ + tmp[5] = v[5]; \ + tmp[6] = v[6]; \ + v[0] = 0.797885 * (v[0] + 0.044715 * pown(v[0], 3)); \ + v[1] = 0.797885 * (v[1] + 0.044715 * pown(v[1], 3)); \ + v[2] = 0.797885 * (v[2] + 0.044715 * pown(v[2], 3)); \ + v[3] = 0.797885 * (v[3] + 0.044715 * pown(v[3], 3)); \ + v[4] = 0.797885 * (v[4] + 0.044715 * pown(v[4], 3)); \ + v[5] = 0.797885 * (v[5] + 0.044715 * pown(v[5], 3)); \ + v[6] = 0.797885 * (v[6] + 0.044715 * pown(v[6], 3)); \ + v[0] = 1.0 - 2.0 / (exp(2.0 * v[0]) + 1.0); \ + v[1] = 1.0 - 2.0 / (exp(2.0 * v[1]) + 1.0); \ + v[2] = 1.0 - 2.0 / (exp(2.0 * v[2]) + 1.0); \ + v[3] = 1.0 - 2.0 / (exp(2.0 * v[3]) + 1.0); \ + v[4] = 1.0 - 2.0 / (exp(2.0 * v[4]) + 1.0); \ + v[5] = 1.0 - 2.0 / (exp(2.0 * v[5]) + 1.0); \ + v[6] = 1.0 - 2.0 / (exp(2.0 * v[6]) + 1.0); \ + v[0] = (v[0] + (T)1.0) * (T)0.5; \ + v[1] = (v[1] + (T)1.0) * (T)0.5; \ + v[2] = (v[2] + (T)1.0) * (T)0.5; \ + v[3] = (v[3] + (T)1.0) * (T)0.5; \ + v[4] = (v[4] + (T)1.0) * (T)0.5; \ + v[5] = (v[5] + (T)1.0) * (T)0.5; \ + v[6] = (v[6] + (T)1.0) * (T)0.5; \ + v[0] = v[0] * tmp[0]; \ + v[1] = v[1] * tmp[1]; \ + v[2] = v[2] * tmp[2]; \ + v[3] = v[3] * tmp[3]; \ + v[4] = v[4] * tmp[4]; \ + v[5] = v[5] * tmp[5]; \ + v[6] = v[6] * tmp[6]; \ + } + +#define ACTIVATION_ARRAY8(v) \ + { \ + T tmp[8]; \ + tmp[0] = v[0]; \ + tmp[1] = v[1]; \ + tmp[2] = v[2]; \ + tmp[3] = v[3]; \ + tmp[4] = v[4]; \ + tmp[5] = v[5]; \ + tmp[6] = v[6]; \ + tmp[7] = v[7]; \ + v[0] = 0.797885 * (v[0] + 0.044715 * pown(v[0], 3)); \ + v[1] = 0.797885 * (v[1] + 0.044715 * pown(v[1], 3)); \ + v[2] = 0.797885 * (v[2] + 0.044715 * pown(v[2], 3)); \ + v[3] = 0.797885 * (v[3] + 0.044715 * pown(v[3], 3)); \ + v[4] = 0.797885 * (v[4] + 0.044715 * pown(v[4], 3)); \ + v[5] = 0.797885 * (v[5] + 0.044715 * pown(v[5], 3)); \ + v[6] = 0.797885 * (v[6] + 0.044715 * pown(v[6], 3)); \ + v[7] = 0.797885 * (v[7] + 0.044715 * pown(v[7], 3)); \ + v[0] = 1.0 - 2.0 / (exp(2.0 * v[0]) + 1.0); \ + v[1] = 1.0 - 2.0 / (exp(2.0 * v[1]) + 1.0); \ + v[2] = 1.0 - 2.0 / (exp(2.0 * v[2]) + 1.0); \ + v[3] = 1.0 - 2.0 / (exp(2.0 * v[3]) + 1.0); \ + v[4] = 1.0 - 2.0 / (exp(2.0 * v[4]) + 1.0); \ + v[5] = 1.0 - 2.0 / (exp(2.0 * v[5]) + 1.0); \ + v[6] = 1.0 - 2.0 / (exp(2.0 * v[6]) + 1.0); \ + v[7] = 1.0 - 2.0 / (exp(2.0 * v[7]) + 1.0); \ + v[0] = (v[0] + (T)1.0) * (T)0.5; \ + v[1] = (v[1] + (T)1.0) * (T)0.5; \ + v[2] = (v[2] + (T)1.0) * (T)0.5; \ + v[3] = (v[3] + (T)1.0) * (T)0.5; \ + v[4] = (v[4] + (T)1.0) * (T)0.5; \ + v[5] = (v[5] + (T)1.0) * (T)0.5; \ + v[6] = (v[6] + (T)1.0) * (T)0.5; \ + v[7] = (v[7] + (T)1.0) * (T)0.5; \ + v[0] = v[0] * tmp[0]; \ + v[1] = v[1] * tmp[1]; \ + v[2] = v[2] * tmp[2]; \ + v[3] = v[3] * tmp[3]; \ + v[4] = v[4] * tmp[4]; 
\ + v[5] = v[5] * tmp[5]; \ + v[6] = v[6] * tmp[6]; \ + v[7] = v[7] * tmp[7]; \ + } +#elif defined(USE_HSIGMOID) +#define ACTIVATION_V4(v) \ + { \ + v.s0 = v.s0 + (T)3.0; \ + v.s1 = v.s1 + (T)3.0; \ + v.s2 = v.s2 + (T)3.0; \ + v.s3 = v.s3 + (T)3.0; \ + v.s0 = clamp(v.s0, (T)0, (T)6.0); \ + v.s1 = clamp(v.s1, (T)0, (T)6.0); \ + v.s2 = clamp(v.s2, (T)0, (T)6.0); \ + v.s3 = clamp(v.s3, (T)0, (T)6.0); \ + v.s0 = v.s0 * 0.166667; \ + v.s1 = v.s1 * 0.166667; \ + v.s2 = v.s2 * 0.166667; \ + v.s3 = v.s3 * 0.166667; \ + } +#elif defined(USE_HSWISH) +#define ACTIVATION_V4(v) \ + { \ + T4 tmp = v; \ + v.s0 = v.s0 + (T)3.0; \ + v.s1 = v.s1 + (T)3.0; \ + v.s2 = v.s2 + (T)3.0; \ + v.s3 = v.s3 + (T)3.0; \ + v.s0 = clamp(v.s0, (T)0, (T)6.0); \ + v.s1 = clamp(v.s1, (T)0, (T)6.0); \ + v.s2 = clamp(v.s2, (T)0, (T)6.0); \ + v.s3 = clamp(v.s3, (T)0, (T)6.0); \ + v.s0 = tmp.s0 * (v.s0 * 0.166667); \ + v.s1 = tmp.s1 * (v.s1 * 0.166667); \ + v.s2 = tmp.s2 * (v.s2 * 0.166667); \ + v.s3 = tmp.s3 * (v.s3 * 0.166667); \ + } +#elif defined(USE_TANH) +#define ACTIVATION_V4(v) \ + { \ + v.s0 = 1.0 - 2.0 / (exp(2.0 * v.s0) + 1.0); \ + v.s1 = 1.0 - 2.0 / (exp(2.0 * v.s1) + 1.0); \ + v.s2 = 1.0 - 2.0 / (exp(2.0 * v.s2) + 1.0); \ + v.s3 = 1.0 - 2.0 / (exp(2.0 * v.s3) + 1.0); \ + } +#elif defined(USE_SIGMOID) +#define ACTIVATION_V4(v) \ + { \ + v.s0 = 1.0 / (1.0 + exp(-1.0 * v.s0)); \ + v.s1 = 1.0 / (1.0 + exp(-1.0 * v.s1)); \ + v.s2 = 1.0 / (1.0 + exp(-1.0 * v.s2)); \ + v.s3 = 1.0 / (1.0 + exp(-1.0 * v.s3)); \ + } +#else +#define ACTIVATION_V1(v) \ + {} + +#define ACTIVATION_V4(v) \ + {} + +#define ACTIVATION_V8(v) \ + {} + +#define ACTIVATION_V16(v) \ + {} + +#define ACTIVATION_ARRAY1(v) \ + {} + +#define ACTIVATION_ARRAY2(v) \ + {} + +#define ACTIVATION_ARRAY3(v) \ + {} + +#define ACTIVATION_ARRAY4(v) \ + {} + +#define ACTIVATION_ARRAY5(v) \ + {} + +#define ACTIVATION_ARRAY6(v) \ + {} + +#define ACTIVATION_ARRAY7(v) \ + {} + +#define ACTIVATION_ARRAY8(v) \ + {} +#endif + +/* + * store data reg array to buffer + */ +#define STORE_BUF_ARRAY1(v, off, buf) \ + { \ + ACTIVATION_ARRAY1(v); \ + buf[off] = v[0]; \ + } + +#define STORE_BUF_ARRAY2(v, off, buf) \ + { \ + ACTIVATION_ARRAY2(v); \ + vstore2((T2)(v[0], v[1]), 0, buf + off); \ + } + +#define STORE_BUF_ARRAY3(v, off, buf) \ + { \ + ACTIVATION_ARRAY3(v); \ + vstore3((T3)(v[0], v[1], v[2]), 0, buf + off); \ + } + +#define STORE_BUF_ARRAY4(v, off, buf) \ + { \ + ACTIVATION_ARRAY4(v); \ + vstore4((T4)(v[0], v[1], v[2], v[3]), 0, buf + off); \ + } + +#define STORE_BUF_ARRAY5(v, off, buf) \ + { \ + ACTIVATION_ARRAY5(v); \ + vstore4((T4)(v[0], v[1], v[2], v[3]), 0, buf + off); \ + buf[off + 4] = v[4]; \ + } + +#define STORE_BUF_ARRAY6(v, off, buf) \ + { \ + ACTIVATION_ARRAY6(v); \ + vstore3((T3)(v[0], v[1], v[2]), 0, buf + off); \ + vstore3((T3)(v[3], v[4], v[5]), 0, buf + off + 3); \ + } + +#define STORE_BUF_ARRAY7(v, off, buf) \ + { \ + ACTIVATION_ARRAY7(v); \ + vstore4((T4)(v[0], v[1], v[2], v[3]), 0, buf + off); \ + vstore3((T3)(v[4], v[5], v[6]), 0, buf + off + 4); \ + } + +#define STORE_BUF_ARRAY8(v, off, buf) \ + { \ + ACTIVATION_ARRAY8(v); \ + vstore8((T8)(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]), 0, buf + off); \ + } +/* + * LOAD BIAS + * Load bias from image 1D based on out number + * ON is out number + */ + +#if (ON == 1) +#define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) \ + { \ + V[0] = READ_IMAGE(img, sampler, id); \ + } +#elif (ON == 2) +#define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) \ + { \ + V[0] = READ_IMAGE(img, sampler, id); \ + V[1] = V[0]; \ + } +#elif (ON == 
3) +#define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) \ + { \ + V[0] = READ_IMAGE(img, sampler, id); \ + V[1] = V[0]; \ + V[2] = V[0]; \ + } +#elif (ON == 4) +#define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) \ + { \ + V[0] = READ_IMAGE(img, sampler, id); \ + V[1] = V[0]; \ + V[2] = V[0]; \ + V[3] = V[0]; \ + } +#elif (ON == 5) +#define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) \ + { \ + V[0] = READ_IMAGE(img, sampler, id); \ + V[1] = V[0]; \ + V[2] = V[0]; \ + V[3] = V[0]; \ + V[4] = V[0]; \ + } +#elif (ON == 6) +#define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) \ + { \ + V[0] = READ_IMAGE(img, sampler, id); \ + V[1] = V[0]; \ + V[2] = V[0]; \ + V[3] = V[0]; \ + V[4] = V[0]; \ + V[5] = V[0]; \ + } +#elif (ON == 7) +#define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) \ + { \ + V[0] = READ_IMAGE(img, sampler, id); \ + V[1] = V[0]; \ + V[2] = V[0]; \ + V[3] = V[0]; \ + V[4] = V[0]; \ + V[5] = V[0]; \ + V[6] = V[0]; \ + } +#elif (ON == 8) +#define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) \ + { \ + V[0] = READ_IMAGE(img, sampler, id); \ + V[1] = V[0]; \ + V[2] = V[0]; \ + V[3] = V[0]; \ + V[4] = V[0]; \ + V[5] = V[0]; \ + V[6] = V[0]; \ + V[7] = V[0]; \ + } +#elif (ON == 9) +#define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) \ + { \ + V[0] = READ_IMAGE(img, sampler, id); \ + V[1] = V[0]; \ + V[2] = V[0]; \ + V[3] = V[0]; \ + V[4] = V[0]; \ + V[5] = V[0]; \ + V[6] = V[0]; \ + V[7] = V[0]; \ + V[8] = V[0]; \ + } +#elif (ON == 10) +#define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) \ + { \ + V[0] = READ_IMAGE(img, sampler, id); \ + V[1] = V[0]; \ + V[2] = V[0]; \ + V[3] = V[0]; \ + V[4] = V[0]; \ + V[5] = V[0]; \ + V[6] = V[0]; \ + V[7] = V[0]; \ + V[8] = V[0]; \ + V[9] = V[0]; \ + } +#elif (ON == 11) +#define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) \ + { \ + V[0] = READ_IMAGE(img, sampler, id); \ + V[1] = V[0]; \ + V[2] = V[0]; \ + V[3] = V[0]; \ + V[4] = V[0]; \ + V[5] = V[0]; \ + V[6] = V[0]; \ + V[7] = V[0]; \ + V[8] = V[0]; \ + V[9] = V[0]; \ + V[10] = V[0]; \ + } +#elif (ON == 12) +#define LOADBIAS_IMAGE_ARRAY_V4(V, id, img) \ + { \ + V[0] = READ_IMAGE(img, sampler, id); \ + V[1] = V[0]; \ + V[2] = V[0]; \ + V[3] = V[0]; \ + V[4] = V[0]; \ + V[5] = V[0]; \ + V[6] = V[0]; \ + V[7] = V[0]; \ + V[8] = V[0]; \ + V[9] = V[0]; \ + V[10] = V[0]; \ + V[11] = V[0]; \ + } +#endif + +/* + * LOAD INPUT + * load input from buffer based on len of array vector 4 + * len = N; + * N is usually associated with number W + * + * + * GEMM TN A x B = C + * Matrix A has been transposed + * Operator define for Matrix B and Matrix C + */ +#if (LN == 0) +#define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) \ + {} +#elif (LN == 1) +#define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] = vload4(off, buf); \ + } + +#define GEMM_SET_C_BIAS_X(v, reg) \ + { \ + SET_REG_ARRAY1(v, reg); \ + } + +#define GEMM_SET_C_EDGE_ZERO_X(reg, ex) \ + {} + +#define ADD_ELTWISE_NCHW_X(v, off, buf) \ + { \ + ADD_BUF_ARRAY1(v, off, buf); \ + } + +#define GEMM_LOAD_B(v, off, buf) \ + { \ + LOAD_BUF_ARRAY1(v, off, buf); \ + } + +#define GEMM_CALCORE_X(a, b, c) \ + { \ + DOT_A_VEC_B1C1_ARRAY(a, b, c); \ + } + +#define GEMM_MUL_C_X(a, b, reg) \ + { \ + MUL_REG_NORMAL_ARRAY1(a, b, reg); \ + } + +#define GEMM_STORE_C_X(v, off, buf) \ + { \ + STORE_BUF_ARRAY1(v, off, buf); \ + } + +#define GEMM_NT_LOAD_B(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + } +#elif (LN == 2) +#define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] = vload4(off, buf); \ + V[1] = vload4(off + str, buf); \ + } + +#define GEMM_SET_C_BIAS_X(v, reg) \ + { \ + SET_REG_ARRAY2(v, reg); \ + } + +#define 
GEMM_SET_C_EDGE_ZERO_X(reg, ex) \ + { \ + reg[1] = 0; \ + } + +#define ADD_ELTWISE_NCHW_X(v, off, buf) \ + { \ + ADD_BUF_ARRAY2(v, off, buf); \ + } + +#define GEMM_LOAD_B(v, off, buf) \ + { \ + LOAD_BUF_ARRAY2(v, off, buf); \ + } + +#define GEMM_CALCORE_X(a, b, c) \ + { \ + DOT_A_VEC_B2C2_ARRAY(a, b, c); \ + } + +#define GEMM_MUL_C_X(a, b, reg) \ + { \ + MUL_REG_NORMAL_ARRAY2(a, b, reg); \ + } + +#define GEMM_STORE_C_X(v, off, buf) \ + { \ + STORE_BUF_ARRAY2(v, off, buf); \ + } + +#define GEMM_NT_LOAD_B(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + READ_BUF(v[1], off + str, buf); \ + } +#elif (LN == 3) +#define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] = vload4(off, buf); \ + V[1] = vload4(off + str, buf); \ + V[2] = vload4(off + str * 2, buf); \ + } + +#define GEMM_SET_C_BIAS_X(v, reg) \ + { \ + SET_REG_ARRAY3(v, reg); \ + } + +#define GEMM_SET_C_EDGE_ZERO_X(reg, ex) \ + { \ + if (ex > 1) \ + reg[1] = 0; \ + reg[2] = 0; \ + } + +#define ADD_ELTWISE_NCHW_X(v, off, buf) \ + { \ + ADD_BUF_ARRAY3(v, off, buf); \ + } + +#define GEMM_LOAD_B(v, off, buf) \ + { \ + LOAD_BUF_ARRAY3(v, off, buf); \ + } + +#define GEMM_CALCORE_X(a, b, c) \ + { \ + DOT_A_VEC_B3C3_ARRAY(a, b, c); \ + } + +#define GEMM_MUL_C_X(a, b, reg) \ + { \ + MUL_REG_NORMAL_ARRAY3(a, b, reg); \ + } + +#define GEMM_STORE_C_X(v, off, buf) \ + { \ + STORE_BUF_ARRAY3(v, off, buf); \ + } + +#define GEMM_NT_LOAD_B(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + READ_BUF(v[1], off + str, buf); \ + READ_BUF(v[2], off + str * 2, buf); \ + } +#elif (LN == 4) +#define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] = vload4(off, buf); \ + V[1] = vload4(off + str, buf); \ + V[2] = vload4(off + str * 2, buf); \ + V[3] = vload4(off + str * 3, buf); \ + } + +#define GEMM_SET_C_BIAS_X(v, reg) \ + { \ + SET_REG_ARRAY4(v, reg); \ + } + +#define GEMM_SET_C_EDGE_ZERO_X(reg, ex) \ + { \ + if (ex > 2) \ + reg[1] = 0; \ + if (ex > 1) \ + reg[2] = 0; \ + reg[3] = 0; \ + } + +#define ADD_ELTWISE_NCHW_X(v, off, buf) \ + { \ + ADD_BUF_ARRAY4(v, off, buf); \ + } + +#define GEMM_LOAD_B(v, off, buf) \ + { \ + LOAD_BUF_ARRAY4(v, off, buf); \ + } + +#define GEMM_CALCORE_X(a, b, c) \ + { \ + DOT_A_VEC_B4C4_ARRAY(a, b, c); \ + } + +#define GEMM_MUL_C_X(a, b, reg) \ + { \ + MUL_REG_NORMAL_ARRAY4(a, b, reg); \ + } + +#define GEMM_STORE_C_X(v, off, buf) \ + { \ + STORE_BUF_ARRAY4(v, off, buf); \ + } + +#define GEMM_NT_LOAD_B(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + READ_BUF(v[1], off + str, buf); \ + READ_BUF(v[2], off + str * 2, buf); \ + READ_BUF(v[3], off + str * 3, buf); \ + } +#elif (LN == 5) +#define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] = vload4(off, buf); \ + V[1] = vload4(off + str, buf); \ + V[2] = vload4(off + str * 2, buf); \ + V[3] = vload4(off + str * 3, buf); \ + V[4] = vload4(off + str * 4, buf); \ + } + +#define GEMM_SET_C_BIAS_X(v, reg) \ + { \ + SET_REG_ARRAY5(v, reg); \ + } + +#define GEMM_SET_C_EDGE_ZERO_X(reg, ex) \ + { \ + if (ex > 3) \ + reg[1] = 0; \ + if (ex > 2) \ + reg[2] = 0; \ + if (ex > 1) \ + reg[3] = 0; \ + reg[4] = 0; \ + } + +#define ADD_ELTWISE_NCHW_X(v, off, buf) \ + { \ + ADD_BUF_ARRAY5(v, off, buf); \ + } + +#define GEMM_LOAD_B(v, off, buf) \ + { \ + LOAD_BUF_ARRAY5(v, off, buf); \ + } + +#define GEMM_CALCORE_X(a, b, c) \ + { \ + DOT_A_VEC_B5C5_ARRAY(a, b, c); \ + } + +#define GEMM_MUL_C_X(a, b, reg) \ + { \ + MUL_REG_NORMAL_ARRAY5(a, b, reg); \ + } + +#define GEMM_STORE_C_X(v, off, buf) \ + { \ + STORE_BUF_ARRAY5(v, off, buf); \ + } + +#define 
GEMM_NT_LOAD_B(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + READ_BUF(v[1], off + str, buf); \ + READ_BUF(v[2], off + str * 2, buf); \ + READ_BUF(v[3], off + str * 3, buf); \ + READ_BUF(v[4], off + str * 4, buf); \ + } +#elif (LN == 6) +#define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] = vload4(off, buf); \ + V[1] = vload4(off + str, buf); \ + V[2] = vload4(off + str * 2, buf); \ + V[3] = vload4(off + str * 3, buf); \ + V[4] = vload4(off + str * 4, buf); \ + V[5] = vload4(off + str * 5, buf); \ + } + +#define GEMM_SET_C_BIAS_X(v, reg) \ + { \ + SET_REG_ARRAY6(v, reg); \ + } + +#define GEMM_SET_C_EDGE_ZERO_X(reg, ex) \ + { \ + if (ex > 4) \ + reg[1] = 0; \ + if (ex > 3) \ + reg[2] = 0; \ + if (ex > 2) \ + reg[3] = 0; \ + if (ex > 1) \ + reg[4] = 0; \ + reg[5] = 0; \ + } + +#define ADD_ELTWISE_NCHW_X(v, off, buf) \ + { \ + ADD_BUF_ARRAY6(v, off, buf); \ + } + +#define GEMM_LOAD_B(v, off, buf) \ + { \ + LOAD_BUF_ARRAY6(v, off, buf); \ + } + +#define GEMM_CALCORE_X(a, b, c) \ + { \ + DOT_A_VEC_B6C6_ARRAY(a, b, c); \ + } + +#define GEMM_MUL_C_X(a, b, reg) \ + { \ + MUL_REG_NORMAL_ARRAY6(a, b, reg); \ + } + +#define GEMM_STORE_C_X(v, off, buf) \ + { \ + STORE_BUF_ARRAY6(v, off, buf); \ + } + +#define GEMM_NT_LOAD_B(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + READ_BUF(v[1], off + str, buf); \ + READ_BUF(v[2], off + str * 2, buf); \ + READ_BUF(v[3], off + str * 3, buf); \ + READ_BUF(v[4], off + str * 4, buf); \ + READ_BUF(v[5], off + str * 5, buf); \ + } +#elif (LN == 7) +#define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] = vload4(off, buf); \ + V[1] = vload4(off + str, buf); \ + V[2] = vload4(off + str * 2, buf); \ + V[3] = vload4(off + str * 3, buf); \ + V[4] = vload4(off + str * 4, buf); \ + V[5] = vload4(off + str * 5, buf); \ + V[6] = vload4(off + str * 6, buf); \ + } + +#define GEMM_SET_C_BIAS_X(v, reg) \ + { \ + SET_REG_ARRAY7(v, reg); \ + } + +#define GEMM_SET_C_EDGE_ZERO_X(reg, ex) \ + { \ + if (ex > 5) \ + reg[1] = 0; \ + if (ex > 4) \ + reg[2] = 0; \ + if (ex > 3) \ + reg[3] = 0; \ + if (ex > 2) \ + reg[4] = 0; \ + if (ex > 1) \ + reg[5] = 0; \ + reg[6] = 0; \ + } + +#define ADD_ELTWISE_NCHW_X(v, off, buf) \ + { \ + ADD_BUF_ARRAY7(v, off, buf); \ + } + +#define GEMM_LOAD_B(v, off, buf) \ + { \ + LOAD_BUF_ARRAY7(v, off, buf); \ + } + +#define GEMM_CALCORE_X(a, b, c) \ + { \ + DOT_A_VEC_B7C7_ARRAY(a, b, c); \ + } + +#define GEMM_MUL_C_X(a, b, reg) \ + { \ + MUL_REG_NORMAL_ARRAY7(a, b, reg); \ + } + +#define GEMM_STORE_C_X(v, off, buf) \ + { \ + STORE_BUF_ARRAY7(v, off, buf); \ + } + +#define GEMM_NT_LOAD_B(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + READ_BUF(v[1], off + str, buf); \ + READ_BUF(v[2], off + str * 2, buf); \ + READ_BUF(v[3], off + str * 3, buf); \ + READ_BUF(v[4], off + str * 4, buf); \ + READ_BUF(v[5], off + str * 5, buf); \ + READ_BUF(v[6], off + str * 6, buf); \ + } +#elif (LN == 8) +#define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] = vload4(off, buf); \ + V[1] = vload4(off + str, buf); \ + V[2] = vload4(off + str * 2, buf); \ + V[3] = vload4(off + str * 3, buf); \ + V[4] = vload4(off + str * 4, buf); \ + V[5] = vload4(off + str * 5, buf); \ + V[6] = vload4(off + str * 6, buf); \ + V[7] = vload4(off + str * 7, buf); \ + } + +#define GEMM_SET_C_BIAS_X(v, reg) \ + { \ + SET_REG_ARRAY8(v, reg); \ + } + +#define GEMM_SET_C_EDGE_ZERO_X(reg, ex) \ + { \ + if (ex > 6) \ + reg[1] = 0; \ + if (ex > 5) \ + reg[2] = 0; \ + if (ex > 4) \ + reg[3] = 0; \ + if (ex > 3) \ + reg[4] = 0; \ + if (ex > 2) \ 
+ reg[5] = 0; \ + if (ex > 1) \ + reg[6] = 0; \ + reg[7] = 0; \ + } + +#define ADD_ELTWISE_NCHW_X(v, off, buf) \ + { \ + ADD_BUF_ARRAY8(v, off, buf); \ + } + +#define GEMM_LOAD_B(v, off, buf) \ + { \ + LOAD_BUF_ARRAY8(v, off, buf); \ + } + +#define GEMM_CALCORE_X(a, b, c) \ + { \ + DOT_A_VEC_B8C8_ARRAY(a, b, c); \ + } + +#define GEMM_MUL_C_X(a, b, reg) \ + { \ + MUL_REG_NORMAL_ARRAY8(a, b, reg); \ + } + +#define GEMM_STORE_C_X(v, off, buf) \ + { \ + STORE_BUF_ARRAY8(v, off, buf); \ + } + +#define GEMM_NT_LOAD_B(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + READ_BUF(v[1], off + str, buf); \ + READ_BUF(v[2], off + str * 2, buf); \ + READ_BUF(v[3], off + str * 3, buf); \ + READ_BUF(v[4], off + str * 4, buf); \ + READ_BUF(v[5], off + str * 5, buf); \ + READ_BUF(v[6], off + str * 6, buf); \ + READ_BUF(v[7], off + str * 7, buf); \ + } +#elif (LN == 9) +#define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] = vload4(off, buf); \ + V[1] = vload4(off + str, buf); \ + V[2] = vload4(off + str * 2, buf); \ + V[3] = vload4(off + str * 3, buf); \ + V[4] = vload4(off + str * 4, buf); \ + V[5] = vload4(off + str * 5, buf); \ + V[6] = vload4(off + str * 6, buf); \ + V[7] = vload4(off + str * 7, buf); \ + V[8] = vload4(off + str * 8, buf); \ + } +#elif (LN == 10) +#define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] = vload4(off, buf); \ + V[1] = vload4(off + str, buf); \ + V[2] = vload4(off + str * 2, buf); \ + V[3] = vload4(off + str * 3, buf); \ + V[4] = vload4(off + str * 4, buf); \ + V[5] = vload4(off + str * 5, buf); \ + V[6] = vload4(off + str * 6, buf); \ + V[7] = vload4(off + str * 7, buf); \ + V[8] = vload4(off + str * 8, buf); \ + V[9] = vload4(off + str * 9, buf); \ + } +#elif (LN == 11) +#define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] = vload4(off, buf); \ + V[1] = vload4(off + str, buf); \ + V[2] = vload4(off + str * 2, buf); \ + V[3] = vload4(off + str * 3, buf); \ + V[4] = vload4(off + str * 4, buf); \ + V[5] = vload4(off + str * 5, buf); \ + V[6] = vload4(off + str * 6, buf); \ + V[7] = vload4(off + str * 7, buf); \ + V[8] = vload4(off + str * 8, buf); \ + V[9] = vload4(off + str * 9, buf); \ + V[10] = vload4(off + str * 10, buf); \ + } +#elif (LN == 12) +#define LOAD_INPUT_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] = vload4(off, buf); \ + V[1] = vload4(off + str, buf); \ + V[2] = vload4(off + str * 2, buf); \ + V[3] = vload4(off + str * 3, buf); \ + V[4] = vload4(off + str * 4, buf); \ + V[5] = vload4(off + str * 5, buf); \ + V[6] = vload4(off + str * 6, buf); \ + V[7] = vload4(off + str * 7, buf); \ + V[8] = vload4(off + str * 8, buf); \ + V[9] = vload4(off + str * 9, buf); \ + V[10] = vload4(off + str * 10, buf); \ + V[11] = vload4(off + str * 11, buf); \ + } +#endif + +/* + * GEMM A x B = C + */ +#if (LM == 1) +#define GEMM_LOAD_A(v, off, buf) \ + { \ + LOAD_BUF_ARRAY1(v, off, buf); \ + } + +#define GEMM_SET_C_BIAS(v, reg) \ + { \ + GEMM_SET_C_BIAS_X(v[0], reg[0]); \ + } + +#define GEMM_SET_C_ZERO(reg) \ + { \ + GEMM_SET_C_BIAS_X(0, reg[0]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_H(reg, ey) \ + {} + +#define GEMM_SET_C_EDGE_ZERO_W(reg, ex) \ + { \ + GEMM_SET_C_EDGE_ZERO_X(reg[0], ex); \ + } + +#define GEMM_CALCORE(a, b, c) \ + { \ + GEMM_CALCORE_X(a[0], b, c[0]); \ + } + +#define GEMM_MUL_C(a, b, reg) \ + { \ + GEMM_MUL_C_X(a, b, reg[0]); \ + } + +#define GEMM_STORE_C(v, off, str, buf) \ + { \ + GEMM_STORE_C_X(v[0], off, buf); \ + } + +#define GEMM_NT_LOAD_A(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + } +#elif (LM == 2) 
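+// Each work-item accumulates an LM x LN register tile of C: per k step,
+// GEMM_LOAD_A reads LM values of the (already transposed) A, GEMM_LOAD_B
+// reads LN values of B, and GEMM_CALCORE applies the rank-1 update
+// c[i][j] += a[i] * b[j]; this branch instantiates the LM == 2 variants.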
+#define GEMM_LOAD_A(v, off, buf) \ + { \ + LOAD_BUF_ARRAY2(v, off, buf); \ + } + +#define GEMM_SET_C_BIAS(v, reg) \ + { \ + GEMM_SET_C_BIAS_X(v[0], reg[0]); \ + GEMM_SET_C_BIAS_X(v[1], reg[1]); \ + } + +#define GEMM_SET_C_ZERO(reg) \ + { \ + GEMM_SET_C_BIAS_X(0, reg[0]); \ + GEMM_SET_C_BIAS_X(0, reg[1]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_H(reg, ey) \ + { \ + GEMM_SET_C_BIAS_X(0, reg[1]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_W(reg, ex) \ + { \ + GEMM_SET_C_EDGE_ZERO_X(reg[0], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[1], ex); \ + } + +#define GEMM_CALCORE(a, b, c) \ + { \ + GEMM_CALCORE_X(a[0], b, c[0]); \ + GEMM_CALCORE_X(a[1], b, c[1]); \ + } + +#define GEMM_MUL_C(a, b, reg) \ + { \ + GEMM_MUL_C_X(a, b, reg[0]); \ + GEMM_MUL_C_X(a, b, reg[1]); \ + } + +#define GEMM_STORE_C(v, off, str, buf) \ + { \ + GEMM_STORE_C_X(v[0], off, buf); \ + GEMM_STORE_C_X(v[1], off + str, buf); \ + } + +#define GEMM_NT_LOAD_A(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + READ_BUF(v[1], off + str, buf); \ + } +#elif (LM == 3) +#define GEMM_LOAD_A(v, off, buf) \ + { \ + LOAD_BUF_ARRAY3(v, off, buf); \ + } + +#define GEMM_SET_C_BIAS(v, reg) \ + { \ + GEMM_SET_C_BIAS_X(v[0], reg[0]); \ + GEMM_SET_C_BIAS_X(v[1], reg[1]); \ + GEMM_SET_C_BIAS_X(v[2], reg[2]); \ + } + +#define GEMM_SET_C_ZERO(reg) \ + { \ + GEMM_SET_C_BIAS_X(0, reg[0]); \ + GEMM_SET_C_BIAS_X(0, reg[1]); \ + GEMM_SET_C_BIAS_X(0, reg[2]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_H(reg, ey) \ + { \ + if (ey > 1) \ + GEMM_SET_C_BIAS_X(0, reg[1]); \ + GEMM_SET_C_BIAS_X(0, reg[2]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_W(reg, ex) \ + { \ + GEMM_SET_C_EDGE_ZERO_X(reg[0], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[1], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[2], ex); \ + } + +#define GEMM_CALCORE(a, b, c) \ + { \ + GEMM_CALCORE_X(a[0], b, c[0]); \ + GEMM_CALCORE_X(a[1], b, c[1]); \ + GEMM_CALCORE_X(a[2], b, c[2]); \ + } + +#define GEMM_MUL_C(a, b, reg) \ + { \ + GEMM_MUL_C_X(a, b, reg[0]); \ + GEMM_MUL_C_X(a, b, reg[1]); \ + GEMM_MUL_C_X(a, b, reg[2]); \ + } + +#define GEMM_STORE_C(v, off, str, buf) \ + { \ + GEMM_STORE_C_X(v[0], off, buf); \ + GEMM_STORE_C_X(v[1], off + str, buf); \ + GEMM_STORE_C_X(v[2], off + str * 2, buf); \ + } + +#define GEMM_NT_LOAD_A(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + READ_BUF(v[1], off + str, buf); \ + READ_BUF(v[2], off + str * 2, buf); \ + } +#elif (LM == 4) +#define GEMM_LOAD_A(v, off, buf) \ + { \ + LOAD_BUF_ARRAY4(v, off, buf); \ + } + +#define GEMM_SET_C_BIAS(v, reg) \ + { \ + GEMM_SET_C_BIAS_X(v[0], reg[0]); \ + GEMM_SET_C_BIAS_X(v[1], reg[1]); \ + GEMM_SET_C_BIAS_X(v[2], reg[2]); \ + GEMM_SET_C_BIAS_X(v[3], reg[3]); \ + } + +#define GEMM_SET_C_ZERO(reg) \ + { \ + GEMM_SET_C_BIAS_X(0, reg[0]); \ + GEMM_SET_C_BIAS_X(0, reg[1]); \ + GEMM_SET_C_BIAS_X(0, reg[2]); \ + GEMM_SET_C_BIAS_X(0, reg[3]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_H(reg, ey) \ + { \ + if (ey > 2) \ + GEMM_SET_C_BIAS_X(0, reg[1]); \ + if (ey > 1) \ + GEMM_SET_C_BIAS_X(0, reg[2]); \ + GEMM_SET_C_BIAS_X(0, reg[3]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_W(reg, ex) \ + { \ + GEMM_SET_C_EDGE_ZERO_X(reg[0], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[1], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[2], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[3], ex); \ + } + +#define ADD_ELTWISE_NCHW(v, off, str, buf) \ + { \ + ADD_ELTWISE_NCHW_X(v[0], off, buf); \ + ADD_ELTWISE_NCHW_X(v[1], off + str, buf); \ + ADD_ELTWISE_NCHW_X(v[2], off + str * 2, buf); \ +
ADD_ELTWISE_NCHW_X(v[3], off + str * 3, buf); \ + } + +#define GEMM_CALCORE(a, b, c) \ + { \ + GEMM_CALCORE_X(a[0], b, c[0]); \ + GEMM_CALCORE_X(a[1], b, c[1]); \ + GEMM_CALCORE_X(a[2], b, c[2]); \ + GEMM_CALCORE_X(a[3], b, c[3]); \ + } + +#define GEMM_MUL_C(a, b, reg) \ + { \ + GEMM_MUL_C_X(a, b, reg[0]); \ + GEMM_MUL_C_X(a, b, reg[1]); \ + GEMM_MUL_C_X(a, b, reg[2]); \ + GEMM_MUL_C_X(a, b, reg[3]); \ + } + +#define GEMM_STORE_C(v, off, str, buf) \ + { \ + GEMM_STORE_C_X(v[0], off, buf); \ + GEMM_STORE_C_X(v[1], off + str, buf); \ + GEMM_STORE_C_X(v[2], off + str * 2, buf); \ + GEMM_STORE_C_X(v[3], off + str * 3, buf); \ + } + +#define GEMM_NT_LOAD_A(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + READ_BUF(v[1], off + str, buf); \ + READ_BUF(v[2], off + str * 2, buf); \ + READ_BUF(v[3], off + str * 3, buf); \ + } +#elif (LM == 5) +#define GEMM_LOAD_A(v, off, buf) \ + { \ + LOAD_BUF_ARRAY5(v, off, buf); \ + } + +#define GEMM_SET_C_BIAS(v, reg) \ + { \ + GEMM_SET_C_BIAS_X(v[0], reg[0]); \ + GEMM_SET_C_BIAS_X(v[1], reg[1]); \ + GEMM_SET_C_BIAS_X(v[2], reg[2]); \ + GEMM_SET_C_BIAS_X(v[3], reg[3]); \ + GEMM_SET_C_BIAS_X(v[4], reg[4]); \ + } + +#define GEMM_SET_C_ZERO(reg) \ + { \ + GEMM_SET_C_BIAS_X(0, reg[0]); \ + GEMM_SET_C_BIAS_X(0, reg[1]); \ + GEMM_SET_C_BIAS_X(0, reg[2]); \ + GEMM_SET_C_BIAS_X(0, reg[3]); \ + GEMM_SET_C_BIAS_X(0, reg[4]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_H(reg, ey) \ + { \ + if (ey > 3) \ + GEMM_SET_C_BIAS_X(0, reg[1]); \ + if (ey > 2) \ + GEMM_SET_C_BIAS_X(0, reg[2]); \ + if (ey > 1) \ + GEMM_SET_C_BIAS_X(0, reg[3]); \ + GEMM_SET_C_BIAS_X(0, reg[4]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_W(reg, ex) \ + { \ + GEMM_SET_C_EDGE_ZERO_X(reg[0], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[1], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[2], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[3], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[4], ex); \ + } + +#define GEMM_CALCORE(a, b, c) \ + { \ + GEMM_CALCORE_X(a[0], b, c[0]); \ + GEMM_CALCORE_X(a[1], b, c[1]); \ + GEMM_CALCORE_X(a[2], b, c[2]); \ + GEMM_CALCORE_X(a[3], b, c[3]); \ + GEMM_CALCORE_X(a[4], b, c[4]); \ + } + +#define GEMM_MUL_C(a, b, reg) \ + { \ + GEMM_MUL_C_X(a, b, reg[0]); \ + GEMM_MUL_C_X(a, b, reg[1]); \ + GEMM_MUL_C_X(a, b, reg[2]); \ + GEMM_MUL_C_X(a, b, reg[3]); \ + GEMM_MUL_C_X(a, b, reg[4]); \ + } + +#define GEMM_STORE_C(v, off, str, buf) \ + { \ + GEMM_STORE_C_X(v[0], off, buf); \ + GEMM_STORE_C_X(v[1], off + str, buf); \ + GEMM_STORE_C_X(v[2], off + str * 2, buf); \ + GEMM_STORE_C_X(v[3], off + str * 3, buf); \ + GEMM_STORE_C_X(v[4], off + str * 4, buf); \ + } + +#define GEMM_NT_LOAD_A(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + READ_BUF(v[1], off + str, buf); \ + READ_BUF(v[2], off + str * 2, buf); \ + READ_BUF(v[3], off + str * 3, buf); \ + READ_BUF(v[4], off + str * 4, buf); \ + } +#elif (LM == 6) +#define GEMM_LOAD_A(v, off, buf) \ + { \ + LOAD_BUF_ARRAY6(v, off, buf); \ + } + +#define GEMM_SET_C_BIAS(v, reg) \ + { \ + GEMM_SET_C_BIAS_X(v[0], reg[0]); \ + GEMM_SET_C_BIAS_X(v[1], reg[1]); \ + GEMM_SET_C_BIAS_X(v[2], reg[2]); \ + GEMM_SET_C_BIAS_X(v[3], reg[3]); \ + GEMM_SET_C_BIAS_X(v[4], reg[4]); \ + GEMM_SET_C_BIAS_X(v[5], reg[5]); \ + } + +#define GEMM_SET_C_ZERO(reg) \ + { \ + GEMM_SET_C_BIAS_X(0, reg[0]); \ + GEMM_SET_C_BIAS_X(0, reg[1]); \ + GEMM_SET_C_BIAS_X(0, reg[2]); \ + GEMM_SET_C_BIAS_X(0, reg[3]); \ + GEMM_SET_C_BIAS_X(0, reg[4]); \ + GEMM_SET_C_BIAS_X(0, reg[5]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_H(reg, ey) \ + { \ + if (ey > 4) \ + GEMM_SET_C_BIAS_X(0, reg[1]); \ + if (ey > 3) \ + 
GEMM_SET_C_BIAS_X(0, reg[2]); \ + if (ey > 2) \ + GEMM_SET_C_BIAS_X(0, reg[3]); \ + if (ey > 1) \ + GEMM_SET_C_BIAS_X(0, reg[4]); \ + GEMM_SET_C_BIAS_X(0, reg[5]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_W(reg, ex) \ + { \ + GEMM_SET_C_EDGE_ZERO_X(reg[0], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[1], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[2], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[3], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[4], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[5], ex); \ + } + +#define GEMM_CALCORE(a, b, c) \ + { \ + GEMM_CALCORE_X(a[0], b, c[0]); \ + GEMM_CALCORE_X(a[1], b, c[1]); \ + GEMM_CALCORE_X(a[2], b, c[2]); \ + GEMM_CALCORE_X(a[3], b, c[3]); \ + GEMM_CALCORE_X(a[4], b, c[4]); \ + GEMM_CALCORE_X(a[5], b, c[5]); \ + } + +#define GEMM_MUL_C(a, b, reg) \ + { \ + GEMM_MUL_C_X(a, b, reg[0]); \ + GEMM_MUL_C_X(a, b, reg[1]); \ + GEMM_MUL_C_X(a, b, reg[2]); \ + GEMM_MUL_C_X(a, b, reg[3]); \ + GEMM_MUL_C_X(a, b, reg[4]); \ + GEMM_MUL_C_X(a, b, reg[5]); \ + } + +#define GEMM_STORE_C(v, off, str, buf) \ + { \ + GEMM_STORE_C_X(v[0], off, buf); \ + GEMM_STORE_C_X(v[1], off + str, buf); \ + GEMM_STORE_C_X(v[2], off + str * 2, buf); \ + GEMM_STORE_C_X(v[3], off + str * 3, buf); \ + GEMM_STORE_C_X(v[4], off + str * 4, buf); \ + GEMM_STORE_C_X(v[5], off + str * 5, buf); \ + } + +#define GEMM_NT_LOAD_A(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + READ_BUF(v[1], off + str, buf); \ + READ_BUF(v[2], off + str * 2, buf); \ + READ_BUF(v[3], off + str * 3, buf); \ + READ_BUF(v[4], off + str * 4, buf); \ + READ_BUF(v[5], off + str * 5, buf); \ + } +#elif (LM == 7) +#define GEMM_LOAD_A(v, off, buf) \ + { \ + LOAD_BUF_ARRAY7(v, off, buf); \ + } + +#define GEMM_SET_C_BIAS(v, reg) \ + { \ + GEMM_SET_C_BIAS_X(v[0], reg[0]); \ + GEMM_SET_C_BIAS_X(v[1], reg[1]); \ + GEMM_SET_C_BIAS_X(v[2], reg[2]); \ + GEMM_SET_C_BIAS_X(v[3], reg[3]); \ + GEMM_SET_C_BIAS_X(v[4], reg[4]); \ + GEMM_SET_C_BIAS_X(v[5], reg[5]); \ + GEMM_SET_C_BIAS_X(v[6], reg[6]); \ + } + +#define GEMM_SET_C_ZERO(reg) \ + { \ + GEMM_SET_C_BIAS_X(0, reg[0]); \ + GEMM_SET_C_BIAS_X(0, reg[1]); \ + GEMM_SET_C_BIAS_X(0, reg[2]); \ + GEMM_SET_C_BIAS_X(0, reg[3]); \ + GEMM_SET_C_BIAS_X(0, reg[4]); \ + GEMM_SET_C_BIAS_X(0, reg[5]); \ + GEMM_SET_C_BIAS_X(0, reg[6]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_H(reg, ey) \ + { \ + if (ey > 5) \ + GEMM_SET_C_BIAS_X(0, reg[1]); \ + if (ey > 4) \ + GEMM_SET_C_BIAS_X(0, reg[2]); \ + if (ey > 3) \ + GEMM_SET_C_BIAS_X(0, reg[3]); \ + if (ey > 2) \ + GEMM_SET_C_BIAS_X(0, reg[4]); \ + if (ey > 1) \ + GEMM_SET_C_BIAS_X(0, reg[5]); \ + GEMM_SET_C_BIAS_X(0, reg[6]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_W(reg, ex) \ + { \ + GEMM_SET_C_EDGE_ZERO_X(reg[0], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[1], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[2], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[3], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[4], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[5], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[6], ex); \ + } + +#define GEMM_CALCORE(a, b, c) \ + { \ + GEMM_CALCORE_X(a[0], b, c[0]); \ + GEMM_CALCORE_X(a[1], b, c[1]); \ + GEMM_CALCORE_X(a[2], b, c[2]); \ + GEMM_CALCORE_X(a[3], b, c[3]); \ + GEMM_CALCORE_X(a[4], b, c[4]); \ + GEMM_CALCORE_X(a[5], b, c[5]); \ + GEMM_CALCORE_X(a[6], b, c[6]); \ + } + +#define GEMM_MUL_C(a, b, reg) \ + { \ + GEMM_MUL_C_X(a, b, reg[0]); \ + GEMM_MUL_C_X(a, b, reg[1]); \ + GEMM_MUL_C_X(a, b, reg[2]); \ + GEMM_MUL_C_X(a, b, reg[3]); \ + GEMM_MUL_C_X(a, b, reg[4]); \ + GEMM_MUL_C_X(a, b, reg[5]); \ + GEMM_MUL_C_X(a, b, reg[6]); \ + } + +#define GEMM_STORE_C(v, off, str, buf) \ + { \ + GEMM_STORE_C_X(v[0], off, 
buf); \ + GEMM_STORE_C_X(v[1], off + str, buf); \ + GEMM_STORE_C_X(v[2], off + str * 2, buf); \ + GEMM_STORE_C_X(v[3], off + str * 3, buf); \ + GEMM_STORE_C_X(v[4], off + str * 4, buf); \ + GEMM_STORE_C_X(v[5], off + str * 5, buf); \ + GEMM_STORE_C_X(v[6], off + str * 6, buf); \ + } + +#define GEMM_NT_LOAD_A(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + READ_BUF(v[1], off + str, buf); \ + READ_BUF(v[2], off + str * 2, buf); \ + READ_BUF(v[3], off + str * 3, buf); \ + READ_BUF(v[4], off + str * 4, buf); \ + READ_BUF(v[5], off + str * 5, buf); \ + READ_BUF(v[6], off + str * 6, buf); \ + } +#elif (LM == 8) +#define GEMM_LOAD_A(v, off, buf) \ + { \ + LOAD_BUF_ARRAY8(v, off, buf); \ + } + +#define GEMM_SET_C_BIAS(v, reg) \ + { \ + GEMM_SET_C_BIAS_X(v[0], reg[0]); \ + GEMM_SET_C_BIAS_X(v[1], reg[1]); \ + GEMM_SET_C_BIAS_X(v[2], reg[2]); \ + GEMM_SET_C_BIAS_X(v[3], reg[3]); \ + GEMM_SET_C_BIAS_X(v[4], reg[4]); \ + GEMM_SET_C_BIAS_X(v[5], reg[5]); \ + GEMM_SET_C_BIAS_X(v[6], reg[6]); \ + GEMM_SET_C_BIAS_X(v[7], reg[7]); \ + } + +#define GEMM_SET_C_ZERO(reg) \ + { \ + GEMM_SET_C_BIAS_X(0, reg[0]); \ + GEMM_SET_C_BIAS_X(0, reg[1]); \ + GEMM_SET_C_BIAS_X(0, reg[2]); \ + GEMM_SET_C_BIAS_X(0, reg[3]); \ + GEMM_SET_C_BIAS_X(0, reg[4]); \ + GEMM_SET_C_BIAS_X(0, reg[5]); \ + GEMM_SET_C_BIAS_X(0, reg[6]); \ + GEMM_SET_C_BIAS_X(0, reg[7]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_H(reg, ey) \ + { \ + if (ey > 6) \ + GEMM_SET_C_BIAS_X(0, reg[1]); \ + if (ey > 5) \ + GEMM_SET_C_BIAS_X(0, reg[2]); \ + if (ey > 4) \ + GEMM_SET_C_BIAS_X(0, reg[3]); \ + if (ey > 3) \ + GEMM_SET_C_BIAS_X(0, reg[4]); \ + if (ey > 2) \ + GEMM_SET_C_BIAS_X(0, reg[5]); \ + if (ey > 1) \ + GEMM_SET_C_BIAS_X(0, reg[6]); \ + GEMM_SET_C_BIAS_X(0, reg[7]); \ + } + +#define GEMM_SET_C_EDGE_ZERO_W(reg, ex) \ + { \ + GEMM_SET_C_EDGE_ZERO_X(reg[0], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[1], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[2], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[3], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[4], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[5], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[6], ex); \ + GEMM_SET_C_EDGE_ZERO_X(reg[7], ex); \ + } + +#define ADD_ELTWISE_NCHW(v, off, str, buf) \ + { \ + ADD_ELTWISE_NCHW_X(v[0], off, buf); \ + ADD_ELTWISE_NCHW_X(v[1], off + str, buf); \ + ADD_ELTWISE_NCHW_X(v[2], off + str * 2, buf); \ + ADD_ELTWISE_NCHW_X(v[3], off + str * 3, buf); \ + ADD_ELTWISE_NCHW_X(v[4], off + str * 4, buf); \ + ADD_ELTWISE_NCHW_X(v[5], off + str * 5, buf); \ + ADD_ELTWISE_NCHW_X(v[6], off + str * 6, buf); \ + ADD_ELTWISE_NCHW_X(v[7], off + str * 7, buf); \ + } + +#define GEMM_CALCORE(a, b, c) \ + { \ + GEMM_CALCORE_X(a[0], b, c[0]); \ + GEMM_CALCORE_X(a[1], b, c[1]); \ + GEMM_CALCORE_X(a[2], b, c[2]); \ + GEMM_CALCORE_X(a[3], b, c[3]); \ + GEMM_CALCORE_X(a[4], b, c[4]); \ + GEMM_CALCORE_X(a[5], b, c[5]); \ + GEMM_CALCORE_X(a[6], b, c[6]); \ + GEMM_CALCORE_X(a[7], b, c[7]); \ + } + +#define GEMM_MUL_C(a, b, reg) \ + { \ + GEMM_MUL_C_X(a, b, reg[0]); \ + GEMM_MUL_C_X(a, b, reg[1]); \ + GEMM_MUL_C_X(a, b, reg[2]); \ + GEMM_MUL_C_X(a, b, reg[3]); \ + GEMM_MUL_C_X(a, b, reg[4]); \ + GEMM_MUL_C_X(a, b, reg[5]); \ + GEMM_MUL_C_X(a, b, reg[6]); \ + GEMM_MUL_C_X(a, b, reg[7]); \ + } + +#define GEMM_STORE_C(v, off, str, buf) \ + { \ + GEMM_STORE_C_X(v[0], off, buf); \ + GEMM_STORE_C_X(v[1], off + str, buf); \ + GEMM_STORE_C_X(v[2], off + str * 2, buf); \ + GEMM_STORE_C_X(v[3], off + str * 3, buf); \ + GEMM_STORE_C_X(v[4], off + str * 4, buf); \ + GEMM_STORE_C_X(v[5], off + str * 5, buf); \ + GEMM_STORE_C_X(v[6], off + str * 6, buf); \ 
+ GEMM_STORE_C_X(v[7], off + str * 7, buf); \ + } + +#define GEMM_NT_LOAD_A(v, off, str, buf) \ + { \ + READ_BUF(v[0], off, buf); \ + READ_BUF(v[1], off + str, buf); \ + READ_BUF(v[2], off + str * 2, buf); \ + READ_BUF(v[3], off + str * 3, buf); \ + READ_BUF(v[4], off + str * 4, buf); \ + READ_BUF(v[5], off + str * 5, buf); \ + READ_BUF(v[6], off + str * 6, buf); \ + READ_BUF(v[7], off + str * 7, buf); \ + } +#endif + +/* + * UPDATE VALUE OF REG + */ +#if (UN == 0) +#define UPDATE_REG(A) \ + {} +#elif (UN == 1) +#define UPDATE_REG(A) \ + { \ + A[0] = A[1]; \ + } +#elif (UN == 2) +#define UPDATE_REG(A) \ + { \ + A[0] = A[1]; \ + A[1] = A[2]; \ + } +#elif (UN == 3) +#define UPDATE_REG(A) \ + { \ + A[0] = A[1]; \ + A[1] = A[2]; \ + A[2] = A[3]; \ + } +#elif (UN == 4) +#define UPDATE_REG(A) \ + { \ + A[0] = A[1]; \ + A[1] = A[2]; \ + A[2] = A[3]; \ + A[3] = A[4]; \ + } +#elif (UN == 5) +#define UPDATE_REG(A) \ + { \ + A[0] = A[1]; \ + A[1] = A[2]; \ + A[2] = A[3]; \ + A[3] = A[4]; \ + A[4] = A[5]; \ + } +#elif (UN == 6) +#define UPDATE_REG(A) \ + { \ + A[0] = A[1]; \ + A[1] = A[2]; \ + A[2] = A[3]; \ + A[3] = A[4]; \ + A[4] = A[5]; \ + A[5] = A[6]; \ + } +#elif (UN == 7) +#define UPDATE_REG(A) \ + { \ + A[0] = A[1]; \ + A[1] = A[2]; \ + A[2] = A[3]; \ + A[3] = A[4]; \ + A[4] = A[5]; \ + A[5] = A[6]; \ + A[6] = A[7]; \ + } +#elif (UN == 8) +#define UPDATE_REG(A) \ + { \ + A[0] = A[1]; \ + A[1] = A[2]; \ + A[2] = A[3]; \ + A[3] = A[4]; \ + A[4] = A[5]; \ + A[5] = A[6]; \ + A[6] = A[7]; \ + A[7] = A[8]; \ + } +#elif (UN == 9) +#define UPDATE_REG(A) \ + { \ + A[0] = A[1]; \ + A[1] = A[2]; \ + A[2] = A[3]; \ + A[3] = A[4]; \ + A[4] = A[5]; \ + A[5] = A[6]; \ + A[6] = A[7]; \ + A[7] = A[8]; \ + A[8] = A[9]; \ + } +#elif (UN == 10) +#define UPDATE_REG(A) \ + { \ + A[0] = A[1]; \ + A[1] = A[2]; \ + A[2] = A[3]; \ + A[3] = A[4]; \ + A[4] = A[5]; \ + A[5] = A[6]; \ + A[6] = A[7]; \ + A[7] = A[8]; \ + A[8] = A[9]; \ + A[9] = A[10]; \ + } +#elif (UN == 11) +#define UPDATE_REG(A) \ + { \ + A[0] = A[1]; \ + A[1] = A[2]; \ + A[2] = A[3]; \ + A[3] = A[4]; \ + A[4] = A[5]; \ + A[5] = A[6]; \ + A[6] = A[7]; \ + A[7] = A[8]; \ + A[8] = A[9]; \ + A[9] = A[10]; \ + A[10] = A[11]; \ + } +#elif (UN == 12) +#define UPDATE_REG(A) \ + { \ + A[0] = A[1]; \ + A[1] = A[2]; \ + A[2] = A[3]; \ + A[3] = A[4]; \ + A[4] = A[5]; \ + A[5] = A[6]; \ + A[6] = A[7]; \ + A[7] = A[8]; \ + A[8] = A[9]; \ + A[9] = A[10]; \ + A[10] = A[11]; \ + A[11] = A[12]; \ + } +#endif + +/* + * Direct convolution calculate core + * Depthwise calculate core + */ +#if (ON == 1) +#define DIRECT_CONV_CAL_CORE_S1(A, B, C) \ + { \ + DOT_A4B16C4(A[0], B, C[0]); \ + } +#define DIRECT_CONV_CAL_CORE_S2(A, B, C) \ + { \ + DOT_A4B16C4(A[0], B, C[0]); \ + } +#define DEPTHWISE_CAL_CORE_S1(A, B, C) \ + { \ + DOT_A4B4C4(A[0], B, C[0]); \ + } +#define DEPTHWISE_CAL_CORE_S2(A, B, C) \ + { \ + DOT_A4B4C4(A[0], B, C[0]); \ + } +#elif (ON == 2) +#define DIRECT_CONV_CAL_CORE_S1(A, B, C) \ + { \ + DOT_A4B16C4(A[0], B, C[0]); \ + DOT_A4B16C4(A[1], B, C[1]); \ + } +#define DIRECT_CONV_CAL_CORE_S2(A, B, C) \ + { \ + DOT_A4B16C4(A[0], B, C[0]); \ + DOT_A4B16C4(A[2], B, C[1]); \ + } +#define DEPTHWISE_CAL_CORE_S1(A, B, C) \ + { \ + DOT_A4B4C4(A[0], B, C[0]); \ + DOT_A4B4C4(A[1], B, C[1]); \ + } +#define DEPTHWISE_CAL_CORE_S2(A, B, C) \ + { \ + DOT_A4B4C4(A[0], B, C[0]); \ + DOT_A4B4C4(A[2], B, C[1]); \ + } +#elif (ON == 3) +#define DIRECT_CONV_CAL_CORE_S1(A, B, C) \ + { \ + DOT_A4B16C4(A[0], B, C[0]); \ + DOT_A4B16C4(A[1], B, C[1]); \ + DOT_A4B16C4(A[2], B, C[2]); \ + } 
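+// Editorial note, a sketch and not upstream text: the ON-indexed macro
+// families here unroll the convolution inner loop at compile time. Built with
+// -DON=3 and stride 1, a call such as
+//     DIRECT_CONV_CAL_CORE_S1(in_val, flt_val, out_val);
+// expands to one dot-product accumulation per output position:
+//     DOT_A4B16C4(in_val[0], flt_val, out_val[0]);
+//     DOT_A4B16C4(in_val[1], flt_val, out_val[1]);
+//     DOT_A4B16C4(in_val[2], flt_val, out_val[2]);
+// The S2 variants read the input registers with stride 2 (A[0], A[2], A[4])
+// but write the outputs sequentially, so a stride-2 core needs 2 * ON - 1
+// input registers. The register names above are illustrative assumptions.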
+#define DIRECT_CONV_CAL_CORE_S2(A, B, C) \
+    { \
+        DOT_A4B16C4(A[0], B, C[0]); \
+        DOT_A4B16C4(A[2], B, C[1]); \
+        DOT_A4B16C4(A[4], B, C[2]); \
+    }
+#define DEPTHWISE_CAL_CORE_S1(A, B, C) \
+    { \
+        DOT_A4B4C4(A[0], B, C[0]); \
+        DOT_A4B4C4(A[1], B, C[1]); \
+        DOT_A4B4C4(A[2], B, C[2]); \
+    }
+#define DEPTHWISE_CAL_CORE_S2(A, B, C) \
+    { \
+        DOT_A4B4C4(A[0], B, C[0]); \
+        DOT_A4B4C4(A[2], B, C[1]); \
+        DOT_A4B4C4(A[4], B, C[2]); \
+    }
+#elif (ON == 4)
+#define DIRECT_CONV_CAL_CORE_S1(A, B, C) \
+    { \
+        DOT_A4B16C4(A[0], B, C[0]); \
+        DOT_A4B16C4(A[1], B, C[1]); \
+        DOT_A4B16C4(A[2], B, C[2]); \
+        DOT_A4B16C4(A[3], B, C[3]); \
+    }
+#define DIRECT_CONV_CAL_CORE_S2(A, B, C) \
+    { \
+        DOT_A4B16C4(A[0], B, C[0]); \
+        DOT_A4B16C4(A[2], B, C[1]); \
+        DOT_A4B16C4(A[4], B, C[2]); \
+        DOT_A4B16C4(A[6], B, C[3]); \
+    }
+#define DEPTHWISE_CAL_CORE_S1(A, B, C) \
+    { \
+        DOT_A4B4C4(A[0], B, C[0]); \
+        DOT_A4B4C4(A[1], B, C[1]); \
+        DOT_A4B4C4(A[2], B, C[2]); \
+        DOT_A4B4C4(A[3], B, C[3]); \
+    }
+#define DEPTHWISE_CAL_CORE_S2(A, B, C) \
+    { \
+        DOT_A4B4C4(A[0], B, C[0]); \
+        DOT_A4B4C4(A[2], B, C[1]); \
+        DOT_A4B4C4(A[4], B, C[2]); \
+        DOT_A4B4C4(A[6], B, C[3]); \
+    }
+#elif (ON == 5)
+#define DIRECT_CONV_CAL_CORE_S1(A, B, C) \
+    { \
+        DOT_A4B16C4(A[0], B, C[0]); \
+        DOT_A4B16C4(A[1], B, C[1]); \
+        DOT_A4B16C4(A[2], B, C[2]); \
+        DOT_A4B16C4(A[3], B, C[3]); \
+        DOT_A4B16C4(A[4], B, C[4]); \
+    }
+#define DEPTHWISE_CAL_CORE_S1(A, B, C) \
+    { \
+        DOT_A4B4C4(A[0], B, C[0]); \
+        DOT_A4B4C4(A[1], B, C[1]); \
+        DOT_A4B4C4(A[2], B, C[2]); \
+        DOT_A4B4C4(A[3], B, C[3]); \
+        DOT_A4B4C4(A[4], B, C[4]); \
+    }
+#elif (ON == 6)
+#define DIRECT_CONV_CAL_CORE_S1(A, B, C) \
+    { \
+        DOT_A4B16C4(A[0], B, C[0]); \
+        DOT_A4B16C4(A[1], B, C[1]); \
+        DOT_A4B16C4(A[2], B, C[2]); \
+        DOT_A4B16C4(A[3], B, C[3]); \
+        DOT_A4B16C4(A[4], B, C[4]); \
+        DOT_A4B16C4(A[5], B, C[5]); \
+    }
+#define DEPTHWISE_CAL_CORE_S1(A, B, C) \
+    { \
+        DOT_A4B4C4(A[0], B, C[0]); \
+        DOT_A4B4C4(A[1], B, C[1]); \
+        DOT_A4B4C4(A[2], B, C[2]); \
+        DOT_A4B4C4(A[3], B, C[3]); \
+        DOT_A4B4C4(A[4], B, C[4]); \
+        DOT_A4B4C4(A[5], B, C[5]); \
+    }
+#elif (ON == 7)
+#define DIRECT_CONV_CAL_CORE_S1(A, B, C) \
+    { \
+        DOT_A4B16C4(A[0], B, C[0]); \
+        DOT_A4B16C4(A[1], B, C[1]); \
+        DOT_A4B16C4(A[2], B, C[2]); \
+        DOT_A4B16C4(A[3], B, C[3]); \
+        DOT_A4B16C4(A[4], B, C[4]); \
+        DOT_A4B16C4(A[5], B, C[5]); \
+        DOT_A4B16C4(A[6], B, C[6]); \
+    }
+#define DEPTHWISE_CAL_CORE_S1(A, B, C) \
+    { \
+        DOT_A4B4C4(A[0], B, C[0]); \
+        DOT_A4B4C4(A[1], B, C[1]); \
+        DOT_A4B4C4(A[2], B, C[2]); \
+        DOT_A4B4C4(A[3], B, C[3]); \
+        DOT_A4B4C4(A[4], B, C[4]); \
+        DOT_A4B4C4(A[5], B, C[5]); \
+        DOT_A4B4C4(A[6], B, C[6]); \
+    }
+#elif (ON == 8)
+#define DIRECT_CONV_CAL_CORE_S1(A, B, C) \
+    { \
+        DOT_A4B16C4(A[0], B, C[0]); \
+        DOT_A4B16C4(A[1], B, C[1]); \
+        DOT_A4B16C4(A[2], B, C[2]); \
+        DOT_A4B16C4(A[3], B, C[3]); \
+        DOT_A4B16C4(A[4], B, C[4]); \
+        DOT_A4B16C4(A[5], B, C[5]); \
+        DOT_A4B16C4(A[6], B, C[6]); \
+        DOT_A4B16C4(A[7], B, C[7]); \
+    }
+#define DEPTHWISE_CAL_CORE_S1(A, B, C) \
+    { \
+        DOT_A4B4C4(A[0], B, C[0]); \
+        DOT_A4B4C4(A[1], B, C[1]); \
+        DOT_A4B4C4(A[2], B, C[2]); \
+        DOT_A4B4C4(A[3], B, C[3]); \
+        DOT_A4B4C4(A[4], B, C[4]); \
+        DOT_A4B4C4(A[5], B, C[5]); \
+        DOT_A4B4C4(A[6], B, C[6]); \
+        DOT_A4B4C4(A[7], B, C[7]); \
+    }
+#endif
+
+/*
+ * STORE_OUTPUT_BUF_ARRAY_V4 WITH ACTIVATION
+ */
+#if (ON == 1)
+#define STORE_OUTPUT_BUF_ARRAY_V4(V, off, str, id, bd, buf) \
+    { \
+        ACTIVATION_V4(V[0]); \
+        vstore4(V[0], off, buf); \
+    }
+
+#define ADD_ELTWISE_BUF_ARRAY_V4(V, off,
str, buf) \ + { \ + V[0] += vload4(off, buf); \ + } + +#define STORE_OUTPUT_BUF_ARRAY_V4_NCWH(V, off, str_h, str_hw, id, bd, buf) \ + { \ + ACTIVATION_V4(V[0]); \ + buf[off] = V[0].x; \ + buf[off + str_hw] = V[0].y; \ + buf[off + str_hw * 2] = V[0].z; \ + buf[off + str_hw * 3] = V[0].w; \ + } + +#define STORE_OUTPUT_BUF_ARRAY_ALIGN(val, off, str, out) \ + { \ + out[off] = val[0]; \ + } + +#define SET_REG_ARRAY(v, reg) \ + { \ + SET_REG_ARRAY1(v, reg); \ + } +#elif (ON == 2) +#define STORE_OUTPUT_BUF_ARRAY_V4(V, off, str, id, bd, buf) \ + { \ + ACTIVATION_V4(V[0]); \ + ACTIVATION_V4(V[1]); \ + vstore4(V[0], off, buf); \ + if (id + 1 < bd) \ + vstore4(V[1], off + str, buf); \ + } + +#define ADD_ELTWISE_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] += vload4(off, buf); \ + V[1] += vload4(off + str, buf); \ + } + +#define STORE_OUTPUT_BUF_ARRAY_V4_NCWH(V, off, str_h, str_hw, id, bd, buf) \ + { \ + ACTIVATION_V4(V[0]); \ + ACTIVATION_V4(V[1]); \ + buf[off] = V[0].x; \ + buf[off + str_hw] = V[0].y; \ + buf[off + str_hw * 2] = V[0].z; \ + buf[off + str_hw * 3] = V[0].w; \ + if (id + 1 < bd) { \ + buf[off + str_h] = V[1].x; \ + buf[off + str_h + str_hw] = V[1].y; \ + buf[off + str_h + str_hw * 2] = V[1].z; \ + buf[off + str_h + str_hw * 3] = V[1].w; \ + } \ + } + +#define STORE_OUTPUT_BUF_ARRAY_ALIGN(val, off, str, out) \ + { \ + out[off] = val[0]; \ + out[off + str] = val[1]; \ + } + +#define SET_REG_ARRAY(v, reg) \ + { \ + SET_REG_ARRAY2(v, reg); \ + } +#elif (ON == 3) +#define STORE_OUTPUT_BUF_ARRAY_V4(V, off, str, id, bd, buf) \ + { \ + ACTIVATION_V4(V[0]); \ + ACTIVATION_V4(V[1]); \ + ACTIVATION_V4(V[2]); \ + vstore4(V[0], off, buf); \ + if (id + 1 < bd) \ + vstore4(V[1], off + str, buf); \ + if (id + 2 < bd) \ + vstore4(V[2], off + str * 2, buf); \ + } + +#define ADD_ELTWISE_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] += vload4(off, buf); \ + V[1] += vload4(off + str, buf); \ + V[2] += vload4(off + str * 2, buf); \ + } + +#define STORE_OUTPUT_BUF_ARRAY_V4_NCWH(V, off, str_h, str_hw, id, bd, buf) \ + { \ + ACTIVATION_V4(V[0]); \ + ACTIVATION_V4(V[1]); \ + ACTIVATION_V4(V[2]); \ + buf[off] = V[0].x; \ + buf[off + str_hw] = V[0].y; \ + buf[off + str_hw * 2] = V[0].z; \ + buf[off + str_hw * 3] = V[0].w; \ + if (id + 1 < bd) { \ + buf[off + str_h] = V[1].x; \ + buf[off + str_h + str_hw] = V[1].y; \ + buf[off + str_h + str_hw * 2] = V[1].z; \ + buf[off + str_h + str_hw * 3] = V[1].w; \ + } \ + if (id + 2 < bd) { \ + buf[off + str_h * 2] = V[2].x; \ + buf[off + str_h * 2 + str_hw] = V[2].y; \ + buf[off + str_h * 2 + str_hw * 2] = V[2].z; \ + buf[off + str_h * 2 + str_hw * 3] = V[2].w; \ + } \ + } + +#define STORE_OUTPUT_BUF_ARRAY_ALIGN(val, off, str, out) \ + { \ + out[off] = val[0]; \ + out[off + str] = val[1]; \ + out[off + str * 2] = val[2]; \ + } + +#define SET_REG_ARRAY(v, reg) \ + { \ + SET_REG_ARRAY3(v, reg); \ + } +#elif (ON == 4) +#define STORE_OUTPUT_BUF_ARRAY_V4(V, off, str, id, bd, buf) \ + { \ + ACTIVATION_V4(V[0]); \ + ACTIVATION_V4(V[1]); \ + ACTIVATION_V4(V[2]); \ + ACTIVATION_V4(V[3]); \ + vstore4(V[0], off, buf); \ + if (id + 1 < bd) \ + vstore4(V[1], off + str, buf); \ + if (id + 2 < bd) \ + vstore4(V[2], off + str * 2, buf); \ + if (id + 3 < bd) \ + vstore4(V[3], off + str * 3, buf); \ + } + +#define ADD_ELTWISE_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] += vload4(off, buf); \ + V[1] += vload4(off + str, buf); \ + V[2] += vload4(off + str * 2, buf); \ + V[3] += vload4(off + str * 3, buf); \ + } + +#define STORE_OUTPUT_BUF_ARRAY_V4_NCWH(V, off, str_h, str_hw, id, bd, buf) \ 
+ { \ + ACTIVATION_V4(V[0]); \ + ACTIVATION_V4(V[1]); \ + ACTIVATION_V4(V[2]); \ + ACTIVATION_V4(V[3]); \ + buf[off] = V[0].x; \ + buf[off + str_hw] = V[0].y; \ + buf[off + str_hw * 2] = V[0].z; \ + buf[off + str_hw * 3] = V[0].w; \ + if (id + 1 < bd) { \ + buf[off + str_h] = V[1].x; \ + buf[off + str_h + str_hw] = V[1].y; \ + buf[off + str_h + str_hw * 2] = V[1].z; \ + buf[off + str_h + str_hw * 3] = V[1].w; \ + } \ + if (id + 2 < bd) { \ + buf[off + str_h * 2] = V[2].x; \ + buf[off + str_h * 2 + str_hw] = V[2].y; \ + buf[off + str_h * 2 + str_hw * 2] = V[2].z; \ + buf[off + str_h * 2 + str_hw * 3] = V[2].w; \ + } \ + if (id + 3 < bd) { \ + buf[off + str_h * 3] = V[3].x; \ + buf[off + str_h * 3 + str_hw] = V[3].y; \ + buf[off + str_h * 3 + str_hw * 2] = V[3].z; \ + buf[off + str_h * 3 + str_hw * 3] = V[3].w; \ + } \ + } + +#define STORE_OUTPUT_BUF_ARRAY_ALIGN(val, off, str, out) \ + { \ + out[off] = val[0]; \ + out[off + str] = val[1]; \ + out[off + str * 2] = val[2]; \ + out[off + str * 3] = val[3]; \ + } + +#define SET_REG_ARRAY(v, reg) \ + { \ + SET_REG_ARRAY4(v, reg); \ + } +#elif (ON == 5) +#define STORE_OUTPUT_BUF_ARRAY_V4(V, off, str, id, bd, buf) \ + { \ + ACTIVATION_V4(V[0]); \ + ACTIVATION_V4(V[1]); \ + ACTIVATION_V4(V[2]); \ + ACTIVATION_V4(V[3]); \ + ACTIVATION_V4(V[4]); \ + vstore4(V[0], off, buf); \ + if (id + 1 < bd) \ + vstore4(V[1], off + str, buf); \ + if (id + 2 < bd) \ + vstore4(V[2], off + str * 2, buf); \ + if (id + 3 < bd) \ + vstore4(V[3], off + str * 3, buf); \ + if (id + 4 < bd) \ + vstore4(V[4], off + str * 4, buf); \ + } + +#define ADD_ELTWISE_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] += vload4(off, buf); \ + V[1] += vload4(off + str, buf); \ + V[2] += vload4(off + str * 2, buf); \ + V[3] += vload4(off + str * 3, buf); \ + V[4] += vload4(off + str * 4, buf); \ + } + +#define STORE_OUTPUT_BUF_ARRAY_V4_NCWH(V, off, str_h, str_hw, id, bd, buf) \ + { \ + ACTIVATION_V4(V[0]); \ + ACTIVATION_V4(V[1]); \ + ACTIVATION_V4(V[2]); \ + ACTIVATION_V4(V[3]); \ + ACTIVATION_V4(V[4]); \ + buf[off] = V[0].x; \ + buf[off + str_hw] = V[0].y; \ + buf[off + str_hw * 2] = V[0].z; \ + buf[off + str_hw * 3] = V[0].w; \ + if (id + 1 < bd) { \ + buf[off + str_h] = V[1].x; \ + buf[off + str_h + str_hw] = V[1].y; \ + buf[off + str_h + str_hw * 2] = V[1].z; \ + buf[off + str_h + str_hw * 3] = V[1].w; \ + } \ + if (id + 2 < bd) { \ + buf[off + str_h * 2] = V[2].x; \ + buf[off + str_h * 2 + str_hw] = V[2].y; \ + buf[off + str_h * 2 + str_hw * 2] = V[2].z; \ + buf[off + str_h * 2 + str_hw * 3] = V[2].w; \ + } \ + if (id + 3 < bd) { \ + buf[off + str_h * 3] = V[3].x; \ + buf[off + str_h * 3 + str_hw] = V[3].y; \ + buf[off + str_h * 3 + str_hw * 2] = V[3].z; \ + buf[off + str_h * 3 + str_hw * 3] = V[3].w; \ + } \ + if (id + 4 < bd) { \ + buf[off + str_h * 4] = V[4].x; \ + buf[off + str_h * 4 + str_hw] = V[4].y; \ + buf[off + str_h * 4 + str_hw * 2] = V[4].z; \ + buf[off + str_h * 4 + str_hw * 3] = V[4].w; \ + } \ + } + +#define SET_REG_ARRAY(v, reg) \ + { \ + SET_REG_ARRAY5(v, reg); \ + } +#elif (ON == 6) +#define STORE_OUTPUT_BUF_ARRAY_V4(V, off, str, id, bd, buf) \ + { \ + ACTIVATION_V4(V[0]); \ + ACTIVATION_V4(V[1]); \ + ACTIVATION_V4(V[2]); \ + ACTIVATION_V4(V[3]); \ + ACTIVATION_V4(V[4]); \ + ACTIVATION_V4(V[5]); \ + vstore4(V[0], off, buf); \ + if (id + 1 < bd) \ + vstore4(V[1], off + str, buf); \ + if (id + 2 < bd) \ + vstore4(V[2], off + str * 2, buf); \ + if (id + 3 < bd) \ + vstore4(V[3], off + str * 3, buf); \ + if (id + 4 < bd) \ + vstore4(V[4], off + str * 4, buf); \ + if (id + 
5 < bd) \ + vstore4(V[5], off + str * 5, buf); \ + } + +#define ADD_ELTWISE_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] += vload4(off, buf); \ + V[1] += vload4(off + str, buf); \ + V[2] += vload4(off + str * 2, buf); \ + V[3] += vload4(off + str * 3, buf); \ + V[4] += vload4(off + str * 4, buf); \ + V[5] += vload4(off + str * 5, buf); \ + } + +#define STORE_OUTPUT_BUF_ARRAY_V4_NCWH(V, off, str_h, str_hw, id, bd, buf) \ + { \ + ACTIVATION_V4(V[0]); \ + ACTIVATION_V4(V[1]); \ + ACTIVATION_V4(V[2]); \ + ACTIVATION_V4(V[3]); \ + ACTIVATION_V4(V[4]); \ + ACTIVATION_V4(V[5]); \ + buf[off] = V[0].x; \ + buf[off + str_hw] = V[0].y; \ + buf[off + str_hw * 2] = V[0].z; \ + buf[off + str_hw * 3] = V[0].w; \ + if (id + 1 < bd) { \ + buf[off + str_h] = V[1].x; \ + buf[off + str_h + str_hw] = V[1].y; \ + buf[off + str_h + str_hw * 2] = V[1].z; \ + buf[off + str_h + str_hw * 3] = V[1].w; \ + } \ + if (id + 2 < bd) { \ + buf[off + str_h * 2] = V[2].x; \ + buf[off + str_h * 2 + str_hw] = V[2].y; \ + buf[off + str_h * 2 + str_hw * 2] = V[2].z; \ + buf[off + str_h * 2 + str_hw * 3] = V[2].w; \ + } \ + if (id + 3 < bd) { \ + buf[off + str_h * 3] = V[3].x; \ + buf[off + str_h * 3 + str_hw] = V[3].y; \ + buf[off + str_h * 3 + str_hw * 2] = V[3].z; \ + buf[off + str_h * 3 + str_hw * 3] = V[3].w; \ + } \ + if (id + 4 < bd) { \ + buf[off + str_h * 4] = V[4].x; \ + buf[off + str_h * 4 + str_hw] = V[4].y; \ + buf[off + str_h * 4 + str_hw * 2] = V[4].z; \ + buf[off + str_h * 4 + str_hw * 3] = V[4].w; \ + } \ + if (id + 5 < bd) { \ + buf[off + str_h * 5] = V[5].x; \ + buf[off + str_h * 5 + str_hw] = V[5].y; \ + buf[off + str_h * 5 + str_hw * 2] = V[5].z; \ + buf[off + str_h * 5 + str_hw * 3] = V[5].w; \ + } \ + } + +#define SET_REG_ARRAY(v, reg) \ + { \ + SET_REG_ARRAY6(v, reg); \ + } +#elif (ON == 7) +#define STORE_OUTPUT_BUF_ARRAY_V4(V, off, str, id, bd, buf) \ + { \ + ACTIVATION_V4(V[0]); \ + ACTIVATION_V4(V[1]); \ + ACTIVATION_V4(V[2]); \ + ACTIVATION_V4(V[3]); \ + ACTIVATION_V4(V[4]); \ + ACTIVATION_V4(V[5]); \ + ACTIVATION_V4(V[6]); \ + vstore4(V[0], off, buf); \ + if (id + 1 < bd) \ + vstore4(V[1], off + str, buf); \ + if (id + 2 < bd) \ + vstore4(V[2], off + str * 2, buf); \ + if (id + 3 < bd) \ + vstore4(V[3], off + str * 3, buf); \ + if (id + 4 < bd) \ + vstore4(V[4], off + str * 4, buf); \ + if (id + 5 < bd) \ + vstore4(V[5], off + str * 5, buf); \ + if (id + 6 < bd) \ + vstore4(V[6], off + str * 6, buf); \ + } + +#define ADD_ELTWISE_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] += vload4(off, buf); \ + V[1] += vload4(off + str, buf); \ + V[2] += vload4(off + str * 2, buf); \ + V[3] += vload4(off + str * 3, buf); \ + V[4] += vload4(off + str * 4, buf); \ + V[5] += vload4(off + str * 5, buf); \ + V[6] += vload4(off + str * 6, buf); \ + } + +#define STORE_OUTPUT_BUF_ARRAY_V4_NCWH(V, off, str_h, str_hw, id, bd, buf) \ + { \ + ACTIVATION_V4(V[0]); \ + ACTIVATION_V4(V[1]); \ + ACTIVATION_V4(V[2]); \ + ACTIVATION_V4(V[3]); \ + ACTIVATION_V4(V[4]); \ + ACTIVATION_V4(V[5]); \ + ACTIVATION_V4(V[6]); \ + buf[off] = V[0].x; \ + buf[off + str_hw] = V[0].y; \ + buf[off + str_hw * 2] = V[0].z; \ + buf[off + str_hw * 3] = V[0].w; \ + if (id + 1 < bd) { \ + buf[off + str_h] = V[1].x; \ + buf[off + str_h + str_hw] = V[1].y; \ + buf[off + str_h + str_hw * 2] = V[1].z; \ + buf[off + str_h + str_hw * 3] = V[1].w; \ + } \ + if (id + 2 < bd) { \ + buf[off + str_h * 2] = V[2].x; \ + buf[off + str_h * 2 + str_hw] = V[2].y; \ + buf[off + str_h * 2 + str_hw * 2] = V[2].z; \ + buf[off + str_h * 2 + str_hw * 3] = V[2].w; \ + } \ + 
if (id + 3 < bd) { \ + buf[off + str_h * 3] = V[3].x; \ + buf[off + str_h * 3 + str_hw] = V[3].y; \ + buf[off + str_h * 3 + str_hw * 2] = V[3].z; \ + buf[off + str_h * 3 + str_hw * 3] = V[3].w; \ + } \ + if (id + 4 < bd) { \ + buf[off + str_h * 4] = V[4].x; \ + buf[off + str_h * 4 + str_hw] = V[4].y; \ + buf[off + str_h * 4 + str_hw * 2] = V[4].z; \ + buf[off + str_h * 4 + str_hw * 3] = V[4].w; \ + } \ + if (id + 5 < bd) { \ + buf[off + str_h * 5] = V[5].x; \ + buf[off + str_h * 5 + str_hw] = V[5].y; \ + buf[off + str_h * 5 + str_hw * 2] = V[5].z; \ + buf[off + str_h * 5 + str_hw * 3] = V[5].w; \ + } \ + if (id + 6 < bd) { \ + buf[off + str_h * 6] = V[6].x; \ + buf[off + str_h * 6 + str_hw] = V[6].y; \ + buf[off + str_h * 6 + str_hw * 2] = V[6].z; \ + buf[off + str_h * 6 + str_hw * 3] = V[6].w; \ + } \ + } + +#define SET_REG_ARRAY(v, reg) \ + { \ + SET_REG_ARRAY7(v, reg); \ + } +#elif (ON == 8) +#define STORE_OUTPUT_BUF_ARRAY_V4(V, off, str, id, bd, buf) \ + { \ + ACTIVATION_V4(V[0]); \ + ACTIVATION_V4(V[1]); \ + ACTIVATION_V4(V[2]); \ + ACTIVATION_V4(V[3]); \ + ACTIVATION_V4(V[4]); \ + ACTIVATION_V4(V[5]); \ + ACTIVATION_V4(V[6]); \ + ACTIVATION_V4(V[7]); \ + vstore4(V[0], off, buf); \ + if (id + 1 < bd) \ + vstore4(V[1], off + str, buf); \ + if (id + 2 < bd) \ + vstore4(V[2], off + str * 2, buf); \ + if (id + 3 < bd) \ + vstore4(V[3], off + str * 3, buf); \ + if (id + 4 < bd) \ + vstore4(V[4], off + str * 4, buf); \ + if (id + 5 < bd) \ + vstore4(V[5], off + str * 5, buf); \ + if (id + 6 < bd) \ + vstore4(V[6], off + str * 6, buf); \ + if (id + 7 < bd) \ + vstore4(V[7], off + str * 7, buf); \ + } + +#define ADD_ELTWISE_BUF_ARRAY_V4(V, off, str, buf) \ + { \ + V[0] += vload4(off, buf); \ + V[1] += vload4(off + str, buf); \ + V[2] += vload4(off + str * 2, buf); \ + V[3] += vload4(off + str * 3, buf); \ + V[4] += vload4(off + str * 4, buf); \ + V[5] += vload4(off + str * 5, buf); \ + V[6] += vload4(off + str * 6, buf); \ + V[7] += vload4(off + str * 7, buf); \ + } + +#define STORE_OUTPUT_BUF_ARRAY_V4_NCWH(V, off, str_h, str_hw, id, bd, buf) \ + { \ + ACTIVATION_V4(V[0]); \ + ACTIVATION_V4(V[1]); \ + ACTIVATION_V4(V[2]); \ + ACTIVATION_V4(V[3]); \ + ACTIVATION_V4(V[4]); \ + ACTIVATION_V4(V[5]); \ + ACTIVATION_V4(V[6]); \ + ACTIVATION_V4(V[7]); \ + buf[off] = V[0].x; \ + buf[off + str_hw] = V[0].y; \ + buf[off + str_hw * 2] = V[0].z; \ + buf[off + str_hw * 3] = V[0].w; \ + if (id + 1 < bd) { \ + buf[off + str_h] = V[1].x; \ + buf[off + str_h + str_hw] = V[1].y; \ + buf[off + str_h + str_hw * 2] = V[1].z; \ + buf[off + str_h + str_hw * 3] = V[1].w; \ + } \ + if (id + 2 < bd) { \ + buf[off + str_h * 2] = V[2].x; \ + buf[off + str_h * 2 + str_hw] = V[2].y; \ + buf[off + str_h * 2 + str_hw * 2] = V[2].z; \ + buf[off + str_h * 2 + str_hw * 3] = V[2].w; \ + } \ + if (id + 3 < bd) { \ + buf[off + str_h * 3] = V[3].x; \ + buf[off + str_h * 3 + str_hw] = V[3].y; \ + buf[off + str_h * 3 + str_hw * 2] = V[3].z; \ + buf[off + str_h * 3 + str_hw * 3] = V[3].w; \ + } \ + if (id + 4 < bd) { \ + buf[off + str_h * 4] = V[4].x; \ + buf[off + str_h * 4 + str_hw] = V[4].y; \ + buf[off + str_h * 4 + str_hw * 2] = V[4].z; \ + buf[off + str_h * 4 + str_hw * 3] = V[4].w; \ + } \ + if (id + 5 < bd) { \ + buf[off + str_h * 5] = V[5].x; \ + buf[off + str_h * 5 + str_hw] = V[5].y; \ + buf[off + str_h * 5 + str_hw * 2] = V[5].z; \ + buf[off + str_h * 5 + str_hw * 3] = V[5].w; \ + } \ + if (id + 6 < bd) { \ + buf[off + str_h * 6] = V[6].x; \ + buf[off + str_h * 6 + str_hw] = V[6].y; \ + buf[off + str_h * 6 + str_hw * 2] = 
V[6].z; \ + buf[off + str_h * 6 + str_hw * 3] = V[6].w; \ + } \ + if (id + 7 < bd) { \ + buf[off + str_h * 7] = V[7].x; \ + buf[off + str_h * 7 + str_hw] = V[7].y; \ + buf[off + str_h * 7 + str_hw * 2] = V[7].z; \ + buf[off + str_h * 7 + str_hw * 3] = V[7].w; \ + } \ + } + +#define SET_REG_ARRAY(v, reg) \ + { \ + SET_REG_ARRAY8(v, reg); \ + } +#endif +#endif diff --git a/compute/tensor/src/gpu/mali/cl/mem_trans_3d_ncwhc4_to_nchw.cl b/compute/tensor/src/gpu/mali/cl/mem_trans_3d_ncwhc4_to_nchw.cl new file mode 100644 index 00000000..3a313cf7 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/mem_trans_3d_ncwhc4_to_nchw.cl @@ -0,0 +1,123 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +__kernel void mem_trans_3d_ncwhc4_to_nchw(const int iw_str, + const int ih_str, + const int iw_off, + const int ih_off, + const int ow_str, + const int oh_str, + const int ow_off, + const int oh_off, + const int iw, + const int ih, + const int ic, + const int it, + const int ow, + const int oh, + const int oc, + const int ot, + const int offset_in, + const int offset_out, + __global T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + const int idt = idz % it; + const int idc = idz / it; + + if (idx >= oh || idy >= (ow + 3) >> 2 || idt >= ot) { + return; + } + int in_off = (idz * iw_str + (idy << 2) + iw_off) * ih_str + idx + ih_off; + + int out_off = + ((idc << 2) * ot + idt * oh_str + idx + oh_off) * ow_str + (idy << 2) + ow_off + offset_out; + char iex = ((idy << 2) + 4 <= iw) ? 4 : (iw & 3); + char oex = ((idy << 2) + 4 <= ow) ? 4 : (ow & 3); + if (idx >= ih || (idy << 2) >= iw || idz >= ((ic + 3) >> 2) * it) { + iex = 0; + } + char oec = ((idc << 2) + 4 <= oc) ? 
4 : (oc & 3); + T4 val[4]; + val[0] = 0; + val[1] = 0; + val[2] = 0; + val[3] = 0; + + if (iex > 0) { + val[0] = vload4(in_off, in + offset_in); + } + if (iex > 1) { + val[1] = vload4(in_off + ih_str, in + offset_in); + } + if (iex > 2) { + val[2] = vload4(in_off + (ih_str << 1), in + offset_in); + } + if (iex > 3) { + val[3] = vload4(in_off + ih_str * 3, in + offset_in); + } + + int owh_str = ow_str * oh_str * ot; + if (oex == 4) { + vstore4((T4)(val[0].x, val[1].x, val[2].x, val[3].x), 0, out + out_off); + if (oec > 1) { + vstore4((T4)(val[0].y, val[1].y, val[2].y, val[3].y), 0, out + out_off + owh_str); + } + if (oec > 2) { + vstore4((T4)(val[0].z, val[1].z, val[2].z, val[3].z), 0, out + out_off + (owh_str << 1)); + } + if (oec > 3) { + vstore4((T4)(val[0].w, val[1].w, val[2].w, val[3].w), 0, out + out_off + owh_str * 3); + } + } else { + if (oex == 1) { + out[out_off] = val[0].x; + if (oec > 1) { + out[out_off + owh_str] = val[0].y; + } + if (oec > 2) { + out[out_off + (owh_str << 1)] = val[0].z; + } + if (oec > 3) { + out[out_off + owh_str * 3] = val[0].w; + } + } + if (oex == 2) { + vstore2((T2)(val[0].x, val[1].x), 0, out + out_off); + if (oec > 1) { + vstore2((T2)(val[0].y, val[1].y), 0, out + out_off + owh_str); + } + if (oec > 2) { + vstore2((T2)(val[0].z, val[1].z), 0, out + out_off + (owh_str << 1)); + } + if (oec > 3) { + vstore2((T2)(val[0].w, val[1].w), 0, out + out_off + owh_str * 3); + } + } + if (oex == 3) { + vstore3((T3)(val[0].x, val[1].x, val[2].x), 0, out + out_off); + if (oec > 1) { + vstore3((T3)(val[0].y, val[1].y, val[2].y), 0, out + out_off + owh_str); + } + if (oec > 2) { + vstore3((T3)(val[0].z, val[1].z, val[2].z), 0, out + out_off + (owh_str << 1)); + } + if (oec > 3) { + vstore3((T3)(val[0].w, val[1].w, val[2].w), 0, out + out_off + owh_str * 3); + } + } + } +} diff --git a/compute/tensor/src/gpu/mali/cl/mem_trans_nchw_to_nchw.cl b/compute/tensor/src/gpu/mali/cl/mem_trans_nchw_to_nchw.cl new file mode 100644 index 00000000..551af4a3 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/mem_trans_nchw_to_nchw.cl @@ -0,0 +1,79 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
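+// Editorial note, a sketch and not upstream text: this kernel copies between
+// two padded NCHW buffers, one 4-element W segment per work-item; ie/oe mask
+// the ragged tail when iw or ow is not a multiple of 4, and reads that fall
+// outside the source extent yield zero. A plausible host-side dispatch,
+// assuming one work-item per (W/4, H, C) cell (names are assumptions):
+//     size_t gs[3] = {(size_t)((ow + 3) / 4), (size_t)oh, (size_t)oc};
+//     clEnqueueNDRangeKernel(queue, kernel, 3, NULL, gs, NULL, 0, NULL, NULL);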
+ +__kernel void mem_trans_nchw_to_nchw(const int iw_str, + const int ih_str, + const int iw_off, + const int ih_off, + const int ow_str, + const int oh_str, + const int ow_off, + const int oh_off, + const int iw, + const int ih, + const int ic, + const int ow, + const int oh, + const int oc, + const int offset_in, + const int offset_out, + const __global T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= ((ow + 3) >> 2) || idy >= oh) { + return; + } + char ie = (((idx << 2) + 4) <= iw) ? 4 : (iw & 3); + char oe = (((idx << 2) + 4) <= ow) ? 4 : (ow & 3); + if (idx >= ((iw + 3) >> 2) || idy >= ih || idz >= ic) { + ie = 0; + } + + T4 val = 0; + const int in_off = (idz * ih_str + idy + ih_off) * iw_str + (idx << 2) + iw_off + offset_in; + if (ie == 4) { + val = vload4(0, in + in_off); + } else { + if (ie == 1) { + val.x = in[in_off]; + } + if (ie == 2) { + T2 tmp = vload2(0, in + in_off); + val.x = tmp.x; + val.y = tmp.y; + } + if (ie == 3) { + T3 tmp = vload3(0, in + in_off); + val.x = tmp.x; + val.y = tmp.y; + val.z = tmp.z; + } + } + const int out_off = (idz * oh_str + idy + oh_off) * ow_str + (idx << 2) + ow_off + offset_out; + if (oe == 4) { + vstore4(val, 0, out + out_off); + } else { + if (oe == 1) { + out[out_off] = val.x; + } + if (oe == 2) { + vstore2(val.xy, 0, out + out_off); + } + if (oe == 3) { + vstore3(val.xyz, 0, out + out_off); + } + } +} diff --git a/compute/tensor/src/gpu/mali/cl/mem_trans_nchw_to_ncwhc4.cl b/compute/tensor/src/gpu/mali/cl/mem_trans_nchw_to_ncwhc4.cl new file mode 100644 index 00000000..b863dfc9 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/mem_trans_nchw_to_ncwhc4.cl @@ -0,0 +1,153 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
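+// Editorial note, a sketch and not upstream text: this one source expands to
+// one of three kernels depending on the build option. -DINPUT_TRAN and
+// -DOUTPUT_TRAN map the 4-unrolled work axis onto H, the default unrolls
+// along W; in every variant the vstore4 packs four consecutive channels into
+// the C4 dimension of the NCWHC4 layout. A hedged host-side selection sketch
+// (variable names are illustrative assumptions):
+//     const char *opt = input_tran ? "-DINPUT_TRAN"
+//                                  : (output_tran ? "-DOUTPUT_TRAN" : "");
+//     /* compile the .cl with opt appended, then request the matching name */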
+ +__kernel void +#if defined(INPUT_TRAN) +mem_trans_nchw_to_ncwhc4_input_tran +#elif defined(OUTPUT_TRAN) +mem_trans_nchw_to_ncwhc4_output_tran +#else +mem_trans_nchw_to_ncwhc4 +#endif + (const int iw_str, + const int ih_str, + const int iw_off, + const int ih_off, + const int ow_str, + const int oh_str, + const int ow_off, + const int oh_off, + const int iw, + const int ih, + const int ic, + const int ow, + const int oh, + const int oc, + const int offset_in, + const int offset_out, + const __global T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + int ocd4 = (oc + 3) >> 2; + const int idc = idz % ocd4; + const int idn = idz / ocd4; + const int iz_off = (idn * ic + (idc << 2)) * iw_str * ih_str; + +#if defined(INPUT_TRAN) + if (idx >= (oh + 3) >> 2 || idy >= ow) { + return; + } + int in_off = iz_off + (idy + iw_off) * ih_str + (idx << 2) + ih_off + offset_in; + int out_off = (idz * ow_str + idy + ow_off) * oh_str + (idx << 2) + oh_off; + char iex = ((idx << 2) + 4 <= ih) ? 4 : (ih & 3); + char oex = ((idx << 2) + 4 <= oh) ? 4 : (oh & 3); + if ((idx << 2) >= ih || idy >= iw || idc >= ((ic + 3) >> 2)) { + iex = 0; + } + int out_str = 1; +#else +#if defined(OUTPUT_TRAN) + if (idx >= (oh + 3) >> 2 || idy >= ow) { + return; + } + int out_off = (idz * ow_str + idy + ow_off) * oh_str + (idx << 2) + oh_off; + int out_str = 1; + char oex = ((idx << 2) + 4 <= oh) ? 4 : (oh & 3); +#else + if (idx >= (ow + 3) >> 2 || idy >= oh) { + return; + } + int out_off = (idz * ow_str + (idx << 2) + ow_off) * oh_str + idy + oh_off; + int out_str = oh_str; + char oex = ((idx << 2) + 4 <= ow) ? 4 : (ow & 3); +#endif + int in_off = iz_off + (idy + ih_off) * iw_str + (idx << 2) + iw_off + offset_in; + char iex = ((idx << 2) + 4 <= iw) ? 4 : (iw & 3); + if ((idx << 2) >= iw || idy >= ih || idc >= ((ic + 3) >> 2)) { + iex = 0; + } +#endif + char iec = ((idc << 2) + 4 <= ic) ? 
4 : (ic & 3); + T4 val[4]; + val[0] = 0; + val[1] = 0; + val[2] = 0; + val[3] = 0; + + int iwh_str = iw_str * ih_str; + if (iex == 4) { + val[0] = vload4(0, in + in_off); + if (iec > 1) { + val[1] = vload4(0, in + in_off + iwh_str); + } + if (iec > 2) { + val[2] = vload4(0, in + in_off + (iwh_str << 1)); + } + if (iec > 3) { + val[3] = vload4(0, in + in_off + iwh_str * 3); + } + } else { + if (iex == 1) { + val[0].x = in[in_off]; + if (iec > 1) { + val[1].x = in[in_off + iwh_str]; + } + if (iec > 2) { + val[2].x = in[in_off + (iwh_str << 1)]; + } + if (iec > 3) { + val[3].x = in[in_off + iwh_str * 3]; + } + } + if (iex == 2) { + val[0].xy = vload2(0, in + in_off); + if (iec > 1) { + val[1].xy = vload2(0, in + in_off + iwh_str); + } + if (iec > 2) { + val[2].xy = vload2(0, in + in_off + (iwh_str << 1)); + } + if (iec > 3) { + val[3].xy = vload2(0, in + in_off + iwh_str * 3); + } + } + if (iex == 3) { + val[0].xyz = vload3(0, in + in_off); + if (iec > 1) { + val[1].xyz = vload3(0, in + in_off + iwh_str); + } + if (iec > 2) { + val[2].xyz = vload3(0, in + in_off + (iwh_str << 1)); + } + if (iec > 3) { + val[3].xyz = vload3(0, in + in_off + iwh_str * 3); + } + } + } + + vstore4((T4)(val[0].x, val[1].x, val[2].x, val[3].x), out_off, out + offset_out); + if (oex > 1) { + vstore4((T4)(val[0].y, val[1].y, val[2].y, val[3].y), out_off + out_str, out + offset_out); + } + if (oex > 2) { + vstore4((T4)(val[0].z, val[1].z, val[2].z, val[3].z), out_off + (out_str << 1), + out + offset_out); + } + if (oex > 3) { + vstore4( + (T4)(val[0].w, val[1].w, val[2].w, val[3].w), out_off + out_str * 3, out + offset_out); + } +} diff --git a/tensor_computing/src/gpu/mali/cl/reshape_nchw_to_mkt.cl b/compute/tensor/src/gpu/mali/cl/mem_trans_nchw_to_ncwhc4_iw_equal_oh.cl similarity index 60% rename from tensor_computing/src/gpu/mali/cl/reshape_nchw_to_mkt.cl rename to compute/tensor/src/gpu/mali/cl/mem_trans_nchw_to_ncwhc4_iw_equal_oh.cl index 302a3762..f1ae3cfc 100644 --- a/tensor_computing/src/gpu/mali/cl/reshape_nchw_to_mkt.cl +++ b/compute/tensor/src/gpu/mali/cl/mem_trans_nchw_to_ncwhc4_iw_equal_oh.cl @@ -11,31 +11,51 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
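+// Editorial note, a sketch and not upstream text: the rewritten kernel in
+// this hunk generalizes the old reshape_nchw_to_mkt. It flattens (ic, ih)
+// into one K axis, reads four K entries per work-item, and recovers the 2-D
+// source coordinates with div/mod: for example, with ih = 3 the linear index
+// idk = 4 maps to iy = 4 % 3 = 1 and iz = 4 / 3 = 1, i.e. row 1 of channel 1.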
- - - -__kernel void reshape_nchw_to_mkt(const int iw_str, ih_str, const int iw_off, const int ih_off, const int ih, const int k, const int oh_str, const int ow_str, const int oh_off, - const int ow_off, const int bx, const int by, __global const T* in, __global T* out) { +__kernel void mem_trans_nchw_to_ncwhc4_iw_equal_oh(const int iw_str, + const int ih_str, + const int iw_off, + const int ih_off, + const int ow_str, + const int oh_str, + const int ow_off, + const int oh_off, + const int ih, + const int ic, + const int bx, + const int by, + __global const T *in, + __global T *out) +{ int idx = get_global_id(0); - int idy = get_global_id(1); - if(idx >= bx || idy >= by) return; + int idy = get_global_id(0); + int idz = get_global_id(1); + if (idx >= bx || idy >= by) { + return; + } T4 val = 0; - int idk = (idy << 2); + int k = ih * ic; + int idk = (idz * by + idy) << 2; int ix = idx; int4 iy; int4 iz; - iy.s0 = idk % ih; + iy.s0 = idk % ih; iy.s1 = (idk + 1) % ih; iy.s2 = (idk + 2) % ih; iy.s3 = (idk + 3) % ih; - iz.s0 = idk / ih; + iz.s0 = idk / ih; iz.s1 = (idk + 1) / ih; iz.s2 = (idk + 2) / ih; iz.s3 = (idk + 3) / ih; - val.x = in[(iz.s0 * ih_str + iy.s0 + ih_off) * iw_str + ix + iw_off]; - if(idk + 1 < k) val.y = in[(iz.s1 * ih_str + iy.s1 + ih_off) * iw_str + ix + iw_off]; - if(idk + 2 < k) val.z = in[(iz.s2 * ih_str + iy.s2 + ih_off) * iw_str + ix + iw_off]; - if(idk + 3 < k) val.w = in[(iz.s3 * ih_str + iy.s3 + ih_off) * iw_str + ix + iw_off]; + val.x = in[(iz.s0 * ih_str + iy.s0 + ih_off) * iw_str + ix + iw_off]; + if (idk + 1 < k) { + val.y = in[(iz.s1 * ih_str + iy.s1 + ih_off) * iw_str + ix + iw_off]; + } + if (idk + 2 < k) { + val.z = in[(iz.s2 * ih_str + iy.s2 + ih_off) * iw_str + ix + iw_off]; + } + if (idk + 3 < k) { + val.w = in[(iz.s3 * ih_str + iy.s3 + ih_off) * iw_str + ix + iw_off]; + } const int out_off = (idy * ow_str + ow_off) * oh_str + idx + oh_off; vstore4(val, out_off, out); } diff --git a/tensor_computing/src/gpu/mali/cl/mem_trans_ncwhc4_to_mtk.cl b/compute/tensor/src/gpu/mali/cl/mem_trans_ncwhc4_to_mtk.cl similarity index 71% rename from tensor_computing/src/gpu/mali/cl/mem_trans_ncwhc4_to_mtk.cl rename to compute/tensor/src/gpu/mali/cl/mem_trans_ncwhc4_to_mtk.cl index 91a51f20..2d0939cc 100644 --- a/tensor_computing/src/gpu/mali/cl/mem_trans_ncwhc4_to_mtk.cl +++ b/compute/tensor/src/gpu/mali/cl/mem_trans_ncwhc4_to_mtk.cl @@ -11,25 +11,37 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - -__kernel void mem_trans_ncwhc4_to_mtk(const int ih_str, const int iw_str, const int ih_off, const int iw_off, const int k, const int offset, const int bx, const int by, - __global T* in, __global T* out){ +__kernel void mem_trans_ncwhc4_to_mtk(const int ih_str, + const int iw_str, + const int ih_off, + const int iw_off, + const int k, + const int offset, + const int bx, + const int by, + __global T *in, + __global T *out) +{ const int idx = get_global_id(0); const int idy = get_global_id(1); - if(idx >= bx || idy >= by) return; + if (idx >= bx || idy >= by) { + return; + } uchar ek = ((idy << 2) + 4 <= k) ? 
4 : (k & 3); const int in_off = (idy * iw_str + iw_off) * ih_str + idx + ih_off; T4 val = vload4(in_off, in); const int out_off = idx * k + (idy << 2) + offset; - if(ek == 4) { + if (ek == 4) { vstore4(val, 0, out + out_off); } else { - if(ek == 1) out[out_off] = val.x; - if(ek == 2) vstore2((T2)(val.x, val.y), 0, out + out_off); - if(ek == 3) vstore3((T3)(val.x, val.y, val.z), 0, out + out_off); + if (ek == 1) { + out[out_off] = val.x; + } + if (ek == 2) { + vstore2((T2)(val.x, val.y), 0, out + out_off); + } + if (ek == 3) { + vstore3((T3)(val.x, val.y, val.z), 0, out + out_off); + } } } - diff --git a/compute/tensor/src/gpu/mali/cl/mem_trans_ncwhc4_to_nchw.cl b/compute/tensor/src/gpu/mali/cl/mem_trans_ncwhc4_to_nchw.cl new file mode 100644 index 00000000..5207e701 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/mem_trans_ncwhc4_to_nchw.cl @@ -0,0 +1,137 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +__kernel void +#if defined(OUTPUT_TRAN) +mem_trans_ncwhc4_to_nchw_output_tran +#else +mem_trans_ncwhc4_to_nchw +#endif + (const int iw_str, + const int ih_str, + const int iw_off, + const int ih_off, + const int ow_str, + const int oh_str, + const int ow_off, + const int oh_off, + const int iw, + const int ih, + const int ic, + const int ow, + const int oh, + const int oc, + const int offset_in, + const int offset_out, + __global T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); +#if defined(OUTPUT_TRAN) + if (idx >= (ow + 3) >> 2 || idy >= oh) { + return; + } + int in_off = (idz * iw_str + idy + iw_off) * ih_str + (idx << 2) + ih_off; + int out_off = ((idz << 2) * oh_str + idy + oh_off) * ow_str + (idx << 2) + ow_off + offset_out; + char iex = ((idx << 2) + 4 <= ih) ? 4 : (ih & 3); + char oex = ((idx << 2) + 4 <= ow) ? 4 : (ow & 3); + if ((idx << 2) >= ih || idy >= iw || idz >= (ic + 3) >> 2) { + iex = 0; + } + const int in_str = 1; +#else + if (idx >= oh || idy >= (ow + 3) >> 2) { + return; + } + int in_off = (idz * iw_str + (idy << 2) + iw_off) * ih_str + idx + ih_off; + int out_off = ((idz << 2) * oh_str + idx + oh_off) * ow_str + (idy << 2) + ow_off + offset_out; + char iex = ((idy << 2) + 4 <= iw) ? 4 : (iw & 3); + char oex = ((idy << 2) + 4 <= ow) ? 4 : (ow & 3); + if (idx >= ih || (idy << 2) >= iw || idz >= (ic + 3) >> 2) { + iex = 0; + } + const int in_str = ih_str; +#endif + short oec = ((idz << 2) + 4 <= oc) ? 
4 : (oc & 3); + T4 val[4]; + val[0] = 0; + val[1] = 0; + val[2] = 0; + val[3] = 0; + + if (iex > 0) { + val[0] = vload4(in_off, in + offset_in); + } + if (iex > 1) { + val[1] = vload4(in_off + in_str, in + offset_in); + } + if (iex > 2) { + val[2] = vload4(in_off + (in_str << 1), in + offset_in); + } + if (iex > 3) { + val[3] = vload4(in_off + in_str * 3, in + offset_in); + } + + int owh_str = ow_str * oh_str; + if (oex == 4) { + vstore4((T4)(val[0].x, val[1].x, val[2].x, val[3].x), 0, out + out_off); + if (oec > 1) { + vstore4((T4)(val[0].y, val[1].y, val[2].y, val[3].y), 0, out + out_off + owh_str); + } + if (oec > 2) { + vstore4((T4)(val[0].z, val[1].z, val[2].z, val[3].z), 0, out + out_off + (owh_str << 1)); + } + if (oec > 3) { + vstore4((T4)(val[0].w, val[1].w, val[2].w, val[3].w), 0, out + out_off + owh_str * 3); + } + } else { + if (oex == 1) { + out[out_off] = val[0].x; + if (oec > 1) { + out[out_off + owh_str] = val[0].y; + } + if (oec > 2) { + out[out_off + (owh_str << 1)] = val[0].z; + } + if (oec > 3) { + out[out_off + owh_str * 3] = val[0].w; + } + } + if (oex == 2) { + vstore2((T2)(val[0].x, val[1].x), 0, out + out_off); + if (oec > 1) { + vstore2((T2)(val[0].y, val[1].y), 0, out + out_off + owh_str); + } + if (oec > 2) { + vstore2((T2)(val[0].z, val[1].z), 0, out + out_off + (owh_str << 1)); + } + if (oec > 3) { + vstore2((T2)(val[0].w, val[1].w), 0, out + out_off + owh_str * 3); + } + } + if (oex == 3) { + vstore3((T3)(val[0].x, val[1].x, val[2].x), 0, out + out_off); + if (oec > 1) { + vstore3((T3)(val[0].y, val[1].y, val[2].y), 0, out + out_off + owh_str); + } + if (oec > 2) { + vstore3((T3)(val[0].z, val[1].z, val[2].z), 0, out + out_off + (owh_str << 1)); + } + if (oec > 3) { + vstore3((T3)(val[0].w, val[1].w, val[2].w), 0, out + out_off + owh_str * 3); + } + } + } +} diff --git a/tensor_computing/src/gpu/mali/cl/reshape_mkt_to_nchw.cl b/compute/tensor/src/gpu/mali/cl/mem_trans_ncwhc4_to_nchw_ih_equal_ow.cl similarity index 61% rename from tensor_computing/src/gpu/mali/cl/reshape_mkt_to_nchw.cl rename to compute/tensor/src/gpu/mali/cl/mem_trans_ncwhc4_to_nchw_ih_equal_ow.cl index 83c1c8af..d15f2fcf 100644 --- a/tensor_computing/src/gpu/mali/cl/reshape_mkt_to_nchw.cl +++ b/compute/tensor/src/gpu/mali/cl/mem_trans_ncwhc4_to_nchw_ih_equal_ow.cl @@ -11,32 +11,51 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
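+// Editorial note, a sketch and not upstream text: the kernel in this hunk is
+// the inverse of the iw_equal_oh transform. Each work-item loads one C4
+// vector and scatters its four channels back to NCHW rows via oy = idk % oh
+// and oz = idk / oh; the oz < oc guards added here keep the last partial
+// channel block from writing past the tensor when oc % 4 != 0.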
- - - -__kernel void reshape_mkt_to_nchw(const int ih_str, const int iw_str, const int ih_off, const int iw_off, const int ow_str, const int oh_str, - const int ow_off, const int oh_off, const int oh, const int bx, const int by, __global const T* in, __global T* out) { +__kernel void mem_trans_ncwhc4_to_nchw_ih_equal_ow(const int iw_str, + const int ih_str, + const int iw_off, + const int ih_off, + const int ow_str, + const int oh_str, + const int ow_off, + const int oh_off, + const int oh, + const int oc, + const int bx, + const int by, + __global const T *in, + __global T *out) +{ int idx = get_global_id(0); int idy = get_global_id(1); - if(idx >= bx || idy >= by) return; + int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } T4 val; - const int in_off = (idy * iw_str + iw_off) * ih_str + idx + ih_off; + const int in_off = (idz * iw_str + idy + iw_off) * ih_str + idx + ih_off; val = vload4(in_off, in); - int idk = (idy << 2); - int ox = idx; + int idk = (idz * by + idy) << 2; + int ox = idx; int4 oy; int4 oz; - oy.s0 = idk % oh; + oy.s0 = idk % oh; oy.s1 = (idk + 1) % oh; oy.s2 = (idk + 2) % oh; oy.s3 = (idk + 3) % oh; - oz.s0 = idk / oh; + oz.s0 = idk / oh; oz.s1 = (idk + 1) / oh; oz.s2 = (idk + 2) / oh; oz.s3 = (idk + 3) / oh; out[(oz.s0 * oh_str + oy.s0 + oh_off) * ow_str + ox + ow_off] = val.x; - out[(oz.s1 * oh_str + oy.s1 + oh_off) * ow_str + ox + ow_off] = val.y; - out[(oz.s2 * oh_str + oy.s2 + oh_off) * ow_str + ox + ow_off] = val.z; - out[(oz.s3 * oh_str + oy.s3 + oh_off) * ow_str + ox + ow_off] = val.w; + if (oz.s1 < oc) { + out[(oz.s1 * oh_str + oy.s1 + oh_off) * ow_str + ox + ow_off] = val.y; + } + if (oz.s2 < oc) { + out[(oz.s2 * oh_str + oy.s2 + oh_off) * ow_str + ox + ow_off] = val.z; + } + if (oz.s3 < oc) { + out[(oz.s3 * oh_str + oy.s3 + oh_off) * ow_str + ox + ow_off] = val.w; + } } diff --git a/compute/tensor/src/gpu/mali/cl/mem_trans_ncwhc4_to_ncwhc4.cl b/compute/tensor/src/gpu/mali/cl/mem_trans_ncwhc4_to_ncwhc4.cl new file mode 100644 index 00000000..a24987ca --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/mem_trans_ncwhc4_to_ncwhc4.cl @@ -0,0 +1,61 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
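+// Editorial note, a sketch and not upstream text: a padded-buffer copy within
+// the NCWHC4 layout, swapping the H/W walk when built with -DOUTPUT_TRAN;
+// reads outside the source extent produce zero, so destination padding is
+// cleared in the same pass. A plausible dispatch for the default variant:
+//     size_t gs[3] = {(size_t)oh, (size_t)ow, (size_t)((oc + 3) / 4)};
+//     clEnqueueNDRangeKernel(queue, kernel, 3, NULL, gs, NULL, 0, NULL, NULL);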
+ +__kernel void +#if defined(OUTPUT_TRAN) +mem_trans_ncwhc4_to_ncwhc4_output_tran +#else +mem_trans_ncwhc4_to_ncwhc4 +#endif + (const int iw_str, + const int ih_str, + const int iw_off, + const int ih_off, + const int ow_str, + const int oh_str, + const int ow_off, + const int oh_off, + const int iw, + const int ih, + const int ic, + const int ow, + const int oh, + const int oc, + const int offset_in, + const int offset_out, + __global T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + +#if defined(OUTPUT_TRAN) + if (idx >= ow || idy >= oh) { + return; + } + const int out_off = (idz * ow_str + idx + ow_off) * oh_str + idy + oh_off; +#else + if (idx >= oh || idy >= ow) { + return; + } + const int out_off = (idz * ow_str + idy + ow_off) * oh_str + idx + oh_off; +#endif + + T4 val = 0; + const int in_off = (idz * iw_str + idy + iw_off) * ih_str + idx + ih_off; + if (idx < ih && idy < iw && idz < ((ic + 3) >> 2)) { + val = vload4(in_off, in + offset_in); + } + vstore4(val, out_off, out + offset_out); +} diff --git a/tensor_computing/src/gpu/mali/cl/normalization.cl b/compute/tensor/src/gpu/mali/cl/normalization.cl similarity index 72% rename from tensor_computing/src/gpu/mali/cl/normalization.cl rename to compute/tensor/src/gpu/mali/cl/normalization.cl index e7b6eb46..6dbe61e2 100644 --- a/tensor_computing/src/gpu/mali/cl/normalization.cl +++ b/compute/tensor/src/gpu/mali/cl/normalization.cl @@ -11,21 +11,35 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - -__kernel void normalization(const int len, const int on, const int ih_str, const int ic_str, const int ih_off, const int iw_off, const int oh_str, const int oh_off, const int ow_off, - __global const T* alpha, __global const T* beta, __global const T* in, __global T* out) { +#if defined(USE_C1) +__kernel void normalization_c1 +#else +__kernel void normalization +#endif + (const int len, + const int ih_str, + const int ic_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int oh_off, + const int ow_off, + const float para, + __global const T *alpha, + __global const T *beta, + __global const T *in, + __global T *out) +{ int idx = get_global_id(0); - if(idx >= len) return; + if (idx >= len) { + return; + } float mean = 0; - float var = 0; - float para = 1.0 / on; - + float var = 0; + int in_off = iw_off * ih_str + idx + ih_off; - for(int i = 0; i < ic_str; ++i) { + for (int i = 0; i < ic_str; ++i) { T4 tmp = vload4(in_off + i * ih_str, in); float4 tmpf; tmpf.x = tmp.x; @@ -36,7 +50,7 @@ __kernel void normalization(const int len, const int on, const int ih_str, const } mean = mean * para; - for(int i = 0; i < ic_str; ++i) { + for (int i = 0; i < ic_str; ++i) { T4 tmp = vload4(in_off + i * ih_str, in); float4 tmpf; tmpf.x = tmp.x; @@ -54,14 +68,22 @@ __kernel void normalization(const int len, const int on, const int ih_str, const float std_val = sqrt(var + 1e-6); std_val = 1.0 / std_val; int out_off = ow_off * oh_str + idx + oh_off; - for(int i = 0; i < ic_str; ++i) { + for (int i = 0; i < ic_str; ++i) { T4 out_val = vload4(in_off + i * ih_str, in); - T4 alp = vload4(i, alpha); - T4 bet = vload4(i, beta); + T4 alp = vload4(i, alpha); + T4 bet = vload4(i, beta); out_val.x = alp.x * (out_val.x - mean) * std_val + bet.x; out_val.y = alp.y * 
(out_val.y - mean) * std_val + bet.y; out_val.z = alp.z * (out_val.z - mean) * std_val + bet.z; out_val.w = alp.w * (out_val.w - mean) * std_val + bet.w; +#if (USE_C1) + out[out_off] = out_val.x; + out[out_off + oh_str] = out_val.y; + out[out_off + oh_str * 2] = out_val.z; + out[out_off + oh_str * 3] = out_val.w; + out_off += (oh_str << 2); +#else vstore4(out_val, out_off + i * oh_str, out); +#endif } } diff --git a/compute/tensor/src/gpu/mali/cl/padding_constant.cl b/compute/tensor/src/gpu/mali/cl/padding_constant.cl new file mode 100644 index 00000000..58ba79f5 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/padding_constant.cl @@ -0,0 +1,51 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +__kernel void padding_constant(const int ih, + const int iw, + const int ih_str, + const int iw_str, + const int ih_off, + const int iw_off, + const int oh, + const int ow, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const int ph, + const int pb, + const int pw, + const int pr, + __global const T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= oh || idy >= ow) { + return; + } + if (idx < ph || idx >= ph + ih) { + return; + } + if (idy < pw || idy >= pw + iw) { + return; + } + + int in_off = (idz * iw_str + idy - pw + iw_off) * ih_str + ih_off + idx - ph; + int out_off = (idz * ow_str + idy + ow_off) * oh_str + oh_off + idx; + T4 val; + val = vload4(in_off, in); + vstore4(val, out_off, out); +} diff --git a/compute/tensor/src/gpu/mali/cl/padding_edge.cl b/compute/tensor/src/gpu/mali/cl/padding_edge.cl new file mode 100644 index 00000000..b6c1f428 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/padding_edge.cl @@ -0,0 +1,72 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +__kernel void padding_edge(const int ih, + const int iw, + const int ih_str, + const int iw_str, + const int ih_off, + const int iw_off, + const int oh, + const int ow, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const int ph, + const int pb, + const int pw, + const int pr, + __global const T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= oh || idy >= ow) { + return; + } + + int in_off = idz * iw_str * ih_str; + if (idx < ph) { + if (idy < pw) { + in_off = in_off + iw_off * ih_str + ih_off; + } else if (idy >= pw + iw) { + in_off = in_off + (iw_off + iw - 1) * ih_str + ih_off; + } else { + in_off = in_off + (idy + iw_off - pw) * ih_str + ih_off; + } + } else if (idx >= ph + ih) { + in_off = in_off + iw_off * ih_str + ih_off + ih - 1; + if (idy < pw) { + in_off = in_off; + } else if (idy >= pw + iw) { + in_off = in_off + (iw - 1) * ih_str; + } else { + in_off = in_off + (idy - pw) * ih_str; + } + } else { + in_off = in_off + iw_off * ih_str + ih_off; + if (idy < pw) { + in_off = in_off + idx - ph; + } else if (idy >= pw + iw) { + in_off = in_off + idx - ph + (iw - 1) * ih_str; + } else { + in_off = in_off + (idy - pw) * ih_str + idx - ph; + } + } + T4 val; + val = vload4(in_off, in); + int out_off = (idz * ow_str + idy + ow_off) * oh_str + oh_off + idx; + vstore4(val, out_off, out); +} diff --git a/tensor_computing/src/gpu/mali/cl/padding_input_gclmem.cl b/compute/tensor/src/gpu/mali/cl/padding_input_gclmem.cl similarity index 61% rename from tensor_computing/src/gpu/mali/cl/padding_input_gclmem.cl rename to compute/tensor/src/gpu/mali/cl/padding_input_gclmem.cl index b839466c..287b8acf 100644 --- a/tensor_computing/src/gpu/mali/cl/padding_input_gclmem.cl +++ b/compute/tensor/src/gpu/mali/cl/padding_input_gclmem.cl @@ -11,41 +11,56 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
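+// Editorial note, a sketch and not upstream text: the reformatted kernel in
+// this hunk zero-pads a plain NCHW input into a (pw, ph)-padded buffer, four
+// W elements per work-item, with per-element guards on both sides: source
+// positions that fall inside the pad read as 0, and the tail write is masked
+// when ow is not a multiple of 4.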
- - - - -__kernel void padding_input_gclmem(const int iw, const int ih, const int pw, const int ph, - const int ow, const int oh, const __global const T* in, __global T* out) { - +__kernel void padding_input_gclmem(const int iw, + const int ih, + const int pw, + const int ph, + const int ow, + const int oh, + const __global const T *in, + __global T *out) +{ int idx = get_global_id(0) << 2; int idy = get_global_id(1); int idz = get_global_id(2); - if(idx >= ow || idy >= oh) return; + if (idx >= ow || idy >= oh) { + return; + } int in_y = idy - ph; int be_x = idx - pw; int en_x = be_x + 4; T4 val = 0; - if(in_y >= 0 && in_y < ih) { - int in_off = (idz * ih + in_y) * iw; - if(be_x >= 0 && en_x < iw) { + if (in_y >= 0 && in_y < ih) { + int in_off = (idz * ih + in_y) * iw; + if (be_x >= 0 && en_x < iw) { val = vload4(0, in + in_off + be_x); } else { - if(be_x >= 0 && be_x < iw) val.x = in[in_off + be_x]; - if(be_x + 1 >= 0 && be_x + 1 < iw) val.y = in[in_off + be_x + 1]; - if(be_x + 2 >= 0 && be_x + 2 < iw) val.z = in[in_off + be_x + 2]; - if(be_x + 3 >= 0 && be_x + 3 < iw) val.w = in[in_off + be_x + 3]; + if (be_x >= 0 && be_x < iw) { + val.x = in[in_off + be_x]; + } + if (be_x + 1 >= 0 && be_x + 1 < iw) { + val.y = in[in_off + be_x + 1]; + } + if (be_x + 2 >= 0 && be_x + 2 < iw) { + val.z = in[in_off + be_x + 2]; + } + if (be_x + 3 >= 0 && be_x + 3 < iw) { + val.w = in[in_off + be_x + 3]; + } } } int out_off = (idz * oh + idy) * ow + idx; - if(idx + 3 >= ow) { + if (idx + 3 >= ow) { out[out_off] = val.x; - if(idx + 1 < ow) out[out_off + 1] = val.y; - if(idx + 2 < ow) out[out_off + 2] = val.z; + if (idx + 1 < ow) { + out[out_off + 1] = val.y; + } + if (idx + 2 < ow) { + out[out_off + 2] = val.z; + } } else { vstore4(val, 0, out + out_off); } } - diff --git a/compute/tensor/src/gpu/mali/cl/padding_reflect.cl b/compute/tensor/src/gpu/mali/cl/padding_reflect.cl new file mode 100644 index 00000000..984c82d8 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/padding_reflect.cl @@ -0,0 +1,75 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
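+// padding_reflect: mirror padding along h and w. Each padded output element
+// is mapped back to a mirrored source index inside the valid input range;
+// the border element itself is not repeated (pad distance d reads the input
+// element d steps in from the edge), which is what separates this mode from
+// padding_symmetric below.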
+ +__kernel void padding_reflect(const int ih, + const int iw, + const int ih_str, + const int iw_str, + const int ih_off, + const int iw_off, + const int oh, + const int ow, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const int ph, + const int pb, + const int pw, + const int pr, + __global const T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= oh || idy >= ow) { + return; + } + + int in_off = idz * iw_str * ih_str; + if (idx < ph) { + in_off = in_off + iw_off * ih_str + ih_off; + if (idy < pw) { + in_off = in_off + (pw - idy) * ih_str + ph - idx; + } else if (idy >= pw + iw) { + in_off = in_off + (iw - 1) * ih_str; + in_off = in_off - (idy + 1 - pw - iw) * ih_str + ph - idx; + } else { + in_off = in_off + (idy - pw) * ih_str + ph - idx; + } + } else if (idx >= ph + ih) { + in_off = in_off + iw_off * ih_str + ih_off + ih - 1; + if (idy + ow_off < pw) { + in_off = in_off + (pw - idy) * ih_str - (idx + 1 - ph - ih); + } else if (idy >= pw + iw) { + in_off = in_off + (iw - 1) * ih_str; + in_off = in_off - (idy + 1 - pw - iw) * ih_str - (idx + 1 - ph - ih); + } else { + in_off = in_off + (idy - pw) * ih_str - (idx + 1 - ih - ph); + } + } else { + in_off = in_off + iw_off * ih_str + ih_off; + if (idy < pw) { + in_off = in_off + (pw - idy) * ih_str + idx - ph; + } else if (idy >= pw + iw) { + in_off = in_off + (iw - 1) * ih_str - (idy + 1 - iw - pw) * ih_str + idx - ph; + } else { + in_off = in_off + (idy - pw) * ih_str + idx - ph; + } + } + T4 val; + val = vload4(in_off, in); + int out_off = (idz * ow_str + idy + ow_off) * oh_str + oh_off + idx; + vstore4(val, out_off, out); +} diff --git a/compute/tensor/src/gpu/mali/cl/padding_symmetric.cl b/compute/tensor/src/gpu/mali/cl/padding_symmetric.cl new file mode 100644 index 00000000..3dd5ebcc --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/padding_symmetric.cl @@ -0,0 +1,75 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +__kernel void padding_symmetric(const int ih, + const int iw, + const int ih_str, + const int iw_str, + const int ih_off, + const int iw_off, + const int oh, + const int ow, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const int ph, + const int pb, + const int pw, + const int pr, + __global const T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + if (idx >= oh || idy >= ow) { + return; + } + + int in_off = idz * iw_str * ih_str; + if (idx < ph) { + in_off = in_off + iw_off * ih_str + ih_off; + if (idy < pw) { + in_off = in_off + (pw - 1 - idy) * ih_str + ph - 1 - idx; + } else if (idy >= pw + iw) { + in_off = in_off + (iw - 1) * ih_str; + in_off = in_off - (idy - pw - iw) * ih_str + ph - 1 - idx; + } else { + in_off = in_off + (idy - pw) * ih_str + ph - 1 - idx; + } + } else if (idx >= ph + ih) { + in_off = in_off + iw_off * ih_str + ih_off + ih - 1; + if (idy + ow_off < pw) { + in_off = in_off + (pw - 1 - idy) * ih_str - (idx - ph - ih); + } else if (idy >= pw + iw) { + in_off = in_off + (iw - 1) * ih_str; + in_off = in_off - (idy - pw - iw) * ih_str - (idx - ph - ih); + } else { + in_off = in_off + (idy - pw) * ih_str - (idx - ih - ph); + } + } else { + in_off = in_off + iw_off * ih_str + ih_off; + if (idy < pw) { + in_off = in_off + (pw - 1 - idy) * ih_str + idx - ph; + } else if (idy >= pw + iw) { + in_off = in_off + (iw - 1) * ih_str - (idy - iw - pw) * ih_str + idx - ph; + } else { + in_off = in_off + (idy - pw) * ih_str + idx - ph; + } + } + T4 val; + val = vload4(in_off, in); + int out_off = (idz * ow_str + idy + ow_off) * oh_str + oh_off + idx; + vstore4(val, out_off, out); +} diff --git a/compute/tensor/src/gpu/mali/cl/pooling_global_mean_h.cl b/compute/tensor/src/gpu/mali/cl/pooling_global_mean_h.cl new file mode 100644 index 00000000..637c0513 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/pooling_global_mean_h.cl @@ -0,0 +1,46 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#define sumvec4(x, y) \ + { \ + x.s0 += (float)y.s0; \ + x.s1 += (float)y.s1; \ + x.s2 += (float)y.s2; \ + x.s3 += (float)y.s3; \ + } + +__kernel void pooling_global_mean_h(const int ih, + const int oh_str, + const int ohw_str, + const int oh_off, + const int ow_off, + const int bx, + __global const T *in, + __global T *out) +{ + const int idx = get_global_id(0); + if (idx >= bx) { + return; + } + const int in_off = idx * ih; + + T4 val; + float4 sum = 0; + for (int i = 0; i < ih; ++i) { + val = vload4(in_off + i, in); + sumvec4(sum, val); + } + sum = sum / ((float)(ih)); + int out_off = idx * ohw_str + ow_off * oh_str + oh_off; + vstore4((T4)(sum.x, sum.y, sum.z, sum.w), out_off, out); +} diff --git a/compute/tensor/src/gpu/mali/cl/pooling_global_mean_w.cl b/compute/tensor/src/gpu/mali/cl/pooling_global_mean_w.cl new file mode 100644 index 00000000..08af8310 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/pooling_global_mean_w.cl @@ -0,0 +1,50 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#define sumvec4(x, y) \ + { \ + x.s0 += (float)y.s0; \ + x.s1 += (float)y.s1; \ + x.s2 += (float)y.s2; \ + x.s3 += (float)y.s3; \ + } + +__kernel void pooling_global_mean_w(const int ih_str, + const int ihw_str, + const int ih_off, + const int iw_off, + const int ih, + const int iw, + const int bx, + const int by, + __global const T *in, + __global T *out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + if (idx >= bx || idy >= by) { + return; + } + + int in_off = idy * ihw_str + iw_off * ih_str + idx + ih_off; + + T4 val; + float4 sum = 0; + for (int i = 0; i < iw; ++i) { + val = vload4(in_off + ih_str * i, in); + sumvec4(sum, val); + } + sum = sum / (float)(iw); + int out_off = (idy * ih) + idx; + vstore4((T4)(sum.x, sum.y, sum.z, sum.w), out_off, out); +} diff --git a/tensor_computing/src/gpu/mali/cl/pooling_max.cl b/compute/tensor/src/gpu/mali/cl/pooling_max.cl similarity index 65% rename from tensor_computing/src/gpu/mali/cl/pooling_max.cl rename to compute/tensor/src/gpu/mali/cl/pooling_max.cl index 7e8be2ae..3abb101f 100644 --- a/tensor_computing/src/gpu/mali/cl/pooling_max.cl +++ b/compute/tensor/src/gpu/mali/cl/pooling_max.cl @@ -11,45 +11,61 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+#define maxvec4(x, y) \ + { \ + x.s0 = (x.s0 > y.s0) ? x.s0 : y.s0; \ + x.s1 = (x.s1 > y.s1) ? x.s1 : y.s1; \ + x.s2 = (x.s2 > y.s2) ? x.s2 : y.s2; \ + x.s3 = (x.s3 > y.s3) ? x.s3 : y.s3; \ + } - - - -#define maxvec4(x , y){\ - x.s0 = (x.s0 > y.s0) ? x.s0 : y.s0;\ - x.s1 = (x.s1 > y.s1) ? x.s1 : y.s1;\ - x.s2 = (x.s2 > y.s2) ? x.s2 : y.s2;\ - x.s3 = (x.s3 > y.s3) ? x.s3 : y.s3;\ -} - -__kernel void pooling_max(const int ih, const int iw, const int ih_off, const int iw_off, const int ih_str, const int iw_str, - const int oh, const int ow, const int oh_off, const int ow_off, const int oh_str, const int ow_str, - const int sh, const int sw, const int ph, const int pw, const int kh, const int kw, - __global const T* in, __global T* out) { +__kernel void pooling_max(const int ih, + const int iw, + const int ih_off, + const int iw_off, + const int ih_str, + const int iw_str, + const int oh, + const int ow, + const int oh_off, + const int ow_off, + const int oh_str, + const int ow_str, + const int sh, + const int sw, + const int ph, + const int pw, + const int kh, + const int kw, + __global const T *in, + __global T *out) +{ const int idx = get_global_id(0); const int idy = get_global_id(1); const int idz = get_global_id(2); - if(idx >= oh || idy >= ow) return; - + if (idx >= oh || idy >= ow) { + return; + } + int bh = idx * sh - ph; int bw = idy * sw - pw; int eh = bh + kh; int ew = bw + kw; - bh = (bh < 0) ? 0 : bh; - bw = (bw < 0) ? 0 : bw; + bh = (bh < 0) ? 0 : bh; + bw = (bw < 0) ? 0 : bw; eh = (eh < ih) ? eh : ih; ew = (ew < iw) ? ew : iw; - + bh += ih_off; bw += iw_off; eh += ih_off; ew += iw_off; int in_off = (idz * iw_str + bw) * ih_str; - T4 val = -FLT_MAX; + T4 val = -FLT_MAX; T4 maxval = -FLT_MAX; - for(int i = bw; i < ew; ++i){ - for(int j = bh; j < eh; ++j){ + for (int i = bw; i < ew; ++i) { + for (int j = bh; j < eh; ++j) { val = vload4(in_off + j, in); maxvec4(maxval, val); } diff --git a/tensor_computing/src/gpu/mali/cl/pooling_mean.cl b/compute/tensor/src/gpu/mali/cl/pooling_mean.cl similarity index 66% rename from tensor_computing/src/gpu/mali/cl/pooling_mean.cl rename to compute/tensor/src/gpu/mali/cl/pooling_mean.cl index e2aab879..6e040e56 100644 --- a/tensor_computing/src/gpu/mali/cl/pooling_mean.cl +++ b/compute/tensor/src/gpu/mali/cl/pooling_mean.cl @@ -11,36 +11,52 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+#define sumvec4(x, y) \ + { \ + x.s0 += (float)y.s0; \ + x.s1 += (float)y.s1; \ + x.s2 += (float)y.s2; \ + x.s3 += (float)y.s3; \ + } - - - -#define sumvec4(x, y){\ - x.s0 += (float)y.s0;\ - x.s1 += (float)y.s1;\ - x.s2 += (float)y.s2;\ - x.s3 += (float)y.s3;\ -} - -__kernel void pooling_mean(const int ih, const int iw, const int ih_off, const int iw_off, const int ih_str, const int iw_str, - const int oh, const int ow, const int oh_off, const int ow_off, const int oh_str, const int ow_str, - const int sh, const int sw, const int ph, const int pw, const int kh, const int kw, - __global const T* in, __global T* out){ +__kernel void pooling_mean(const int ih, + const int iw, + const int ih_off, + const int iw_off, + const int ih_str, + const int iw_str, + const int oh, + const int ow, + const int oh_off, + const int ow_off, + const int oh_str, + const int ow_str, + const int sh, + const int sw, + const int ph, + const int pw, + const int kh, + const int kw, + __global const T *in, + __global T *out) +{ const int idx = get_global_id(0); const int idy = get_global_id(1); const int idz = get_global_id(2); - if(idx >= oh || idy >= ow) return; - + if (idx >= oh || idy >= ow) { + return; + } + int bh = idx * sh - ph; int bw = idy * sw - pw; int eh = bh + kh; int ew = bw + kw; - bh = (bh < 0) ? 0 : bh; - bw = (bw < 0) ? 0 : bw; + bh = (bh < 0) ? 0 : bh; + bw = (bw < 0) ? 0 : bw; eh = (eh < ih) ? eh : ih; ew = (ew < iw) ? ew : iw; float psize = (eh - bh) * (ew - bw); - + bh += ih_off; bw += iw_off; eh += ih_off; @@ -49,14 +65,14 @@ __kernel void pooling_mean(const int ih, const int iw, const int ih_off, const i T4 val; float4 sum = 0; - for(int i = bw; i< ew; ++i){ - for(int j = bh; j < eh; ++j){ + for (int i = bw; i < ew; ++i) { + for (int j = bh; j < eh; ++j) { val = vload4(in_off + j, in); sumvec4(sum, val); } in_off += ih_str; } - sum = sum / psize; + sum = sum / psize; int out_off = (idz * ow_str + ow_off + idy) * oh_str + oh_off + idx; - vstore4((T4)(sum.x, sum.y, sum.z , sum.w), out_off, out); + vstore4((T4)(sum.x, sum.y, sum.z, sum.w), out_off, out); } diff --git a/compute/tensor/src/gpu/mali/cl/power.cl b/compute/tensor/src/gpu/mali/cl/power.cl new file mode 100644 index 00000000..d09c718d --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/power.cl @@ -0,0 +1,93 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#define MANGLE_NAME_IMPL(base, DT) base##DT +#define MANGLE_NAME(base, DT) MANGLE_NAME_IMPL(base, DT) +__kernel void MANGLE_NAME(power_, DT)(const int ih_str, + const int iw_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const int w, + const int bx, + const int by, + const int has_power, + const float alp, + const float bet, + float power, + __global T *input, + __global T *output) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } + char ew = (((idx << 2) + 4) <= w) ? 4 : (w & 3); + + int in_off = (idz * ih_str + idy + ih_off) * iw_str + (idx << 2) + iw_off; + int out_off = (idz * oh_str + idy + oh_off) * ow_str + (idx << 2) + ow_off; + if (ew == 4) { + T4 val; + val = vload4(0, input + in_off); + val.x = (T)(((float)val.x) * alp + bet); + val.y = (T)(((float)val.y) * alp + bet); + val.z = (T)(((float)val.z) * alp + bet); + val.w = (T)(((float)val.w) * alp + bet); + if (has_power) { + val.x = pow((float)val.x, power); + val.y = pow((float)val.y, power); + val.z = pow((float)val.z, power); + val.w = pow((float)val.w, power); + } + vstore4(val, 0, output + out_off); + } else { + if (ew == 1) { + T val; + val = input[in_off]; + val = ((float)val) * alp + bet; + if (has_power) { + val = pow((float)val, power); + } + output[out_off] = (T)val; + } + if (ew == 2) { + T2 val; + val = vload2(0, input + in_off); + val.x = (T)(((float)val.x) * alp + bet); + val.y = (T)(((float)val.y) * alp + bet); + if (has_power) { + val.x = pow((float)val.x, power); + val.y = pow((float)val.y, power); + } + vstore2(val, 0, output + out_off); + } + if (ew == 3) { + T3 val; + val = vload3(0, input + in_off); + val.x = (T)(((float)val.x) * alp + bet); + val.y = (T)(((float)val.y) * alp + bet); + val.z = (T)(((float)val.z) * alp + bet); + if (has_power) { + val.x = pow((float)val.x, power); + val.y = pow((float)val.y, power); + val.z = pow((float)val.z, power); + } + vstore3(val, 0, output + out_off); + } + } +} diff --git a/compute/tensor/src/gpu/mali/cl/prelu.cl b/compute/tensor/src/gpu/mali/cl/prelu.cl new file mode 100644 index 00000000..7708b0cc --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/prelu.cl @@ -0,0 +1,60 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#define MANGLE_NAME_IMPL(base, MD) base##MD +#define MANGLE_NAME(base, MD) MANGLE_NAME_IMPL(base, MD) + +__kernel void MANGLE_NAME(prelu_, MD)(const int ih, + const int iw, + const int ih_str, + const int iw_str, + const int ih_off, + const int iw_off, + const int oh, + const int ow, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + __global const T *weight, + __global T *input, + __global T *output) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + int idz = get_global_id(2); + if (idx >= oh || idy >= ow) { + return; + } + +#if defined(USE_SAME) + T4 wei = vload4(0, weight); + wei.y = wei.x; + wei.z = wei.x; + wei.w = wei.x; +#else + T4 wei = vload4(idz, weight); +#endif + + T4 val; + int in_off = (idz * iw_str + idy + iw_off) * ih_str + idx + ih_off; + val = vload4(in_off, input); + + val.s0 = val.s0 > 0 ? val.s0 : wei.x * val.s0; + val.s1 = val.s1 > 0 ? val.s1 : wei.y * val.s1; + val.s2 = val.s2 > 0 ? val.s2 : wei.z * val.s2; + val.s3 = val.s3 > 0 ? val.s3 : wei.w * val.s3; + + int out_off = (idz * ow_str + idy + ow_off) * oh_str + idx + oh_off; + vstore4(val, out_off, output); +} diff --git a/tensor_computing/src/gpu/mali/cl/reshape.cl b/compute/tensor/src/gpu/mali/cl/reshape.cl similarity index 79% rename from tensor_computing/src/gpu/mali/cl/reshape.cl rename to compute/tensor/src/gpu/mali/cl/reshape.cl index 2edc4d28..f2ee1395 100644 --- a/tensor_computing/src/gpu/mali/cl/reshape.cl +++ b/compute/tensor/src/gpu/mali/cl/reshape.cl @@ -11,17 +11,28 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - -//TODO -__kernel void reshape(const int h, const int ih_str, const int iw_str, const int ih_off, const int iw_off, const int oh_str, const int ow_str, - const int oh_off, const int ow_off, const int bx, const int by, __global const T* in, __global T* out) { +// TODO +__kernel void reshape(const int h, + const int ih_str, + const int iw_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const int bx, + const int by, + __global const T *in, + __global T *out) +{ int idx = get_global_id(0); int idy = get_global_id(1); int idz = get_global_id(2); - if(idx >= bx || idy >= by) return; - T4 val; + if (idx >= bx || idy >= by) { + return; + } + T4 val; int in_off = (idz * iw_str + idy + iw_off) * ih_str + idx + ih_off; val = vload4(in_off, in); int out_off = (idz * ow_str + idy + ow_off) * oh_str + idx + oh_off; diff --git a/compute/tensor/src/gpu/mali/cl/rnncell_build_xh.cl b/compute/tensor/src/gpu/mali/cl/rnncell_build_xh.cl new file mode 100644 index 00000000..1553c2bd --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/rnncell_build_xh.cl @@ -0,0 +1,33 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +__kernel void rnncell_build_xh(const int xDim, + const int xhDim, + const int s_off, + const int bx, + __global const T *xmem, + __global const T *smem, + __global T *xhmem) +{ + int idx = get_global_id(0); + if (idx >= bx) { + return; + } + T val = 0; + if (idx < xDim) { + val = xmem[idx]; + } else if (idx < xhDim) { + val = smem[idx + s_off - xDim]; + } + xhmem[idx] = val; +} diff --git a/compute/tensor/src/gpu/mali/cl/rnncell_update_project_res.cl b/compute/tensor/src/gpu/mali/cl/rnncell_update_project_res.cl new file mode 100644 index 00000000..117fa1ee --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/rnncell_update_project_res.cl @@ -0,0 +1,118 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
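+// The kernel below runs after the recurrent projection: it blends the
+// projected output with the previous hidden state using zoneout,
+// h = res * (1 - zoneout) + h_prev * zoneout, and writes the result back
+// into the state buffer at offset col. eh (1-4) is the number of valid
+// lanes in the final 4-element group.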
+ +#define load_float4(off, val, buf) \ + { \ + T4 tmp; \ + tmp = vload4(0, buf + off); \ + val.x = tmp.x; \ + val.y = tmp.y; \ + val.z = tmp.z; \ + val.w = tmp.w; \ + } + +#define load_float3(off, val, buf) \ + { \ + T3 tmp; \ + tmp = vload3(0, buf + off); \ + val.x = tmp.x; \ + val.y = tmp.y; \ + val.z = tmp.z; \ + } + +#define load_float2(off, val, buf) \ + { \ + T2 tmp; \ + tmp = vload2(0, buf + off); \ + val.x = tmp.x; \ + val.y = tmp.y; \ + } + +#define store_float4(off, val, buf) \ + { \ + T4 tmp; \ + tmp.x = (T)val.x; \ + tmp.y = (T)val.y; \ + tmp.z = (T)val.z; \ + tmp.w = (T)val.w; \ + vstore4(tmp, 0, buf + off); \ + } + +#define store_float3(off, val, buf) \ + { \ + T3 tmp; \ + tmp.x = (T)val.x; \ + tmp.y = (T)val.y; \ + tmp.z = (T)val.z; \ + vstore3(tmp, 0, buf + off); \ + } + +#define store_float2(off, val, buf) \ + { \ + T2 tmp; \ + tmp.x = (T)val.x; \ + tmp.y = (T)val.y; \ + vstore2(tmp, 0, buf + off); \ + } + +__kernel void rnncell_update_project_state( + const int hDim, const int col, const int bx, float zoneout, __global T *out, __global T *smem) +{ + int idx = get_global_id(0); + if (idx >= bx) { + return; + } + char eh = ((idx << 2) + 4 <= hDim) ? 4 : (hDim & 3); + float4 res; + float4 hres; + int off = idx << 2; + if (eh == 4) { + load_float4(off, res, out); + } + if (eh == 3) { + load_float3(off, res, out); + } + if (eh == 2) { + load_float2(off, res, out); + } + if (eh == 1) { + res.x = out[off]; + } + hres = res; + + if (zoneout != 0) { + if (eh == 4) { + load_float4(off + col, hres, smem); + } + hres.x = res.x * (1 - zoneout) + hres.x * zoneout; + hres.y = res.y * (1 - zoneout) + hres.y * zoneout; + hres.z = res.z * (1 - zoneout) + hres.z * zoneout; + hres.w = res.w * (1 - zoneout) + hres.w * zoneout; + } + + if (eh == 4) { + store_float4(off + col, hres, smem); + return; + } + if (eh == 3) { + store_float3(off + col, hres, smem); + return; + } + if (eh == 2) { + store_float2(off + col, hres, smem); + return; + } + if (eh == 1) { + smem[off + col] = hres.x; + } +} diff --git a/compute/tensor/src/gpu/mali/cl/rnncell_update_res.cl b/compute/tensor/src/gpu/mali/cl/rnncell_update_res.cl new file mode 100644 index 00000000..707da037 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/rnncell_update_res.cl @@ -0,0 +1,166 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
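+// LSTM cell update. imem holds the four pre-activation gates packed as
+// [i, g, f, o], each col elements long, and smem holds the cell state:
+//   c' = sigmoid(f + fbias) * c + sigmoid(i) * tanh(g)
+//   h  = sigmoid(o) * tanh(c')
+// Zoneout blending is applied to the cell state (zonecell) and, when no
+// projection layer follows (noproject), to the stored hidden state
+// (zoneout). ec (1-4) counts the valid lanes of the final group.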
+ +#define load_float4(off, val, buf) \ + { \ + T4 tmp; \ + tmp = vload4(0, buf + off); \ + val.x = tmp.x; \ + val.y = tmp.y; \ + val.z = tmp.z; \ + val.w = tmp.w; \ + } + +#define load_float3(off, val, buf) \ + { \ + T3 tmp; \ + tmp = vload3(0, buf + off); \ + val.x = tmp.x; \ + val.y = tmp.y; \ + val.z = tmp.z; \ + } + +#define load_float2(off, val, buf) \ + { \ + T2 tmp; \ + tmp = vload2(0, buf + off); \ + val.x = tmp.x; \ + val.y = tmp.y; \ + } + +#define store_float4(off, val, buf) \ + { \ + T4 tmp; \ + tmp.x = (T)val.x; \ + tmp.y = (T)val.y; \ + tmp.z = (T)val.z; \ + tmp.w = (T)val.w; \ + vstore4(tmp, 0, buf + off); \ + } +#define store_float3(off, val, buf) \ + { \ + T3 tmp; \ + tmp.x = (T)val.x; \ + tmp.y = (T)val.y; \ + tmp.z = (T)val.z; \ + vstore3(tmp, 0, buf + off); \ + } +#define store_float2(off, val, buf) \ + { \ + T2 tmp; \ + tmp.x = (T)val.x; \ + tmp.y = (T)val.y; \ + vstore2(tmp, 0, buf + off); \ + } + +__kernel void rnncell_update_res(const int col, + const uchar noproject, + const int bx, + float fbias, + float zonecell, + float zoneout, + __global T *smem, + __global T *imem, + __global T *out) +{ + int idx = get_global_id(0); + if (idx >= bx) { + return; + } + char ec = ((idx << 2) + 4 <= col) ? 4 : (col & 3); + float4 cval; + float4 lcval; + float4 ival; + float4 gval; + float4 fval; + float4 oval; + float4 res; + float4 hres; + int off = idx << 2; + load_float4(off, cval, smem); + load_float4(off, ival, imem); + load_float4(off + col, gval, imem); + load_float4(off + col * 2, fval, imem); + load_float4(off + col * 3, oval, imem); + ival.x = 1.0 / (1.0 + exp(-ival.x)); + ival.y = 1.0 / (1.0 + exp(-ival.y)); + ival.z = 1.0 / (1.0 + exp(-ival.z)); + ival.w = 1.0 / (1.0 + exp(-ival.w)); + gval.x = tanh(gval.x); + gval.y = tanh(gval.y); + gval.z = tanh(gval.z); + gval.w = tanh(gval.w); + fval.x = 1.0 / (1.0 + exp(-(fval.x + fbias))); + fval.y = 1.0 / (1.0 + exp(-(fval.y + fbias))); + fval.z = 1.0 / (1.0 + exp(-(fval.z + fbias))); + fval.w = 1.0 / (1.0 + exp(-(fval.w + fbias))); + oval.x = 1.0 / (1.0 + exp(-oval.x)); + oval.y = 1.0 / (1.0 + exp(-oval.y)); + oval.z = 1.0 / (1.0 + exp(-oval.z)); + oval.w = 1.0 / (1.0 + exp(-oval.w)); + lcval = cval; + cval.x = cval.x * fval.x + ival.x * gval.x; + cval.y = cval.y * fval.y + ival.y * gval.y; + cval.z = cval.z * fval.z + ival.z * gval.z; + cval.w = cval.w * fval.w + ival.w * gval.w; + res.x = oval.x * tanh(cval.x); + res.y = oval.y * tanh(cval.y); + res.z = oval.z * tanh(cval.z); + res.w = oval.w * tanh(cval.w); + hres = res; + + if (zonecell != 0) { + cval.x = cval.x * (1 - zonecell) + lcval.x * zonecell; + cval.y = cval.y * (1 - zonecell) + lcval.y * zonecell; + cval.z = cval.z * (1 - zonecell) + lcval.z * zonecell; + cval.w = cval.w * (1 - zonecell) + lcval.w * zonecell; + } + + if (zoneout != 0 && noproject) { + load_float4(off + col, hres, smem); + hres.x = res.x * (1 - zoneout) + hres.x * zoneout; + hres.y = res.y * (1 - zoneout) + hres.y * zoneout; + hres.z = res.z * (1 - zoneout) + hres.z * zoneout; + hres.w = res.w * (1 - zoneout) + hres.w * zoneout; + } + + if (ec == 4) { + store_float4(off, cval, smem); + store_float4(off, res, out); + if (noproject) { + store_float4(off + col, hres, smem); + } + } else { + if (ec == 1) { + smem[off] = (T)cval.x; + out[off] = (T)res.x; + if (noproject) { + smem[off + col] = (T)hres.x; + } + } + if (ec == 2) { + store_float2(off, cval, smem); + store_float2(off, res, out); + if (noproject) { + store_float2(off + col, hres, smem); + } + } + if (ec == 3) { + store_float3(off, cval, 
smem); + store_float3(off, res, out); + if (noproject) { + store_float3(off + col, hres, smem); + } + } + } +} diff --git a/tensor_computing/src/gpu/mali/cl/scale.cl b/compute/tensor/src/gpu/mali/cl/scale.cl similarity index 58% rename from tensor_computing/src/gpu/mali/cl/scale.cl rename to compute/tensor/src/gpu/mali/cl/scale.cl index 3b227d6b..2549a4b8 100644 --- a/tensor_computing/src/gpu/mali/cl/scale.cl +++ b/compute/tensor/src/gpu/mali/cl/scale.cl @@ -11,43 +11,65 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - -#define MANGLE_NAME_IMPL(base, MD) base ## MD +#define MANGLE_NAME_IMPL(base, MD) base##MD #define MANGLE_NAME(base, MD) MANGLE_NAME_IMPL(base, MD) - -__kernel void MANGLE_NAME(scale_, MD)(const int h, const int ih_str, const int iw_str, const int ih_off, const int iw_off, - const int oh_str, const int ow_str, const int oh_off, const int ow_off, const int bx, const int by, __global const T* alpha, __global const T* beta, __global T* input, __global T* output) { - +#if defined(USE_SAME) +__kernel void MANGLE_NAME(scale1_, MD) +#else +__kernel void MANGLE_NAME(scale_, MD) +#endif + (const int h, + const int ih_str, + const int iw_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + const int bx, + const int by, + __global const T *alpha, + __global const T *beta, + __global T *input, + __global T *output) +{ int idx = get_global_id(0); int idy = get_global_id(1); int idz = get_global_id(2); - if(idx >= bx || idy >= by) return; + if (idx >= bx || idy >= by) { + return; + } +#if defined(USE_SAME) + T4 alp = vload4(0, alpha); + alp.y = alp.x; + alp.z = alp.x; + alp.w = alp.x; + T4 bet = 0; +#if defined(USE_BETA) + bet = vload4(0, beta); + bet.y = bet.x; + bet.z = bet.x; + bet.w = bet.x; +#endif +#else T4 alp = vload4(idz, alpha); T4 bet = 0; #if defined(USE_BETA) bet = vload4(idz, beta); -#endif - T8 val; - int in_off = (idz * iw_str + idy + iw_off) * ih_str + (idx << 1) + ih_off; - val = vload8(0, input + (in_off << 2)); +#endif +#endif + T4 val; + int in_off = (idz * iw_str + idy + iw_off) * ih_str + idx + ih_off; + val = vload4(in_off, input); + val.s0 = val.s0 * alp.x + bet.x; val.s1 = val.s1 * alp.y + bet.y; val.s2 = val.s2 * alp.z + bet.z; val.s3 = val.s3 * alp.w + bet.w; - val.s4 = val.s4 * alp.x + bet.x; - val.s5 = val.s5 * alp.y + bet.y; - val.s6 = val.s6 * alp.z + bet.z; - val.s7 = val.s7 * alp.w + bet.w; - - int out_off = (idz * ow_str + idy + ow_off) * oh_str + (idx << 1) + oh_off; - if((idx << 1) + 1 < h){ - vstore8(val, 0, output + (out_off << 2)); - } else { - vstore4(val.s0123, 0, output + (out_off << 2)); - } + + int out_off = (idz * ow_str + idy + ow_off) * oh_str + idx + oh_off; + vstore4(val, out_off, output); } diff --git a/tensor_computing/src/gpu/mali/cl/slice_h.cl b/compute/tensor/src/gpu/mali/cl/slice_h.cl similarity index 72% rename from tensor_computing/src/gpu/mali/cl/slice_h.cl rename to compute/tensor/src/gpu/mali/cl/slice_h.cl index 4ab95e97..2124b446 100644 --- a/tensor_computing/src/gpu/mali/cl/slice_h.cl +++ b/compute/tensor/src/gpu/mali/cl/slice_h.cl @@ -11,30 +11,45 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 
IN THE SOFTWARE. - - - -#define MANGLE_NAME_IMPL(base, N) base ## N +#define MANGLE_NAME_IMPL(base, N) base##N #define MANGLE_NAME(base, N) MANGLE_NAME_IMPL(base, N) -__kernel void MANGLE_NAME(slice_h_, N)(const int ih_str, const int iw_str, const int ih_off, const int iw_off, const int bx, const int by, __global T* input, - const int oh_str0, const int ow_str0, const int oh_off0, const int ow_off0, const int slice_end0, __global T* output0, - const int oh_str1, const int ow_str1, const int oh_off1, const int ow_off1, const int slice_end1, __global T* output1 - ) { +__kernel void MANGLE_NAME(slice_h_, N)(const int ih_str, + const int iw_str, + const int ih_off, + const int iw_off, + const int bx, + const int by, + __global T *input, + const int oh_str0, + const int ow_str0, + const int oh_off0, + const int ow_off0, + const int slice_end0, + __global T *output0, + const int oh_str1, + const int ow_str1, + const int oh_off1, + const int ow_off1, + const int slice_end1, + __global T *output1) +{ int idx = get_global_id(0); int idy = get_global_id(1); int idz = get_global_id(2); - if(idx >= bx || idy >= by) return; + if (idx >= bx || idy >= by) { + return; + } T4 val; int in_off = (idz * iw_str + idy + iw_off) * ih_str + idx + ih_off; val = vload4(in_off, input); - if(idx < slice_end0) { + if (idx < slice_end0) { int out_off = (idz * ow_str0 + idy + ow_off0) * oh_str0 + idx + oh_off0; vstore4(val, out_off, output0); return; } - if(idx < slice_end1) { + if (idx < slice_end1) { int out_off = (idz * ow_str1 + idy + ow_off1) * oh_str1 + idx + oh_off1; vstore4(val, out_off, output1); return; diff --git a/compute/tensor/src/gpu/mali/cl/softmax.cl b/compute/tensor/src/gpu/mali/cl/softmax.cl new file mode 100644 index 00000000..c135577e --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/softmax.cl @@ -0,0 +1,115 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
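+// softmax: channel-wise softmax for the channel-blocked (c/4) layout. The
+// channel axis is split into cd4 groups of 4 with ce4 valid lanes in the
+// last group. Three passes over the groups: reduce to a scalar max,
+// accumulate the sum of exp(x - max), then normalize and store.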
+ +__kernel void softmax(const int cd4, + const int ce4, + const int ih_str, + const int ihw_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ohw_str, + const int oh_off, + const int ow_off, + const int bx, + const int by, + __global T *in, + __global T *out) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + if (idx >= bx || idy >= by) { + return; + } + + float4 maxval = (float4)(-FLT_MAX); + float4 tmp; + float4 lval; + T4 val; + int index = (idy + iw_off) * ih_str + idx + ih_off; + for (int i = 0; i < cd4 - 1; i++) { + val = vload4(index + i * ihw_str, in); + tmp.x = (float)val.x; + tmp.y = (float)val.y; + tmp.z = (float)val.z; + tmp.w = (float)val.w; + maxval = fmax(maxval, tmp); + } + val = vload4(index + (cd4 - 1) * ihw_str, in); + lval.x = (float)val.x; + lval.y = (float)val.y; + lval.z = (float)val.z; + lval.w = (float)val.w; + if (maxval.x < maxval.y) { + maxval.x = maxval.y; + } + if (maxval.x < maxval.z) { + maxval.x = maxval.z; + } + if (maxval.x < maxval.w) { + maxval.x = maxval.w; + } + if (maxval.x < lval.x) { + maxval.x = lval.x; + } + if (ce4 > 1 && maxval.x < lval.y) { + maxval.x = lval.y; + } + if (ce4 > 2 && maxval.x < lval.z) { + maxval.x = lval.z; + } + if (ce4 > 3 && maxval.x < lval.w) { + maxval.x = lval.w; + } + + float sumexp = 0; + for (int i = 0; i < cd4 - 1; i++) { + val = vload4(index + i * ihw_str, in); + sumexp += exp((float)val.x - maxval.x); + sumexp += exp((float)val.y - maxval.x); + sumexp += exp((float)val.z - maxval.x); + sumexp += exp((float)val.w - maxval.x); + } + sumexp += exp(lval.x - maxval.x); + if (ce4 > 1) { + sumexp += exp(lval.y - maxval.x); + } + if (ce4 > 2) { + sumexp += exp(lval.z - maxval.x); + } + if (ce4 > 3) { + sumexp += exp(lval.w - maxval.x); + } + + sumexp = 1.0 / sumexp; + int out_off = (idy + ow_off) * oh_str + idx + oh_off; + for (int i = 0; i < cd4 - 1; i++) { + val = vload4(index + i * ihw_str, in); + val.x = exp((float)val.x - maxval.x) * sumexp; + val.y = exp((float)val.y - maxval.x) * sumexp; + val.z = exp((float)val.z - maxval.x) * sumexp; + val.w = exp((float)val.w - maxval.x) * sumexp; + vstore4(val, out_off + i * ohw_str, out); + } + val.x = exp(lval.x - maxval.x) * sumexp; + if (ce4 > 1) { + val.y = exp(lval.y - maxval.x) * sumexp; + } + if (ce4 > 2) { + val.z = exp(lval.z - maxval.x) * sumexp; + } + if (ce4 > 3) { + val.w = exp(lval.w - maxval.x) * sumexp; + } + vstore4(val, out_off + (cd4 - 1) * ohw_str, out); +} diff --git a/tensor_computing/src/gpu/mali/cl/multiply_align_nchw.cl b/compute/tensor/src/gpu/mali/cl/softmax_h1w1_max_all.cl similarity index 59% rename from tensor_computing/src/gpu/mali/cl/multiply_align_nchw.cl rename to compute/tensor/src/gpu/mali/cl/softmax_h1w1_max_all.cl index d21cbaee..2e162b16 100644 --- a/tensor_computing/src/gpu/mali/cl/multiply_align_nchw.cl +++ b/compute/tensor/src/gpu/mali/cl/softmax_h1w1_max_all.cl @@ -11,23 +11,35 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - - - -__kernel void multiply_align_nchw(const int ih_str, const int iw_str, const int ih_off, const int iw_off, const int oh_str, const int ow_str, const int oh_off, const int ow_off, - const int bx, const int by, const float alp, const float bet, __global T* input, __global T* output) { +__kernel void softmax_h1w1_max_all(const int kn, __global T *in) +{ int idx = get_global_id(0); - int idy = get_global_id(1); - int idz = get_global_id(2); - if(idx >= bx || idy >= by) return; + if (idx >= 1) { + return; + } + + float4 maxval = (float4)(-FLT_MAX); + float4 tmp; T4 val; - int in_off = (idz * ih_str + idy + ih_off) * iw_str + (idx << 2) + iw_off; - val = vload4(0, input + in_off); - val.x = ((float)val.x) * alp + bet; - val.y = ((float)val.y) * alp + bet; - val.z = ((float)val.z) * alp + bet; - val.w = ((float)val.w) * alp + bet; - int out_off = (idz * oh_str + idy + oh_off) * ow_str + (idx << 2) + ow_off; - vstore4(val, 0, output + out_off); -} + int kn4 = kn >> 2; + for (int i = 0; i < kn4; ++i) { + val = vload4(i, in); + tmp.x = (float)val.x; + tmp.y = (float)val.y; + tmp.z = (float)val.z; + tmp.w = (float)val.w; + maxval = fmax(maxval, tmp); + } + + if (maxval.x < maxval.y) { + maxval.x = maxval.y; + } + if (maxval.x < maxval.z) { + maxval.x = maxval.z; + } + if (maxval.x < maxval.w) { + maxval.x = maxval.w; + } + + in[kn + 1] = maxval.x; +} \ No newline at end of file diff --git a/compute/tensor/src/gpu/mali/cl/softmax_h1w1_max_part.cl b/compute/tensor/src/gpu/mali/cl/softmax_h1w1_max_part.cl new file mode 100644 index 00000000..ead93c75 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/softmax_h1w1_max_part.cl @@ -0,0 +1,60 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
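+// softmax_h1w1_max_part: first stage of the two-pass max reduction for the
+// h1w1 softmax. kn work-items stride through the cd4 channel groups, each
+// writing one partial maximum to out[idx]; the last item also folds in the
+// ce4 valid lanes of the tail group. softmax_h1w1_max_all above then
+// reduces the kn partials into slot kn + 1 of the same buffer.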
+ +__kernel void softmax_h1w1_max_part( + const int cd4, const int ce4, const int kn, __global const T *in, __global T *out) +{ + int idx = get_global_id(0); + if (idx >= kn) { + return; + } + + float4 maxval = (float4)(-FLT_MAX); + float4 tmp; + T4 val; + + for (int i = idx; i < cd4 - 1; i = i + kn) { + val = vload4(i, in); + tmp.x = (float)val.x; + tmp.y = (float)val.y; + tmp.z = (float)val.z; + tmp.w = (float)val.w; + maxval = fmax(maxval, tmp); + } + + if (maxval.x < maxval.y) { + maxval.x = maxval.y; + } + if (maxval.x < maxval.z) { + maxval.x = maxval.z; + } + if (maxval.x < maxval.w) { + maxval.x = maxval.w; + } + + if (idx == kn - 1) { + val = vload4(cd4 - 1, in); + maxval.x = fmax((float)val.x, maxval.x); + if (ce4 >= 2) { + maxval.x = fmax((float)val.y, maxval.x); + } + if (ce4 >= 3) { + maxval.x = fmax((float)val.z, maxval.x); + } + if (ce4 >= 4) { + maxval.x = fmax((float)val.w, maxval.x); + } + } + + out[idx] = (T)maxval.x; +} diff --git a/tensor_computing/src/gpu/mali/cl/transpose_nchw_0132.cl b/compute/tensor/src/gpu/mali/cl/softmax_h1w1_output.cl similarity index 59% rename from tensor_computing/src/gpu/mali/cl/transpose_nchw_0132.cl rename to compute/tensor/src/gpu/mali/cl/softmax_h1w1_output.cl index 81a3f571..049d43e4 100644 --- a/tensor_computing/src/gpu/mali/cl/transpose_nchw_0132.cl +++ b/compute/tensor/src/gpu/mali/cl/softmax_h1w1_output.cl @@ -11,24 +11,27 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +__kernel void softmax_h1w1_output(const int cd4, + const int ce4, + const int kn, + __global const T *in, + __global const T *tmp, + __global T *out) +{ + int idx = get_global_id(0); + if (idx >= cd4) { + return; + } + T4 val; + val = vload4(idx, in); + float maxv = (float)(tmp[kn + 1]); + float sumexp = (float)(tmp[kn]); + val.x = (T)(exp((float)val.x - maxv) * sumexp); + val.y = (T)(exp((float)val.y - maxv) * sumexp); + val.z = (T)(exp((float)val.z - maxv) * sumexp); + val.w = (T)(exp((float)val.w - maxv) * sumexp); - -__kernel void transpose_nchw_0132(const int ih_str, const int iw_str, const int ih_off, const int iw_off, const int oh_str, const int ow_str, - const int oh_off, const int ow_off, const int oh, const int bx, const int by, __global const T* input, __global T* output) { - - int idx = get_global_id(0); - int idy = get_global_id(1); - int idz = get_global_id(2); - if(idx >= bx || idy >= by) return; - T4 val; - int idx4 = idx << 2; - int in_off = (idz * ih_str + idy + ih_off) * iw_str + idx4 + iw_off; - val = vload4(0, input + in_off); - int out_off = (idz * oh_str + idx4 + oh_off) * ow_str + idy + ow_off; - output[out_off] = val.x; - if(idx4 + 1 < oh) output[out_off + ow_str] = val.y; - if(idx4 + 2 < oh) output[out_off + ow_str * 2] = val.z; - if(idx4 + 3 < oh) output[out_off + ow_str * 3] = val.w; -} + vstore4(val, idx, out); +} \ No newline at end of file diff --git a/compute/tensor/src/gpu/mali/cl/softmax_h1w1_sum_all.cl b/compute/tensor/src/gpu/mali/cl/softmax_h1w1_sum_all.cl new file mode 100644 index 00000000..08014343 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/softmax_h1w1_sum_all.cl @@ -0,0 +1,33 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +__kernel void softmax_h1w1_sum_all(const int kn, __global T *in) +{ + int idx = get_global_id(0); + if (idx >= 1) { + return; + } + + T4 val; + float sumexp = 0; + int kn4 = kn >> 2; + for (int i = 0; i < kn4; ++i) { + val = vload4(i, in); + sumexp += (float)val.x; + sumexp += (float)val.y; + sumexp += (float)val.z; + sumexp += (float)val.w; + } + + in[kn] = (T)(1.0f / sumexp); +} diff --git a/compute/tensor/src/gpu/mali/cl/softmax_h1w1_sum_part.cl b/compute/tensor/src/gpu/mali/cl/softmax_h1w1_sum_part.cl new file mode 100644 index 00000000..aae3784c --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/softmax_h1w1_sum_part.cl @@ -0,0 +1,49 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
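+// softmax_h1w1_sum_part: partial sum stage of the h1w1 softmax. Each of the
+// kn work-items accumulates exp(x - max) over its strided share of the cd4
+// channel groups, reading the global max from out[kn + 1] (written by the
+// max kernels above) and storing its partial sum in out[idx];
+// softmax_h1w1_sum_all then combines the partials into 1 / sumexp.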
+ +__kernel void softmax_h1w1_sum_part( + const int cd4, const int ce4, const int kn, __global const T *in, __global T *out) +{ + int idx = get_global_id(0); + if (idx >= kn) { + return; + } + + T4 val; + float maxval = (float)(out[kn + 1]); + float sumexp = 0.0f; + for (int i = idx; i < cd4 - 1; i = i + kn) { + val = vload4(i, in); + + sumexp += exp((float)val.x - maxval); + sumexp += exp((float)val.y - maxval); + sumexp += exp((float)val.z - maxval); + sumexp += exp((float)val.w - maxval); + } + + if (idx == kn - 1) { + val = vload4(cd4 - 1, in); + sumexp += exp((float)val.x - maxval); + if (ce4 >= 2) { + sumexp += exp((float)val.y - maxval); + } + if (ce4 >= 3) { + sumexp += exp((float)val.z - maxval); + } + if (ce4 >= 4) { + sumexp += exp((float)val.w - maxval); + } + } + + out[idx] = (T)sumexp; +} \ No newline at end of file diff --git a/tensor_computing/src/gpu/mali/cl/softmax_nchw_c.cl b/compute/tensor/src/gpu/mali/cl/softmax_nchw_c.cl similarity index 57% rename from tensor_computing/src/gpu/mali/cl/softmax_nchw_c.cl rename to compute/tensor/src/gpu/mali/cl/softmax_nchw_c.cl index ea50ad50..5825115a 100644 --- a/tensor_computing/src/gpu/mali/cl/softmax_nchw_c.cl +++ b/compute/tensor/src/gpu/mali/cl/softmax_nchw_c.cl @@ -11,22 +11,34 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - -__kernel void softmax_nchw_c(const int c, const int iw_str, const int ihw_str, const int iw_off, const int ih_off, - const int ow_str, const int ohw_str, const int ow_off, const int oh_off, const int bx, const int by, __global const T* in, __global T* out) { +__kernel void softmax_nchw_c(const int c, + const int iw_str, + const int ihw_str, + const int iw_off, + const int ih_off, + const int ow_str, + const int ohw_str, + const int ow_off, + const int oh_off, + const int ow, + const int bx, + const int by, + __global T *in, + __global T *out) +{ int idx = get_global_id(0); int idy = get_global_id(1); - if(idx >= bx || idy >= by) return; + if (idx >= bx || idy >= by) { + return; + } + int ew = ((idx << 2) + 4 <= ow) ? 
4 : (ow & 3); float4 maxval = (float4)(-FLT_MAX); float4 tmp; - T4 val; + T4 val; int index = (idy + ih_off) * iw_str + (idx << 2) + iw_off; for (int i = 0; i < c; i++) { - val = vload4(0, in + index + i * ihw_str); + val = vload4(0, in + index + i * ihw_str); tmp.x = (float)val.x; tmp.y = (float)val.y; tmp.z = (float)val.z; @@ -36,7 +48,7 @@ __kernel void softmax_nchw_c(const int c, const int iw_str, const int ihw_str, c float4 sumexp = 0; for (int i = 0; i < c; i++) { - val = vload4(0, in + index + i * ihw_str); + val = vload4(0, in + index + i * ihw_str); sumexp.x += exp((float)val.x - maxval.x); sumexp.y += exp((float)val.y - maxval.y); sumexp.z += exp((float)val.z - maxval.z); @@ -47,14 +59,30 @@ __kernel void softmax_nchw_c(const int c, const int iw_str, const int ihw_str, c sumexp.y = 1.0 / sumexp.y; sumexp.z = 1.0 / sumexp.z; sumexp.w = 1.0 / sumexp.w; - T4 res; int out_off = (idy + oh_off) * ow_str + (idx << 2) + ow_off; - for (int i = 0; i < c; i++) { - val = vload4(0, in + index + i * ihw_str); - res.x = (T)exp(val.x - maxval.x) * sumexp.x; - res.y = (T)exp(val.y - maxval.y) * sumexp.y; - res.z = (T)exp(val.z - maxval.z) * sumexp.z; - res.w = (T)exp(val.w - maxval.w) * sumexp.w; - vstore4(res, 0, out + out_off + i * ohw_str); + if (ew == 4) { + for (int i = 0; i < c; i++) { + val = vload4(0, in + index + i * ihw_str); + val.x = exp((float)val.x - maxval.x) * sumexp.x; + val.y = exp((float)val.y - maxval.y) * sumexp.y; + val.z = exp((float)val.z - maxval.z) * sumexp.z; + val.w = exp((float)val.w - maxval.w) * sumexp.w; + vstore4(val, 0, out + out_off + i * ohw_str); + } + } else { + for (int i = 0; i < c; i++) { + val = vload4(0, in + index + i * ihw_str); + val.x = exp((float)val.x - maxval.x) * sumexp.x; + val.y = exp((float)val.y - maxval.y) * sumexp.y; + val.z = exp((float)val.z - maxval.z) * sumexp.z; + if (ew < 2) { + val.y = 0; + } + if (ew < 3) { + val.z = 0; + } + val.w = 0; + vstore4(val, 0, out + out_off + i * ohw_str); + } } } diff --git a/compute/tensor/src/gpu/mali/cl/softmax_nchw_w.cl b/compute/tensor/src/gpu/mali/cl/softmax_nchw_w.cl new file mode 100644 index 00000000..90f1e079 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/softmax_nchw_w.cl @@ -0,0 +1,116 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
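+// softmax_nchw_w: softmax along w for NCHW data. w is processed in wd4
+// groups of 4 with we4 valid lanes in the final group, using the same
+// three-pass scheme as softmax.cl above: scalar max, sum of exp(x - max),
+// then normalize.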
+ +__kernel void softmax_nchw_w(const int wd4, + const int we4, + const int iw_str, + const int ih_str, + const int iw_off, + const int ih_off, + const int ow_str, + const int oh_str, + const int ow_off, + const int oh_off, + const int bx, + const int by, + __global T *in, + __global T *out) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + if (idx >= bx || idy >= by) { + return; + } + + float4 maxval = (float4)(-FLT_MAX); + float4 tmp; + float4 lval; + T4 val; + + int index = (idy * ih_str + idx + ih_off) * iw_str + iw_off; + for (int i = 0; i < wd4 - 1; i++) { + val = vload4(i, in + index); + tmp.x = (float)val.x; + tmp.y = (float)val.y; + tmp.z = (float)val.z; + tmp.w = (float)val.w; + maxval = fmax(maxval, tmp); + } + val = vload4(wd4 - 1, in + index); + lval.x = (float)val.x; + lval.y = (float)val.y; + lval.z = (float)val.z; + lval.w = (float)val.w; + if (maxval.x < maxval.y) { + maxval.x = maxval.y; + } + if (maxval.x < maxval.z) { + maxval.x = maxval.z; + } + if (maxval.x < maxval.w) { + maxval.x = maxval.w; + } + if (maxval.x < lval.x) { + maxval.x = lval.x; + } + if (we4 > 1 && maxval.x < lval.y) { + maxval.x = lval.y; + } + if (we4 > 2 && maxval.x < lval.z) { + maxval.x = lval.z; + } + if (we4 > 3 && maxval.x < lval.w) { + maxval.x = lval.w; + } + + float sumexp = 0; + for (int i = 0; i < wd4 - 1; i++) { + val = vload4(i, in + index); + sumexp += exp((float)val.x - maxval.x); + sumexp += exp((float)val.y - maxval.x); + sumexp += exp((float)val.z - maxval.x); + sumexp += exp((float)val.w - maxval.x); + } + sumexp += exp(lval.x - maxval.x); + if (we4 > 1) { + sumexp += exp(lval.y - maxval.x); + } + if (we4 > 2) { + sumexp += exp(lval.z - maxval.x); + } + if (we4 > 3) { + sumexp += exp(lval.w - maxval.x); + } + + sumexp = 1.0 / sumexp; + int out_off = (idy * oh_str + idx + oh_off) * ow_str + ow_off; + for (int i = 0; i < wd4 - 1; i++) { + val = vload4(i, in + index); + val.x = exp((float)val.x - maxval.x) * sumexp; + val.y = exp((float)val.y - maxval.x) * sumexp; + val.z = exp((float)val.z - maxval.x) * sumexp; + val.w = exp((float)val.w - maxval.x) * sumexp; + vstore4(val, i, out + out_off); + } + val.x = exp(lval.x - maxval.x) * sumexp; + if (we4 > 1) { + val.y = exp(lval.y - maxval.x) * sumexp; + } + if (we4 > 2) { + val.z = exp(lval.z - maxval.x) * sumexp; + } + if (we4 > 3) { + val.w = exp(lval.w - maxval.x) * sumexp; + } + vstore4(val, wd4 - 1, out + out_off); +} diff --git a/tensor_computing/src/gpu/mali/cl/space2depth.cl b/compute/tensor/src/gpu/mali/cl/space2depth.cl similarity index 67% rename from tensor_computing/src/gpu/mali/cl/space2depth.cl rename to compute/tensor/src/gpu/mali/cl/space2depth.cl index 6cbcbaee..b145410f 100644 --- a/tensor_computing/src/gpu/mali/cl/space2depth.cl +++ b/compute/tensor/src/gpu/mali/cl/space2depth.cl @@ -1,11 +1,23 @@ -__kernel void space2depth(const int iw_str, const int ih_str, const int iw_off, const int ih_off, - const int oh_str, const int ohw_str, const int ow_off, const int oh_off, - const int bx, const int by, __global const uchar* in, __global T* out) { +__kernel void space2depth(const int iw_str, + const int ih_str, + const int iw_off, + const int ih_off, + const int oh_str, + const int ohw_str, + const int ow_off, + const int oh_off, + const int bx, + const int by, + __global const uchar *in, + __global T *out) +{ const int idx = get_global_id(0); const int idy = get_global_id(1); - if(idx >= bx || idy >= by) return; + if (idx >= bx || idy >= by) { + return; + } - const int in_off = ((idx << 2)+ ih_off) * iw_str 
+ (idy << 2) + iw_off; + const int in_off = ((idx << 2) + ih_off) * iw_str + (idy << 2) + iw_off; uchar4 tmp0 = vload4(0, in + in_off); uchar4 tmp1 = vload4(0, in + in_off + iw_str); uchar4 tmp2 = vload4(0, in + in_off + (iw_str << 1)); @@ -28,7 +40,7 @@ __kernel void space2depth(const int iw_str, const int ih_str, const int iw_off, val3.z = tmp3.z / (T)(255); val3.w = tmp3.w / (T)(255); - const int out_off = (idy + ow_off) * oh_str + idx + oh_off; + const int out_off = (idy + ow_off) * oh_str + idx + oh_off; vstore4(val0, out_off, out); vstore4(val1, out_off + ohw_str, out); vstore4(val2, out_off + ohw_str * 2, out); diff --git a/tensor_computing/src/gpu/mali/cl/squeeze.cl b/compute/tensor/src/gpu/mali/cl/squeeze.cl similarity index 80% rename from tensor_computing/src/gpu/mali/cl/squeeze.cl rename to compute/tensor/src/gpu/mali/cl/squeeze.cl index 51717f11..e44aa6fb 100644 --- a/tensor_computing/src/gpu/mali/cl/squeeze.cl +++ b/compute/tensor/src/gpu/mali/cl/squeeze.cl @@ -11,17 +11,26 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - - -__kernel void squeeze(const int h, const int w, const int ih_str, const int iw_str, const int ih_off, const int iw_off, - const int oh_str, const int ow_str, const int oh_off, const int ow_off, __global const T* in, __global T* out) { +__kernel void squeeze(const int h, + const int w, + const int ih_str, + const int iw_str, + const int ih_off, + const int iw_off, + const int oh_str, + const int ow_str, + const int oh_off, + const int ow_off, + __global const T *in, + __global T *out) +{ int idx = get_global_id(0); int idy = get_global_id(1); - if(idx >= h || idy >= w) return; + if (idx >= h || idy >= w) { + return; + } int idz = get_global_id(2); - T4 val; + T4 val; int in_off = (idz * iw_str + idy + iw_off) * ih_str + idx + ih_off; val = vload4(in_off, in); int out_off = (idz * ow_str + idy + ow_off) * oh_str + idx + oh_off; diff --git a/compute/tensor/src/gpu/mali/cl/transpose_3d_nchw.cl b/compute/tensor/src/gpu/mali/cl/transpose_3d_nchw.cl new file mode 100644 index 00000000..dd9c513c --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/transpose_3d_nchw.cl @@ -0,0 +1,119 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
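+
+// Note on the kernel below: a generic 5D (n, c, t, h, w) transpose. Each
+// work-item loads up to four consecutive elements along W (ew masks the tail),
+// then dim0..dim3 select which input index (w, h, t, c) feeds each output axis.
+// out_str is the output stride of whichever output axis the input W dimension
+// was mapped to, so the four lanes can be scattered with it.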
+ +__kernel void transpose_nchw(const int iw_str, + const int ih_str, + const int iw_off, + const int ih_off, + const int ow_str, + const int oh_str, + const int ow_off, + const int oh_off, + const int dim0, + const int dim1, + const int dim2, + const int dim3, + const int iw, + const int it, + const int ot, + const int bx, + const int by, + __global const T *in, + __global T *out) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + int idz = get_global_id(2); + int idt = idz % it; + int idc = idz / it; + if (idx >= bx || idy >= by) { + return; + } + char ew = (((idx << 2) + 4) <= iw) ? 4 : (iw & 3); + T4 val = 0; + const int in_off = (idz * ih_str + idy + ih_off) * iw_str + (idx << 2) + iw_off; + if (ew == 4) { + val = vload4(0, in + in_off); + } else { + if (ew == 1) { + val.x = in[in_off]; + } + if (ew == 2) { + val.xy = vload2(0, in + in_off); + } + if (ew == 3) { + val.xyz = vload3(0, in + in_off); + } + } + int idox = idx << 2; + int idoy = idy; + int idot = idt; + int idoc = idc; + + int out_str = 1; + + if (dim0 == 1) { + idox = idy; + } + if (dim0 == 2) { + idox = idt; + } + if (dim0 == 3) { + idox = idc; + } + + if (dim1 == 0) { + idoy = idx << 2; + out_str = ow_str; + } + if (dim1 == 2) { + idoy = idt; + } + if (dim1 == 3) { + idoy = idc; + } + + if (dim2 == 0) { + idot = idx << 2; + out_str = ow_str * oh_str; + } + if (dim2 == 1) { + idot = idy; + } + if (dim2 == 3) { + idot = idc; + } + + if (dim3 == 0) { + idoc = idx << 2; + out_str = ow_str * oh_str * ot; + } + if (dim3 == 1) { + idoc = idy; + } + if (dim3 == 2) { + idoc = idt; + } + + int out_off = ((idoc * ot + idot) * oh_str + idoy + oh_off) * ow_str + idox + ow_off; + out[out_off] = val.x; + if (ew > 1) { + out[out_off + out_str] = val.y; + } + if (ew > 2) { + out[out_off + out_str * 2] = val.z; + } + if (ew > 3) { + out[out_off + out_str * 3] = val.w; + } +} diff --git a/compute/tensor/src/gpu/mali/cl/transpose_nchw.cl b/compute/tensor/src/gpu/mali/cl/transpose_nchw.cl new file mode 100644 index 00000000..b78112b3 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/transpose_nchw.cl @@ -0,0 +1,92 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
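+
+// Note on the kernel below: the 4D (n, c, h, w) variant of the transpose
+// above. Four elements along W are loaded per work-item (ew masks the tail),
+// dim0..dim2 choose the permutation, and out_str is the output stride of the
+// axis that input W maps to, used to scatter the four lanes.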
+ +__kernel void transpose_nchw(const int iw_str, + const int ih_str, + const int iw_off, + const int ih_off, + const int ow_str, + const int oh_str, + const int ow_off, + const int oh_off, + const int dim0, + const int dim1, + const int dim2, + const int iw, + const int bx, + const int by, + __global const T *in, + __global T *out) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + int idz = get_global_id(2); + if (idx >= bx || idy >= by) { + return; + } + char ew = (((idx << 2) + 4) <= iw) ? 4 : (iw & 3); + T4 val = 0; + const int in_off = (idz * ih_str + idy + ih_off) * iw_str + (idx << 2) + iw_off; + if (ew == 4) { + val = vload4(0, in + in_off); + } else { + if (ew == 1) { + val.x = in[in_off]; + } + if (ew == 2) { + val.xy = vload2(0, in + in_off); + } + if (ew == 3) { + val.xyz = vload3(0, in + in_off); + } + } + int ox = idx << 2; + int oy = idy; + int oz = idz; + int out_str = 1; + + if (dim0 == 1) { + ox = idy; + } + if (dim0 == 2) { + ox = idz; + } + + if (dim1 == 0) { + oy = idx << 2; + out_str = ow_str; + } + if (dim1 == 2) { + oy = idz; + } + + if (dim2 == 0) { + oz = idx << 2; + out_str = ow_str * oh_str; + } + if (dim2 == 1) { + oz = idy; + } + + int out_off = (oz * oh_str + oy + oh_off) * ow_str + ox + ow_off; + out[out_off] = val.x; + if (ew > 1) { + out[out_off + out_str] = val.y; + } + if (ew > 2) { + out[out_off + out_str * 2] = val.z; + } + if (ew > 3) { + out[out_off + out_str * 3] = val.w; + } +} diff --git a/compute/tensor/src/gpu/mali/clip.cpp b/compute/tensor/src/gpu/mali/clip.cpp new file mode 100644 index 00000000..93878bfa --- /dev/null +++ b/compute/tensor/src/gpu/mali/clip.cpp @@ -0,0 +1,85 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
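+
+// Note on this file: host-side glue for the GPU clip operator. It infers the
+// output tensor/GCL memory descriptors (the output mirrors the input, since
+// clip is elementwise), validates handles and memory formats, and dispatches
+// by data type to the fp16 kernel; other types fall through to NOT_SUPPORTED.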
+
+#include "sys.h"
+#include "types.h"
+#include "tensor_desc.h"
+#include "error.h"
+#include "gpu/mali/tensor_computing_mali.h"
+#include "gpu/mali/fp16/clip_mali_fp16.h"
+
+EE clip_infer_output_size_mali(TensorDesc inputDesc,
+    TensorDesc *outputDesc,
+    GCLMemDesc_t gclmemInputDesc,
+    GCLMemDesc_t gclmemOutputDesc)
+{
+    /* tensorDesc records the CPU original data format info */
+    /* gclmemDesc records the GPU transformed data format info */
+    if (outputDesc) {
+        *outputDesc = inputDesc;
+    }
+    DataType idt;
+    DataFormat idf;
+    U32 iw, ih, ic, in;
+    tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw);
+
+    if (idf == DF_NCHW) {
+        CHECK_STATUS(infer_gclmem_desc_ncwhc4(
+            iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc));
+        if (gclmemInputDesc && gclmemOutputDesc) {
+            *gclmemOutputDesc = *gclmemInputDesc;  // the input and output memory may be the same
+        }
+        return SUCCESS;
+    }
+    return NOT_SUPPORTED;
+}
+
+inline EE clip_checkpara_mali(
+    GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output)
+{
+    if (handle == nullptr || nullptr == input || nullptr == output) {
+        return NULL_POINTER;
+    }
+    if (inputDesc.df != outputDesc.df || inputDesc.df != DF_NCHW) {
+        return NOT_SUPPORTED;
+    }
+    if (input->desc.memFormat != output->desc.memFormat || input->desc.memFormat != DF_NCWHC4) {
+        return NOT_SUPPORTED;
+    }
+    return SUCCESS;
+}
+
+EE clip_mali(GCLHandle_t handle,
+    TensorDesc inputDesc,
+    GCLMem_t input,
+    ClipParamSpec p,
+    TensorDesc outputDesc,
+    GCLMem_t output)
+{
+    EE ret = SUCCESS;
+    CHECK_STATUS(clip_checkpara_mali(handle, inputDesc, input, outputDesc, output));
+    switch (inputDesc.dt) {
+        case DT_F16: {
+            ret = clip_mali_fp16(handle, inputDesc, input, p, outputDesc, output);
+            break;
+        }
+        case DT_I8: {
+            ret = NOT_SUPPORTED;
+            break;
+        }
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/gpu/mali/concat.cpp b/compute/tensor/src/gpu/mali/concat.cpp
new file mode 100644
index 00000000..5e5fff4d
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/concat.cpp
@@ -0,0 +1,167 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
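+
+// Note on this file: the bolt TensorDesc stores dims[] innermost-first, so the
+// user-facing concat axis is normalized twice below: first wrapped into
+// [0, nDims) via (axis + nDims) % nDims, then flipped with nDims - 1 - axis to
+// index dims[]. For example, with nDims = 4 and axis = 1 (the channel axis of
+// NCHW), concatDim becomes 2, i.e. dims[2] holds C.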
+
+#include "sys.h"
+#include "types.h"
+#include "tensor_desc.h"
+#include "error.h"
+#include "gpu/mali/tensor_computing_mali.h"
+#include "gpu/mali/fp16/concat_mali_fp16.h"
+
+EE concat_infer_output_size_mali(std::vector<TensorDesc> inputDesc,
+    ConcatParamSpec p,
+    TensorDesc *outputDesc,
+    GCLMemDesc_t gclmemInputDesc,
+    GCLMemDesc_t gclmemOutputDesc)
+{
+    /* tensorDesc records the CPU original data format info */
+    /* gclmemDesc records the GPU transformed data format info */
+    if (outputDesc) {
+        *outputDesc = inputDesc[0];
+    }
+    U32 sumDimSize = 0;
+    I32 dim = inputDesc[0].nDims;
+    int concatDim = p.axis;
+    concatDim = (concatDim + dim) % dim;
+    concatDim = dim - 1 - concatDim;
+    for (auto p : inputDesc) {
+        if (inputDesc[0].df != p.df) {
+            CHECK_STATUS(NOT_SUPPORTED);
+        }
+    }
+    if (inputDesc[0].df == DF_MKT) {
+        concatDim = 1 - concatDim;
+    }
+    for (U32 i = 0; i < inputDesc.size(); i++) {
+        sumDimSize += inputDesc[i].dims[concatDim];
+    }
+
+    if (outputDesc) {
+        *outputDesc = inputDesc[0];
+        (*outputDesc).dims[concatDim] = sumDimSize;
+    }
+
+    if (gclmemInputDesc && gclmemOutputDesc) {
+        DataType idt;
+        DataFormat idf;
+        U32 iw, ih, ic, in;
+        for (U32 i = 0; i < inputDesc.size(); i++) {
+            tensorSelectGet(inputDesc[i], &idt, &idf, &in, &ic, &ih, &iw);
+            CHECK_STATUS(infer_gclmem_desc_ncwhc4(
+                iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, &gclmemInputDesc[i], gclmemOutputDesc));
+        }
+        U32 s0 = gclmemOutputDesc->stride[0];
+        U32 s1 = gclmemOutputDesc->stride[1];
+        U32 s2 = gclmemOutputDesc->stride[2];
+        if (inputDesc[0].df == DF_NCHW) {
+            if (concatDim == 0) {
+                s1 = sumDimSize;
+            } else if (concatDim == 1) {
+                s0 = sumDimSize;
+            } else if (concatDim == 2) {
+                s2 = (sumDimSize + 3) / 4;
+            } else {
+                CHECK_STATUS(NOT_SUPPORTED);
+            }
+        }
+        if (inputDesc[0].df == DF_MKT || inputDesc[0].df == DF_MTK) {
+            if (concatDim == 0) {
+                s2 = (sumDimSize + 3) / 4;
+            } else if (concatDim == 1) {
+                s0 = sumDimSize;
+            } else {
+                CHECK_STATUS(NOT_SUPPORTED);
+            }
+        }
+        gclmemOutputDesc->stride[0] = s0;
+        gclmemOutputDesc->stride[1] = s1;
+        gclmemOutputDesc->stride[2] = s2;
+        gclmemOutputDesc->num = s0 * s1 * s2 * 4;
+        gclmemOutputDesc->byteSize = s0 * s1 * s2 * 4 * bytesOf(idt);
+    }
+    return SUCCESS;
+}
+
+inline EE concat_checkpara_mali(GCLHandle_t handle,
+    std::vector<TensorDesc> inputDesc,
+    std::vector<void *> input,
+    ConcatParamSpec p,
+    TensorDesc outputDesc,
+    GCLMem_t output)
+{
+    if (handle == nullptr || nullptr == output) {
+        return NULL_POINTER;
+    }
+    if (input.size() < 1) {
+        return NOT_MATCH;
+    }
+    for (auto it : inputDesc) {
+        if (it.df != outputDesc.df) {
+            return NOT_MATCH;
+        }
+    }
+    if (outputDesc.df != DF_NCHW && outputDesc.df != DF_MKT && outputDesc.df != DF_MTK) {
+        return NOT_SUPPORTED;
+    }
+    for (auto it : input) {
+        GCLMem_t ptr = (GCLMem_t)it;
+        if (ptr == nullptr) {
+            return NULL_POINTER;
+        }
+        if (ptr->desc.memFormat != output->desc.memFormat) {
+            return NOT_SUPPORTED;
+        }
+    }
+    return SUCCESS;
+}
+
+EE concat_infer_forward_tmp_bytes_mali(std::vector<TensorDesc> inputDesc, U32 *bytes)
+{
+    EE ret = SUCCESS;
+    switch (inputDesc[0].dt) {
+        case DT_F16: {
+            ret = concat_infer_forward_tmp_bytes_mali_fp16(inputDesc, bytes);
+            break;
+        }
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
+
+EE concat_mali(GCLHandle_t handle,
+    std::vector<TensorDesc> inputDesc,
+    std::vector<void *> input,
+    GCLMem_t inputScale,
+    ConcatParamSpec p,
+    GCLMem_t tmpbuf,
+    TensorDesc outputDesc,
+    GCLMem_t output,
+    GCLMem_t outputScale)
+{
+    UNUSED(inputScale);
+    UNUSED(outputScale);
+    EE ret = SUCCESS;
+    CHECK_STATUS(concat_checkpara_mali(handle, inputDesc, input, p, outputDesc, output));
+    switch (inputDesc[0].dt) {
+        case DT_F16: {
+            ret = concat_mali_fp16(handle, inputDesc, input, outputDesc, output, tmpbuf, p.axis);
+            break;
+        }
+        default:
+            ret = NOT_SUPPORTED;
+            break;
+    }
+    return ret;
+}
diff --git a/compute/tensor/src/gpu/mali/convolution.cpp b/compute/tensor/src/gpu/mali/convolution.cpp
new file mode 100644
index 00000000..c9510349
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/convolution.cpp
@@ -0,0 +1,606 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <float.h>
+#include "sys.h"
+#include "types.h"
+#include "tensor_desc.h"
+#include "error.h"
+#include "gpu/mali/tensor_computing_mali.h"
+#include "gpu/mali/fp16/convolution_mali_fp16.h"
+
+inline void convolution_produce_algos_paras(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    ConvolutionParamSpec convParamSpec,
+    DataFormat inputGclmemFormat,
+    std::vector<ConvolutionForwardAlgorithm> *convolutionAlgorithms,
+    std::vector<U32> *algoNumIndex,
+    std::vector<U32> *vecW,
+    std::vector<U32> *vecC,
+    std::vector<U32> *vecK)
+{
+    DataFormat idf;
+    U32 ic, ih, iw, fn, fh, fw, sh, sw;
+    tensorSelectGet(inputDesc, NULL, &idf, NULL, &ic, &ih, &iw);
+    tensorSelectGet(filterDesc, NULL, NULL, &fn, NULL, &fh, &fw);
+    sh = convParamSpec.stride_h;
+    sw = convParamSpec.stride_w;
+    U32 configInfo[3][128];
+    U32 configNums[2];
+    ConvolutionForwardAlgorithm algo[2];
+    U32 algoNum = 1;
+    algo[0] = CONVOLUTION_ALGORITHM_DIRECT;
+    if (inputGclmemFormat == DF_NCHW && (ih != 1 || iw != 1 || fw != 1 || fh != 1)) {
+        configInfo[0][0] = (8 * sw - (fw - 1)) / sw;
+        configInfo[1][0] = 1;
+        configInfo[2][0] = 4;
+        configNums[0] = 1;
+    } else if (fn == 1 && sw == 1 && (fw == fh) && (fw == 1 || fw == 3 || fw == 5 || fw == 7)) {
+        configInfo[0][0] = (fw == 7) ?
6 : 8; + configInfo[1][0] = 4; + configInfo[2][0] = 1; + configNums[0] = 1; + } else { + if (fw == 3 && fh == 3 && sw == 1 && sh == 1) { + algo[1] = CONVOLUTION_ALGORITHM_WINOGRAD; + algoNum = 2; + } + U32 configNum = 0; + for (U32 ii = 0; ii < algoNum; ii++) { + if (algo[ii] == CONVOLUTION_ALGORITHM_DIRECT) { + if (ih == 1 && iw == 1 && fh == 1 && fw == 1) { + U32 j = 8; + for (U32 i = 0; i < 3; i++) { + configInfo[0][configNum] = 1; + configInfo[1][configNum] = 1 << (2 + i); + configInfo[2][configNum] = 0; + configNum++; + if (ic % j != 0) { + break; + } + j = j << 1; + } + } else { + U32 k = 4; + U32 nj = 8; + for (U32 i = 0; i < 2; i++) { + for (U32 j = 0; j < nj; j++) { + configInfo[0][configNum] = j + 1; + configInfo[1][configNum] = 4; + configInfo[2][configNum] = k; + configNum++; + } + k = k << 1; + if (fn % k != 0) { + break; + } + nj = 4; + } + if (fw == 1 && fh == 1 && sw == 1 && sh == 1) { + U32 k = 4; + U32 nj = 2; + for (U32 i = 0; i < 3; i++) { + U32 w = 2; + if (i == 2) { + nj = 1; + } + for (U32 j = 0; j < nj; j++) { + if (ih % w != 0) { + continue; + } + configInfo[0][configNum] = w << 8; + configInfo[1][configNum] = 4; + configInfo[2][configNum] = k; + configNum += 1; + w = w << 1; + } + k = k << 1; + if (fn % k != 0) { + break; + } + } + if (fn % 16 == 0) { + for (U32 i = 0; i < 3; i++) { + configInfo[0][configNum] = i + 1; + configInfo[1][configNum] = 4; + configInfo[2][configNum] = 16; + configNum++; + } + } + } + } + } + + if (algo[ii] == CONVOLUTION_ALGORITHM_WINOGRAD) { + for (U32 i = 1; i <= 8; i++) { + for (U32 j = 4; j <= 8; j += 4) { + if (i * j <= 2) { + continue; + } + configInfo[0][configNum] = i; + configInfo[1][configNum] = 1; + configInfo[2][configNum] = j; + configNum++; + } + } + } + configNums[ii] = configNum; + } + } + + for (U32 i = 0; i < algoNum; i++) { + (*convolutionAlgorithms).push_back(algo[i]); + (*algoNumIndex).push_back(configNums[i]); + U32 be = (i == 0) ? 
0 : configNums[i - 1];
+        U32 end = configNums[i];
+        for (U32 j = be; j < end; j++) {
+            if (vecW) {
+                (*vecW).push_back(configInfo[0][j]);
+            }
+            if (vecC) {
+                (*vecC).push_back(configInfo[1][j]);
+            }
+            if (vecK) {
+                (*vecK).push_back(configInfo[2][j]);
+            }
+        }
+    }
+}
+
+EE convolution_infer_output_size_mali(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc *outputDesc,
+    GCLMemDesc_t gclmemInputDesc,
+    GCLMemDesc_t gclmemOutputDesc)
+{
+    if (outputDesc == nullptr || gclmemInputDesc == nullptr || gclmemOutputDesc == nullptr) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    DataType idt, fdt;
+    DataFormat idf, fdf;
+    U32 iw, ih, ic, in, it;
+    U32 fw, fh, fc, fn, ft;
+    U32 ow, oh, ot;
+    U32 sw, sh, st, dw, dh, fdw, fdh;
+    U32 pl, pr, pt, pb, pt_b, pt_a;
+    tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw, &it);
+    tensorSelectGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw, &ft);
+    pl = convParamSpec.padding_left;
+    pr = convParamSpec.padding_right;
+    pt = convParamSpec.padding_top;
+    pb = convParamSpec.padding_bottom;
+    sw = convParamSpec.stride_w;
+    sh = convParamSpec.stride_h;
+    dw = convParamSpec.dilatedRate_w;
+    dh = convParamSpec.dilatedRate_h;
+    pt_b = convParamSpec.padding_before;
+    pt_a = convParamSpec.padding_after;
+    st = convParamSpec.stride_t;
+
+    if (fw > 7 || fw == 6) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    if (in != 1) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    if (fw < 1 || fh < 1) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    if (dw != 1 || dh != 1) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    if (sw != 1 && sw != 2) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    if (sh != 1 && sh != 2) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    fdw = (fw - 1) * dw + 1;
+    fdh = (fh - 1) * dh + 1;
+    ow = (iw + pl + pr - fdw) / sw + 1;
+    oh = (ih + pt + pb - fdh) / sh + 1;
+    ot = (inputDesc.df == DF_NCTHW) ? (it + pt_b + pt_a - ft) / st + 1 : 1;
+
+    U32 iw_align, ih_align, item_w, ext_w, ext_h;
+    bool need_pad = false;
+
+    if (inputDesc.df == DF_NCTHW) {
+        *outputDesc = tensor5df(idt, idf, in, fn, ot, oh, ow);
+    } else {
+        *outputDesc = tensor4df(idt, idf, in, fn, oh, ow);
+    }
+    ext_w = (fw / 2 < pl) ? pl : fw / 2;  // if fw / 2 < pl, use pl as offset
+    ext_h = pt;
+
+    std::vector<ConvolutionForwardAlgorithm> convolutionAlgorithms;
+    std::vector<U32> algoNumIndex;
+    std::vector<U32> vecW;
+    DataFormat inputGclmemFormat = DF_NCWHC4;
+    if (gclmemInputDesc->byteSize == 0) {
+        inputGclmemFormat = DF_NCHW;
+    }
+    convolution_produce_algos_paras(inputDesc, filterDesc, convParamSpec, inputGclmemFormat,
+        &convolutionAlgorithms, &algoNumIndex, &vecW, NULL, NULL);
+    iw_align = ow;
+    for (auto p : convolutionAlgorithms) {
+        U32 tmp_align = 0;
+        if (p == CONVOLUTION_ALGORITHM_WINOGRAD) {
+            tmp_align = ALIGN(ow, 16);
+        } else {
+            for (U32 i = 0; i < algoNumIndex[0]; i++) {
+                item_w = vecW[i];
+                item_w = ((item_w >> 8) > 0) ? 1 : item_w;
+                U32 j = ALIGN(ow, item_w);
+                tmp_align = (tmp_align < j) ? j : tmp_align;
+            }
+        }
+        iw_align = (iw_align < tmp_align) ? tmp_align : iw_align;
+    }
+    iw_align = iw_align * sw;
+    ih_align = ih + pt + pb;
+    ih_align = ih_align - ext_h * 2;
+
+    if (pl < ext_w) {  // if fw / 2 > pl, use pl as offset, and pad (ext_w - pl) * 2 in the end
+        iw_align = iw_align + 2 * (ext_w - pl);
+        ext_w = pl;
+    }
+    if (iw_align != iw || ih_align != ih) {
+        need_pad = true;
+    }
+    if (ext_w != 0 || ext_h != 0) {
+        need_pad = true;
+    }
+
+    if (fw == 1 && fh == 1 && ft == 1 && iw == 1 && ih == 1 && it == 1) {
+        CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw_align, ih_align, ic, ext_w, ext_h, ow, oh, fn, idt,
+            idt, gclmemInputDesc, gclmemOutputDesc, need_pad));
+        return SUCCESS;
+    }
+
+    if (inputGclmemFormat == DF_NCHW) {
+        if (fw == fh && (fw == 1 || fw == 3 || fw == 5 || fw == 7)) {
+            CHECK_STATUS(infer_gclmem_desc_nchw(iw_align, ih_align, ic * it, ext_w, ext_h, 0, 0, 0,
+                idt, idt, gclmemInputDesc, NULL, need_pad));
+        } else {
+            ic = ALIGN(ic, 4);
+            CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw_align, ih_align, ic * it, ext_w, ext_h, 0, 0,
+                0, idt, idt, gclmemInputDesc, NULL, need_pad));
+        }
+        fn = ALIGN(fn, 4);
+        CHECK_STATUS(infer_gclmem_desc_ncwhc4(
+            0, 0, 0, 0, 0, ow, oh, fn * ot, idt, idt, NULL, gclmemOutputDesc));
+        return SUCCESS;
+    }
+
+    ic = ALIGN(ic, 4);
+    CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw_align, ih_align, ic * it, ext_w, ext_h, 0, 0, 0, idt,
+        idt, gclmemInputDesc, NULL, need_pad));
+    if (fn == 1 && sw == 1 && (fw == fh) && ft == 1 && (fw == 1 || fw == 3 || fw == 5 || fw == 7)) {
+        CHECK_STATUS(infer_gclmem_desc_nchw(
+            0, 0, 0, 0, 0, ow, oh, fn * ot, idt, idt, NULL, gclmemOutputDesc));
+    } else {
+        fn = ALIGN(fn, 4);
+        CHECK_STATUS(infer_gclmem_desc_ncwhc4(
+            0, 0, 0, 0, 0, ow, oh, fn * ot, idt, idt, NULL, gclmemOutputDesc));
+    }
+    return SUCCESS;
+}
+
+EE convolution_infer_forward_algorithm_mali(GCLHandle_t handle,
+    TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc outputDesc,
+    GCLMemDesc inputMemDesc,
+    GCLMemDesc outputMemDesc,
+    ConvolutionPolicy policy,
+    ActivationMode activationMode,
+    ForwardRunInfoMali_t forwardRunInfo)
+{
+    if (forwardRunInfo == nullptr) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm);
+    if (algorithm != CONVOLUTION_ALGORITHM_NULL) {
+        return SUCCESS;
+    }
+    if (policy == CONVOLUTION_LIBRARY_SEARCH) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    if (policy == CONVOLUTION_FASTEST) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    DataType dt;
+    U32 ih, iw, fn, fh, fw;
+    tensorSelectGet(inputDesc, NULL, NULL, NULL, NULL, &ih, &iw);
+    tensorSelectGet(filterDesc, &dt, NULL, &fn, NULL, &fh, &fw);
+
+    std::vector<ConvolutionForwardAlgorithm> convolutionAlgorithms;
+    std::vector<U32> algoNumIndex;
+    std::vector<U32> vecW;
+    std::vector<U32> vecC;
+    std::vector<U32> vecK;
+    DataFormat inputGclmemFormat = inputMemDesc.memFormat;
+    convolution_produce_algos_paras(inputDesc, filterDesc, convParamSpec, inputGclmemFormat,
+        &convolutionAlgorithms, &algoNumIndex, &vecW, &vecC, &vecK);
+    if (vecW.size() == 1) {
+        forwardRunInfo->best_w[0] = vecW[0];
+        forwardRunInfo->best_k[0] = vecK[0];
+        forwardRunInfo->best_c[0] = vecC[0];
+        forwardRunInfo->algorithm = convolutionAlgorithms[0];
+        return SUCCESS;
+    }
+
+    if (policy == CONVOLUTION_TUNNING) {
+        CHECK_STATUS(gcl_clean_kernelVec(handle));
+        CHECK_STATUS(gcl_enable_queue_profiling(handle));
+        GCLMem_t input = gcl_create_gclmem();
+        GCLMem_t filter = gcl_create_gclmem();
+        GCLMem_t output = gcl_create_gclmem();
+        GCLMem_t bias = gcl_create_gclmem();
+        GCLMem_t tmpbuf = gcl_create_gclmem();
+        U32 maxFilterSize = 0;
+        U32 maxBytes = 0;
+        U32 algosNum = 0;
+        std::vector<ForwardRunInfoMali> runInfos;
+        U32 stride[3] = {0, 0, 0};
+        U32 offset[3] = {0, 0, 0};
+        std::vector<GCLMemDesc> filterMemDescs;
+        for (U32 i = 0; i < algoNumIndex.size(); i++) {
+            U32 bytes = 0;
+            ForwardRunInfoMali runInfo;
+            U32 be = (i == 0) ? 0 : algoNumIndex[i - 1];
+            U32 end = algoNumIndex[i];
+            runInfo.algorithm = convolutionAlgorithms[i];
+            for (U32 j = be; j < end; j++) {
+                GCLMemDesc filterMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+                runInfo.best_w[0] = vecW[j];
+                runInfo.best_c[0] = vecC[j];
+                runInfo.best_k[0] = vecK[j];
+                if (convolution_transform_filter_bytes_mali(
+                        filterDesc, &runInfo, &filterMemDesc, &bytes) != SUCCESS) {
+                    continue;
+                }
+                maxBytes = (maxBytes < bytes) ? bytes : maxBytes;
+                if (convolution_infer_forward_tmp_bytes_mali(inputDesc, filterDesc, outputDesc,
+                        convParamSpec, &runInfo, &bytes) != SUCCESS) {
+                    continue;
+                }
+                maxBytes = (maxBytes < bytes) ? bytes : maxBytes;
+                maxFilterSize = (maxFilterSize < filterMemDesc.byteSize) ? filterMemDesc.byteSize
+                                                                         : maxFilterSize;
+                filterMemDescs.push_back(filterMemDesc);
+                runInfos.push_back(runInfo);
+            }
+        }
+
+        if (ih == 1 && iw == 1 && fh == 1 && fw == 1) {
+            U32 stride[3] = {fn, 1, 1};
+            U32 offset[3] = {0, 0, 0};
+            CHECK_STATUS(gclmem_set_desc_padding(
+                &bias->desc, stride, offset, dt, DF_NHWC, GCL_MEM_BUF, CL_MEM_READ_WRITE));
+        } else {
+            U32 stride[3] = {(fn + 3) / 4, 1, 1};
+            U32 offset[3] = {0, 0, 0};
+            CHECK_STATUS(gclmem_set_desc_padding(
+                &bias->desc, stride, offset, dt, DF_NHWC, GCL_MEM_IMG_1D, CL_MEM_READ_WRITE));
+        }
+        algosNum = runInfos.size();
+        if (algosNum == 0) {
+            CHECK_STATUS(NOT_SUPPORTED);
+        }
+        TensorDesc scaleDesc = tensor1d(DT_F32, 0);
+        TensorDesc biasDesc = tensor1d(dt, fn);
+        filterMemDescs[0].byteSize = maxFilterSize;
+        outputMemDesc.need_pad = false;
+        input->desc = inputMemDesc;
+        output->desc = outputMemDesc;
+        filter->desc = filterMemDescs[0];
+        tmpbuf->desc.byteSize = maxBytes;
+        gcl_create_memory(handle, input);
+        gcl_create_memory(handle, output);
+        gcl_create_memory(handle, filter);
+        gcl_create_memory(handle, bias);
+        if (maxBytes) {
+            gcl_create_memory(handle, tmpbuf);
+        }
+
+        double minTimeDirect = DBL_MAX;
+        double minTimeWinograd = DBL_MAX;
+        double minTime = DBL_MAX;
+        double winogradPicTranTime = DBL_MAX;
+        double winogradOutTranTime = DBL_MAX;
+        U32 runKernelBe = 0;
+        U32 runKernelEnd = 0;
+        ForwardRunInfoMali bestRunInfo;
+        ForwardRunInfoMali bestRunInfoDirect;
+        ForwardRunInfoMali bestRunInfoWinograd;
+        for (U32 i = 0; i < algosNum; i++) {
+            filter->desc = filterMemDescs[i];
+            if (convolution_mali(handle, inputDesc, input, filterDesc, filter, convParamSpec,
+                    &runInfos[i], scaleDesc, NULL, biasDesc, bias, maxBytes, tmpbuf, outputDesc,
+                    output, activationMode) == SUCCESS) {
+                if (runInfos[i].algorithm == (I32)CONVOLUTION_ALGORITHM_DIRECT) {
+                    runKernelEnd = handle->kernelVec->size();
+                    gcl_run_kernelVec_timing(handle, runKernelBe, runKernelEnd);
+                    runKernelBe = runKernelEnd;
+                    if (minTimeDirect > handle->t_execute) {
+                        minTimeDirect = handle->t_execute;
+                        bestRunInfoDirect = runInfos[i];
+                    }
+                }
+
+                if (runInfos[i].algorithm == (I32)CONVOLUTION_ALGORITHM_WINOGRAD) {
+                    if (winogradPicTranTime == DBL_MAX) {
+                        runKernelEnd = runKernelBe + 2;
+                        gcl_run_kernelVec_timing(handle, runKernelBe, runKernelEnd);
+                        winogradPicTranTime = handle->t_execute;
+                    }
+                    runKernelBe += 2;
+                    runKernelEnd = runKernelBe + 1;
+                    gcl_run_kernelVec_timing(handle, runKernelBe, runKernelEnd);
+                    if (minTimeWinograd > handle->t_execute) {
minTimeWinograd = handle->t_execute; + bestRunInfoWinograd = runInfos[i]; + } + runKernelBe += 36; + if (winogradOutTranTime == DBL_MAX) { + runKernelEnd = runKernelBe + 1; + gcl_run_kernelVec_timing(handle, runKernelBe, runKernelEnd); + winogradOutTranTime = handle->t_execute; + } + runKernelBe = handle->kernelVec->size(); + } + } + } + + if (minTimeWinograd != DBL_MAX) { + minTimeWinograd = 36 * minTimeWinograd + winogradPicTranTime + winogradOutTranTime; + } + minTime = minTimeDirect; + bestRunInfo = bestRunInfoDirect; + if (minTimeWinograd < minTime) { + minTime = minTimeWinograd; + bestRunInfo = bestRunInfoWinograd; + } + if (minTime == DBL_MAX) { + CHECK_STATUS(NOT_SUPPORTED); + } + *forwardRunInfo = bestRunInfo; + CHECK_STATUS(gcl_finish(handle)); + gcl_destroy_gclmem(input); + gcl_destroy_gclmem(filter); + gcl_destroy_gclmem(output); + gcl_destroy_gclmem(bias); + gcl_destroy_gclmem(tmpbuf); + convolutionAlgorithms.clear(); + runInfos.clear(); + filterMemDescs.clear(); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + CHECK_STATUS(gcl_off_queue_profiling(handle)); + return SUCCESS; + } + return NOT_SUPPORTED; +} + +EE convolution_transform_filter_bytes_mali(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { + case DT_F16: { + ret = convolution_transform_filter_bytes_mali_fp16( + filterDesc, forwardRunInfo, gclmemFilterDesc, bytes); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE convolution_transform_filter_mali(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + GCLMem_t tmp, + TensorDesc *fltmemDesc, + GCLMem_t fltmem) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { + case DT_F16: { + ret = convolution_transform_filter_mali_fp16( + handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem, tmp); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE convolution_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = convolution_infer_forward_tmp_bytes_mali_fp16( + inputDesc, filterDesc, outputDesc, convParamSpec, forwardRunInfo, bytes); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} +EE convolution_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc scaleDesc, + const GCLMem_t scale, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + UNUSED(scaleDesc); + UNUSED(scale); + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = convolution_mali_fp16(handle, inputDesc, input, filterDesc, filter, convParamSpec, + forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, + activationMode); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/copy.cpp 
b/compute/tensor/src/gpu/mali/copy.cpp
new file mode 100644
index 00000000..f6ae5097
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/copy.cpp
@@ -0,0 +1,177 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "sys.h"
+#include "types.h"
+#include "tensor_desc.h"
+#include "error.h"
+#include "gpu/mali/tensor_computing_mali.h"
+
+inline void check_tensordesc_dims(
+    U32 sn, U32 sc, U32 sh, U32 sw, U32 dn, U32 dc, U32 dh, U32 dw, U32 srcOffset, U32 dstOffset, U32 length)
+{
+    U32 srcElementNum = sw * sh * sc * sn;
+    U32 dstElementNum = dw * dh * dc * dn;
+    if (sn > 1 || dn > 1) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    if (length + srcOffset > srcElementNum) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+    if (length + dstOffset > dstElementNum) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+}
+
+inline EE copy_checkpara_mali(GCLHandle_t handle,
+    std::vector<TensorDesc> inputDesc,
+    std::vector<void *> input,
+    U32 srcOffset,
+    U32 dstOffset,
+    U32 length)
+{
+    if (handle == nullptr) {
+        return NULL_POINTER;
+    }
+    if (inputDesc.size() != 2) {
+        return NOT_SUPPORTED;
+    }
+    if (input.size() != 2 && input.size() != 4) {
+        return NOT_SUPPORTED;
+    }
+    if (input[0] == nullptr || input[1] == nullptr) {
+        return NOT_SUPPORTED;
+    }
+    GCLMem_t srcMem = (GCLMem_t)input[0];
+    GCLMem_t dstMem = (GCLMem_t)input[1];
+    U32 sn, sc, sh, sw, sw_off, sh_off;
+    U32 dn, dc, dh, dw, dw_off, dh_off;
+    sn = 1;
+    dn = 1;
+    get_gclmem_dim(srcMem->desc, &sw, &sh, &sc, &sw_off, &sh_off);
+    get_gclmem_dim(dstMem->desc, &dw, &dh, &dc, &dw_off, &dh_off);
+    if (sw_off != 0 || sh_off != 0 || dw_off != 0 || dh_off != 0) {
+        return NOT_SUPPORTED;
+    }
+    check_tensordesc_dims(sn, sc, sh, sw, dn, dc, dh, dw, srcOffset, dstOffset, length);
+    return SUCCESS;
+}
+
+inline EE copy_core_mali_fp16(GCLHandle_t handle,
+    std::vector<TensorDesc> inputDesc,
+    std::vector<void *> input,
+    U32 srcOffset,
+    U32 dstOffset,
+    U32 srcStride,
+    U32 dstStride,
+    U32 length)
+{
+    DataType sdt = inputDesc[0].dt;
+    DataType ddt = inputDesc[1].dt;
+    if (sdt == DT_U32 && ddt == DT_I32) {
+        sdt = DT_I32;
+    }
+    cl_mem srcbuf = ((GCLMem_t)(input[0]))->mem;
+    cl_mem dstbuf = ((GCLMem_t)(input[1]))->mem;
+    cl_mem srcBlockIndex = NULL;
+    cl_mem dstBlockIndex = NULL;
+    bool useBlockIndex = false;
+    if (input.size() == 4) {
+        srcBlockIndex = ((GCLMem_t)(input[2]))->mem;
+        dstBlockIndex = ((GCLMem_t)(input[3]))->mem;
+        useBlockIndex = true;
+    }
+    U32 gs = (length + 3) / 4;
+    U32 ls = 0;
+    U32 dim = 1;
+    Kernel kernel;
+    char dataType[16];
+    if (sdt == DT_I32) {
+        strcpy(dataType, "i32");
+    }
+    if (sdt == DT_U32) {
+        strcpy(dataType, "u32");
+    }
+    if (sdt == DT_F16) {
+        strcpy(dataType, "f16");
+    }
+    char kernelName[128];
+    if (!useBlockIndex) {
+        sprintf(kernelName, "copy_%s", dataType);
+    } else {
+        sprintf(kernelName, "copy_with_block_index_%s", dataType);
+    }
+
+    CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel));
+    if (!useBlockIndex) {
+        CHECK_STATUS(
+            gcl_set_kernelArgs(kernel, length, length, srcOffset, dstOffset, gs, srcbuf, dstbuf));
+    } else {
+        CHECK_STATUS(gcl_set_kernelArgs(kernel, length, length, srcOffset, dstOffset, srcStride,
+            dstStride, gs, srcBlockIndex, dstBlockIndex, srcbuf, dstbuf));
+    }
+    gcl_set_kernelVec(handle, kernel, dim, &gs, &ls, kernelName);
+#ifdef _DEBUG
+    CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs, &ls, kernelName));
+    CHECK_STATUS(gcl_print_memory(handle, (GCLMem_t)input[0], "copy_srcbuf"));
+    CHECK_STATUS(gcl_print_memory(handle, (GCLMem_t)input[1], "copy_dstbuf"));
+#endif
+    return SUCCESS;
+}
+
+EE copy_infer_output_size_mali(std::vector<TensorDesc> inputDesc, GCLMemDesc_t gclmemInputDesc)
+{
+    if (inputDesc.size() != 2) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    DataType sdt, ddt;
+    U32 sw, sh, sc, sn;
+    U32 dw, dh, dc, dn;
+    TensorDesc srcDesc = inputDesc[0];
+    TensorDesc dstDesc = inputDesc[1];
+    tensorSelectGet(srcDesc, &sdt, NULL, &sn, &sc, &sh, &sw);
+    tensorSelectGet(dstDesc, &ddt, NULL, &dn, &dc, &dh, &dw);
+    if (sdt == DT_U32 && ddt == DT_I32) {
+        sdt = DT_I32;
+    }
+    if (sdt != DT_F16 && sdt != DT_I32 && sdt != DT_U32) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    if (sdt != ddt) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+    if (gclmemInputDesc) {
+        CHECK_STATUS(
+            infer_gclmem_desc_nchw(sw, sh, sc, 0, 0, 0, 0, 0, sdt, sdt, &gclmemInputDesc[0], NULL));
+        CHECK_STATUS(
+            infer_gclmem_desc_nchw(dw, dh, dc, 0, 0, 0, 0, 0, ddt, ddt, &gclmemInputDesc[1], NULL));
+    }
+    return SUCCESS;
+}
+
+EE copy_mali(GCLHandle_t handle,
+    std::vector<TensorDesc> inputDesc,
+    std::vector<void *> input,
+    U32 srcOffset,
+    U32 dstOffset,
+    U32 srcStride,
+    U32 dstStride,
+    U32 length)
+{
+    EE ret = SUCCESS;
+    CHECK_STATUS(copy_checkpara_mali(handle, inputDesc, input, srcOffset, dstOffset, length));
+    CHECK_STATUS(fill_output_zero(handle, (GCLMem_t)input[1], inputDesc[1]));
+    CHECK_STATUS(copy_core_mali_fp16(
+        handle, inputDesc, input, srcOffset, dstOffset, srcStride, dstStride, length));
+    return ret;
+}
diff --git a/compute/tensor/src/gpu/mali/deconvolution.cpp b/compute/tensor/src/gpu/mali/deconvolution.cpp
new file mode 100644
index 00000000..66abf3b8
--- /dev/null
+++ b/compute/tensor/src/gpu/mali/deconvolution.cpp
@@ -0,0 +1,433 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <float.h>
+#include "sys.h"
+#include "types.h"
+#include "tensor_desc.h"
+#include "error.h"
+#include "gpu/mali/tensor_computing_mali.h"
+#include "gpu/mali/fp16/deconvolution_mali_fp16.h"
+
+inline void deconvolution_produce_algos_paras(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    ConvolutionParamSpec convParamSpec,
+    std::vector<ConvolutionForwardAlgorithm> *deconvAlgorithms,
+    std::vector<U32> *algoNumIndex,
+    std::vector<U32> *vecW,
+    std::vector<U32> *vecC,
+    std::vector<U32> *vecK)
+{
+    DataFormat idf;
+    U32 ic, ih, iw, fn, fc, fh, fw, sh, sw;
+    tensorSelectGet(inputDesc, NULL, &idf, NULL, &ic, &ih, &iw);
+    tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw);
+    sh = convParamSpec.stride_h;
+    sw = convParamSpec.stride_w;
+    U32 configInfo[3][128];
+    U32 configNums[2];
+    ConvolutionForwardAlgorithm algo[2];
+    U32 algoNum = 1;
+    algo[0] = CONVOLUTION_ALGORITHM_DIRECT;
+    if (fw != 2 || fh != 2 || sw != 2 || sh != 2) {
+        configInfo[0][0] = 1;
+        configInfo[1][0] = 4;
+        configInfo[2][0] = 4;
+        configNums[0] = 1;
+    } else {
+        algo[0] = CONVOLUTION_ALGORITHM_GEMM;
+        U32 configNum = 0;
+        U32 c = 8;
+        U32 ni = 4;
+        for (U32 ii = 0; ii < 2; ii++) {
+            for (U32 i = 0; i < ni; i++) {
+                configInfo[0][configNum] = i + 1;
+                configInfo[1][configNum] = c;
+                configInfo[2][configNum] = 4;
+                configNum++;
+            }
+            c = c << 1;
+            ni = 3;
+        }
+
+        ni = 2;
+        U32 w = 2;
+        for (U32 ii = 0; ii < 2; ii++) {
+            c = 8;
+            if (ih % w == 0) {
+                for (U32 i = 0; i < ni; i++) {
+                    configInfo[0][configNum] = w << 8;
+                    configInfo[1][configNum] = c;
+                    configInfo[2][configNum] = 4;
+                    configNum++;
+                    c = c << 1;
+                }
+            }
+            w = w << 1;
+            ni = 1;
+        }
+        configNums[0] = configNum;
+    }
+
+    for (U32 i = 0; i < algoNum; i++) {
+        (*deconvAlgorithms).push_back(algo[i]);
+        (*algoNumIndex).push_back(configNums[i]);
+        U32 be = (i == 0) ? 0 : configNums[i - 1];
+        U32 end = configNums[i];
+        for (U32 j = be; j < end; j++) {
+            if (vecW) {
+                (*vecW).push_back(configInfo[0][j]);
+            }
+            if (vecC) {
+                (*vecC).push_back(configInfo[1][j]);
+            }
+            if (vecK) {
+                (*vecK).push_back(configInfo[2][j]);
+            }
+        }
+    }
+}
+EE deconvolution_infer_output_size_mali(TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc *outputDesc,
+    GCLMemDesc_t gclmemInputDesc,
+    GCLMemDesc_t gclmemOutputDesc)
+{
+    DataType idt, fdt;
+    DataFormat idf, fdf;
+    U32 iw, ih, ic, in;
+    U32 fw, fh, fc, fn;
+    U32 ow, oh;
+    U32 sw, sh, dw, dh;
+    U32 pt, pb, pl, pr;
+    tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw);
+    tensorSelectGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw);
+    sw = convParamSpec.stride_w;
+    sh = convParamSpec.stride_h;
+    dw = convParamSpec.dilatedRate_w;
+    dh = convParamSpec.dilatedRate_h;
+    if (in != 1) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    if (fw < 1 || fh < 1) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+    if (dw != 1 || dh != 1) {
+        CHECK_STATUS(NOT_SUPPORTED);
+    }
+
+    sw = convParamSpec.stride_h;
+    sh = convParamSpec.stride_w;
+    pt = convParamSpec.padding_top;
+    pb = convParamSpec.padding_bottom;
+    pl = convParamSpec.padding_left;
+    pr = convParamSpec.padding_right;
+
+    oh = fh + sh * (ih - 1) - pt - pb;
+    ow = fw + sw * (iw - 1) - pl - pr;
+
+    bool need_pad = false;
+    std::vector<ConvolutionForwardAlgorithm> deconvAlgorithms;
+    std::vector<U32> algoNumIndex;
+    std::vector<U32> vecW;
+    deconvolution_produce_algos_paras(
+        inputDesc, filterDesc, convParamSpec, &deconvAlgorithms, &algoNumIndex, &vecW, NULL, NULL);
+
+    if (idf == DF_NCHW) {
+        if (outputDesc) {
+            *outputDesc = tensor4df(idt, DF_NCHW, in, fc, oh, ow);
+        }
+        if (fw == 2 && fh == 2 && sw == 2 && sh == 2) {
+            U32 iw_align, item_w;
+            iw_align = ow;
+            U32 tmp_align = 0;
+            for (U32 i = 0; i < algoNumIndex[0]; i++) {
+                item_w = vecW[i];
+                item_w = ((item_w >> 8) > 0) ? 1 : item_w;
+                U32 j = ALIGN(ow, item_w);
+                tmp_align = (tmp_align < j) ? j : tmp_align;
+            }
+            iw_align = (iw_align < tmp_align) ? tmp_align : iw_align;
+            if (iw_align != iw) {
+                need_pad = true;
+            }
+            CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw_align, ih, ic, 0, 0, ow, oh, fc, idt, idt,
+                gclmemInputDesc, gclmemOutputDesc, need_pad));
+        } else {
+            CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw, ih, ic, 0, 0, ow, oh, fc, idt, idt,
+                gclmemInputDesc, gclmemOutputDesc, need_pad));
+        }
+        return SUCCESS;
+    }
+
+    return NOT_SUPPORTED;
+}
+
+EE deconvolution_infer_forward_algorithm_mali(GCLHandle_t handle,
+    TensorDesc inputDesc,
+    TensorDesc filterDesc,
+    ConvolutionParamSpec convParamSpec,
+    TensorDesc outputDesc,
+    ConvolutionPolicy policy,
+    ActivationMode activationMode,
+    ForwardRunInfoMali_t forwardRunInfo)
+{
+    if (forwardRunInfo == nullptr) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm);
+    if (algorithm != CONVOLUTION_ALGORITHM_NULL) {
+        return SUCCESS;
+    }
+    DataType dt;
+    U32 ih, iw, fc, fh, fw;
+    tensorSelectGet(inputDesc, NULL, NULL, NULL, NULL, &ih, &iw);
+    tensorSelectGet(filterDesc, &dt, NULL, NULL, &fc, &fh, &fw);
+    std::vector<ConvolutionForwardAlgorithm> deconvAlgorithms;
+    std::vector<U32> algoNumIndex;
+    std::vector<U32> vecW;
+    std::vector<U32> vecC;
+    std::vector<U32> vecK;
+    deconvolution_produce_algos_paras(inputDesc, filterDesc, convParamSpec, &deconvAlgorithms,
+        &algoNumIndex, &vecW, &vecC, &vecK);
+    if (vecW.size() == 1) {
+        forwardRunInfo->best_w[0] = vecW[0];
+        forwardRunInfo->best_k[0] = vecK[0];
+        forwardRunInfo->best_c[0] = vecC[0];
+        forwardRunInfo->algorithm = deconvAlgorithms[0];
+        return SUCCESS;
+    }
+
+    if (policy == CONVOLUTION_TUNNING) {
+        CHECK_STATUS(gcl_clean_kernelVec(handle));
+        CHECK_STATUS(gcl_enable_queue_profiling(handle));
+        GCLMem_t input = gcl_create_gclmem();
+        GCLMem_t filter = gcl_create_gclmem();
+        GCLMem_t output = gcl_create_gclmem();
+        GCLMem_t bias = gcl_create_gclmem();
+        GCLMem_t tmpbuf = gcl_create_gclmem();
+        U32 maxFilterSize = 0;
+        U32 maxBytes = 0;
+        U32 algosNum = 0;
+        std::vector<ForwardRunInfoMali> runInfos;
+        U32 stride[3] = {0, 0, 0};
+        U32 offset[3] = {0, 0, 0};
+        GCLMemDesc inputMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+        GCLMemDesc outputMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+        CHECK_STATUS(deconvolution_infer_output_size_mali(
+            inputDesc, filterDesc, convParamSpec, NULL, &inputMemDesc, &outputMemDesc));
+        std::vector<GCLMemDesc> filterMemDescs;
+        for (U32 i = 0; i < algoNumIndex.size(); i++) {
+            U32 bytes = 0;
+            ForwardRunInfoMali runInfo;
+            U32 be = (i == 0) ? 0 : algoNumIndex[i - 1];
+            U32 end = algoNumIndex[i];
+            runInfo.algorithm = deconvAlgorithms[i];
+            for (U32 j = be; j < end; j++) {
+                GCLMemDesc filterMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+                runInfo.best_w[0] = vecW[j];
+                runInfo.best_c[0] = vecC[j];
+                runInfo.best_k[0] = vecK[j];
+                if (deconvolution_transform_filter_bytes_mali(
+                        filterDesc, &runInfo, &filterMemDesc, &bytes) != SUCCESS) {
+                    continue;
+                }
+                maxBytes = (maxBytes < bytes) ? bytes : maxBytes;
+                if (deconvolution_infer_forward_tmp_bytes_mali(inputDesc, filterDesc, outputDesc,
+                        convParamSpec, &runInfo, &bytes) != SUCCESS) {
+                    continue;
+                }
+                maxBytes = (maxBytes < bytes) ? bytes : maxBytes;
+                maxFilterSize = (maxFilterSize < filterMemDesc.byteSize) ?
filterMemDesc.byteSize + : maxFilterSize; + filterMemDescs.push_back(filterMemDesc); + runInfos.push_back(runInfo); + } + } + + algosNum = runInfos.size(); + TensorDesc biasDesc = tensor1d(dt, fc); + filterMemDescs[0].byteSize = maxFilterSize; + input->desc = inputMemDesc; + output->desc = outputMemDesc; + filter->desc = filterMemDescs[0]; + stride[0] = (fc + 3) / 4; + stride[1] = 1; + stride[2] = 1; + MemFlags flags = CL_MEM_READ_WRITE; + CHECK_STATUS(gclmem_set_desc_padding( + &bias->desc, stride, offset, dt, DF_NHWC, GCL_MEM_IMG_1D, flags)); + tmpbuf->desc.byteSize = maxBytes; + gcl_create_memory(handle, input); + gcl_create_memory(handle, output); + gcl_create_memory(handle, filter); + gcl_create_memory(handle, bias); + if (maxBytes) { + gcl_create_memory(handle, tmpbuf); + } + + double minTimeGemm = DBL_MAX; + double minTime = DBL_MAX; + U32 runKernelBe = 0; + U32 runKernelEnd = 0; + ForwardRunInfoMali bestRunInfo; + ForwardRunInfoMali bestRunInfoGemm; + for (U32 i = 0; i < algosNum; i++) { + filter->desc = filterMemDescs[i]; + if (deconvolution_mali(handle, inputDesc, input, filterDesc, filter, convParamSpec, + &runInfos[i], biasDesc, NULL, biasDesc, bias, maxBytes, tmpbuf, outputDesc, + output, activationMode) == SUCCESS) { + if (runInfos[i].algorithm == (I32)CONVOLUTION_ALGORITHM_GEMM) { + runKernelEnd = handle->kernelVec->size(); + gcl_run_kernelVec_timing(handle, runKernelBe, runKernelEnd); + runKernelBe = runKernelEnd; + if (minTimeGemm > handle->t_execute) { + minTimeGemm = handle->t_execute; + bestRunInfoGemm = runInfos[i]; + } + } + } + } + minTime = minTimeGemm; + bestRunInfo = bestRunInfoGemm; + if (minTime == DBL_MAX) { + CHECK_STATUS(NOT_SUPPORTED); + } + *forwardRunInfo = bestRunInfo; + CHECK_STATUS(gcl_finish(handle)); + gcl_destroy_gclmem(input); + gcl_destroy_gclmem(filter); + gcl_destroy_gclmem(output); + gcl_destroy_gclmem(bias); + gcl_destroy_gclmem(tmpbuf); + deconvAlgorithms.clear(); + runInfos.clear(); + filterMemDescs.clear(); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + CHECK_STATUS(gcl_off_queue_profiling(handle)); + return SUCCESS; + } + return NOT_SUPPORTED; +} + +EE deconvolution_transform_filter_bytes_mali(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { + case DT_F16: { + ret = deconvolution_transform_filter_bytes_mali_fp16( + filterDesc, forwardRunInfo, gclmemFilterDesc, bytes); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE deconvolution_transform_filter_mali(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + GCLMem_t tmp, + TensorDesc *fltmemDesc, + GCLMem_t fltmem) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { + case DT_F16: { + ret = deconvolution_transform_filter_mali_fp16( + handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem, tmp); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE deconvolution_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = deconvolution_infer_forward_tmp_bytes_mali_fp16( + inputDesc, filterDesc, outputDesc, convParamSpec, forwardRunInfo, bytes); + break; + } + case 
DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} +EE deconvolution_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc scaleDesc, + const GCLMem_t scale, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + UNUSED(scaleDesc); + UNUSED(scale); + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = deconvolution_mali_fp16(handle, inputDesc, input, filterDesc, filter, + convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, + activationMode); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/depth2space.cpp b/compute/tensor/src/gpu/mali/depth2space.cpp new file mode 100644 index 00000000..2efe7598 --- /dev/null +++ b/compute/tensor/src/gpu/mali/depth2space.cpp @@ -0,0 +1,110 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
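+
+// Note on this file: depth2space rearranges channel blocks into spatial
+// blocks, so for block size b the shapes follow oc = ic / (b * b), oh = ih * b,
+// ow = iw * b, and ic must be divisible by b * b. For example, ic = 16, b = 2
+// gives oc = 4 with height and width doubled.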
+ +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/depth2space_mali_fp16.h" + +inline EE depth2space_checkpara_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + Depth2SpaceParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (handle == nullptr || nullptr == input || nullptr == output) { + return NULL_POINTER; + } + return SUCCESS; +} + +EE depth2space_infer_output_size_mali(TensorDesc inputDesc, + Depth2SpaceParamSpec p, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + if (outputDesc == nullptr || gclmemInputDesc == nullptr || gclmemOutputDesc == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + + DataType idt; + DataFormat idf; + U32 iw, ih, ic, in; + U32 ow, oh, oc, on; + tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); + on = in; + oc = ic / (p.blockSize * p.blockSize); + oh = ih * p.blockSize; + ow = iw * p.blockSize; + if (ic % (p.blockSize * p.blockSize) != 0) { + return NOT_MATCH; + } + + *outputDesc = tensor4df(idt, idf, on, oc, oh, ow); + if (gclmemInputDesc->byteSize == 0) { + CHECK_STATUS(infer_gclmem_desc_nchw( + iw, ih, ic, 0, 0, 0, 0, 0, DT_F16, DT_F16, gclmemInputDesc, NULL)); + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + 0, 0, 0, 0, 0, ow, oh, oc, DT_F16, DT_F16, NULL, gclmemOutputDesc)); + return SUCCESS; + } + + if (idf == DF_NCHW) { + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih, ic, 0, 0, ow, oh, oc, DT_F16, DT_F16, gclmemInputDesc, gclmemOutputDesc)); + return SUCCESS; + } + return NOT_SUPPORTED; +} + +EE depth2space_infer_tmpBuf_size_mali( + TensorDesc inputDesc, Depth2SpaceParamSpec p, TensorDesc outputDesc, U32 *bytes) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = depth2space_infer_tmpBuf_size_mali_fp16(inputDesc, p, outputDesc, bytes); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE depth2space_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + Depth2SpaceParamSpec p, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output) +{ + EE ret = SUCCESS; + CHECK_STATUS(depth2space_checkpara_mali(handle, inputDesc, input, p, outputDesc, output)); + switch (inputDesc.dt) { + case DT_F16: { + ret = depth2space_mali_fp16(handle, inputDesc, input, p, tmpBuf, outputDesc, output); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/depthwise_convolution.cpp b/compute/tensor/src/gpu/mali/depthwise_convolution.cpp new file mode 100644 index 00000000..ca09f531 --- /dev/null +++ b/compute/tensor/src/gpu/mali/depthwise_convolution.cpp @@ -0,0 +1,359 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/depthwise_convolution_mali_fp16.h" + +inline void depthwise_convolution_produce_algos_paras( + std::vector *depthwiseConvAlgorithms, + std::vector *algoNumIndex, + std::vector *vecW, + std::vector *vecC, + std::vector *vecK) +{ + U32 configNum = 8; + for (U32 i = 0; i < configNum; i++) { + if (vecW) { + (*vecW).push_back(i + 1); + } + if (vecC) { + (*vecC).push_back(1); + } + if (vecK) { + (*vecK).push_back(4); + } + } + if (depthwiseConvAlgorithms) { + (*depthwiseConvAlgorithms).push_back(DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT); + } + if (algoNumIndex) { + (*algoNumIndex).push_back(configNum); + } +} + +EE depthwise_convolution_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc filterDesc, + ConvolutionParamSpec convParamSpec, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + DataType idt; + DataFormat idf; + U32 iw, ih, ic, in; + U32 fw, fh; + U32 ow, oh; + U32 sw, sh, pl, pt, dw, dh, pr, pb; + tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); + tensorSelectGet(filterDesc, NULL, NULL, NULL, NULL, &fh, &fw); + pl = convParamSpec.padding_left; + pr = convParamSpec.padding_right; + pt = convParamSpec.padding_top; + pb = convParamSpec.padding_bottom; + sw = convParamSpec.stride_w; + sh = convParamSpec.stride_h; + dw = convParamSpec.dilatedRate_w; + dh = convParamSpec.dilatedRate_h; + if (fw < 1 || fh < 1) { + return NOT_SUPPORTED; + } + if (dw != 1 || dh != 1) { + return NOT_SUPPORTED; + } + if (sw != sh) { + return NOT_SUPPORTED; + } + if ((ic & 3) != 0) { + return NOT_SUPPORTED; + } + ow = (iw + pl + pr - fw) / sw + 1; + oh = (ih + pt + pb - fh) / sh + 1; + if (outputDesc) { + *outputDesc = tensor4df(idt, idf, in, ic, oh, ow); + } + bool need_pad = false; + + std::vector vecW; + depthwise_convolution_produce_algos_paras(NULL, NULL, &vecW, NULL, NULL); + U32 iw_align = ow; + for (auto item_w : vecW) { + U32 i = ALIGN(ow, item_w); + iw_align = (iw_align < i) ? i : iw_align; + } + U32 ext_w = (fw / 2 < pl) ? 
pl : fw / 2; + iw_align = iw_align * sw; + if (pl < ext_w) { + iw_align = iw_align + 2 * (ext_w - pl); + ext_w = pl; + } + if (iw_align != iw) { + need_pad = true; + } + if (ext_w != 0 || pt != 0) { + need_pad = true; + } + CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw_align, ih, ic, ext_w, pt, ow, oh, ic, idt, idt, + gclmemInputDesc, gclmemOutputDesc, need_pad)); + return SUCCESS; +} + +EE depthwise_convolution_infer_forward_algorithm_mali(GCLHandle_t handle, + TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionPolicy policy, + ActivationMode depthwiseActivationMode, + ForwardRunInfoMali_t forwardRunInfo) +{ + if (forwardRunInfo == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + DepthwiseConvolutionForwardAlgorithm algorithm = + (DepthwiseConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + if (algorithm != DEPTHWISE_CONVOLUTION_ALGORITHM_NULL) { + return SUCCESS; + } + if (policy == CONVOLUTION_LIBRARY_SEARCH) { + CHECK_STATUS(NOT_SUPPORTED); + } + if (policy == CONVOLUTION_FASTEST) { + CHECK_STATUS(NOT_SUPPORTED); + } + std::vector depthwiseConvAlgorithms; + std::vector algoNumIndex; + std::vector vecW; + std::vector vecC; + std::vector vecK; + depthwise_convolution_produce_algos_paras( + &depthwiseConvAlgorithms, &algoNumIndex, &vecW, &vecC, &vecK); + + if (policy == CONVOLUTION_TUNNING) { + CHECK_STATUS(gcl_clean_kernelVec(handle)); + CHECK_STATUS(gcl_enable_queue_profiling(handle)); + GCLMem_t input = gcl_create_gclmem(); + GCLMem_t filter = gcl_create_gclmem(); + GCLMem_t output = gcl_create_gclmem(); + GCLMem_t bias = gcl_create_gclmem(); + GCLMem_t tmpbuf = gcl_create_gclmem(); + U32 maxFilterSize = 0; + U32 maxBytes = 0; + U32 algosNum = 0; + std::vector runInfos; + U32 stride[3] = {0, 0, 0}; + U32 offset[3] = {0, 0, 0}; + GCLMemDesc inputMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + GCLMemDesc outputMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + CHECK_STATUS(depthwise_convolution_infer_output_size_mali( + inputDesc, filterDesc, convParamSpec, NULL, &inputMemDesc, &outputMemDesc)); + std::vector filterMemDescs; + U32 ic; + DataType dt; + tensorSelectGet(inputDesc, &dt, NULL, NULL, &ic, NULL, NULL); + for (U32 i = 0; i < algoNumIndex.size(); i++) { + U32 bytes = 0; + ForwardRunInfoMali runInfo; + U32 be = (i == 0) ? 0 : algoNumIndex[i - 1]; + U32 end = algoNumIndex[i]; + runInfo.algorithm = depthwiseConvAlgorithms[i]; + for (U32 j = be; j < end; j++) { + GCLMemDesc filterMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + runInfo.best_w[0] = vecW[j]; + runInfo.best_c[0] = vecC[j]; + runInfo.best_k[0] = vecK[j]; + if (depthwise_convolution_transform_filter_bytes_mali( + filterDesc, &runInfo, &filterMemDesc, &bytes) != SUCCESS) { + continue; + } + maxBytes = (maxBytes < bytes) ? bytes : maxBytes; + if (depthwise_convolution_infer_forward_tmp_bytes_mali(inputDesc, filterDesc, + outputDesc, convParamSpec, &runInfo, &bytes) != SUCCESS) { + continue; + } + maxBytes = (maxBytes < bytes) ? bytes : maxBytes; + maxFilterSize = (maxFilterSize < filterMemDesc.byteSize) ? 
filterMemDesc.byteSize + : maxFilterSize; + filterMemDescs.push_back(filterMemDesc); + runInfos.push_back(runInfo); + } + } + + TensorDesc biasDesc = tensor1d(dt, ic); + stride[0] = (ic + 3) / 4; + CHECK_STATUS(gclmem_set_desc_padding( + &bias->desc, stride, offset, dt, DF_NHWC, GCL_MEM_IMG_1D, CL_MEM_READ_WRITE)); + algosNum = runInfos.size(); + if (algosNum == 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + filterMemDescs[0].byteSize = maxFilterSize; + input->desc = inputMemDesc; + output->desc = outputMemDesc; + filter->desc = filterMemDescs[0]; + tmpbuf->desc.byteSize = maxBytes; + gcl_create_memory(handle, input); + gcl_create_memory(handle, output); + gcl_create_memory(handle, filter); + gcl_create_memory(handle, bias); + if (maxBytes) { + gcl_create_memory(handle, tmpbuf); + } + + double minTime = DBL_MAX; + U32 runKernelBe = 0; + U32 runKernelEnd = 0; + ForwardRunInfoMali bestRunInfo; + for (U32 i = 0; i < algosNum; i++) { + filter->desc = filterMemDescs[i]; + if (depthwise_convolution_mali(handle, inputDesc, input, filterDesc, filter, + convParamSpec, &runInfos[i], biasDesc, bias, maxBytes, tmpbuf, outputDesc, + output, depthwiseActivationMode) == SUCCESS) { + runKernelEnd = handle->kernelVec->size(); + gcl_run_kernelVec_timing(handle, runKernelBe, runKernelEnd); + if (minTime > handle->t_execute) { + minTime = handle->t_execute; + bestRunInfo = runInfos[i]; + } + runKernelBe = runKernelEnd; + } + } + if (minTime == DBL_MAX) { + CHECK_STATUS(NOT_SUPPORTED); + } + *forwardRunInfo = bestRunInfo; + CHECK_STATUS(gcl_finish(handle)); + gcl_destroy_gclmem(input); + gcl_destroy_gclmem(filter); + gcl_destroy_gclmem(output); + gcl_destroy_gclmem(bias); + gcl_destroy_gclmem(tmpbuf); + depthwiseConvAlgorithms.clear(); + runInfos.clear(); + filterMemDescs.clear(); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + CHECK_STATUS(gcl_off_queue_profiling(handle)); + return SUCCESS; + } + return NOT_SUPPORTED; +} + +EE depthwise_convolution_transform_filter_bytes_mali(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { + case DT_F16: { + ret = depthwise_convolution_transform_filter_bytes_mali_fp16( + filterDesc, forwardRunInfo, gclmemFilterDesc, bytes); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE depthwise_convolution_transform_filter_mali(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { + case DT_F16: { + ret = depthwise_convolution_transform_filter_mali_fp16( + handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE depthwise_convolution_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { + case DT_F16: { + ret = depthwise_convolution_infer_forward_tmp_bytes_mali_fp16( + inputDesc, filterDesc, outputDesc, convParamSpec, forwardRunInfo, bytes); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE depthwise_convolution_mali(GCLHandle_t handle, + TensorDesc 
inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = depthwise_convolution_mali_fp16(handle, inputDesc, input, filterDesc, filter, + convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, + depthwiseActivationMode); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/depthwise_pointwise_convolution.cpp b/compute/tensor/src/gpu/mali/depthwise_pointwise_convolution.cpp new file mode 100644 index 00000000..f7498c2b --- /dev/null +++ b/compute/tensor/src/gpu/mali/depthwise_pointwise_convolution.cpp @@ -0,0 +1,522 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/depthwise_pointwise_convolution_mali_fp16.h" +inline void depthwise_pointwise_convolution_produce_algos_paras(U32 pointwiseFilterNum, + std::vector *depthwisePointwiseConvAlgorithms, + std::vector *algoNumIndexD, + std::vector *vecWD, + std::vector *vecCD, + std::vector *vecKD, + std::vector *algoNumIndexP, + std::vector *vecWP, + std::vector *vecCP, + std::vector *vecKP) +{ + U32 algoNum = 2; + DepthwiseConvolutionForwardAlgorithm algo[2]; + algo[0] = DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT; + algo[1] = DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_GEMM; + U32 configNumsD[2]; + U32 configNumsP[2]; + U32 configNumD = 0; + U32 configNumP = 0; + U32 configInfo[3][128]; + for (U32 ii = 0; ii < algoNum; ii++) { + for (U32 i = 0; i < 8; i++) { + if (vecWD) { + (*vecWD).push_back(i + 1); + } + if (vecCD) { + (*vecCD).push_back(1); + } + if (vecKD) { + (*vecKD).push_back(4); + } + configNumD++; + } + configNumsD[ii] = configNumD; + U32 c = (algo[ii] == DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT) ? 
4 : 1; + U32 k = 4; + U32 nj = 8; + for (U32 i = 0; i < 2; i++) { + for (U32 j = 0; j < nj; j++) { + configInfo[0][configNumP] = j + 1; + configInfo[1][configNumP] = c; + configInfo[2][configNumP] = k; + configNumP++; + } + k = k << 1; + if (pointwiseFilterNum % k != 0) { + break; + } + if (algo[ii] == DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT) { + nj = 4; + } + } + configNumsP[ii] = configNumP; + } + + for (U32 i = 0; i < algoNum; i++) { + if (depthwisePointwiseConvAlgorithms) { + (*depthwisePointwiseConvAlgorithms).push_back(algo[i]); + } + if (algoNumIndexD) { + (*algoNumIndexD).push_back(configNumsD[i]); + } + if (algoNumIndexP) { + (*algoNumIndexP).push_back(configNumsP[i]); + } + } + for (U32 i = 0; i < configNumP; i++) { + if (vecWP) { + (*vecWP).push_back(configInfo[0][i]); + } + if (vecCP) { + (*vecCP).push_back(configInfo[1][i]); + } + if (vecKP) { + (*vecKP).push_back(configInfo[2][i]); + } + } +} + +EE depthwise_pointwise_convolution_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + ConvolutionParamSpec convParamSpec, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + DataType idt; + DataFormat idf; + U32 iw, ih, ic, in; + U32 dfw, dfh; + U32 pfn; + U32 ow, oh; + U32 sw, sh, pl, pt, dw, dh, pr, pb; + tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); + tensorSelectGet(dwFilterDesc, NULL, NULL, NULL, NULL, &dfh, &dfw); + tensorSelectGet(pwFilterDesc, NULL, NULL, &pfn, NULL, NULL, NULL); + pl = convParamSpec.padding_left; + pr = convParamSpec.padding_right; + pt = convParamSpec.padding_top; + pb = convParamSpec.padding_bottom; + sw = convParamSpec.stride_w; + sh = convParamSpec.stride_h; + dw = convParamSpec.dilatedRate_w; + dh = convParamSpec.dilatedRate_h; + if (dfw < 1 || dfh < 1) { + return NOT_SUPPORTED; + } + if (dw != 1 || dh != 1) { + return NOT_SUPPORTED; + } + if (sw != sh) { + return NOT_SUPPORTED; + } + if ((pfn & 3) != 0) { + return NOT_SUPPORTED; + } + ow = (iw + pl + pr - dfw) / sw + 1; + oh = (ih + pt + pb - dfh) / sh + 1; + if (outputDesc) { + *outputDesc = tensor4df(idt, idf, in, pfn, oh, ow); + } + bool need_pad = false; + + std::vector vecW; + depthwise_pointwise_convolution_produce_algos_paras( + pfn, NULL, NULL, &vecW, NULL, NULL, NULL, NULL, NULL, NULL); + U32 iw_align = ow; + for (auto item_w : vecW) { + U32 i = ALIGN(ow, item_w); + iw_align = (iw_align < i) ? i : iw_align; + } + U32 ext_w = (dfw / 2 < pl) ? 
pl : dfw / 2; + iw_align = iw_align * sw; + if (pl < ext_w) { + iw_align = iw_align + 2 * (ext_w - pl); + ext_w = pl; + } + if (iw_align != iw) { + need_pad = true; + } + if (ext_w != 0 || pt != 0) { + need_pad = true; + } + CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw_align, ih, ic, ext_w, pt, ow, oh, pfn, idt, idt, + gclmemInputDesc, gclmemOutputDesc, need_pad)); + return SUCCESS; +} + +EE depthwise_pointwise_convolution_infer_forward_algorithm_mali(GCLHandle_t handle, + TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionPolicy policy, + ActivationMode depthwiseActivationMode, + ActivationMode pointwiseActivationMode, + ForwardRunInfoMali_t forwardRunInfo) +{ + if (forwardRunInfo == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + DepthwiseConvolutionForwardAlgorithm algorithm = + (DepthwiseConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + if (algorithm != DEPTHWISE_CONVOLUTION_ALGORITHM_NULL) { + return SUCCESS; + } + if (policy == CONVOLUTION_LIBRARY_SEARCH) { + CHECK_STATUS(NOT_SUPPORTED); + } + if (policy == CONVOLUTION_FASTEST) { + CHECK_STATUS(NOT_SUPPORTED); + } + std::vector depthwisePointwiseConvAlgorithms; + std::vector algoNumIndexD; + std::vector vecWD; + std::vector vecCD; + std::vector vecKD; + std::vector algoNumIndexP; + std::vector vecWP; + std::vector vecCP; + std::vector vecKP; + DataType dt; + U32 pfn; + tensorSelectGet(pwFilterDesc, &dt, NULL, &pfn, NULL, NULL, NULL); + depthwise_pointwise_convolution_produce_algos_paras(pfn, &depthwisePointwiseConvAlgorithms, + &algoNumIndexD, &vecWD, &vecCD, &vecKD, &algoNumIndexP, &vecWP, &vecCP, &vecKP); + + if (policy == CONVOLUTION_TUNNING) { + CHECK_STATUS(gcl_clean_kernelVec(handle)); + CHECK_STATUS(gcl_enable_queue_profiling(handle)); + GCLMem_t input = gcl_create_gclmem(); + GCLMem_t dwFilter = gcl_create_gclmem(); + GCLMem_t pwFilter = gcl_create_gclmem(); + GCLMem_t output = gcl_create_gclmem(); + GCLMem_t tmpbuf = gcl_create_gclmem(); + GCLMem_t dwBias = gcl_create_gclmem(); + GCLMem_t pwBiasBuf = gcl_create_gclmem(); + GCLMem_t pwBiasImg = gcl_create_gclmem(); + U32 maxDwFilterSize = 0; + U32 maxPwFilterSize = 0; + U32 maxBytes = 0; + U32 algosNum = 0; + std::vector runInfos; + U32 stride[3] = {0, 0, 0}; + U32 offset[3] = {0, 0, 0}; + GCLMemDesc inputMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + GCLMemDesc outputMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + CHECK_STATUS(depthwise_pointwise_convolution_infer_output_size_mali(inputDesc, dwFilterDesc, + pwFilterDesc, convParamSpec, NULL, &inputMemDesc, &outputMemDesc)); + std::vector dwFilterMemDescs; + std::vector pwFilterMemDescs; + U32 ic, pfn; + tensorSelectGet(inputDesc, NULL, NULL, NULL, &ic, NULL, NULL); + tensorSelectGet(pwFilterDesc, NULL, NULL, &pfn, NULL, NULL, NULL); + if (algoNumIndexD.size() != algoNumIndexP.size()) { + CHECK_STATUS(NOT_MATCH); + } + + U32 runInfoBe[2][2]; + U32 runInfoEnd[2][2]; + U32 runInfoCount = 0; + for (U32 i = 0; i < algoNumIndexD.size(); i++) { + U32 bytes = 0; + ForwardRunInfoMali runInfo; + U32 be = (i == 0) ? 
0 : algoNumIndexD[i - 1]; + U32 end = algoNumIndexD[i]; + runInfo.algorithm = depthwisePointwiseConvAlgorithms[i]; + for (U32 j = 0; j < 2; j++) { + runInfoBe[i][j] = runInfoCount; + U32 depthwiseIndex = 0; + U32 pointwiseIndex = 0; + for (U32 k = be; k < end; k++) { + GCLMemDesc dwFilterMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + GCLMemDesc pwFilterMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + if (j == 0) { + depthwiseIndex = k; + } + if (j == 1) { + pointwiseIndex = k; + } + runInfo.best_w[0] = vecWD[depthwiseIndex]; + runInfo.best_c[0] = vecCD[depthwiseIndex]; + runInfo.best_k[0] = vecKD[depthwiseIndex]; + runInfo.best_w[1] = vecWP[pointwiseIndex]; + runInfo.best_c[1] = vecCP[pointwiseIndex]; + runInfo.best_k[1] = vecKP[pointwiseIndex]; + runInfoCount++; + if (depthwise_pointwise_convolution_transform_filter_bytes_mali(dwFilterDesc, + pwFilterDesc, &runInfo, &dwFilterMemDesc, &pwFilterMemDesc, + &bytes) != SUCCESS) { + continue; + } + maxBytes = (maxBytes < bytes) ? bytes : maxBytes; + if (depthwise_pointwise_convolution_infer_forward_tmp_bytes_mali(inputDesc, + dwFilterDesc, pwFilterDesc, outputDesc, convParamSpec, &runInfo, + &bytes) != SUCCESS) { + continue; + } + maxBytes = (maxBytes < bytes) ? bytes : maxBytes; + maxDwFilterSize = (maxDwFilterSize < dwFilterMemDesc.byteSize) + ? dwFilterMemDesc.byteSize + : maxDwFilterSize; + maxPwFilterSize = (maxPwFilterSize < pwFilterMemDesc.byteSize) + ? pwFilterMemDesc.byteSize + : maxPwFilterSize; + dwFilterMemDescs.push_back(dwFilterMemDesc); + pwFilterMemDescs.push_back(pwFilterMemDesc); + runInfos.push_back(runInfo); + } + runInfoEnd[i][j] = runInfoCount; + be = (i == 0) ? 0 : algoNumIndexP[i - 1]; + end = algoNumIndexP[i]; + } + } + + TensorDesc dwBiasDesc = tensor1d(dt, ic); + TensorDesc pwBiasDesc = tensor1d(dt, pfn); + U32 dwStride[3] = {(ic + 3) / 4, 1, 1}; + CHECK_STATUS(gclmem_set_desc_padding( + &dwBias->desc, dwStride, offset, dt, DF_NHWC, GCL_MEM_IMG_1D, CL_MEM_READ_WRITE)); + U32 pwStride[3] = {(pfn + 3) / 4, 1, 1}; + CHECK_STATUS(gclmem_set_desc_padding( + &pwBiasImg->desc, pwStride, offset, dt, DF_NHWC, GCL_MEM_IMG_1D, CL_MEM_READ_WRITE)); + pwStride[0] = (pfn + 7) / 8 * 8; + CHECK_STATUS(gclmem_set_desc_padding( + &pwBiasBuf->desc, pwStride, offset, dt, DF_NHWC, GCL_MEM_BUF, CL_MEM_READ_WRITE)); + + algosNum = runInfos.size(); + if (algosNum == 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + dwFilterMemDescs[0].byteSize = maxDwFilterSize; + pwFilterMemDescs[0].byteSize = maxPwFilterSize; + input->desc = inputMemDesc; + output->desc = outputMemDesc; + dwFilter->desc = dwFilterMemDescs[0]; + pwFilter->desc = pwFilterMemDescs[0]; + tmpbuf->desc.byteSize = maxBytes; + gcl_create_memory(handle, input); + gcl_create_memory(handle, output); + gcl_create_memory(handle, dwFilter); + gcl_create_memory(handle, pwFilter); + gcl_create_memory(handle, dwBias); + gcl_create_memory(handle, pwBiasImg); + gcl_create_memory(handle, pwBiasBuf); + if (maxBytes) { + gcl_create_memory(handle, tmpbuf); + } + + double minTimeDepthwise[2] = {DBL_MAX, DBL_MAX}; + double minTimePointwise[2] = {DBL_MAX, DBL_MAX}; + ForwardRunInfoMali bestRunInfo[2]; + U32 runKernelBe = 0; + U32 runKernelEnd = 0; + + for (U32 i = 0; i < 2; i++) { + U32 depthwiseBe = runInfoBe[i][0]; + U32 depthwiseEnd = runInfoEnd[i][0]; + U32 pointwiseBe = runInfoBe[i][1]; + U32 pointwiseEnd = runInfoEnd[i][1]; + GCLMem_t pwBias = (i == 0) ? 
pwBiasImg : pwBiasBuf; + for (U32 j = depthwiseBe; j < depthwiseEnd; j++) { + if (depthwise_pointwise_convolution_mali(handle, inputDesc, input, dwFilterDesc, + pwFilterDesc, dwFilter, pwFilter, convParamSpec, &runInfos[j], dwBiasDesc, + pwBiasDesc, dwBias, pwBias, maxBytes, tmpbuf, outputDesc, output, + depthwiseActivationMode, pointwiseActivationMode) == SUCCESS) { + runKernelEnd = handle->kernelVec->size(); + gcl_run_kernelVec_timing(handle, runKernelBe, runKernelBe + 1); + if (minTimeDepthwise[i] > handle->t_execute) { + minTimeDepthwise[i] = handle->t_execute; + bestRunInfo[i].algorithm = runInfos[j].algorithm; + bestRunInfo[i].best_w[0] = runInfos[j].best_w[0]; + bestRunInfo[i].best_c[0] = runInfos[j].best_c[0]; + bestRunInfo[i].best_k[0] = runInfos[j].best_k[0]; + } + runKernelBe = runKernelEnd; + } + } + for (U32 j = pointwiseBe; j < pointwiseEnd; j++) { + if (depthwise_pointwise_convolution_mali(handle, inputDesc, input, dwFilterDesc, + pwFilterDesc, dwFilter, pwFilter, convParamSpec, &runInfos[j], dwBiasDesc, + pwBiasDesc, dwBias, pwBias, maxBytes, tmpbuf, outputDesc, output, + depthwiseActivationMode, pointwiseActivationMode) == SUCCESS) { + runKernelEnd = handle->kernelVec->size(); + gcl_run_kernelVec_timing(handle, runKernelEnd - 1, runKernelEnd); + if (minTimePointwise[i] > handle->t_execute) { + minTimePointwise[i] = handle->t_execute; + bestRunInfo[i].algorithm = runInfos[j].algorithm; + bestRunInfo[i].best_w[1] = runInfos[j].best_w[1]; + bestRunInfo[i].best_c[1] = runInfos[j].best_c[1]; + bestRunInfo[i].best_k[1] = runInfos[j].best_k[1]; + } + runKernelBe = runKernelEnd; + } + } + } + + double minTimeDirect = minTimeDepthwise[0] + minTimePointwise[0]; + double minTimeGemm = minTimeDepthwise[1] + minTimePointwise[1]; + if (minTimeDirect == DBL_MAX && minTimeGemm == DBL_MAX) { + CHECK_STATUS(NOT_SUPPORTED); + } + if (minTimeDirect > minTimeGemm) { + bestRunInfo[0] = bestRunInfo[1]; + } + + *forwardRunInfo = bestRunInfo[0]; + CHECK_STATUS(gcl_finish(handle)); + gcl_destroy_gclmem(input); + gcl_destroy_gclmem(dwFilter); + gcl_destroy_gclmem(pwFilter); + gcl_destroy_gclmem(output); + gcl_destroy_gclmem(dwBias); + gcl_destroy_gclmem(pwBiasImg); + gcl_destroy_gclmem(pwBiasBuf); + gcl_destroy_gclmem(tmpbuf); + runInfos.clear(); + dwFilterMemDescs.clear(); + pwFilterMemDescs.clear(); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + CHECK_STATUS(gcl_off_queue_profiling(handle)); + return SUCCESS; + } + return NOT_SUPPORTED; +} + +EE depthwise_pointwise_convolution_transform_filter_bytes_mali(TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemDwFilterDesc, + GCLMemDesc_t gclmemPwFilterDesc, + U32 *bytes) +{ + EE ret = SUCCESS; + switch (dwFilterDesc.dt) { + case DT_F16: { + ret = depthwise_pointwise_convolution_transform_filter_bytes_mali_fp16(dwFilterDesc, + pwFilterDesc, forwardRunInfo, gclmemDwFilterDesc, gclmemPwFilterDesc, bytes); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE depthwise_pointwise_convolution_transform_filter_mali(GCLHandle_t handle, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + GCLMem_t dwFilter, + GCLMem_t pwFilter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *dwfltmemDesc, + TensorDesc *pwfltmemDesc, + GCLMem_t dwfltmem, + GCLMem_t pwfltmem) +{ + EE ret = SUCCESS; + switch (dwFilterDesc.dt) { + case DT_F16: { + ret = depthwise_pointwise_convolution_transform_filter_mali_fp16(handle, dwFilterDesc, + 
pwFilterDesc, dwFilter, pwFilter, forwardRunInfo, dwfltmemDesc, pwfltmemDesc, + dwfltmem, pwfltmem); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE depthwise_pointwise_convolution_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = depthwise_pointwise_convolution_infer_forward_tmp_bytes_mali_fp16(inputDesc, + dwFilterDesc, pwFilterDesc, outputDesc, convParamSpec, forwardRunInfo, bytes); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE depthwise_pointwise_convolution_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + const GCLMem_t dwFilter, + const GCLMem_t pwFilter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc dwBiasDesc, + TensorDesc pwBiasDesc, + const GCLMem_t dwBias, + const GCLMem_t pwBias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode, + ActivationMode pointwiseActivationMode) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = depthwise_pointwise_convolution_mali_fp16(handle, inputDesc, input, dwFilterDesc, + pwFilterDesc, dwFilter, pwFilter, convParamSpec, forwardRunInfo, dwBiasDesc, + pwBiasDesc, dwBias, pwBias, tmpBytes, tmpBuf, outputDesc, output, + depthwiseActivationMode, pointwiseActivationMode); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/eltwise.cpp b/compute/tensor/src/gpu/mali/eltwise.cpp new file mode 100644 index 00000000..fd5211ea --- /dev/null +++ b/compute/tensor/src/gpu/mali/eltwise.cpp @@ -0,0 +1,202 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
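Unlike the single-kernel searches, the depthwise-pointwise tuner above times its two stages independently: for depthwise candidates it profiles only the first kernel of each enqueued pair (runKernelBe to runKernelBe + 1), for pointwise candidates only the last (runKernelEnd - 1 to runKernelEnd), and it then stitches the per-stage winners into one ForwardRunInfoMali per algorithm before comparing the direct and GEMM totals. The stitching step, reduced to plain C++ — Info and the small arrays are hypothetical stand-ins for the bestRunInfo/minTime bookkeeping:

#include <cfloat>

struct Info { int algorithm; int w[2], c[2], k[2]; };  // [0]=depthwise, [1]=pointwise

// Merge per-stage winners, then pick DIRECT (i==0) or GEMM (i==1) by total time.
static Info combine(const double dwTime[2], const double pwTime[2],
    const Info dwBest[2], const Info pwBest[2])
{
    Info best[2];
    for (int i = 0; i < 2; i++) {
        best[i] = dwBest[i];            // carries algorithm + depthwise tile sizes
        best[i].w[1] = pwBest[i].w[1];  // graft on the pointwise tile sizes
        best[i].c[1] = pwBest[i].c[1];
        best[i].k[1] = pwBest[i].k[1];
    }
    double direct = dwTime[0] + pwTime[0];
    double gemm = dwTime[1] + pwTime[1];
    return (direct > gemm) ? best[1] : best[0];
}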
+ +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/eltwise_mali_fp16.h" + +inline void gcl_mem_desc_align(U32 size, DataType dt, GCLMemDesc_t desc) +{ + U32 s0 = desc[0].stride[0]; + U32 s1 = desc[0].stride[1]; + U32 s2 = desc[0].stride[2]; + U32 off0 = desc[0].offset[0]; + U32 off1 = desc[0].offset[1]; + U32 off2 = desc[0].offset[2]; + for (U32 i = 1; i < size; i++) { + s0 = (s0 >= desc[i].stride[0]) ? s0 : desc[i].stride[0]; + s1 = (s1 >= desc[i].stride[1]) ? s1 : desc[i].stride[1]; + s2 = (s2 >= desc[i].stride[2]) ? s2 : desc[i].stride[2]; + off0 = (off0 >= desc[i].offset[0]) ? off0 : desc[i].offset[0]; + off1 = (off1 >= desc[i].offset[1]) ? off1 : desc[i].offset[1]; + off2 = (off2 >= desc[i].offset[2]) ? off2 : desc[i].offset[2]; + } + U32 num = s0 * s1 * s2 * 4; + U32 byteSize = num * bytesOf(dt); + for (U32 i = 0; i < size; i++) { + desc[i].stride[0] = s0; + desc[i].stride[1] = s1; + desc[i].stride[2] = s2; + desc[i].offset[0] = off0; + desc[i].offset[1] = off1; + desc[i].offset[2] = off2; + desc[i].num = num; + desc[i].byteSize = byteSize; + } +} +EE eltwise_infer_output_size_mali(std::vector inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + /*tensorDesc record cpu org data format info*/ + /*gclmemDesc record gpu trans data format info*/ + U32 arrayDimMax = 0; + bool sameDesc = eltwise_same_desc(inputDesc, &arrayDimMax); + if (outputDesc) { + *outputDesc = inputDesc[arrayDimMax]; + } + + DataType idt; + DataFormat idf; + U32 iw, ih, ic, in; + tensorSelectGet(inputDesc[arrayDimMax], &idt, &idf, &in, &ic, &ih, &iw); + + if (sameDesc) { + U32 size = inputDesc.size(); + if (gclmemInputDesc) { + bool inputIsModelInput = true; + bool inputIsAllNCHW = true; + + for (U32 i = 0; i < size; i++) { + if (gclmemInputDesc[i].byteSize > 0) { + inputIsModelInput = false; + } + if (gclmemInputDesc[i].memFormat != DF_NCHW) { + inputIsAllNCHW = false; + } + } + + if (inputIsAllNCHW && !inputIsModelInput) { + CHECK_STATUS(infer_gclmem_desc_nchw( + iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); + for (U32 i = 0; i < size; i++) { + DataType tdt; + U32 tw, th, tc; + tensorSelectGet(inputDesc[i], &tdt, NULL, NULL, &tc, &th, &tw); + CHECK_STATUS(infer_gclmem_desc_nchw( + tw, th, tc, 0, 0, tw, th, tc, tdt, tdt, &gclmemInputDesc[i], NULL)); + } + } else { + for (U32 i = 0; i < size; i++) { + DataType tdt; + U32 tw, th, tc; + tensorSelectGet(inputDesc[i], &tdt, NULL, NULL, &tc, &th, &tw); + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + tw, th, tc, 0, 0, tw, th, tc, tdt, tdt, &gclmemInputDesc[i], NULL)); + } + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, NULL, gclmemOutputDesc)); + } + gcl_mem_desc_align(size, idt, gclmemInputDesc); + } + return SUCCESS; + } else { + if (inputDesc.size() > 2) { + CHECK_STATUS(NOT_SUPPORTED); + } + if (gclmemInputDesc) { + CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, + &gclmemInputDesc[arrayDimMax], gclmemOutputDesc)); + tensorSelectGet(inputDesc[1 - arrayDimMax], &idt, NULL, &in, &ic, &ih, &iw); + if (gclmemInputDesc[1 - arrayDimMax].byteSize == 0 || + gclmemInputDesc[1 - arrayDimMax].memFormat == DF_NCHW) { + CHECK_STATUS(infer_gclmem_desc_nchw(iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, + &gclmemInputDesc[1 - arrayDimMax], NULL)); + } else { + CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, + 
&gclmemInputDesc[1 - arrayDimMax], NULL)); + } + } + return SUCCESS; + } + return NOT_SUPPORTED; +} + +inline EE eltwise_checkpara_mali(GCLHandle_t handle, + std::vector inputDesc, + std::vector input, + EltwiseParamSpec eltwiseDesc, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (handle == nullptr || nullptr == output) { + return NULL_POINTER; + } + for (auto it : input) { + GCLMem_t ptr = (GCLMem_t)it; + if (ptr == nullptr) { + return NULL_POINTER; + } + } + EltwiseMode eltwiseMode = eltwiseDesc.elt_mode; + U32 arrayDimMax = 0; + bool sameDesc = eltwise_same_desc(inputDesc, &arrayDimMax); + if (sameDesc) { + for (auto it : input) { + if (((GCLMem_t)(it))->desc.memFormat != output->desc.memFormat) { + return NOT_SUPPORTED; + } + } + for (auto it : inputDesc) { + // if(it.df != outputDesc.df) return NOT_SUPPORTED; + if (it.dims[0] != outputDesc.dims[0]) { + return NOT_SUPPORTED; + } + if (it.dims[1] != outputDesc.dims[1]) { + return NOT_SUPPORTED; + } + if (it.dims[2] != outputDesc.dims[2]) { + return NOT_SUPPORTED; + } + if (it.dims[3] != outputDesc.dims[3]) { + return NOT_SUPPORTED; + } + } + } else { + if (inputDesc.size() > 2) { + CHECK_STATUS(NOT_SUPPORTED); + } + } + if (outputDesc.df != DF_NCHW && outputDesc.df != DF_MKT) { + return NOT_SUPPORTED; + } + if (eltwiseMode != ELTWISE_SUM && eltwiseMode != ELTWISE_MAX && eltwiseMode != ELTWISE_PROD) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +EE eltwise_mali(GCLHandle_t handle, + std::vector inputDesc, + std::vector input, + EltwiseParamSpec eltwiseDesc, + TensorDesc outputDesc, + GCLMem_t output) +{ + EE ret = SUCCESS; + CHECK_STATUS(eltwise_checkpara_mali(handle, inputDesc, input, eltwiseDesc, outputDesc, output)); + switch (inputDesc[0].dt) { + case DT_F16: { + ret = eltwise_mali_fp16(handle, inputDesc, input, outputDesc, output, eltwiseDesc); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/embedding.cpp b/compute/tensor/src/gpu/mali/embedding.cpp new file mode 100644 index 00000000..62f0a063 --- /dev/null +++ b/compute/tensor/src/gpu/mali/embedding.cpp @@ -0,0 +1,86 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
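gcl_mem_desc_align in eltwise.cpp above is what lets several differently padded inputs share one kernel indexing scheme: it reduces every stride and offset to its elementwise maximum across the inputs, then rewrites all descriptors with those maxima and the num/byteSize they imply. The same reduce-then-broadcast step in isolation; Desc is a cut-down stand-in for GCLMemDesc, and the trailing * 4 reflects the ncwhc4 channel blocking:

#include <algorithm>
#include <cstdint>
#include <vector>

struct Desc { uint32_t stride[3], offset[3], num, byteSize; };

static void align_descs(std::vector<Desc> &descs, uint32_t bytesPerElem)
{
    uint32_t s[3] = {0, 0, 0}, off[3] = {0, 0, 0};
    for (const Desc &d : descs) {  // pass 1: elementwise maxima
        for (int i = 0; i < 3; i++) {
            s[i] = std::max(s[i], d.stride[i]);
            off[i] = std::max(off[i], d.offset[i]);
        }
    }
    uint32_t num = s[0] * s[1] * s[2] * 4;  // 4-channel ncwhc4 blocks
    for (Desc &d : descs) {  // pass 2: broadcast the shared layout
        std::copy(s, s + 3, d.stride);
        std::copy(off, off + 3, d.offset);
        d.num = num;
        d.byteSize = num * bytesPerElem;
    }
}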
+ +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/embedding_mali_fp16.h" + +EE embedding_infer_output_size_mali(TensorDesc inputDesc, + EmbedParamSpec p, + DataType dt, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + DataType idt; + DataFormat df; + U32 batch, step; + CHECK_REQUIREMENT(tensorIs2d(inputDesc)); + CHECK_STATUS(tensor2dGet(inputDesc, &idt, &df, &batch, &step)); + if (outputDesc) { + *outputDesc = tensor3df(dt, DF_MKT, batch, p.num_output, step); + } + + if (df == DF_NORMAL) { + U32 iw = step; + U32 ih = batch; + U32 ic = 1; + CHECK_STATUS( + infer_gclmem_desc_nchw(iw, ih, ic, 0, 0, 0, 0, 0, idt, dt, gclmemInputDesc, NULL)); + + U32 m = 1; + U32 ow, oh, oc; + map_nlp_mkt_to_ncwhc4(m, p.num_output, step, &ow, &oh, &oc); + /*oc has been divided 4 in map_nlp_xxx, need to mul 4 for infer_xxx_ncwhc4*/ + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + 0, 0, 0, 0, 0, ow, oh, oc * 4, idt, dt, NULL, gclmemOutputDesc)); + return SUCCESS; + } + return NOT_SUPPORTED; +} + +inline EE embedding_checkpara_mali( + GCLHandle_t handle, GCLMem_t input, GCLMem_t weight, GCLMem_t output) +{ + if (nullptr == handle || nullptr == input || nullptr == weight || nullptr == output) { + return NULL_POINTER; + } + return SUCCESS; +} + +EE embedding_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc weightDesc, + GCLMem_t weight, + EmbedParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + EE ret = SUCCESS; + CHECK_STATUS(embedding_checkpara_mali(handle, input, weight, output)); + switch (outputDesc.dt) { + case DT_F16: { + ret = embedding_mali_fp16( + handle, inputDesc, input, weightDesc, weight, p, outputDesc, output); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/fp16/activation_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/activation_mali_fp16.cpp new file mode 100644 index 00000000..1c80e604 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/activation_mali_fp16.cpp @@ -0,0 +1,106 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
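embedding_infer_output_size_mali above leans on map_nlp_mkt_to_ncwhc4 to fold the NLP (m, k, t) shape into GPU block coordinates, and, as the in-code comment notes, that helper already divides the channel count by 4, so the caller multiplies oc by 4 again before infer_gclmem_desc_ncwhc4. A guess at the mapping consistent with that comment — the helper body here is an assumption; only its calling convention is taken from the source:

#include <cstdint>

// Assumed shape of map_nlp_mkt_to_ncwhc4: batch -> w, sequence -> h,
// hidden channels grouped 4-wide (c counts blocks, not raw channels).
static inline void mkt_to_ncwhc4(
    uint32_t m, uint32_t k, uint32_t t, uint32_t *w, uint32_t *h, uint32_t *c)
{
    *w = m;
    *h = t;
    *c = (k + 3) / 4;  // callers multiply by 4 to recover raw channel counts
}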
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/activation_mali_fp16.h" + +inline EE activation_checkpara_mali_fp16(TensorDesc inputDesc) +{ + if (inputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE activation_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + UNUSED(inputDesc); + U32 ow, oh, oc, on; + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + U32 iw_str, ih_str, iw_off, ih_off; + U32 ow_str, oh_str, ow_off, oh_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, NULL, &iw_off, &ih_off); + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + cl_mem inbuf, outbuf; + inbuf = input->mem; + outbuf = output->mem; + + char modeName[16]; + switch (activationMode) { + case ACTIVATION_NULL: + return SUCCESS; + case ACTIVATION_RELU: + strcpy(modeName, "relu"); + break; + case ACTIVATION_RELU6: + strcpy(modeName, "relu6"); + break; + case ACTIVATION_H_SIGMOID: + strcpy(modeName, "hsigmoid"); + break; + case ACTIVATION_H_SWISH: + strcpy(modeName, "hswish"); + break; + case ACTIVATION_GELU: + strcpy(modeName, "gelu"); + break; + case ACTIVATION_TANH: + strcpy(modeName, "tanh"); + break; + case ACTIVATION_SIGMOID: + strcpy(modeName, "sigmoid"); + break; + default: + return NOT_SUPPORTED; + } + char kernelName[128]; + U32 H = 1; + sprintf(kernelName, "activation_%s%d", modeName, H); + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + U32 cd4 = (oc + 3) / 4; + U32 ce4 = (oc & 3) == 0 ? 4 : (oc & 3); + CHECK_STATUS(gcl_set_kernelArgs(kernel, oh, ow, cd4, ce4, ih_str, iw_str, ih_off, iw_off, + oh_str, ow_str, oh_off, ow_off, inbuf, outbuf)); + U32 gs[3] = {oh, ow, (oc + 3) / 4}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); +#endif + return SUCCESS; +} + +EE activation_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + CHECK_STATUS(activation_checkpara_mali_fp16(inputDesc)); + if (input->mem != output->mem) { + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + } + CHECK_STATUS( + activation_core_mali_fp16(handle, inputDesc, input, outputDesc, output, activationMode)); + return SUCCESS; +} diff --git a/tensor_computing/src/gpu/mali/fp16/activation_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/activation_mali_fp16.h similarity index 78% rename from tensor_computing/src/gpu/mali/fp16/activation_mali_fp16.h rename to compute/tensor/src/gpu/mali/fp16/activation_mali_fp16.h index 8bc9fdb6..ca8af3f0 100644 --- a/tensor_computing/src/gpu/mali/fp16/activation_mali_fp16.h +++ b/compute/tensor/src/gpu/mali/fp16/activation_mali_fp16.h @@ -14,15 +14,14 @@ #ifndef _ACTIVATION_MALI_FP16 #define _ACTIVATION_MALI_FP16 #include "sys.h" -#include "type.h" -#include "tensor_desc.h" #include "error.h" +#include "types.h" #include "tensor_computing_type.h" -EE activation_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output, - ActivationMode activationMode); +EE activation_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode); #endif diff --git 
a/compute/tensor/src/gpu/mali/fp16/argmax_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/argmax_mali_fp16.cpp new file mode 100644 index 00000000..ceef38be --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/argmax_mali_fp16.cpp @@ -0,0 +1,204 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/argmax_mali_fp16.h" +#define get_thread_num(len, maxThreadNum, threadNum) \ + { \ + threadNum = ((len + 7) / 8 < maxThreadNum) ? (len + 7) / 8 : maxThreadNum; \ + } + +inline EE argmax_checkpara_mali_fp16(TensorDesc inputDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + if (outputDesc.dt != DT_U32) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE argmax_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + I32 axis, + GCLMem_t tmpbuf, + TensorDesc outputDesc, + GCLMem_t output) +{ + UNUSED(outputDesc); + if (axis < 0) { + axis += inputDesc.nDims; + } + axis = inputDesc.nDims - 1 - axis; + if (axis == 0) { + DataType dt = inputDesc.dt; + U32 iw, ih, ic; + U32 inDims = inputDesc.nDims; + iw = inputDesc.dims[0]; + ih = (inDims > 1) ? inputDesc.dims[1] : 1; + ic = (inDims > 2) ? 
inputDesc.dims[2] : 1; + U32 iw_str, ih_str, iw_off, ih_off; + U32 ow_str, oh_str, ow_off, oh_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, NULL, &iw_off, &ih_off); + U32 threadNum; + Kernel kernel; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Mem inv1024 = input->mem; + Mem ini1024 = input->mem; + Mem inv128 = input->mem; + Mem ini128 = input->mem; + Mem inv1 = input->mem; + Mem ini1 = input->mem; + Mem outv1024, outi1024, outv128, outi128; + char kernelName[128]; + char kernelNameIndex[128]; + sprintf(kernelName, "argmax_x"); + sprintf(kernelNameIndex, "argmax_x_index"); + bool use_index = false; + U32 offset = 0; + U32 len = iw; + get_thread_num(len, 1024, threadNum); + if (threadNum > 128) { + U32 outNum = 1024 * ih * ic; + U32 outvSize = outNum * bytesOf(dt); + U32 outiSize = outNum * bytesOf(DT_U32); + ow_str = threadNum; + oh_str = ih; + ow_off = 0; + oh_off = 0; + CHECK_STATUS(gcl_create_sub_buffer(outvSize, &offset, tmpbuf, &outv1024)); + CHECK_STATUS(gcl_create_sub_buffer(outiSize, &offset, tmpbuf, &outi1024)); + gs[0] = threadNum; + gs[1] = ih; + gs[2] = ic; + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, ow_str, oh_str, + ow_off, oh_off, len, gs[0], gs[1], inv1024, ini1024, outv1024, outi1024)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); + inv128 = outv1024; + ini128 = outi1024; + iw_str = ow_str; + ih_str = oh_str; + iw_off = ow_off; + ih_off = oh_off; + use_index = true; + len = threadNum; +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); + CHECK_STATUS(gcl_print_buffer(handle, inv1024, input->desc.num, "argmax_input")); + CHECK_STATUS(gcl_print_buffer(handle, outv1024, outNum, "argmax_output_value")); + CHECK_STATUS(gcl_print_buffer(handle, outi1024, outNum, "argmax_output_value")); +#endif + } + + get_thread_num(len, 128, threadNum); + if (threadNum > 1) { + U32 outNum = 128 * ih * ic; + U32 outvSize = outNum * bytesOf(dt); + U32 outiSize = outNum * bytesOf(DT_U32); + ow_str = threadNum; + oh_str = ih; + ow_off = 0; + oh_off = 0; + CHECK_STATUS(gcl_create_sub_buffer(outvSize, &offset, tmpbuf, &outv128)); + CHECK_STATUS(gcl_create_sub_buffer(outiSize, &offset, tmpbuf, &outi128)); + gs[0] = threadNum; + gs[1] = ih; + gs[2] = ic; + if (use_index) { + CHECK_STATUS(gcl_create_kernel(handle, kernelNameIndex, &kernel)); + } else { + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + } + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, ow_str, oh_str, + ow_off, oh_off, len, gs[0], gs[1], inv128, ini128, outv128, outi128)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); + inv1 = outv128; + ini1 = outi128; + iw_str = ow_str; + ih_str = oh_str; + iw_off = ow_off; + ih_off = oh_off; + use_index = true; + len = threadNum; +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); + CHECK_STATUS(gcl_print_buffer(handle, outv128, outNum, "argmax_output_index")); + CHECK_STATUS(gcl_print_buffer(handle, outi128, outNum, "argmax_output_value")); +#endif + } + + gs[0] = 1; + gs[1] = ih; + gs[2] = ic; + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + if (use_index) { + CHECK_STATUS(gcl_create_kernel(handle, kernelNameIndex, &kernel)); + } else { + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + } + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, ow_str, oh_str, + ow_off, oh_off, len, 
gs[0], gs[1], inv1, ini1, output->mem, output->mem)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + if (use_index) { + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelNameIndex)); + } else { + CHECK_STATUS(gcl_print_memory(handle, input, "argmax_input")); + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); + } + CHECK_STATUS(gcl_print_memory(handle, output, "argmax_output")); +#endif + return SUCCESS; + } + return NOT_SUPPORTED; +} + +EE argmax_infer_forward_tmp_bytes_mali_fp16( + TensorDesc inputDesc, I32 axis, TensorDesc outputDesc, U32 *bytes) +{ + UNUSED(axis); + UNUSED(outputDesc); + DataType dt = inputDesc.dt; + U32 iw, ih, ic; + U32 inDims = inputDesc.nDims; + iw = inputDesc.dims[0]; + ih = (inDims > 1) ? inputDesc.dims[1] : 1; + ic = (inDims > 2) ? inputDesc.dims[2] : 1; + U32 size = 1024 * ih * ic * bytesOf(dt); + size += 1024 * ih * ic * bytesOf(DT_U32); + size += (128 * ih * ic * bytesOf(dt) + 1023) / 1024 * 1024; + size += (128 * ih * ic * bytesOf(DT_U32) + 1023) / 1024 * 1024; + *bytes = size; + return SUCCESS; +} + +EE argmax_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + I32 axis, + GCLMem_t tmpbuf, + TensorDesc outputDesc, + GCLMem_t output) +{ + CHECK_STATUS(argmax_checkpara_mali_fp16(inputDesc, outputDesc)); + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS(argmax_core_mali_fp16(handle, inputDesc, input, axis, tmpbuf, outputDesc, output)); + return SUCCESS; +} diff --git a/tensor_computing/src/gpu/mali/fp16/multiply_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/argmax_mali_fp16.h similarity index 76% rename from tensor_computing/src/gpu/mali/fp16/multiply_mali_fp16.h rename to compute/tensor/src/gpu/mali/fp16/argmax_mali_fp16.h index 9060a2a6..76e52ab3 100644 --- a/tensor_computing/src/gpu/mali/fp16/multiply_mali_fp16.h +++ b/compute/tensor/src/gpu/mali/fp16/argmax_mali_fp16.h @@ -11,21 +11,21 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#ifndef _MULTIPLY_MALI_FP16 -#define _MULTIPLY_MALI_FP16 +#ifndef _ARGMAX_MALI_FP16 +#define _ARGMAX_MALI_FP16 #include "sys.h" -#include "type.h" -#include "tensor_desc.h" #include "error.h" +#include "types.h" #include "tensor_computing_type.h" +EE argmax_infer_forward_tmp_bytes_mali_fp16( + TensorDesc inputDesc, I32 axis, TensorDesc outputDesc, U32 *bytes); -EE multiply_mali_fp16(GCLHandle_t handle, - void* alpha, - void* beta, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output); +EE argmax_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + I32 axis, + GCLMem_t tmpbuf, + TensorDesc outputDesc, + GCLMem_t output); #endif diff --git a/compute/tensor/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.cpp new file mode 100644 index 00000000..99a2400d --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.cpp @@ -0,0 +1,132 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/bilateral_slice_apply_mali_fp16.h" + +inline EE bilateral_slice_apply_checkpara_mali_fp16( + TensorDesc inputDesc, TensorDesc guideDesc, TensorDesc gridDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != guideDesc.dt || inputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + if (inputDesc.dt != gridDesc.dt || inputDesc.dt != outputDesc.dt) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE bilateral_slice_apply_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc guideDesc, + const GCLMem_t guide, + TensorDesc gridDesc, + const GCLMem_t grid, + BilateralSliceApplyParamSpec bilateralSliceApplyParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output) +{ + UNUSED(guideDesc); + UNUSED(forwardRunInfo); + U32 iw, ih, ic, in; + U32 gw, gh, gc, gn; + U32 ow, oh, oc, on; + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + tensorSelectGet(gridDesc, NULL, NULL, &gn, &gc, &gh, &gw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + + U32 coe = bilateralSliceApplyParamSpec.coefficient_len; + BilateralSliceApplyMode mode = bilateralSliceApplyParamSpec.mode; + // bool has_offset = bilateralSliceApplyParamSpec.has_offset; + U32 dep = gc / coe; + U32 gcw = gc * gw; + U32 wh = iw * ih; + F32 scale_x = (F32)gw / iw; + F32 scale_y = (F32)gh / ih; + Mem inbuf, gridbuf, guidebuf, outbuf, gridTran; + inbuf = input->mem; + gridbuf = grid->mem; + outbuf = output->mem; + gridTran = tmpBuf->mem; + if (mode == BSliceApply_NULL) { + guidebuf = guide->mem; + } else { + guidebuf = inbuf; + } + + U32 gs0[3] = {gc / 4, gw, ih}; + U32 ls0[3] = {0, 0, 0}; + U32 dim0 = 3; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "bilateral_slice_apply_pre", &kernel)); + CHECK_STATUS( + gcl_set_kernelArgs(kernel, gh, gc, gcw, gs0[0], gs0[1], scale_y, gridbuf, gridTran)); + gcl_set_kernelVec(handle, kernel, dim0, gs0, ls0, "bilateral_slice_apply_pre"); + +#ifdef _DEBUG + CHECK_STATUS( + gcl_run_kernel_profiling(handle, kernel, dim0, gs0, ls0, "bilateral_slice_apply_pre")); + CHECK_STATUS(gcl_print_memory(handle, grid, "bilateral_slice_apply_grid")); +#endif + char kernelname[128]; + if (mode == BSliceApply_CONV) { + sprintf(kernelname, "bilateral_slice_apply_c12_conv"); + } else { + sprintf(kernelname, "bilateral_slice_apply_c12"); + } + U32 gs[2] = {ow, oh}; + U32 
ls[2] = {0, 0}; + U32 dim = 2; + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw, wh, gc, gw, gh, gcw, dep, coe, gs[0], gs[1], + scale_x, scale_y, guidebuf, gridTran, inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); + +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel_profiling(handle, kernel, dim, gs, ls, kernelname)); + CHECK_STATUS(gcl_print_memory(handle, input, "bilateral_slice_apply_input")); + CHECK_STATUS(gcl_print_memory(handle, output, "bilateral_slice_apply_output")); + if (mode == BSliceApply_NULL) { + CHECK_STATUS(gcl_print_memory(handle, guide, "bilateral_slice_apply_guide")); + } +#endif + return SUCCESS; +} + +EE bilateral_slice_apply_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc guideDesc, + const GCLMem_t guide, + TensorDesc gridDesc, + const GCLMem_t grid, + BilateralSliceApplyParamSpec bilateralSliceApplyParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output) +{ + UNUSED(tmpBytes); + CHECK_STATUS( + bilateral_slice_apply_checkpara_mali_fp16(inputDesc, guideDesc, gridDesc, outputDesc)); + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS(bilateral_slice_apply_core_mali_fp16(handle, inputDesc, input, guideDesc, guide, + gridDesc, grid, bilateralSliceApplyParamSpec, forwardRunInfo, tmpBuf, outputDesc, output)); + return SUCCESS; +} diff --git a/tensor_computing/src/cpu/arm/int8/depthwise_convolution.h b/compute/tensor/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.h similarity index 59% rename from tensor_computing/src/cpu/arm/int8/depthwise_convolution.h rename to compute/tensor/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.h index 96c7b464..538b9790 100644 --- a/tensor_computing/src/cpu/arm/int8/depthwise_convolution.h +++ b/compute/tensor/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.h @@ -1,36 +1,34 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#ifndef _H_DEPTHWISE_CONVOLUTION -#define _H_DEPTHWISE_CONVOLUTION -#ifdef _USE_INT8 -#include <arm_neon.h> - +#ifndef _BILATERAL_SLICE_APPLY_MALI_FP16 +#define _BILATERAL_SLICE_APPLY_MALI_FP16 #include "sys.h" -#include "type.h" #include "error.h" -#include "tensor_desc.h" +#include "types.h" #include "tensor_computing_type.h" -EE depthwise_pointwise_convolution_direct(TensorDesc inputDesc, INT8* inArray, - TensorDesc filterDesc, const INT8* filterArray, - ConvolutionDesc convDesc, - TensorDesc biasDesc, const I32* biasArray, - U32 tmpBytes, void* tmp, - TensorDesc outputDesc, I32* outArray, - ActivationDesc depthwiseActivationDesc, - ActivationDesc pointwiseActivationDesc, - Arch arch); -#endif +EE bilateral_slice_apply_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc guideDesc, + const GCLMem_t guide, + TensorDesc gridDesc, + const GCLMem_t grid, + BilateralSliceApplyParamSpec bilateralSliceApplyParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output); #endif diff --git a/compute/tensor/src/gpu/mali/fp16/channel_resize_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/channel_resize_mali_fp16.cpp new file mode 100644 index 00000000..d9c4c29d --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/channel_resize_mali_fp16.cpp @@ -0,0 +1,92 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
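+
+// channel_resize pads or truncates the channel dimension from p.channel_before to
+// p.channel_after. The core below dispatches purely on the input/output memory formats;
+// a summary of that dispatch (kernel names and grid sizes as set by the code that follows):
+//   DF_NCHW   -> DF_NCHW   : "channel_resize_nchw",        gs = {(iw+3)/4, ih, channel_after}
+//   DF_NCHW   -> DF_NCWHC4 : "channel_resize_nchw_ncwhc4", gs = {(iw+3)/4, ih, (channel_after+3)/4}
+//   DF_NCWHC4 -> DF_NCHW   : "channel_resize_ncwhc4_nchw", gs = {ih, (iw+3)/4, (channel_after+3)/4}
+//   default (NCWHC4 both)  : "channel_resize",             gs = {ih, iw, (channel_after+3)/4}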
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/channel_resize_mali_fp16.h" + +inline EE channel_resize_checkpara_mali_fp16(TensorDesc inputDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE channel_resize_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + ChannelResizeParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + U32 iw, ih, ic, in, oc; + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + tensorSelectGet(outputDesc, NULL, NULL, NULL, &oc, NULL, NULL); + U32 iw_str, ih_str, ic_str, iw_off, ih_off; + U32 ow_str, oh_str, ow_off, oh_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, &ic_str, &iw_off, &ih_off); + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + cl_mem inbuf, outbuf; + inbuf = input->mem; + outbuf = output->mem; + DataFormat imf = input->desc.memFormat; + DataFormat omf = output->desc.memFormat; + U32 dim = 3; + Kernel kernel; + char kernelName[128]; + U32 gs[3] = {ih, iw, (U32)(p.channel_after + 3) / 4}; + U32 ls[3] = {0, 0, 0}; + sprintf(kernelName, "channel_resize"); + if (imf == DF_NCHW && omf == DF_NCHW) { + gs[0] = (iw + 3) / 4; + gs[1] = ih; + gs[2] = p.channel_after; + sprintf(kernelName, "channel_resize_nchw"); + } + + if (imf == DF_NCHW && omf == DF_NCWHC4) { + gs[0] = (iw + 3) / 4; + gs[1] = ih; + gs[2] = (p.channel_after + 3) / 4; + sprintf(kernelName, "channel_resize_nchw_ncwhc4"); + } + + if (imf == DF_NCWHC4 && omf == DF_NCHW) { + gs[0] = ih; + gs[1] = (iw + 3) / 4; + gs[2] = (p.channel_after + 3) / 4; + sprintf(kernelName, "channel_resize_ncwhc4_nchw"); + } + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, iw_str, ic_str, ih_off, iw_off, oh_str, ow_str, + oh_off, ow_off, p.channel_before, p.channel_after, iw, gs[0], gs[1], inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); +#endif + return SUCCESS; +} + +EE channel_resize_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + ChannelResizeParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + CHECK_STATUS(channel_resize_checkpara_mali_fp16(inputDesc, outputDesc)); + CHECK_STATUS(channel_resize_core_mali_fp16(handle, inputDesc, input, p, outputDesc, output)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/channel_resize_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/channel_resize_mali_fp16.h new file mode 100644 index 00000000..c020a482 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/channel_resize_mali_fp16.h @@ -0,0 +1,26 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _CHANNEL_RESIZE_MALI_FP16 +#define _CHANNEL_RESIZE_MALI_FP16 +#include "sys.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE channel_resize_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + ChannelResizeParamSpec p, + TensorDesc outputDesc, + GCLMem_t output); +#endif diff --git a/tensor_computing/src/gpu/mali/fp16/clip_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/clip_mali_fp16.cpp similarity index 61% rename from tensor_computing/src/gpu/mali/fp16/clip_mali_fp16.cpp rename to compute/tensor/src/gpu/mali/fp16/clip_mali_fp16.cpp index 25f0a554..7c94b353 100644 --- a/tensor_computing/src/gpu/mali/fp16/clip_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/clip_mali_fp16.cpp @@ -11,30 +11,29 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include "sys.h" -#include "type.h" -#include "tensor_desc.h" #include "error.h" -#include "tensor_computing_type.h" +#include "types.h" #include "gpu/mali/fp16/clip_mali_fp16.h" -inline EE clip_checkpara_mali_fp16(TensorDesc inputDesc, - TensorDesc outputDesc) { - if(inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_F16) return NOT_SUPPORTED; - return SUCCESS; +inline EE clip_checkpara_mali_fp16(TensorDesc inputDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; } inline EE clip_core_mali_fp16(GCLHandle_t handle, - float* min_value, - float* max_value, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { + TensorDesc inputDesc, + GCLMem_t input, + ClipParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ UNUSED(outputDesc); U32 iw, ih, ic, in; - tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); U32 iw_str, ih_str, iw_off, ih_off; U32 ow_str, oh_str, ow_off, oh_off; ih_str = input->desc.stride[0]; @@ -46,36 +45,36 @@ inline EE clip_core_mali_fp16(GCLHandle_t handle, oh_off = output->desc.offset[0]; ow_off = output->desc.offset[1]; cl_mem inbuf, outbuf; - inbuf = input->mem; + inbuf = input->mem; outbuf = output->mem; - float min = *min_value; - float max = *max_value; U32 gs[3] = {ih, iw, (ic + 3) / 4}; U32 ls[3] = {0, 0, 0}; - U32 dim = 3; + U32 dim = 3; Kernel kernel; - CHECK_STATUS(gcl_create_kernel_binary(handle, "clip", &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ih_str, iw_str, ih_off, iw_off, oh_str, ow_str, oh_off, ow_off, min, max, inbuf, outbuf)); + CHECK_STATUS(gcl_create_kernel(handle, "clip", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ih_str, iw_str, ih_off, iw_off, oh_str, ow_str, + oh_off, ow_off, p.min, p.max, inbuf, outbuf)); gcl_set_kernelVec(handle, kernel, dim, gs, ls, "clip"); #ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, input, "clip_input")); + 
CHECK_STATUS(gcl_print_memory(handle, input, "clip_input")); CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "clip")); CHECK_STATUS(gcl_print_memory(handle, output, "clip_output")); #endif - return SUCCESS; + return SUCCESS; } - EE clip_mali_fp16(GCLHandle_t handle, - void* min_value, - void* max_value, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { + TensorDesc inputDesc, + GCLMem_t input, + ClipParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ CHECK_STATUS(clip_checkpara_mali_fp16(inputDesc, outputDesc)); - CHECK_STATUS(clip_core_mali_fp16(handle, (float*)min_value, (float*)max_value, inputDesc, input, outputDesc, output)); - return SUCCESS; + if (input->mem != output->mem) { + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + } + CHECK_STATUS(clip_core_mali_fp16(handle, inputDesc, input, p, outputDesc, output)); + return SUCCESS; } - diff --git a/tensor_computing/src/gpu/mali/fp16/clip_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/clip_mali_fp16.h similarity index 81% rename from tensor_computing/src/gpu/mali/fp16/clip_mali_fp16.h rename to compute/tensor/src/gpu/mali/fp16/clip_mali_fp16.h index ccec6caf..fecb9e65 100644 --- a/tensor_computing/src/gpu/mali/fp16/clip_mali_fp16.h +++ b/compute/tensor/src/gpu/mali/fp16/clip_mali_fp16.h @@ -11,20 +11,17 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _CLIP_MALI_FP16 #define _CLIP_MALI_FP16 #include "sys.h" -#include "type.h" -#include "tensor_desc.h" #include "error.h" +#include "types.h" #include "tensor_computing_type.h" EE clip_mali_fp16(GCLHandle_t handle, - void* min_value, - void* max_value, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output); + TensorDesc inputDesc, + GCLMem_t input, + ClipParamSpec p, + TensorDesc outputDesc, + GCLMem_t output); #endif diff --git a/compute/tensor/src/gpu/mali/fp16/concat_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/concat_mali_fp16.cpp new file mode 100644 index 00000000..57a07e7d --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/concat_mali_fp16.cpp @@ -0,0 +1,248 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
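+
+// concat walks the input list in groups of at most four tensors per kernel launch:
+// bn = (num + 3) / 4 launches, the last handling en = (num & 3) inputs (or 4 when num is a
+// multiple of 4). For example, num = 6 gives two launches covering 4 and then 2 inputs.
+// When concatenating along channels and any input has a channel count not divisible by 4,
+// the NCWHC4 output cannot be assembled in place: the pieces are first written to tmpbuf in
+// NCHW order ("concat_nonalign_c_p1_*") and then converted back with
+// "mem_trans_nchw_to_ncwhc4_input_tran", which is also why
+// concat_infer_forward_tmp_bytes_mali_fp16 reserves the sum of all input sizes in that case.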
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/concat_mali_fp16.h" + +inline EE concat_checkpara_mali_fp16(std::vector<TensorDesc> inputDesc, TensorDesc outputDesc) +{ + for (auto it : inputDesc) { + if (it.dt != outputDesc.dt) { + return NOT_SUPPORTED; + } + } + if (outputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE concat_core_mali_fp16(GCLHandle_t handle, + std::vector<TensorDesc> inputDesc, + std::vector<void *> input, + TensorDesc outputDesc, + GCLMem_t output, + GCLMem_t tmpbuf, + I32 concatDim) +{ + U32 ow, oh, oc; + tensorSelectGet(outputDesc, NULL, NULL, NULL, &oc, &oh, &ow); + U32 ow_str, oh_str, oc_str, ow_off, oh_off; + get_gclmem_dim(output->desc, &ow_str, &oh_str, &oc_str, &ow_off, &oh_off); + U32 num = input.size(); + GCLMem_t inputMem[4]; + cl_mem inbuf[4]; + I32 dim = outputDesc.nDims; + concatDim = (concatDim + dim) % dim; + concatDim = dim - 1 - concatDim; + char kernelName[128]; + char dimName[128]; + U32 axis; + if (inputDesc[0].df == DF_NCHW) { + switch (concatDim) { + case 0: + strcpy(dimName, "w"); + axis = 1; + break; + case 1: + strcpy(dimName, "h"); + axis = 0; + break; + case 2: + strcpy(dimName, "c"); + axis = 2; + break; + default: + CHECK_STATUS(NOT_SUPPORTED); + } + } + if (inputDesc[0].df == DF_MKT) { + concatDim = 1 - concatDim; + } + if (inputDesc[0].df == DF_MKT || inputDesc[0].df == DF_MTK) { + switch (concatDim) { + case 0: + strcpy(dimName, "c"); + axis = 2; + break; + case 1: + strcpy(dimName, "h"); + axis = 0; + break; + default: + CHECK_STATUS(NOT_SUPPORTED); + } + } + bool concatDimCAlign = true; + if (axis == 2) { + for (auto p : inputDesc) { + U32 tc; + tensorSelectGet(p, NULL, NULL, NULL, &tc, NULL, NULL); + if (tc % 4 != 0) { + concatDimCAlign = false; + break; + } + } + } + U32 ic[4]; + U32 axis_len[4]; + U32 bn = (num + 3) / 4; + U32 en, nmax, axis_max; + U32 out_size = 0; + U32 ih_str[4]; + U32 iw_str[4]; + U32 ih_off[4]; + U32 iw_off[4]; + U32 oh_val = oh_str; + U32 ohw_val = oh_str * ow_str; + U32 oh_off_val = oh_off; + U32 ow_off_val = ow_off; + cl_mem outbuf = output->mem; + if (!concatDimCAlign) { + oh_val = oh; + ohw_val = oh * ow; + oh_off_val = 0; + ow_off_val = 0; + outbuf = tmpbuf->mem; + } + for (U32 i = 0; i < bn; i++) { + en = (i * 4 + 4 <= num) ?
4 : (num & 3); + axis_max = 0; + nmax = en - 1; + for (U32 j = 0; j < en; ++j) { + inputMem[j] = (GCLMem_t)input[i * 4 + j]; + inbuf[j] = inputMem[j]->mem; + get_gclmem_dim(inputMem[j]->desc, &iw_str[j], &ih_str[j], NULL, &iw_off[j], &ih_off[j]); + } + for (U32 j = 0; j < en; ++j) { + axis_len[j] = inputDesc[i * 4 + j].dims[concatDim]; + ic[j] = 0; + if (axis == 2) { + ic[j] = axis_len[j]; + axis_len[j] = (axis_len[j] + 3) / 4; + } + axis_max += axis_len[j]; + } + U32 gs[3] = {oh, ow, (oc + 3) / 4}; + gs[axis] = axis_max; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + axis_max -= axis_len[nmax]; + if (!concatDimCAlign) { + sprintf(kernelName, "concat_nonalign_c_p1_%d", en); + } else { + sprintf(kernelName, "concat_%s%d", dimName, en); + } + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + switch (en) { + case 1: + CHECK_STATUS(gcl_set_kernelArgs(kernel, oh_val, ohw_val, oh_off_val, ow_off_val, + axis_max, nmax, out_size, gs[0], gs[1], ih_str[0], iw_str[0], ih_off[0], + iw_off[0], ic[0], inbuf[0], outbuf)); + break; + case 2: + CHECK_STATUS(gcl_set_kernelArgs(kernel, oh_val, ohw_val, oh_off_val, ow_off_val, + axis_max, nmax, out_size, gs[0], gs[1], ih_str[0], iw_str[0], ih_off[0], + iw_off[0], ic[0], inbuf[0], ih_str[1], iw_str[1], ih_off[1], iw_off[1], ic[1], + axis_len[0], inbuf[1], outbuf)); + break; + case 3: + CHECK_STATUS(gcl_set_kernelArgs(kernel, oh_val, ohw_val, oh_off_val, ow_off_val, + axis_max, nmax, out_size, gs[0], gs[1], ih_str[0], iw_str[0], ih_off[0], + iw_off[0], ic[0], inbuf[0], ih_str[1], iw_str[1], ih_off[1], iw_off[1], ic[1], + axis_len[0], inbuf[1], ih_str[2], iw_str[2], ih_off[2], iw_off[2], ic[2], + axis_len[1], inbuf[2], outbuf)); + break; + case 4: + CHECK_STATUS(gcl_set_kernelArgs(kernel, oh_val, ohw_val, oh_off_val, ow_off_val, + axis_max, nmax, out_size, gs[0], gs[1], ih_str[0], iw_str[0], ih_off[0], + iw_off[0], ic[0], inbuf[0], ih_str[1], iw_str[1], ih_off[1], iw_off[1], ic[1], + axis_len[0], inbuf[1], ih_str[2], iw_str[2], ih_off[2], iw_off[2], ic[2], + axis_len[1], inbuf[2], ih_str[3], iw_str[3], ih_off[3], iw_off[3], ic[3], + axis_len[2], inbuf[3], outbuf)); + break; + default: + return NOT_SUPPORTED; + } + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); + if (!concatDimCAlign) { + out_size += oh * ow * (ic[0] + ic[1] + ic[2] + ic[3]); + } else { + if (axis == 0) { + out_size += gs[0] * 4; + } + if (axis == 1) { + out_size += oh_str * gs[1] * 4; + } + if (axis == 2) { + out_size += oh_str * ow_str * gs[2] * 4; + } + } +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); +#endif + } + if (!concatDimCAlign) { + U32 gs[3] = {(oh + 3) / 4, ow, (oc + 3) / 4}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "mem_trans_nchw_to_ncwhc4_input_tran", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ow, oh, 0, 0, ow_str, oh_str, ow_off, oh_off, ow, + oh, oc, ow, oh, oc, 0, 0, tmpbuf->mem, output->mem)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "mem_trans_nchw_to_ncwhc4_input_tran"); +#ifdef _DEBUG + CHECK_STATUS( + gcl_run_kernel(handle, kernel, dim, gs, ls, "mem_trans_nchw_to_ncwhc4_input_tran")); +#endif + } + return SUCCESS; +} + +EE concat_infer_forward_tmp_bytes_mali_fp16(std::vector<TensorDesc> inputDesc, U32 *bytes) +{ + *bytes = 0; + bool concatDimCAlign = true; + for (auto p : inputDesc) { + U32 tc; + tensorSelectGet(p, NULL, NULL, NULL, &tc, NULL, NULL); + if (tc % 4 != 0) { + concatDimCAlign = false; + break; + } + } + if
(!concatDimCAlign) { + for (auto p : inputDesc) { + *bytes += tensorNumBytes(p); + } + } + return SUCCESS; +} + +EE concat_mali_fp16(GCLHandle_t handle, + std::vector<TensorDesc> inputDesc, + std::vector<void *> input, + TensorDesc outputDesc, + GCLMem_t output, + GCLMem_t tmpbuf, + I32 concatDim) +{ + CHECK_STATUS(concat_checkpara_mali_fp16(inputDesc, outputDesc)); + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS( + concat_core_mali_fp16(handle, inputDesc, input, outputDesc, output, tmpbuf, concatDim)); + return SUCCESS; +} diff --git a/tensor_computing/src/gpu/mali/fp16/concat_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/concat_mali_fp16.h similarity index 77% rename from tensor_computing/src/gpu/mali/fp16/concat_mali_fp16.h rename to compute/tensor/src/gpu/mali/fp16/concat_mali_fp16.h index baa13080..7b5e9757 100644 --- a/tensor_computing/src/gpu/mali/fp16/concat_mali_fp16.h +++ b/compute/tensor/src/gpu/mali/fp16/concat_mali_fp16.h @@ -11,19 +11,20 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _CONCAT_MALI_FP16 #define _CONCAT_MALI_FP16 #include "sys.h" -#include "type.h" -#include "tensor_desc.h" #include "error.h" +#include "types.h" #include "tensor_computing_type.h" -EE concat_mali_fp16(GCLHandle_t handle, - std::vector<TensorDesc> inputDesc, - std::vector<void *> input, - TensorDesc outputDesc, - GCLMem_t output, - U32 concatDim); +EE concat_infer_forward_tmp_bytes_mali_fp16(std::vector<TensorDesc> inputDesc, U32 *bytes); + +EE concat_mali_fp16(GCLHandle_t handle, + std::vector<TensorDesc> inputDesc, + std::vector<void *> input, + TensorDesc outputDesc, + GCLMem_t output, + GCLMem_t tmpbuf, + I32 concatDim); #endif diff --git a/compute/tensor/src/gpu/mali/fp16/convolution_direct_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/convolution_direct_mali_fp16.cpp new file mode 100644 index 00000000..3f7588c6 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/convolution_direct_mali_fp16.cpp @@ -0,0 +1,453 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
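+
+// Direct convolution is split into three cores, selected in convolution_direct_mali_fp16 at
+// the bottom of this file: a NCHW -> NCWHC4 core (also covering 3D input as DF_NCTHW), a
+// single-output-channel special case (direct_core_fn_spe), and the general NCWHC4 core.
+// Kernel names encode the tile sizes taken from forwardRunInfo; for the general core the
+// pattern is conv_direct_s<sw>_<act><fw><item_w><item_k/4>. An illustrative (not exhaustive)
+// example: sw = 1, ReLU, fw = 3, item_w = 4, item_k = 8 selects "conv_direct_s1_relu_342".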
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/convolution_mali_fp16.h" +#include "gpu/mali/fp16/convolution_direct_mali_fp16.h" + +inline EE direct_core_nchw_to_ncwhc4_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + UNUSED(tmpBuf); + cl_mem inbuf, biasmem, outbuf, fltbuf; + inbuf = input->mem; + fltbuf = filter->mem; + biasmem = bias->mem; + outbuf = output->mem; + DataFormat df; + U32 iw, ih, it; + U32 fw, fh, fn, ft, sw, sh, st, pw, ph, pt; + U32 ow, oh, oc, on, ot; + sw = convParamSpec.stride_w; + sh = convParamSpec.stride_h; + st = convParamSpec.stride_t; + ph = convParamSpec.padding_top; + pw = convParamSpec.padding_left; + pt = convParamSpec.padding_before; + + tensorSelectGet(inputDesc, NULL, &df, NULL, NULL, &ih, &iw, &it); + tensorSelectGet(filterDesc, NULL, NULL, &fn, NULL, &fh, &fw, &ft); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow, &ot); + U32 iw_str, ih_str, iwh_str, ic_str, iw_off, ih_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, &ic_str, &iw_off, &ih_off); + iw_off -= pw; + ih_off -= ph; + iwh_str = iw_str * ih_str; + ic_str = ic_str / it; + + U32 ow_str, oh_str, ow_off, oh_off; + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + + U32 item_w = forwardRunInfo->best_w[0]; + char kernelname[128]; + char modeName[16]; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim; + Kernel kernel; + switch (activationMode) { + case ACTIVATION_RELU: + strcpy(modeName, "relu_"); + break; + case ACTIVATION_RELU6: + strcpy(modeName, "relu6_"); + break; + case ACTIVATION_NULL: + strcpy(modeName, ""); + break; + default: + return NOT_SUPPORTED; + } + + gs[0] = (ow + item_w - 1) / item_w; + gs[1] = oh; + gs[2] = (oc + 3) / 4 * on * ot; + dim = 3; + if (df == DF_NCTHW) { + sprintf(kernelname, "conv_direct_3d_sw%d_nchw_to_ncwhc4_%s%d%d%d", sw, modeName, fw, ft, + item_w); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, iwh_str, ic_str, iw_off, ih_off, oh_str, + ow_str, oh_off, ow_off, ow, ot, it, pt, sh, st, gs[0], gs[1], inbuf, fltbuf, biasmem, + outbuf)); + } else { + sprintf(kernelname, "conv_direct_s%d_nchw_to_ncwhc4_%s%d%d", sw, modeName, fw, item_w); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, iwh_str, ic_str, iw_off, ih_off, oh_str, + ow_str, oh_off, ow_off, ow, gs[0], gs[1], inbuf, fltbuf, biasmem, outbuf)); + } + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); + +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; +} + +inline EE direct_core_fn_spe(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t
bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + UNUSED(tmpBuf); + cl_mem inbuf, biasmem, outbuf, fltbuf; + inbuf = input->mem; + fltbuf = filter->mem; + biasmem = bias->mem; + outbuf = output->mem; + U32 iw, ih; + U32 fw, fh, fn, sw, sh, pw, ph; + U32 ow, oh, oc, on; + sw = convParamSpec.stride_w; + sh = convParamSpec.stride_h; + ph = convParamSpec.padding_top; + pw = convParamSpec.padding_left; + tensorSelectGet(inputDesc, NULL, NULL, NULL, NULL, &ih, &iw); + tensorSelectGet(filterDesc, NULL, NULL, &fn, NULL, &fh, &fw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + + U32 iw_str, ih_str, ihw_str, ic_str, iw_off, ih_off; + U32 ow_str, oh_str, ow_off, oh_off, ohw_str; + get_gclmem_dim(input->desc, &iw_str, &ih_str, &ic_str, &iw_off, &ih_off); + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + iw_off -= pw; + ih_off -= ph; + ihw_str = iw_str * ih_str; + ohw_str = ow_str * oh_str; + + U32 item_w = forwardRunInfo->best_w[0]; + char kernelname[128]; + char modeName[16]; + char outFormat[16] = ""; + if (output->desc.memFormat == DF_NCHW) { + strcpy(outFormat, "nchw_"); + } + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim; + Kernel kernel; + switch (activationMode) { + case ACTIVATION_RELU: + strcpy(modeName, "relu_"); + break; + case ACTIVATION_RELU6: + strcpy(modeName, "relu6_"); + break; + case ACTIVATION_NULL: + strcpy(modeName, ""); + break; + default: + return NOT_SUPPORTED; + } + sprintf(kernelname, "conv_direct_s%d_fn_spe_%s%s%d%d", sw, modeName, outFormat, fw, item_w); + gs[0] = oh; + gs[1] = (ow + item_w - 1) / item_w; + dim = 2; + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, ihw_str, ic_str, ih_off, iw_off, oh_str, ow_str, + ohw_str, oh_off, ow_off, ow, sh, gs[0], gs[1], inbuf, fltbuf, biasmem, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); + +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; +} +inline EE direct_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + UNUSED(tmpBuf); + cl_mem inbuf, biasmem, outbuf, fltbuf; + inbuf = input->mem; + fltbuf = filter->mem; + biasmem = bias->mem; + outbuf = output->mem; + U32 iw, ih; + U32 fw, fh, fn, sw, sh, pw, ph; + U32 ow, oh, oc, on; + sw = convParamSpec.stride_w; + sh = convParamSpec.stride_h; + ph = convParamSpec.padding_top; + pw = convParamSpec.padding_left; + tensorSelectGet(inputDesc, NULL, NULL, NULL, NULL, &ih, &iw); + tensorSelectGet(filterDesc, NULL, NULL, &fn, NULL, &fh, &fw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + + U32 iw_str, ih_str, ihw_str, ic_str, iw_off, ih_off; + U32 ow_str, oh_str, ohw_str, ow_off, oh_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, &ic_str, &iw_off, &ih_off); + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + ih_off -= ph; + iw_off -= pw; + ihw_str = ih_str * iw_str; + ohw_str = oh_str * ow_str; + + U32 item_w = forwardRunInfo->best_w[0]; + U32 
item_c = forwardRunInfo->best_c[0]; + U32 item_k = forwardRunInfo->best_k[0]; + char kernelname[128]; + char modeName[16]; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim; + Kernel kernel; + switch (activationMode) { + case ACTIVATION_RELU: + strcpy(modeName, "relu_"); + break; + case ACTIVATION_RELU6: + strcpy(modeName, "relu6_"); + break; + case ACTIVATION_NULL: + strcpy(modeName, ""); + break; + default: + return NOT_SUPPORTED; + } + if (item_k == 0) { + if ((ih_str > 1 || iw_str > 1) && (item_c != 4)) { + CHECK_STATUS(NOT_SUPPORTED); + } + sprintf(kernelname, "conv_direct_spe_fwhs1_%s%d", modeName, item_c); + ic_str = filter->desc.stride[1]; + ow = fn; + gs[0] = fn; + gs[1] = 1; + gs[2] = 1; + dim = 1; + } else if ((item_w >> 8) > 0) { + U32 item_h = item_w >> 8; + item_k = item_k >> 2; + sprintf(kernelname, "conv_direct_s%d_h_%s%d%d%d", sw, modeName, fw, item_h, item_k); + gs[0] = (oh + item_h - 1) / item_h; + gs[1] = ow; + gs[2] = (oc + 3) / 4 * on / item_k; + dim = 3; + } else { + item_k = item_k >> 2; + sprintf(kernelname, "conv_direct_s%d_%s%d%d%d", sw, modeName, fw, item_w, item_k); + if (fw != fh) { + sprintf( + kernelname, "conv_direct_wh_s%d_%s%d%d%d%d", sw, modeName, fw, fh, item_w, item_k); + } + gs[0] = oh; + gs[1] = (ow + item_w - 1) / item_w; + gs[2] = (oc + 3) / 4 * on / item_k; + dim = 3; + } + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + if (item_k == 0 || fw != fh) { + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, ihw_str, ic_str, ih_off, iw_off, oh_str, + ohw_str, oh_off, ow_off, ow, gs[0], gs[1], inbuf, fltbuf, biasmem, outbuf)); + } else { + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, ihw_str, ic_str, ih_off, iw_off, oh_str, + ohw_str, oh_off, ow_off, ow, sh, gs[0], gs[1], inbuf, fltbuf, biasmem, outbuf)); + } + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; +} + +EE convolution_direct_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes) +{ + U32 fw, fh, fc, fn; + tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); + U32 item_c = forwardRunInfo->best_c[0]; + U32 item_k = forwardRunInfo->best_k[0]; + U32 s0 = 0; + U32 s1 = 0; + U32 s2 = 0; + U32 num = 0; + U32 byteSize; + if (item_k == 0) { + s0 = fn; + s1 = (fc + item_c - 1) / item_c; + s2 = 1; + DataFormat df = DF_CHWNC4; + if (item_c == 8) { + df = DF_CHWNC8; + } + if (item_c == 16) { + df = DF_CHWNC16; + } + gclmemFilterDesc->memFormat = df; + num = s0 * s1 * s2 * item_c; + } else if (item_c == 4) { + U32 item_kd4 = (item_k == 1) ? 
1 : (item_k >> 2); + s0 = fw * fh * item_kd4; + s1 = (fc + item_c - 1) / item_c; + s2 = (fn + item_k - 1) / item_k; + gclmemFilterDesc->memFormat = DF_NCHWN4C4; + num = s0 * s1 * s2 * item_c * item_k / item_kd4; + } else if (item_c == 1) { + s0 = fw * fh; + s1 = fc; + s2 = (fn + item_k - 1) / item_k; + gclmemFilterDesc->memFormat = DF_NCHWN4; + num = s0 * s1 * s2 * item_k; + } + byteSize = num * bytesOf(DT_F16); + gclmemFilterDesc->stride[0] = s0; + gclmemFilterDesc->stride[1] = s1; + gclmemFilterDesc->stride[2] = s2; + gclmemFilterDesc->offset[0] = 0; + gclmemFilterDesc->offset[1] = 0; + gclmemFilterDesc->offset[2] = 0; + gclmemFilterDesc->num = num; + gclmemFilterDesc->byteSize = byteSize; + gclmemFilterDesc->memType = GCL_MEM_BUF; + gclmemFilterDesc->flags = CL_MEM_READ_WRITE; + gclmemFilterDesc->host_ptr = NULL; + *bytes = 0; + return SUCCESS; +} + +EE convolution_direct_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem) +{ + DataType fdt; + DataFormat fdf; + U32 fw, fh, fc, fn, ft; + tensorSelectGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw, &ft); + U32 fwh = fw * fh * ft; + U32 item_c = forwardRunInfo->best_c[0]; + U32 item_k = forwardRunInfo->best_k[0]; + U32 nk = item_k; + if (item_k == 0) { + item_k = fn; + } + char kernelname[128]; + Kernel kernel; + sprintf(kernelname, "conv_direct_trans_fltbuf_%d%d", item_c, nk); + CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, fwh, fc, fn, filter->mem, fltmem->mem)); + U32 gs[3] = {fwh, (fc + item_c - 1) / item_c, (fn + item_k - 1) / item_k * item_k}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + *fltmemDesc = tensor4df(fdt, fdf, fn, fc, fh, fw); + return SUCCESS; +} + +EE convolution_direct_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + UNUSED(inputDesc); + UNUSED(filterDesc); + UNUSED(outputDesc); + UNUSED(convParamSpec); + UNUSED(forwardRunInfo); + *bytes = 0; + return SUCCESS; +} + +EE convolution_direct_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + U32 fw, fh, fn, ih, iw, sw; + tensorSelectGet(filterDesc, NULL, NULL, &fn, NULL, &fh, &fw); + tensorSelectGet(inputDesc, NULL, NULL, NULL, NULL, &ih, &iw); + sw = convParamSpec.stride_w; + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + if ((fw == 1 && fh == 1 && ih == 1 && iw == 1) || input->desc.memFormat == DF_NCWHC4) { + if (fn == 1 && sw == 1 && (fw == fh) && (fw == 1 || fw == 3 || fw == 5 || fw == 7)) { + CHECK_STATUS(direct_core_fn_spe(handle, inputDesc, input, filterDesc, filter, + convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, + activationMode)); + } else { + CHECK_STATUS(direct_core_mali_fp16(handle, inputDesc, input, filterDesc, filter, + convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, + activationMode)); + } + } else if (input->desc.memFormat == DF_NCHW) { +
CHECK_STATUS(direct_core_nchw_to_ncwhc4_mali_fp16(handle, inputDesc, input, filterDesc, + filter, convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, + output, activationMode)); + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/convolution_direct_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/convolution_direct_mali_fp16.h new file mode 100644 index 00000000..067222d4 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/convolution_direct_mali_fp16.h @@ -0,0 +1,55 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_CONVOLUTION_DIRECT_MALI_FP16 +#define _H_CONVOLUTION_DIRECT_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE convolution_direct_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes); + +EE convolution_direct_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem); + +EE convolution_direct_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE convolution_direct_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode); + +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/convolution_direct_spe_ck_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/convolution_direct_spe_ck_mali_fp16.cpp new file mode 100644 index 00000000..595f3b67 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/convolution_direct_spe_ck_mali_fp16.cpp @@ -0,0 +1,188 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/convolution_mali_fp16.h" +#include "gpu/mali/fp16/convolution_direct_spe_ck_mali_fp16.h" + +inline EE direct_spe_ck_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + UNUSED(inputDesc); + UNUSED(forwardRunInfo); + UNUSED(biasDesc); + UNUSED(bias); + UNUSED(tmpBytes); + UNUSED(tmpBuf); + UNUSED(activationMode); + + cl_mem inbuf, outbuf, fltbuf; + inbuf = input->mem; + fltbuf = filter->mem; + outbuf = output->mem; + U32 fn, fc, fw, sw; + U32 ow, oh, oc, on; + sw = convParamSpec.stride_w; + tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, NULL, &fw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + Kernel kernel; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim; + char kernelname[128]; + + if (fn == 1 && fc == 4 && fw == 1) { // fc = orgfc + fn + U32 iw_str, ih_str; + iw_str = input->desc.stride[0]; + ih_str = input->desc.stride[1]; + U32 ow_str, oh_str, ow_off, oh_off; + ow_str = output->desc.stride[0]; + oh_str = output->desc.stride[1]; + ow_off = output->desc.offset[0]; + oh_off = output->desc.offset[1]; + if (output->desc.memFormat != DF_NCHW) { + return NOT_SUPPORTED; + } + U32 item_w = 2; + U32 item_h = 1; + U32 ew = ow % item_w; + gs[0] = (ow + item_w - 1) / item_w; + gs[1] = (oh + item_h - 1) / item_h; + dim = 2; + sprintf(kernelname, "conv_direct_s%d_spe_f1c3k1_%d", sw, ew); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ow_str, ow_off, oh_off, ow >> 1, gs[0], + gs[1], inbuf, fltbuf, outbuf)); // c = 3 k = 1, bias val has been set in fltbuf + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); + } else { + return NOT_SUPPORTED; + } + +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + CHECK_STATUS(gcl_print_memory(handle, input, "conv_direct_spe_ck_input")); + CHECK_STATUS(gcl_print_memory(handle, filter, "conv_direct_spe_ck_filter")); + CHECK_STATUS(gcl_print_memory(handle, output, "conv_direct_spe_ck_output")); +#endif + return SUCCESS; +} + +EE convolution_direct_spe_ck_transform_filter_bytes_mali_fp16(TensorDesc 
filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes) +{ + UNUSED(forwardRunInfo); + U32 fw, fh, fc, fn; + tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); + U32 s0, s1, s2; + U32 num, byteSize; + if (fn == 1 && fc == 3 && fw == 1) { + s0 = fw * fh; + s1 = fc + fn; // set bias val in flt + s2 = fn; + gclmemFilterDesc->memFormat = DF_NCHW; + } else { + return NOT_SUPPORTED; + } + num = s0 * s1 * s2; + byteSize = num * bytesOf(DT_F16); + gclmemFilterDesc->stride[0] = s0; + gclmemFilterDesc->stride[1] = s1; + gclmemFilterDesc->stride[2] = s2; + gclmemFilterDesc->offset[0] = 0; + gclmemFilterDesc->offset[1] = 0; + gclmemFilterDesc->offset[2] = 0; + gclmemFilterDesc->num = num; + gclmemFilterDesc->byteSize = byteSize; + gclmemFilterDesc->memType = GCL_MEM_BUF; + gclmemFilterDesc->flags = CL_MEM_READ_ONLY; + gclmemFilterDesc->host_ptr = NULL; + *bytes = 0; + return SUCCESS; +} + +EE convolution_direct_spe_ck_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem) +{ + UNUSED(forwardRunInfo); + DataType fdt; + DataFormat fdf; + U32 fw, fh, fc, fn; + tensorSelectGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw); + *fltmemDesc = tensor4df(fdt, fdf, fn, fc + fn, fh, fw); // set bias val in flt + U32 size = tensorNumBytes(*fltmemDesc); + CHECK_STATUS(gcl_trans_memory(handle, filter, fltmem, &size, DEVICE_BUF_TO_BUF, CL_FALSE)); +#ifdef _DEBUG + CHECK_STATUS(gcl_print_memory(handle, filter, "conv_direct_spe_ck_filter_org")); + CHECK_STATUS(gcl_print_memory(handle, fltmem, "conv_direct_spe_ck_filter_tran")); +#endif + return SUCCESS; +} + +EE convolution_direct_spe_ck_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + UNUSED(inputDesc); + UNUSED(filterDesc); + UNUSED(outputDesc); + UNUSED(convParamSpec); + UNUSED(forwardRunInfo); + *bytes = 0; + return SUCCESS; +} + +EE convolution_direct_spe_ck_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS( + direct_spe_ck_core_mali_fp16(handle, inputDesc, input, filterDesc, filter, convParamSpec, + forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, activationMode)); + + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/convolution_direct_spe_ck_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/convolution_direct_spe_ck_mali_fp16.h new file mode 100644 index 00000000..877c39cb --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/convolution_direct_spe_ck_mali_fp16.h @@ -0,0 +1,55 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_CONVOLUTION_DIRECT_SPE_CK_MALI_FP16 +#define _H_CONVOLUTION_DIRECT_SPE_CK_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE convolution_direct_spe_ck_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes); + +EE convolution_direct_spe_ck_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem); + +EE convolution_direct_spe_ck_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE convolution_direct_spe_ck_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode); + +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.cpp new file mode 100644 index 00000000..f37147ea --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.cpp @@ -0,0 +1,201 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" + +#include "gpu/mali/fp16/convolution_mali_fp16.h" +#include "gpu/mali/fp16/convolution_direct_mali_fp16.h" +#include "gpu/mali/fp16/convolution_wino_mali_fp16.h" +#include "gpu/mali/fp16/convolution_direct_spe_ck_mali_fp16.h" + +inline EE convolution_checkpara_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + const GCLMem_t bias, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (nullptr == handle || nullptr == input || nullptr == filter || nullptr == output || + nullptr == bias) { + return NULL_POINTER; + } + if (inputDesc.dt != outputDesc.dt || inputDesc.dt != filterDesc.dt || inputDesc.dt != DT_F16) { + return NOT_MATCH; + } + + U32 ic, fc, fn, fh, fw, oc; + CHECK_STATUS(tensorSelectGet(inputDesc, NULL, NULL, NULL, &ic, NULL, NULL)); + CHECK_STATUS(tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensorSelectGet(outputDesc, NULL, NULL, NULL, &oc, NULL, NULL)); + + if (input->desc.memFormat == DF_NCWHC4) { + if (output->desc.memFormat == DF_NCHW) { + if (fn != 1) { + return NOT_SUPPORTED; + } + } else if (output->desc.memFormat != DF_NCWHC4) { + return NOT_MATCH; + } + } + if (fn != oc) { + return NOT_MATCH; + } + if (ic != fc) { + return NOT_MATCH; + } + return SUCCESS; +} + +EE convolution_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes) +{ + EE ret = SUCCESS; + ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: + ret = convolution_direct_transform_filter_bytes_mali_fp16( + filterDesc, forwardRunInfo, gclmemFilterDesc, bytes); + break; + case CONVOLUTION_ALGORITHM_DIRECT_SPE_CK: + ret = convolution_direct_spe_ck_transform_filter_bytes_mali_fp16( + filterDesc, forwardRunInfo, gclmemFilterDesc, bytes); + break; + case CONVOLUTION_ALGORITHM_GEMM: + ret = NOT_SUPPORTED; + break; + case CONVOLUTION_ALGORITHM_WINOGRAD: + ret = convolution_wino_transform_filter_bytes_mali_fp16( + filterDesc, forwardRunInfo, gclmemFilterDesc, bytes); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE convolution_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem, + GCLMem_t tmp) +{ + EE ret = SUCCESS; + ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: + ret = convolution_direct_transform_filter_mali_fp16( + handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem); + break; + case CONVOLUTION_ALGORITHM_DIRECT_SPE_CK: + ret = convolution_direct_spe_ck_transform_filter_mali_fp16( + handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem); + break; + case CONVOLUTION_ALGORITHM_GEMM: + ret = NOT_SUPPORTED; + break; + case CONVOLUTION_ALGORITHM_WINOGRAD: + ret = convolution_wino_transform_filter_mali_fp16( + handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem, tmp); + break; + default: + ret = NOT_SUPPORTED; + 
break; + } + return ret; +} + +EE convolution_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + EE ret = SUCCESS; + ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: + ret = convolution_direct_infer_forward_tmp_bytes_mali_fp16( + inputDesc, filterDesc, outputDesc, convParamSpec, forwardRunInfo, bytes); + break; + case CONVOLUTION_ALGORITHM_DIRECT_SPE_CK: + ret = convolution_direct_spe_ck_infer_forward_tmp_bytes_mali_fp16( + inputDesc, filterDesc, outputDesc, convParamSpec, forwardRunInfo, bytes); + break; + case CONVOLUTION_ALGORITHM_GEMM: + ret = NOT_SUPPORTED; + break; + case CONVOLUTION_ALGORITHM_WINOGRAD: + ret = convolution_wino_infer_forward_tmp_bytes_mali_fp16( + inputDesc, filterDesc, outputDesc, convParamSpec, forwardRunInfo, bytes); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE convolution_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + CHECK_STATUS(convolution_checkpara_mali_fp16( + handle, inputDesc, input, filterDesc, filter, bias, outputDesc, output)); + EE ret = SUCCESS; + ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: + ret = convolution_direct_mali_fp16(handle, inputDesc, input, filterDesc, filter, + convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, + activationMode); + break; + case CONVOLUTION_ALGORITHM_DIRECT_SPE_CK: + ret = convolution_direct_spe_ck_mali_fp16(handle, inputDesc, input, filterDesc, filter, + convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, + activationMode); + break; + case CONVOLUTION_ALGORITHM_GEMM: + ret = NOT_SUPPORTED; + break; + case CONVOLUTION_ALGORITHM_WINOGRAD: + ret = convolution_wino_mali_fp16(handle, inputDesc, input, filterDesc, filter, + convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, + activationMode); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/tensor_computing/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.h b/compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.h similarity index 52% rename from tensor_computing/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.h rename to compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.h index 25aa85c7..af361da8 100644 --- a/tensor_computing/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.h +++ b/compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.h @@ -11,26 +11,45 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
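+// The fp16 convolution entry points follow the same four-step contract as the other Mali
+// fp16 operators: query the transformed-filter size, transform the filter once, query the
+// scratch size, then run. A usage sketch (local variable names here are illustrative only):
+//   GCLMemDesc fltMemDesc; TensorDesc fltDesc; U32 fltBytes, tmpBytes;
+//   convolution_transform_filter_bytes_mali_fp16(filterDesc, runInfo, &fltMemDesc, &fltBytes);
+//   convolution_transform_filter_mali_fp16(handle, filterDesc, filter, runInfo, &fltDesc, fltmem, tmp);
+//   convolution_infer_forward_tmp_bytes_mali_fp16(inDesc, filterDesc, outDesc, convSpec, runInfo, &tmpBytes);
+//   convolution_mali_fp16(handle, inDesc, input, fltDesc, fltmem, convSpec, runInfo,
+//       biasDesc, bias, tmpBytes, tmpBuf, outDesc, output, ACTIVATION_NULL);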
-#ifndef _BILATERAL_SLICE_APPLY_MALI_UCHAR -#define _BILATERAL_SLICE_APPLY_MALI_UCHAR +#ifndef _CONVOLUTION_MALI_FP16 +#define _CONVOLUTION_MALI_FP16 #include "sys.h" -#include "type.h" -#include "tensor_desc.h" #include "error.h" +#include "types.h" #include "tensor_computing_type.h" -EE bilateral_slice_apply_mali_uchar(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc guideDesc, - const GCLMem_t guide, - TensorDesc gridDesc, - const GCLMem_t grid, - BilateralSliceApplyDesc bilateralSliceApplyDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output); -#endif +EE convolution_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes); +EE convolution_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem, + GCLMem_t tmp); + +EE convolution_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE convolution_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/convolution_wino_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/convolution_wino_mali_fp16.cpp new file mode 100644 index 00000000..5dba19ef --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/convolution_wino_mali_fp16.cpp @@ -0,0 +1,420 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/convolution_mali_fp16.h" +#include "gpu/mali/fp16/convolution_wino_mali_fp16.h" + +#define calPicTranRDesc( \ + wino_h, wino_w, wino_num, ic, fh, ph, dt, prh, prw, prc, prn, prh_off, prw_off, prhwc, prsize) \ + { \ + U32 ext_h = (fh / 2 < ph) ? 
ph : fh / 2; \ + prh = wino_h * 4 + 2 * ext_h; \ + prw = ((wino_w + 3) / 4 * 4); \ + prc = ic; \ + prn = wino_num; \ + prhwc = prh * prw * prc; \ + prsize = prhwc * prn * bytesOf(dt); \ + prh_off = ph; \ + prw_off = 0; \ + } + +#define calPtrTranRLDesc( \ + wino_h, wino_w, wino_num, ic, item_n, dt, prlh, prlw, prlc, prln, prlhw, prlhwc, prlsize) \ + { \ + prlh = wino_h; \ + prlw = wino_w; \ + prlc = ic; \ + prln = wino_num * wino_num; \ + prlhw = (wino_h * wino_w + item_n - 1) / item_n * item_n; \ + prlhwc = prlhw * ic; \ + prlsize = prlhwc * prln * bytesOf(dt); \ + } + +#define calGemmOutDesc(wino_num, fn, phw, ic, item_m, dt, M, N, C, MC, NC, MN, gSize) \ + { \ + M = (fn + item_m - 1) / item_m * item_m; \ + N = prlhw_str; \ + C = ic; \ + MC = M * C; \ + NC = N * C; \ + MN = M * N; \ + gSize = MN * wino_num * wino_num * bytesOf(dt); \ + } +inline EE wino_trans_pic(GCLHandle_t handle, + U32 ih_str, + U32 iw_str, + U32 ih_off, + U32 iw_off, + U32 ic_str, + U32 prh_str, + U32 prw_str, + U32 prc_str, + U32 prhwc_str, + U32 prh_off, + U32 prw_off, + U32 prlh_str, + U32 prlw_str, + U32 prlc_str, + U32 prlhw_str, + U32 prlhwc_str, + Mem pic, + Mem picTranR, + Mem picTranRL) + +{ + UNUSED(prw_str); + UNUSED(prw_off); + Kernel kernel; + char kernelname[128]; + U32 ih_str4 = ih_str * 4; + U32 ih_off4 = ih_off * 4; + U32 prh_off4 = prh_off * 4; + U32 gs[3] = {prh_str * 4, (prw_str / 4 + 3) / 4 * 4, ic_str}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + sprintf(kernelname, "conv_wino_trans_picbuf_right"); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str4, iw_str, ih_off4, iw_off, prh_str, prw_str, + prhwc_str, prh_off4, gs[0], gs[1], pic, picTranR)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + U32 item_h = 1; + if (prlh_str % 2 == 0) { + item_h = 2; + } + if (prlh_str % 3 == 0) { + item_h = 3; + } + if (prlh_str % 4 == 0) { + item_h = 4; + } + gs[0] = (prlh_str / item_h + 3) / 4 * 4; + gs[1] = prlw_str; + gs[2] = prlc_str * 6; + sprintf(kernelname, "conv_wino_trans_picbuf_left_%d", item_h); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, prh_str, prw_str, prc_str, prlh_str, prlw_str, + prlhw_str, prlhwc_str, gs[0], gs[1], picTranR, picTranRL)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; +} + +inline EE wino_gemm(GCLHandle_t handle, + U32 M, + U32 N, + U32 C, + U32 item_m, + U32 item_n, + U32 flttran_str, + U32 pictran_str, + U32 out_str, + U32 wino_num, + Mem flttran, + Mem pictran, + Mem out) +{ + Kernel kernel; + wino_num = wino_num * wino_num; + char kernelname[128]; + sprintf(kernelname, "conv_wino_gemm%d_tn_%d%d", wino_num, item_m, item_n); + U32 gs[2] = {(N + item_n - 1) / item_n, (M + item_m - 1) / item_m}; + U32 ls[2] = {0, 0}; + U32 dim = 2; + for (U32 i = 0; i < wino_num; i++) { + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, M, N, C, i * flttran_str, i * pictran_str, + i * out_str, gs[0], gs[1], flttran, pictran, out)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += 
handle->t_execute; +#endif + } + return SUCCESS; +} + +inline EE wino_trans_out(GCLHandle_t handle, + U32 wino_h, + U32 wino_w, + U32 pw_str, + U32 pwh_str, + U32 oh_str, + U32 ow_str, + U32 oh_off, + U32 ow_off, + U32 oh, + U32 ow, + U32 oc, + ActivationMode activationMode, + Mem bias, + Mem gemm_out, + Mem output) +{ + Kernel kernel; + char kernelname[128]; + char modeName[16]; + switch (activationMode) { + case ACTIVATION_RELU: + strcpy(modeName, "_relu"); + break; + case ACTIVATION_NULL: + strcpy(modeName, ""); + break; + default: + return NOT_SUPPORTED; + } + sprintf(kernelname, "conv_wino_trans_outbuf%s", modeName); + if ((oh & 3) == 0 && (ow & 3) == 0) { + sprintf(kernelname, "conv_wino_trans_outbuf%s_align", modeName); + } + U32 gs[3] = {(wino_h + 3) / 4 * 4, (wino_w + 3) / 4 * 4, oc / 4}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, wino_h, wino_w, pw_str, pwh_str, oh_str, ow_str, oh_off, + ow_off, oh, ow, bias, gemm_out, output)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; +} + +EE convolution_wino_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes) +{ + U32 item_k = forwardRunInfo->best_k[0]; + U32 fw, fh, fc, fn; + U32 winoTransNum = 36; + tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); + U32 s0 = (fn + item_k - 1) / item_k * item_k; + U32 s1 = fc; + U32 s2 = winoTransNum; + U32 num = s0 * s1 * s2; + U32 byteSize = num * bytesOf(DT_F16); + gclmemFilterDesc->stride[0] = s0; + gclmemFilterDesc->stride[1] = s1; + gclmemFilterDesc->stride[2] = s2; + gclmemFilterDesc->offset[0] = 0; + gclmemFilterDesc->offset[1] = 0; + gclmemFilterDesc->offset[2] = 0; + gclmemFilterDesc->num = num; + gclmemFilterDesc->byteSize = byteSize; + gclmemFilterDesc->memType = GCL_MEM_BUF; + gclmemFilterDesc->flags = CL_MEM_READ_WRITE; + gclmemFilterDesc->memFormat = DF_HWCN; + gclmemFilterDesc->host_ptr = NULL; + *bytes = fn * fc * fh * fw * bytesOf(DT_F16); + return SUCCESS; +} + +EE convolution_wino_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem, + GCLMem_t tmp) +{ + UNUSED(forwardRunInfo); + DataType fdt; + DataFormat fdf; + U32 fw, fh, fc, fn; + tensorSelectGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw); + U32 item_k = forwardRunInfo->best_k[0]; + U32 fn_align = (fn + item_k - 1) / item_k * item_k; + U32 fwhc = fw * fh * fc; + U32 fnc = fn_align * fc; + + char kernelname[128]; + Kernel kernel; + sprintf(kernelname, "conv_wino_rotate_fltbuf_%d", fw); + CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, fwhc, fnc, fn, filter->mem, tmp->mem)); + U32 gs[2] = {fwhc, fn_align}; + U32 ls[2] = {0, 0}; + U32 dim = 2; + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); +#ifdef _DEBUG + CHECK_STATUS(gcl_print_memory(handle, filter, "conv_wino_filter_org")); + CHECK_STATUS( + gcl_print_buffer(handle, tmp->mem, fn_align * fc * fw * fh, "conv_wino_filter_tmp")); +#endif + sprintf(kernelname, "conv_wino_trans_fltbuf_3x3"); + CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, fn_align, 
fc, fnc, tmp->mem, fltmem->mem)); + gs[0] = fn_align; + gs[1] = fc; + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); +#ifdef _DEBUG + CHECK_STATUS(gcl_print_memory(handle, fltmem, "conv_wino_filter_tran")); +#endif + *fltmemDesc = tensor4df(fdt, fdf, fn, fc, fh, fw); + return SUCCESS; +} + +EE convolution_wino_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + UNUSED(inputDesc); + UNUSED(outputDesc); + UNUSED(convParamSpec); + DataType fdt; + DataFormat fdf; + U32 fw, fh, fc, fn; + tensorSelectGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw); + U32 item_k = forwardRunInfo->best_k[0]; + U32 fn_align = (fn + item_k - 1) / item_k * item_k; + U32 tempBufNum = fn_align * fc * fw * fh; + U32 fltTempBufSize = tempBufNum * bytesOf(fdt); + + DataType odt; + U32 ow, oh, oc, on; + tensorSelectGet(outputDesc, &odt, NULL, &on, &oc, &oh, &ow); + U32 ph = convParamSpec.padding_top; + U32 wino_num = 6; + U32 wino_h = (oh + 3) / 4; + U32 wino_w = (ow + 3) / 4; + U32 prh_str, prw_str, prc_str, prn_str, prh_off, prw_off, prhwc_str, prSize; + calPicTranRDesc(wino_h, wino_w, wino_num, fc, fh, ph, odt, prh_str, prw_str, prc_str, prn_str, + prh_off, prw_off, prhwc_str, prSize); + + U32 item_n = forwardRunInfo->best_w[0]; + U32 item_m = forwardRunInfo->best_k[0]; + U32 prlh_str, prlw_str, prlc_str, prln_str, prlhw_str, prlhwc_str, prlSize; + calPtrTranRLDesc(wino_h, wino_w, wino_num, fc, item_n, odt, prlh_str, prlw_str, prlc_str, + prln_str, prlhw_str, prlhwc_str, prlSize); + + U32 M, N, C, MC, NC, MN, gemmOutSize; + calGemmOutDesc(wino_num, fn, prlhw_str, fc, item_m, odt, M, N, C, MC, NC, MN, gemmOutSize); + + U32 tempBufSize = (prSize + 1023) / 1024 * 1024; + tempBufSize += (prlSize + 1023) / 1024 * 1024; + tempBufSize += gemmOutSize; + if (tempBufSize < fltTempBufSize) { + tempBufSize = fltTempBufSize; + } + *bytes = tempBufSize; + return SUCCESS; +} + +EE convolution_wino_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + U32 wino_num = 6; + DataType idt; + U32 iw, ih, ic; + U32 fw, fh, fc, fn, pw, ph; + U32 ow, oh, oc, on; + ph = convParamSpec.padding_top; + pw = convParamSpec.padding_left; + tensorSelectGet(inputDesc, &idt, NULL, NULL, &ic, &ih, &iw); + tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + + U32 iw_str, ih_str, ih_str4, ic_str, iw_off, ih_off; + ih_str = input->desc.stride[0]; + iw_str = input->desc.stride[1]; + ic_str = input->desc.stride[2]; + ih_off = input->desc.offset[0]; // input have not pad in h axis + iw_off = input->desc.offset[1] - pw; + ih_str4 = ih_str * 4; + + U32 ow_str, oh_str, ow_off, oh_off; + oh_str = output->desc.stride[0]; + ow_str = output->desc.stride[1]; + oh_off = output->desc.offset[0]; + ow_off = output->desc.offset[1]; + + Mem pic = input->mem; + Mem picTranR, picTranRL, gemmOut; + U32 wino_h = (oh + 3) / 4; + U32 wino_w = (ow + 3) / 4; + U32 offset = 0; + U32 prh_str, prw_str, prc_str, prn_str, prh_off, 
prw_off, prhwc_str, prSize; + calPicTranRDesc(wino_h, wino_w, wino_num, ic, fh, ph, idt, prh_str, prw_str, prc_str, prn_str, + prh_off, prw_off, prhwc_str, prSize); + CHECK_STATUS(gcl_create_sub_buffer(prSize, &offset, tmpBuf, &picTranR)); + + U32 item_n = forwardRunInfo->best_w[0]; + U32 item_m = forwardRunInfo->best_k[0]; + U32 prlh_str, prlw_str, prlc_str, prln_str, prlhw_str, prlhwc_str, prlSize; + calPtrTranRLDesc(wino_h, wino_w, wino_num, ic, item_n, idt, prlh_str, prlw_str, prlc_str, + prln_str, prlhw_str, prlhwc_str, prlSize); + CHECK_STATUS(gcl_create_sub_buffer(prlSize, &offset, tmpBuf, &picTranRL)); + + U32 M, N, C, MC, NC, MN, gemmOutSize; + calGemmOutDesc(wino_num, fn, prlhw_str, ic, item_m, idt, M, N, C, MC, NC, MN, gemmOutSize); + CHECK_STATUS(gcl_create_sub_buffer(gemmOutSize, &offset, tmpBuf, &gemmOut)); + + CHECK_STATUS(wino_trans_pic(handle, ih_str, iw_str, ih_off, iw_off, ic_str, prh_str, prw_str, + prc_str, prhwc_str, prh_off, prw_off, prlh_str, prlw_str, prlc_str, prlhw_str, prlhwc_str, + pic, picTranR, picTranRL)); +#ifdef _DEBUG + CHECK_STATUS(gcl_print_memory(handle, input, "conv_wino_input")); + CHECK_STATUS( + gcl_print_buffer(handle, picTranR, prSize / bytesOf(idt), "conv_wino_pictran_right")); + CHECK_STATUS( + gcl_print_buffer(handle, picTranRL, prlSize / bytesOf(idt), "conv_wino_pictran_left")); +#endif + + Mem fltTran = filter->mem; + CHECK_STATUS(wino_gemm( + handle, M, N, C, item_m, item_n, MC, NC, MN, wino_num, fltTran, picTranRL, gemmOut)); +#ifdef _DEBUG + CHECK_STATUS(gcl_print_memory(handle, filter, "conv_wino_flttran")); + CHECK_STATUS( + gcl_print_buffer(handle, gemmOut, gemmOutSize / bytesOf(idt), "conv_wino_gemm_out")); +#endif + + Mem biasbuf = bias->mem; + Mem outbuf = output->mem; + CHECK_STATUS(wino_trans_out(handle, wino_h, wino_w, N, MN, oh_str, ow_str, oh_off, ow_off, oh, + ow, oc, activationMode, biasbuf, gemmOut, outbuf)); +#ifdef _DEBUG + CHECK_STATUS(gcl_print_memory(handle, output, "conv_wino_output")); +#endif + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/convolution_wino_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/convolution_wino_mali_fp16.h new file mode 100644 index 00000000..918985c7 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/convolution_wino_mali_fp16.h @@ -0,0 +1,55 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
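The Winograd path added above implements F(4x4, 3x3): every 4x4 output tile is computed from a 6x6 input tile, hence wino_num = 6 and 6 x 6 = 36 transformed matrices per GEMM batch. The scratch-buffer arithmetic hidden in calPicTranRDesc, calPtrTranRLDesc and calGemmOutDesc can be restated as a standalone sketch; the helper and parameter names below are illustrative, not from the patch.

#include <algorithm>
#include <cstdint>

static uint32_t alignUp(uint32_t x, uint32_t a) { return (x + a - 1) / a * a; }

// Recomputes the scratch total of convolution_wino_infer_forward_tmp_bytes_mali_fp16
// (FP16 = 2 bytes per element).
uint32_t winoTmpBytes(uint32_t oh, uint32_t ow, uint32_t ic, uint32_t fn,
                      uint32_t fh, uint32_t ph, uint32_t item_n, uint32_t item_m,
                      uint32_t fltTmpBytes) {
    const uint32_t b = 2, winoNum = 6;
    uint32_t winoH = (oh + 3) / 4, winoW = (ow + 3) / 4;  // output tiled into 4x4 blocks
    uint32_t extH = std::max(fh / 2, ph);                 // vertical halo, as in calPicTranRDesc
    uint32_t prBytes = (winoH * 4 + 2 * extH) * alignUp(winoW, 4) * ic * winoNum * b;
    uint32_t prlhw = alignUp(winoH * winoW, item_n);      // GEMM N padded to the tile width
    uint32_t prlBytes = prlhw * ic * winoNum * winoNum * b;
    uint32_t gemmBytes = alignUp(fn, item_m) * prlhw * winoNum * winoNum * b;
    // The first two sub-buffers are rounded to 1 KB, presumably so that
    // gcl_create_sub_buffer hands out aligned offsets; the one-time filter
    // transform reuses the same buffer, hence the final max.
    uint32_t total = alignUp(prBytes, 1024) + alignUp(prlBytes, 1024) + gemmBytes;
    return std::max(total, fltTmpBytes);
}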
+ +#ifndef _H_CONVOLUTION_WINO_MALI_FP16 +#define _H_CONVOLUTION_WINO_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE convolution_wino_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes); + +EE convolution_wino_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem, + GCLMem_t tmp); + +EE convolution_wino_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE convolution_wino_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/deconvolution_direct_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/deconvolution_direct_mali_fp16.cpp new file mode 100644 index 00000000..f7043def --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/deconvolution_direct_mali_fp16.cpp @@ -0,0 +1,246 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
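All of these mali_fp16 kernels address DF_NCWHC4 buffers through the stride/offset triple later read back by get_gclmem_dim: desc.stride[0] is the padded height, desc.stride[1] the padded width, desc.stride[2] the number of 4-channel slices, and desc.offset[0]/[1] locate the valid region inside the padding. The host-side index sketch below is a reconstruction from how ih_str/iw_str/ih_off/iw_off are passed to the kernels; the authoritative layout is defined by the .cl sources.

#include <cstdint>

// Flat element index of logical (c, h, w) in a DF_NCWHC4 buffer (sketch).
inline uint32_t ncwhc4Index(uint32_t c, uint32_t h, uint32_t w,
                            uint32_t h_str, uint32_t w_str,
                            uint32_t h_off, uint32_t w_off) {
    uint32_t slice = c / 4;                               // four channels per slice
    uint32_t hw = (w + w_off) * h_str + (h + h_off);      // h varies fastest within a slice
    return (slice * (w_str * h_str) + hw) * 4 + (c % 4);  // ihw_str = ih_str * iw_str
}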
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/deconvolution_mali_fp16.h" +#include "gpu/mali/fp16/deconvolution_direct_mali_fp16.h" + +inline EE deconv_direct_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + UNUSED(tmpBuf); + cl_mem inbuf, biasmem, outbuf, fltbuf; + inbuf = input->mem; + fltbuf = filter->mem; + biasmem = bias->mem; + outbuf = output->mem; + U32 iw, ih, ic; + U32 fn, fw, fh, fc, sw, sh, pw, ph; + U32 ow, oh, oc, on; + sw = convParamSpec.stride_w; + sh = convParamSpec.stride_h; + ph = convParamSpec.padding_top; + pw = convParamSpec.padding_left; + tensorSelectGet(inputDesc, NULL, NULL, NULL, &ic, &ih, &iw); + tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + + U32 iw_str, ih_str, ihw_str, ic_str, iw_off, ih_off; + ih_str = input->desc.stride[0]; + iw_str = input->desc.stride[1]; + ic_str = input->desc.stride[2]; + ih_off = input->desc.offset[0]; + iw_off = input->desc.offset[1]; + ihw_str = ih_str * iw_str; + + U32 ow_str, oh_str, ohw_str, ow_off, oh_off; + oh_str = output->desc.stride[0]; + ow_str = output->desc.stride[1]; + oh_off = output->desc.offset[0]; + ow_off = output->desc.offset[1]; + ohw_str = oh_str * ow_str; + + char kernelname[128]; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim; + Kernel kernel; + // switch(activationMode) { + // case ACTIVATION_RELU: + // strcpy(modeName, "relu_"); + // break; + // case ACTIVATION_RELU6: + // strcpy(modeName, "relu6_"); + // break; + // case ACTIVATION_NULL: + // strcpy(modeName, ""); + // break; + // default: + // return NOT_SUPPORTED; + // } + // if(item_k == 0) { + // if((ih_str > 1 || iw_str > 1) && (item_c != 4)) CHECK_STATUS(NOT_SUPPORTED); + // sprintf(kernelname, "conv_direct_spe_fwhs1_%s%d", modeName, item_c); + // ic_str = filter->desc.stride[1]; + // ow = fn; + // gs[0] = fn; + // gs[1] = 1; + // gs[2] = 1; + // dim = 1; + // } else { + // item_k = item_k >> 2; + // sprintf(kernelname, "conv_direct_s%d_%s%d%d%d",sw, modeName, fw, item_w, item_k); + sprintf(kernelname, "deconv_direct"); + gs[0] = oh; + gs[1] = ow; + gs[2] = (oc + 3) / 4; + dim = 3; + // } + U32 in_channel_blocks = (ic + 3) / 4; + U32 out_channel_blocks = gs[2]; + + pw = fw - pw - 1; + ph = fh - ph - 1; + U32 align_h = sh - 1 - ph; + U32 align_w = sw - 1 - pw; + + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, inbuf, fltbuf, outbuf, biasmem, iw, iw_str, iw_off, ih, + ih_str, ih_off, fw, fh, fc, fn, sw, sh, pw, ph, ow, ow_str, ow_off, oh, oh_str, oh_off, ic, + oc, align_h, align_w, in_channel_blocks, out_channel_blocks)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); + +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + CHECK_STATUS(gcl_print_memory(handle, input, "deconv_direct_input")); + CHECK_STATUS(gcl_print_memory(handle, filter, "deconv_direct_filter")); + CHECK_STATUS(gcl_print_memory(handle, bias, "deconv_direct_bias")); + CHECK_STATUS(gcl_print_memory(handle, output, "deconv_direct_output")); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; +} + 
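deconv_direct_core_mali_fp16 above flips the padding (pw = fw - pw - 1, ph = fh - ph - 1) and derives align_h/align_w as stride - 1 - pad because a transposed convolution is equivalent to a direct convolution over the zero-dilated input. The scalar NCHW reference below pins down only the index relation; the OpenCL kernel computes the same result as a gather rather than this scatter, and the (ic, oc, fh, fw) filter ordering here is an assumption.

// Transposed-convolution scatter reference. The caller must zero `out`
// first, which is what fill_output_zero does on the GPU path.
void deconv2dRef(const float *in, const float *flt, float *out,
                 int ic, int ih, int iw, int oc, int oh, int ow,
                 int fh, int fw, int sh, int sw, int ph, int pw) {
    for (int c = 0; c < ic; ++c)
        for (int y = 0; y < ih; ++y)
            for (int x = 0; x < iw; ++x)
                for (int k = 0; k < oc; ++k)
                    for (int ky = 0; ky < fh; ++ky)
                        for (int kx = 0; kx < fw; ++kx) {
                            int oy = y * sh - ph + ky;  // transposed-conv index relation
                            int ox = x * sw - pw + kx;
                            if (oy < 0 || oy >= oh || ox < 0 || ox >= ow)
                                continue;
                            out[(k * oh + oy) * ow + ox] +=
                                in[(c * ih + y) * iw + x] *
                                flt[((c * oc + k) * fh + ky) * fw + kx];
                        }
}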
+EE deconvolution_direct_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes) +{ + U32 fw, fh, fc, fn; + tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); + U32 item_c = forwardRunInfo->best_c[0]; + U32 item_k = forwardRunInfo->best_k[0]; + U32 s0 = 0; + U32 s1 = 0; + U32 s2 = 0; + U32 num = 0; + U32 byteSize; + if (item_c == 4) { + s0 = fw * fh; + s1 = (fc + item_c - 1) / item_c; + s2 = (fn + item_k - 1) / item_k; + gclmemFilterDesc->memFormat = DF_NCHWN4C4; + num = s0 * s1 * s2 * item_c * item_k; + } else { + CHECK_STATUS(NOT_MATCH); + } + byteSize = num * bytesOf(DT_F16); + gclmemFilterDesc->stride[0] = s0; + gclmemFilterDesc->stride[1] = s1; + gclmemFilterDesc->stride[2] = s2; + gclmemFilterDesc->offset[0] = 0; + gclmemFilterDesc->offset[1] = 0; + gclmemFilterDesc->offset[2] = 0; + gclmemFilterDesc->num = num; + gclmemFilterDesc->byteSize = byteSize; + gclmemFilterDesc->memType = GCL_MEM_BUF; + gclmemFilterDesc->flags = CL_MEM_READ_WRITE; + gclmemFilterDesc->host_ptr = NULL; + *bytes = 0; + return SUCCESS; +} + +EE deconvolution_direct_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem) +{ + DataType fdt; + DataFormat fdf; + U32 fw, fh, fc, fn; + tensorSelectGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw); + U32 fwh = fw * fh; + U32 item_k = forwardRunInfo->best_k[0]; + if (item_k != 4) { + CHECK_STATUS(NOT_MATCH); + } + // if(item_k == 0) item_k = fn; + char kernelname[128]; + Kernel kernel; + sprintf(kernelname, "deconv_direct_trans_fltbuf"); + CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, fwh, fc, fn, filter->mem, fltmem->mem)); + U32 gs[3] = {fwh, (fc + 3) / 4, (fn + 3) / 4 * 4}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + *fltmemDesc = tensor4df(fdt, fdf, fn, fc, fh, fw); +#ifdef _DEBUG + CHECK_STATUS(gcl_print_memory(handle, filter, "deconv_direct_filter_org")); + CHECK_STATUS(gcl_print_memory(handle, fltmem, "deconv_direct_filter_tran")); +#endif + return SUCCESS; +} + +EE deconvolution_direct_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + UNUSED(inputDesc); + UNUSED(filterDesc); + UNUSED(outputDesc); + UNUSED(convParamSpec); + UNUSED(forwardRunInfo); + *bytes = 0; + return SUCCESS; +} + +EE deconvolution_direct_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + U32 fw, fh, ih, iw; + tensorSelectGet(filterDesc, NULL, NULL, NULL, NULL, &fh, &fw); + tensorSelectGet(inputDesc, NULL, NULL, NULL, NULL, &ih, &iw); + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + if (inputDesc.df == DF_NCHW || (fw == 1 && fh == 1 && ih == 1 && iw == 1)) { + CHECK_STATUS(deconv_direct_core_mali_fp16(handle, inputDesc, input, filterDesc, filter, + convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, + activationMode)); + } else { +
CHECK_STATUS(NOT_SUPPORTED); + } + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/deconvolution_direct_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/deconvolution_direct_mali_fp16.h new file mode 100644 index 00000000..8fc35c66 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/deconvolution_direct_mali_fp16.h @@ -0,0 +1,54 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_DECONVOLUTION_DIRECT_MALI_FP16 +#define _H_DECONVOLUTION_DIRECT_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE deconvolution_direct_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes); + +EE deconvolution_direct_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem); + +EE deconvolution_direct_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE deconvolution_direct_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/deconvolution_gemm_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/deconvolution_gemm_mali_fp16.cpp new file mode 100644 index 00000000..bf862ac1 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/deconvolution_gemm_mali_fp16.cpp @@ -0,0 +1,261 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/deconvolution_mali_fp16.h" +#include "gpu/mali/fp16/deconvolution_gemm_mali_fp16.h" + +inline EE deconv_gemm_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + cl_mem inbuf, biasmem, outbuf, fltbuf, tmp; + inbuf = input->mem; + fltbuf = filter->mem; + biasmem = bias->mem; + outbuf = output->mem; + tmp = tmpBuf->mem; + U32 iw, ih, ic; + U32 fn, fw, fh, fc, sw, sh, pw, ph; + U32 ow, oh, oc, on; + sw = convParamSpec.stride_w; + sh = convParamSpec.stride_h; + ph = convParamSpec.padding_top; + pw = convParamSpec.padding_left; + tensorSelectGet(inputDesc, NULL, NULL, NULL, &ic, &ih, &iw); + tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + + U32 iw_str, ih_str, ic_str, iw_off, ih_off; + U32 ow_str, oh_str, ow_off, oh_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, &ic_str, &iw_off, &ih_off); + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + U32 ihw_str = ih_str * iw_str; + U32 ohw_str = oh_str * ow_str; + + U32 item_w = forwardRunInfo->best_w[0]; + U32 item_c = forwardRunInfo->best_c[0]; + item_c = item_c >> 2; + char kernelname[128]; + char modeName[16]; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + switch (activationMode) { + case ACTIVATION_RELU: + strcpy(modeName, "relu_"); + break; + case ACTIVATION_NULL: + strcpy(modeName, ""); + break; + default: + return NOT_SUPPORTED; + } + + if (fw == 2 && fh == 2 && sw == 2 && sh == 2) { + if ((item_w >> 8) > 0) { + U32 item_h = item_w >> 8; + sprintf(kernelname, "deconv_gemm_f2s2_h_%s%d%d", modeName, item_h, item_c); + gs[0] = ((oh + 1) / 2 + item_h - 1) / item_h; + gs[1] = (ow + 1) / 2; + gs[2] = (fc * fw * fh + 3) / 4 / item_c; + } else { + sprintf(kernelname, "deconv_gemm_f2s2_%s%d%d", modeName, item_w, item_c); + gs[0] = (oh + 1) / 2; + gs[1] = ((ow + 1) / 2 + item_w - 1) / item_w; + gs[2] = (fc * fw * fh + 3) / 4 / item_c; + } + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, 
ihw_str, ic_str, ih_off, iw_off, oh_str, + ohw_str, oh_off, ow_off, ow, gs[0], gs[1], inbuf, fltbuf, biasmem, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + } else { + return NOT_SUPPORTED; + U32 th_str = ih; + U32 tw_str = iw; + U32 th_off = 0; + U32 tw_off = 0; + U32 th = ih; + U32 tw = iw; + U32 tc = fw * fh * fc; + U32 thw_str = th_str * tw_str; + if ((item_w >> 8) > 0) { + U32 item_h = item_w >> 8; + sprintf(kernelname, "conv_direct_s1_h_1%d%d", item_w, item_c); + gs[0] = (th + item_h - 1) / item_h; + gs[1] = tw; + gs[2] = (tc + 3) / 4 / item_c; + } else { + sprintf(kernelname, "conv_direct_s1_1%d%d", item_w, item_c); + gs[0] = th; + gs[1] = (tw + item_w - 1) / item_w; + gs[2] = (tc + 3) / 4 / item_c; + } + + bool has_bias = false; + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, ihw_str, ic_str, ih_off, iw_off, th_str, + thw_str, th_off, tw_off, tw, 1, gs[0], gs[1], has_bias, inbuf, fltbuf, biasmem, tmp)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + CHECK_STATUS(gcl_print_memory(handle, input, "deconv_gemm_input")); + CHECK_STATUS(gcl_print_memory(handle, filter, "deconv_gemm_filter")); + handle->t_total += handle->t_execute; +#endif + + gs[0] = oh * ow * (oc + 3) / 4; + ls[0] = 0; + dim = 1; + sprintf(kernelname, "col2im"); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, th, tw, tc, fw, fh, pw, ph, sw, sh, oh_str, ow_str, + oh_off, ow_off, oh, ow, gs[0], biasmem, tmp, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + CHECK_STATUS(gcl_print_memory(handle, bias, "deconv_col2im_bias")); + CHECK_STATUS(gcl_print_memory(handle, output, "deconv_col2im_output")); + handle->t_total += handle->t_execute; +#endif + } + return SUCCESS; +} + +EE deconvolution_gemm_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes) +{ + U32 fw, fh, fc, fn; + tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); + U32 item_c = forwardRunInfo->best_c[0]; + U32 item_k = forwardRunInfo->best_k[0]; + U32 s0 = 0; + U32 s1 = 0; + U32 s2 = 0; + U32 num = 0; + U32 byteSize; + s0 = item_c >> 2; + s1 = (fn + item_k - 1) / item_k; + s2 = (fc * fw * fh + item_c - 1) / item_c; + gclmemFilterDesc->memFormat = DF_NCHWN4C4; + num = s1 * s1 * s2 * item_c * item_k / (item_c >> 2); + byteSize = num * bytesOf(DT_F16); + gclmemFilterDesc->stride[0] = s0; + gclmemFilterDesc->stride[1] = s1; + gclmemFilterDesc->stride[2] = s2; + gclmemFilterDesc->offset[0] = 0; + gclmemFilterDesc->offset[1] = 0; + gclmemFilterDesc->offset[2] = 0; + gclmemFilterDesc->num = num; + gclmemFilterDesc->byteSize = byteSize; + gclmemFilterDesc->memType = GCL_MEM_BUF; + gclmemFilterDesc->flags = CL_MEM_READ_WRITE; + gclmemFilterDesc->host_ptr = NULL; + *bytes = 0; + return SUCCESS; +} + +EE deconvolution_gemm_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem) +{ + DataType fdt; + DataFormat fdf; + U32 fw, fh, fc, fn; + tensorSelectGet(filterDesc, 
&fdt, &fdf, &fn, &fc, &fh, &fw); + U32 fwh = fw * fh; + U32 fwhc = fwh * fc; + U32 item_c = forwardRunInfo->best_c[0]; + U32 item_k = forwardRunInfo->best_k[0]; + char kernelname[128]; + Kernel kernel; + sprintf(kernelname, "deconv_gemm_trans_fltbuf_%d%d", (item_c >> 2), item_k); + CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); + + CHECK_STATUS(gcl_set_kernelArgs(kernel, fw, fwh, fwhc, fc, fn, filter->mem, fltmem->mem)); + U32 gs[2] = {fwh * ((fc + 3) / 4), (fn + 3) / 4}; + U32 ls[2] = {0, 0}; + U32 dim = 2; + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + *fltmemDesc = tensor4df(fdt, fdf, fn, fc, fh, fw); +#ifdef _DEBUG + CHECK_STATUS(gcl_print_memory(handle, filter, "deconv_gemm_filter_org")); + CHECK_STATUS(gcl_print_memory(handle, fltmem, "deconv_gemm_filter_tran")); +#endif + return SUCCESS; +} + +EE deconvolution_gemm_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + UNUSED(outputDesc); + UNUSED(convParamSpec); + UNUSED(forwardRunInfo); + U32 iw, ih; + U32 fw, fh, fc; + tensorSelectGet(inputDesc, NULL, NULL, NULL, NULL, &ih, &iw); + tensorSelectGet(filterDesc, NULL, NULL, NULL, &fc, &fh, &fw); + *bytes = iw * ih * fw * fh * fc * bytesOf(inputDesc.dt); + return SUCCESS; +} + +EE deconvolution_gemm_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS( + deconv_gemm_core_mali_fp16(handle, inputDesc, input, filterDesc, filter, convParamSpec, + forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, activationMode)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/deconvolution_gemm_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/deconvolution_gemm_mali_fp16.h new file mode 100644 index 00000000..3cab694d --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/deconvolution_gemm_mali_fp16.h @@ -0,0 +1,54 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
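The disabled else branch in deconv_gemm_core_mali_fp16 above spells out the two-step GEMM formulation: a stride-1 direct convolution first expands each input pixel into tc = fw * fh * fc values, then a col2im pass scatter-adds those columns, plus bias, into the output. Below is a scalar col2im reference under the same shapes; the (c, ky, kx) ordering inside tc is an assumption, not taken from the .cl source.

// col: tc = fc*fh*fw channels over the ih x iw grid; out: fc x oh x ow.
void col2imRef(const float *col, const float *bias, float *out,
               int fc, int fh, int fw, int ih, int iw,
               int oh, int ow, int sh, int sw, int ph, int pw) {
    for (int c = 0; c < fc; ++c) {
        for (int oy = 0; oy < oh; ++oy)          // start every output pixel at the bias
            for (int ox = 0; ox < ow; ++ox)
                out[(c * oh + oy) * ow + ox] = bias[c];
        for (int ky = 0; ky < fh; ++ky)
            for (int kx = 0; kx < fw; ++kx)
                for (int y = 0; y < ih; ++y)
                    for (int x = 0; x < iw; ++x) {
                        int oy = y * sh - ph + ky;
                        int ox = x * sw - pw + kx;
                        if (oy < 0 || oy >= oh || ox < 0 || ox >= ow)
                            continue;
                        out[(c * oh + oy) * ow + ox] +=
                            col[(((c * fh + ky) * fw + kx) * ih + y) * iw + x];
                    }
    }
}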
+ +#ifndef _H_DECONVOLUTION_GEMM_MALI_FP16 +#define _H_DECONVOLUTION_GEMM_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE deconvolution_gemm_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes); + +EE deconvolution_gemm_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem); + +EE deconvolution_gemm_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE deconvolution_gemm_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/deconvolution_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/deconvolution_mali_fp16.cpp new file mode 100644 index 00000000..31a6e82c --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/deconvolution_mali_fp16.cpp @@ -0,0 +1,166 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
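The dispatcher file that follows mirrors the convolution wrapper from earlier in the patch, but note the transposed shape contract its checkpara helper enforces: for deconvolution the filter's N axis must equal the input channel count and its C axis the output channel count, the reverse of the convolution check. In sketch form:

#include <cstdint>

// Shape contracts enforced by the two checkpara helpers.
bool convShapesOk(uint32_t ic, uint32_t oc, uint32_t fn, uint32_t fc) {
    return fn == oc && fc == ic;   // convolution_checkpara_mali_fp16
}
bool deconvShapesOk(uint32_t ic, uint32_t oc, uint32_t fn, uint32_t fc) {
    return fc == oc && fn == ic;   // deconvolution_checkpara_mali_fp16
}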
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/deconvolution_mali_fp16.h" +#include "gpu/mali/fp16/deconvolution_direct_mali_fp16.h" +#include "gpu/mali/fp16/deconvolution_gemm_mali_fp16.h" + +inline EE deconvolution_checkpara_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + const GCLMem_t bias, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (nullptr == handle || nullptr == input || nullptr == filter || nullptr == output || + nullptr == bias) { + return NULL_POINTER; + } + if (inputDesc.dt != outputDesc.dt || inputDesc.dt != filterDesc.dt || inputDesc.dt != DT_F16) { + return NOT_MATCH; + } + + U32 ic, fc, fn, fh, fw, oc; + CHECK_STATUS(tensorSelectGet(inputDesc, NULL, NULL, NULL, &ic, NULL, NULL)); + CHECK_STATUS(tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensorSelectGet(outputDesc, NULL, NULL, NULL, &oc, NULL, NULL)); + if (input->desc.memFormat != DF_NCWHC4) { + return NOT_MATCH; + } + if (output->desc.memFormat != DF_NCWHC4) { + return NOT_MATCH; + } + if (fc != oc) { + return NOT_MATCH; + } + if (ic != fn) { + return NOT_MATCH; + } + return SUCCESS; +} + +EE deconvolution_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes) +{ + EE ret = SUCCESS; + ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: + ret = deconvolution_direct_transform_filter_bytes_mali_fp16( + filterDesc, forwardRunInfo, gclmemFilterDesc, bytes); + break; + case CONVOLUTION_ALGORITHM_GEMM: + ret = deconvolution_gemm_transform_filter_bytes_mali_fp16( + filterDesc, forwardRunInfo, gclmemFilterDesc, bytes); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE deconvolution_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem, + GCLMem_t tmp) +{ + EE ret = SUCCESS; + ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: + ret = deconvolution_direct_transform_filter_mali_fp16( + handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem); + break; + case CONVOLUTION_ALGORITHM_GEMM: + ret = deconvolution_gemm_transform_filter_mali_fp16( + handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE deconvolution_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + EE ret = SUCCESS; + ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: + ret = deconvolution_direct_infer_forward_tmp_bytes_mali_fp16( + inputDesc, filterDesc, outputDesc, convParamSpec, forwardRunInfo, bytes); + break; + case CONVOLUTION_ALGORITHM_GEMM: + ret = deconvolution_gemm_infer_forward_tmp_bytes_mali_fp16( + inputDesc, filterDesc, outputDesc, convParamSpec, forwardRunInfo, bytes); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE deconvolution_mali_fp16(GCLHandle_t handle, + TensorDesc 
inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + CHECK_STATUS(deconvolution_checkpara_mali_fp16( + handle, inputDesc, input, filterDesc, filter, bias, outputDesc, output)); + EE ret = SUCCESS; + ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case CONVOLUTION_ALGORITHM_DIRECT: + ret = deconvolution_direct_mali_fp16(handle, inputDesc, input, filterDesc, filter, + convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, + activationMode); + break; + case CONVOLUTION_ALGORITHM_GEMM: + ret = deconvolution_gemm_mali_fp16(handle, inputDesc, input, filterDesc, filter, + convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, + activationMode); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/fp16/deconvolution_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/deconvolution_mali_fp16.h new file mode 100644 index 00000000..ac168e81 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/deconvolution_mali_fp16.h @@ -0,0 +1,55 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
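The header below completes the four-entry contract that every algorithm family in this patch exposes: query the transformed-filter size, reorder the weights once, query the scratch size, then execute per inference. A hypothetical restatement as an interface, only to make the expected call order explicit; the patch itself uses free functions selected by forwardRunInfo->algorithm, not virtual dispatch.

#include <cstdint>
#include <vector>

struct GpuConvLikeOp {                              // hypothetical, not in the patch
    virtual ~GpuConvLikeOp() = default;
    virtual uint32_t transformFilterBytes() = 0;    // *_transform_filter_bytes_mali_fp16
    virtual void transformFilter() = 0;             // *_transform_filter_mali_fp16 (once)
    virtual uint32_t tmpBytes() = 0;                // *_infer_forward_tmp_bytes_mali_fp16
    virtual void run(void *scratch) = 0;            // *_mali_fp16 (per inference)
};

// Expected driving order: size query, one-time filter reorder, then run().
inline void drive(GpuConvLikeOp &op, std::vector<uint8_t> &scratch) {
    scratch.resize(op.tmpBytes());
    op.transformFilter();
    op.run(scratch.data());
}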
+ +#ifndef _H_DECONVOLUTION_MALI_FP16 +#define _H_DECONVOLUTION_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE deconvolution_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes); + +EE deconvolution_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem, + GCLMem_t tmp); + +EE deconvolution_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE deconvolution_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/depth2space_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/depth2space_mali_fp16.cpp new file mode 100644 index 00000000..db03ecd2 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/depth2space_mali_fp16.cpp @@ -0,0 +1,128 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
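For reference, depth_to_space with block size r reshapes (N, r*r*C, H, W) into (N, C, r*H, r*W). The scalar NCHW version below shows what the kernels in the next file implement; the DCR channel ordering (as in ONNX DepthToSpace) is an assumption that has not been checked against the .cl sources.

// depth_to_space reference, single image, NCHW, block size r.
void depth2spaceRef(const float *in, float *out, int ic, int ih, int iw, int r) {
    int oc = ic / (r * r);
    int oh = ih * r, ow = iw * r;
    for (int c = 0; c < oc; ++c)
        for (int y = 0; y < oh; ++y)
            for (int x = 0; x < ow; ++x) {
                int by = y % r, bx = x % r;
                int cin = (by * r + bx) * oc + c;  // DCR ordering (assumed)
                out[(c * oh + y) * ow + x] =
                    in[(cin * ih + y / r) * iw + x / r];
            }
}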
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/depth2space_mali_fp16.h" + +inline EE depth2space_checkpara_mali_fp16(TensorDesc inputDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + if (outputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE depth2space_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + Depth2SpaceParamSpec p, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output) +{ + UNUSED(outputDesc); + U32 iw, ih, ic, in; + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + U32 iw_str, ih_str, iw_off, ih_off, iwh_str, ic_str; + U32 ow_str, oh_str, ow_off, oh_off, owh_str; + get_gclmem_dim(input->desc, &iw_str, &ih_str, &ic_str, &iw_off, &ih_off); + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + iwh_str = iw_str * ih_str; + owh_str = ow_str * oh_str; + cl_mem inbuf, outbuf, tmp; + inbuf = input->mem; + outbuf = output->mem; + tmp = tmpBuf->mem; + DataFormat memFormat = input->desc.memFormat; + + if (memFormat == DF_NCWHC4 && p.blockSize == 2) { + U32 gs[3] = {ih, iw, (ic_str + 3) / 4}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "depth2space_ncwhc4_2x2", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, p.blockSize, ih_str, iwh_str, ic_str, ih_off, + iw_off, oh_str, owh_str, oh_off, ow_off, ih, iw, inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "depth2space_ncwhc4_2x2"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "depth2space_ncwhc4_2x2")); +#endif + return SUCCESS; + } else if (memFormat == DF_NCHW || memFormat == DF_NCWHC4) { + if (memFormat == DF_NCWHC4) { + U32 gs0[3] = {ih, (iw + 3) / 4, (ic + 3) / 4}; + U32 ls0[3] = {0, 0, 0}; + U32 dim0 = 3; + Kernel kernel0; + CHECK_STATUS(gcl_create_kernel(handle, "mem_trans_ncwhc4_to_nchw", &kernel0)); + CHECK_STATUS(gcl_set_kernelArgs(kernel0, iw_str, ih_str, iw_off, ih_off, iw, ih, 0, 0, + iw, ih, ic, iw, ih, ic, 0, 0, inbuf, tmp)); + gcl_set_kernelVec(handle, kernel0, dim0, gs0, ls0, "mem_trans_ncwhc4_to_nchw"); +#ifdef _DEBUG + CHECK_STATUS( + gcl_run_kernel(handle, kernel0, dim0, gs0, ls0, "mem_trans_ncwhc4_to_nchw")); +#endif + inbuf = tmp; + } + U32 gs[3] = { + iw, ih, (ic / (p.blockSize * p.blockSize) + 3) / 4 * (p.blockSize * p.blockSize)}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "depth2space_nchw", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, p.blockSize, iw_str, iwh_str, iw_off, ih_off, + oh_str, owh_str, oh_off, ow_off, iw, ih, ic, inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "depth2space_nchw"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "depth2space_nchw")); +#endif + return SUCCESS; + } + return NOT_SUPPORTED; +} + +EE depth2space_infer_tmpBuf_size_mali_fp16( + TensorDesc inputDesc, Depth2SpaceParamSpec p, TensorDesc outputDesc, U32 *bytes) +{ + UNUSED(outputDesc); + DataFormat idf; + DataType idt; + U32 iw, ih, ic, in; + tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); + *bytes = 0; + if (idf == DF_NCHW && p.blockSize != 2) { + *bytes = in * ic * ih * iw * bytesOf(idt); + } + return SUCCESS; +} + +EE depth2space_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + Depth2SpaceParamSpec p, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output) +{ + EE ret = SUCCESS; + 
CHECK_STATUS(depth2space_checkpara_mali_fp16(inputDesc, outputDesc)); + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS( + depth2space_core_mali_fp16(handle, inputDesc, input, p, tmpBuf, outputDesc, output)); + return ret; +} diff --git a/compute/tensor/src/gpu/mali/fp16/depth2space_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/depth2space_mali_fp16.h new file mode 100644 index 00000000..90d1efe8 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/depth2space_mali_fp16.h @@ -0,0 +1,31 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _DEPTH2SPACE_MALI_FP16 +#define _DEPTH2SPACE_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE depth2space_infer_tmpBuf_size_mali_fp16( + TensorDesc inputDesc, Depth2SpaceParamSpec p, TensorDesc outputDesc, U32 *bytes); + +EE depth2space_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + Depth2SpaceParamSpec p, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.cpp new file mode 100644 index 00000000..1b764e92 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.cpp @@ -0,0 +1,184 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.h" + +inline EE depthwise_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode) +{ + UNUSED(inputDesc); + UNUSED(biasDesc); + UNUSED(tmpBytes); + UNUSED(tmpBuf); + + cl_mem inbuf, biasimg, outbuf, fltbuf; + inbuf = input->mem; + fltbuf = filter->mem; + biasimg = bias->mem; + outbuf = output->mem; + U32 fw, sw, pw, ph; + U32 ow, oh, oc, on; + sw = convParamSpec.stride_w; + ph = convParamSpec.padding_top; + pw = convParamSpec.padding_left; + tensorSelectGet(filterDesc, NULL, NULL, NULL, NULL, NULL, &fw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + + U32 iw_str, ih_str, ihw_str, ic_str, ih_off, iw_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, &ic_str, &iw_off, &ih_off); + iw_off -= pw; + ih_off -= ph; + ihw_str = iw_str * ih_str; + + U32 ow_str, oh_str, ow_off, oh_off, ohw_str; + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + ohw_str = oh_str * ow_str; + + U32 item_w = forwardRunInfo->best_w[0]; + U32 gs[3] = {oh, (ow + item_w - 1) / item_w, (oc + 3) / 4 * on}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + char kernelname[128]; + Kernel kernel; + if (depthwiseActivationMode == ACTIVATION_NULL) { + sprintf(kernelname, "conv_depthwise_s%d_%d%d", sw, fw, item_w); + } else if (depthwiseActivationMode == ACTIVATION_RELU) { + sprintf(kernelname, "conv_depthwise_s%d_relu_%d%d", sw, fw, item_w); + } else if (depthwiseActivationMode == ACTIVATION_RELU6) { + sprintf(kernelname, "conv_depthwise_s%d_relu6_%d%d", sw, fw, item_w); + } else { + UNI_ERROR_LOG("depthwise convolution does not support activation mode %d\n", (int)depthwiseActivationMode); + CHECK_STATUS(NOT_SUPPORTED); + return NOT_SUPPORTED; + } + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, ihw_str, ic_str, ih_off, iw_off, oh_str, ow_str, + ohw_str, oh_off, ow_off, ow, gs[0], gs[1], inbuf, fltbuf, biasimg, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); + +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; +} + +EE depthwise_convolution_direct_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes) +{ + U32 fw, fh, fc; + tensorSelectGet(filterDesc, NULL, NULL, NULL, &fc, &fh, &fw); + U32 item_k = forwardRunInfo->best_k[0]; + U32 s0, s1, s2; + U32 num, byteSize; + s0 = fw * fh; + s1 = (fc + item_k - 1) / item_k; + s2 = 1; + num = s0 * s1 * s2 * item_k; + byteSize = num * bytesOf(DT_F16); + gclmemFilterDesc->stride[0] = s0; + gclmemFilterDesc->stride[1] = s1; + gclmemFilterDesc->stride[2] = s2; + gclmemFilterDesc->offset[0] = 0; + gclmemFilterDesc->offset[1] = 0; + gclmemFilterDesc->offset[2] = 0; + gclmemFilterDesc->num = num; + gclmemFilterDesc->byteSize = byteSize; +
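// The transformed filter is published as a flat fp16 buffer: fw * fh positions by ceil(fc / item_k) groups of item_k channels, laid out as DF_NHWCN4. +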
gclmemFilterDesc->memType = GCL_MEM_BUF; + gclmemFilterDesc->memFormat = DF_NHWCN4; + gclmemFilterDesc->flags = CL_MEM_READ_WRITE; + gclmemFilterDesc->host_ptr = NULL; + *bytes = 0; + return SUCCESS; +} + +EE depthwise_convolution_direct_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem) +{ + DataType fdt; + DataFormat fdf; + U32 fw, fh, fc; + tensorSelectGet(filterDesc, &fdt, &fdf, NULL, &fc, &fh, &fw); + U32 fwh = fw * fh; + U32 item_k = forwardRunInfo->best_k[0]; + char kernelname[128]; + Kernel kernel; + sprintf(kernelname, "conv_depthwise_trans_fltbuf_%d", item_k); + CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, fwh, fc, filter->mem, fltmem->mem)); + U32 gs[3] = {fwh, (fc + item_k - 1) / item_k}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 2; + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + *fltmemDesc = tensor4df(fdt, fdf, 1, fc, fh, fw); + return SUCCESS; +} + +EE depthwise_convolution_direct_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + UNUSED(inputDesc); + UNUSED(filterDesc); + UNUSED(outputDesc); + UNUSED(convParamSpec); + UNUSED(forwardRunInfo); + *bytes = 0; + return SUCCESS; +} + +EE depthwise_convolution_direct_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode) +{ + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS(depthwise_core_mali_fp16(handle, inputDesc, input, filterDesc, filter, + convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, + depthwiseActivationMode)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.h new file mode 100644 index 00000000..84c06e38 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.h @@ -0,0 +1,54 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _DEPTHWISE_CONVOLUTION_DIRECT_MALI_FP16 +#define _DEPTHWISE_CONVOLUTION_DIRECT_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE depthwise_convolution_direct_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes); + +EE depthwise_convolution_direct_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem); + +EE depthwise_convolution_direct_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE depthwise_convolution_direct_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.cpp new file mode 100644 index 00000000..a546c75b --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.cpp @@ -0,0 +1,157 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
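+// Dispatch layer for fp16 depthwise convolution: each entry point switches on forwardRunInfo->algorithm, which currently selects only the direct implementation.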
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/depthwise_convolution_mali_fp16.h" +#include "gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.h" + +inline EE depthwise_convolution_checkpara_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + const GCLMem_t bias, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (nullptr == handle || nullptr == input || nullptr == filter || nullptr == output || + nullptr == bias) { + return NULL_POINTER; + } + if (inputDesc.dt != outputDesc.dt || inputDesc.dt != filterDesc.dt || inputDesc.dt != DT_F16) { + return NOT_MATCH; + } + + DataFormat fdf; + U32 ic, fc, fh, fw, oc; + CHECK_STATUS(tensorSelectGet(inputDesc, NULL, NULL, NULL, &ic, NULL, NULL)); + CHECK_STATUS(tensorSelectGet(filterDesc, NULL, &fdf, NULL, &fc, &fh, &fw)); + CHECK_STATUS(tensorSelectGet(outputDesc, NULL, NULL, NULL, &oc, NULL, NULL)); + if (input->desc.memFormat == DF_NCWHC4) { + if (filter->desc.memFormat != DF_NHWCN4) { + return NOT_MATCH; + } + if (output->desc.memFormat != DF_NCWHC4) { + return NOT_MATCH; + } + } + if (fw != 3 && fw != 5 && fw != 7) { + return NOT_MATCH; + } + if (fdf == DF_NCHW && ic != fc) { + return NOT_MATCH; + } + if (fc != oc) { + return NOT_MATCH; + } + return SUCCESS; +} + +EE depthwise_convolution_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes) +{ + EE ret = SUCCESS; + DepthwiseConvolutionForwardAlgorithm algorithm = + (DepthwiseConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: + ret = depthwise_convolution_direct_transform_filter_bytes_mali_fp16( + filterDesc, forwardRunInfo, gclmemFilterDesc, bytes); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE depthwise_convolution_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem) +{ + EE ret = SUCCESS; + DepthwiseConvolutionForwardAlgorithm algorithm = + (DepthwiseConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: + ret = depthwise_convolution_direct_transform_filter_mali_fp16( + handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE depthwise_convolution_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + EE ret = SUCCESS; + DepthwiseConvolutionForwardAlgorithm algorithm = + (DepthwiseConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: + ret = depthwise_convolution_direct_infer_forward_tmp_bytes_mali_fp16( + inputDesc, filterDesc, outputDesc, convParamSpec, forwardRunInfo, bytes); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE depthwise_convolution_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + 
GCLMem_t output, + ActivationMode depthwiseActivationMode) +{ + EE ret = SUCCESS; + CHECK_STATUS(depthwise_convolution_checkpara_mali_fp16( + handle, inputDesc, input, filterDesc, filter, bias, outputDesc, output)); + DepthwiseConvolutionForwardAlgorithm algorithm = + (DepthwiseConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT: + ret = depthwise_convolution_direct_mali_fp16(handle, inputDesc, input, filterDesc, + filter, convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, + output, depthwiseActivationMode); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.h new file mode 100644 index 00000000..5a876c9d --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.h @@ -0,0 +1,54 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
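+// Declarations for the fp16 depthwise convolution dispatcher (filter transform, tmp-buffer sizing, and forward run).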
+ +#ifndef _DEPTHWISE_CONVOLUTION_MALI_FP16 +#define _DEPTHWISE_CONVOLUTION_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE depthwise_convolution_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes); + +EE depthwise_convolution_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem); + +EE depthwise_convolution_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE depthwise_convolution_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.cpp new file mode 100644 index 00000000..551ea9e2 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.cpp @@ -0,0 +1,277 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
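+// Direct depthwise-separable pipeline: the depthwise kernel writes into a tmp buffer whose width is aligned to the pointwise tile, then a 1x1 direct convolution produces the final output.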
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.h" + +inline EE depthwise_pointwise_direct_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + const GCLMem_t dwFilter, + const GCLMem_t pwFilter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc dwBiasDesc, + TensorDesc pwBiasDesc, + const GCLMem_t dwBias, + const GCLMem_t pwBias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode, + ActivationMode pointwiseActivationMode) +{ + UNUSED(inputDesc); + UNUSED(dwBiasDesc); + UNUSED(pwBiasDesc); + UNUSED(tmpBytes); + + cl_mem inbuf, dwBiasimg, pwBiasimg, outbuf, dwFltbuf, pwFltbuf, tmp; + inbuf = input->mem; + dwFltbuf = dwFilter->mem; + pwFltbuf = pwFilter->mem; + dwBiasimg = dwBias->mem; + pwBiasimg = pwBias->mem; + outbuf = output->mem; + tmp = tmpBuf->mem; + U32 fw, sw, pw, ph, fc; + U32 ow, oh, oc, on; + sw = convParamSpec.stride_w; + ph = convParamSpec.padding_top; + pw = convParamSpec.padding_left; + tensorSelectGet(dwFilterDesc, NULL, NULL, NULL, &fc, NULL, &fw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + + U32 iw_str, ih_str, ihw_str, ic_str, ih_off, iw_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, &ic_str, &iw_off, &ih_off); + iw_off -= pw; + ih_off -= ph; + ihw_str = iw_str * ih_str; + + U32 th_str, tw_str, th_off, tw_off, thw_str; + U32 w_align, item_wd, item_wp; + item_wd = forwardRunInfo->best_w[0]; + item_wp = forwardRunInfo->best_w[1]; + w_align = (ow + item_wp - 1) / item_wp * item_wp; + th_str = oh; + tw_str = w_align; + th_off = 0; + tw_off = 0; + thw_str = th_str * tw_str; + + U32 ow_str, oh_str, ow_off, oh_off, ohw_str; + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + ohw_str = oh_str * ow_str; + + U32 gs[3] = {oh, (ow + item_wd - 1) / item_wd, (fc + 3) / 4}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + char kernelname[128]; + Kernel kernel; + if (depthwiseActivationMode == ACTIVATION_NULL) { + sprintf(kernelname, "conv_depthwise_s%d_%d%d", sw, fw, item_wd); + } else if (depthwiseActivationMode == ACTIVATION_RELU) { + sprintf(kernelname, "conv_depthwise_s%d_relu_%d%d", sw, fw, item_wd); + } else if (depthwiseActivationMode == ACTIVATION_RELU6) { + sprintf(kernelname, "conv_depthwise_s%d_relu6_%d%d", sw, fw, item_wd); + } else { + CHECK_STATUS(NOT_SUPPORTED); + return NOT_SUPPORTED; + } + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, ihw_str, ic_str, ih_off, iw_off, th_str, tw_str, + thw_str, th_off, tw_off, ow, gs[0], gs[1], inbuf, dwFltbuf, dwBiasimg, tmp)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); + +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + fw = 1; + sw = 1; + U32 item_kp = forwardRunInfo->best_k[1]; + item_kp = item_kp >> 2; + if (pointwiseActivationMode == ACTIVATION_NULL) { + sprintf(kernelname, "conv_direct_s%d_%d%d%d", sw, fw, item_wp, item_kp); + } else if (pointwiseActivationMode == ACTIVATION_RELU) { + sprintf(kernelname, "conv_direct_s%d_relu_%d%d%d", sw, fw, item_wp, item_kp); + } else { + CHECK_STATUS(NOT_SUPPORTED); + return NOT_SUPPORTED; + } + + U32 gsp[3] = {oh, (ow + item_wp - 1) / item_wp, (oc + 3) / 4 * on / item_kp}; + U32 lsp[3] = {0, 0, 0}; + 
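// Stage two: the 1x1 pointwise kernel reads the depthwise result from tmp and writes the NCWHC4 output. +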
U32 dimp = 3; + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, th_str, thw_str, ic_str, th_off, tw_off, oh_str, + ohw_str, oh_off, ow_off, ow, 1, gsp[0], gsp[1], tmp, pwFltbuf, pwBiasimg, outbuf)); + gcl_set_kernelVec(handle, kernel, dimp, gsp, lsp, kernelname); + +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dimp, gsp, lsp, kernelname)); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; +} + +EE depthwise_pointwise_convolution_direct_transform_filter_bytes_mali_fp16(TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemDwFilterDesc, + GCLMemDesc_t gclmemPwFilterDesc, + U32 *bytes) +{ + U32 fw, fh, fc, fn; + tensorSelectGet(dwFilterDesc, NULL, NULL, NULL, &fc, &fh, &fw); + tensorSelectGet(pwFilterDesc, NULL, NULL, &fn, NULL, NULL, NULL); + U32 item_kd = forwardRunInfo->best_k[0]; + U32 item_kp = forwardRunInfo->best_k[1]; + U32 item_c = forwardRunInfo->best_c[1]; + U32 s0, s1, s2; + U32 num, byteSize; + s0 = fw * fh; + s1 = (fc + item_kd - 1) / item_kd; + s2 = 1; + num = s0 * s1 * s2 * item_kd; + byteSize = num * bytesOf(DT_F16); + gclmemDwFilterDesc->stride[0] = s0; + gclmemDwFilterDesc->stride[1] = s1; + gclmemDwFilterDesc->stride[2] = s2; + gclmemDwFilterDesc->offset[0] = 0; + gclmemDwFilterDesc->offset[1] = 0; + gclmemDwFilterDesc->offset[2] = 0; + gclmemDwFilterDesc->num = num; + gclmemDwFilterDesc->byteSize = byteSize; + gclmemDwFilterDesc->memType = GCL_MEM_BUF; + gclmemDwFilterDesc->memFormat = DF_NHWCN4; + gclmemDwFilterDesc->flags = CL_MEM_READ_WRITE; + gclmemDwFilterDesc->host_ptr = NULL; + + s0 = item_kp >> 2; + s1 = (fc + item_c - 1) / item_c; + s2 = (fn + item_kp - 1) / item_kp; + num = s0 * s1 * s2 * item_c * item_kp / (item_kp >> 2); + byteSize = num * bytesOf(DT_F16); + gclmemPwFilterDesc->stride[0] = s0; + gclmemPwFilterDesc->stride[1] = s1; + gclmemPwFilterDesc->stride[2] = s2; + gclmemPwFilterDesc->offset[0] = 0; + gclmemPwFilterDesc->offset[1] = 0; + gclmemPwFilterDesc->offset[2] = 0; + gclmemPwFilterDesc->num = num; + gclmemPwFilterDesc->byteSize = byteSize; + gclmemPwFilterDesc->memType = GCL_MEM_BUF; + gclmemPwFilterDesc->memFormat = DF_NCHWN4C4; + gclmemPwFilterDesc->flags = CL_MEM_READ_WRITE; + gclmemPwFilterDesc->host_ptr = NULL; + *bytes = 0; + return SUCCESS; +} + +EE depthwise_pointwise_convolution_direct_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + GCLMem_t dwFilter, + GCLMem_t pwFilter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *dwFltmemDesc, + TensorDesc *pwFltmemDesc, + GCLMem_t dwFltmem, + GCLMem_t pwFltmem) +{ + U32 dfw, dfh, dfc; + U32 pfc, pfn; + tensorSelectGet(dwFilterDesc, NULL, NULL, NULL, &dfc, &dfh, &dfw); + tensorSelectGet(pwFilterDesc, NULL, NULL, &pfn, &pfc, NULL, NULL); + U32 dfwh = dfw * dfh; + U32 item_kd = forwardRunInfo->best_k[0]; + U32 item_kp = forwardRunInfo->best_k[1]; + U32 item_c = forwardRunInfo->best_c[1]; + char kernelname[128]; + Kernel kernel; + sprintf(kernelname, "conv_depthwise_trans_fltbuf_%d", item_kd); + CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, dfwh, dfc, dwFilter->mem, dwFltmem->mem)); + U32 gs[2] = {dfwh, (dfc + item_kd - 1) / item_kd}; + U32 ls[2] = {0, 0}; + U32 dim = 2; + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + *dwFltmemDesc = dwFilterDesc; + + sprintf(kernelname, "conv_direct_trans_fltbuf_%d%d", item_c, 
item_kp); + CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, 1, pfc, pfn, pwFilter->mem, pwFltmem->mem)); + U32 gsc[3] = {1, (pfc + item_c - 1) / item_c, (pfn + item_kp - 1) / item_kp * item_kp}; + U32 lsc[3] = {0, 0, 0}; + U32 dimc = 3; + CHECK_STATUS(gcl_run_kernel(handle, kernel, dimc, gsc, lsc, kernelname)); + *pwFltmemDesc = pwFilterDesc; + return SUCCESS; +} + +EE depthwise_pointwise_convolution_direct_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + UNUSED(inputDesc); + UNUSED(pwFilterDesc); + UNUSED(convParamSpec); + DataType odt; + U32 oh, ow, fc; + tensorSelectGet(dwFilterDesc, NULL, NULL, NULL, &fc, NULL, NULL); + tensorSelectGet(outputDesc, &odt, NULL, NULL, NULL, &oh, &ow); + + U32 w_align; + U32 item_w = forwardRunInfo->best_w[1]; + w_align = (ow + item_w - 1) / item_w * item_w; + *bytes = oh * w_align * ((fc + 3) / 4) * 4 * bytesOf(odt); + return SUCCESS; +} + +EE depthwise_pointwise_convolution_direct_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + const GCLMem_t dwFilter, + const GCLMem_t pwFilter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc dwBiasDesc, + TensorDesc pwBiasDesc, + const GCLMem_t dwBias, + const GCLMem_t pwBias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode, + ActivationMode pointwiseActivationMode) +{ + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS(depthwise_pointwise_direct_core_mali_fp16(handle, inputDesc, input, dwFilterDesc, + pwFilterDesc, dwFilter, pwFilter, convParamSpec, forwardRunInfo, dwBiasDesc, pwBiasDesc, + dwBias, pwBias, tmpBytes, tmpBuf, outputDesc, output, depthwiseActivationMode, + pointwiseActivationMode)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.h new file mode 100644 index 00000000..72a682d8 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.h @@ -0,0 +1,66 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _DEPTHWISE_POINTWISE_CONVOLUTION_DIRECT_MALI_FP16 +#define _DEPTHWISE_POINTWISE_CONVOLUTION_DIRECT_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE depthwise_pointwise_convolution_direct_transform_filter_bytes_mali_fp16(TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemDwFilterDesc, + GCLMemDesc_t gclmemPwFilterDesc, + U32 *bytes); + +EE depthwise_pointwise_convolution_direct_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + GCLMem_t dwFilter, + GCLMem_t pwFilter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *dwFltmemDesc, + TensorDesc *pwFltmemDesc, + GCLMem_t dwFltmem, + GCLMem_t pwFltmem); + +EE depthwise_pointwise_convolution_direct_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE depthwise_pointwise_convolution_direct_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + const GCLMem_t dwFilter, + const GCLMem_t pwFilter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc dwBiasDesc, + TensorDesc pwBiasDesc, + const GCLMem_t dwBias, + const GCLMem_t pwBias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode, + ActivationMode pointwiseActivationMode); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.cpp new file mode 100644 index 00000000..b08de440 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.cpp @@ -0,0 +1,278 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
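+// GEMM depthwise-separable pipeline: the depthwise stage emits an intermediate whose spatial size is aligned to the GEMM tile, and the pointwise stage runs as a TN matrix multiply with M = aligned oc, N = aligned oh * ow, K = fc.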
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.h" + +inline EE depthwise_pointwise_gemm_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + const GCLMem_t dwFilter, + const GCLMem_t pwFilter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc dwBiasDesc, + TensorDesc pwBiasDesc, + const GCLMem_t dwBias, + const GCLMem_t pwBias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode, + ActivationMode pointwiseActivationMode) +{ + UNUSED(inputDesc); + UNUSED(dwBiasDesc); + UNUSED(pwBiasDesc); + UNUSED(tmpBytes); + + cl_mem inbuf, dwBiasimg, pwBiasbuf, outbuf, dwFltbuf, pwFltbuf, tmp; + inbuf = input->mem; + dwFltbuf = dwFilter->mem; + pwFltbuf = pwFilter->mem; + dwBiasimg = dwBias->mem; + pwBiasbuf = pwBias->mem; + outbuf = output->mem; + tmp = tmpBuf->mem; + U32 fw, sw, pw, ph, fc; + U32 ow, oh, oc, on; + sw = convParamSpec.stride_w; + ph = convParamSpec.padding_top; + pw = convParamSpec.padding_left; + tensorSelectGet(dwFilterDesc, NULL, NULL, NULL, &fc, NULL, &fw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + + U32 iw_str, ih_str, ihw_str, ic_str, ih_off, iw_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, &ic_str, &iw_off, &ih_off); + iw_off -= pw; + ih_off -= ph; + ihw_str = iw_str * ih_str; + + U32 th_str, tw_str, th_off, tw_off, thw_str; + U32 item_wd, item_whp, item_kp; + item_wd = forwardRunInfo->best_w[0]; + item_whp = forwardRunInfo->best_w[1]; + item_kp = forwardRunInfo->best_k[1]; + th_str = oh; + tw_str = ow; + th_off = 0; + tw_off = 0; + thw_str = ALIGN(th_str * tw_str, item_whp); + + U32 ow_str, oh_str, ow_off, oh_off, ohw_str; + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + ohw_str = oh_str * ow_str; + + U32 gs[3] = {oh, ALIGN(ow, item_wd) / item_wd, ALIGN(fc, 4) / 4}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + char kernelname[128]; + Kernel kernel; + if (depthwiseActivationMode == ACTIVATION_NULL) { + sprintf(kernelname, "conv_depthwise_s%d_ncwh_%d%d", sw, fw, item_wd); + } else if (depthwiseActivationMode == ACTIVATION_RELU) { + sprintf(kernelname, "conv_depthwise_s%d_relu_ncwh_%d%d", sw, fw, item_wd); + } else if (depthwiseActivationMode == ACTIVATION_RELU6) { + sprintf(kernelname, "conv_depthwise_s%d_relu6_ncwh_%d%d", sw, fw, item_wd); + } else { + CHECK_STATUS(NOT_SUPPORTED); + return NOT_SUPPORTED; + } + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, ihw_str, ic_str, ih_off, iw_off, th_str, tw_str, + thw_str, th_off, tw_off, ow, gs[0], gs[1], inbuf, dwFltbuf, dwBiasimg, tmp)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); + +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + if (pointwiseActivationMode == ACTIVATION_NULL) { + sprintf(kernelname, "gemm_tn_ncwhc4_%d%d", item_kp, item_whp); + } else if (pointwiseActivationMode == ACTIVATION_RELU) { + sprintf(kernelname, "gemm_tn_relu_ncwhc4_%d%d", item_kp, item_whp); + } else { + CHECK_STATUS(NOT_SUPPORTED); + return NOT_SUPPORTED; + } + + U32 M, N, K; + M = ALIGN(oc, item_kp); + N = thw_str; + K = fc; + U32 gsp[3] = {N / item_whp, M / item_kp}; + U32 lsp[3] = {0, 0}; + U32 dimp = 2; + CHECK_STATUS(gcl_create_kernel(handle, kernelname, 
&kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, M, N, K, oh, ow, oc, oh_str, ow_str, ohw_str, oh_off, + ow_off, gsp[0], gsp[1], pwFltbuf, tmp, pwBiasbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dimp, gsp, lsp, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dimp, gsp, lsp, kernelname)); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; +} + +EE depthwise_pointwise_convolution_gemm_transform_filter_bytes_mali_fp16(TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemDwFilterDesc, + GCLMemDesc_t gclmemPwFilterDesc, + U32 *bytes) +{ + U32 fw, fh, fc, fn; + tensorSelectGet(dwFilterDesc, NULL, NULL, NULL, &fc, &fh, &fw); + tensorSelectGet(pwFilterDesc, NULL, NULL, &fn, NULL, NULL, NULL); + U32 item_kd = forwardRunInfo->best_k[0]; + U32 item_kp = forwardRunInfo->best_k[1]; + U32 item_c = forwardRunInfo->best_c[1]; + U32 s0, s1, s2; + U32 num, byteSize; + s0 = fw * fh; + s1 = ALIGN(fc, item_kd) / item_kd; + s2 = 1; + num = s0 * s1 * s2 * item_kd; + byteSize = num * bytesOf(DT_F16); + gclmemDwFilterDesc->stride[0] = s0; + gclmemDwFilterDesc->stride[1] = s1; + gclmemDwFilterDesc->stride[2] = s2; + gclmemDwFilterDesc->offset[0] = 0; + gclmemDwFilterDesc->offset[1] = 0; + gclmemDwFilterDesc->offset[2] = 0; + gclmemDwFilterDesc->num = num; + gclmemDwFilterDesc->byteSize = byteSize; + gclmemDwFilterDesc->memType = GCL_MEM_BUF; + gclmemDwFilterDesc->memFormat = DF_NHWCN4; + gclmemDwFilterDesc->flags = CL_MEM_READ_WRITE; + gclmemDwFilterDesc->host_ptr = NULL; + + s0 = ALIGN(fn, item_kp); + s1 = ALIGN(fc, item_c); + s2 = 1; + num = s0 * s1 * s2; + byteSize = num * bytesOf(DT_F16); + gclmemPwFilterDesc->stride[0] = s0; + gclmemPwFilterDesc->stride[1] = s1; + gclmemPwFilterDesc->stride[2] = s2; + gclmemPwFilterDesc->offset[0] = 0; + gclmemPwFilterDesc->offset[1] = 0; + gclmemPwFilterDesc->offset[2] = 0; + gclmemPwFilterDesc->num = num; + gclmemPwFilterDesc->byteSize = byteSize; + gclmemPwFilterDesc->memType = GCL_MEM_BUF; + gclmemPwFilterDesc->memFormat = DF_HWCN; + gclmemPwFilterDesc->flags = CL_MEM_READ_WRITE; + gclmemPwFilterDesc->host_ptr = NULL; + *bytes = 0; + return SUCCESS; +} + +EE depthwise_pointwise_convolution_gemm_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + GCLMem_t dwFilter, + GCLMem_t pwFilter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *dwFltmemDesc, + TensorDesc *pwFltmemDesc, + GCLMem_t dwFltmem, + GCLMem_t pwFltmem) +{ + U32 dfw, dfh, dfc; + U32 pfc, pfn; + tensorSelectGet(dwFilterDesc, NULL, NULL, NULL, &dfc, &dfh, &dfw); + tensorSelectGet(pwFilterDesc, NULL, NULL, &pfn, &pfc, NULL, NULL); + U32 dfwh = dfw * dfh; + U32 item_kd = forwardRunInfo->best_k[0]; + U32 item_kp = forwardRunInfo->best_k[1]; + U32 item_c = forwardRunInfo->best_c[1]; + char kernelname[128]; + Kernel kernel; + sprintf(kernelname, "conv_depthwise_trans_fltbuf_%d", item_kd); + CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, dfwh, dfc, dwFilter->mem, dwFltmem->mem)); + U32 gs[2] = {dfwh, (dfc + item_kd - 1) / item_kd}; + U32 ls[2] = {0, 0}; + U32 dim = 2; + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + *dwFltmemDesc = dwFilterDesc; + + U32 fn_align = ALIGN(pfn, item_kp); + sprintf(kernelname, "conv_direct_trans_fltbuf_%d%d", item_c, 0); + CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, 1, pfc, 
fn_align, pwFilter->mem, pwFltmem->mem)); + U32 gsc[3] = {1, ALIGN(pfc, item_c) / item_c, fn_align}; + U32 lsc[3] = {0, 0, 0}; + U32 dimc = 3; + CHECK_STATUS(gcl_run_kernel(handle, kernel, dimc, gsc, lsc, kernelname)); + *pwFltmemDesc = pwFilterDesc; + return SUCCESS; +} + +EE depthwise_pointwise_convolution_gemm_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + UNUSED(inputDesc); + UNUSED(pwFilterDesc); + UNUSED(convParamSpec); + DataType odt; + U32 oh, ow, fc; + tensorSelectGet(dwFilterDesc, NULL, NULL, NULL, &fc, NULL, NULL); + tensorSelectGet(outputDesc, &odt, NULL, NULL, NULL, &oh, &ow); + + U32 N; + U32 item_wh = forwardRunInfo->best_w[1]; + N = ALIGN(oh * ow, item_wh); + *bytes = N * ALIGN(fc, 4) * bytesOf(odt); + return SUCCESS; +} + +EE depthwise_pointwise_convolution_gemm_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + const GCLMem_t dwFilter, + const GCLMem_t pwFilter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc dwBiasDesc, + TensorDesc pwBiasDesc, + const GCLMem_t dwBias, + const GCLMem_t pwBias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode, + ActivationMode pointwiseActivationMode) +{ + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS(depthwise_pointwise_gemm_core_mali_fp16(handle, inputDesc, input, dwFilterDesc, + pwFilterDesc, dwFilter, pwFilter, convParamSpec, forwardRunInfo, dwBiasDesc, pwBiasDesc, + dwBias, pwBias, tmpBytes, tmpBuf, outputDesc, output, depthwiseActivationMode, + pointwiseActivationMode)); + + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.h new file mode 100644 index 00000000..72fdf75b --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.h @@ -0,0 +1,66 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
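+// Declarations for the GEMM-based depthwise-separable convolution implementation.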
+ +#ifndef _DEPTHWISE_POINTWISE_CONVOLUTION_GEMM_MALI_FP16 +#define _DEPTHWISE_POINTWISE_CONVOLUTION_GEMM_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE depthwise_pointwise_convolution_gemm_transform_filter_bytes_mali_fp16(TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemDwFilterDesc, + GCLMemDesc_t gclmemPwFilterDesc, + U32 *bytes); + +EE depthwise_pointwise_convolution_gemm_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + GCLMem_t dwFilter, + GCLMem_t pwFilter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *dwFltmemDesc, + TensorDesc *pwFltmemDesc, + GCLMem_t dwFltmem, + GCLMem_t pwFltmem); + +EE depthwise_pointwise_convolution_gemm_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE depthwise_pointwise_convolution_gemm_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + const GCLMem_t dwFilter, + const GCLMem_t pwFilter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc dwBiasDesc, + TensorDesc pwBiasDesc, + const GCLMem_t dwBias, + const GCLMem_t pwBias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode, + ActivationMode pointwiseActivationMode); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_mali_fp16.cpp new file mode 100644 index 00000000..28b32d67 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_mali_fp16.cpp @@ -0,0 +1,190 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
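+// Dispatch layer for fp16 depthwise-separable convolution: parameters are validated once, then each call is routed to the direct or GEMM implementation.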
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/depthwise_pointwise_convolution_mali_fp16.h" +#include "gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.h" +#include "gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.h" + +inline EE depthwise_pointwise_convolution_checkpara_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + const GCLMem_t dwFilter, + const GCLMem_t pwFilter, + const GCLMem_t dwBias, + const GCLMem_t pwBias, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (nullptr == handle || nullptr == input || nullptr == dwFilter || nullptr == pwFilter || + nullptr == output || nullptr == dwBias || nullptr == pwBias || nullptr == tmpBuf) { + return NULL_POINTER; + } + if (inputDesc.dt != outputDesc.dt || inputDesc.dt != dwFilterDesc.dt || + inputDesc.dt != pwFilterDesc.dt || inputDesc.dt != DT_F16) { + return NOT_MATCH; + } + + U32 ic, fn, fh, fw, oc; + U32 dfc, pfc; + CHECK_STATUS(tensorSelectGet(inputDesc, NULL, NULL, NULL, &ic, NULL, NULL)); + CHECK_STATUS(tensorSelectGet(dwFilterDesc, NULL, NULL, NULL, &dfc, &fh, &fw)); + CHECK_STATUS(tensorSelectGet(pwFilterDesc, NULL, NULL, &fn, &pfc, NULL, NULL)); + CHECK_STATUS(tensorSelectGet(outputDesc, NULL, NULL, NULL, &oc, NULL, NULL)); + if (fw != 3 && fw != 5 && fw != 7) { + return NOT_MATCH; + } + if (ic != dfc || ic != pfc) { + return NOT_MATCH; + } + if (fn != oc) { + return NOT_MATCH; + } + return SUCCESS; +} + +EE depthwise_pointwise_convolution_transform_filter_bytes_mali_fp16(TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemDwFilterDesc, + GCLMemDesc_t gclmemPwFilterDesc, + U32 *bytes) +{ + EE ret = SUCCESS; + DepthwiseConvolutionForwardAlgorithm algorithm = + (DepthwiseConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: + ret = depthwise_pointwise_convolution_direct_transform_filter_bytes_mali_fp16( + dwFilterDesc, pwFilterDesc, forwardRunInfo, gclmemDwFilterDesc, gclmemPwFilterDesc, + bytes); + break; + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_GEMM: + ret = depthwise_pointwise_convolution_gemm_transform_filter_bytes_mali_fp16(dwFilterDesc, + pwFilterDesc, forwardRunInfo, gclmemDwFilterDesc, gclmemPwFilterDesc, bytes); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE depthwise_pointwise_convolution_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + GCLMem_t dwFilter, + GCLMem_t pwFilter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *dwFltmemDesc, + TensorDesc *pwFltmemDesc, + GCLMem_t dwFltmem, + GCLMem_t pwFltmem) +{ + EE ret = SUCCESS; + DepthwiseConvolutionForwardAlgorithm algorithm = + (DepthwiseConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: + ret = depthwise_pointwise_convolution_direct_transform_filter_mali_fp16(handle, + dwFilterDesc, pwFilterDesc, dwFilter, pwFilter, forwardRunInfo, dwFltmemDesc, + pwFltmemDesc, dwFltmem, pwFltmem); + break; + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_GEMM: + ret = depthwise_pointwise_convolution_gemm_transform_filter_mali_fp16(handle, + dwFilterDesc, pwFilterDesc, dwFilter, pwFilter, forwardRunInfo, dwFltmemDesc, + pwFltmemDesc, dwFltmem, pwFltmem); + break; + default: + ret = NOT_SUPPORTED; + 
break; + } + return ret; +} + +EE depthwise_pointwise_convolution_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + EE ret = SUCCESS; + DepthwiseConvolutionForwardAlgorithm algorithm = + (DepthwiseConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: + ret = depthwise_pointwise_convolution_direct_infer_forward_tmp_bytes_mali_fp16(inputDesc, + dwFilterDesc, pwFilterDesc, outputDesc, convParamSpec, forwardRunInfo, bytes); + break; + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_GEMM: + ret = depthwise_pointwise_convolution_gemm_infer_forward_tmp_bytes_mali_fp16(inputDesc, + dwFilterDesc, pwFilterDesc, outputDesc, convParamSpec, forwardRunInfo, bytes); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE depthwise_pointwise_convolution_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + const GCLMem_t dwFilter, + const GCLMem_t pwFilter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc dwBiasDesc, + TensorDesc pwBiasDesc, + const GCLMem_t dwBias, + const GCLMem_t pwBias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode, + ActivationMode pointwiseActivationMode) +{ + EE ret = SUCCESS; + CHECK_STATUS(depthwise_pointwise_convolution_checkpara_mali_fp16(handle, inputDesc, input, + dwFilterDesc, pwFilterDesc, dwFilter, pwFilter, dwBias, pwBias, tmpBuf, outputDesc, output)); + DepthwiseConvolutionForwardAlgorithm algorithm = + (DepthwiseConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + switch (algorithm) { + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: + ret = depthwise_pointwise_convolution_direct_mali_fp16(handle, inputDesc, input, + dwFilterDesc, pwFilterDesc, dwFilter, pwFilter, convParamSpec, forwardRunInfo, + dwBiasDesc, pwBiasDesc, dwBias, pwBias, tmpBytes, tmpBuf, outputDesc, output, + depthwiseActivationMode, pointwiseActivationMode); + break; + case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_GEMM: + ret = depthwise_pointwise_convolution_gemm_mali_fp16(handle, inputDesc, input, + dwFilterDesc, pwFilterDesc, dwFilter, pwFilter, convParamSpec, forwardRunInfo, + dwBiasDesc, pwBiasDesc, dwBias, pwBias, tmpBytes, tmpBuf, outputDesc, output, + depthwiseActivationMode, pointwiseActivationMode); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_mali_fp16.h new file mode 100644 index 00000000..1468f365 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_mali_fp16.h @@ -0,0 +1,66 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _DEPTHWISE_POINTWISE_CONVOLUTION_MALI_FP16 +#define _DEPTHWISE_POINTWISE_CONVOLUTION_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE depthwise_pointwise_convolution_transform_filter_bytes_mali_fp16(TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemDwFilterDesc, + GCLMemDesc_t gclmemPwFilterDesc, + U32 *bytes); + +EE depthwise_pointwise_convolution_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + GCLMem_t dwFilter, + GCLMem_t pwFilter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *dwFltmemDesc, + TensorDesc *pwFltmemDesc, + GCLMem_t dwFltmem, + GCLMem_t pwFltmem); + +EE depthwise_pointwise_convolution_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE depthwise_pointwise_convolution_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + const GCLMem_t dwFilter, + const GCLMem_t pwFilter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc dwBiasDesc, + TensorDesc pwBiasDesc, + const GCLMem_t dwBias, + const GCLMem_t pwBias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode, + ActivationMode pointwiseActivationMode); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/eltwise_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/eltwise_mali_fp16.cpp new file mode 100644 index 00000000..4062bc06 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/eltwise_mali_fp16.cpp @@ -0,0 +1,290 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +#include <algorithm> +#include <vector> + +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/eltwise_mali_fp16.h" + +bool eltwise_same_desc(std::vector<TensorDesc> inputDesc, U32 *arrayDimMax) +{ + U32 size = inputDesc.size(); + U32 dimMax = 0; + for (U32 i = 1; i < size; i++) { + if (inputDesc[i].nDims > inputDesc[dimMax].nDims) { + dimMax = i; + } else if (inputDesc[i].nDims == inputDesc[dimMax].nDims) { + U32 nDims = inputDesc[dimMax].nDims; + U32 sign[8]; + if (nDims > 8) { + CHECK_STATUS(NOT_SUPPORTED); + } + for (U32 j = 0; j < nDims; j++) { + if (inputDesc[i].dims[j] > inputDesc[dimMax].dims[j]) { + sign[j] = 2; + } else if (inputDesc[i].dims[j] == inputDesc[dimMax].dims[j]) { + sign[j] = 1; + } else { + sign[j] = 0; + } + } + if (*std::max_element(sign, sign + nDims) == 2 && + *std::min_element(sign, sign + nDims) == 1) { + dimMax = i; + } + if (*std::max_element(sign, sign + nDims) == 2 && + *std::min_element(sign, sign + nDims) == 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + } + } + + bool sameDesc = true; + DataFormat idf; + U32 in, ic, ih, iw; + tensorSelectGet(inputDesc[0], NULL, &idf, &in, &ic, &ih, &iw); + for (U32 i = 1; i < size; i++) { + DataFormat tdf; + U32 tn, tc, th, tw; + tensorSelectGet(inputDesc[i], NULL, &tdf, &tn, &tc, &th, &tw); + if (tdf != idf || in != tn || ic != tc || ih != th || iw != tw) { + sameDesc = false; + break; + } + } + *arrayDimMax = dimMax; + return sameDesc; +} + +inline EE eltwise_checkpara_mali_fp16( + std::vector<TensorDesc> inputDesc, std::vector<void *> input, TensorDesc outputDesc) +{ + for (auto it : inputDesc) { + if (it.dt != outputDesc.dt) { + return NOT_SUPPORTED; + } + } + U32 num = input.size(); + if (num > 8) { + return NOT_SUPPORTED; + } + if (outputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE eltwise_core_mali_fp16(GCLHandle_t handle, + std::vector<TensorDesc> inputDesc, + std::vector<void *> input, + TensorDesc outputDesc, + GCLMem_t output, + EltwiseParamSpec eltwiseDesc) +{ + UNUSED(outputDesc); + U32 iw, ih, ic, in; + U32 arrayDimMax; + bool sameDesc = eltwise_same_desc(inputDesc, &arrayDimMax); + tensorSelectGet(inputDesc[arrayDimMax], NULL, NULL, &in, &ic, &ih, &iw); + + U32 num = input.size(); + GCLMem_t inputMem[8]; + for (U32 i = 0; i < num; ++i) { + inputMem[i] = (GCLMem_t)input[i]; + } + cl_mem outbuf; + outbuf = output->mem; + + U32 ow_str, oh_str, oc_str, ow_off, oh_off; + U32 iw_str[8]; + U32 ih_str[8]; + U32 iw_off[8]; + U32 ih_off[8]; + for (U32 i = 0; i < num; ++i) { +
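// Collect each input's padded strides and offsets; broadcast operands may use different GPU layouts. +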
get_gclmem_dim(inputMem[i]->desc, &iw_str[i], &ih_str[i], NULL, &iw_off[i], &ih_off[i]); + } + get_gclmem_dim(output->desc, &ow_str, &oh_str, &oc_str, &ow_off, &oh_off); + + char modeName[16]; + char activeName[16]; + char kernelName[128]; + EltwiseMode eltwiseMode = eltwiseDesc.elt_mode; + ActivationMode activeMode = eltwiseDesc.activation_type; + + Kernel kernel; + if (eltwiseMode == ELTWISE_MAX) { + strcpy(modeName, "max"); + } + if (eltwiseMode == ELTWISE_SUM) { + strcpy(modeName, "sum"); + } + if (eltwiseMode == ELTWISE_PROD) { + strcpy(modeName, "prod"); + } + switch (activeMode) { + case ACTIVATION_RELU: + strcpy(activeName, "relu_"); + break; + case ACTIVATION_NULL: + strcpy(activeName, ""); + break; + default: + return NOT_SUPPORTED; + } + U32 gs[3] = {ih, iw, (ic + 3) / 4 * in}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + if (activeMode != ACTIVATION_NULL && !sameDesc) { + CHECK_STATUS(NOT_SUPPORTED); + } + + if (sameDesc) { + char formatName[16] = ""; + if (inputMem[0]->desc.memFormat == DF_NCHW) { + strcpy(formatName, "nchw_"); + gs[0] = (iw + 3) / 4; + gs[1] = ih; + gs[2] = ic; + if (output->desc.memFormat == DF_NCWHC4) { + CHECK_STATUS(NOT_SUPPORTED); + } + } + sprintf(kernelName, "eltwise_%s%s%s%d", formatName, activeName, modeName, num); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + switch (num) { + case 1: + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ic, oh_str, ow_str, oh_off, ow_off, + gs[0], gs[1], ih_str[0], iw_str[0], ih_off[0], iw_off[0], inputMem[0]->mem, + outbuf)); + break; + case 2: + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ic, oh_str, ow_str, oh_off, ow_off, + gs[0], gs[1], ih_str[0], iw_str[0], ih_off[0], iw_off[0], inputMem[0]->mem, + ih_str[1], iw_str[1], ih_off[1], iw_off[1], inputMem[1]->mem, outbuf)); + break; + case 3: + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ic, oh_str, ow_str, oh_off, ow_off, + gs[0], gs[1], ih_str[0], iw_str[0], ih_off[0], iw_off[0], inputMem[0]->mem, + ih_str[1], iw_str[1], ih_off[1], iw_off[1], inputMem[1]->mem, ih_str[2], + iw_str[2], ih_off[2], iw_off[2], inputMem[2]->mem, outbuf)); + break; + case 4: + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ic, oh_str, ow_str, oh_off, ow_off, + gs[0], gs[1], ih_str[0], iw_str[0], ih_off[0], iw_off[0], inputMem[0]->mem, + ih_str[1], iw_str[1], ih_off[1], iw_off[1], inputMem[1]->mem, ih_str[2], + iw_str[2], ih_off[2], iw_off[2], inputMem[2]->mem, ih_str[3], iw_str[3], + ih_off[3], iw_off[3], inputMem[3]->mem, outbuf)); + break; + default: + return NOT_SUPPORTED; + } + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; + } else { + if (num > 2) { + CHECK_STATUS(NOT_SUPPORTED) + } + DataFormat mf[2]; + mf[0] = inputMem[arrayDimMax]->desc.memFormat; + mf[1] = inputMem[1 - arrayDimMax]->desc.memFormat; + if (mf[0] == DF_NCWHC4 && mf[1] == DF_NCWHC4) { + U32 w_str, h_str, c_str, w_off, h_off; + get_gclmem_dim(inputMem[1 - arrayDimMax]->desc, &w_str, &h_str, &c_str, &w_off, &h_off); + if (w_str == 1 && h_str == 1 && c_str == 1) { + sprintf(kernelName, "eltwise_broadcast_%s%d", modeName, 0); + } else if (w_str == 1 && h_str == 1) { + sprintf(kernelName, "eltwise_broadcast_%s%d", modeName, 1); + } else if (w_str != 1 && h_str == 1) { + sprintf(kernelName, "eltwise_broadcast_%s%d", modeName, 2); + } else if (w_str == 1 && h_str != 1) { + sprintf(kernelName, "eltwise_broadcast_%s%d", modeName, 3); + } 
else { + CHECK_STATUS(NOT_SUPPORTED); + } + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ic, ih_str[arrayDimMax], + iw_str[arrayDimMax], ih_off[arrayDimMax], iw_off[arrayDimMax], oh_str, ow_str, + oh_off, ow_off, inputMem[arrayDimMax]->mem, inputMem[1 - arrayDimMax]->mem, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; + } else if (mf[0] == DF_NCWHC4 && mf[1] == DF_NCHW) { + U32 axis_a[3]; + U32 axis_b[3]; + tensorSelectGet( + inputDesc[arrayDimMax], NULL, NULL, NULL, &axis_a[2], &axis_a[1], &axis_a[0]); + tensorSelectGet( + inputDesc[1 - arrayDimMax], NULL, NULL, NULL, &axis_b[2], &axis_b[1], &axis_b[0]); + U32 matchAxis[2]; + for (U32 i = 0; i < 3; ++i) { + for (U32 j = 0; j < 3; ++j) { + if (axis_a[i] == axis_b[j] && axis_b[j] != 1) { + matchAxis[0] = i; + matchAxis[1] = j; + break; + } + } + } + if (matchAxis[0] == 2) { + for (U32 i = 0; i < 3; ++i) { + if (i != matchAxis[1]) { + if (axis_b[i] != 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + if (inputMem[1 - arrayDimMax]->desc.stride[i] != 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + if (inputMem[1 - arrayDimMax]->desc.offset[i] != 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + } + } + sprintf(kernelName, "eltwise_spe_nchw_c_%s", modeName); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS( + gcl_set_kernelArgs(kernel, ih, iw, ic, ih_str[arrayDimMax], iw_str[arrayDimMax], + ih_off[arrayDimMax], iw_off[arrayDimMax], oh_str, ow_str, oh_off, ow_off, + inputMem[arrayDimMax]->mem, inputMem[1 - arrayDimMax]->mem, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; + } + } + } + return NOT_SUPPORTED; +} + +EE eltwise_mali_fp16(GCLHandle_t handle, + std::vector<TensorDesc> inputDesc, + std::vector<void *> input, + TensorDesc outputDesc, + GCLMem_t output, + EltwiseParamSpec eltwiseDesc) +{ + CHECK_STATUS(eltwise_checkpara_mali_fp16(inputDesc, input, outputDesc)); + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS(eltwise_core_mali_fp16(handle, inputDesc, input, outputDesc, output, eltwiseDesc)); + return SUCCESS; +} diff --git a/tensor_computing/src/gpu/mali/fp16/eltwise_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/eltwise_mali_fp16.h similarity index 75% rename from tensor_computing/src/gpu/mali/fp16/eltwise_mali_fp16.h rename to compute/tensor/src/gpu/mali/fp16/eltwise_mali_fp16.h index 9f724162..ddc17912 100644 --- a/tensor_computing/src/gpu/mali/fp16/eltwise_mali_fp16.h +++ b/compute/tensor/src/gpu/mali/fp16/eltwise_mali_fp16.h @@ -11,20 +11,19 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
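The dominance test inside `eltwise_same_desc` above is easy to misread through the `sign` array: a candidate tensor may replace the current maximum only when none of its dimensions is smaller and at least one is larger, while a mix of larger and smaller dimensions has no valid broadcast and is rejected. A standalone sketch of that comparison (plain `unsigned` stands in for `U32`, and the example shapes are made up):

    #include <cstddef>
    #include <vector>
    // Returns true when cand should replace cur as the broadcast maximum:
    // no dimension smaller, at least one larger. Slight simplification: the
    // code above additionally requires at least one equal dimension.
    // {4, 1, 8} vs {4, 8, 8} -> true; {8, 1} vs {1, 8} -> false (the real
    // code reports NOT_SUPPORTED for such a mixed comparison).
    static bool dominates(const std::vector<unsigned> &cur, const std::vector<unsigned> &cand)
    {
        bool larger = false, smaller = false;
        for (std::size_t j = 0; j < cur.size(); j++) {
            larger = larger || (cand[j] > cur[j]);
            smaller = smaller || (cand[j] < cur[j]);
        }
        return larger && !smaller;
    }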
- #ifndef _H_ELTWISE_MALI_FP16 #define _H_ELTWISE_MALI_FP16 #include "sys.h" -#include "type.h" -#include "tensor_desc.h" #include "error.h" +#include "types.h" #include "tensor_computing_type.h" -EE eltwise_mali_fp16(GCLHandle_t handle, - std::vector<TensorDesc> inputDesc, - std::vector<void *> input, - TensorDesc outputDesc, - GCLMem_t output, - EltwiseMode eltwiseMode); -#endif +bool eltwise_same_desc(std::vector<TensorDesc> inputDesc, U32 *arrayDimMax); +EE eltwise_mali_fp16(GCLHandle_t handle, + std::vector<TensorDesc> inputDesc, + std::vector<void *> input, + TensorDesc outputDesc, + GCLMem_t output, + EltwiseParamSpec eltwiseDesc); +#endif diff --git a/tensor_computing/src/gpu/mali/fp16/embedding_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/embedding_mali_fp16.cpp similarity index 53% rename from tensor_computing/src/gpu/mali/fp16/embedding_mali_fp16.cpp rename to compute/tensor/src/gpu/mali/fp16/embedding_mali_fp16.cpp index d628ae95..f4532f33 100644 --- a/tensor_computing/src/gpu/mali/fp16/embedding_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/embedding_mali_fp16.cpp @@ -11,79 +11,76 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include "sys.h" -#include "type.h" -#include "tensor_desc.h" #include "error.h" -#include "tensor_computing_type.h" +#include "types.h" #include "gpu/mali/fp16/embedding_mali_fp16.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" -inline EE embedding_checkpara_mali_fp16(TensorDesc weightDesc, - TensorDesc outputDesc) { - if(weightDesc.dt != outputDesc.dt || weightDesc.dt != DT_F16) return NOT_SUPPORTED; - return SUCCESS; +inline EE embedding_checkpara_mali_fp16(TensorDesc weightDesc, TensorDesc outputDesc) +{ + if (weightDesc.dt != outputDesc.dt || weightDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; } inline EE embedding_core_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc weightDesc, - GCLMem_t weight, - TensorDesc outputDesc, - GCLMem_t output, - U32 inputDim, - U32 numOutput, - bool transpose) { + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc weightDesc, + GCLMem_t weight, + EmbedParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ UNUSED(weightDesc); UNUSED(outputDesc); - UNUSED(inputDim); - UNUSED(numOutput); U32 step = inputDesc.dims[0]; - U32 on = numOutput; + U32 on = p.num_output; U32 oh_str = output->desc.stride[0]; U32 ow_str = output->desc.stride[1]; U32 oc_str = output->desc.stride[2]; U32 oh_off = output->desc.offset[0]; U32 ow_off = output->desc.offset[1]; - if(ow_str != 1 || oh_off != 0 || ow_off != 0) CHECK_STATUS(NOT_SUPPORTED); + if (ow_str != 1 || oh_off != 0 || ow_off != 0) { + CHECK_STATUS(NOT_SUPPORTED); + } cl_mem inbuf, weibuf, outbuf; - inbuf = input->mem; + inbuf = input->mem; weibuf = weight->mem; outbuf = output->mem; - if(!transpose) { + if (!p.transpose) { U32 gs[2] = {oc_str, step}; U32 ls[2] = {0, 0}; U32 dim = 2; Kernel kernel; - CHECK_STATUS(gcl_create_kernel_binary(handle, "embedding", &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, step, on, oc_str, oh_str, oh_off, ow_off, inbuf, weibuf, outbuf)); + CHECK_STATUS(gcl_create_kernel(handle, "embedding", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs( + kernel, step, on, oc_str, oh_str, oh_off, ow_off, inbuf, weibuf, outbuf)); gcl_set_kernelVec(handle, kernel, dim, gs, ls, "embedding"); #ifdef _DEBUG CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, 
ls, "embedding")); CHECK_STATUS(gcl_print_memory(handle, output, "embedding_output")); #endif - return SUCCESS; + return SUCCESS; } else { return NOT_SUPPORTED; } } - EE embedding_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc weightDesc, - GCLMem_t weight, - TensorDesc outputDesc, - GCLMem_t output, - U32 inputDim, - U32 numOutput, - bool transpose) { + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc weightDesc, + GCLMem_t weight, + EmbedParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ CHECK_STATUS(embedding_checkpara_mali_fp16(weightDesc, outputDesc)); - CHECK_STATUS(embedding_core_mali_fp16(handle, inputDesc, input, weightDesc, weight, outputDesc, output, inputDim, numOutput, transpose)); - return SUCCESS; + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS(embedding_core_mali_fp16( + handle, inputDesc, input, weightDesc, weight, p, outputDesc, output)); + return SUCCESS; } - diff --git a/tensor_computing/src/gpu/mali/fp16/embedding_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/embedding_mali_fp16.h similarity index 72% rename from tensor_computing/src/gpu/mali/fp16/embedding_mali_fp16.h rename to compute/tensor/src/gpu/mali/fp16/embedding_mali_fp16.h index 383134e6..309402b2 100644 --- a/tensor_computing/src/gpu/mali/fp16/embedding_mali_fp16.h +++ b/compute/tensor/src/gpu/mali/fp16/embedding_mali_fp16.h @@ -14,20 +14,16 @@ #ifndef _EMBEDDING_MALI_FP16 #define _EMBEDDING_MALI_FP16 #include "sys.h" -#include "type.h" -#include "tensor_desc.h" #include "error.h" +#include "types.h" #include "tensor_computing_type.h" EE embedding_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc weightDesc, - GCLMem_t weight, - TensorDesc outputDesc, - GCLMem_t output, - U32 inputDim, - U32 numOutput, - bool transpose); -#endif - + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc weightDesc, + GCLMem_t weight, + EmbedParamSpec p, + TensorDesc outputDesc, + GCLMem_t output); +#endif diff --git a/tensor_computing/src/gpu/mali/fp16/fully_connected_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/fully_connected_mali_fp16.cpp similarity index 58% rename from tensor_computing/src/gpu/mali/fp16/fully_connected_mali_fp16.cpp rename to compute/tensor/src/gpu/mali/fp16/fully_connected_mali_fp16.cpp index 29490a89..dd828ad1 100644 --- a/tensor_computing/src/gpu/mali/fp16/fully_connected_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/fully_connected_mali_fp16.cpp @@ -1,44 +1,43 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include "sys.h" -#include "tensor_desc.h" -#include "type.h" #include "error.h" -#include "tensor_computing_type.h" +#include "types.h" #include "gpu/mali/fp16/fully_connected_mali_fp16.h" -inline EE fully_connected_checkpara_mali_fp16(TensorDesc inputDesc, - TensorDesc filterDesc, - TensorDesc outputDesc) { - if(inputDesc.dt != outputDesc.dt || inputDesc.dt != filterDesc.dt || inputDesc.dt != DT_F16) return NOT_MATCH; +inline EE fully_connected_checkpara_mali_fp16( + TensorDesc inputDesc, TensorDesc filterDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != outputDesc.dt || inputDesc.dt != filterDesc.dt || inputDesc.dt != DT_F16) { + return NOT_MATCH; + } return SUCCESS; } - -inline EE fully_connected_core_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - std::vector filter, - TensorDesc biasDesc, - std::vector bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - std::vector output, - ForwardRunInfoMali_t forwardRunInfo) { +inline EE fully_connected_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + std::vector filter, + TensorDesc biasDesc, + std::vector bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + std::vector output, + ForwardRunInfoMali_t forwardRunInfo) +{ UNUSED(biasDesc); UNUSED(tmpBytes); UNUSED(outputDesc); @@ -47,13 +46,13 @@ inline EE fully_connected_core_mali_fp16(GCLHandle_t handle, U32 oh_str, ow_str, oh_off, ow_off; U32 fw, fh, fc, fn; cl_mem inbuf, fltbuf, biasmem, outbuf, tmp; - inbuf = input->mem; - fltbuf = filter[0]->mem; + inbuf = input->mem; + fltbuf = filter[0]->mem; biasmem = bias[0]->mem; - outbuf = output[0]->mem; - tmp = tmpBuf->mem; + outbuf = output[0]->mem; + tmp = tmpBuf->mem; - tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); + tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); ih_str = input->desc.stride[0]; iw_str = input->desc.stride[1]; ih_off = input->desc.offset[0]; @@ -72,38 +71,41 @@ inline EE fully_connected_core_mali_fp16(GCLHandle_t handle, U32 item_c = forwardRunInfo->best_c[0]; U32 item_k = forwardRunInfo->best_k[0]; - if(fw == 1 && fh == 1) { - if(inputDesc.df == DF_NCHW) { + if (fw == 1 && fh == 1) { + if (inputDesc.df == DF_NCHW || inputDesc.df == DF_NORMAL) { U32 ic_str; ic_str = filter[0]->desc.stride[1]; - if(ih_str > 1 || iw_str > 1) CHECK_STATUS(NOT_SUPPORTED); + if (ih_str > 1 || iw_str > 1) { + CHECK_STATUS(NOT_SUPPORTED); + } sprintf(kernelname, "conv_direct_spe_fwhs1_%d", item_c); gs[0] = fn; gs[1] = 1; gs[2] = 1; - dim = 1; - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelname, &kernel)); - 
CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, ihw_str, ic_str, ih_off, iw_off, oh_str, ow_str, oh_off, ow_off, fn, gs[0], gs[1], inbuf, fltbuf, biasmem, outbuf)); + dim = 1; + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, ihw_str, ic_str, ih_off, iw_off, oh_str, + ow_str, oh_off, ow_off, fn, gs[0], gs[1], inbuf, fltbuf, biasmem, outbuf)); gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); #ifdef _DEBUG CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); - CHECK_STATUS(gcl_print_memory(handle, input, "fc_wh1_input")); + CHECK_STATUS(gcl_print_memory(handle, input, "fc_wh1_input")); CHECK_STATUS(gcl_print_memory(handle, filter[0], "fc_wh1_filter")); - CHECK_STATUS(gcl_print_memory(handle, bias[0], "fc_wh1_bias")); + CHECK_STATUS(gcl_print_memory(handle, bias[0], "fc_wh1_bias")); CHECK_STATUS(gcl_print_memory(handle, output[0], "fc_wh1_output")); handle->t_total += handle->t_execute; #endif - } - if(inputDesc.df == DF_MKT) { + } + if (inputDesc.df == DF_MKT) { item_k = item_k >> 2; U32 ic_str = input->desc.stride[2]; U32 ohw_str; U32 step = inputDesc.dims[0]; sprintf(kernelname, "conv_direct_s%d_%d%d%d", 1, 1, item_w, item_k); - for(U32 i = 0; i < filter.size(); ++i) { - fltbuf = filter[i]->mem; + for (U32 i = 0; i < filter.size(); ++i) { + fltbuf = filter[i]->mem; biasmem = bias[i]->mem; - outbuf = output[i]->mem; + outbuf = output[i]->mem; iw_str = input->desc.stride[0]; ih_str = input->desc.stride[1]; iw_off = input->desc.offset[0]; @@ -113,18 +115,22 @@ inline EE fully_connected_core_mali_fp16(GCLHandle_t handle, ow_off = output[i]->desc.offset[0]; oh_off = output[i]->desc.offset[1]; ohw_str = oh_str * ow_str; - if(ih_str != 1 || ih_off != 0) CHECK_STATUS(NOT_SUPPORTED); + if (ih_str != 1 || ih_off != 0) { + CHECK_STATUS(NOT_SUPPORTED); + } gs[0] = 1; gs[1] = (step + item_w - 1) / item_w; gs[2] = output[i]->desc.stride[2] / item_k; - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelname, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, ihw_str, ic_str, ih_off, iw_off, oh_str, ohw_str, oh_off, ow_off, step, gs[0], gs[1], inbuf, fltbuf, biasmem, outbuf)); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, ihw_str, ic_str, ih_off, iw_off, + oh_str, ohw_str, oh_off, ow_off, step, 1, gs[0], gs[1], inbuf, fltbuf, biasmem, + outbuf)); gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); #ifdef _DEBUG CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); - CHECK_STATUS(gcl_print_memory(handle, input, "conv_direct_input")); + CHECK_STATUS(gcl_print_memory(handle, input, "conv_direct_input")); CHECK_STATUS(gcl_print_memory(handle, filter[i], "conv_direct_filter")); - CHECK_STATUS(gcl_print_memory(handle, bias[i], "conv_direct_bias")); + CHECK_STATUS(gcl_print_memory(handle, bias[i], "conv_direct_bias")); CHECK_STATUS(gcl_print_memory(handle, output[i], "conv_direct_output")); handle->t_total += handle->t_execute; #endif @@ -133,33 +139,35 @@ inline EE fully_connected_core_mali_fp16(GCLHandle_t handle, } else { U32 ihy_str, fhy_str, fhw_str, fwc_str; ihy_str = ih_str * item_w; - fc = (fc + item_c - 1) / item_c; - fn = (fn + item_k - 1) / item_k; + fc = (fc + item_c - 1) / item_c; + fn = (fn + item_k - 1) / item_k; fhy_str = fh * item_w; fhw_str = fh * fw; fwc_str = fw * fc; - CHECK_STATUS(gcl_create_kernel_binary(handle, "fc_p1", &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, item_w, ih_str, iw_str, 
ih_off, iw_off, ihy_str, ihw_str, fh, fw, fc, fn, fhy_str, fhw_str, fwc_str, fltbuf, inbuf, tmp)); + CHECK_STATUS(gcl_create_kernel(handle, "fc_p1", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, item_w, ih_str, iw_str, ih_off, iw_off, ihy_str, + ihw_str, fh, fw, fc, fn, fhy_str, fhw_str, fwc_str, fltbuf, inbuf, tmp)); gs[0] = fh; gs[1] = item_w; gs[2] = fn; gcl_set_kernelVec(handle, kernel, dim, gs, ls, "fc_p1"); #ifdef _DEBUG CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "fc_p1")); - CHECK_STATUS(gcl_print_memory(handle, input, "fc_p1_input")); - CHECK_STATUS(gcl_print_memory(handle, filter[0], "fc_p1_filter")); + CHECK_STATUS(gcl_print_memory(handle, input, "fc_p1_input")); + CHECK_STATUS(gcl_print_memory(handle, filter[0], "fc_p1_filter")); CHECK_STATUS(gcl_print_buffer(handle, tmp, fh * item_w * fn * item_k, "fc_p1_output")); handle->t_total += handle->t_execute; #endif - CHECK_STATUS(gcl_create_kernel_binary(handle, "fc_p2", &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, fh * item_w, fn, oh_str, ow_str, oh_off, ow_off, tmp, biasmem, outbuf)); + CHECK_STATUS(gcl_create_kernel(handle, "fc_p2", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs( + kernel, fh * item_w, fn, oh_str, ow_str, oh_off, ow_off, tmp, biasmem, outbuf)); U32 gs2 = fn; U32 ls2 = 0; dim = 1; gcl_set_kernelVec(handle, kernel, dim, &gs2, &ls2, "fc_p2"); #ifdef _DEBUG CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs2, &ls2, "fc_p2")); - CHECK_STATUS(gcl_print_memory(handle, bias[0], "fc_p2_bias")); + CHECK_STATUS(gcl_print_memory(handle, bias[0], "fc_p2_bias")); CHECK_STATUS(gcl_print_memory(handle, output[0], "fc_p2_output")); handle->t_total += handle->t_execute; #endif @@ -167,10 +175,11 @@ inline EE fully_connected_core_mali_fp16(GCLHandle_t handle, return SUCCESS; } -EE fully_connected_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, - GCLMemDesc_t gclmemFilterDesc, - U32* bytes, - ForwardRunInfoMali_t forwardRunInfo) { +EE fully_connected_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo) +{ U32 fw, fh, fc, fn; tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); U32 item_c = forwardRunInfo->best_c[0]; @@ -181,16 +190,20 @@ EE fully_connected_transform_filter_bytes_mali_fp16(TensorDesc filterD U32 num = 0; U32 byteSize; - if(item_k == 0) { + if (item_k == 0) { s0 = fn; s1 = (fc + item_c - 1) / item_c; s2 = 1; DataFormat df = DF_CHWNC4; - if(item_c == 8) df = DF_CHWNC8; - if(item_c == 16) df = DF_CHWNC16; + if (item_c == 8) { + df = DF_CHWNC8; + } + if (item_c == 16) { + df = DF_CHWNC16; + } gclmemFilterDesc->memFormat = df; num = s0 * s1 * s2 * item_c; - } else if(fw == 1 && fh == 1) { + } else if (fw == 1 && fh == 1) { s0 = item_k >> 2; s1 = (fc + item_c - 1) / item_c; s2 = (fn + item_k - 1) / item_k; @@ -210,22 +223,23 @@ EE fully_connected_transform_filter_bytes_mali_fp16(TensorDesc filterD gclmemFilterDesc->offset[0] = 0; gclmemFilterDesc->offset[1] = 0; gclmemFilterDesc->offset[2] = 0; - gclmemFilterDesc->num = num; - gclmemFilterDesc->byteSize = byteSize; - gclmemFilterDesc->memType = GCL_MEM_BUF; - gclmemFilterDesc->flags = CL_MEM_READ_WRITE; - gclmemFilterDesc->host_ptr = NULL; + gclmemFilterDesc->num = num; + gclmemFilterDesc->byteSize = byteSize; + gclmemFilterDesc->memType = GCL_MEM_BUF; + gclmemFilterDesc->flags = CL_MEM_READ_WRITE; + gclmemFilterDesc->host_ptr = NULL; *bytes = 0; return SUCCESS; } -EE fully_connected_transform_filter_mali_fp16(GCLHandle_t handle, - TensorDesc 
filterDesc, - GCLMem_t filter, - TensorDesc* fltmemDesc, - std::vector fltmem, - ForwardRunInfoMali_t forwardRunInfo) { - DataType fdt; +EE fully_connected_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + TensorDesc *fltmemDesc, + std::vector fltmem, + ForwardRunInfoMali_t forwardRunInfo) +{ + DataType fdt; DataFormat fdf; U32 fw, fh, fc, fn; tensorSelectGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw); @@ -237,9 +251,9 @@ EE fully_connected_transform_filter_mali_fp16(GCLHandle_t handle, U32 fwh = fw * fh; U32 item_c = forwardRunInfo->best_c[0]; U32 item_k = forwardRunInfo->best_k[0]; - if(fw == 1 && fh == 1) { - if(item_k == 0) { - sprintf(kernelname, "conv_direct_trans_fltbuf_%d%d",item_c, item_k); + if (fw == 1 && fh == 1) { + if (item_k == 0) { + sprintf(kernelname, "conv_direct_trans_fltbuf_%d%d", item_c, item_k); CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); CHECK_STATUS(gcl_set_kernelArgs(kernel, fwh, fc, fn, filter->mem, fltmem[0]->mem)); gs[0] = fwh; @@ -247,9 +261,9 @@ EE fully_connected_transform_filter_mali_fp16(GCLHandle_t handle, gs[2] = fn; CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); } else { - sprintf(kernelname, "conv_direct_trans_fltbuf_%d%d",item_c, item_k); + sprintf(kernelname, "conv_direct_trans_fltbuf_%d%d", item_c, item_k); CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); - if(fltmem.size() == 1) { + if (fltmem.size() == 1) { CHECK_STATUS(gcl_set_kernelArgs(kernel, fwh, fc, fn, filter->mem, fltmem[0]->mem)); gs[0] = fwh; gs[1] = (fc + item_c - 1) / item_c; @@ -258,9 +272,11 @@ EE fully_connected_transform_filter_mali_fp16(GCLHandle_t handle, } else { GCLMem_t tmp = gcl_create_gclmem(); tmp->desc.byteSize = 0; - for(U32 i = 0; i < fltmem.size(); ++i) tmp->desc.byteSize += fltmem[i]->desc.byteSize; + for (U32 i = 0; i < fltmem.size(); ++i) { + tmp->desc.byteSize += fltmem[i]->desc.byteSize; + } tmp->desc.memType = GCL_MEM_BUF; - tmp->desc.flags = CL_MEM_READ_WRITE; + tmp->desc.flags = CL_MEM_READ_WRITE; CHECK_STATUS(gcl_create_memory(handle, tmp)); CHECK_STATUS(gcl_set_kernelArgs(kernel, fwh, fc, fn, filter->mem, tmp->mem)); gs[0] = fwh; @@ -268,16 +284,17 @@ EE fully_connected_transform_filter_mali_fp16(GCLHandle_t handle, gs[2] = (fn + item_k - 1) / item_k * item_k; CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); U32 offset[2] = {0, 0}; - for(U32 i = 0; i < fltmem.size(); i++) { + for (U32 i = 0; i < fltmem.size(); i++) { U32 size = fltmem[i]->desc.byteSize; - CHECK_STATUS(gcl_trans_memory(handle, tmp, fltmem[i], &size, DEVICE_BUF_TO_BUF, CL_TRUE, offset)); + CHECK_STATUS(gcl_trans_memory( + handle, tmp, fltmem[i], &size, DEVICE_BUF_TO_BUF, CL_TRUE, offset)); offset[0] += size; } gcl_destroy_gclmem(tmp); } } } else { - sprintf(kernelname, "fc_trans_fltbuf_%d%d",item_c, item_k); + sprintf(kernelname, "fc_trans_fltbuf_%d%d", item_c, item_k); CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); CHECK_STATUS(gcl_set_kernelArgs(kernel, fw, fh, fwh, fc, fn, filter->mem, fltmem[0]->mem)); gs[0] = fw; @@ -288,43 +305,49 @@ EE fully_connected_transform_filter_mali_fp16(GCLHandle_t handle, *fltmemDesc = tensor4df(fdt, fdf, fn, fc, fh, fw); #ifdef _DEBUG CHECK_STATUS(gcl_print_memory(handle, filter, "fc_filter_org")); - for(U32 i = 0; i < fltmem.size(); ++i) CHECK_STATUS(gcl_print_memory(handle, fltmem[i], "fc_filter_tran")); + for (U32 i = 0; i < fltmem.size(); ++i) { + CHECK_STATUS(gcl_print_memory(handle, fltmem[i], 
"fc_filter_tran")); + } #endif return SUCCESS; } -EE fully_connected_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, - TensorDesc filterDesc, - U32* bytes, - ForwardRunInfoMali_t forwardRunInfo) { - U32 fn, fw, fh; - tensorSelectGet(filterDesc, NULL, NULL, &fn, NULL, &fh, &fw); - if(fh == 1 && fw == 1) { +EE fully_connected_infer_forward_tmp_bytes_mali_fp16( + TensorDesc inputDesc, TensorDesc filterDesc, U32 *bytes, ForwardRunInfoMali_t forwardRunInfo) +{ + U32 fn, fw, fh; + tensorSelectGet(filterDesc, NULL, NULL, &fn, NULL, &fh, &fw); + if (fh == 1 && fw == 1) { *bytes = 0; } else { - DataType dt; + DataType dt; U32 ic, ih, iw; - tensorSelectGet(inputDesc, &dt, NULL, NULL, &ic, &ih, &iw); + tensorSelectGet(inputDesc, &dt, NULL, NULL, &ic, &ih, &iw); U32 item_w = forwardRunInfo->best_w[0]; U32 item_k = forwardRunInfo->best_k[0]; - *bytes = ih * item_w * ((fn + item_k - 1) / item_k * item_k) * bytesOf(dt); + *bytes = ih * item_w * ((fn + item_k - 1) / item_k * item_k) * bytesOf(dt); } return SUCCESS; } -EE fully_connected_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc filterDesc, - std::vector filter, - TensorDesc biasDesc, - std::vector bias, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - std::vector output, - ForwardRunInfoMali_t forwardRunInfo) { +EE fully_connected_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + std::vector filter, + TensorDesc biasDesc, + std::vector bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + std::vector output, + ForwardRunInfoMali_t forwardRunInfo) +{ CHECK_STATUS(fully_connected_checkpara_mali_fp16(inputDesc, filterDesc, outputDesc)); - CHECK_STATUS(fully_connected_core_mali_fp16(handle, inputDesc, input, filterDesc, filter, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, forwardRunInfo)); + for (U32 i = 0; i < output.size(); i++) { + CHECK_STATUS(fill_output_zero(handle, output[i], outputDesc)); + } + CHECK_STATUS(fully_connected_core_mali_fp16(handle, inputDesc, input, filterDesc, filter, + biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, forwardRunInfo)); return SUCCESS; } diff --git a/tensor_computing/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/fully_connected_mali_fp16.h similarity index 52% rename from tensor_computing/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.h rename to compute/tensor/src/gpu/mali/fp16/fully_connected_mali_fp16.h index 16297560..6af33a0d 100644 --- a/tensor_computing/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.h +++ b/compute/tensor/src/gpu/mali/fp16/fully_connected_mali_fp16.h @@ -11,26 +11,38 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-#ifndef _BILATERAL_SLICE_APPLY_MALI_FP16 -#define _BILATERAL_SLICE_APPLY_MALI_FP16 +#ifndef _FC_MALI_FP16 +#define _FC_MALI_FP16 #include "sys.h" -#include "type.h" -#include "tensor_desc.h" #include "error.h" +#include "types.h" #include "tensor_computing_type.h" -EE bilateral_slice_apply_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - TensorDesc guideDesc, - const GCLMem_t guide, - TensorDesc gridDesc, - const GCLMem_t grid, - BilateralSliceApplyDesc bilateralSliceApplyDesc, - ForwardRunInfoMali_t forwardRunInfo, - U32 tmpBytes, - GCLMem_t tmpBuf, - TensorDesc outputDesc, - GCLMem_t output); -#endif +EE fully_connected_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo); +EE fully_connected_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + TensorDesc *fltmemDesc, + std::vector<GCLMem_t> fltmem, + ForwardRunInfoMali_t forwardRunInfo); + +EE fully_connected_infer_forward_tmp_bytes_mali_fp16( + TensorDesc inputDesc, TensorDesc filterDesc, U32 *bytes, ForwardRunInfoMali_t forwardRunInfo); + +EE fully_connected_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + std::vector<GCLMem_t> filter, + TensorDesc biasDesc, + std::vector<GCLMem_t> bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + std::vector<GCLMem_t> output, + ForwardRunInfoMali_t forwardRunInfo); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/matmul_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/matmul_mali_fp16.cpp new file mode 100644 index 00000000..e9b46aae --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/matmul_mali_fp16.cpp @@ -0,0 +1,197 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/matmul_mali_fp16.h" + +inline EE matmul_checkpara_mali_fp16( + TensorDesc matrixADesc, TensorDesc matrixBDesc, TensorDesc matrixCDesc) +{ + if (matrixADesc.dt != matrixBDesc.dt || matrixADesc.dt != matrixCDesc.dt || + matrixADesc.dt != DT_F16) { + return NOT_MATCH; + } + return SUCCESS; +} + +inline EE matmul_core_mali_fp16(GCLHandle_t handle, + TensorDesc matrixADesc, + bool transposeA, + const GCLMem_t matrixA, + TensorDesc matrixBDesc, + bool transposeB, + const GCLMem_t matrixB, + GCLMem_t tmp, + TensorDesc matrixCDesc, + GCLMem_t matrixC, + ForwardRunInfoMali_t forwardRunInfo) +{ + UNUSED(tmp); + UNUSED(matrixCDesc); + U32 adims = matrixADesc.nDims; + U32 ac = (adims > 2) ? matrixADesc.dims[2] : 1; + U32 ah = matrixADesc.dims[1]; + U32 aw = matrixADesc.dims[0]; + U32 bh = matrixBDesc.dims[1]; + U32 bw = matrixBDesc.dims[0]; + + U32 item_w = forwardRunInfo->best_w[0]; + U32 item_c = forwardRunInfo->best_c[0]; + U32 item_k = forwardRunInfo->best_k[0]; + cl_mem A, B, C; + A = matrixA->mem; + B = matrixB->mem; + C = matrixC->mem; + char kernelname[128]; + Kernel kernel; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + if (matrixA->desc.offset[0] != 0 || matrixA->desc.offset[1] != 0 || + matrixB->desc.offset[0] != 0 || matrixB->desc.offset[1] != 0 || + matrixC->desc.offset[0] != 0 || matrixC->desc.offset[1] != 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + if (transposeA && !transposeB) { + U32 M = matrixA->desc.stride[0]; + U32 N = matrixB->desc.stride[0]; + U32 K = ah; + U32 ow_str = matrixC->desc.stride[0]; + U32 A_str = M * matrixA->desc.stride[1]; + U32 B_str = N * matrixB->desc.stride[1]; + U32 C_str = ow_str * matrixC->desc.stride[1]; + U32 batch = ac; + gs[0] = (bw + item_w - 1) / item_w; + gs[1] = (aw + item_k - 1) / item_k; + gs[2] = batch; + sprintf(kernelname, "gemm_tn_nobias_%d%d", item_k, item_w); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, M, N, K, ow_str, A_str, B_str, C_str, 0, 0, bw, aw, + gs[0], gs[1], 0, 0, A, B, C)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + CHECK_STATUS(gcl_print_memory(handle, matrixA, "gemm_tn_a")); + CHECK_STATUS(gcl_print_memory(handle, matrixB, "gemm_tn_b")); + CHECK_STATUS(gcl_print_memory(handle, matrixC, "gemm_tn_c")); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; + } + + if (!transposeA && transposeB) { + U32 KA = matrixA->desc.stride[0]; + U32 KB = matrixB->desc.stride[0]; + U32 K = (aw + item_c - 1) / item_c * item_c; + U32 ow_str = matrixC->desc.stride[0]; + U32 A_str = KA * matrixA->desc.stride[1]; + U32 B_str = KB * matrixB->desc.stride[1]; + U32 C_str = ow_str * matrixC->desc.stride[1]; + U32 batch = ac; + gs[0] = (bh + item_w - 1) / item_w; + gs[1] = (ah + item_k - 1) / item_k; + gs[2] = batch; + sprintf(kernelname, "gemm_nt_nobias_%d%d%d", item_k, item_w, (item_c >> 1)); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs( + kernel, KA, KB, K, ow_str, A_str, B_str, C_str, 0, 0, bh, ah, gs[0], gs[1], A, B, C)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + CHECK_STATUS(gcl_print_memory(handle, matrixA, "gemm_nt_a")); + CHECK_STATUS(gcl_print_memory(handle, matrixB, "gemm_nt_b")); + 
CHECK_STATUS(gcl_print_memory(handle, matrixC, "gemm_nt_c")); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; + } + + if (transposeA && transposeB) { + if (matrixADesc.df != DF_MKT) { + CHECK_STATUS(NOT_SUPPORTED); + } + U32 m, k, t; + get_nlp_mkt_val(matrixADesc, NULL, &m, &k, &t); + if (t != 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + if (m != 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + if (aw != 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + if (ah != k) { + CHECK_STATUS(NOT_MATCH); + } + U32 KA = matrixA->desc.stride[2] * 4; + U32 KB = matrixB->desc.stride[0]; + U32 K = (ah + item_c - 1) / item_c * item_c; + U32 ow_str = matrixC->desc.stride[0]; + U32 batch = 1; + gs[0] = (bh + item_w - 1) / item_w; + gs[1] = 1; + gs[2] = batch; + sprintf(kernelname, "gemm_nt_nobias_%d%d%d", item_k, item_w, (item_c >> 1)); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs( + kernel, KA, KB, K, ow_str, 0, 0, 0, 0, 0, bh, 1, gs[0], gs[1], A, B, C)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + CHECK_STATUS(gcl_print_memory(handle, matrixA, "gemm_nt_a")); + CHECK_STATUS(gcl_print_memory(handle, matrixB, "gemm_nt_b")); + CHECK_STATUS(gcl_print_memory(handle, matrixC, "gemm_nt_c")); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; + } + return NOT_SUPPORTED; +} + +EE matmul_infer_forward_tmp_bytes_mali_fp16(TensorDesc matrixADesc, + bool transposeA, + TensorDesc matrixBDesc, + bool transposeB, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo) +{ + UNUSED(matrixADesc); + UNUSED(transposeA); + UNUSED(matrixBDesc); + UNUSED(transposeB); + UNUSED(forwardRunInfo); + *bytes = 0; + return SUCCESS; +} + +EE matmul_mali_fp16(GCLHandle_t handle, + TensorDesc matrixADesc, + bool transposeA, + const GCLMem_t matrixA, + TensorDesc matrixBDesc, + bool transposeB, + const GCLMem_t matrixB, + GCLMem_t tmp, + TensorDesc matrixCDesc, + GCLMem_t matrixC, + ForwardRunInfoMali_t forwardRunInfo) +{ + CHECK_STATUS(matmul_checkpara_mali_fp16(matrixADesc, matrixBDesc, matrixCDesc)); + CHECK_STATUS(fill_output_zero(handle, matrixC, matrixCDesc)); + CHECK_STATUS(matmul_core_mali_fp16(handle, matrixADesc, transposeA, matrixA, matrixBDesc, + transposeB, matrixB, tmp, matrixCDesc, matrixC, forwardRunInfo)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/matmul_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/matmul_mali_fp16.h new file mode 100644 index 00000000..3d717463 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/matmul_mali_fp16.h @@ -0,0 +1,39 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _MATMUL_MALI_FP16 +#define _MATMUL_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE matmul_infer_forward_tmp_bytes_mali_fp16(TensorDesc matrixADesc, + bool transposeA, + TensorDesc matrixBDesc, + bool transposeB, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo); + +EE matmul_mali_fp16(GCLHandle_t handle, + TensorDesc matrixADesc, + bool transposeA, + const GCLMem_t matrixA, + TensorDesc matrixBDesc, + bool transposeB, + const GCLMem_t matrixB, + GCLMem_t tmp, + TensorDesc matrixCDesc, + GCLMem_t matrixC, + ForwardRunInfoMali_t forwardRunInfo); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/multihead_attention_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/multihead_attention_mali_fp16.cpp new file mode 100644 index 00000000..4a4e38c6 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/multihead_attention_mali_fp16.cpp @@ -0,0 +1,907 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
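In the TN branch of `matmul_core_mali_fp16` above, matrix A is already stored transposed (K rows of padded length M) and B is stored K x N, so the kernel computes C = A^T * B without any data rearrangement. A dense CPU reference of that contraction (batching, padding and the `ow_str`/`*_str` strides of the GPU buffers are left out of this sketch):

    // C[m][n] = sum_k A[k][m] * B[k][n]; A is K x M (transposed), B is K x N,
    // C is M x N, all dense row-major. Reference sketch only.
    void gemm_tn_reference(const float *A, const float *B, float *C,
        unsigned M, unsigned N, unsigned K)
    {
        for (unsigned m = 0; m < M; m++) {
            for (unsigned n = 0; n < N; n++) {
                float acc = 0.0f;
                for (unsigned k = 0; k < K; k++) {
                    acc += A[k * M + m] * B[k * N + n];
                }
                C[m * N + n] = acc;
            }
        }
    }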
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/multihead_attention_mali_fp16.h" + +#define set_best_wkc(fc_bw, fc_bk, tn_bw, tn_bk, nt_bw, nt_bk, nt_bc, runInfo) \ + { \ + U32 *best_w = runInfo->best_w; \ + U32 *best_k = runInfo->best_k; \ + U32 *best_c = runInfo->best_c; \ + fc_bw[0] = best_w[0]; \ + fc_bw[1] = best_w[3]; \ + fc_bw[2] = best_w[4]; \ + fc_bw[3] = best_w[5]; \ + fc_bk[0] = best_k[0]; \ + fc_bk[1] = best_k[3]; \ + fc_bk[2] = best_k[4]; \ + fc_bk[3] = best_k[5]; \ + tn_bw = best_w[1]; \ + tn_bk = best_k[1]; \ + nt_bw = best_w[2]; \ + nt_bk = best_k[2]; \ + nt_bc = best_c[2]; \ + } + +#define set_mem_flag(ln_out_flag, fc_out_flag, tn_out_flag, nt_out_flag) \ + { \ + ln_out_flag[0] = 2; \ + fc_out_flag[0] = 0; \ + tn_out_flag = 1; \ + nt_out_flag = 0; \ + fc_out_flag[1] = 1; \ + ln_out_flag[1] = 0; \ + fc_out_flag[2] = 2; \ + } + +#define get_subbuf_size(dt, ln_out_w, ln_out_h, ln_out_flag, fc_out_w, fc_out_h, fc_out_flag, \ + tn_out_w, tn_out_h, tn_out_c, tn_out_flag, nt_out_w, nt_out_h, nt_out_c, nt_out_flag, sub_size) \ + { \ + U32 size; \ + for (U32 i = 0; i < 3; i++) \ + sub_size[i] = 0; \ + for (U32 i = 0; i < 2; i++) { \ + size = ln_out_w[i] * ln_out_h[i] * bytesOf(dt); \ + if (size > sub_size[ln_out_flag[i]]) \ + sub_size[ln_out_flag[i]] = size; \ + } \ + for (U32 i = 0; i < 3; i++) { \ + size = fc_out_w[i] * fc_out_h[i] * bytesOf(dt); \ + if (size > sub_size[fc_out_flag[i]]) \ + sub_size[fc_out_flag[i]] = size; \ + } \ + size = tn_out_w * tn_out_h * tn_out_c * bytesOf(dt); \ + if (size > sub_size[tn_out_flag]) \ + sub_size[tn_out_flag] = size; \ + size = nt_out_w * nt_out_h * nt_out_c * bytesOf(dt); \ + if (size > sub_size[nt_out_flag]) \ + sub_size[nt_out_flag] = size; \ + } + +#define get_ln0_out_wh(t, k, fc_bw, ow, oh, useEltIn) \ + { \ + ow = ALIGN(t, fc_bw[0]); \ + if (!useEltIn[0]) \ + ow = (ow > ALIGN(t, fc_bw[1])) ? ow : ALIGN(t, fc_bw[1]); \ + oh = ALIGN(k, 4); \ + } + +#define get_fc0_out_wh(t, k, fc_bw, fc_bk, tn_bw, tn_bk, nt_bc, ow, oh) \ + { \ + ow = ALIGN(t, fc_bw[0]); \ + oh = ALIGN(k, fc_bk[0]); \ + ow = (ow > ALIGN(t, tn_bw)) ? ow : ALIGN(t, tn_bw); \ + ow = (ow > ALIGN(t, tn_bk)) ? ow : ALIGN(t, tn_bk); \ + ow = (ow > ALIGN(t, nt_bc)) ? ow : ALIGN(t, nt_bc); \ + } + +#define get_tn_sf_out_whc(Aw, Bw, t, k, sliceLen, nt_bw, nt_bc, ow, oh, oc) \ + { \ + ow = Bw; \ + oh = Aw; \ + oc = k / sliceLen; \ + ow = (ow > ALIGN(t, 4)) ? ow : ALIGN(t, 4); \ + ow = (ow > ALIGN(t, nt_bc)) ? ow : ALIGN(t, nt_bc); \ + oh = (oh > ALIGN(t, nt_bw)) ? oh : ALIGN(t, nt_bw); \ + } + +#define get_nt_out_whc(Ah, Bh, t, k, sliceLen, fc_bw, ow, oh, oc) \ + { \ + ow = Bh; \ + oh = Ah; \ + oc = k / sliceLen; \ + ow = (ow > ALIGN(t, fc_bw[1])) ? ow : ALIGN(t, fc_bw[1]); \ + if (sliceLen != oh) \ + CHECK_STATUS(NOT_MATCH); \ + } + +#define get_fc1_out_wh(Bw, t, k, fc_bw, fc_bk, ow, oh) \ + { \ + ow = Bw; \ + oh = ALIGN(k, fc_bk[1]); \ + ow = (ow > ALIGN(t, fc_bw[2])) ? ow : ALIGN(t, fc_bw[2]); \ + ow = (ow > ALIGN(t, fc_bw[3])) ? ow : ALIGN(t, fc_bw[3]); \ + } + +#define get_fc2_out_wh(Bw, t, k, fc_bw, fc_bk, ow, oh) \ + { \ + ow = Bw; \ + oh = ALIGN(k, fc_bk[2]); \ + ow = (ow > ALIGN(t, fc_bw[3])) ? 
ow : ALIGN(t, fc_bw[3]); \ + } + +inline void fill_zero_nchw(GCLHandle_t handle, U32 len, U32 offset, Mem buf) +{ + char kernelName[128]; + Kernel kernel; + sprintf(kernelName, "fill_memory_zero_vec4_f16"); + U32 gs = (len + 3) / 4; + U32 ls = 0; + U32 dim = 1; + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, len, offset, gs, buf)); + gcl_set_kernelVec(handle, kernel, dim, &gs, &ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs, &ls, kernelName)); +#endif +} + +inline void layer_norm(GCLHandle_t handle, + U32 len, + U32 on, + U32 ih_str, + U32 ic_str, + U32 ih_off, + U32 iw_off, + U32 oh_str, + Mem alpbuf, + Mem betbuf, + Mem in, + Mem out, + bool USE_C1 = false) +{ + U32 gs = len; + U32 ls = 0; + U32 dim = 1; + float para = 1.0 / on; + Kernel kernel; + if (USE_C1) { + CHECK_STATUS(gcl_create_kernel(handle, "normalization_c1", &kernel)); + } else { + CHECK_STATUS(gcl_create_kernel(handle, "normalization", &kernel)); + } + CHECK_STATUS(gcl_set_kernelArgs( + kernel, len, ih_str, ic_str, ih_off, iw_off, oh_str, 0, 0, para, alpbuf, betbuf, in, out)); + gcl_set_kernelVec(handle, kernel, dim, &gs, &ls, "normalization_c1"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs, &ls, "normalization_c1")); +#endif +} + +inline void inner_product_c1(GCLHandle_t handle, + U32 M, + U32 N, + U32 K, + U32 ow_str, + U32 ow, + U32 oh, + U32 item_w, + U32 item_k, + Mem A, + Mem B, + Mem bias, + Mem C) +{ + /*output is c1*/ + U32 ow_align = ALIGN(ow, item_w); + U32 oh_align = ALIGN(oh, item_k); + U32 gs[2] = {ow_align / item_w, oh_align / item_k}; + U32 ls[2] = {0, 0}; + U32 dim = 2; + Kernel kernel; + char kernelName[128]; + sprintf(kernelName, "gemm_tn_%d%d", item_k, item_w); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, M, N, K, ow_str, ow, oh, gs[0], gs[1], A, B, bias, C)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); +#endif +} + +inline void inner_product_with_eltwise_c4(GCLHandle_t handle, + U32 M, + U32 N, + U32 K, + U32 ow_str, + U32 ow, + U32 oh, + U32 item_w, + U32 item_k, + bool useLayerNormIn, + U32 ew_str, + Mem A, + Mem B, + Mem bias, + Mem C, + Mem elt) +{ + /*output is c4*/ + U32 ow_align = ALIGN(ow, item_w); + U32 oh_align = ALIGN(oh, item_k); + U32 gs[2] = {ow_align / item_w, oh_align / item_k}; + U32 ls[2] = {0, 0}; + U32 dim = 2; + Kernel kernel; + char kernelName[128]; + if (useLayerNormIn) { + sprintf(kernelName, "gemm_tn_eltwise4_ncwhc4_%d%d", item_k, item_w); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, M, N, K, ow, 1, oh, ow_str, 1, ow_str, 0, 0, gs[0], + gs[1], A, B, bias, C, ew_str, 1, ew_str, 0, 0, elt)); + } else { + sprintf(kernelName, "gemm_tn_eltwise1_ncwhc4_%d%d", item_k, item_w); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, M, N, K, ow, 1, oh, ow_str, 1, ow_str, 0, 0, gs[0], + gs[1], A, B, bias, C, ew_str, 0, 0, elt)); + } + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); +#endif +} + +inline void inner_product_ncwhc4(GCLHandle_t handle, + U32 iw_str, + U32 ic_str, + U32 fn, + U32 ow_str, + U32 oh_off, + U32 ow_off, + U32 ow, + U32 item_w, + U32 item_k, + ActivationMode activation, + 
bool useEltwise, + Mem in, + Mem flt, + Mem bias, + Mem out, + U32 ew_str, + Mem elt) +{ + U32 ow_align = ALIGN(ow, item_w); + U32 gs[3] = {1, ow_align / item_w, fn / item_k}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + char kernelName[128]; + char modeName[128]; + if (useEltwise) { + strcpy(modeName, "eltwise4_"); + } else { + switch (activation) { + case ACTIVATION_RELU: + strcpy(modeName, "relu_"); + break; + case ACTIVATION_GELU: + strcpy(modeName, "gelu_"); + break; + case ACTIVATION_NULL: + strcpy(modeName, ""); + break; + default: + CHECK_STATUS(NOT_SUPPORTED); + } + } + item_k = item_k >> 2; + sprintf(kernelName, "conv_direct_s%d_%s%d%d%d", 1, modeName, 1, item_w, item_k); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + if (useEltwise) { + CHECK_STATUS(gcl_set_kernelArgs(kernel, 1, iw_str, ic_str, 0, 0, 1, ow_str, oh_off, ow_off, + ow, 1, gs[0], gs[1], in, flt, bias, out, 1, ew_str, 0, 0, elt)); + } else { + CHECK_STATUS(gcl_set_kernelArgs(kernel, 1, iw_str, ic_str, 0, 0, 1, ow_str, oh_off, ow_off, + ow, 1, gs[0], gs[1], in, flt, bias, out)); + } + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); +#endif +} + +inline void matmul_tn_c1(GCLHandle_t handle, + U32 M, + U32 N, + U32 K, + U32 ow_str, + U32 A_str, + U32 B_str, + U32 C_str, + U32 A_off, + U32 B_off, + U32 ow, + U32 oh, + U32 item_w, + U32 item_k, + U32 batch, + float alp, + float bet, + Mem A, + Mem B, + Mem C) +{ + /*output is c1*/ + U32 ow_align = ALIGN(ow, item_w); + U32 oh_align = ALIGN(oh, item_k); + U32 gs[3] = {ow_align / item_w, oh_align / item_k, batch}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + char kernelName[128]; + sprintf(kernelName, "gemm_tn_nobias_%d%d", item_k, item_w); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, M, N, K, ow_str, A_str, B_str, C_str, A_off, B_off, ow, + oh, gs[0], gs[1], alp, bet, A, B, C)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); +#endif +} + +inline void matmul_nt_c1(GCLHandle_t handle, + U32 KA, + U32 KB, + U32 K, + U32 ow_str, + U32 A_str, + U32 B_str, + U32 C_str, + U32 A_off, + U32 B_off, + U32 ow, + U32 oh, + U32 item_w, + U32 item_k, + U32 item_c, + U32 batch, + Mem A, + Mem B, + Mem C) +{ + /*output is c1*/ + U32 ow_align = ALIGN(ow, item_w); + U32 oh_align = ALIGN(oh, item_k); + U32 gs[3] = {ow_align / item_w, oh_align / item_k, batch}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + char kernelName[128]; + sprintf(kernelName, "gemm_nt_nobias_%d%d%d", item_k, item_w, (item_c >> 1)); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, KA, KB, K, ow_str, A_str, B_str, C_str, A_off, B_off, + ow, oh, gs[0], gs[1], A, B, C)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); +#endif +} + +inline void softmax_w(GCLHandle_t handle, + U32 iw, + U32 ih, + U32 ic, + U32 iw_str, + U32 ih_str, + U32 iw_off, + U32 ih_off, + U32 ow_str, + U32 oh_str, + U32 ow_off, + U32 oh_off, + Mem in, + Mem out) +{ + U32 gs[2] = {ih, ic}; + U32 ls[2] = {0, 0}; + U32 dim = 2; + U32 iwd4 = (iw + 3) >> 2; + U32 iwe4 = ((iw & 3) == 0) ? 
4 : (iw & 3); + Kernel kernel; + char kernelName[128]; + sprintf(kernelName, "softmax_nchw_w"); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iwd4, iwe4, iw_str, ih_str, iw_off, ih_off, ow_str, + oh_str, ow_off, oh_off, gs[0], gs[1], in, out)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); +#endif +} + +inline EE multihead_attention_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + std::vector filterDesc, + std::vector filter, + std::vector biasDesc, + std::vector bias, + std::vector layerNormAlpha, + std::vector layerNormBeta, + void *multiplyAlpha, + void *multiplyBeta, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + std::vector eltwiseWithLayerNormIn, + ActivationMode activation, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ForwardRunInfoMali_t forwardRunInfo) +{ + DataType dt; + U32 m, k, t; + get_nlp_mkt_val(inputDesc, &dt, &m, &k, &t); + U32 ih_str, ic_str, ih_off, iw_off; + get_gclmem_dim(input->desc, NULL, &ih_str, &ic_str, &iw_off, &ih_off); + + U32 oh_str, oc_str, oh_off, ow_off; + get_gclmem_dim(output->desc, NULL, &oh_str, &oc_str, &ow_off, &oh_off); + U32 fn[4]; + for (U32 i = 0; i < filterDesc.size(); i++) { + tensorSelectGet(filterDesc[i], NULL, NULL, &fn[i], NULL, NULL, NULL); + } + + U32 fc_bw[4]; + U32 fc_bk[4]; + U32 tn_bw, tn_bk; + U32 nt_bw, nt_bk, nt_bc; + set_best_wkc(fc_bw, fc_bk, tn_bw, tn_bk, nt_bw, nt_bk, nt_bc, forwardRunInfo); + + U32 ln_out_flag[2]; + U32 fc_out_flag[3]; + U32 tn_out_flag, nt_out_flag; + set_mem_flag(ln_out_flag, fc_out_flag, tn_out_flag, nt_out_flag); + + U32 ln_out_w[2]; + U32 ln_out_h[2]; + U32 fc_out_w[3]; + U32 fc_out_h[3]; + U32 tn_out_w, tn_out_h, tn_out_c; + U32 nt_out_w, nt_out_h, nt_out_c; + + get_ln0_out_wh(t, k, fc_bw, ln_out_w[0], ln_out_h[0], eltwiseWithLayerNormIn); + get_fc0_out_wh(t, fn[0], fc_bw, fc_bk, tn_bw, tn_bk, nt_bc, fc_out_w[0], fc_out_h[0]); + U32 Aw = ALIGN(t, tn_bk); + U32 Bw = ALIGN(t, tn_bw); + get_tn_sf_out_whc( + Aw, Bw, t, firstFCSliceNum[0], matmulSliceLen, nt_bw, nt_bc, tn_out_w, tn_out_h, tn_out_c); + + U32 Ah = ALIGN(matmulSliceLen, nt_bk); + U32 Bh = ALIGN(t, nt_bw); + get_nt_out_whc( + Ah, Bh, t, firstFCSliceNum[2], matmulSliceLen, fc_bw, nt_out_w, nt_out_h, nt_out_c); + + Bw = ALIGN(t, fc_bw[1]); + get_fc1_out_wh(Bw, t, fn[1], fc_bw, fc_bk, fc_out_w[1], fc_out_h[1]); + + ln_out_w[1] = fc_out_w[1]; + ln_out_h[1] = fc_out_h[1]; + + Bw = ALIGN(t, fc_bw[2]); + get_fc2_out_wh(Bw, t, fn[2], fc_bw, fc_bk, fc_out_w[2], fc_out_h[2]); + + U32 offset = 0; + U32 sub_size[3]; + get_subbuf_size(dt, ln_out_w, ln_out_h, ln_out_flag, fc_out_w, fc_out_h, fc_out_flag, tn_out_w, + tn_out_h, tn_out_c, tn_out_flag, nt_out_w, nt_out_h, nt_out_c, nt_out_flag, sub_size); + + Mem ln_out_mem[2]; + Mem fc_out_mem[3]; + Mem tn_out_mem, nt_out_mem; + Mem subBuf[3]; + CHECK_STATUS(gcl_create_sub_buffer(sub_size[0], &offset, tmpBuf, &subBuf[0])); + CHECK_STATUS(gcl_create_sub_buffer(sub_size[1], &offset, tmpBuf, &subBuf[1])); + CHECK_STATUS(gcl_create_sub_buffer(sub_size[2], &offset, tmpBuf, &subBuf[2])); + + for (U32 i = 0; i < 2; i++) { + ln_out_mem[i] = subBuf[ln_out_flag[i]]; + } + for (U32 i = 0; i < 3; i++) { + fc_out_mem[i] = subBuf[fc_out_flag[i]]; + } + tn_out_mem = subBuf[tn_out_flag]; + nt_out_mem = subBuf[nt_out_flag]; + + /* STAGE0: layerNorm + * INPUT (X, 78) C4 + * OUTPUT (X, 312) C1 --> X 
align to best_w[0] + */ + Mem stage0LNIn = input->mem; + Mem stage0LNAlp = ((GCLMem_t)(layerNormAlpha[0]))->mem; + Mem stage0LNBet = ((GCLMem_t)(layerNormBeta[0]))->mem; + + layer_norm(handle, t, k, ih_str, ic_str, ih_off, iw_off, ln_out_w[0], stage0LNAlp, stage0LNBet, + stage0LNIn, ln_out_mem[0], true); + + /* STAGE1: InnerProduct + * TN GEMM + * weight(T) (932, 312) * stage0LNOut(N) (X, 312) + * GPU: + * weight W : 932 -> 312 * 3 + * weight H: 312 + * OUTPUT: + * mat_q: (X, 312) --> (Xq, 26, 12) + * mat_k: (X, 312) --> (Xk, 26, 12) + * mat_v: (X, 312) --> (Xv, 26, 12) + * Xq = Xk = Xv + + * mat_q * mat_k(TN) --->mat_qk(Xk, Xq, 12) + * mat_q --> Xq X align to best_k[1] + * mat_k --> Xk X align to best_w[1] + * mat_qk --> Xqk_w X align to best_c[0] + * mat_qk --> Xqk_h X align to best_w[2] + + * mat_v * mat_qk(NT) -->mat_vqk(Xq, 7, 12) + * mat_v --> Xv X align to best_c[0]; + * mat_v --> 26 26 align to best_k[2](require 26 % best_k[2] = 0); + + * Stage1: + * OUTPUT + * dim0: max(Xq align best_k[1], Xk align best_w[1], Xv align to best_c[0]) + * dim1: 312 + 312 + 312 + * INPUT: + * A(dim1 align to best_k[0], 312) B(X align to best_w[0]) + */ + + U32 M = ((GCLMem_t)(filter[0]))->desc.stride[0]; + U32 K = ((GCLMem_t)(filter[0]))->desc.stride[1]; + U32 N = ln_out_w[0]; + Mem stage1MatA = ((GCLMem_t)(filter[0]))->mem; + Mem stage1MatB = ln_out_mem[0]; + Mem stage1Bias = ((GCLMem_t)(bias[0]))->mem; + if (N < ALIGN(t, nt_bc)) { + U32 off = (firstFCSliceNum[0] + firstFCSliceNum[1]) * fc_out_w[0]; + U32 len = fc_out_w[0] * fc_out_h[0] - off; + fill_zero_nchw(handle, len, off, fc_out_mem[0]); + } + inner_product_c1(handle, M, N, K, fc_out_w[0], t, M, fc_bw[0], fc_bk[0], stage1MatA, stage1MatB, + stage1Bias, fc_out_mem[0]); + + /* Stage2: Matmul mat_q * mat_k + * TN GEMM + * INPUT: mat_q(Xq, 26, 12) mat_k (Xk, 26, 12); + * Xq X align to best_k[1] + * Xk X align to best_w[1] + * Use stride Xmax + * Output: mat_qk(Xqk_w, Xqk_h, 12) + * Xqk_w X align to best_c[0](Xk) + * Xqk_h X align to best_w[2](Xq) + */ + + M = fc_out_w[0]; + N = fc_out_w[0]; + K = matmulSliceLen; + Mem stage2MatA = fc_out_mem[0]; + Mem stage2MatB = fc_out_mem[0]; + Aw = ALIGN(t, tn_bk); + Bw = ALIGN(t, tn_bw); + if (tn_out_w > Aw || tn_out_h > Bw) { + U32 len = tn_out_w * tn_out_h * tn_out_c; + fill_zero_nchw(handle, len, 0, tn_out_mem); + } + U32 A_str = matmulSliceLen * M; + U32 B_str = matmulSliceLen * N; + U32 C_str = tn_out_w * tn_out_h; + U32 A_off = 0; + U32 B_off = firstFCSliceNum[0] * fc_out_w[0]; + float *mulAlp = (float *)multiplyAlpha; + float *mulBet = (float *)multiplyBeta; + matmul_tn_c1(handle, M, N, K, tn_out_w, A_str, B_str, C_str, A_off, B_off, t, t, tn_bw, tn_bk, + tn_out_c, *mulAlp, *mulBet, stage2MatA, stage2MatB, tn_out_mem); + + /* STAGE3: Softmax on w for mat_qk */ + softmax_w(handle, t, t, tn_out_c, tn_out_w, tn_out_h, 0, 0, tn_out_w, tn_out_h, 0, 0, + tn_out_mem, tn_out_mem); + + /* STAGE4: Matmul mat_v * mat_qk + * NT GEMM + * INPUT: mat_v(Xv, 26, 12) mat_qk(Xqk_w, Xqk_h, 12) + * Xv X align to best_c[0] + * 26 align to best_k[2] + * Xqk_w align to best_c[0] + * Xqk_h align to best_w[2] + * OUTPUT: mat_vqk(Xvqk, 26, 12) + * Xvqk X align to best_w[3] + * set 26 divided by best_k[2], for next step + */ + U32 KA = fc_out_w[0]; + U32 KB = tn_out_w; + Mem stage4MatA = fc_out_mem[0]; + Mem stage4MatB = tn_out_mem; + K = ALIGN(t, nt_bc); + A_str = KA * matmulSliceLen; + B_str = tn_out_w * tn_out_h; + C_str = nt_out_w * nt_out_h; + A_off = (firstFCSliceNum[0] + firstFCSliceNum[1]) * KA; + B_off = 0; + 
matmul_nt_c1(handle, KA, KB, K, nt_out_w, A_str, B_str, C_str, A_off, B_off, t, matmulSliceLen, + nt_bw, nt_bk, nt_bc, nt_out_c, stage4MatA, stage4MatB, nt_out_mem); + + /* STAGE5: Innerproduct + * TN GEMM + * weight(T) (312, 312) stage4MatC(Xvqk, 312) + * weight w 312 align to best_k[3] + * Xvqk align to best_w[3], use stride Xvqk_max_w + * Output: stage5MatC + * use ncwhc4 for layer normal + * (Xi5, 312) + * Xi5, X align to best_w[4] + */ + + M = ((GCLMem_t)filter[1])->desc.stride[0]; + K = ((GCLMem_t)filter[1])->desc.stride[1]; + N = nt_out_w; + Mem stage5MatA = ((GCLMem_t)filter[1])->mem; + Mem stage5MatB = nt_out_mem; + Mem stage5Bias = ((GCLMem_t)bias[1])->mem; + U32 ew_str = (eltwiseWithLayerNormIn[0]) ? ih_str : ln_out_w[0]; + Mem elt = (eltwiseWithLayerNormIn[0]) ? stage0LNIn : ln_out_mem[0]; + inner_product_with_eltwise_c4(handle, M, N, K, fc_out_w[1], t, fn[1], fc_bw[1], fc_bk[1], + eltwiseWithLayerNormIn[0], ew_str, stage5MatA, stage5MatB, stage5Bias, fc_out_mem[1], elt); + + /* STAGE6: LayerNorm + */ + Mem stage6LNAlp = ((GCLMem_t)(layerNormAlpha[1]))->mem; + Mem stage6LNBet = ((GCLMem_t)(layerNormBeta[1]))->mem; + layer_norm(handle, t, fn[1], fc_out_w[1], (fn[1] + 3) / 4, 0, 0, ln_out_w[1], stage6LNAlp, + stage6LNBet, fc_out_mem[1], ln_out_mem[1]); + + /* STAGE7: Innerproduct with relu + */ + Mem stage7Flt = ((GCLMem_t)filter[2])->mem; + Mem stage7In = ln_out_mem[1]; + Mem stage7Bias = ((GCLMem_t)bias[2])->mem; + inner_product_ncwhc4(handle, ln_out_w[1], (fn[1] + 3) / 4, fn[2], fc_out_w[2], 0, 0, t, fc_bw[2], + fc_bk[2], activation, false, stage7In, stage7Flt, stage7Bias, fc_out_mem[2], 0, NULL); + + /*STAGE8: Innerproduct with eltwise + */ + M = ((GCLMem_t)(filter[3]))->desc.stride[0]; + K = ((GCLMem_t)(filter[3]))->desc.stride[1]; + N = fc_out_w[2]; + Mem stage8Flt = ((GCLMem_t)filter[3])->mem; + Mem stage8In = fc_out_mem[2]; + Mem stage8Bias = ((GCLMem_t)bias[3])->mem; + ew_str = (eltwiseWithLayerNormIn[1]) ? fc_out_w[1] : ln_out_w[1]; + Mem elt2 = (eltwiseWithLayerNormIn[1]) ? 
fc_out_mem[1] : ln_out_mem[1]; + inner_product_ncwhc4(handle, fc_out_w[2], (fn[2] + 3) / 4, fn[3], oh_str, oh_off, ow_off, t, + fc_bw[3], fc_bk[3], ACTIVATION_NULL, true, stage8In, stage8Flt, stage8Bias, output->mem, + ew_str, elt2); + return SUCCESS; +} + +inline EE multihead_attention_checkpara_mali_fp16( + TensorDesc inputDesc, std::vector filterDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_F16) { + return NOT_MATCH; + } + for (U32 i = 0; i < filterDesc.size(); i++) { + if (filterDesc[i].dt != DT_F16) { + return NOT_SUPPORTED; + } + } + return SUCCESS; +} + +EE multihead_attention_transform_filter_bytes_mali_fp16(std::vector filterDesc, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo) +{ + U32 fc_bk[4]; + U32 fc_bc[4]; + fc_bk[0] = forwardRunInfo->best_k[0]; + fc_bk[1] = forwardRunInfo->best_k[3]; + fc_bk[2] = forwardRunInfo->best_k[4]; + fc_bk[3] = forwardRunInfo->best_k[5]; + fc_bc[0] = forwardRunInfo->best_c[0]; + fc_bc[1] = forwardRunInfo->best_c[3]; + fc_bc[2] = forwardRunInfo->best_c[4]; + fc_bc[3] = forwardRunInfo->best_c[5]; + for (U32 i = 0; i < 2; i++) { + U32 fn, fc, fh, fw; + U32 s0, s1, s2; + U32 num; + DataType dt = filterDesc[i].dt; + tensorSelectGet(filterDesc[i], NULL, NULL, &fn, &fc, &fh, &fw); + if (fh != 1 || fw != 1) { + CHECK_STATUS(NOT_MATCH); + } + s0 = ALIGN(fn, fc_bk[i]); + s1 = ALIGN(fc, 4); + s2 = 1; + num = s0 * s1 * s2; + gclmemFilterDesc[i].stride[0] = s0; + gclmemFilterDesc[i].stride[1] = s1; + gclmemFilterDesc[i].stride[2] = s2; + gclmemFilterDesc[i].offset[0] = 0; + gclmemFilterDesc[i].offset[1] = 0; + gclmemFilterDesc[i].offset[2] = 0; + gclmemFilterDesc[i].num = num; + gclmemFilterDesc[i].memFormat = DF_NCHW; + gclmemFilterDesc[i].byteSize = num * bytesOf(dt); + gclmemFilterDesc[i].memType = GCL_MEM_BUF; + gclmemFilterDesc[i].flags = CL_MEM_READ_WRITE; + gclmemFilterDesc[i].host_ptr = NULL; + } + for (U32 i = 2; i < filterDesc.size(); i++) { + U32 fn, fc, fh, fw; + U32 s0, s1, s2; + U32 num; + DataType dt = filterDesc[i].dt; + tensorSelectGet(filterDesc[i], NULL, NULL, &fn, &fc, &fh, &fw); + if (fh != 1 || fw != 1) { + CHECK_STATUS(NOT_MATCH); + } + s0 = fc_bk[i] >> 2; + s1 = (fc + fc_bc[i] - 1) / fc_bc[i]; + s2 = (fn + fc_bk[i] - 1) / fc_bk[i]; + num = s0 * s1 * s2 * fc_bc[i] * fc_bk[i] / (fc_bk[i] >> 2); + gclmemFilterDesc[i].stride[0] = s0; + gclmemFilterDesc[i].stride[1] = s1; + gclmemFilterDesc[i].stride[2] = s2; + gclmemFilterDesc[i].offset[0] = 0; + gclmemFilterDesc[i].offset[1] = 0; + gclmemFilterDesc[i].offset[2] = 0; + gclmemFilterDesc[i].num = num; + gclmemFilterDesc[i].memFormat = DF_NCWHN4C4; + gclmemFilterDesc[i].byteSize = num * bytesOf(dt); + gclmemFilterDesc[i].memType = GCL_MEM_BUF; + gclmemFilterDesc[i].flags = CL_MEM_READ_WRITE; + gclmemFilterDesc[i].host_ptr = NULL; + } + return SUCCESS; +} + +EE multihead_attention_transform_filter_mali_fp16(GCLHandle_t handle, + std::vector filterDesc, + std::vector filter, + std::vector *fltmemDesc, + std::vector fltmem, + ForwardRunInfoMali_t forwardRunInfo) +{ + U32 fc_bk[4]; + U32 fc_bc[4]; + + fc_bk[0] = forwardRunInfo->best_k[0]; + fc_bk[1] = forwardRunInfo->best_k[3]; + fc_bk[2] = forwardRunInfo->best_k[4]; + fc_bk[3] = forwardRunInfo->best_k[5]; + fc_bc[0] = forwardRunInfo->best_c[0]; + fc_bc[1] = forwardRunInfo->best_c[3]; + fc_bc[2] = forwardRunInfo->best_c[4]; + fc_bc[3] = forwardRunInfo->best_c[5]; + char kernelname[128]; + Kernel kernel; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + U32 
filterNum = filterDesc.size(); + if (filterNum != filter.size() || filterNum != fltmemDesc->size() || filterNum != fltmem.size()) { + CHECK_STATUS(NOT_MATCH); + } + for (auto p : filterDesc) { + fltmemDesc->push_back(p); + } + U32 fwh = 1; + for (U32 i = 0; i < 2; i++) { + sprintf(kernelname, "conv_direct_trans_fltbuf_%d%d", 1, 0); + U32 fc, fn; + Mem flt_org = ((GCLMem_t)filter[i])->mem; + Mem flt_tra = ((GCLMem_t)fltmem[i])->mem; + tensorSelectGet(filterDesc[i], NULL, NULL, &fn, &fc, NULL, NULL); + CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, fwh, fc, fn, flt_org, flt_tra)); + gs[0] = fwh; + gs[1] = ALIGN(fc, 4); + gs[2] = ALIGN(fn, fc_bk[i]); + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + } + + for (U32 i = 2; i < filterDesc.size(); i++) { + sprintf(kernelname, "conv_direct_trans_fltbuf_%d%d", fc_bc[i], fc_bk[i]); + U32 fc, fn; + Mem flt_org = ((GCLMem_t)filter[i])->mem; + Mem flt_tra = ((GCLMem_t)fltmem[i])->mem; + tensorSelectGet(filterDesc[i], NULL, NULL, &fn, &fc, NULL, NULL); + CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, fwh, fc, fn, flt_org, flt_tra)); + gs[0] = fwh; + gs[1] = (fc + fc_bc[i] - 1) / fc_bc[i]; + gs[2] = ALIGN(fn, fc_bk[i]); + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + } + return SUCCESS; +} + +EE multihead_attention_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + std::vector filterDesc, + std::vector eltwiseWithLayerNormIn, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo) +{ + U32 fn[4]; + for (U32 i = 0; i < filterDesc.size(); i++) { + tensorSelectGet(filterDesc[i], NULL, NULL, &fn[i], NULL, NULL, NULL); + } + DataType dt; + U32 m, k, t; + get_nlp_mkt_val(inputDesc, &dt, &m, &k, &t); + U32 fc_bw[4]; + U32 fc_bk[4]; + U32 tn_bw, tn_bk; + U32 nt_bw, nt_bk, nt_bc; + set_best_wkc(fc_bw, fc_bk, tn_bw, tn_bk, nt_bw, nt_bk, nt_bc, forwardRunInfo); + + U32 ln_out_flag[2]; + U32 fc_out_flag[3]; + U32 tn_out_flag, nt_out_flag; + set_mem_flag(ln_out_flag, fc_out_flag, tn_out_flag, nt_out_flag); + + U32 ln_out_w[2]; + U32 ln_out_h[2]; + U32 fc_out_w[3]; + U32 fc_out_h[3]; + U32 tn_out_w, tn_out_h, tn_out_c; + U32 nt_out_w, nt_out_h, nt_out_c; + get_ln0_out_wh(t, k, fc_bw, ln_out_w[0], ln_out_h[0], eltwiseWithLayerNormIn); + get_fc0_out_wh(t, fn[0], fc_bw, fc_bk, tn_bw, tn_bk, nt_bc, fc_out_w[0], fc_out_h[0]); + U32 Aw = ALIGN(t, tn_bk); + U32 Bw = ALIGN(t, tn_bw); + get_tn_sf_out_whc( + Aw, Bw, t, firstFCSliceNum[0], matmulSliceLen, nt_bw, nt_bc, tn_out_w, tn_out_h, tn_out_c); + U32 Ah = ALIGN(matmulSliceLen, nt_bk); + U32 Bh = ALIGN(t, nt_bw); + get_nt_out_whc( + Ah, Bh, t, firstFCSliceNum[2], matmulSliceLen, fc_bw, nt_out_w, nt_out_h, nt_out_c); + Bw = ALIGN(t, fc_bw[1]); + get_fc1_out_wh(Bw, t, fn[1], fc_bw, fc_bk, fc_out_w[1], fc_out_h[1]); + ln_out_w[1] = fc_out_w[1]; + ln_out_h[1] = fc_out_h[1]; + Bw = ALIGN(t, fc_bw[2]); + get_fc2_out_wh(Bw, t, fn[2], fc_bw, fc_bk, fc_out_w[2], fc_out_h[2]); + + U32 sub_size[3]; + get_subbuf_size(dt, ln_out_w, ln_out_h, ln_out_flag, fc_out_w, fc_out_h, fc_out_flag, tn_out_w, + tn_out_h, tn_out_c, tn_out_flag, nt_out_w, nt_out_h, nt_out_c, nt_out_flag, sub_size); + *bytes = ALIGN(sub_size[0], 1024) + ALIGN(sub_size[1], 1024) + sub_size[2]; + return SUCCESS; +} + +EE multihead_attention_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + std::vector filterDesc, + 
std::vector filter, + std::vector biasDesc, + std::vector bias, + std::vector layerNormAlpha, + std::vector layerNormBeta, + void *multiplyAlpha, + void *multiplyBeta, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + std::vector eltwiseWithLayerNormIn, + ActivationMode activation, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ForwardRunInfoMali_t forwardRunInfo) +{ + CHECK_STATUS(multihead_attention_checkpara_mali_fp16(inputDesc, filterDesc, outputDesc)); + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS(multihead_attention_core_mali_fp16(handle, inputDesc, input, filterDesc, filter, + biasDesc, bias, layerNormAlpha, layerNormBeta, multiplyAlpha, multiplyBeta, firstFCSliceNum, + matmulSliceLen, eltwiseWithLayerNormIn, activation, tmpBytes, tmpBuf, outputDesc, output, + forwardRunInfo)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/multihead_attention_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/multihead_attention_mali_fp16.h new file mode 100644 index 00000000..d53b17b3 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/multihead_attention_mali_fp16.h @@ -0,0 +1,61 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
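The staged pipeline implemented above (STAGE0 through STAGE8) is the standard pre-norm transformer block: layer norm, fused QKV projection, scaled dot-product attention, output projection with a residual add, a second layer norm, and a two-layer feed-forward network. The attention core itself (STAGE2 through STAGE4) is easiest to follow against a plain reference. The sketch below is a minimal CPU analogue for one head, assuming row-major float arrays rather than the tiled, aligned NCWHC4 buffers the kernels actually use; alpha stands in for *multiplyAlpha (typically 1/sqrt(matmulSliceLen)), the zero multiplyBeta path is assumed, and all names are illustrative rather than part of the bolt API.

#include <algorithm>
#include <cmath>

// Minimal CPU analogue of STAGE2-STAGE4: S = softmax(alpha * Q * K^T), O = S * V.
// Q, K, V are (t x d) row-major slices for one head; S is (t x t), O is (t x d).
static void attention_head_ref(const float *Q, const float *K, const float *V,
                               float *S, float *O, int t, int d, float alpha)
{
    // STAGE2 (TN GEMM): S[i][j] = alpha * dot(Q_i, K_j)
    for (int i = 0; i < t; ++i) {
        for (int j = 0; j < t; ++j) {
            float acc = 0.0f;
            for (int p = 0; p < d; ++p) {
                acc += Q[i * d + p] * K[j * d + p];
            }
            S[i * t + j] = alpha * acc;
        }
    }
    // STAGE3: softmax over the keys for each query (one row of S per query)
    for (int i = 0; i < t; ++i) {
        float *row = S + i * t;
        float mx = *std::max_element(row, row + t);
        float sum = 0.0f;
        for (int j = 0; j < t; ++j) {
            row[j] = std::exp(row[j] - mx);
            sum += row[j];
        }
        for (int j = 0; j < t; ++j) {
            row[j] /= sum;
        }
    }
    // STAGE4 (NT GEMM): O = S * V
    for (int i = 0; i < t; ++i) {
        for (int p = 0; p < d; ++p) {
            float acc = 0.0f;
            for (int j = 0; j < t; ++j) {
                acc += S[i * t + j] * V[j * d + p];
            }
            O[i * d + p] = acc;
        }
    }
}

The GPU version pays for its layout: every operand is padded to the best_w/best_k/best_c tile sizes chosen by the tuner (hence the ALIGN calls and fill_zero_nchw guards above), so the scratch intermediates are strictly larger than the logical t x t and t x d matrices in this sketch.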
+ +#ifndef _MULTIHEAD_ATTENTION_MALI_FP16 +#define _MULTIHEAD_ATTENTION_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE multihead_attention_transform_filter_bytes_mali_fp16(std::vector filterDesc, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo); + +EE multihead_attention_transform_filter_mali_fp16(GCLHandle_t handle, + std::vector filterDesc, + std::vector filter, + std::vector *fltmemDesc, + std::vector fltmem, + ForwardRunInfoMali_t forwardRunInfo); + +EE multihead_attention_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + std::vector filterDesc, + std::vector eltwiseWithLayerNormIn, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo); + +EE multihead_attention_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + std::vector filterDesc, + std::vector filter, + std::vector biasDesc, + std::vector bias, + std::vector layerNormAlpha, + std::vector layerNormBeta, + void *multiplyAlpha, + void *multiplyBeta, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + std::vector eltwiseWithLayerNormIn, + ActivationMode activation, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ForwardRunInfoMali_t forwardRunInfo); +#endif diff --git a/tensor_computing/src/gpu/mali/fp16/normalization_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/normalization_mali_fp16.cpp similarity index 61% rename from tensor_computing/src/gpu/mali/fp16/normalization_mali_fp16.cpp rename to compute/tensor/src/gpu/mali/fp16/normalization_mali_fp16.cpp index a84c00fe..d35df88e 100644 --- a/tensor_computing/src/gpu/mali/fp16/normalization_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/normalization_mali_fp16.cpp @@ -11,27 +11,27 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
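One detail of the multihead-attention code above that is easy to miss: all intermediates live in a single caller-provided tmp buffer, carved into three sub-buffers, and the set_mem_flag indices simply map each intermediate (ln/fc/tn/nt outputs) onto one of the three regions so producers and consumers ping-pong between them. The sketch below mirrors that carving; the 1024-byte offset alignment is inferred from the *bytes formula in multihead_attention_infer_forward_tmp_bytes_mali_fp16, and gcl_create_sub_buffer's real contract may differ.

#include <cstdint>

struct SubBuf {
    uint32_t offset;  // byte offset inside the shared tmp buffer
    uint32_t size;    // usable bytes of this region
};

// Mirrors: *bytes = ALIGN(sub_size[0], 1024) + ALIGN(sub_size[1], 1024) + sub_size[2];
static uint32_t carve_tmp(const uint32_t size[3], SubBuf out[3])
{
    auto align1k = [](uint32_t v) { return (v + 1023) / 1024 * 1024; };
    uint32_t offset = 0;
    for (int i = 0; i < 3; ++i) {
        out[i] = {offset, size[i]};
        offset += (i < 2) ? align1k(size[i]) : size[i];
    }
    return offset;  // total bytes the caller must allocate for tmpBuf
}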
- #include "sys.h" -#include "type.h" -#include "tensor_desc.h" #include "error.h" -#include "tensor_computing_type.h" +#include "types.h" #include "gpu/mali/fp16/normalization_mali_fp16.h" -inline EE normalization_checkpara_mali_fp16(TensorDesc inputDesc, - TensorDesc outputDesc) { - if(inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_F16) return NOT_SUPPORTED; - return SUCCESS; +inline EE normalization_checkpara_mali_fp16(TensorDesc inputDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; } inline EE normalization_core_mali_fp16(GCLHandle_t handle, - GCLMem_t alpha, - GCLMem_t beta, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { + GCLMem_t alpha, + GCLMem_t beta, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output) +{ UNUSED(outputDesc); U32 step = inputDesc.dims[0]; U32 numOutput = inputDesc.dims[1]; @@ -45,38 +45,43 @@ inline EE normalization_core_mali_fp16(GCLHandle_t handle, oh_str = output->desc.stride[0]; oh_off = output->desc.offset[0]; ow_off = output->desc.offset[1]; - if(iw_str != 1 || ih_off != 0 || iw_off != 0) CHECK_STATUS(NOT_SUPPORTED); + if (iw_str != 1 || ih_off != 0 || iw_off != 0) { + CHECK_STATUS(NOT_SUPPORTED); + } cl_mem alpbuf, betbuf, inbuf, outbuf; alpbuf = alpha->mem; betbuf = beta->mem; - inbuf = input->mem; + inbuf = input->mem; outbuf = output->mem; U32 gs = step; U32 ls = 0; U32 dim = 1; + float para = 1.0 / numOutput; Kernel kernel; - CHECK_STATUS(gcl_create_kernel_binary(handle, "normalization", &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, step, numOutput, ih_str, ic_str, ih_off, iw_off, oh_str, oh_off, ow_off, alpbuf, betbuf, inbuf, outbuf)); + CHECK_STATUS(gcl_create_kernel(handle, "normalization", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, step, ih_str, ic_str, ih_off, iw_off, oh_str, oh_off, + ow_off, para, alpbuf, betbuf, inbuf, outbuf)); gcl_set_kernelVec(handle, kernel, dim, &gs, &ls, "normalization"); #ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, input, "normalization_input")); + CHECK_STATUS(gcl_print_memory(handle, input, "normalization_input")); CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs, &ls, "normalization")); CHECK_STATUS(gcl_print_memory(handle, output, "normalization_output")); #endif - return SUCCESS; + return SUCCESS; } - EE normalization_mali_fp16(GCLHandle_t handle, - GCLMem_t alpha, - GCLMem_t beta, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { + GCLMem_t alpha, + GCLMem_t beta, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output) +{ CHECK_STATUS(normalization_checkpara_mali_fp16(inputDesc, outputDesc)); - CHECK_STATUS(normalization_core_mali_fp16(handle, alpha, beta, inputDesc, input, outputDesc, output)); - return SUCCESS; + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS( + normalization_core_mali_fp16(handle, alpha, beta, inputDesc, input, outputDesc, output)); + return SUCCESS; } - diff --git a/tensor_computing/src/gpu/mali/fp16/normalization_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/normalization_mali_fp16.h similarity index 78% rename from tensor_computing/src/gpu/mali/fp16/normalization_mali_fp16.h rename to compute/tensor/src/gpu/mali/fp16/normalization_mali_fp16.h index 9d903e72..77ccd7a0 100644 --- a/tensor_computing/src/gpu/mali/fp16/normalization_mali_fp16.h +++ 
b/compute/tensor/src/gpu/mali/fp16/normalization_mali_fp16.h @@ -14,17 +14,15 @@ #ifndef _NORMALIZATION_MALI_FP16 #define _NORMALIZATION_MALI_FP16 #include "sys.h" -#include "type.h" -#include "tensor_desc.h" #include "error.h" +#include "types.h" #include "tensor_computing_type.h" EE normalization_mali_fp16(GCLHandle_t handle, - GCLMem_t alpha, - GCLMem_t beta, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output); -#endif - + GCLMem_t alpha, + GCLMem_t beta, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/padding_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/padding_mali_fp16.cpp new file mode 100644 index 00000000..60ea7859 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/padding_mali_fp16.cpp @@ -0,0 +1,168 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
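The normalization kernel reworked above now receives para = 1.0 / numOutput instead of numOutput itself, folding the division out of the kernel's inner loop. What it computes is a standard layer normalization per step. Below is a hedged CPU sketch under the same layout assumptions (dims[0] = step is innermost, dims[1] = numOutput, column-contiguous as the iw_str == 1 check requires); the epsilon guard is an assumption, since the .cl source is not part of this hunk.

#include <cmath>

// Per-step layer norm: normalize across the numOutput channels, then scale/shift.
// in/out are (numOutput x step) with step innermost; alpha/beta have numOutput entries.
static void layer_norm_ref(const float *in, const float *alpha, const float *beta,
                           float *out, int step, int numOutput)
{
    const float para = 1.0f / numOutput;  // same folded reciprocal as the kernel arg
    const float eps = 1e-6f;              // assumed guard term
    for (int s = 0; s < step; ++s) {
        float mean = 0.0f;
        for (int c = 0; c < numOutput; ++c) {
            mean += in[c * step + s];
        }
        mean *= para;
        float var = 0.0f;
        for (int c = 0; c < numOutput; ++c) {
            float d = in[c * step + s] - mean;
            var += d * d;
        }
        var *= para;
        const float inv = 1.0f / std::sqrt(var + eps);
        for (int c = 0; c < numOutput; ++c) {
            out[c * step + s] = alpha[c] * (in[c * step + s] - mean) * inv + beta[c];
        }
    }
}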
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/padding_mali_fp16.h" + +inline EE padding_checkpara_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + PadParamSpec padParamSpec, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (handle == nullptr || nullptr == input || nullptr == output) { + return NULL_POINTER; + } + if (inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + if (inputDesc.df != outputDesc.df || inputDesc.df != DF_NCHW) { + return NOT_SUPPORTED; + } + if (input->desc.memFormat != output->desc.memFormat || input->desc.memFormat != DF_NCWHC4) { + return NOT_SUPPORTED; + } + if (padParamSpec.pad_mode == Pad_Reflect && + (padParamSpec.top >= inputDesc.dims[1] || padParamSpec.bottom >= inputDesc.dims[1])) { + return NOT_SUPPORTED; + } + if (padParamSpec.pad_mode == Pad_Symmetric && + (padParamSpec.left > inputDesc.dims[0] || padParamSpec.right > inputDesc.dims[0])) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE padding_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + PadParamSpec padParamSpec, + TensorDesc outputDesc, + GCLMem_t output) +{ + U32 iw, ih, ic, in; + U32 ow, oh, oc, on; + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + + cl_mem inbuf, outbuf; + inbuf = input->mem; + outbuf = output->mem; + + U32 iw_str, ih_str, iw_off, ih_off; + ih_str = input->desc.stride[0]; + iw_str = input->desc.stride[1]; + ih_off = input->desc.offset[0]; + iw_off = input->desc.offset[1]; + + U32 ow_str, oh_str, ow_off, oh_off; + oh_str = output->desc.stride[0]; + ow_str = output->desc.stride[1]; + oh_off = output->desc.offset[0]; + ow_off = output->desc.offset[1]; + + U32 pw, ph, pr, pb; + pw = padParamSpec.left; + pr = padParamSpec.right; + ph = padParamSpec.top; + pb = padParamSpec.bottom; + + Kernel kernel; + switch (padParamSpec.pad_mode) { + case Pad_Constant: { + CHECK_STATUS(gcl_create_kernel(handle, "padding_constant", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ih_str, iw_str, ih_off, iw_off, oh, ow, + oh_str, ow_str, oh_off, ow_off, ph, pb, pw, pr, inbuf, outbuf)); + + U32 gs[3] = {oh, ow, (oc + 3) / 4 * on}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "padding_constant"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "padding_constant")); + CHECK_STATUS(gcl_print_memory(handle, input, "padding_constant_input")); + CHECK_STATUS(gcl_print_memory(handle, output, "padding_constant_output")); +#endif + break; + } + case Pad_Reflect: { + CHECK_STATUS(gcl_create_kernel(handle, "padding_reflect", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ih_str, iw_str, ih_off, iw_off, oh, ow, + oh_str, ow_str, oh_off, ow_off, ph, pb, pw, pr, inbuf, outbuf)); + + U32 gs[3] = {oh, ow, (oc + 3) / 4 * on}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "padding_reflect"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "padding_reflect")); + CHECK_STATUS(gcl_print_memory(handle, input, "padding_reflect_input")); + CHECK_STATUS(gcl_print_memory(handle, output, "padding_reflect_output")); +#endif + break; + } + case Pad_Edge: { + CHECK_STATUS(gcl_create_kernel(handle, "padding_edge", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ih_str, iw_str, ih_off, iw_off, oh, ow, + oh_str, ow_str, oh_off, 
ow_off, ph, pb, pw, pr, inbuf, outbuf)); + + U32 gs[3] = {oh, ow, (oc + 3) / 4 * on}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "padding_edge"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "padding_edge")); + CHECK_STATUS(gcl_print_memory(handle, input, "padding_edge_input")); + CHECK_STATUS(gcl_print_memory(handle, output, "padding_edge_output")); +#endif + break; + } + case Pad_Symmetric: { + CHECK_STATUS(gcl_create_kernel(handle, "padding_symmetric", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ih_str, iw_str, ih_off, iw_off, oh, ow, + oh_str, ow_str, oh_off, ow_off, ph, pb, pw, pr, inbuf, outbuf)); + + U32 gs[3] = {oh, ow, (oc + 3) / 4 * on}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "padding_symmetric"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "padding_symmetric")); + CHECK_STATUS(gcl_print_memory(handle, input, "padding_symmetric_input")); + CHECK_STATUS(gcl_print_memory(handle, output, "padding_symmetric_output")); +#endif + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + return SUCCESS; +} + +EE padding_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + PadParamSpec padParamSpec, + TensorDesc outputDesc, + GCLMem_t output) +{ + CHECK_STATUS( + padding_checkpara_mali_fp16(handle, inputDesc, input, padParamSpec, outputDesc, output)); + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS(padding_core_mali_fp16(handle, inputDesc, input, padParamSpec, outputDesc, output)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/padding_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/padding_mali_fp16.h new file mode 100644 index 00000000..26b78ed6 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/padding_mali_fp16.h @@ -0,0 +1,27 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
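The four pad modes dispatched in padding_core_mali_fp16 above differ only in how an out-of-range output coordinate is resolved back into the source tensor (constant mode writes the fill value instead). A compact sketch of that per-axis mapping follows; the enum is local to the sketch, and it assumes a single fold, which is exactly what the reflect/symmetric bounds checks in padding_checkpara_mali_fp16 guarantee.

enum class PadMode { Constant, Reflect, Edge, Symmetric };  // mirrors the Pad_* cases above

// Resolve coordinate i on an axis of extent n; returns -1 when the caller
// should write the constant fill value instead of reading the input.
static int resolve_pad_index(int i, int n, PadMode mode)
{
    if (i >= 0 && i < n) {
        return i;  // inside the source tensor: no remapping
    }
    switch (mode) {
        case PadMode::Constant:
            return -1;
        case PadMode::Edge:       // clamp: repeat the border sample
            return (i < 0) ? 0 : n - 1;
        case PadMode::Reflect:    // mirror about the border, border not repeated
            return (i < 0) ? -i : 2 * n - 2 - i;
        case PadMode::Symmetric:  // mirror including the border sample
            return (i < 0) ? -i - 1 : 2 * n - 1 - i;
    }
    return -1;
}

This is also why the parameter checks above differ: reflect needs pad < dim (the border sample itself is never reused), while symmetric tolerates pad <= dim.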
+ +#ifndef _H_PADDING_MALI_FP16 +#define _H_PADDING_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE padding_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + PadParamSpec padParamSpec, + TensorDesc outputDesc, + GCLMem_t output); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/pooling_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/pooling_mali_fp16.cpp new file mode 100644 index 00000000..2bdb49ec --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/pooling_mali_fp16.cpp @@ -0,0 +1,188 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/pooling_mali_fp16.h" + +inline EE pooling_checkpara_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + PoolingParamSpec poolingParamSpec, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (handle == nullptr || nullptr == input || nullptr == output) { + return NULL_POINTER; + } + if (inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + if (inputDesc.df != outputDesc.df) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[2] != outputDesc.dims[2] || inputDesc.dims[3] != outputDesc.dims[3]) { + return NOT_SUPPORTED; + } + if (poolingParamSpec.padding_top >= poolingParamSpec.kernel_h) { + return NOT_SUPPORTED; + } + if (poolingParamSpec.padding_bottom >= poolingParamSpec.kernel_w) { + return NOT_SUPPORTED; + } + if (input->desc.memFormat != output->desc.memFormat || input->desc.memFormat != DF_NCWHC4) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE pooling_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + PoolingParamSpec poolingParamSpec, + TensorDesc outputDesc, + GCLMem_t output, + GCLMem_t temp) +{ + DataFormat df; + U32 iw, ih, ic, in, it; + U32 ow, oh, oc, on, ot; + tensorSelectGet(inputDesc, NULL, &df, &in, &ic, &ih, &iw, &it); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow, &ot); + + cl_mem inbuf, outbuf, tmpbuf; + inbuf = input->mem; + outbuf = output->mem; + tmpbuf = temp->mem; + U32 iw_str, ih_str, iw_off, ih_off; + U32 ow_str, oh_str, ow_off, oh_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, NULL, &iw_off, &ih_off); + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + + U32 sw, sh, st, pw, ph, pt, kw, kh, kt; + sw = poolingParamSpec.stride_w; + sh 
= poolingParamSpec.stride_h; + st = poolingParamSpec.stride_t; + pw = poolingParamSpec.padding_left; + ph = poolingParamSpec.padding_top; + pt = poolingParamSpec.padding_before; + kw = poolingParamSpec.kernel_w; + kh = poolingParamSpec.kernel_h; + kt = poolingParamSpec.kernel_t; + + if (df == DF_NCHW) { + st = 1; + pt = 0; + kt = 1; + } + Kernel kernel; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + char kernelname[128]; + switch (poolingParamSpec.mode) { + case POOLING_MAX: { + gs[0] = oh; + gs[1] = ow; + gs[2] = (oc + 3) / 4 * ot * on; + if (st == 1 && pt == 0 && kt == 1) { + sprintf(kernelname, "pooling_max"); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ih_off, iw_off, ih_str, iw_str, oh, + ow, oh_off, ow_off, oh_str, ow_str, sh, sw, ph, pw, kh, kw, inbuf, outbuf)); + } else { + return NOT_SUPPORTED; + } + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + break; + } + case POOLING_MEAN: { + if (oh == 1 && ow == 1 && iw > 7) { + sprintf(kernelname, "pooling_global_mean_w"); + gs[0] = ih; + gs[1] = (oc + 3) / 4 * on; + dim = 2; + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, ih_str * iw_str, ih_off, iw_off, ih, + iw, gs[0], gs[1], inbuf, tmpbuf)); + CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname)); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + sprintf(kernelname, "pooling_global_mean_h"); + gs[0] = (oc + 3) / 4 * on; + dim = 1; + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs( + kernel, ih, oh_str, oh_str * ow_str, oh_off, ow_off, gs[0], tmpbuf, outbuf)); + CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname)); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); +#endif + } else { + sprintf(kernelname, "pooling_mean"); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ih_off, iw_off, ih_str, iw_str, oh, + ow, oh_off, ow_off, oh_str, ow_str, sh, sw, ph, pw, kh, kw, inbuf, outbuf)); + + gs[0] = oh; + gs[1] = ow; + gs[2] = (oc + 3) / 4 * on; + dim = 3; + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + } + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + return SUCCESS; +} + +EE pooling_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + PoolingParamSpec poolingParamSpec, + TensorDesc outputDesc, + GCLMem_t output, + GCLMem_t temp) +{ + CHECK_STATUS( + pooling_checkpara_mali_fp16(handle, inputDesc, input, poolingParamSpec, outputDesc, output)); + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS(pooling_core_mali_fp16( + handle, inputDesc, input, poolingParamSpec, outputDesc, output, temp)); + return SUCCESS; +} + +EE pooling_infer_forward_tmp_bytes_mali_fp16( + TensorDesc inputDesc, U32 *bytes, ForwardRunInfoMali_t forwardRunInfo) +{ + UNUSED(forwardRunInfo); + DataType idt; + U32 in, ic, ih, iw; + tensorSelectGet(inputDesc, &idt, NULL, &in, &ic, &ih, &iw); + *bytes = ih * ((ic + 3) / 4 * 4) * bytesOf(idt); + 
return SUCCESS; +} diff --git a/tensor_computing/src/gpu/mali/fp16/pooling_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/pooling_mali_fp16.h similarity index 78% rename from tensor_computing/src/gpu/mali/fp16/pooling_mali_fp16.h rename to compute/tensor/src/gpu/mali/fp16/pooling_mali_fp16.h index bda21f3a..6ae6f310 100644 --- a/tensor_computing/src/gpu/mali/fp16/pooling_mali_fp16.h +++ b/compute/tensor/src/gpu/mali/fp16/pooling_mali_fp16.h @@ -11,20 +11,20 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _H_POOLING_MALI_FP16 #define _H_POOLING_MALI_FP16 #include "sys.h" -#include "type.h" -#include "tensor_desc.h" #include "error.h" +#include "types.h" #include "tensor_computing_type.h" -EE pooling_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - const GCLMem_t input, - PoolingDesc poolingDesc, - TensorDesc outputDesc, - GCLMem_t output); +EE pooling_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + PoolingParamSpec poolingParamSpec, + TensorDesc outputDesc, + GCLMem_t output, + GCLMem_t temp); +EE pooling_infer_forward_tmp_bytes_mali_fp16( + TensorDesc inputDesc, U32 *bytes, ForwardRunInfoMali_t forwardRunInfo); #endif - diff --git a/compute/tensor/src/gpu/mali/fp16/power_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/power_mali_fp16.cpp new file mode 100644 index 00000000..bf22eb72 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/power_mali_fp16.cpp @@ -0,0 +1,98 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
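The POOLING_MEAN fast path above (oh == 1, ow == 1, iw > 7) is a two-pass reduction: pooling_global_mean_w averages each row into a scratch plane, then pooling_global_mean_h collapses that plane, which is exactly the scratch that pooling_infer_forward_tmp_bytes_mali_fp16 sizes as ih * ((ic + 3) / 4 * 4) elements. A plain NCHW sketch of the same two passes (the real kernels operate on NCWHC4 with the four channel lanes packed into vector loads):

// Two-pass global average pooling: in is (ic x ih x iw), tmp is (ic x ih),
// out has ic entries.
static void global_mean_ref(const float *in, float *tmp, float *out,
                            int ic, int ih, int iw)
{
    // Pass 1 (pooling_global_mean_w): reduce along w into an (ic x ih) plane.
    for (int c = 0; c < ic; ++c) {
        for (int h = 0; h < ih; ++h) {
            float sum = 0.0f;
            for (int w = 0; w < iw; ++w) {
                sum += in[(c * ih + h) * iw + w];
            }
            tmp[c * ih + h] = sum / iw;
        }
    }
    // Pass 2 (pooling_global_mean_h): reduce the plane along h.
    for (int c = 0; c < ic; ++c) {
        float sum = 0.0f;
        for (int h = 0; h < ih; ++h) {
            sum += tmp[c * ih + h];
        }
        out[c] = sum / ih;
    }
}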
+ +#include "types.h" +#include "gpu/mali/fp16/power_mali_fp16.h" + +inline EE power_checkpara_mali_fp16(TensorDesc inputDesc, TensorDesc outputDesc) +{ + EE ret = SUCCESS; + if (inputDesc.dt != outputDesc.dt || (inputDesc.dt != DT_F16 && inputDesc.dt != DT_I32)) { + ret = NOT_SUPPORTED; + } + return ret; +} + +inline EE power_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + PowerParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + UNUSED(outputDesc); + U32 iw, ih, ic, in; + DataType dt; + if (inputDesc.df == DF_NCHW || inputDesc.df == DF_NORMAL) { + tensorSelectGet(inputDesc, &dt, NULL, &in, &ic, &ih, &iw); + } else if (inputDesc.df == DF_MKT) { + get_nlp_mkt_val(inputDesc, &dt, &in, &ic, &ih); + iw = 1; + } else { + return NOT_SUPPORTED; + } + U32 iw_str, ih_str, iw_off, ih_off; + U32 ow_str, oh_str, ow_off, oh_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, NULL, &iw_off, &ih_off); + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + cl_mem inbuf, outbuf; + inbuf = input->mem; + outbuf = output->mem; + Kernel kernel; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + char kernelname[128]; + sprintf(kernelname, "power_f16"); + if (dt == DT_I32) { + sprintf(kernelname, "power_i32"); + } + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + U32 has_power = (p.power == (F32)1.0) ? 0 : 1; + if (input->desc.memFormat == DF_NCHW) { + gs[0] = (iw + 3) / 4; + gs[1] = ih; + gs[2] = ic; + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, iw_str, ih_off, iw_off, oh_str, ow_str, + oh_off, ow_off, iw, gs[0], gs[1], has_power, p.scale, p.shift, p.power, inbuf, outbuf)); + } + if (input->desc.memFormat == DF_NCWHC4) { + gs[0] = ih; + gs[1] = iw; + gs[2] = (ic + 3) / 4; + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str * 4, iw_off, ih_off * 4, ow_str, + oh_str * 4, ow_off, oh_off * 4, ih * 4, gs[0], gs[1], has_power, p.scale, p.shift, + p.power, inbuf, outbuf)); + } + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); + handle->t_total += handle->t_execute; +#endif + return SUCCESS; +} + +EE power_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + PowerParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + CHECK_STATUS(power_checkpara_mali_fp16(inputDesc, outputDesc)); + if (input->mem != output->mem) { + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + } + CHECK_STATUS(power_core_mali_fp16(handle, inputDesc, input, p, outputDesc, output)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/power_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/power_mali_fp16.h new file mode 100644 index 00000000..ca25d102 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/power_mali_fp16.h @@ -0,0 +1,25 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _POWER_MALI_FP16 +#define _POWER_MALI_FP16 +#include "types.h" +#include "tensor_computing_type.h" + +EE power_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + PowerParamSpec p, + TensorDesc outputDesc, + GCLMem_t output); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/prelu_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/prelu_mali_fp16.cpp new file mode 100644 index 00000000..5ef92949 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/prelu_mali_fp16.cpp @@ -0,0 +1,93 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
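For reference, the element-wise function the power_f16 / power_i32 kernels above evaluate is y = (scale * x + shift)^power; has_power is zero when power is exactly 1, letting the kernel skip the pow() in the common scale-and-shift-only case. A one-line CPU equivalent (name illustrative):

#include <cmath>

// y = (scale * x + shift) ^ power, with pow() skipped when power == 1,
// matching the has_power flag computed above.
static float power_ref(float x, float scale, float shift, float power)
{
    float y = scale * x + shift;
    return (power == 1.0f) ? y : std::pow(y, power);
}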
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/prelu_mali_fp16.h" + +inline EE prelu_checkpara_mali_fp16(TensorDesc inputDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE prelu_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + GCLMem_t weight, + PReLUParamSpec preluDesc, + TensorDesc outputDesc, + GCLMem_t output) +{ + UNUSED(outputDesc); + U32 iw, ih, ic, in; + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + U32 iw_str, ih_str, iw_off, ih_off; + U32 ow_str, oh_str, ow_off, oh_off; + ih_str = input->desc.stride[0]; + iw_str = input->desc.stride[1]; + ih_off = input->desc.offset[0]; + iw_off = input->desc.offset[1]; + oh_str = output->desc.stride[0]; + ow_str = output->desc.stride[1]; + oh_off = output->desc.offset[0]; + ow_off = output->desc.offset[1]; + cl_mem inbuf, outbuf, webuf; + inbuf = input->mem; + outbuf = output->mem; + webuf = weight->mem; + + char modeName[16]; + char kernelName[128]; + if (preluDesc.propagate_down) { + strcpy(modeName, "prop"); + } else { + strcpy(modeName, "noprop"); + } + sprintf(kernelName, "prelu_%s", modeName); + + U32 gs[3] = {ih, iw, (ic + 3) / 4}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ih_str, iw_str, ih_off, iw_off, ih, iw, oh_str, + ow_str, oh_off, ow_off, webuf, inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_print_memory(handle, input, "prelu_input")); + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); + CHECK_STATUS(gcl_print_memory(handle, weight, "prelu_weight")); +#endif + return SUCCESS; +} + +EE prelu_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + GCLMem_t weight, + PReLUParamSpec preluDesc, + TensorDesc outputDesc, + GCLMem_t output) +{ + CHECK_STATUS(prelu_checkpara_mali_fp16(inputDesc, outputDesc)); + if (input->mem != output->mem) { + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + } + CHECK_STATUS( + prelu_core_mali_fp16(handle, inputDesc, input, weight, preluDesc, outputDesc, output)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/prelu_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/prelu_mali_fp16.h new file mode 100644 index 00000000..51c1143e --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/prelu_mali_fp16.h @@ -0,0 +1,28 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _PRELU_MALI_FP16 +#define _PRELU_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE prelu_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + GCLMem_t weight, + PReLUParamSpec preluDesc, + TensorDesc outputDesc, + GCLMem_t output); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/reshape_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/reshape_mali_fp16.cpp new file mode 100644 index 00000000..463b116f --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/reshape_mali_fp16.cpp @@ -0,0 +1,310 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
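The PReLU kernels above compute y = x for positive x and y = slope * x otherwise. The prop/noprop variants selected via preluDesc.propagate_down are read here as choosing between a shared slope and a per-channel slope from the weight buffer; that interpretation is an assumption, since only the kernel names appear in this hunk. A plain reference:

// Element-wise PReLU over a (c x hw) tensor. With sharedSlope, weight[0]
// is broadcast to every channel; otherwise weight holds one slope per channel.
static void prelu_ref(const float *in, const float *weight, float *out,
                      int c, int hw, bool sharedSlope)
{
    for (int ci = 0; ci < c; ++ci) {
        float slope = sharedSlope ? weight[0] : weight[ci];
        for (int i = 0; i < hw; ++i) {
            float x = in[ci * hw + i];
            out[ci * hw + i] = (x > 0.0f) ? x : slope * x;
        }
    }
}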
+ +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/reshape_mali_fp16.h" + +inline EE reshape_checkpara_mali_fp16(TensorDesc inputDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != outputDesc.dt) { + return NOT_SUPPORTED; + } + if (outputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE reshape_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output, + GCLMem_t tmpbuf) +{ + DataFormat idf, odf; + U32 iw, ih, ic, in, it; + U32 ow, oh, oc, on, ot; + tensorSelectGet(inputDesc, NULL, &idf, &in, &ic, &ih, &iw, &it); + tensorSelectGet(outputDesc, NULL, &odf, &on, &oc, &oh, &ow, &ot); + U32 iw_str, ih_str, ic_str, iw_off, ih_off; + U32 ow_str, oh_str, oc_str, ow_off, oh_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, &ic_str, &iw_off, &ih_off); + get_gclmem_dim(output->desc, &ow_str, &oh_str, &oc_str, &ow_off, &oh_off); + DataFormat imf = input->desc.memFormat; + DataFormat omf = output->desc.memFormat; + cl_mem inbuf = input->mem; + cl_mem outbuf = output->mem; + cl_mem tmp = tmpbuf->mem; + bool dataCopy = false; + U32 copy_len_in = iw * ih * ic * in * it; + U32 copy_len_out = ow * oh * oc * on * ot; + + if ((iw_str == 1 && ih_str == 1 && omf == DF_NCHW && ow_off == 0 && oh_off == 0) || + (ow_str == 1 && oh_str == 1 && imf == DF_NCHW && iw_off == 0 && ih_off == 0)) { + if (inbuf == outbuf) { + return SUCCESS; + } else { + dataCopy = true; + goto DATACOPY; + } + } + + if (imf == omf) { + if (imf == DF_NCHW) { + if ((iw_off == 0 && ih_off == 0 && ow_off == 0 && oh_off == 0) || + (iw_str == ow_str && ih_str == oh_str && iw_off == ow_off && ih_off == oh_off && + iw == ow && ih == oh)) { + if (inbuf == outbuf) { + return SUCCESS; + } else { + dataCopy = true; + goto DATACOPY; + } + } + } + + if (imf == DF_NCWHC4) { + if (iw_str == ow_str && ih_str == oh_str && iw_off == ow_off && ih_off == oh_off && + iw == ow && ih == oh) { + if (it == ot) { + if (inbuf == outbuf) { + return SUCCESS; + } else { + dataCopy = true; + goto DATACOPY; + } + } else { + goto DATACOPY; + } + } + } + + if (iw == ow && ih == oh) { + if (inbuf == outbuf) { + outbuf = tmp; + dataCopy = true; + copy_len_in = copy_len_out; + } + char kernelName[128]; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + if (imf == DF_NCHW) { + sprintf(kernelName, "mem_trans_nchw_to_nchw"); + gs[0] = (ow + 3) / 4; + gs[1] = oh; + gs[2] = oc * ot * on; + } else { + if (it != ot) { + dataCopy = false; + goto DATACOPY; + } + sprintf(kernelName, "mem_trans_ncwhc4_to_ncwhc4"); + gs[0] = oh; + gs[1] = ow; + gs[2] = (oc + 3) / 4 * ot * on; + ic = ALIGN(ic, 4); + } + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, ow_str, oh_str, + ow_off, oh_off, iw, ih, ic * it * in, ow, oh, oc * ot * on, 0, 0, inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); +#endif + if (dataCopy) { + inbuf = tmp; + goto DATACOPY; + } else { + return SUCCESS; + } + } + } + + if (imf != omf && it == 1 && ot == 1) { + if ((imf == DF_NCWHC4 && ih == ow && iw == 1) || (omf == DF_NCWHC4 && iw == oh && ow == 1)) { + if (inbuf == outbuf) { + outbuf = tmp; + dataCopy = true; + copy_len_in = copy_len_out; + } + char kernelName[128]; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + U32 h_val, c_val; + if (imf == DF_NCWHC4) { + 
sprintf(kernelName, "mem_trans_ncwhc4_to_nchw_ih_equal_ow"); + gs[0] = ih; + gs[1] = iw; + gs[2] = (ic + 3) / 4; + h_val = oh; + c_val = oc; + } else { + sprintf(kernelName, "mem_trans_nchw_to_ncwhc4_iw_equal_oh"); + gs[0] = oh; + gs[1] = ow; + gs[2] = (oc + 3) / 4; + h_val = ih; + c_val = ic; + } + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, ow_str, oh_str, + ow_off, oh_off, h_val, c_val, gs[0], gs[1], inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); +#endif + if (dataCopy) { + inbuf = tmp; + } else { + return SUCCESS; + } + } + } + +DATACOPY: + if (dataCopy) { + U32 gs = (copy_len_out + 3) / 4; + U32 ls = 0; + U32 dim = 1; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "copy_f16", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, copy_len_in, copy_len_out, 0, 0, gs, inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, &gs, &ls, "copy_f16"); + inbuf = tmp; +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs, &ls, "copy_f16")); +#endif + return SUCCESS; + } + + bool noNeedOutTrans = false; + if (ow_str == 1 && oh_str == 1) { + noNeedOutTrans = true; + tmp = outbuf; + } + + if (imf == DF_NCHW && (iw_off > 0 || ih_off > 0)) { + U32 gs[3] = {(iw + 3) / 4, ih, ic * it}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "mem_trans_nchw_to_nchw", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, iw, ih, 0, 0, iw, + ih, ic * it, iw, ih, ic * it, 0, 0, inbuf, tmp)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "mem_trans_nchw_to_nchw"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "mem_trans_nchw_to_nchw")); +#endif + if (noNeedOutTrans) { + return SUCCESS; + } else { + inbuf = tmp; + } + } + + if (imf == DF_NCWHC4) { + U32 gs[3] = {ih, (iw + 3) / 4, (ic + 3) / 4 * it}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + char kernelName[128]; + if (idf == DF_NCTHW) { + sprintf(kernelName, "mem_trans_3d_ncwhc4_to_nchw"); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, iw, ih, 0, 0, + iw, ih, ic, it, iw, ih, ic, it, 0, 0, inbuf, tmp)); + } else { + sprintf(kernelName, "mem_trans_ncwhc4_to_nchw"); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, iw, ih, 0, 0, + iw, ih, ic, iw, ih, ic, 0, 0, inbuf, tmp)); + } + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); +#endif + if (noNeedOutTrans) { + return SUCCESS; + } else { + inbuf = tmp; + } + } + + if (omf == DF_NCHW) { + U32 gs[3] = {(ow + 3) / 4, oh, oc * ot}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "mem_trans_nchw_to_nchw", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ow, oh, 0, 0, ow_str, oh_str, ow_off, oh_off, ow, + oh, oc * ot, ow, oh, oc * ot, 0, 0, inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "mem_trans_nchw_to_nchw"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "mem_trans_nchw_to_nchw")); +#endif + return SUCCESS; + } + + if (omf == DF_NCWHC4) { + U32 gs[3] = {(ow + 3) / 4, oh, 
(oc + 3) / 4 * on}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "mem_trans_nchw_to_ncwhc4", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ow, oh, 0, 0, ow_str, oh_str, ow_off, oh_off, ow, + oh, oc, ow, oh, oc, 0, 0, inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "mem_trans_nchw_to_ncwhc4"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "mem_trans_nchw_to_ncwhc4")); +#endif + return SUCCESS; + } + return NOT_SUPPORTED; +} + +EE reshape_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc, + U32 *bytes) +{ + U32 maxSize = tensorNumBytes(inputDesc); + U32 tmpSize = tensorNumBytes(outputDesc); + maxSize = (maxSize > tmpSize) ? maxSize : tmpSize; + tmpSize = gclmemInputDesc->byteSize; + maxSize = (maxSize > tmpSize) ? maxSize : tmpSize; + tmpSize = gclmemOutputDesc->byteSize; + maxSize = (maxSize > tmpSize) ? maxSize : tmpSize; + *bytes = maxSize; + return SUCCESS; +} + +EE reshape_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output, + GCLMem_t tmpbuf) +{ + CHECK_STATUS(reshape_checkpara_mali_fp16(inputDesc, outputDesc)); + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS(reshape_core_mali_fp16(handle, inputDesc, input, outputDesc, output, tmpbuf)); + return SUCCESS; +} diff --git a/tensor_computing/src/gpu/mali/fp16/reshape_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/reshape_mali_fp16.h similarity index 80% rename from tensor_computing/src/gpu/mali/fp16/reshape_mali_fp16.h rename to compute/tensor/src/gpu/mali/fp16/reshape_mali_fp16.h index 4156e1f2..f0959222 100644 --- a/tensor_computing/src/gpu/mali/fp16/reshape_mali_fp16.h +++ b/compute/tensor/src/gpu/mali/fp16/reshape_mali_fp16.h @@ -11,18 +11,23 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _RESHAPE_MALI_FP16 #define _RESHAPE_MALI_FP16 #include "sys.h" -#include "type.h" -#include "tensor_desc.h" #include "error.h" +#include "types.h" #include "tensor_computing_type.h" +EE reshape_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc, + U32 *bytes); + EE reshape_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output); + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output, + GCLMem_t tmpbuf); #endif diff --git a/compute/tensor/src/gpu/mali/fp16/rnn_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/rnn_mali_fp16.cpp new file mode 100644 index 00000000..ab8048f9 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/rnn_mali_fp16.cpp @@ -0,0 +1,192 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
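Most of the branching in reshape_core_mali_fp16 above is about converting between the plain NCHW view and the GPU-side NCWHC4 tiling. As a hedged reference for what mem_trans_nchw_to_ncwhc4 produces (strides and offsets dropped for clarity, layout inferred from how the code uses desc.stride): channels are grouped in fours, and within a group the memory order is c-group, then w, then h, then the four channel lanes.

#include <vector>

// NCHW -> NCWHC4 for a single batch; tail channels are zero-padded to a
// multiple of four, matching the ALIGN(ic, 4) handling above.
static std::vector<float> nchw_to_ncwhc4(const std::vector<float> &in,
                                         int c, int h, int w)
{
    int cAlign = (c + 3) / 4 * 4;
    std::vector<float> out(cAlign * h * w, 0.0f);
    for (int ci = 0; ci < c; ++ci) {
        for (int hi = 0; hi < h; ++hi) {
            for (int wi = 0; wi < w; ++wi) {
                int group = ci / 4, lane = ci % 4;
                // NCWHC4 index: (((group * w) + wi) * h + hi) * 4 + lane
                out[((group * w + wi) * h + hi) * 4 + lane] =
                    in[(ci * h + hi) * w + wi];
            }
        }
    }
    return out;
}

The copy_f16 fallback and the tmp sizing in reshape_infer_forward_tmp_bytes_mali_fp16 (a running max over the input/output tensor and GCL buffer byte sizes) exist because several of these paths stage through one intermediate plain-NCHW buffer of that worst-case size.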
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/rnn_mali_fp16.h" + +inline EE rnn_checkpara_mali_fp16( + TensorDesc xDesc, TensorDesc filterDesc, TensorDesc biasDesc, TensorDesc hDesc) +{ + if (xDesc.dt != filterDesc.dt || xDesc.dt != biasDesc.dt || xDesc.dt != hDesc.dt || + xDesc.dt != DT_F16) { + return NOT_MATCH; + } + return SUCCESS; +} + +inline EE rnn_core_mali_fp16(GCLHandle_t handle, + TensorDesc xDesc, + const GCLMem_t currentX, + TensorDesc filterDesc, + GCLMem_t filter, + TensorDesc biasDesc, + GCLMem_t bias, + GCLMem_t state, + U32 tmpBytes, + GCLMem_t tmpBuf, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + GCLMem_t currentH, + ForwardRunInfoMali_t forwardRunInfo) +{ + UNUSED(handle); + UNUSED(xDesc); + UNUSED(currentX); + UNUSED(filterDesc); + UNUSED(filter); + UNUSED(biasDesc); + UNUSED(bias); + UNUSED(state); + UNUSED(tmpBytes); + UNUSED(tmpBuf); + UNUSED(rnnParamSpec); + UNUSED(batchStrideX); + UNUSED(batchStrideH); + UNUSED(hDesc); + UNUSED(currentH); + UNUSED(forwardRunInfo); + return NOT_SUPPORTED; +} + +EE rnn_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + RNNParamSpec rnnParamSpec, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo) +{ + U32 filterRow, filterCol; + tensorSelectGet(filterDesc, NULL, NULL, NULL, NULL, &filterRow, &filterCol); + U32 s0, s1, s2, num, byteSize, item_c; + U32 filterNum = (rnnParamSpec.numProjection > 0) ? 
2 : 1; + for (U32 i = 0; i < filterNum; ++i) { + item_c = forwardRunInfo->best_c[i]; + if (i == 0) { + s0 = filterRow; + s1 = (filterCol + item_c - 1) / item_c; + } else { + s0 = rnnParamSpec.numOutput; + s1 = (rnnParamSpec.numProjection + item_c - 1) / item_c; + } + s2 = 1; + num = s0 * s1 * s2 * item_c; + byteSize = num * bytesOf(DT_F16); + gclmemFilterDesc[i].stride[0] = s0; + gclmemFilterDesc[i].stride[1] = s1; + gclmemFilterDesc[i].stride[2] = s2; + gclmemFilterDesc[i].offset[0] = 0; + gclmemFilterDesc[i].offset[1] = 0; + gclmemFilterDesc[i].offset[2] = 0; + gclmemFilterDesc[i].num = num; + gclmemFilterDesc[i].byteSize = byteSize; + gclmemFilterDesc[i].memType = GCL_MEM_BUF; + gclmemFilterDesc[i].flags = CL_MEM_READ_WRITE; + gclmemFilterDesc[i].memFormat = DF_CHWNC4; + if (item_c == 8) { + gclmemFilterDesc[i].memFormat = DF_CHWNC8; + } + if (item_c == 16) { + gclmemFilterDesc[i].memFormat = DF_CHWNC16; + } + gclmemFilterDesc[i].host_ptr = NULL; + } + *bytes = 0; + return SUCCESS; +} + +EE rnn_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + RNNParamSpec rnnParamSpec, + TensorDesc *fltmemDesc, + GCLMem_t fltmem, + ForwardRunInfoMali_t forwardRunInfo) +{ + DataType fdt; + U32 filterRow, filterCol; + tensorSelectGet(filterDesc, &fdt, NULL, NULL, NULL, &filterRow, &filterCol); + U32 filterNum = (rnnParamSpec.numProjection > 0) ? 2 : 1; + U32 item_c, item_k; + + char kernelname[128]; + Kernel kernel; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + U32 fwh = 1; + for (U32 i = 0; i < filterNum; i++) { + item_c = forwardRunInfo->best_c[i]; + item_k = forwardRunInfo->best_k[i]; + sprintf(kernelname, "conv_direct_trans_fltbuf_%d%d", item_c, item_k); + CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelname, &kernel)); + if (i == 1) { + filterCol = rnnParamSpec.numProjection; + filterRow = rnnParamSpec.numOutput; + } + CHECK_STATUS( + gcl_set_kernelArgs(kernel, fwh, filterCol, filterRow, filter[i].mem, fltmem[i].mem)); + gs[0] = fwh; + gs[1] = (filterCol + item_c - 1) / item_c; + gs[2] = filterRow; + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); +#ifdef _DEBUG + CHECK_STATUS(gcl_print_memory(handle, filter, "fc_filter_org")); + CHECK_STATUS(gcl_print_memory(handle, fltmem, "fc_filter_tran")); +#endif + fltmemDesc[i] = tensor2df(fdt, DF_NORMAL, filterRow, filterCol); + } + return SUCCESS; +} + +EE rnn_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + RNNParamSpec rnnParamSpec, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo) +{ + UNUSED(inputDesc); + UNUSED(filterDesc); + UNUSED(outputDesc); + UNUSED(rnnParamSpec); + UNUSED(bytes); + UNUSED(forwardRunInfo); + return SUCCESS; +} + +EE rnn_mali_fp16(GCLHandle_t handle, + TensorDesc xDesc, + const GCLMem_t currentX, + TensorDesc filterDesc, + GCLMem_t filter, + TensorDesc biasDesc, + GCLMem_t bias, + GCLMem_t state, + U32 tmpBytes, + GCLMem_t tmpBuf, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + GCLMem_t currentH, + ForwardRunInfoMali_t forwardRunInfo) +{ + CHECK_STATUS(rnn_checkpara_mali_fp16(xDesc, filterDesc, biasDesc, hDesc)); + CHECK_STATUS(rnn_core_mali_fp16(handle, xDesc, currentX, filterDesc, filter, biasDesc, bias, + state, tmpBytes, tmpBuf, rnnParamSpec, batchStrideX, batchStrideH, hDesc, currentH, + forwardRunInfo)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/rnn_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/rnn_mali_fp16.h new file 
mode 100644 index 00000000..e66ba6bc --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/rnn_mali_fp16.h @@ -0,0 +1,58 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _RNN_MALI_FP16 +#define _RNN_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE rnn_transform_filter_bytes_mali_fp16(TensorDesc filterDesc, + RNNParamSpec rnnParamSpec, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo); + +EE rnn_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + RNNParamSpec rnnParamSpec, + TensorDesc *fltmemDesc, + GCLMem_t fltmem, + ForwardRunInfoMali_t forwardRunInfo); + +EE rnn_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + RNNParamSpec rnnParamSpec, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo); + +EE rnn_mali_fp16(GCLHandle_t handle, + TensorDesc xDesc, + const GCLMem_t currentX, + TensorDesc filterDesc, + GCLMem_t filter, + TensorDesc biasDesc, + GCLMem_t bias, + GCLMem_t state, + U32 tmpBytes, + GCLMem_t tmpBuf, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + GCLMem_t currentH, + ForwardRunInfoMali_t forwardRunInfo); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/rnncell_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/rnncell_mali_fp16.cpp new file mode 100644 index 00000000..3a55cd2c --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/rnncell_mali_fp16.cpp @@ -0,0 +1,233 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" +#include "types.h" +#include "gpu/mali/fp16/rnncell_mali_fp16.h" +#define get_xDim(xDesc, xDim) \ + { \ + if (xDesc.nDims == 2 || xDesc.df == DF_MTK) \ + xDim = xDesc.dims[0]; \ + if (xDesc.df == DF_MKT) \ + xDim = xDesc.dims[1]; \ + } + +inline EE rnncell_checkpara_mali_fp16( + TensorDesc xDesc, TensorDesc filterDesc, TensorDesc biasDesc, TensorDesc hDesc) +{ + if (xDesc.dt != filterDesc.dt || xDesc.dt != biasDesc.dt || xDesc.dt != hDesc.dt || + xDesc.dt != DT_F16) { + return NOT_MATCH; + } + return SUCCESS; +} + +inline EE rnncell_core_mali_fp16(GCLHandle_t handle, + TensorDesc xDesc, + const GCLMem_t currentX, + TensorDesc filterDesc, + GCLMem_t filter, + TensorDesc biasDesc, + GCLMem_t bias, + GCLMem_t state, + U32 tmpBytes, + GCLMem_t tmpBuf, + RNNParamSpec rnncellDesc, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + GCLMem_t output, + ForwardRunInfoMali_t forwardRunInfo) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + UNUSED(batchStrideX); + UNUSED(batchStrideH); + UNUSED(hDesc); + U32 item_c = forwardRunInfo->best_c[0]; + U32 hDim = rnncellDesc.numOutput; + U32 col = (rnncellDesc.numProjection > 0) ? rnncellDesc.numProjection : hDim; + bool project = (rnncellDesc.numProjection > 0) ? true : false; + float fbias = rnncellDesc.forgetBias; + float zonecell = rnncellDesc.zoneoutCell; + float zoneout = rnncellDesc.zoneoutOutput; + + DataType dt = xDesc.dt; + U32 xDim; + get_xDim(xDesc, xDim); + Mem xMem = currentX->mem; + Mem sMem = state->mem; + Mem xhMem; + U32 offset = 0; + U32 xhNum, xhSize; + xhNum = (xDim + hDim + item_c - 1) / item_c * item_c; + xhSize = xhNum * bytesOf(dt); + CHECK_STATUS(gcl_create_sub_buffer(xhSize, &offset, tmpBuf, &xhMem)); + + Mem interMem; + U32 interNum, interSize; + U32 filterRow, filterCol; + tensorSelectGet(filterDesc, NULL, NULL, NULL, NULL, &filterRow, &filterCol); + interNum = filterRow + 4; + interSize = interNum * bytesOf(dt); + CHECK_STATUS(gcl_create_sub_buffer(interSize, &offset, tmpBuf, &interMem)); + + Mem tmpOut; + Mem outbuf = output->mem; + if (project) { + U32 item_cp = forwardRunInfo->best_c[1]; + U32 tmpOutNum = (col + item_cp - 1) / item_cp * item_cp; + U32 tmpOutSize = tmpOutNum * bytesOf(dt); + CHECK_STATUS(gcl_create_sub_buffer(tmpOutSize, &offset, tmpBuf, &tmpOut)); + outbuf = tmpOut; + } + + U32 xh_str, xw_str, xh_off, xw_off; + get_gclmem_dim(currentX->desc, &xw_str, &xh_str, NULL, &xw_off, &xh_off); + if (xw_str != 1 || xh_str != 1 || xw_off != 0 || xh_off != 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + U32 gs1 = xhNum; + U32 ls1 = 0; + U32 dim = 1; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "rnncell_build_xh", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, xDim, xDim + hDim, col, gs1, xMem, sMem, xhMem)); + gcl_set_kernelVec(handle, kernel, dim, &gs1, &ls1, "rnncell_build_xh"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs1, &ls1, "rnncell_build_xh")); + CHECK_STATUS(gcl_print_memory(handle, currentX, "currentX")); + CHECK_STATUS(gcl_print_memory(handle, state, "state")); + CHECK_STATUS(gcl_print_buffer(handle, xhMem, xhNum, "xhMem")); + handle->t_total += handle->t_execute; +#endif + + Mem fltbuf = filter[0].mem; + Mem biasMem = bias->mem; + char kernelname[128]; + U32 
ic_str = filter[0].desc.stride[1]; + sprintf(kernelname, "conv_direct_spe_fwhs1_%d", item_c); + gs1 = filterRow; + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, 1, 1, ic_str, 0, 0, 1, 1, 0, 0, filterRow, gs1, 1, + xhMem, fltbuf, biasMem, interMem)); + gcl_set_kernelVec(handle, kernel, dim, &gs1, &ls1, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs1, &ls1, kernelname)); + CHECK_STATUS(gcl_print_memory(handle, &filter[0], "filter")); + CHECK_STATUS(gcl_print_memory(handle, bias, "bias")); + CHECK_STATUS(gcl_print_buffer(handle, interMem, interNum, "interMem")); + handle->t_total += handle->t_execute; +#endif + + U8 noproject = (project) ? 0 : 1; + gs1 = (col + 3) / 4; + CHECK_STATUS(gcl_create_kernel(handle, "rnncell_update_res", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs( + kernel, col, noproject, gs1, fbias, zonecell, zoneout, sMem, interMem, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, &gs1, &ls1, "rnncell_update_res"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs1, &ls1, "rnncell_update_res")); + CHECK_STATUS(gcl_print_buffer(handle, sMem, col + hDim, "sMem")); + CHECK_STATUS(gcl_print_buffer(handle, interMem, 4 * col, "interMem")); + CHECK_STATUS(gcl_print_buffer(handle, outbuf, col, "outbuf")); + handle->t_total += handle->t_execute; +#endif + + if (project) { + item_c = forwardRunInfo->best_c[1]; + filterRow = rnncellDesc.numOutput; + ic_str = filter[1].desc.stride[1]; + Mem fltbuf = filter[1].mem; + sprintf(kernelname, "conv_direct_spe_fwhs1_nobias_%d", item_c); + gs1 = filterRow; + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, 1, 1, ic_str, 0, 0, 1, 1, 0, 0, filterRow, gs1, 1, + outbuf, fltbuf, biasMem, output->mem)); + gcl_set_kernelVec(handle, kernel, dim, &gs1, &ls1, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs1, &ls1, kernelname)); + CHECK_STATUS(gcl_print_memory(handle, &filter[1], "filter")); + CHECK_STATUS(gcl_print_memory(handle, output, "output")); + handle->t_total += handle->t_execute; +#endif + + gs1 = (hDim + 3) / 4; + CHECK_STATUS(gcl_create_kernel(handle, "rnncell_update_project_state", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, hDim, col, gs1, zoneout, output->mem, sMem)); + gcl_set_kernelVec(handle, kernel, dim, &gs1, &ls1, "rnncell_update_project_state"); +#ifdef _DEBUG + CHECK_STATUS( + gcl_run_kernel(handle, kernel, dim, &gs1, &ls1, "rnncell_update_project_state")); + CHECK_STATUS(gcl_print_buffer(handle, sMem, col + hDim, "sMem")); + handle->t_total += handle->t_execute; +#endif + } + return SUCCESS; +} + +EE rnncell_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + RNNParamSpec rnncellDesc, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo) +{ + UNUSED(outputDesc); + U32 item_c = forwardRunInfo->best_c[0]; + DataType dt = inputDesc.dt; + U32 xDim; + get_xDim(inputDesc, xDim); + U32 hDim = rnncellDesc.numOutput; + U32 xhNum = (xDim + hDim + item_c - 1) / item_c * item_c; + U32 xhSize = (xhNum * bytesOf(dt) + 1023) / 1024 * 1024; + + U32 filterRow; + tensorSelectGet(filterDesc, NULL, NULL, NULL, NULL, &filterRow, NULL); + U32 interNum = filterRow + 4; + U32 interSize = (interNum * bytesOf(dt) + 1023) / 1024 * 1024; + + U32 tmpOutSize = 0; + if (rnncellDesc.numProjection > 0) { + U32 tmpOutNum = rnncellDesc.numProjection; + tmpOutSize = (tmpOutNum * bytesOf(dt) + 1023) / 1024 
* 1024; + } + *bytes = xhSize + interSize + tmpOutSize; + return SUCCESS; +} + +EE rnncell_mali_fp16(GCLHandle_t handle, + TensorDesc xDesc, + const GCLMem_t currentX, + TensorDesc filterDesc, + GCLMem_t filter, + TensorDesc biasDesc, + GCLMem_t bias, + GCLMem_t state, + U32 tmpBytes, + GCLMem_t tmpBuf, + RNNParamSpec rnncellDesc, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + GCLMem_t output, + ForwardRunInfoMali_t forwardRunInfo) +{ + CHECK_STATUS(rnncell_checkpara_mali_fp16(xDesc, filterDesc, biasDesc, hDesc)); + CHECK_STATUS(fill_output_zero(handle, output, hDesc)); + CHECK_STATUS(rnncell_core_mali_fp16(handle, xDesc, currentX, filterDesc, filter, biasDesc, bias, + state, tmpBytes, tmpBuf, rnncellDesc, batchStrideX, batchStrideH, hDesc, output, + forwardRunInfo)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/rnncell_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/rnncell_mali_fp16.h new file mode 100644 index 00000000..7cb9fccf --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/rnncell_mali_fp16.h @@ -0,0 +1,44 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
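On the rnncell implementation above: rnncell_infer_forward_tmp_bytes_mali_fp16 sizes three sub-buffers that the core routine later carves out of one cl_mem via gcl_create_sub_buffer: the packed [x, h] vector, the per-gate intermediate results (filterRow + 4 values), and an optional projection staging area. Each is rounded up to a 1024-byte boundary so the sub-buffer offsets stay aligned. A standalone restatement of that arithmetic (hypothetical helper, fp16 assumed):

```cpp
#include <cstdint>

// Round a byte count up to the 1024-byte sub-buffer alignment used above.
static uint32_t align1024(uint32_t bytes)
{
    return (bytes + 1023) / 1024 * 1024;
}

// Scratch bytes for one LSTM cell step: x and the previous hidden state are
// packed into one vector padded to the tuned tile width item_c; the gate
// buffer holds one value per filter row plus 4 slots of headroom; the
// projection buffer exists only when numProjection > 0.
uint32_t rnncell_tmp_bytes(uint32_t xDim, uint32_t hDim, uint32_t filterRow,
                           uint32_t numProjection, uint32_t itemC)
{
    const uint32_t bytesPerElem = 2;  // fp16
    uint32_t xhNum = (xDim + hDim + itemC - 1) / itemC * itemC;
    uint32_t total = align1024(xhNum * bytesPerElem);
    total += align1024((filterRow + 4) * bytesPerElem);
    if (numProjection > 0) {
        total += align1024(numProjection * bytesPerElem);
    }
    return total;
}
```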
+ +#ifndef _RNNCELL_MALI_FP16 +#define _RNNCELL_MALI_FP16 +#include "sys.h" +#include "error.h" +#include "types.h" +#include "tensor_computing_type.h" + +EE rnncell_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + RNNParamSpec rnncellDesc, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo); + +EE rnncell_mali_fp16(GCLHandle_t handle, + TensorDesc xDesc, + const GCLMem_t currentX, + TensorDesc filterDesc, + GCLMem_t filter, + TensorDesc biasDesc, + GCLMem_t bias, + GCLMem_t state, + U32 tmpBytes, + GCLMem_t tmpBuf, + RNNParamSpec rnncellDesc, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + GCLMem_t output, + ForwardRunInfoMali_t forwardRunInfo); +#endif diff --git a/tensor_computing/src/gpu/mali/fp16/scale_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/scale_mali_fp16.cpp similarity index 56% rename from tensor_computing/src/gpu/mali/fp16/scale_mali_fp16.cpp rename to compute/tensor/src/gpu/mali/fp16/scale_mali_fp16.cpp index 06bf46f0..a9ba9c5c 100644 --- a/tensor_computing/src/gpu/mali/fp16/scale_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/scale_mali_fp16.cpp @@ -11,30 +11,30 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include "sys.h" -#include "type.h" -#include "tensor_desc.h" #include "error.h" -#include "tensor_computing_type.h" +#include "types.h" #include "gpu/mali/fp16/scale_mali_fp16.h" -inline EE scale_checkpara_mali_fp16(TensorDesc inputDesc, - TensorDesc outputDesc) { - if(inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_F16) return NOT_SUPPORTED; - return SUCCESS; +inline EE scale_checkpara_mali_fp16(TensorDesc inputDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; } inline EE scale_core_mali_fp16(GCLHandle_t handle, - GCLMem_t alpha, - GCLMem_t beta, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { + GCLMem_t alpha, + GCLMem_t beta, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output) +{ UNUSED(outputDesc); U32 iw, ih, ic, in; - tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); U32 iw_str, ih_str, iw_off, ih_off; U32 ow_str, oh_str, ow_off, oh_off; ih_str = input->desc.stride[0]; @@ -46,47 +46,62 @@ inline EE scale_core_mali_fp16(GCLHandle_t handle, oh_off = output->desc.offset[0]; ow_off = output->desc.offset[1]; cl_mem inbuf, outbuf, albuf, bebuf; - inbuf = input->mem; + inbuf = input->mem; outbuf = output->mem; - albuf = alpha->mem; - bebuf = (beta) ? beta->mem : albuf; + albuf = alpha->mem; + bebuf = (beta) ? 
beta->mem : albuf; char modeName[16]; - if(beta){ + char kernelName[128]; + if (beta) { strcpy(modeName, "beta"); + if (alpha->desc.stride[0] == 1 && beta->desc.stride[0] == 1 && alpha->desc.stride[1] == 1 && + beta->desc.stride[1] == 1 && alpha->desc.stride[2] == 1 && beta->desc.stride[2] == 1) { + sprintf(kernelName, "scale1_%s", modeName); + } else { + sprintf(kernelName, "scale_%s", modeName); + } } else { strcpy(modeName, "nobeta"); + if (alpha->desc.stride[0] == 1 && alpha->desc.stride[1] == 1 && alpha->desc.stride[2] == 1) { + sprintf(kernelName, "scale1_%s", modeName); + } else { + sprintf(kernelName, "scale_%s", modeName); + } } - char kernelName[128]; - sprintf(kernelName, "scale_%s", modeName); - U32 gs[3] = {(ih + 1) / 2, iw, (ic + 3) / 4}; + + U32 gs[3] = {ih, iw, (ic + 3) / 4}; U32 ls[3] = {0, 0, 0}; - U32 dim = 3; + U32 dim = 3; Kernel kernel; - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelName, &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, ih_str, iw_str, ih_off, iw_off, oh_str, ow_str, oh_off, ow_off, gs[0], gs[1], albuf, bebuf, inbuf, outbuf)); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, ih_str, iw_str, ih_off, iw_off, oh_str, ow_str, + oh_off, ow_off, gs[0], gs[1], albuf, bebuf, inbuf, outbuf)); gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); #ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, input, "scale_input")); + CHECK_STATUS(gcl_print_memory(handle, input, "scale_input")); CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); - CHECK_STATUS(gcl_print_memory(handle, alpha, "scale_alpha")); - if(beta) - CHECK_STATUS(gcl_print_memory(handle, beta, "scale_beta")); + CHECK_STATUS(gcl_print_memory(handle, alpha, "scale_alpha")); + if (beta) { + CHECK_STATUS(gcl_print_memory(handle, beta, "scale_beta")); + } CHECK_STATUS(gcl_print_memory(handle, output, "scale_output")); #endif - return SUCCESS; + return SUCCESS; } - EE scale_mali_fp16(GCLHandle_t handle, - GCLMem_t alpha, - GCLMem_t beta, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { + GCLMem_t alpha, + GCLMem_t beta, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output) +{ CHECK_STATUS(scale_checkpara_mali_fp16(inputDesc, outputDesc)); + if (input->mem != output->mem) { + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + } CHECK_STATUS(scale_core_mali_fp16(handle, alpha, beta, inputDesc, input, outputDesc, output)); - return SUCCESS; + return SUCCESS; } - diff --git a/tensor_computing/src/gpu/mali/fp16/scale_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/scale_mali_fp16.h similarity index 82% rename from tensor_computing/src/gpu/mali/fp16/scale_mali_fp16.h rename to compute/tensor/src/gpu/mali/fp16/scale_mali_fp16.h index b5f1bb9b..f135907d 100644 --- a/tensor_computing/src/gpu/mali/fp16/scale_mali_fp16.h +++ b/compute/tensor/src/gpu/mali/fp16/scale_mali_fp16.h @@ -14,16 +14,15 @@ #ifndef _SCALE_MALI_FP16 #define _SCALE_MALI_FP16 #include "sys.h" -#include "type.h" -#include "tensor_desc.h" +#include "types.h" #include "error.h" #include "tensor_computing_type.h" EE scale_mali_fp16(GCLHandle_t handle, - GCLMem_t alpha, - GCLMem_t beta, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output); + GCLMem_t alpha, + GCLMem_t beta, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output); #endif diff --git a/tensor_computing/src/gpu/mali/fp16/slice_mali_fp16.cpp 
b/compute/tensor/src/gpu/mali/fp16/slice_mali_fp16.cpp similarity index 61% rename from tensor_computing/src/gpu/mali/fp16/slice_mali_fp16.cpp rename to compute/tensor/src/gpu/mali/fp16/slice_mali_fp16.cpp index 8bb38559..268eb884 100644 --- a/tensor_computing/src/gpu/mali/fp16/slice_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/slice_mali_fp16.cpp @@ -11,37 +11,38 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include "sys.h" -#include "type.h" -#include "tensor_desc.h" +#include "types.h" #include "error.h" -#include "tensor_computing_type.h" #include "gpu/mali/fp16/slice_mali_fp16.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" #define MAX_SLICE_NUM 2 -inline EE slice_checkpara_mali_fp16(TensorDesc inputDesc, - std::vector<TensorDesc> outputDesc) { - if(inputDesc.dt != DT_F16) return NOT_SUPPORTED; - for(auto p : outputDesc) { - if(p.dt != DT_F16) return NOT_SUPPORTED; +inline EE slice_checkpara_mali_fp16(TensorDesc inputDesc, std::vector<TensorDesc> outputDesc) +{ + if (inputDesc.dt != DT_F16) { + return NOT_SUPPORTED; } - return SUCCESS; + for (auto p : outputDesc) { + if (p.dt != DT_F16) { + return NOT_SUPPORTED; + } + } + return SUCCESS; } -inline EE slice_core_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - I32 axis, - std::vector<TensorDesc> outputDesc, - std::vector<void*>* output) { - if(inputDesc.df == DF_MKT) { +inline EE slice_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + SliceParamSpec p, + std::vector<TensorDesc> outputDesc, + std::vector<void *> *output) +{ + if (inputDesc.df == DF_MKT) { U32 m, k, t; U32 gw, gh, gc; get_nlp_mkt_val(inputDesc, NULL, &m, &k, &t); map_nlp_mkt_to_ncwhc4(m, k, t, &gw, &gh, &gc); - if(axis == 2) { + if (p.axis == 2) { U32 iw_str, ih_str, iw_off, ih_off; ih_str = input->desc.stride[0]; iw_str = input->desc.stride[1]; @@ -54,10 +55,12 @@ inline EE slice_core_mali_fp16(GCLHandle_t handle, cl_mem outbuf[MAX_SLICE_NUM]; U32 sliceEnd[MAX_SLICE_NUM]; U32 sliceNum = (*output).size(); - if(sliceNum > MAX_SLICE_NUM) CHECK_STATUS(NOT_SUPPORTED); + if (sliceNum > MAX_SLICE_NUM) { + CHECK_STATUS(NOT_SUPPORTED); + } U32 j = 0; - std::vector<void*> outputArray = *output; - for(U32 i = 0; i < sliceNum; ++i) { + std::vector<void *> outputArray = *output; + for (U32 i = 0; i < sliceNum; ++i) { oh_str[i] = ((GCLMem_t)outputArray[i])->desc.stride[0]; ow_str[i] = ((GCLMem_t)outputArray[i])->desc.stride[1]; oh_off[i] = ((GCLMem_t)outputArray[i])->desc.offset[0]; @@ -70,15 +73,16 @@ inline EE slice_core_mali_fp16(GCLHandle_t handle, char kernelName[128]; sprintf(kernelName, "slice_h_%d", sliceNum); Kernel kernel; - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); U32 gs[3] = {gh, gw, gc}; U32 ls[3] = {0, 0, 0}; - U32 dim = 3; - switch(sliceNum) { + U32 dim = 3; + switch (sliceNum) { case 2: - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, iw_str, ih_off, iw_off, gs[0], gs[1], input->mem, - oh_str[0], ow_str[0], oh_off[0], ow_off[0], sliceEnd[0], outbuf[0], - oh_str[1], ow_str[1], oh_off[1], ow_off[1], sliceEnd[1], outbuf[1])); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, iw_str, ih_off, iw_off, gs[0], + gs[1], input->mem, oh_str[0], ow_str[0], oh_off[0], ow_off[0], sliceEnd[0], + outbuf[0], oh_str[1], ow_str[1], oh_off[1], ow_off[1], sliceEnd[1], + outbuf[1])); break; default: return NOT_SUPPORTED; @@
-86,25 +90,31 @@ inline EE slice_core_mali_fp16(GCLHandle_t handle, gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); #ifdef _DEBUG CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); - CHECK_STATUS(gcl_print_memory(handle, input, "slice_input")); - for(U32 i = 0; i < sliceNum; ++i) CHECK_STATUS(gcl_print_memory(handle, (GCLMem_t)(outputArray[i]), "slice_output")); + CHECK_STATUS(gcl_print_memory(handle, input, "slice_input")); + for (U32 i = 0; i < sliceNum; ++i) { + CHECK_STATUS( + gcl_print_memory(handle, (GCLMem_t)(outputArray[i]), "slice_output")); + } #endif return SUCCESS; } return NOT_SUPPORTED; } - return NOT_SUPPORTED; + return NOT_SUPPORTED; } - -EE slice_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - I32 axis, - std::vector<TensorDesc> outputDesc, - std::vector<void*>* output) { +EE slice_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + SliceParamSpec p, + std::vector<TensorDesc> outputDesc, + std::vector<void *> *output) +{ + std::vector<void *> outputArray = *output; CHECK_STATUS(slice_checkpara_mali_fp16(inputDesc, outputDesc)); - CHECK_STATUS(slice_core_mali_fp16(handle, inputDesc, input, axis, outputDesc, output)); - return SUCCESS; + for (U32 i = 0; i < outputArray.size(); i++) { + CHECK_STATUS(fill_output_zero(handle, (GCLMem_t)(outputArray[i]), outputDesc[i])); + } + CHECK_STATUS(slice_core_mali_fp16(handle, inputDesc, input, p, outputDesc, output)); + return SUCCESS; } - diff --git a/tensor_computing/src/gpu/mali/fp16/slice_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/slice_mali_fp16.h similarity index 76% rename from tensor_computing/src/gpu/mali/fp16/slice_mali_fp16.h rename to compute/tensor/src/gpu/mali/fp16/slice_mali_fp16.h index e75ea731..c238a2b0 100644 --- a/tensor_computing/src/gpu/mali/fp16/slice_mali_fp16.h +++ b/compute/tensor/src/gpu/mali/fp16/slice_mali_fp16.h @@ -11,19 +11,16 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _SLICE_MALI_FP16 #define _SLICE_MALI_FP16 #include "sys.h" -#include "type.h" -#include "tensor_desc.h" -#include "error.h" +#include "types.h" #include "tensor_computing_type.h" -EE slice_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - I32 axis, - std::vector<TensorDesc> outputDesc, - std::vector<void*>* output); +EE slice_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + SliceParamSpec p, + std::vector<TensorDesc> outputDesc, + std::vector<void *> *output); #endif diff --git a/compute/tensor/src/gpu/mali/fp16/softmax_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/softmax_mali_fp16.cpp new file mode 100644 index 00000000..4c151c86 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/softmax_mali_fp16.cpp @@ -0,0 +1,207 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
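For the slice hunks above: only DF_MKT inputs sliced on the token axis (p.axis == 2) with at most MAX_SLICE_NUM outputs are handled, and the kernel receives a running end index per output so it can route each row. A sketch of how such end indices can be derived (hypothetical helper; the patch computes them inline from the output descriptors):

```cpp
#include <cstdint>
#include <vector>

// Running end positions along the sliced axis: with extents {7, 9} this
// returns {7, 16}, so rows 0..6 land in output 0 and rows 7..15 in output 1.
std::vector<uint32_t> build_slice_ends(const std::vector<uint32_t> &extents)
{
    std::vector<uint32_t> sliceEnd(extents.size());
    uint32_t end = 0;
    for (size_t i = 0; i < extents.size(); ++i) {
        end += extents[i];
        sliceEnd[i] = end;
    }
    return sliceEnd;
}
```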
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "error.h" +#include "gpu/mali/fp16/softmax_mali_fp16.h" +namespace { +constexpr int SOFTMAX_KERNEL_ITEM_NUM = 16; +constexpr int SOFTMAX_KERNEL_TMPBUF_EXPAND = 2; +} // namespace + +inline EE softmax_checkpara_mali_fp16(TensorDesc inputDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != outputDesc.dt) { + return NOT_SUPPORTED; + } + if (outputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE softmax_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + GCLMem_t tmp, + int axis, + TensorDesc outputDesc, + GCLMem_t output) +{ + UNUSED(outputDesc); + U32 iw, ih, ic, in; + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + U32 iw_str, ih_str, ic_str, iw_off, ih_off, ihw_str; + U32 ow_str, oh_str, ow_off, oh_off, ohw_str; + get_gclmem_dim(input->desc, &iw_str, &ih_str, &ic_str, &iw_off, &ih_off); + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + ihw_str = ih_str * iw_str; + ohw_str = oh_str * ow_str; + U32 nDims = inputDesc.nDims; + I32 axisTran = (axis + nDims) % nDims; + axisTran = nDims - 1 - axisTran; + cl_mem inbuf, outbuf; + inbuf = input->mem; + outbuf = output->mem; + Kernel kernel; + char kernelname[128]; + U32 gs[2]; + U32 ls[2] = {0, 0}; + U32 dim = 2; + if (iw_off == 0 && ih_off == 0) { + bool matchCase = false; + I32 icd4; + I32 ice4; + if (iw_str == 1 && ih_str == 1) { + icd4 = (ic + 3) >> 2; + ice4 = ((ic & 3) == 0) ? 4 : (ic & 3); + matchCase = true; + } + if (iw_str == 1 && ic_str == 1) { + icd4 = (ih + 3) >> 2; + ice4 = ((ih & 3) == 0) ? 4 : (ih & 3); + matchCase = true; + } + if (ih_str == 1 && ic_str == 1) { + icd4 = (iw + 3) >> 2; + ice4 = ((iw & 3) == 0) ? 
4 : (iw & 3); + matchCase = true; + } + + if (matchCase) { + gs[0] = SOFTMAX_KERNEL_ITEM_NUM; + dim = 1; + Mem clTmpBuf = tmp->mem; + sprintf(kernelname, "softmax_h1w1_max_part"); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS( + gcl_set_kernelArgs(kernel, icd4, ice4, SOFTMAX_KERNEL_ITEM_NUM, inbuf, clTmpBuf)); + CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname)); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); +#endif + dim = 1; + gs[0] = 1; + sprintf(kernelname, "softmax_h1w1_max_all"); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, SOFTMAX_KERNEL_ITEM_NUM, clTmpBuf)); + CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname)); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); +#endif + dim = 1; + gs[0] = SOFTMAX_KERNEL_ITEM_NUM; + sprintf(kernelname, "softmax_h1w1_sum_part"); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS( + gcl_set_kernelArgs(kernel, icd4, ice4, SOFTMAX_KERNEL_ITEM_NUM, inbuf, clTmpBuf)); + CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname)); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); +#endif + dim = 1; + gs[0] = 1; + sprintf(kernelname, "softmax_h1w1_sum_all"); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, SOFTMAX_KERNEL_ITEM_NUM, clTmpBuf)); + CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname)); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); +#endif + dim = 1; + gs[0] = icd4; + sprintf(kernelname, "softmax_h1w1_output"); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs( + kernel, icd4, ice4, SOFTMAX_KERNEL_ITEM_NUM, inbuf, clTmpBuf, outbuf)); + CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname)); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); +#endif + return SUCCESS; + } + } + + if (input->desc.memFormat == DF_NCWHC4) { + if ((nDims == 4 && axisTran == 1) || (inputDesc.df == DF_MTK && axisTran == 0) || + (inputDesc.df == DF_MKT && axisTran == 1)) { + gs[0] = ih; + gs[1] = iw; + I32 icd4 = (ic + 3) >> 2; + I32 ice4 = ((ic & 3) == 0) ? 4 : (ic & 3); + sprintf(kernelname, "softmax"); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, icd4, ice4, ih_str, ihw_str, ih_off, iw_off, + oh_str, ohw_str, oh_off, ow_off, gs[0], gs[1], inbuf, outbuf)); + } else { + return NOT_SUPPORTED; + } + } else if (input->desc.memFormat == DF_NCHW) { + if (axisTran == 2) { // on c axis + gs[0] = (iw + 3) / 4; + gs[1] = ih; + sprintf(kernelname, "softmax_nchw_c"); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ic, iw_str, ihw_str, iw_off, ih_off, ow_str, + ohw_str, ow_off, oh_off, iw, gs[0], gs[1], inbuf, outbuf)); + } else if (axisTran == 0) { // on w axis + gs[0] = ih; + gs[1] = ic; + I32 iwd4 = (iw + 3) >> 2; + I32 iwe4 = ((iw & 3) == 0) ? 
4 : (iw & 3); + sprintf(kernelname, "softmax_nchw_w"); + CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iwd4, iwe4, iw_str, ih_str, iw_off, ih_off, + ow_str, oh_str, ow_off, oh_off, gs[0], gs[1], inbuf, outbuf)); + } else { + return NOT_SUPPORTED; + } + } else { + return NOT_SUPPORTED; + } + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelname)); +#endif + return SUCCESS; +} + +EE softmax_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + GCLMem_t tmp, + int axis, + TensorDesc outputDesc, + GCLMem_t output) +{ + CHECK_STATUS(softmax_checkpara_mali_fp16(inputDesc, outputDesc)); + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS(softmax_core_mali_fp16(handle, inputDesc, input, tmp, axis, outputDesc, output)); + return SUCCESS; +} + +EE softmax_infer_forward_tmp_bytes_mali_fp16( + TensorDesc inputDesc, U32 *bytes, ForwardRunInfoMali_t forwardRunInfo) +{ + UNUSED(forwardRunInfo); + U32 in, ic, ih, iw; + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + if (ih != 1 || iw != 1 || in != 1) { + *bytes = 0; + } else { + *bytes = SOFTMAX_KERNEL_ITEM_NUM + SOFTMAX_KERNEL_TMPBUF_EXPAND; + } + + return SUCCESS; +} diff --git a/tensor_computing/src/gpu/mali/fp16/softmax_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/softmax_mali_fp16.h similarity index 82% rename from tensor_computing/src/gpu/mali/fp16/softmax_mali_fp16.h rename to compute/tensor/src/gpu/mali/fp16/softmax_mali_fp16.h index 758e83b3..5a01ff3b 100644 --- a/tensor_computing/src/gpu/mali/fp16/softmax_mali_fp16.h +++ b/compute/tensor/src/gpu/mali/fp16/softmax_mali_fp16.h @@ -11,21 +11,21 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _H_SOFTMAX_MALI_FP16 #define _H_SOFTMAX_MALI_FP16 #include "sys.h" -#include "type.h" -#include "tensor_desc.h" +#include "types.h" #include "error.h" #include "tensor_computing_type.h" -EE softmax_mali_fp16(GCLHandle_t handle, - TensorDesc inputdesc, - GCLMem_t input, - int axis, - TensorDesc outputDesc, - GCLMem_t output); -#endif - +EE softmax_infer_forward_tmp_bytes_mali_fp16( + TensorDesc inputDesc, U32 *bytes, ForwardRunInfoMali_t forwardRunInfo); +EE softmax_mali_fp16(GCLHandle_t handle, + TensorDesc inputdesc, + GCLMem_t input, + GCLMem_t tmp, + int axis, + TensorDesc outputDesc, + GCLMem_t output); +#endif diff --git a/tensor_computing/src/gpu/mali/fp16/squeeze_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/squeeze_mali_fp16.cpp similarity index 64% rename from tensor_computing/src/gpu/mali/fp16/squeeze_mali_fp16.cpp rename to compute/tensor/src/gpu/mali/fp16/squeeze_mali_fp16.cpp index fcf5077d..8757f85e 100644 --- a/tensor_computing/src/gpu/mali/fp16/squeeze_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/squeeze_mali_fp16.cpp @@ -11,29 +11,35 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- #include "sys.h" -#include "type.h" -#include "tensor_desc.h" +#include "types.h" #include "error.h" -#include "tensor_computing_type.h" #include "gpu/mali/fp16/squeeze_mali_fp16.h" -inline EE squeeze_checkpara_mali_fp16(TensorDesc inputDesc, - TensorDesc outputDesc) { - if(inputDesc.dt != outputDesc.dt) return NOT_SUPPORTED; - if(outputDesc.dt != DT_F16) return NOT_SUPPORTED; - return SUCCESS; +inline EE squeeze_checkpara_mali_fp16(TensorDesc inputDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != outputDesc.dt) { + return NOT_SUPPORTED; + } + if (outputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; } -inline EE squeeze_core_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { +inline EE squeeze_core_mali_fp16( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ UNUSED(outputDesc); U32 iw, ih, ic, in; - tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + if (inputDesc.df == DF_NCHW) { + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + } else if (inputDesc.df == DF_MKT) { + get_nlp_mkt_val(inputDesc, NULL, &in, &ic, &ih); + iw = 1; + } else { + return NOT_SUPPORTED; + } U32 iw_str, ih_str, iw_off, ih_off; ih_str = input->desc.stride[0]; iw_str = input->desc.stride[1]; @@ -46,32 +52,32 @@ inline EE squeeze_core_mali_fp16(GCLHandle_t handle, ow_off = output->desc.offset[1]; cl_mem inbuf, outbuf; - inbuf = input->mem; + inbuf = input->mem; outbuf = output->mem; U32 gs[3] = {ih, iw, (ic + 3) / 4}; U32 ls[3] = {0, 0, 0}; - U32 dim = 3; + U32 dim = 3; Kernel kernel; - CHECK_STATUS(gcl_create_kernel_binary(handle, "squeeze", &kernel)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ih_str, iw_str, ih_off, iw_off, oh_str, ow_str, oh_off, ow_off, inbuf, outbuf)); + CHECK_STATUS(gcl_create_kernel(handle, "squeeze", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ih_str, iw_str, ih_off, iw_off, oh_str, ow_str, + oh_off, ow_off, inbuf, outbuf)); gcl_set_kernelVec(handle, kernel, dim, gs, ls, "squeeze"); #ifdef _DEBUG CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "squeeze")); - CHECK_STATUS(gcl_print_memory(handle, input, "squeeze_input")); + CHECK_STATUS(gcl_print_memory(handle, input, "squeeze_input")); CHECK_STATUS(gcl_print_memory(handle, output, "squeeze_output")); #endif - return SUCCESS; + return SUCCESS; } -EE squeeze_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - +EE squeeze_mali_fp16( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ CHECK_STATUS(squeeze_checkpara_mali_fp16(inputDesc, outputDesc)); + if (input->mem != output->mem) { + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + } CHECK_STATUS(squeeze_core_mali_fp16(handle, inputDesc, input, outputDesc, output)); - return SUCCESS; + return SUCCESS; } - diff --git a/tensor_computing/src/gpu/mali/fp16/squeeze_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/squeeze_mali_fp16.h similarity index 79% rename from tensor_computing/src/gpu/mali/fp16/squeeze_mali_fp16.h rename to compute/tensor/src/gpu/mali/fp16/squeeze_mali_fp16.h index b8dcfaa6..f3e9cc94 100644 --- a/tensor_computing/src/gpu/mali/fp16/squeeze_mali_fp16.h +++ b/compute/tensor/src/gpu/mali/fp16/squeeze_mali_fp16.h @@ -11,20 +11,13 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 
ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#ifndef _ACTIVATION_MALI_FP16 -#define _ACTIVATION_MALI_FP16 +#ifndef _SQUEEZE_MALI_FP16 +#define _SQUEEZE_MALI_FP16 #include "sys.h" -#include "type.h" -#include "tensor_desc.h" +#include "types.h" #include "error.h" #include "tensor_computing_type.h" - -EE squeeze_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output); +EE squeeze_mali_fp16( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output); #endif - diff --git a/compute/tensor/src/gpu/mali/fp16/transpose_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/transpose_mali_fp16.cpp new file mode 100644 index 00000000..27d42df8 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/transpose_mali_fp16.cpp @@ -0,0 +1,229 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
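Context for the squeeze/unsqueeze hunks: on the GPU both ops are pure layout copies, which is why unsqueeze_mali_fp16.cpp later in this patch reuses the same "squeeze" OpenCL kernel. Below is a CPU reference of the data movement, under the assumption that each work-item moves one 4-channel group between padded NCWHC4 layouts and that the index formula follows the (ih_str, iw_str, ih_off, iw_off) arguments passed to the kernel:

```cpp
#include <array>
#include <cstdint>
#include <vector>

using F16x4 = std::array<uint16_t, 4>;  // one c4 group of raw fp16 values

// Strided copy between two padded NCWHC4 buffers; squeeze and unsqueeze
// differ only in how the TensorDescs are interpreted, not in the copy.
void squeeze_copy_ref(const std::vector<F16x4> &in, std::vector<F16x4> &out,
    uint32_t ih, uint32_t iw, uint32_t c4,
    uint32_t ih_str, uint32_t iw_str, uint32_t ih_off, uint32_t iw_off,
    uint32_t oh_str, uint32_t ow_str, uint32_t oh_off, uint32_t ow_off)
{
    for (uint32_t c = 0; c < c4; ++c) {
        for (uint32_t w = 0; w < iw; ++w) {
            for (uint32_t h = 0; h < ih; ++h) {
                uint32_t src = (c * iw_str + w + iw_off) * ih_str + h + ih_off;
                uint32_t dst = (c * ow_str + w + ow_off) * oh_str + h + oh_off;
                out[dst] = in[src];
            }
        }
    }
}
```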
+ +#include "sys.h" +#include "types.h" +#include "error.h" +#include "gpu/mali/fp16/transpose_mali_fp16.h" + +inline EE transpose_checkpara_mali_fp16(TensorDesc inputDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE transpose_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output, + GCLMem_t tmpbuf, + U32 *dims) +{ + DataFormat df; + U32 nDims; + U32 in, ic, ih, iw, it; + U32 on, oc, oh, ow, ot; + nDims = inputDesc.nDims; + tensorSelectGet(inputDesc, NULL, &df, &in, &ic, &ih, &iw, &it); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow, &ot); + DataFormat imf = input->desc.memFormat; + DataFormat omf = output->desc.memFormat; + U32 iw_str, ih_str, iw_off, ih_off; + U32 ow_str, oh_str, ow_off, oh_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, NULL, &iw_off, &ih_off); + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + cl_mem inbuf = input->mem; + cl_mem outbuf = output->mem; + cl_mem tmp = tmpbuf->mem; + I32 dimTran[6] = {0, 1, 2, 3, 4, 5}; + for (U32 i = 0; i < nDims; i++) { + dimTran[nDims - 1 - i] = nDims - 1 - dims[i]; + } + char kernelName[128]; + Kernel kernel; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + if (dimTran[2] == 2 && dimTran[3] == 3 && nDims == 4) { + bool matchCase = false; + if (imf == DF_NCWHC4 && omf == DF_NCWHC4) { + if (dimTran[0] == 0 && dimTran[1] == 1) { + sprintf(kernelName, "mem_trans_ncwhc4_to_ncwhc4"); + gs[0] = oh; + gs[1] = ow; + gs[2] = (oc + 3) / 4; + matchCase = true; + } else if (dimTran[0] == 1 && dimTran[1] == 0) { + sprintf(kernelName, "mem_trans_ncwhc4_to_ncwhc4_output_tran"); + gs[0] = ow; + gs[1] = oh; + gs[2] = (oc + 3) / 4; + matchCase = true; + } else { + return NOT_SUPPORTED; + } + } + if (imf == DF_NCWHC4 && omf == DF_NCHW) { + if (dimTran[0] == 0 && dimTran[1] == 1) { + sprintf(kernelName, "mem_trans_ncwhc4_to_nchw"); + gs[0] = oh; + gs[1] = (ow + 3) / 4; + gs[2] = (oc + 3) / 4; + matchCase = true; + } else if (dimTran[0] == 1 && dimTran[1] == 0) { + sprintf(kernelName, "mem_trans_ncwhc4_to_nchw_output_tran"); + gs[0] = (ow + 3) / 4; + gs[1] = oh; + gs[2] = (oc + 3) / 4; + matchCase = true; + } else { + return NOT_SUPPORTED; + } + } + if (imf == DF_NCHW && omf == DF_NCWHC4) { + if (dimTran[0] == 0 && dimTran[1] == 1) { + sprintf(kernelName, "mem_trans_nchw_to_ncwhc4"); + gs[0] = (ow + 3) / 4; + gs[1] = oh; + gs[2] = (oc + 3) / 4; + matchCase = true; + } else if (dimTran[0] == 1 && dimTran[1] == 0) { + sprintf(kernelName, "mem_trans_nchw_to_ncwhc4_output_tran"); + gs[0] = (oh + 3) / 4; + gs[1] = ow; + gs[2] = (oc + 3) / 4; + matchCase = true; + } else { + return NOT_SUPPORTED; + } + } + if (matchCase) { + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, ow_str, oh_str, + ow_off, oh_off, iw, ih, ic, ow, oh, oc, 0, 0, inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); +#endif + return SUCCESS; + } + } + + if (imf == DF_NCWHC4) { + gs[0] = ih; + gs[1] = (iw + 3) / 4; + gs[2] = (ic + 3) / 4 * it; + if (df == DF_NCTHW) { + sprintf(kernelName, "mem_trans_3d_ncwhc4_to_nchw"); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, iw, ih, 0, 0, 
+ iw, ih, ic, iw, ih, ic, 0, 0, inbuf, tmp)); + } else { + sprintf(kernelName, "mem_trans_ncwhc4_to_nchw"); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, iw, ih, 0, 0, + iw, ih, ic, it, iw, ih, ic, it, 0, 0, inbuf, tmp)); + } + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); +#endif + inbuf = tmp; + } + U32 ow_str_val = ow_str; + U32 oh_str_val = oh_str; + U32 ow_off_val = ow_off; + U32 oh_off_val = oh_off; + + if (omf == DF_NCWHC4) { + U32 offset = tensorNumBytes(inputDesc); + offset = ALIGN(offset, 1024); + U32 size = tensorNumBytes(outputDesc); + gcl_create_sub_buffer(size, &offset, tmpbuf, &outbuf); + ow_str_val = ow; + oh_str_val = oh; + ow_off_val = 0; + oh_off_val = 0; + } + + gs[0] = (iw + 3) / 4; + gs[1] = ih; + gs[2] = ic * it; + if (df == DF_NCTHW) { + sprintf(kernelName, "transpose_3d_nchw"); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, ow_str_val, + oh_str_val, ow_off_val, oh_off_val, dimTran[0], dimTran[1], dimTran[2], dimTran[3], iw, + it, ot, gs[0], gs[1], inbuf, outbuf)); + } else { + sprintf(kernelName, "transpose_nchw"); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, ow_str_val, + oh_str_val, ow_off_val, oh_off_val, dimTran[0], dimTran[1], dimTran[2], iw, gs[0], + gs[1], inbuf, outbuf)); + } + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); +#endif + if (omf == DF_NCWHC4) { + if (df == DF_NCTHW) { + CHECK_STATUS(NOT_SUPPORTED); + } + sprintf(kernelName, "mem_trans_nchw_to_ncwhc4"); + gs[0] = (ow + 3) / 4; + gs[1] = oh; + gs[2] = (oc + 3) / 4; + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ow_str_val, oh_str_val, ow_off_val, oh_off_val, + ow_str, oh_str, ow_off, oh_off, ow, oh, oc, ow, oh, oc, 0, 0, outbuf, output->mem)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); +#endif + } + return SUCCESS; +} + +EE transpose_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc, + U32 *bytes) +{ + UNUSED(inputDesc); + UNUSED(outputDesc); + U32 input_size = gclmemInputDesc->byteSize; + input_size = ALIGN(input_size, 1024); + U32 output_size = gclmemOutputDesc->byteSize; + *bytes = input_size + output_size; + return SUCCESS; +} + +EE transpose_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output, + GCLMem_t tmpbuf, + U32 *dim) +{ + CHECK_STATUS(transpose_checkpara_mali_fp16(inputDesc, outputDesc)); + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS( + transpose_core_mali_fp16(handle, inputDesc, input, outputDesc, output, tmpbuf, dim)); + return SUCCESS; +} diff --git a/tensor_computing/src/gpu/mali/fp16/transpose_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/transpose_mali_fp16.h similarity index 79% rename from tensor_computing/src/gpu/mali/fp16/transpose_mali_fp16.h rename to compute/tensor/src/gpu/mali/fp16/transpose_mali_fp16.h index 5be446e1..5123b20b 100644 ---
a/tensor_computing/src/gpu/mali/fp16/transpose_mali_fp16.h +++ b/compute/tensor/src/gpu/mali/fp16/transpose_mali_fp16.h @@ -14,15 +14,21 @@ #ifndef _TRANSPOSE_MALI_FP16 #define _TRANSPOSE_MALI_FP16 #include "sys.h" -#include "type.h" -#include "tensor_desc.h" +#include "types.h" #include "error.h" #include "tensor_computing_type.h" +EE transpose_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc, + U32 *bytes); + EE transpose_mali_fp16(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output, - U32* dim); + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output, + GCLMem_t tmpbuf, + U32 *dim); #endif diff --git a/compute/tensor/src/gpu/mali/fp16/unsqueeze_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/unsqueeze_mali_fp16.cpp new file mode 100644 index 00000000..b2bb0f3e --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/unsqueeze_mali_fp16.cpp @@ -0,0 +1,81 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
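One remark on transpose_core_mali_fp16 above: the caller's permutation dims is given outer-to-inner (N, C, H, W) while the OpenCL kernels index from the innermost axis, so both the slot and the value of every entry are mirrored through nDims - 1. A standalone restatement plus a worked example:

```cpp
#include <cstdint>

// Mirror the outer-to-inner permutation into the kernels' inner-to-outer
// view; slots beyond nDims keep the identity, as in the patch.
void remap_transpose_dims(const uint32_t *dims, uint32_t nDims, int32_t dimTran[6])
{
    for (uint32_t i = 0; i < 6; ++i) {
        dimTran[i] = static_cast<int32_t>(i);
    }
    for (uint32_t i = 0; i < nDims; ++i) {
        dimTran[nDims - 1 - i] = static_cast<int32_t>(nDims - 1 - dims[i]);
    }
}
```

For nDims = 4 and dims = {0, 1, 3, 2} (swap H and W) this gives dimTran = {1, 0, 2, 3, 4, 5}: entries 2 and 3 are identity, so the dimTran[2] == 2 && dimTran[3] == 3 fast path applies and one of the *_output_tran layout kernels performs the swap directly.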
+ +#include "sys.h" +#include "types.h" +#include "error.h" +#include "gpu/mali/fp16/unsqueeze_mali_fp16.h" + +inline EE unsqueeze_checkpara_mali_fp16(TensorDesc inputDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != outputDesc.dt) { + return NOT_SUPPORTED; + } + if (outputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE unsqueeze_core_mali_fp16( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + UNUSED(outputDesc); + U32 iw, ih, ic, in; + if (inputDesc.df == DF_NCHW) { + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + } else if (inputDesc.df == DF_MKT) { + get_nlp_mkt_val(inputDesc, NULL, &in, &ic, &ih); + iw = 1; + } else { + return NOT_SUPPORTED; + } + U32 iw_str, ih_str, iw_off, ih_off; + ih_str = input->desc.stride[0]; + iw_str = input->desc.stride[1]; + ih_off = input->desc.offset[0]; + iw_off = input->desc.offset[1]; + U32 ow_str, oh_str, ow_off, oh_off; + oh_str = output->desc.stride[0]; + ow_str = output->desc.stride[1]; + oh_off = output->desc.offset[0]; + ow_off = output->desc.offset[1]; + + cl_mem inbuf, outbuf; + inbuf = input->mem; + outbuf = output->mem; + + U32 gs[3] = {ih, iw, (ic + 3) / 4}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "squeeze", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, ih, iw, ih_str, iw_str, ih_off, iw_off, oh_str, ow_str, + oh_off, ow_off, inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "squeeze"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "squeeze")); + CHECK_STATUS(gcl_print_memory(handle, input, "unsqueeze_input")); + CHECK_STATUS(gcl_print_memory(handle, output, "unsqueeze_output")); +#endif + return SUCCESS; +} + +EE unsqueeze_mali_fp16( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + CHECK_STATUS(unsqueeze_checkpara_mali_fp16(inputDesc, outputDesc)); + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS(unsqueeze_core_mali_fp16(handle, inputDesc, input, outputDesc, output)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/unsqueeze_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/unsqueeze_mali_fp16.h new file mode 100644 index 00000000..cd2a2b06 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/unsqueeze_mali_fp16.h @@ -0,0 +1,23 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _UNSQUEEZE_MALI_FP16 +#define _UNSQUEEZE_MALI_FP16 +#include "sys.h" +#include "types.h" +#include "error.h" +#include "tensor_computing_type.h" + +EE unsqueeze_mali_fp16( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output); +#endif diff --git a/compute/tensor/src/gpu/mali/fully_connected.cpp b/compute/tensor/src/gpu/mali/fully_connected.cpp new file mode 100644 index 00000000..090daf11 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fully_connected.cpp @@ -0,0 +1,507 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/fully_connected_mali_fp16.h" +inline void fully_connected_produce_algos_paras(TensorDesc inputDesc, + TensorDesc filterDesc, + std::vector<TensorDesc> outputDescs, + std::vector<ConvolutionForwardAlgorithm> *fcAlgorithms, + std::vector<U32> *algoNumIndex, + std::vector<U32> *vecW, + std::vector<U32> *vecC, + std::vector<U32> *vecK) +{ + DataType dt; + U32 iw, ih, ic, fw, fh, fn; + tensorSelectGet(filterDesc, &dt, NULL, &fn, NULL, &fh, &fw); + tensorSelectGet(inputDesc, NULL, NULL, NULL, &ic, &ih, &iw); + U32 configInfo[3][128]; + U32 configNums[2]; + ConvolutionForwardAlgorithm algo[2]; + U32 algoNum = 1; + algo[0] = CONVOLUTION_ALGORITHM_DIRECT; + if (inputDesc.df == DF_NCHW || inputDesc.df == DF_NORMAL) { + if (ih != 1 || iw != 1 || fh != 1 || fw != 1) { + U32 item_w = (64 + ih - 1) / ih; + item_w = (item_w > iw) ?
iw : item_w; + configInfo[0][0] = item_w; + configInfo[1][0] = 4; + configInfo[2][0] = 4; + configNums[0] = 1; + } else { + U32 configNum = 0; + U32 j = 8; + for (U32 i = 0; i < 3; i++) { + configInfo[0][configNum] = 1; + configInfo[1][configNum] = 1 << (2 + i); + configInfo[2][configNum] = 0; + configNum++; + if (ic % j != 0) { + break; + } + j = j << 1; + } + configNums[0] = configNum; + } + } else if (inputDesc.df == DF_MKT) { + U32 configNum = 0; + bool align8 = true; + U32 nj = 8; + U32 k = 4; + for (U32 i = 0; i < outputDescs.size(); i++) { + if (outputDescs[i].dims[1] % 8 != 0) { + align8 = false; + } + } + for (U32 i = 0; i < 2; i++) { + for (U32 j = 0; j < nj; j++) { + configInfo[0][configNum] = j + 1; + configInfo[1][configNum] = 4; + configInfo[2][configNum] = k; + configNum++; + } + if (!align8) { + break; + } + nj = 4; + k = 8; + } + configNums[0] = configNum; + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + + for (U32 i = 0; i < algoNum; i++) { + (*fcAlgorithms).push_back(algo[i]); + (*algoNumIndex).push_back(configNums[i]); + U32 be = (i == 0) ? 0 : configNums[i - 1]; + U32 end = configNums[i]; + for (U32 j = be; j < end; j++) { + if (vecW) { + (*vecW).push_back(configInfo[0][j]); + } + if (vecC) { + (*vecC).push_back(configInfo[1][j]); + } + if (vecK) { + (*vecK).push_back(configInfo[2][j]); + } + } + } +} +inline EE fully_connected_checkpara_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + std::vector<GCLMem_t> *filter, + std::vector<GCLMem_t> *bias, + TensorDesc outputDesc, + std::vector<GCLMem_t> *output) +{ + if (nullptr == handle || nullptr == input || nullptr == filter || nullptr == output || + nullptr == bias) { + return NULL_POINTER; + } + if (filter->size() != output->size() || filter->size() != bias->size() || bias->size() == 0) { + return NOT_MATCH; + } + for (U32 i = 0; i < filter->size(); ++i) { + if (nullptr == (*filter)[i] || nullptr == (*output)[i] || nullptr == (*bias)[i]) { + return NULL_POINTER; + } + } + if (inputDesc.df == DF_NCHW || inputDesc.df == DF_NORMAL) { + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 oc; + CHECK_STATUS(tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensorSelectGet(outputDesc, NULL, NULL, NULL, &oc, NULL, NULL)); + if (filterDesc.df != DF_NCHW) { + return NOT_SUPPORTED; + } + if (input->desc.memFormat != DF_NCWHC4) { + return NOT_SUPPORTED; + } + if ((*filter)[0]->desc.memFormat != DF_NCWHN4C4) { + return NOT_SUPPORTED; + } + if ((*output)[0]->desc.memFormat != DF_NCWHC4) { + return NOT_SUPPORTED; + } + if (in > 1) { + return NOT_SUPPORTED; + } + if (filter->size() > 1) { + return NOT_SUPPORTED; + } + if (fw != iw) { + return NOT_MATCH; + } + if (fh != ih) { + return NOT_MATCH; + } + if (fc != ic) { + return NOT_MATCH; + } + if (fn != oc) { + return NOT_MATCH; + } + } + if (inputDesc.df == DF_MKT) { + U32 k; + U32 fw, fh, fc, fn; + k = inputDesc.dims[1]; + CHECK_STATUS(tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw)); + if (fh != 1 || fw != 1) { + return NOT_MATCH; + } + if (k != fc) { + return NOT_MATCH; + } + } + return SUCCESS; +} +EE fully_connected_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + U32 fn; + tensorSelectGet(filterDesc, NULL, NULL, &fn, NULL, NULL, NULL); + if (inputDesc.df == DF_NCHW || inputDesc.df == DF_NORMAL) { + DataType idt; + DataFormat idf; + U32 iw,
ih, ic, in; + tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); + if (outputDesc) { + *outputDesc = tensor4df(idt, idf, in, fn, 1, 1); + } + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih, ic, 0, 0, 1, 1, fn, idt, idt, gclmemInputDesc, gclmemOutputDesc)); + return SUCCESS; + } else if (inputDesc.df == DF_MKT) { + bool need_pad = false; + DataType dt; + U32 m, k, t; + get_nlp_mkt_val(inputDesc, &dt, &m, &k, &t); + if (outputDesc) { + *outputDesc = inputDesc; + (*outputDesc).dims[1] = fn; + } + std::vector fcAlgorithms; + std::vector algoNumIndex; + std::vector vecW; + std::vector outputDescs; + outputDescs.push_back(*outputDesc); + fully_connected_produce_algos_paras( + inputDesc, filterDesc, outputDescs, &fcAlgorithms, &algoNumIndex, &vecW, NULL, NULL); + U32 igw, igh, igc; + U32 ogw, ogh, ogc; + U32 t_align = t; + for (U32 i = 0; i < algoNumIndex[0]; i++) { + U32 j = ALIGN(t, vecW[i]); + t_align = (t_align < j) ? j : t_align; + } + if (t_align != t) { + need_pad = true; + } + map_nlp_mkt_to_ncwhc4(m, k, t_align, &igw, &igh, &igc); + map_nlp_mkt_to_ncwhc4(m, fn, t, &ogw, &ogh, &ogc); + igc = igc * 4; + ogc = ogc * 4; + CHECK_STATUS(infer_gclmem_desc_ncwhc4(igw, igh, igc, 0, 0, ogw, ogh, ogc, dt, dt, + gclmemInputDesc, gclmemOutputDesc, need_pad)); + return SUCCESS; + } + CHECK_STATUS(NOT_SUPPORTED); + return NOT_SUPPORTED; +} + +EE fully_connected_infer_forward_algorithm_mali(GCLHandle_t handle, + TensorDesc inputDesc, + TensorDesc filterDesc, + std::vector outputDescs, + ForwardRunInfoMali_t forwardRunInfo) +{ + if (forwardRunInfo == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + if (algorithm != CONVOLUTION_ALGORITHM_NULL) { + return SUCCESS; + } + DataType dt; + U32 fn; + tensorSelectGet(filterDesc, &dt, NULL, &fn, NULL, NULL, NULL); + std::vector fcAlgorithms; + std::vector algoNumIndex; + std::vector vecW; + std::vector vecC; + std::vector vecK; + fully_connected_produce_algos_paras( + inputDesc, filterDesc, outputDescs, &fcAlgorithms, &algoNumIndex, &vecW, &vecC, &vecK); + if (vecW.size() == 1) { + forwardRunInfo->best_w[0] = vecW[0]; + forwardRunInfo->best_k[0] = vecK[0]; + forwardRunInfo->best_c[0] = vecC[0]; + forwardRunInfo->algorithm = fcAlgorithms[0]; + return SUCCESS; + } + + CHECK_STATUS(gcl_clean_kernelVec(handle)); + CHECK_STATUS(gcl_enable_queue_profiling(handle)); + U32 sliceNum = outputDescs.size(); + GCLMem_t input = gcl_create_gclmem(); + GCLMem_t tmpbuf = gcl_create_gclmem(); + std::vector filter; + std::vector bias; + std::vector output; + for (U32 i = 0; i < sliceNum; ++i) { + GCLMem_t filterTmp = gcl_create_gclmem(); + GCLMem_t biasTmp = gcl_create_gclmem(); + GCLMem_t outTmp = gcl_create_gclmem(); + filter.push_back(filterTmp); + bias.push_back(biasTmp); + output.push_back(outTmp); + } + + std::vector runInfos; + U32 stride[3] = {0, 0, 0}; + U32 offset[3] = {0, 0, 0}; + GCLMemDesc inputMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + GCLMemDesc outputMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + CHECK_STATUS(fully_connected_infer_output_size_mali( + inputDesc, filterDesc, NULL, &inputMemDesc, &outputMemDesc)); + std::vector filterMemDescs; + U32 maxBytes = 0; + U32 maxFilterSize = 0; + for (U32 i = 0; i < algoNumIndex.size(); i++) { + U32 bytes = 0; + ForwardRunInfoMali runInfo; + runInfo.algorithm = fcAlgorithms[i]; + U32 be = (i == 0) ? 
0 : algoNumIndex[i - 1]; + U32 end = algoNumIndex[i]; + for (U32 j = be; j < end; j++) { + GCLMemDesc filterMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + runInfo.best_w[0] = vecW[j]; + runInfo.best_c[0] = vecC[j]; + runInfo.best_k[0] = vecK[j]; + if (fully_connected_transform_filter_bytes_mali( + filterDesc, &filterMemDesc, &bytes, &runInfo) != SUCCESS) { + continue; + } + maxBytes = (maxBytes < bytes) ? bytes : maxBytes; + if (fully_connected_infer_forward_tmp_bytes_mali( + inputDesc, filterDesc, &bytes, &runInfo) != SUCCESS) { + continue; + } + maxBytes = (maxBytes < bytes) ? bytes : maxBytes; + maxFilterSize = (maxFilterSize < filterMemDesc.byteSize) ? filterMemDesc.byteSize + : maxFilterSize; + filterMemDescs.push_back(filterMemDesc); + runInfos.push_back(runInfo); + } + } + + MemFlags flags = CL_MEM_READ_WRITE; + if (inputDesc.df == DF_MKT) { + U32 stride[3] = {(fn + 3) / 4, 1, 1}; + U32 offset[3] = {0, 0, 0}; + CHECK_STATUS(gclmem_set_desc_padding( + &bias[0]->desc, stride, offset, dt, DF_NHWC, GCL_MEM_IMG_1D, flags)); + } else { + U32 stride[3] = {fn, 1, 1}; + U32 offset[3] = {0, 0, 0}; + CHECK_STATUS(gclmem_set_desc_padding( + &bias[0]->desc, stride, offset, dt, DF_NHWC, GCL_MEM_BUF, flags)); + } + + U32 algosNum = runInfos.size(); + if (algosNum == 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + TensorDesc biasDesc = tensor1d(dt, fn); + filterMemDescs[0].byteSize = maxFilterSize; + input->desc = inputMemDesc; + output[0]->desc = outputMemDesc; + filter[0]->desc = filterMemDescs[0]; + tmpbuf->desc.byteSize = maxBytes; + gcl_create_memory(handle, input); + for (U32 i = 0; i < sliceNum; ++i) { + filter[i]->desc = filter[0]->desc; + bias[i]->desc = bias[0]->desc; + output[i]->desc = output[0]->desc; + gcl_create_memory(handle, filter[i]); + gcl_create_memory(handle, bias[i]); + gcl_create_memory(handle, output[i]); + } + if (maxBytes) { + gcl_create_memory(handle, tmpbuf); + } + + U32 runKernelBe = 0; + U32 runKernelEnd = 0; + double minTime = DBL_MAX; + ForwardRunInfoMali bestRunInfo; + for (U32 i = 0; i < algosNum; i++) { + filter[0]->desc = filterMemDescs[i]; + if (sliceNum > 1) { + U32 item_k = runInfos[i].best_k[0]; + for (U32 j = 0; j < sliceNum; j++) { + U32 fn = outputDescs[j].dims[1]; + output[j]->desc.stride[2] = (fn + 3) / 4; + filter[j]->desc.stride[2] = (fn + item_k - 1) / item_k; + bias[j]->desc.stride[0] = (inputDesc.df == DF_MKT) ? 
(fn + 3) / 4 : fn; + } + } + if (fully_connected_mali(handle, inputDesc, input, filterDesc, &filter, biasDesc, &bias, + maxBytes, tmpbuf, outputDescs[0], &output, &runInfos[i]) == SUCCESS) { + runKernelEnd = handle->kernelVec->size(); + gcl_run_kernelVec_timing(handle, runKernelBe, runKernelEnd); + runKernelBe = runKernelEnd; + if (minTime > handle->t_execute) { + minTime = handle->t_execute; + bestRunInfo = runInfos[i]; + } + } + } + if (minTime == DBL_MAX) { + CHECK_STATUS(NOT_SUPPORTED); + } + *forwardRunInfo = bestRunInfo; + CHECK_STATUS(gcl_finish(handle)); + gcl_destroy_gclmem(input); + gcl_destroy_gclmem(tmpbuf); + for (auto p : filter) { + gcl_destroy_gclmem(p); + } + for (auto p : output) { + gcl_destroy_gclmem(p); + } + for (auto p : bias) { + gcl_destroy_gclmem(p); + } + runInfos.clear(); + filterMemDescs.clear(); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + CHECK_STATUS(gcl_off_queue_profiling(handle)); + return SUCCESS; +} +EE fully_connected_transform_filter_bytes_mali(TensorDesc filterDesc, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { + case DT_F16: { + ret = fully_connected_transform_filter_bytes_mali_fp16( + filterDesc, gclmemFilterDesc, bytes, forwardRunInfo); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE fully_connected_transform_filter_mali(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + TensorDesc *fltmemDesc, + std::vector fltmem, + ForwardRunInfoMali_t forwardRunInfo) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { + case DT_F16: { + ret = fully_connected_transform_filter_mali_fp16( + handle, filterDesc, filter, fltmemDesc, fltmem, forwardRunInfo); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE fully_connected_infer_forward_tmp_bytes_mali( + TensorDesc inputDesc, TensorDesc filterDesc, U32 *bytes, ForwardRunInfoMali_t forwardRunInfo) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = fully_connected_infer_forward_tmp_bytes_mali_fp16( + inputDesc, filterDesc, bytes, forwardRunInfo); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE fully_connected_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + std::vector *filter, + TensorDesc biasDesc, + std::vector *bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + std::vector *output, + ForwardRunInfoMali_t forwardRunInfo) +{ + EE ret = SUCCESS; + ret = fully_connected_checkpara_mali( + handle, inputDesc, input, filterDesc, filter, bias, outputDesc, output); + switch (inputDesc.dt) { + case DT_F16: { + ret = fully_connected_mali_fp16(handle, inputDesc, input, filterDesc, *filter, biasDesc, + *bias, tmpBytes, tmpBuf, outputDesc, *output, forwardRunInfo); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/matmul.cpp b/compute/tensor/src/gpu/mali/matmul.cpp new file mode 100644 index 00000000..752fcb54 --- /dev/null +++ b/compute/tensor/src/gpu/mali/matmul.cpp @@ -0,0 +1,476 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
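The four fully_connected wrappers above share one dispatch shape: check parameters, then switch on TensorDesc.dt, routing DT_F16 to the fp16 implementation and returning NOT_SUPPORTED for everything else (DT_I8 has a case label but no Mali implementation in this release). A condensed, self-contained illustration of the pattern (the enum values and stand-in function are simplifications, not the actual Bolt signatures):

    #include <cstdio>

    enum EE { SUCCESS = 0, NOT_SUPPORTED = -1 };
    enum DataType { DT_F16, DT_I8, DT_I32 };

    static EE fc_fp16_impl() { return SUCCESS; }  // stand-in for the fp16 kernel path

    EE fully_connected_dispatch(DataType dt)
    {
        switch (dt) {
            case DT_F16:
                return fc_fp16_impl();
            case DT_I8:  // declared but not implemented on Mali in this release
            default:
                return NOT_SUPPORTED;
        }
    }

    int main()
    {
        printf("%d\n", fully_connected_dispatch(DT_F16));  // 0 (SUCCESS)
        printf("%d\n", fully_connected_dispatch(DT_I8));   // -1 (NOT_SUPPORTED)
    }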
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/matmul_mali_fp16.h" +inline void matmul_produce_algos_paras(bool transposeA, + TensorDesc matrixADesc, + bool transposeB, + TensorDesc matrixBDesc, + std::vector *matmulAlgorithms, + std::vector *vecW, + std::vector *vecC, + std::vector *vecK) +{ + U32 configInfo[3][192]; + U32 configNum = 0; + if (matmulAlgorithms) { + (*matmulAlgorithms).push_back(CONVOLUTION_ALGORITHM_GEMM); + } + if (transposeA && !transposeB) { //TN + for (U32 i = 1; i <= 8; ++i) { + for (U32 j = 1; j <= 8; ++j) { + if (i * j <= 2) { + continue; + } + configInfo[0][configNum] = j; // w + configInfo[1][configNum] = 1; // c + configInfo[2][configNum] = i; // k + configNum++; + } + } + } else if (!transposeA && transposeB) { + for (U32 ii = 1; ii <= 2; ++ii) { + for (U32 i = 1; i <= 8; ++i) { + for (U32 j = 1; j <= 8; ++j) { + if (i * j <= 2) { + continue; + } + if (i == 5 && j > 6 && ii == 2) { + continue; + } + if (i == 6) { + if ((j > 7 && ii == 1) || (j > 5 && ii == 2)) { + continue; + } + } + if (i == 7) { + if ((j > 6 && ii == 1) || (j > 4 && ii == 2)) { + continue; + } + } + if (i == 8) { + if ((j > 5 && ii == 1) || (j > 4 && ii == 2)) { + continue; + } + } + configInfo[0][configNum] = j; // w + configInfo[1][configNum] = 2 * ii; // c + configInfo[2][configNum] = i; // k + configNum++; + } + } + } + } else if (transposeA && transposeB) { + for (U32 ii = 1; ii <= 2; ++ii) { + for (U32 i = 1; i <= 8; ++i) { + if (i <= 2) { + continue; + } + configInfo[0][configNum] = i; // w + configInfo[1][configNum] = 2 * ii; // c + configInfo[2][configNum] = 1; // k + configNum++; + } + } + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + + for (U32 i = 0; i < configNum; i++) { + if (vecW) { + (*vecW).push_back(configInfo[0][i]); + } + if (vecC) { + (*vecC).push_back(configInfo[1][i]); + } + if (vecK) { + (*vecK).push_back(configInfo[2][i]); + } + } +} + +inline EE matmul_checkpara_mali(GCLHandle_t handle, + TensorDesc matrixADesc, + bool transposeA, + const GCLMem_t matrixA, + TensorDesc matrixBDesc, + bool transposeB, + const GCLMem_t matrixB, + TensorDesc matrixCDesc, + GCLMem_t matrixC) +{ + if (nullptr == handle || nullptr == matrixA || nullptr == matrixB || nullptr == matrixC) { + return NULL_POINTER; + } + if ((transposeA && !transposeB) || (!transposeA && transposeB)) { + if (matrixADesc.df != matrixBDesc.df || matrixADesc.df != 
matrixCDesc.df || + matrixADesc.df != DF_NCHW) { + return NOT_SUPPORTED; + } + if (matrixA->desc.memFormat != DF_NCHW || matrixB->desc.memFormat != DF_NCHW || + matrixC->desc.memFormat != DF_NCHW) { + return NOT_SUPPORTED; + } + } + if (!transposeA && !transposeB) { + return NOT_SUPPORTED; + } + if (matrixA->desc.stride[2] != matrixB->desc.stride[2]) { + return NOT_MATCH; + } + if (matrixA->desc.offset[0] != 0 || matrixA->desc.offset[1] != 0) { + return NOT_SUPPORTED; + } + if (matrixB->desc.offset[0] != 0 || matrixB->desc.offset[1] != 0) { + return NOT_SUPPORTED; + } + if (matrixC->desc.offset[0] != 0 || matrixC->desc.offset[1] != 0) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +EE matmul_infer_output_size_mali(TensorDesc matrixADesc, + bool transposeA, + TensorDesc matrixBDesc, + bool transposeB, + TensorDesc *matrixCDesc, + GCLMemDesc_t gclmemMatrixADesc, + GCLMemDesc_t gclmemMatrixBDesc, + GCLMemDesc_t gclmemMatrixCDesc) +{ + U32 adims = matrixADesc.nDims; + U32 bdims = matrixBDesc.nDims; + DataType adt = matrixADesc.dt; + DataType bdt = matrixBDesc.dt; + if (adims < 2 || bdims < 2) { + CHECK_STATUS(NOT_MATCH); + } + if (adt != bdt) { + CHECK_STATUS(NOT_MATCH); + } + U32 ac = (adims > 2) ? matrixADesc.dims[2] : 1; + U32 ah = matrixADesc.dims[1]; + U32 aw = matrixADesc.dims[0]; + U32 bc = (bdims > 2) ? matrixBDesc.dims[2] : 1; + U32 bh = matrixBDesc.dims[1]; + U32 bw = matrixBDesc.dims[0]; + bool need_pad_a = false; + bool need_pad_b = false; + if (ac != bc) { + CHECK_STATUS(NOT_SUPPORTED); + } + std::vector vecW; + std::vector vecC; + std::vector vecK; + matmul_produce_algos_paras( + transposeA, matrixADesc, transposeB, matrixBDesc, NULL, &vecW, NULL, &vecK); + + if (transposeA && !transposeB) { + /*TN*/ + if (ah != bh) { + CHECK_STATUS(NOT_SUPPORTED); + } + if (matrixCDesc) { + *matrixCDesc = matrixADesc; + (*matrixCDesc).dims[0] = bw; + (*matrixCDesc).dims[1] = aw; + } + U32 aw_align = aw; + U32 bw_align = bw; + for (auto item_k : vecK) { + U32 i = ALIGN(aw, item_k); + aw_align = (aw_align < i) ? i : aw_align; + } + for (auto item_w : vecW) { + U32 i = ALIGN(bw, item_w); + bw_align = (bw_align < i) ? i : bw_align; + } + if (aw_align != aw) { + need_pad_a = true; + } + if (bw_align != bw) { + need_pad_b = true; + } + CHECK_STATUS(infer_gclmem_desc_nchw(aw_align, ah, ac, 0, 0, bw_align, aw_align, ac, adt, + adt, gclmemMatrixADesc, gclmemMatrixCDesc, need_pad_a)); + CHECK_STATUS(infer_gclmem_desc_nchw( + bw_align, bh, bc, 0, 0, 0, 0, 0, adt, adt, gclmemMatrixBDesc, NULL, need_pad_b)); + return SUCCESS; + } + if (!transposeA && transposeB) { + /*NT*/ + if (aw != bw) { + CHECK_STATUS(NOT_SUPPORTED); + } + if (matrixCDesc) { + *matrixCDesc = matrixADesc; + (*matrixCDesc).dims[0] = bh; + (*matrixCDesc).dims[1] = ah; + } + U32 ah_align = ah; + U32 bh_align = bh; + U32 aw_align = aw; + for (auto item_k : vecK) { + U32 i = ALIGN(ah, item_k); + ah_align = (ah_align < i) ? i : ah_align; + } + for (auto item_w : vecW) { + U32 i = ALIGN(bh, item_w); + bh_align = (bh_align < i) ? i : bh_align; + } + for (auto item_c : vecC) { + U32 i = ALIGN(aw, item_c); + aw_align = (aw_align < i) ? 
i : aw_align;
+        }
+        if (aw_align != aw || ah_align != ah) {
+            need_pad_a = true;
+        }
+        if (aw_align != aw || bh_align != bh) {
+            need_pad_b = true;
+        }
+        CHECK_STATUS(infer_gclmem_desc_nchw(aw_align, ah_align, ac, 0, 0, bh_align, ah_align, ac,
+            adt, adt, gclmemMatrixADesc, gclmemMatrixCDesc, need_pad_a));
+        CHECK_STATUS(infer_gclmem_desc_nchw(
+            aw_align, bh_align, bc, 0, 0, 0, 0, 0, adt, adt, gclmemMatrixBDesc, NULL, need_pad_b));
+        return SUCCESS;
+    }
+
+    if (transposeA && transposeB) {
+        /*TT*/
+        if (ah != bw) {
+            CHECK_STATUS(NOT_SUPPORTED);
+        }
+        if (matrixCDesc) {
+            *matrixCDesc = matrixADesc;
+            (*matrixCDesc).dims[0] = bh;
+            (*matrixCDesc).dims[1] = aw;
+        }
+        U32 aw_align = aw;
+        U32 ah_align = ah;
+        U32 bh_align = bh;
+        for (auto item_k : vecK) {
+            U32 i = ALIGN(aw, item_k);
+            aw_align = (aw_align < i) ? i : aw_align;
+        }
+        for (auto item_c : vecC) {
+            U32 i = ALIGN(ah, item_c);
+            ah_align = (ah_align < i) ? i : ah_align;
+        }
+        for (auto item_w : vecW) {
+            U32 i = ALIGN(bh, item_w);
+            bh_align = (bh_align < i) ? i : bh_align;
+        }
+        if (aw_align != aw || ah_align != ah) {
+            need_pad_a = true;
+        }
+        if (ah_align != ah || bh_align != bh) {
+            need_pad_b = true;
+        }
+        if (matrixADesc.df == DF_MKT) {
+            U32 m, k, t;
+            U32 gw, gh, gc;
+            get_nlp_mkt_val(matrixADesc, NULL, &m, &k, &t);
+            if (t != 1) {
+                CHECK_STATUS(NOT_SUPPORTED);
+            }
+            if (ah != k) {
+                CHECK_STATUS(NOT_MATCH);
+            }
+            if (aw != t) {
+                CHECK_STATUS(NOT_MATCH);
+                t = aw_align;
+            }
+            k = ah_align;
+            map_nlp_mkt_to_ncwhc4(m, k, t, &gw, &gh, &gc);
+            CHECK_STATUS(infer_gclmem_desc_ncwhc4(
+                gw, gh, gc * 4, 0, 0, 0, 0, 0, adt, adt, gclmemMatrixADesc, NULL, need_pad_a));
+        } else {
+            CHECK_STATUS(NOT_SUPPORTED);
+        }
+        CHECK_STATUS(infer_gclmem_desc_nchw(
+            ah_align, bh_align, bc, 0, 0, 0, 0, 0, adt, adt, gclmemMatrixBDesc, NULL, need_pad_b));
+        CHECK_STATUS(infer_gclmem_desc_nchw(
+            0, 0, 0, 0, 0, bh_align, aw_align, ac, adt, adt, NULL, gclmemMatrixCDesc));
+        return SUCCESS;
+    }
+    return NOT_SUPPORTED;
+}
+
+EE matmul_infer_forward_algorithm_mali(GCLHandle_t handle,
+    TensorDesc matrixADesc,
+    bool transposeA,
+    TensorDesc matrixBDesc,
+    bool transposeB,
+    TensorDesc matrixCDesc,
+    ForwardRunInfoMali_t forwardRunInfo)
+{
+    if (forwardRunInfo == nullptr) {
+        CHECK_STATUS(NULL_POINTER);
+    }
+    ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm);
+    if (algorithm != CONVOLUTION_ALGORITHM_NULL) {
+        return SUCCESS;
+    }
+    std::vector<ConvolutionForwardAlgorithm> matmulAlgorithms;
+    std::vector<U32> vecW;
+    std::vector<U32> vecC;
+    std::vector<U32> vecK;
+    matmul_produce_algos_paras(
+        transposeA, matrixADesc, transposeB, matrixBDesc, &matmulAlgorithms, &vecW, &vecC, &vecK);
+    if (vecW.size() == 1) {
+        forwardRunInfo->best_w[0] = vecW[0];
+        forwardRunInfo->best_k[0] = vecK[0];
+        forwardRunInfo->best_c[0] = vecC[0];
+        forwardRunInfo->algorithm = matmulAlgorithms[0];
+        return SUCCESS;
+    }
+
+    CHECK_STATUS(gcl_clean_kernelVec(handle));
+    CHECK_STATUS(gcl_enable_queue_profiling(handle));
+    GCLMem_t matrixA = gcl_create_gclmem();
+    GCLMem_t matrixB = gcl_create_gclmem();
+    GCLMem_t matrixC = gcl_create_gclmem();
+    GCLMem_t tmpbuf = gcl_create_gclmem();
+    std::vector<ForwardRunInfoMali> runInfos;
+    U32 stride[3] = {0, 0, 0};
+    U32 offset[3] = {0, 0, 0};
+    GCLMemDesc matrixAMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+    GCLMemDesc matrixBMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+    GCLMemDesc matrixCMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+    U32 bytes;
+    U32 maxBytes = 0;
+    ForwardRunInfoMali runInfo;
+    runInfo.algorithm =
matmulAlgorithms[0]; + CHECK_STATUS(matmul_infer_output_size_mali(matrixADesc, transposeA, matrixBDesc, transposeB, + NULL, &matrixAMemDesc, &matrixBMemDesc, &matrixCMemDesc)); + + for (U32 i = 0; i < vecW.size(); i++) { + runInfo.best_w[0] = vecW[i]; + runInfo.best_c[0] = vecC[i]; + runInfo.best_k[0] = vecK[i]; + if (matmul_infer_forward_tmp_bytes_mali( + matrixADesc, transposeA, matrixBDesc, transposeB, &bytes, &runInfo) != SUCCESS) { + continue; + } + maxBytes = (maxBytes < bytes) ? bytes : maxBytes; + runInfos.push_back(runInfo); + } + + U32 algosNum = runInfos.size(); + if (algosNum == 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + matrixA->desc = matrixAMemDesc; + matrixB->desc = matrixBMemDesc; + matrixC->desc = matrixCMemDesc; + tmpbuf->desc.byteSize = maxBytes; + gcl_create_memory(handle, matrixA); + gcl_create_memory(handle, matrixB); + gcl_create_memory(handle, matrixC); + if (maxBytes) { + gcl_create_memory(handle, tmpbuf); + } + + U32 runKernelBe = 0; + U32 runKernelEnd = 0; + double minTime = DBL_MAX; + ForwardRunInfoMali bestRunInfo; + for (U32 i = 0; i < algosNum; i++) { + if (matmul_mali(handle, matrixADesc, transposeA, matrixA, matrixBDesc, transposeB, matrixB, + tmpbuf, matrixCDesc, matrixC, &runInfos[i]) == SUCCESS) { + runKernelEnd = handle->kernelVec->size(); + gcl_run_kernelVec_timing(handle, runKernelBe, runKernelEnd); + runKernelBe = runKernelEnd; + if (minTime > handle->t_execute) { + minTime = handle->t_execute; + bestRunInfo = runInfos[i]; + } + } + } + if (minTime == DBL_MAX) { + CHECK_STATUS(NOT_SUPPORTED); + } + *forwardRunInfo = bestRunInfo; + CHECK_STATUS(gcl_finish(handle)); + gcl_destroy_gclmem(matrixA); + gcl_destroy_gclmem(matrixB); + gcl_destroy_gclmem(matrixC); + gcl_destroy_gclmem(tmpbuf); + runInfos.clear(); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + CHECK_STATUS(gcl_off_queue_profiling(handle)); + return SUCCESS; +} + +EE matmul_infer_forward_tmp_bytes_mali(TensorDesc matrixADesc, + bool transposeA, + TensorDesc matrixBDesc, + bool transposeB, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo) +{ + EE ret = SUCCESS; + switch (matrixADesc.dt) { + case DT_F16: { + ret = matmul_infer_forward_tmp_bytes_mali_fp16( + matrixADesc, transposeA, matrixBDesc, transposeB, bytes, forwardRunInfo); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE matmul_mali(GCLHandle_t handle, + TensorDesc matrixADesc, + bool transposeA, + const GCLMem_t matrixA, + TensorDesc matrixBDesc, + bool transposeB, + const GCLMem_t matrixB, + GCLMem_t tmp, + TensorDesc matrixCDesc, + GCLMem_t matrixC, + ForwardRunInfoMali_t forwardRunInfo) +{ + EE ret = SUCCESS; + ret = matmul_checkpara_mali(handle, matrixADesc, transposeA, matrixA, matrixBDesc, transposeB, + matrixB, matrixCDesc, matrixC); + switch (matrixADesc.dt) { + case DT_F16: { + ret = matmul_mali_fp16(handle, matrixADesc, transposeA, matrixA, matrixBDesc, + transposeB, matrixB, tmp, matrixCDesc, matrixC, forwardRunInfo); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/multihead_attention.cpp b/compute/tensor/src/gpu/mali/multihead_attention.cpp new file mode 100644 index 00000000..8f7203ef --- /dev/null +++ b/compute/tensor/src/gpu/mali/multihead_attention.cpp @@ -0,0 +1,722 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
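matmul_infer_forward_algorithm_mali above implements the same auto-tuning scheme used throughout these GPU operators: enumerate candidate (w, c, k) tile configs, run each with queue profiling enabled, and keep the fastest. A generic sketch of that selection loop, where `timeConfig` is a hypothetical stand-in for the enqueue plus gcl_run_kernelVec_timing step:

    #include <cfloat>
    #include <cstdint>
    #include <functional>
    #include <vector>

    using U32 = uint32_t;

    struct TileConfig { U32 w, c, k; };

    // Run every candidate, time it, keep the fastest. A negative time from
    // timeConfig means the config could not be run (skipped, as in the source).
    bool select_best(const std::vector<TileConfig> &candidates,
        const std::function<double(const TileConfig &)> &timeConfig, TileConfig *best)
    {
        double minTime = DBL_MAX;
        for (const TileConfig &c : candidates) {
            double t = timeConfig(c);
            if (t >= 0 && t < minTime) {
                minTime = t;
                *best = c;
            }
        }
        return minTime != DBL_MAX;  // false: no candidate ran; caller reports NOT_SUPPORTED
    }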
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/multihead_attention_mali_fp16.h" +#include "tensor_computing_type.h" + +inline bool find_vector(std::vector vec, U32 val) +{ + bool find = false; + for (auto p : vec) { + if (p == val) { + find = true; + break; + } + } + return find; +} + +EE multihead_attention_checkpara_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + std::vector filterDesc, + std::vector filter, + std::vector bias, + std::vector layerNormAlpha, + std::vector layerNormBeta, + void *multiplyAlpha, + void *multiplyBeta, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + std::vector eltwiseWithLayerNormIn, + ActivationMode activation, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (nullptr == handle || nullptr == input || nullptr == output) { + return NULL_POINTER; + } + U32 filterNum = filterDesc.size(); + U32 lnNum = layerNormAlpha.size(); + if (filterNum != filter.size() || filterNum != bias.size()) { + return NOT_MATCH; + } + if (lnNum != layerNormBeta.size()) { + return NOT_MATCH; + } + if (filterNum != 4 || lnNum != 2) { + return NOT_SUPPORTED; + } + for (U32 i = 0; i < filterNum; ++i) { + if (nullptr == filter[i] || nullptr == bias[i]) { + return NULL_POINTER; + } + } + for (U32 i = 0; i < lnNum; ++i) { + if (nullptr == layerNormAlpha[i] || nullptr == layerNormBeta[i]) { + return NULL_POINTER; + } + } + + if (inputDesc.df == DF_MKT || inputDesc.df == DF_MTK) { + U32 m, k; + U32 fw, fh, fc, fn; + get_nlp_mkt_val(inputDesc, NULL, &m, &k, NULL); + if (firstFCSliceNum[0] != firstFCSliceNum[1] || firstFCSliceNum[0] != firstFCSliceNum[2]) { + return NOT_SUPPORTED; + } + if (firstFCSliceNum[0] % matmulSliceLen != 0) { + return NOT_MATCH; + } + if (m != 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + CHECK_STATUS(tensorSelectGet(filterDesc[0], NULL, NULL, &fn, &fc, &fh, &fw)); + if (fh != 1 || fw != 1) { + return NOT_MATCH; + } + if (k != fc) { + return NOT_MATCH; + } + } + return SUCCESS; +} + +EE multihead_attention_infer_output_size_mali(TensorDesc inputDesc, + std::vector filterDesc, + TensorDesc *outputDesc, + U32 *firstFCSliceNum, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc, + ForwardRunInfoMali_t forwardRunInfo) +{ + if (inputDesc.df == DF_MTK || inputDesc.df == DF_MKT) { + DataType dt; + U32 m, k, t; + get_nlp_mkt_val(inputDesc, &dt, &m, &k, &t); + U32 filterNum = 
filterDesc.size(); + U32 fn; + tensorSelectGet(filterDesc[filterNum - 1], NULL, NULL, &fn, NULL, NULL, NULL); + if (filterNum == 1) { + fn = firstFCSliceNum[2]; + } + if (outputDesc) { + *outputDesc = inputDesc; + (*outputDesc).dims[1] = fn; + } + U32 igw, igh, igc; + U32 ogw, ogh, ogc; + map_nlp_mkt_to_ncwhc4(m, k, t, &igw, &igh, &igc); + map_nlp_mkt_to_ncwhc4(m, fn, t, &ogw, &ogh, &ogc); + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + igw, igh, igc * 4, 0, 0, ogw, ogh, ogc * 4, dt, dt, gclmemInputDesc, gclmemOutputDesc)); + return SUCCESS; + } + return NOT_SUPPORTED; +} + +EE multihead_attention_infer_forward_algorithm_mali(GCLHandle_t handle, + TensorDesc inputDesc, + std::vector filterDesc, + void *multiplyAlpha, + void *multiplyBeta, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + std::vector eltwiseWithLayerNormIn, + ActivationMode activation, + TensorDesc outputDesc, + ForwardRunInfoMali_t forwardRunInfo) +{ + if (forwardRunInfo == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + if (algorithm != CONVOLUTION_ALGORITHM_NULL) { + return SUCCESS; + } + + CHECK_STATUS(gcl_clean_kernelVec(handle)); + CHECK_STATUS(gcl_enable_queue_profiling(handle)); + GCLMem_t input = gcl_create_gclmem(); + GCLMem_t tmpbuf = gcl_create_gclmem(); + GCLMem_t output = gcl_create_gclmem(); + std::vector filter; + std::vector bias; + std::vector layerNormAlpha; + std::vector layerNormBeta; + U32 fn[4]; + + for (U32 i = 0; i < filterDesc.size(); ++i) { + tensorSelectGet(filterDesc[i], NULL, NULL, &fn[i], NULL, NULL, NULL); + GCLMem_t filterTmp = gcl_create_gclmem(); + GCLMem_t biasTmp = gcl_create_gclmem(); + filter.push_back((void *)filterTmp); + bias.push_back((void *)biasTmp); + } + + for (U32 i = 0; i < 2; ++i) { + GCLMem_t alphaTmp = gcl_create_gclmem(); + GCLMem_t betaTmp = gcl_create_gclmem(); + layerNormAlpha.push_back((void *)alphaTmp); + layerNormBeta.push_back((void *)betaTmp); + } + + std::vector runInfos; + ForwardRunInfoMali runInfo; + runInfo.algorithm = (I32)CONVOLUTION_ALGORITHM_GEMM; + std::vector inputMemDescs; + std::vector outputMemDescs; + std::vector filterMemDescs0; + std::vector filterMemDescs1; + std::vector filterMemDescs2; + std::vector filterMemDescs3; + std::vector> filterMemDescs; + /*0: fc0 + * 1: tn + * 2: nt + * 3: fc1 + * 4: fc2 + * 5: fc3*/ + U32 configInfos[6][3][64]; + U32 configNum_fc0 = 0; + U32 configNum_fc1 = 0; + U32 configNum_fc2 = 0; + U32 configNum_fc3 = 0; + U32 configNum_tn = 0; + U32 configNum_nt = 0; + U32 bytes; + U32 maxBytes = 0; + U32 maxInputSize = 0; + U32 maxOutputSize = 0; + U32 maxFilterSize[4] = {0, 0, 0, 0}; + U32 stride[3] = {0, 0, 0}; + U32 offset[3] = {0, 0, 0}; + + for (U32 i = 1; i <= 8; ++i) { + for (U32 j = 1; j <= 8; ++j) { + if (i * j <= 2) { + continue; + } + configInfos[0][0][configNum_fc0] = j; + configInfos[0][1][configNum_fc0] = 1; + configInfos[0][2][configNum_fc0] = i; + configInfos[1][0][configNum_tn] = j; + configInfos[1][1][configNum_tn] = 1; + configInfos[1][2][configNum_tn] = i; + configNum_fc0++; + configNum_tn++; + } + } + + for (U32 i = 4; i <= 8; i += 4) { + for (U32 j = 1; j <= 8; ++j) { + configInfos[3][0][configNum_fc1] = j; + configInfos[3][1][configNum_fc1] = 1; + configInfos[3][2][configNum_fc1] = i; + configNum_fc1++; + } + } + + for (U32 j = 1; j <= 8; j++) { + configInfos[4][0][configNum_fc2] = j; + configInfos[4][1][configNum_fc2] = 4; + configInfos[4][2][configNum_fc2] = 4; + configInfos[5][0][configNum_fc3] = j; + 
configInfos[5][1][configNum_fc3] = 4; + configInfos[5][2][configNum_fc3] = 4; + configNum_fc2++; + configNum_fc3++; + } + + if (fn[2] % 8 == 0) { + for (U32 j = 1; j <= 4; j++) { + configInfos[4][0][configNum_fc2] = j; + configInfos[4][1][configNum_fc2] = 4; + configInfos[4][2][configNum_fc2] = 8; + configNum_fc2++; + } + } + + if (fn[3] % 8 == 0) { + for (U32 j = 1; j <= 4; j++) { + configInfos[5][0][configNum_fc3] = j; + configInfos[5][1][configNum_fc3] = 4; + configInfos[5][2][configNum_fc3] = 8; + configNum_fc3++; + } + } + + for (U32 i = 1; i <= 8; ++i) { + for (U32 j = 1; j <= 8; ++j) { + if (i * j <= 2) { + continue; + } + if (i == 6 && j > 7) { + continue; + } + if (i == 7 && j > 6) { + continue; + } + if (i == 8 && j > 5) { + continue; + } + if (matmulSliceLen % i != 0) { + continue; + } + configInfos[2][0][configNum_nt] = j; // w + configInfos[2][1][configNum_nt] = 2; // c + configInfos[2][2][configNum_nt] = i; // k + configNum_nt++; + } + } + + for (U32 i = 1; i <= 8; ++i) { + for (U32 j = 1; j <= 8; ++j) { + if (i * j <= 2) { + continue; + } + if (i == 5 && j > 6) { + continue; + } + if (i == 6 && j > 5) { + continue; + } + if (i == 7 && j > 4) { + continue; + } + if (i == 8 && j > 3) { + continue; + } + if (matmulSliceLen % i != 0) { + continue; + } + configInfos[2][0][configNum_nt] = j; // w + configInfos[2][1][configNum_nt] = 4; // c + configInfos[2][2][configNum_nt] = i; // k + configNum_nt++; + } + } + std::vector configNums; + configNums.push_back(configNum_fc0); + configNums.push_back(configNum_tn); + configNums.push_back(configNum_nt); + configNums.push_back(configNum_fc1); + configNums.push_back(configNum_fc2); + configNums.push_back(configNum_fc3); + + DataType dt; + U32 t, k; + get_nlp_mkt_val(inputDesc, &dt, NULL, &k, &t); + std::vector biasDesc; + for (U32 i = 0; i < 2; ++i) { + GCLMemDesc tmpDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + U32 biasNum = fn[i] + 8; + tmpDesc.stride[0] = biasNum; + tmpDesc.stride[1] = 1; + tmpDesc.stride[2] = 1; + tmpDesc.offset[0] = 0; + tmpDesc.offset[1] = 0; + tmpDesc.offset[2] = 0; + tmpDesc.num = biasNum; + tmpDesc.byteSize = biasNum * bytesOf(dt); + tmpDesc.flags = CL_MEM_READ_WRITE; + tmpDesc.memFormat = DF_NHWC; + tmpDesc.memType = GCL_MEM_BUF; + TensorDesc biasDescTmp = tensor1d(dt, fn[i]); + biasDesc.push_back(biasDescTmp); + ((GCLMem_t)bias[i])->desc = tmpDesc; + gcl_create_memory(handle, (GCLMem_t)bias[i]); + } + + for (U32 i = 2; i < filterDesc.size(); ++i) { + GCLMemDesc tmpDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + U32 biasNum = (fn[i] + 3) / 4; + tmpDesc.stride[0] = biasNum; + tmpDesc.stride[1] = 1; + tmpDesc.stride[2] = 1; + tmpDesc.offset[0] = 0; + tmpDesc.offset[1] = 0; + tmpDesc.offset[2] = 0; + tmpDesc.num = biasNum; + tmpDesc.byteSize = biasNum * 4 * bytesOf(dt); + tmpDesc.flags = CL_MEM_READ_WRITE; + tmpDesc.memFormat = DF_NHWC; + tmpDesc.memType = GCL_MEM_IMG_1D; + TensorDesc biasDescTmp = tensor1d(dt, fn[i]); + biasDesc.push_back(biasDescTmp); + ((GCLMem_t)bias[i])->desc = tmpDesc; + gcl_create_memory(handle, (GCLMem_t)bias[i]); + } + + for (U32 i = 0; i < 2; ++i) { + U32 layerNormNum = ALIGN(k, 4); + if (i == 1) { + tensorSelectGet(filterDesc[1], NULL, NULL, &layerNormNum, NULL, NULL, NULL); + layerNormNum = ALIGN(layerNormNum, 4); + } + GCLMemDesc tmpDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + tmpDesc.stride[0] = layerNormNum; + tmpDesc.stride[1] = 1; + tmpDesc.stride[2] = 1; + tmpDesc.offset[0] = 0; + tmpDesc.offset[1] = 0; + tmpDesc.offset[2] = 0; + tmpDesc.num = 
layerNormNum;
+        tmpDesc.byteSize = layerNormNum * bytesOf(dt);
+        tmpDesc.flags = CL_MEM_READ_WRITE;
+        tmpDesc.memFormat = DF_NHWC;
+        tmpDesc.memType = GCL_MEM_BUF;
+        ((GCLMem_t)layerNormAlpha[i])->desc = tmpDesc;
+        ((GCLMem_t)layerNormBeta[i])->desc = tmpDesc;
+        gcl_create_memory(handle, (GCLMem_t)layerNormAlpha[i]);
+        gcl_create_memory(handle, (GCLMem_t)layerNormBeta[i]);
+    }
+
+    U32 runKernelBe = 0;
+    U32 runKernelEnd = 0;
+    ForwardRunInfoMali bestRunInfo;
+    bestRunInfo.algorithm = (I32)CONVOLUTION_ALGORITHM_GEMM;
+    for (U32 i = 0; i < configNums.size(); ++i) {
+        bestRunInfo.best_w[i] = configInfos[i][0][0];
+        bestRunInfo.best_c[i] = configInfos[i][1][0];
+        bestRunInfo.best_k[i] = configInfos[i][2][0];
+    }
+    GCLMemDesc inputMemDesc;
+    GCLMemDesc outputMemDesc;
+    GCLMemDesc filterMemDesc[4];
+    for (U32 i = 0; i < configNums.size(); ++i) {
+        runInfo = bestRunInfo;
+        for (U32 j = 0; j < configNums[i]; ++j) {
+            runInfo.best_w[i] = configInfos[i][0][j];
+            runInfo.best_c[i] = configInfos[i][1][j];
+            runInfo.best_k[i] = configInfos[i][2][j];
+            inputMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+            outputMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+            for (U32 m = 0; m < filterDesc.size(); m++) {
+                filterMemDesc[m] = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+            }
+            if (multihead_attention_infer_output_size_mali(inputDesc, filterDesc, NULL,
+                    firstFCSliceNum, &inputMemDesc, &outputMemDesc, &runInfo) != SUCCESS) {
+                continue;
+            }
+            if (multihead_attention_transform_filter_bytes_mali(
+                    filterDesc, filterMemDesc, &bytes, &runInfo) != SUCCESS) {
+                continue;
+            }
+            if (maxBytes < bytes) {
+                maxBytes = bytes;
+            }
+            if (multihead_attention_infer_forward_tmp_bytes_mali(inputDesc, filterDesc,
+                    eltwiseWithLayerNormIn, firstFCSliceNum, matmulSliceLen, &bytes,
+                    &runInfo) != SUCCESS) {
+                continue;
+            }
+            if (maxBytes < bytes) {
+                maxBytes = bytes;
+            }
+            if (maxInputSize < inputMemDesc.byteSize) {
+                maxInputSize = inputMemDesc.byteSize;
+            }
+            if (maxOutputSize < outputMemDesc.byteSize) {
+                maxOutputSize = outputMemDesc.byteSize;
+            }
+            if (maxFilterSize[0] < filterMemDesc[0].byteSize) {
+                maxFilterSize[0] = filterMemDesc[0].byteSize;
+            }
+            if (maxFilterSize[1] < filterMemDesc[1].byteSize) {
+                maxFilterSize[1] = filterMemDesc[1].byteSize;
+            }
+            if (maxFilterSize[2] < filterMemDesc[2].byteSize) {
+                maxFilterSize[2] = filterMemDesc[2].byteSize;
+            }
+            if (maxFilterSize[3] < filterMemDesc[3].byteSize) {
+                maxFilterSize[3] = filterMemDesc[3].byteSize;
+            }
+            inputMemDescs.push_back(inputMemDesc);
+            outputMemDescs.push_back(outputMemDesc);
+            filterMemDescs0.push_back(filterMemDesc[0]);
+            filterMemDescs1.push_back(filterMemDesc[1]);
+            filterMemDescs2.push_back(filterMemDesc[2]);
+            filterMemDescs3.push_back(filterMemDesc[3]);
+            runInfos.push_back(runInfo);
+        }
+        U32 algosNum = runInfos.size();
+        if (algosNum == 0) {
+            CHECK_STATUS(NOT_SUPPORTED);
+        }
+
+        if (maxInputSize > inputMemDescs[0].byteSize || i == 0) {
+            inputMemDescs[0].byteSize = maxInputSize;
+            if (i > 0) {
+                CHECK_STATUS(gcl_release_memory(input));
+            }
+            input->desc = inputMemDescs[0];
+            CHECK_STATUS(gcl_create_memory(handle, input));
+        }
+        if (maxOutputSize > outputMemDescs[0].byteSize || i == 0) {
+            outputMemDescs[0].byteSize = maxOutputSize;
+            if (i > 0) {
+                CHECK_STATUS(gcl_release_memory(output));
+            }
+            output->desc = outputMemDescs[0];
+            CHECK_STATUS(gcl_create_memory(handle, output));
+        }
+        filterMemDescs.push_back(filterMemDescs0);
+        filterMemDescs.push_back(filterMemDescs1);
+        filterMemDescs.push_back(filterMemDescs2);
+
filterMemDescs.push_back(filterMemDescs3); + for (U32 k = 0; k < filterDesc.size(); k++) { + if (maxFilterSize[k] > filterMemDescs[k][0].byteSize || i == 0) { + filterMemDescs[k][0].byteSize = maxFilterSize[k]; + if (i > 0) { + CHECK_STATUS(gcl_release_memory((GCLMem_t)filter[k])); + } + ((GCLMem_t)filter[k])->desc = filterMemDescs[k][0]; + CHECK_STATUS(gcl_create_memory(handle, (GCLMem_t)filter[k])); + } + } + if (maxBytes > tmpbuf->desc.byteSize || i == 0) { + tmpbuf->desc.byteSize = maxBytes; + if (i > 0) { + CHECK_STATUS(gcl_release_subMem(tmpbuf)); + CHECK_STATUS(gcl_release_memory(tmpbuf)); + } + if (maxBytes) { + gcl_create_memory(handle, tmpbuf); + } + } + + double minTime = DBL_MAX; + for (U32 ii = 0; ii < algosNum; ii++) { + input->desc = inputMemDescs[ii]; + output->desc = outputMemDescs[ii]; + ((GCLMem_t)filter[0])->desc = filterMemDescs0[ii]; + ((GCLMem_t)filter[1])->desc = filterMemDescs1[ii]; + ((GCLMem_t)filter[2])->desc = filterMemDescs2[ii]; + ((GCLMem_t)filter[3])->desc = filterMemDescs3[ii]; + U32 best_w = runInfos[ii].best_w[i]; + U32 best_c = runInfos[ii].best_c[i]; + U32 best_k = runInfos[ii].best_k[i]; + runKernelBe = handle->kernelVec->size(); + if (multihead_attention_mali(handle, inputDesc, input, filterDesc, filter, biasDesc, + bias, layerNormAlpha, layerNormBeta, multiplyAlpha, multiplyBeta, + firstFCSliceNum, matmulSliceLen, eltwiseWithLayerNormIn, activation, maxBytes, + tmpbuf, outputDesc, output, &runInfos[ii]) == SUCCESS) { + runKernelEnd = handle->kernelVec->size(); + runKernelBe = runKernelBe + 1; + auto kernelInfo = (*handle->kernelVec)[runKernelBe]; + if (kernelInfo.name == "unknow_fill_memory_zero_vec4_f16") { + runKernelBe = runKernelBe + 1; + } + if (i == 0) { + goto R00; + } + runKernelBe = runKernelBe + 1; + kernelInfo = (*handle->kernelVec)[runKernelBe]; + if (kernelInfo.name == "unknow_fill_memory_zero_vec4_f16") { + runKernelBe = runKernelBe + 1; + } + if (i == 1) { + goto R00; + } + runKernelBe = runKernelBe + 2; + if (i == 2) { + goto R00; + } + runKernelBe = runKernelBe + 1; + if (i == 3) { + goto R00; + } + runKernelBe = runKernelBe + 2; + if (i == 4) { + goto R00; + } + runKernelBe = runKernelBe + 1; + if (runKernelBe >= runKernelEnd) { + CHECK_STATUS(NOT_MATCH); + } + R00: + gcl_run_kernelVec_timing(handle, runKernelBe, runKernelBe + 1); + if (minTime > handle->t_execute) { + minTime = handle->t_execute; + bestRunInfo.best_w[i] = best_w; + bestRunInfo.best_c[i] = best_c; + bestRunInfo.best_k[i] = best_k; + } + } + } + inputMemDescs.clear(); + outputMemDescs.clear(); + filterMemDescs.clear(); + filterMemDescs0.clear(); + filterMemDescs1.clear(); + filterMemDescs2.clear(); + filterMemDescs3.clear(); + runInfos.clear(); + CHECK_STATUS(gcl_finish(handle)); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + if (minTime == DBL_MAX) { + CHECK_STATUS(NOT_SUPPORTED); + } + } + *forwardRunInfo = bestRunInfo; + CHECK_STATUS(gcl_finish(handle)); + gcl_destroy_gclmem(input); + gcl_destroy_gclmem(output); + gcl_destroy_gclmem(tmpbuf); + for (auto p : filter) { + gcl_destroy_gclmem(GCLMem_t(p)); + } + for (auto p : bias) { + gcl_destroy_gclmem(GCLMem_t(p)); + } + for (auto p : layerNormAlpha) { + gcl_destroy_gclmem(GCLMem_t(p)); + } + for (auto p : layerNormBeta) { + gcl_destroy_gclmem(GCLMem_t(p)); + } + runInfos.clear(); + inputMemDescs.clear(); + outputMemDescs.clear(); + filterMemDescs[0].clear(); + filterMemDescs[1].clear(); + filterMemDescs[2].clear(); + filterMemDescs[3].clear(); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + 
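The tuning loop above times one stage at a time: it walks handle->kernelVec by index to find the single kernel that belongs to the stage currently being tuned, skipping the zero-fill helper kernel when the enqueue step inserted one. A simplified sketch of that bookkeeping, assuming kernels are represented by a plain vector of names rather than Bolt's kernelVec:

    #include <cstddef>
    #include <string>
    #include <vector>

    // Advance past an optional zero-fill kernel that the enqueue step may have
    // inserted before the kernel we actually want to time.
    size_t skip_fill_zero(const std::vector<std::string> &kernels, size_t pos)
    {
        if (pos < kernels.size() && kernels[pos].find("fill_memory_zero") != std::string::npos) {
            return pos + 1;
        }
        return pos;
    }

    // Pick the start of the one-kernel timing window for stage `stage`, given
    // per-stage kernel counts; the source hard-codes these counts and uses
    // goto to bail out once the window is reached.
    size_t stage_window_begin(const std::vector<size_t> &kernelsPerStage, size_t stage, size_t base)
    {
        size_t begin = base;
        for (size_t s = 0; s < stage; ++s) {
            begin += kernelsPerStage[s];
        }
        return begin;
    }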
CHECK_STATUS(gcl_off_queue_profiling(handle)); + return SUCCESS; +} +EE multihead_attention_transform_filter_bytes_mali(std::vector filterDesc, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo) +{ + EE ret = SUCCESS; + switch (filterDesc[0].dt) { + case DT_F16: { + ret = multihead_attention_transform_filter_bytes_mali_fp16( + filterDesc, gclmemFilterDesc, bytes, forwardRunInfo); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE multihead_attention_transform_filter_mali(GCLHandle_t handle, + std::vector filterDesc, + std::vector filter, + std::vector *fltmemDesc, + std::vector fltmem, + ForwardRunInfoMali_t forwardRunInfo) +{ + EE ret = SUCCESS; + switch (filterDesc[0].dt) { + case DT_F16: { + ret = multihead_attention_transform_filter_mali_fp16( + handle, filterDesc, filter, fltmemDesc, fltmem, forwardRunInfo); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE multihead_attention_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + std::vector filterDesc, + std::vector eltwiseWithLayerNormIn, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = multihead_attention_infer_forward_tmp_bytes_mali_fp16(inputDesc, filterDesc, + eltwiseWithLayerNormIn, firstFCSliceNum, matmulSliceLen, bytes, forwardRunInfo); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE multihead_attention_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + std::vector filterDesc, + std::vector filter, + std::vector biasDesc, + std::vector bias, + std::vector layerNormAlpha, + std::vector layerNormBeta, + void *multiplyAlpha, + void *multiplyBeta, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + std::vector eltwiseWithLayerNormIn, + ActivationMode activation, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ForwardRunInfoMali_t forwardRunInfo) +{ + EE ret = SUCCESS; + ret = multihead_attention_checkpara_mali(handle, inputDesc, input, filterDesc, filter, bias, + layerNormAlpha, layerNormBeta, multiplyAlpha, multiplyBeta, firstFCSliceNum, matmulSliceLen, + eltwiseWithLayerNormIn, activation, tmpBuf, outputDesc, output); + switch (inputDesc.dt) { + case DT_F16: { + ret = multihead_attention_mali_fp16(handle, inputDesc, input, filterDesc, filter, + biasDesc, bias, layerNormAlpha, layerNormBeta, multiplyAlpha, multiplyBeta, + firstFCSliceNum, matmulSliceLen, eltwiseWithLayerNormIn, activation, tmpBytes, + tmpBuf, outputDesc, output, forwardRunInfo); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/tensor_computing/src/gpu/mali/normalization.cpp b/compute/tensor/src/gpu/mali/normalization.cpp similarity index 50% rename from tensor_computing/src/gpu/mali/normalization.cpp rename to compute/tensor/src/gpu/mali/normalization.cpp index 6621de14..aa9d58fb 100644 --- a/tensor_computing/src/gpu/mali/normalization.cpp +++ b/compute/tensor/src/gpu/mali/normalization.cpp @@ -11,70 +11,75 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
DEALINGS IN THE SOFTWARE. - #include "sys.h" -#include "type.h" +#include "types.h" #include "tensor_desc.h" #include "error.h" #include "gpu/mali/tensor_computing_mali.h" #include "gpu/mali/fp16/normalization_mali_fp16.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" -EE normalization_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc) { - if(outputDesc) *outputDesc = inputDesc; - if(inputDesc.df == DF_MKT) { +EE normalization_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + if (outputDesc) { + *outputDesc = inputDesc; + } + if (inputDesc.df == DF_MKT) { DataType dt; U32 m, k, t; U32 w, h, c; get_nlp_mkt_val(inputDesc, &dt, &m, &k, &t); map_nlp_mkt_to_ncwhc4(m, k, t, &w, &h, &c); c = c * 4; - CHECK_STATUS(infer_gclmem_desc_ncwhc4(w, h, c, 0, 0, w, h, c, dt, dt, gclmemInputDesc, gclmemOutputDesc)); + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + w, h, c, 0, 0, w, h, c, dt, dt, gclmemInputDesc, gclmemOutputDesc)); return SUCCESS; - } + } return NOT_SUPPORTED; } inline EE normalization_checkpara_mali(GCLHandle_t handle, - GCLMem_t alpha, - GCLMem_t beta, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - if(nullptr == handle || nullptr == alpha || nullptr == beta || nullptr == input || nullptr == output) return NULL_POINTER; - if(inputDesc.df != outputDesc.df || inputDesc.df != DF_MKT) return NOT_SUPPORTED; - if(input->desc.memFormat != output->desc.memFormat || input->desc.memFormat != DF_NCWHC4) return NOT_SUPPORTED; - return SUCCESS; + GCLMem_t alpha, + GCLMem_t beta, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (nullptr == handle || nullptr == alpha || nullptr == beta || nullptr == input || + nullptr == output) { + return NULL_POINTER; + } + if (inputDesc.df != outputDesc.df || inputDesc.df != DF_MKT) { + return NOT_SUPPORTED; + } + if (input->desc.memFormat != output->desc.memFormat || input->desc.memFormat != DF_NCWHC4) { + return NOT_SUPPORTED; + } + return SUCCESS; } EE layer_normalization_mali(GCLHandle_t handle, - GCLMem_t alpha, - GCLMem_t beta, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { + TensorDesc inputDesc, + GCLMem_t input, + GCLMem_t alpha, + GCLMem_t beta, + TensorDesc outputDesc, + GCLMem_t output) +{ EE ret = SUCCESS; - CHECK_STATUS(normalization_checkpara_mali(handle, alpha, beta, inputDesc, input, outputDesc, output)); - switch(inputDesc.dt){ - case DT_F16:{ + CHECK_STATUS( + normalization_checkpara_mali(handle, alpha, beta, inputDesc, input, outputDesc, output)); + switch (inputDesc.dt) { + case DT_F16: { ret = normalization_mali_fp16(handle, alpha, beta, inputDesc, input, outputDesc, output); break; } - case DT_I8:{ - ret = NOT_SUPPORTED; - break; - } default: ret = NOT_SUPPORTED; break; } return ret; } - - - diff --git a/compute/tensor/src/gpu/mali/padding.cpp b/compute/tensor/src/gpu/mali/padding.cpp new file mode 100644 index 00000000..16e8f8c6 --- /dev/null +++ b/compute/tensor/src/gpu/mali/padding.cpp @@ -0,0 +1,72 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
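For the layer-normalization diff above, a CPU reference of what the fp16 kernel computes per token is useful when validating the GPU path; this sketch normalizes over the k dimension with per-element scale and shift (the epsilon value is an assumption for illustration, not taken from Bolt):

    #include <cmath>
    #include <vector>

    // y = alpha * (x - mean) / sqrt(var + eps) + beta, computed over one token.
    std::vector<float> layer_norm_ref(const std::vector<float> &x,
        const std::vector<float> &alpha, const std::vector<float> &beta, float eps = 1e-6f)
    {
        size_t k = x.size();
        float mean = 0, var = 0;
        for (float v : x) {
            mean += v;
        }
        mean /= k;
        for (float v : x) {
            var += (v - mean) * (v - mean);
        }
        var /= k;
        std::vector<float> y(k);
        for (size_t i = 0; i < k; ++i) {
            y[i] = alpha[i] * (x[i] - mean) / std::sqrt(var + eps) + beta[i];
        }
        return y;
    }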
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/padding_mali_fp16.h" + +EE padding_infer_output_size_mali(TensorDesc inputDesc, + PadParamSpec padParamSpec, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + /*tensorDesc record cpu org data format info*/ + /*gclmemDesc record gpu trans data format info*/ + DataType idt; + DataFormat idf; + U32 iw, ih, ic, in; + U32 ow, oh; + U32 pw, ph, pr, pb; + tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); + pw = padParamSpec.left; + pr = padParamSpec.right; + ph = padParamSpec.top; + pb = padParamSpec.bottom; + // if (pw!=pr || ph != pb) CHECK_STATUS(NOT_SUPPORTED); + ow = iw + pw + pr; + oh = ih + ph + pb; + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih, ic, 0, 0, ow, oh, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); + if (outputDesc) { + *outputDesc = tensor4df(idt, idf, in, ic, oh, ow); + } + return SUCCESS; +} + +EE padding_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + PadParamSpec padParamSpec, + TensorDesc outputDesc, + GCLMem_t output) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = padding_mali_fp16(handle, inputDesc, input, padParamSpec, outputDesc, output); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/pooling.cpp b/compute/tensor/src/gpu/mali/pooling.cpp new file mode 100644 index 00000000..7f14af45 --- /dev/null +++ b/compute/tensor/src/gpu/mali/pooling.cpp @@ -0,0 +1,124 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
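The output-size rule in padding_infer_output_size_mali above is purely additive; a sketch of the same arithmetic in isolation:

    #include <cstdint>

    using U32 = uint32_t;

    struct Pad { U32 left, right, top, bottom; };

    // Each spatial axis simply grows by the pads on its two sides,
    // matching ow = iw + pw + pr and oh = ih + ph + pb in the source.
    void pad_output_dims(U32 iw, U32 ih, Pad p, U32 *ow, U32 *oh)
    {
        *ow = iw + p.left + p.right;
        *oh = ih + p.top + p.bottom;
    }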
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/pooling_mali_fp16.h" + +EE pooling_infer_output_size_mali(TensorDesc inputDesc, + PoolingParamSpec poolingParamSpec, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + /*tensorDesc record cpu org data format info*/ + /*gclmemDesc record gpu trans data format info*/ + DataType idt; + DataFormat idf; + U32 iw, ih, ic, in, it; + U32 ow, oh, ot; + U32 kw, kh, kt, sw, sh, st, pl, pt, pr, pb, pt_b, pt_a; + tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw, &it); + pl = poolingParamSpec.padding_left; + pr = poolingParamSpec.padding_right; + pt = poolingParamSpec.padding_top; + pb = poolingParamSpec.padding_bottom; + pt_b = poolingParamSpec.padding_before; + pt_a = poolingParamSpec.padding_after; + kw = poolingParamSpec.kernel_w; + kh = poolingParamSpec.kernel_h; + kt = poolingParamSpec.kernel_t; + sw = poolingParamSpec.stride_w; + sh = poolingParamSpec.stride_h; + st = poolingParamSpec.stride_t; + if (st == 0) { + st = 1; + } + switch (poolingParamSpec.rm) { + case CEIL: { + ow = (U32)(ceil((double(iw + pl + pr - kw) / sw))) + 1; + oh = (U32)(ceil((double(ih + pt + pb - kh) / sh))) + 1; + ot = (U32)(ceil((double(it + pt_b + pt_a - kt) / st))) + 1; + break; + } + case FLOOR: { + ow = (U32)(floor((double(iw + pl + pr - kw) / sw))) + 1; + oh = (U32)(floor((double(ih + pb + pt - kh) / sh))) + 1; + ot = (U32)(floor((double(it + pt_b + pt_a - kt) / st))) + 1; + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + U32 iw_align, ih_align; + ih_align = ih + pt + pb; + ih_align = ih_align - pt * 2; + iw_align = iw + pl + pr; + iw_align = iw_align - pl * 2; + + if (inputDesc.df == DF_NCTHW) { + *outputDesc = tensor5df(idt, idf, in, ic, ot, oh, ow); + } else { + *outputDesc = tensor4df(idt, idf, in, ic, oh, ow); + it = 1; + ot = 1; + } + + ic = ALIGN(ic, 4); + CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw_align, ih_align, ic * it, pl, pt, ow, oh, ic * ot, idt, + idt, gclmemInputDesc, gclmemOutputDesc)); + return SUCCESS; +} +EE pooling_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + PoolingParamSpec poolingParamSpec, + const void *scale, + GCLMem_t temp, + TensorDesc outputDesc, + GCLMem_t output) +{ + UNUSED(scale); + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = pooling_mali_fp16( + handle, inputDesc, input, poolingParamSpec, outputDesc, output, temp); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE pooling_infer_forward_tmp_bytes_mali( + TensorDesc inputDesc, U32 *bytes, ForwardRunInfoMali_t forwardRunInfo) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = pooling_infer_forward_tmp_bytes_mali_fp16(inputDesc, bytes, forwardRunInfo); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/power.cpp b/compute/tensor/src/gpu/mali/power.cpp new file mode 100644 
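pooling_infer_output_size_mali above distinguishes CEIL and FLOOR rounding when deriving output extents; the same per-axis formula in isolation:

    #include <cmath>
    #include <cstdint>

    using U32 = uint32_t;

    // Pooled extent along one axis, matching the CEIL/FLOOR cases in the source:
    // o = round((i + pad_begin + pad_end - k) / s) + 1.
    U32 pooled_size(U32 i, U32 k, U32 s, U32 pad_begin, U32 pad_end, bool ceil_mode)
    {
        double span = double(i + pad_begin + pad_end - k) / s;
        return (U32)(ceil_mode ? std::ceil(span) : std::floor(span)) + 1;
    }
    // e.g. pooled_size(7, 3, 2, 0, 0, false) == 3, and both modes agree there;
    // pooled_size(8, 3, 2, 0, 0, true) == 4 while the floor variant gives 3.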
index 00000000..16e343ab --- /dev/null +++ b/compute/tensor/src/gpu/mali/power.cpp @@ -0,0 +1,113 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/power_mali_fp16.h" + +EE power_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + if (outputDesc) { + *outputDesc = inputDesc; + } + DataType idt; + DataFormat idf = inputDesc.df; + U32 iw, ih, ic, in; + if (idf == DF_NCHW || idf == DF_NORMAL) { + if (gclmemInputDesc) { + tensorSelectGet(inputDesc, &idt, NULL, &in, &ic, &ih, &iw); + if (gclmemInputDesc->memFormat == DF_NCHW) { + CHECK_STATUS(infer_gclmem_desc_nchw( + iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); + if (gclmemInputDesc && gclmemOutputDesc) { + *gclmemOutputDesc = *gclmemInputDesc; + } + } else if (gclmemInputDesc->memFormat == DF_NCWHC4) { + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); + if (gclmemInputDesc && gclmemOutputDesc) { + *gclmemOutputDesc = *gclmemInputDesc; + } + } else { + return NOT_SUPPORTED; + } + } + return SUCCESS; + } + if (idf == DF_MKT) { + if (gclmemInputDesc) { + if (gclmemInputDesc->memFormat == DF_NCWHC4) { + get_nlp_mkt_val(inputDesc, &idt, &in, &ic, &ih); + iw = 1; + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); + if (gclmemInputDesc && gclmemOutputDesc) { + *gclmemOutputDesc = *gclmemInputDesc; + } + } else { + return NOT_SUPPORTED; + } + } + return SUCCESS; + } + return NOT_SUPPORTED; +} + +inline EE power_checkpara_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + EE ret = SUCCESS; + if (handle == nullptr || nullptr == input || nullptr == output) { + ret = NULL_POINTER; + } + if (inputDesc.df != outputDesc.df) { + ret = NOT_SUPPORTED; + } + if (input->desc.memFormat != output->desc.memFormat) { + ret = NOT_SUPPORTED; + } + if (inputDesc.df != DF_NCHW && inputDesc.df != DF_MKT && inputDesc.df != DF_NORMAL) { + ret = NOT_SUPPORTED; + } + if (input->desc.memFormat != DF_NCHW && input->desc.memFormat != DF_NCWHC4) { + ret = NOT_SUPPORTED; + } + return ret; +} + +EE power_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + PowerParamSpec p, + TensorDesc outputDesc, + GCLMem_t 
output) +{ + EE ret = SUCCESS; + CHECK_STATUS(power_checkpara_mali(handle, inputDesc, input, outputDesc, output)); + switch (inputDesc.dt) { + case DT_F16: { + ret = power_mali_fp16(handle, inputDesc, input, p, outputDesc, output); + break; + } + case DT_I32: { + ret = power_mali_fp16(handle, inputDesc, input, p, outputDesc, output); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/preallocated_memory.cpp b/compute/tensor/src/gpu/mali/preallocated_memory.cpp new file mode 100644 index 00000000..b12b485f --- /dev/null +++ b/compute/tensor/src/gpu/mali/preallocated_memory.cpp @@ -0,0 +1,94 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" + +inline EE preallocated_memory_checkpara_mali( + GCLHandle_t handle, TensorDesc outputDesc, GCLMem_t output) +{ + if (handle == nullptr || nullptr == output) { + return NULL_POINTER; + } + if (output->desc.memFormat != DF_NCHW) { + return NOT_SUPPORTED; + } + if (outputDesc.dt != DT_F16 && outputDesc.dt != DT_I32) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE preallocated_memory_core_mali_fp16( + GCLHandle_t handle, TensorDesc outputDesc, GCLMem_t output) +{ + DataType dt = outputDesc.dt; + U32 numElements = output->desc.num; + cl_mem outbuf = output->mem; + U32 gs = numElements; + U32 ls = 0; + U32 dim = 1; + Kernel kernel; + char dataType[16]; + if (dt == DT_I32) { + strcpy(dataType, "i32"); + } + if (dt == DT_F16) { + strcpy(dataType, "f16"); + } + char kernelName[128]; + sprintf(kernelName, "fill_memory_zero_%s", dataType); + + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, numElements, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, &gs, &ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, &gs, &ls, kernelName)); + CHECK_STATUS(gcl_print_memory(handle, output, "preallocated_memory_output")); +#endif + return SUCCESS; +} + +EE preallocated_memory_infer_output_size_mali(TensorDesc *outputDesc, GCLMemDesc_t gclmemOutputDesc) +{ + U32 w, h, c, n; + TensorDesc desc = *outputDesc; + U32 ndims = desc.nDims; + DataType dt = desc.dt; + if (ndims < 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + w = desc.dims[0]; + h = (ndims > 1) ? desc.dims[1] : 1; + c = (ndims > 2) ? 
desc.dims[2] : 1; + n = (ndims > 3) ? desc.dims[3] : 1; + if (dt != DT_F16 && dt != DT_I32) { + CHECK_STATUS(NOT_SUPPORTED); + } + if (n != 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + CHECK_STATUS(infer_gclmem_desc_nchw(0, 0, 0, 0, 0, w, h, c, dt, dt, NULL, gclmemOutputDesc)); + return SUCCESS; +} + +EE preallocated_memory_mali(GCLHandle_t handle, TensorDesc outputDesc, GCLMem_t output) +{ + EE ret = SUCCESS; + CHECK_STATUS(preallocated_memory_checkpara_mali(handle, outputDesc, output)); + CHECK_STATUS(preallocated_memory_core_mali_fp16(handle, outputDesc, output)); + return ret; +} diff --git a/compute/tensor/src/gpu/mali/prelu.cpp b/compute/tensor/src/gpu/mali/prelu.cpp new file mode 100644 index 00000000..d8e69f0f --- /dev/null +++ b/compute/tensor/src/gpu/mali/prelu.cpp @@ -0,0 +1,84 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
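Before prelu.cpp: note how preallocated_memory_infer_output_size_mali above reads TensorDesc.dims back-to-front — dims[0] is the innermost (width) axis and missing outer axes default to 1. A stand-alone illustration of that unpacking, using a stripped-down stand-in for TensorDesc (MiniDesc is invented here; the real struct lives in tensor_desc.h):

    #include <cstdio>

    struct MiniDesc {      // stand-in for TensorDesc; the real one also carries DataType/DataFormat
        unsigned nDims;
        unsigned dims[6];  // dims[0] = w (innermost), dims[1] = h, dims[2] = c, dims[3] = n
    };

    int main()
    {
        MiniDesc desc = {3, {128, 64, 8}};  // a 3-D tensor: w=128, h=64, c=8
        unsigned w = desc.dims[0];
        unsigned h = (desc.nDims > 1) ? desc.dims[1] : 1;
        unsigned c = (desc.nDims > 2) ? desc.dims[2] : 1;
        unsigned n = (desc.nDims > 3) ? desc.dims[3] : 1;  // absent axes default to 1
        printf("n=%u c=%u h=%u w=%u\n", n, c, h, w);       // n=1 c=8 h=64 w=128
        return 0;
    }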
+ +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/prelu_mali_fp16.h" + +EE prelu_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + /*tensorDesc record cpu org data format info*/ + /*gclmemDesc record gpu trans data format info*/ + + DataType idt; + DataFormat idf; + U32 iw, ih, ic, in; + tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); + + if (idf == DF_NCHW) { + if (outputDesc) { + *outputDesc = inputDesc; + } + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); + if (gclmemInputDesc && gclmemOutputDesc) { + *gclmemOutputDesc = *gclmemInputDesc; // the input and output mem maybe the same + } + return SUCCESS; + } + return NOT_SUPPORTED; +} + +inline EE prelu_checkpara_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + GCLMem_t weight, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (handle == nullptr || nullptr == weight || nullptr == input || nullptr == output) { + return NULL_POINTER; + } + if (input->desc.memFormat != output->desc.memFormat || input->desc.memFormat != DF_NCWHC4) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +EE prelu_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + GCLMem_t weight, + PReLUParamSpec preluDesc, + TensorDesc outputDesc, + GCLMem_t output) +{ + EE ret = SUCCESS; + CHECK_STATUS(prelu_checkpara_mali(handle, inputDesc, input, weight, outputDesc, output)); + switch (inputDesc.dt) { + case DT_F16: { + ret = prelu_mali_fp16(handle, inputDesc, input, weight, preluDesc, outputDesc, output); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/reshape.cpp b/compute/tensor/src/gpu/mali/reshape.cpp new file mode 100644 index 00000000..6c63b50c --- /dev/null +++ b/compute/tensor/src/gpu/mali/reshape.cpp @@ -0,0 +1,157 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
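Nearly every kernel in this directory, prelu_mali included, insists on the DF_NCWHC4 memory format. The sketch below is one plausible reading of that name — channels packed in groups of four, the group index walked width-major then height, the four lanes of a group contiguous — offered as an assumption, not a definitive statement; the authoritative stride math is in gclmem_desc_infer.h and the OpenCL kernels:

    #include <cstddef>
    #include <cstdio>

    // Hypothetical linear offset of element (n, c, h, w) in an NCWHC4 buffer of
    // logical shape N x C x H x W, with C already padded to a multiple of 4.
    // Assumed nesting, outermost to innermost: N, C/4, W, H, C%4.
    static size_t ncwhc4Offset(size_t n, size_t c, size_t h, size_t w,
                               size_t C, size_t H, size_t W)
    {
        size_t group = c / 4, lane = c % 4;
        return (((n * (C / 4) + group) * W + w) * H + h) * 4 + lane;
    }

    int main()
    {
        // element (n=0, c=5, h=1, w=2) of a 1 x 8 x 2 x 3 tensor -> offset 45 under the assumed nesting
        printf("%zu\n", ncwhc4Offset(0, 5, 1, 2, 8, 2, 3));
        return 0;
    }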
+ +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/reshape_mali_fp16.h" + +EE reshape_infer_output_size_mali(TensorDesc inputDesc, + ReshapeParamSpec p, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + /*tensorDesc record cpu org data format info*/ + /*gclmemDesc record gpu trans data format info*/ + if (outputDesc == nullptr || gclmemInputDesc == nullptr || gclmemOutputDesc == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + I32 *dims = p.shape_dims; + I32 shapeSize = p.shape_size; + int inputElementNum = tensorNumElements(inputDesc); + int outputElementNum = 1; + for (int i = 0; i < shapeSize; i++) { + outputElementNum *= dims[i]; + } + int index_range = ((int)inputDesc.nDims > shapeSize) ? shapeSize : inputDesc.nDims; + if (inputElementNum > 0 && outputElementNum > 0 && inputElementNum != outputElementNum) { + for (int i = 0; i < index_range; i++) { + if ((inputElementNum / (int)inputDesc.dims[inputDesc.nDims - 1 - i]) == + (outputElementNum / dims[i])) { + dims[i] = inputDesc.dims[inputDesc.nDims - 1 - i]; + break; + } + } + } + *outputDesc = inputDesc; + if (shapeSize == 2) { + (*outputDesc).df = DF_NORMAL; + } else if (shapeSize == 3) { + (*outputDesc).df = DF_MKT; + } else if (shapeSize == 4) { + (*outputDesc).df = DF_NCHW; + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + (*outputDesc).nDims = shapeSize; + + U32 factor = 1; + U32 count = 0; + for (I32 i = 0; i < shapeSize; i++) { + I32 value = dims[i]; + if (value == 0) { + value = inputDesc.dims[inputDesc.nDims - 1 - i]; + } + if (value == -1) { + value = 0; + count++; + } else { + factor *= value; + } + (*outputDesc).dims[shapeSize - 1 - i] = value; + } + + if (count > 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + + for (I32 i = 0; i < 4; i++) { + if (i < shapeSize) { + if ((*outputDesc).dims[i] == 0) { + (*outputDesc).dims[i] = tensorNumElements(inputDesc) / factor; + } + } else { + (*outputDesc).dims[i] = 1; + } + } + + DataType idt, odt; + U32 ic, ih, iw, it; + U32 oc, oh, ow, ot; + tensorSelectGet(inputDesc, &idt, NULL, NULL, &ic, &ih, &iw, &it); + tensorSelectGet((*outputDesc), &odt, NULL, NULL, &oc, &oh, &ow, &ot); + if (gclmemInputDesc->memFormat == DF_NCHW || gclmemInputDesc->byteSize == 0) { + CHECK_STATUS( + infer_gclmem_desc_nchw(iw, ih, ic * it, 0, 0, 0, 0, 0, idt, odt, gclmemInputDesc, NULL)); + } else { + ic = ALIGN(ic, 4); + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih, ic * it, 0, 0, 0, 0, 0, idt, odt, gclmemInputDesc, NULL)); + } + CHECK_STATUS( + infer_gclmem_desc_nchw(0, 0, 0, 0, 0, ow, oh, oc * ot, idt, odt, NULL, gclmemOutputDesc)); + return SUCCESS; +} + +inline EE reshape_checkpara_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + if (handle == nullptr || nullptr == input || nullptr == output) { + return NULL_POINTER; + } + return SUCCESS; +} + +EE reshape_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc, + U32 *bytes) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = reshape_infer_forward_tmp_bytes_mali_fp16( + inputDesc, outputDesc, gclmemInputDesc, gclmemOutputDesc, bytes); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE reshape_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + GCLMem_t tmpbuf, + TensorDesc outputDesc, + GCLMem_t 
output) +{ + EE ret = SUCCESS; + CHECK_STATUS(reshape_checkpara_mali(handle, inputDesc, input, outputDesc, output)); + switch (inputDesc.dt) { + case DT_F16: { + ret = reshape_mali_fp16(handle, inputDesc, input, outputDesc, output, tmpbuf); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/rnncell.cpp b/compute/tensor/src/gpu/mali/rnncell.cpp new file mode 100644 index 00000000..fe19e09c --- /dev/null +++ b/compute/tensor/src/gpu/mali/rnncell.cpp @@ -0,0 +1,439 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/rnncell_mali_fp16.h" +#include "gpu/mali/fp16/rnn_mali_fp16.h" + +inline EE rnncell_checkpara_mali(GCLHandle_t handle, + TensorDesc xDesc, + GCLMem_t currentX, + TensorDesc filterDesc, + GCLMem_t filter, + GCLMem_t bias, + GCLMem_t state, + RNNParamSpec rnncellDesc, + GCLMem_t tmpBuf, + TensorDesc hDesc, + GCLMem_t output) +{ + if (nullptr == handle || nullptr == currentX || nullptr == filter || nullptr == output || + nullptr == state || nullptr == bias || nullptr == tmpBuf) { + return NULL_POINTER; + } + DataFormat df; + DataType dt; + U32 iB, iX; + if (xDesc.nDims == 2) { + CHECK_STATUS(tensor2dGet(xDesc, &dt, &df, &iB, &iX)); + } + if (xDesc.nDims == 3) { + if (xDesc.df != DF_MTK && xDesc.df != DF_MKT) { + CHECK_STATUS(NOT_SUPPORTED); + } + U32 m, k, t; + get_nlp_mkt_val(xDesc, &dt, &m, &k, &t); + iB = m; + iX = k; + if (t != 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + } + if (iB != 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + U32 hDim = rnncellDesc.numOutput; + U32 col = (rnncellDesc.numProjection > 0) ? 
rnncellDesc.numProjection : hDim; + U32 filterRow, filterCol; + tensorSelectGet(filterDesc, NULL, NULL, NULL, NULL, &filterRow, &filterCol); + if (filterCol != hDim + iX) { + CHECK_STATUS(NOT_MATCH); + } + if (filterRow != col * 4) { + CHECK_STATUS(NOT_MATCH); + } + if (hDesc.df != xDesc.df) { + CHECK_STATUS(NOT_MATCH); + } + if (hDesc.dims[0] != hDim && hDesc.dims[1] != hDim) { + CHECK_STATUS(NOT_MATCH); + } + return SUCCESS; +} + +EE rnncell_infer_output_size_mali(TensorDesc inputDesc, + RNNParamSpec rnncellDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemStateDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + DataType dt; + DataFormat df; + U32 iB, iX; + U32 hDim = rnncellDesc.numOutput; + if (inputDesc.nDims == 2) { + CHECK_STATUS(tensor2dGet(inputDesc, &dt, &df, &iB, &iX)); + } else if (inputDesc.nDims == 3) { + if (inputDesc.df != DF_MTK && inputDesc.df != DF_MKT) { + CHECK_STATUS(NOT_SUPPORTED); + } + U32 m, k, t; + get_nlp_mkt_val(inputDesc, &dt, &m, &k, &t); + iB = m; + iX = k; + } else { + return NOT_SUPPORTED; + } + + if (outputDesc) { + *outputDesc = inputDesc; + if (inputDesc.nDims == 2) { + (*outputDesc).dims[0] = hDim; + } + if (inputDesc.df == DF_MTK) { + (*outputDesc).dims[0] = hDim; + } + if (inputDesc.df == DF_MKT) { + (*outputDesc).dims[1] = hDim; + } + } + + // U32 item_c = forwardRunInfo->best_c[0]; + // U32 iX_align = (iX + item_c - 1) / item_c * item_c; + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + 1, 1, iX, 0, 0, 1, 1, hDim, dt, dt, gclmemInputDesc, gclmemOutputDesc)); + U32 hdim = rnncellDesc.numOutput; + U32 col = (rnncellDesc.numProjection > 0) ? rnncellDesc.numProjection : hDim; + U32 numState = col + (hdim + 3) / 4 * 4; + CHECK_STATUS( + infer_gclmem_desc_nchw(1, 1, numState, 0, 0, 0, 0, 0, dt, dt, gclmemStateDesc, NULL)); + return SUCCESS; +} + +EE rnncell_infer_forward_algorithm_mali(GCLHandle_t handle, + TensorDesc xDesc, + TensorDesc filterDesc, + TensorDesc biasDesc, + RNNParamSpec rnncellDesc, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + ForwardRunInfoMali_t forwardRunInfo) +{ + if (forwardRunInfo == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + ConvolutionForwardAlgorithm algorithm = (ConvolutionForwardAlgorithm)(forwardRunInfo->algorithm); + if (algorithm != CONVOLUTION_ALGORITHM_NULL) { + return SUCCESS; + } + CHECK_STATUS(gcl_clean_kernelVec(handle)); + CHECK_STATUS(gcl_enable_queue_profiling(handle)); + GCLMem_t currentX = gcl_create_gclmem(); + GCLMem_t state = gcl_create_gclmem(); + GCLMem_t filter0 = gcl_create_gclmem(); + GCLMem_t filter1 = gcl_create_gclmem(); + GCLMem_t bias = gcl_create_gclmem(); + GCLMem_t tmpbuf = gcl_create_gclmem(); + GCLMem_t currentH = gcl_create_gclmem(); + + std::vector<ForwardRunInfoMali> runInfos; + ForwardRunInfoMali runInfo; + runInfo.algorithm = (I32)CONVOLUTION_ALGORITHM_DIRECT; + std::vector<GCLMemDesc> currentXMemDescs; + std::vector<GCLMemDesc> stateMemDescs; + std::vector<GCLMemDesc> currentHMemDescs; + std::vector<GCLMemDesc> filterMemDescs; + std::vector<GCLMemDesc> filterMemProDescs; + U32 configInfo[3][64]; + U32 configNum = 3; + U32 bytes = 0; + U32 maxBytes = 0; + U32 maxCurrentXSize = 0; + U32 maxStateSize = 0; + U32 maxCurrentHSize = 0; + U32 maxFilterSize = 0; + U32 hDim = rnncellDesc.numOutput; + U32 col = (rnncellDesc.numProjection > 0) ? rnncellDesc.numProjection : hDim; + U32 biasNum = col * 4; + U32 stride[3] = {0, 0, 0}; + U32 offset[3] = {0, 0, 0}; + DataType dt = xDesc.dt; + bool useProject = (rnncellDesc.numProjection > 0) ?
true : false; + for (U32 i = 0; i < configNum; ++i) { + configInfo[0][i] = 1; + configInfo[1][i] = 1 << (2 + i); + configInfo[2][i] = 0; + configInfo[0][i + configNum] = 1; + configInfo[1][i + configNum] = 1 << (2 + i); + configInfo[2][i + configNum] = 0; + } + + for (U32 i = 0; i < configNum; ++i) { + GCLMemDesc currentXMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + GCLMemDesc stateMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCHW); + GCLMemDesc currentHMemDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + GCLMemDesc filterMemDesc[2]; + filterMemDesc[0] = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + filterMemDesc[1] = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + runInfo.best_w[0] = configInfo[0][i]; + runInfo.best_c[0] = configInfo[1][i]; + runInfo.best_k[0] = configInfo[2][i]; + runInfo.best_w[1] = configInfo[0][i + configNum]; + runInfo.best_c[1] = configInfo[1][i + configNum]; + runInfo.best_k[1] = configInfo[2][i + configNum]; + if (rnncell_infer_output_size_mali(xDesc, rnncellDesc, NULL, ¤tXMemDesc, + &stateMemDesc, ¤tHMemDesc) != SUCCESS) { + continue; + } + if (rnn_transform_filter_bytes_mali( + filterDesc, rnncellDesc, filterMemDesc, &bytes, &runInfo) != SUCCESS) { + continue; + } + if (maxBytes < bytes) { + maxBytes = bytes; + } + if (rnncell_infer_forward_tmp_bytes_mali( + xDesc, filterDesc, hDesc, rnncellDesc, &bytes, &runInfo) != SUCCESS) { + continue; + } + if (maxBytes < bytes) { + maxBytes = bytes; + } + if (maxCurrentXSize < currentXMemDesc.byteSize) { + maxCurrentXSize = currentXMemDesc.byteSize; + } + if (maxStateSize < stateMemDesc.byteSize) { + maxStateSize = stateMemDesc.byteSize; + } + if (maxCurrentHSize < currentHMemDesc.byteSize) { + maxCurrentHSize = currentHMemDesc.byteSize; + } + if (maxFilterSize < filterMemDesc[0].byteSize) { + maxFilterSize = filterMemDesc[0].byteSize; + } + if (maxFilterSize < filterMemDesc[1].byteSize) { + maxFilterSize = filterMemDesc[1].byteSize; + } + currentXMemDescs.push_back(currentXMemDesc); + stateMemDescs.push_back(stateMemDesc); + currentHMemDescs.push_back(currentHMemDesc); + filterMemDescs.push_back(filterMemDesc[0]); + filterMemProDescs.push_back(filterMemDesc[1]); + runInfos.push_back(runInfo); + } + + U32 algosNum = runInfos.size(); + if (algosNum == 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + currentXMemDescs[0].byteSize = maxCurrentXSize; + stateMemDescs[0].byteSize = maxStateSize; + currentHMemDescs[0].byteSize = maxCurrentHSize; + filterMemDescs[0].byteSize = maxFilterSize; + filterMemProDescs[0].byteSize = maxFilterSize; + + currentX->desc = currentXMemDescs[0]; + state->desc = stateMemDescs[0]; + currentH->desc = currentHMemDescs[0]; + filter0->desc = filterMemDescs[0]; + filter1->desc = filterMemProDescs[0]; + bias->desc.stride[0] = biasNum; + bias->desc.stride[1] = 1; + bias->desc.stride[2] = 1; + bias->desc.offset[0] = 0; + bias->desc.offset[1] = 0; + bias->desc.offset[2] = 0; + bias->desc.num = biasNum; + bias->desc.memFormat = DF_NHWC; + bias->desc.byteSize = biasNum * bytesOf(dt); + bias->desc.memType = GCL_MEM_BUF; + tmpbuf->desc.byteSize = maxBytes; + gcl_create_memory(handle, currentX); + gcl_create_memory(handle, state); + gcl_create_memory(handle, currentH); + gcl_create_memory(handle, filter0); + gcl_create_memory(handle, filter1); + gcl_create_memory(handle, bias); + if (maxBytes) { + gcl_create_memory(handle, tmpbuf); + } + + U32 runKernelBe = 0; + U32 runKernelEnd = 0; + double minTime = DBL_MAX; + double minTimePro = DBL_MAX; + ForwardRunInfoMali bestRunInfo; + for (U32 i 
= 0; i < algosNum; i++) { + currentX->desc = currentXMemDescs[i]; + state->desc = stateMemDescs[i]; + currentH->desc = currentHMemDescs[i]; + filter0->desc = filterMemDescs[i]; + filter1->desc = filterMemProDescs[i]; + GCLMem filter[2]; + filter[0] = *filter0; + filter[1] = *filter1; + + runKernelBe = handle->kernelVec->size() + 1; + runKernelEnd = handle->kernelVec->size() + 2; + if (rnncell_mali(handle, xDesc, currentX, filterDesc, filter, biasDesc, bias, state, + rnncellDesc, batchStrideX, batchStrideH, maxBytes, tmpbuf, hDesc, currentH, + &runInfos[i]) == SUCCESS) { + gcl_run_kernelVec_timing(handle, runKernelBe, runKernelEnd); + if (minTime > handle->t_execute) { + minTime = handle->t_execute; + bestRunInfo.algorithm = runInfos[i].algorithm; + bestRunInfo.best_w[0] = runInfos[i].best_w[0]; + bestRunInfo.best_c[0] = runInfos[i].best_c[0]; + bestRunInfo.best_k[0] = runInfos[i].best_k[0]; + } + if (useProject) { + runKernelBe += 2; + runKernelEnd += 2; + gcl_run_kernelVec_timing(handle, runKernelBe, runKernelEnd); + if (minTimePro > handle->t_execute) { + minTimePro = handle->t_execute; + bestRunInfo.algorithm = runInfos[i].algorithm; + bestRunInfo.best_w[1] = runInfos[i].best_w[1]; + bestRunInfo.best_c[1] = runInfos[i].best_c[1]; + bestRunInfo.best_k[1] = runInfos[i].best_k[1]; + } + } + } + } + if (minTime == DBL_MAX) { + CHECK_STATUS(NOT_SUPPORTED); + } + if (useProject && minTimePro == DBL_MAX) { + CHECK_STATUS(NOT_SUPPORTED); + } + *forwardRunInfo = bestRunInfo; + CHECK_STATUS(gcl_finish(handle)); + gcl_destroy_gclmem(currentX); + gcl_destroy_gclmem(state); + gcl_destroy_gclmem(currentH); + gcl_destroy_gclmem(filter0); + gcl_destroy_gclmem(filter1); + gcl_destroy_gclmem(bias); + runInfos.clear(); + currentXMemDescs.clear(); + stateMemDescs.clear(); + currentHMemDescs.clear(); + filterMemDescs.clear(); + filterMemProDescs.clear(); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + CHECK_STATUS(gcl_off_queue_profiling(handle)); + return SUCCESS; +} + +EE rnn_transform_filter_bytes_mali(TensorDesc filterDesc, + RNNParamSpec rnnParamSpec, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { + case DT_F16: { + ret = rnn_transform_filter_bytes_mali_fp16( + filterDesc, rnnParamSpec, gclmemFilterDesc, bytes, forwardRunInfo); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE rnn_transform_filter_mali(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + RNNParamSpec rnnParamSpec, + TensorDesc *fltmemDesc, + GCLMem_t fltmem, + ForwardRunInfoMali_t forwardRunInfo) +{ + EE ret = SUCCESS; + switch (filterDesc.dt) { + case DT_F16: { + ret = rnn_transform_filter_mali_fp16( + handle, filterDesc, filter, rnnParamSpec, fltmemDesc, fltmem, forwardRunInfo); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE rnncell_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + RNNParamSpec rnncellDesc, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = rnncell_infer_forward_tmp_bytes_mali_fp16( + inputDesc, filterDesc, outputDesc, rnncellDesc, bytes, forwardRunInfo); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE rnncell_mali(GCLHandle_t handle, + TensorDesc xDesc, + const GCLMem_t currentX, + TensorDesc filterDesc, + GCLMem_t filter, + TensorDesc biasDesc, + GCLMem_t bias, + GCLMem_t state, +
RNNParamSpec rnncellDesc, + U32 batchStrideX, + U32 batchStrideH, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc hDesc, + GCLMem_t output, + ForwardRunInfoMali_t forwardRunInfo) +{ + EE ret = SUCCESS; + ret = rnncell_checkpara_mali(handle, xDesc, currentX, filterDesc, filter, bias, state, + rnncellDesc, tmpBuf, hDesc, output); + switch (xDesc.dt) { + case DT_F16: { + ret = rnncell_mali_fp16(handle, xDesc, currentX, filterDesc, filter, biasDesc, bias, + state, tmpBytes, tmpBuf, rnncellDesc, batchStrideX, batchStrideH, hDesc, output, + forwardRunInfo); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/tensor_computing/src/gpu/mali/scale.cpp b/compute/tensor/src/gpu/mali/scale.cpp similarity index 53% rename from tensor_computing/src/gpu/mali/scale.cpp rename to compute/tensor/src/gpu/mali/scale.cpp index 6fb2312d..44cfc8b3 100644 --- a/tensor_computing/src/gpu/mali/scale.cpp +++ b/compute/tensor/src/gpu/mali/scale.cpp @@ -11,64 +11,74 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include "sys.h" -#include "type.h" +#include "types.h" #include "tensor_desc.h" #include "error.h" #include "gpu/mali/tensor_computing_mali.h" #include "gpu/mali/fp16/scale_mali_fp16.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" -EE scale_infer_output_size_mali(TensorDesc inputDesc, - TensorDesc* outputDesc, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc){ +EE scale_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ /*tensorDesc record cpu org data format info*/ /*gclmemDesc record gpu trans data format info*/ - if(outputDesc) *outputDesc = inputDesc; - DataType idt; + DataType idt; DataFormat idf; U32 iw, ih, ic, in; - tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); + tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); - if(idf == DF_NCHW) { + if (idf == DF_NCHW) { + if (outputDesc) { + *outputDesc = inputDesc; + } U32 ih_align = (ih + 1) / 2 * 2; - CHECK_STATUS(infer_gclmem_desc_ncwhc4(iw, ih_align, ic, 0, 0, iw, ih_align, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); - if(gclmemInputDesc && gclmemOutputDesc) *gclmemOutputDesc = *gclmemInputDesc;//the input and output mem maybe the same + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih_align, ic, 0, 0, iw, ih_align, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); + if (gclmemInputDesc && gclmemOutputDesc) { + *gclmemOutputDesc = *gclmemInputDesc; // the input and output mem maybe the same + } return SUCCESS; - } + } return NOT_SUPPORTED; } -inline EE scale_checkpara_mali(GCLHandle_t handle, - GCLMem_t alpha, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { - if(handle == nullptr || nullptr == alpha || nullptr == input || nullptr == output) return NULL_POINTER; - if(inputDesc.df != outputDesc.df || inputDesc.df != DF_NCHW) return NOT_SUPPORTED; - if(input->desc.memFormat != output->desc.memFormat || input->desc.memFormat != DF_NCWHC4) return NOT_SUPPORTED; - return SUCCESS; +inline EE scale_checkpara_mali(GCLHandle_t handle, + GCLMem_t alpha, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (handle == nullptr || nullptr == alpha || nullptr == input || nullptr == output) { + return NULL_POINTER; + } + // if(inputDesc.df 
!= outputDesc.df || inputDesc.df != DF_NCHW) return NOT_SUPPORTED; + if (input->desc.memFormat != output->desc.memFormat || input->desc.memFormat != DF_NCWHC4) { + return NOT_SUPPORTED; + } + return SUCCESS; } EE scale_mali(GCLHandle_t handle, - GCLMem_t alpha, - GCLMem_t beta, - TensorDesc inputDesc, - GCLMem_t input, - TensorDesc outputDesc, - GCLMem_t output) { + GCLMem_t alpha, + GCLMem_t beta, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output) +{ EE ret = SUCCESS; CHECK_STATUS(scale_checkpara_mali(handle, alpha, inputDesc, input, outputDesc, output)); - switch(inputDesc.dt){ - case DT_F16:{ + switch (inputDesc.dt) { + case DT_F16: { ret = scale_mali_fp16(handle, alpha, beta, inputDesc, input, outputDesc, output); break; } - case DT_I8:{ + case DT_I8: { ret = NOT_SUPPORTED; break; } @@ -78,6 +88,3 @@ EE scale_mali(GCLHandle_t handle, } return ret; } - - - diff --git a/tensor_computing/src/gpu/mali/slice.cpp b/compute/tensor/src/gpu/mali/slice.cpp similarity index 51% rename from tensor_computing/src/gpu/mali/slice.cpp rename to compute/tensor/src/gpu/mali/slice.cpp index 39a5bf02..3b903a63 100644 --- a/tensor_computing/src/gpu/mali/slice.cpp +++ b/compute/tensor/src/gpu/mali/slice.cpp @@ -11,23 +11,25 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include "sys.h" -#include "type.h" +#include "types.h" #include "tensor_desc.h" #include "error.h" #include "gpu/mali/tensor_computing_mali.h" #include "gpu/mali/fp16/slice_mali_fp16.h" -#include "gpu/mali/infer_gclmem_desc_mali.h" -EE slice_infer_output_size_mali(TensorDesc inputDesc, - std::vector<TensorDesc>* outputDesc, - I32 axis, - I32* slice_point, - GCLMemDesc_t gclmemInputDesc, - GCLMemDesc_t gclmemOutputDesc) { - if(outputDesc == NULL) CHECK_STATUS(NULL_POINTER); - U32 num = (*outputDesc).size(); +EE slice_infer_output_size_mali(TensorDesc inputDesc, + SliceParamSpec p, + std::vector<TensorDesc> *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + if (outputDesc == NULL) { + CHECK_STATUS(NULL_POINTER); + } + int axis = p.axis; + int *slice_points = p.slice_points; + U32 num = outputDesc->size(); axis = (axis + inputDesc.nDims) % inputDesc.nDims; I32 target_axis = inputDesc.nDims - 1 - axis; for (U32 i = 0; i < num; i++) { @@ -35,11 +37,11 @@ EE slice_infer_output_size_mali(TensorDesc inputDesc, I32 prev_point = 0; if (i > 0) { - prev_point = slice_point[i-1]; + prev_point = slice_points[i - 1]; } I32 next_point = inputDesc.dims[target_axis]; if (i < num - 1) { - next_point = slice_point[i]; + next_point = slice_points[i]; } if (prev_point < 0) { prev_point = (prev_point + inputDesc.dims[target_axis]) % inputDesc.dims[target_axis]; @@ -49,19 +51,21 @@ EE slice_infer_output_size_mali(TensorDesc inputDesc, } (*outputDesc)[i].dims[target_axis] = next_point - prev_point; } - if(inputDesc.df == DF_MKT) { - if(axis == 2) {//slice on T + if (inputDesc.df == DF_MKT) { + if (axis == 2) { // slice on T DataType dt; U32 m, k, t; U32 gw, gh, gc; get_nlp_mkt_val(inputDesc, &dt, &m, &k, &t); map_nlp_mkt_to_ncwhc4(m, k, t, &gw, &gh, &gc); - CHECK_STATUS(infer_gclmem_desc_ncwhc4(gw, gh, gc * 4, 0, 0, 0, 0, 0, dt, dt, gclmemInputDesc, NULL)); - if(gclmemOutputDesc) { - for(U32 i = 0; i < num; ++i) { + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + gw, gh, gc * 4, 0, 0, 0, 0, 0, dt, dt, gclmemInputDesc, NULL)); + if (gclmemOutputDesc) { + for (U32 i = 0; i < num; ++i) { get_nlp_mkt_val((*outputDesc)[i], NULL, &m, &k, &t); map_nlp_mkt_to_ncwhc4(m, k, t, &gw, &gh, &gc); - CHECK_STATUS(infer_gclmem_desc_ncwhc4(0, 0, 0, 0, 0, gw, gh, gc * 4, dt, dt, NULL, &gclmemOutputDesc[i])); + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + 0, 0, 0, 0, 0, gw, gh, gc * 4, dt, dt, NULL, &gclmemOutputDesc[i])); } } } @@ -70,41 +74,53 @@ EE slice_infer_output_size_mali(TensorDesc inputDesc, return NOT_SUPPORTED; } -inline EE slice_checkpara_mali(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - I32 axis, - std::vector<TensorDesc> outputDesc, - std::vector<void*>* output) { - if(handle == nullptr || input == nullptr) return NULL_POINTER; - if(input->desc.memFormat != DF_NCWHC4) return NOT_SUPPORTED; - for(auto p : (*output)) { - if(p == nullptr) return NULL_POINTER; - if(((GCLMem_t)p)->desc.memFormat != input->desc.memFormat) return NOT_MATCH; +inline EE slice_checkpara_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + SliceParamSpec p, + std::vector<TensorDesc> outputDesc, + std::vector<void *> *output) +{ + if (handle == nullptr || input == nullptr) { + return NULL_POINTER; + } + if (input->desc.memFormat != DF_NCWHC4) { + return NOT_SUPPORTED; + } + for (auto p : (*output)) { + if (p == nullptr) { + return NULL_POINTER; + } + if (((GCLMem_t)p)->desc.memFormat != input->desc.memFormat) { + return NOT_MATCH; + } } - if(inputDesc.df != DF_MKT) return NOT_SUPPORTED; - if(inputDesc.df == DF_MKT && axis != 2) return NOT_SUPPORTED; - for(auto p : outputDesc) { - if(p.df != inputDesc.df) return NOT_MATCH; + if (inputDesc.df != DF_MKT) { + return NOT_SUPPORTED; } - return SUCCESS; + if (inputDesc.df == DF_MKT && p.axis != 2) { + return NOT_SUPPORTED; + } + for (auto p : outputDesc) { + if (p.df != inputDesc.df) { + return NOT_MATCH; + } + } + return SUCCESS; } -EE slice_mali(GCLHandle_t handle, - TensorDesc inputDesc, - GCLMem_t input, - I32 axis, - std::vector<TensorDesc> outputDesc, - std::vector<void*>* output) { +EE slice_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + SliceParamSpec p, + std::vector<TensorDesc> outputDesc, + std::vector<void *> *output) +{ EE ret = SUCCESS; - CHECK_STATUS(slice_checkpara_mali(handle, inputDesc, input, axis, outputDesc, output)); - switch(inputDesc.dt){ - case DT_F16:{ - ret = slice_mali_fp16(handle, inputDesc, input, axis, outputDesc, output); - break; - } - case DT_I8:{ - ret = NOT_SUPPORTED; + CHECK_STATUS(slice_checkpara_mali(handle, inputDesc, input, p, outputDesc, output)); + switch (inputDesc.dt) { + case DT_F16: { + ret = slice_mali_fp16(handle, inputDesc, input, p, outputDesc, output); break; } default: @@ -113,6 +129,3 @@ EE slice_mali(GCLHandle_t handle, } return ret; } - - - diff --git a/compute/tensor/src/gpu/mali/softmax.cpp b/compute/tensor/src/gpu/mali/softmax.cpp new file mode 100644 index 00000000..4b521347 --- /dev/null +++ b/compute/tensor/src/gpu/mali/softmax.cpp @@ -0,0 +1,131 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/softmax_mali_fp16.h" + +EE softmax_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + /*tensorDesc record cpu org data format info*/ + /*gclmemDesc record gpu trans data format info*/ + if (outputDesc) { + *outputDesc = inputDesc; + } + + DataType idt; + DataFormat idf; + U32 iw, ih, ic, in; + tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); + if (gclmemInputDesc) { + if (gclmemInputDesc->memFormat == DF_NCHW) { + U32 iw_align = (iw + 3) / 4 * 4; + if ((iw == 1 && ic == 1) || (iw == 1 && ih == 1)) { + iw_align = 1; + } + bool need_pad = false; + if (iw_align != iw) { + need_pad = true; + } + CHECK_STATUS(infer_gclmem_desc_nchw(iw_align, ih, ic, 0, 0, iw_align, ih, ic, idt, idt, + gclmemInputDesc, gclmemOutputDesc, need_pad)); + } else if (gclmemInputDesc->memFormat == DF_NCWHC4) { + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + } + return SUCCESS; +} + +inline EE softmax_checkpara_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + SoftmaxParamSpec p, + TensorDesc outputDesc, + GCLMem_t output) +{ + if (handle == nullptr || nullptr == input || nullptr == output) { + return NULL_POINTER; + } + if (input->desc.memFormat != output->desc.memFormat) { + return NOT_SUPPORTED; + } + if (inputDesc.df != outputDesc.df) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[0] != outputDesc.dims[0]) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[1] != outputDesc.dims[1]) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[2] != outputDesc.dims[2]) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[3] != outputDesc.dims[3]) { + return NOT_SUPPORTED; + } + if (output->desc.memFormat != DF_NCWHC4 && output->desc.memFormat != DF_NCHW) { + return NOT_SUPPORTED; + } + if (p.axis != 1 && p.axis != 3 && p.axis != -1) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +EE softmax_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + SoftmaxParamSpec p, + GCLMem_t tmp, + TensorDesc outputDesc, + GCLMem_t output) +{ + EE ret = SUCCESS; + CHECK_STATUS(softmax_checkpara_mali(handle, inputDesc, input, p, outputDesc, output)); + switch (inputDesc.dt) { + case DT_F16: { + 
ret = softmax_mali_fp16(handle, inputDesc, input, tmp, p.axis, outputDesc, output); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE softmax_infer_forward_tmp_bytes_mali( + TensorDesc inputDesc, U32 *bytes, ForwardRunInfoMali_t forwardRunInfo) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = softmax_infer_forward_tmp_bytes_mali_fp16(inputDesc, bytes, forwardRunInfo); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/space2depth.cpp b/compute/tensor/src/gpu/mali/space2depth.cpp new file mode 100644 index 00000000..ab75ddff --- /dev/null +++ b/compute/tensor/src/gpu/mali/space2depth.cpp @@ -0,0 +1,142 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
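The DF_NCHW branch of softmax_infer_output_size_mali above pads the innermost axis up to a multiple of 4, except when the shape degenerates to a single row or column and there is nothing to pad for. A worked check of that rounding (softmaxAlignW is a hypothetical name and the values are invented for illustration):

    #include <cstdio>

    // Mirrors iw_align = (iw + 3) / 4 * 4 with the degenerate-shape exception
    // from softmax_infer_output_size_mali.
    static unsigned softmaxAlignW(unsigned iw, unsigned ih, unsigned ic)
    {
        unsigned align = (iw + 3) / 4 * 4;
        if ((iw == 1 && ic == 1) || (iw == 1 && ih == 1)) {
            align = 1;  // single row/column: skip the padding entirely
        }
        return align;
    }

    int main()
    {
        printf("%u\n", softmaxAlignW(10, 5, 3));  // 12: rounded up, so need_pad becomes true
        printf("%u\n", softmaxAlignW(8, 5, 3));   // 8: already aligned, need_pad stays false
        printf("%u\n", softmaxAlignW(1, 1, 64));  // 1: the (iw==1 && ih==1) exception
        return 0;
    }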
+ +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" + +inline EE space2depth_checkpara_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + if (handle == nullptr || nullptr == input || nullptr == output) { + return NULL_POINTER; + } + if (input->desc.memFormat != DF_NCHW) { + return NOT_SUPPORTED; + } + if (output->desc.memFormat != DF_NCWHC4) { + return NOT_SUPPORTED; + } + if (inputDesc.df != DF_NCHW) { + return NOT_SUPPORTED; + } + if (outputDesc.df != DF_NCHW) { + return NOT_SUPPORTED; + } + if (inputDesc.dt != DT_U8) { + return NOT_SUPPORTED; + } + if (outputDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[0] != outputDesc.dims[0] * 4) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[1] != outputDesc.dims[1] * 4) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[2] != outputDesc.dims[2] / 16) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[3] != outputDesc.dims[3]) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE space2depth_core_mali_fp16( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + UNUSED(outputDesc); + U32 iw, ih, ic, in; + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + U32 iw_str, ih_str, iw_off, ih_off; + iw_str = input->desc.stride[0]; + ih_str = input->desc.stride[1]; + iw_off = input->desc.offset[0]; + ih_off = input->desc.offset[1]; + U32 ow_str, oh_str, ow_off, oh_off, ohw_str; + oh_str = output->desc.stride[0]; + ow_str = output->desc.stride[1]; + oh_off = output->desc.offset[0]; + ow_off = output->desc.offset[1]; + ohw_str = oh_str * ow_str; + + cl_mem inbuf, outbuf; + inbuf = input->mem; + outbuf = output->mem; + + U32 gs[3] = {(ih + 3) / 4, (iw + 3) / 4}; + U32 ls[3] = {0, 0}; + U32 dim = 2; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "space2depth", &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, oh_str, ohw_str, ow_off, + oh_off, gs[0], gs[1], inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, "space2depth"); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, "space2depth")); + CHECK_STATUS(gcl_print_memory(handle, input, "space2depth_input")); + CHECK_STATUS(gcl_print_memory(handle, output, "space2depth_output")); +#endif + return SUCCESS; +} + +EE space2depth_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + /*tensorDesc record cpu org data format info*/ + /*gclmemDesc record gpu trans data format info*/ + *outputDesc = inputDesc; + + DataType idt; + DataFormat idf; + U32 iw, ih, ic, in; + U32 ow, oh, oc, on; + tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); + if (idt != DT_U8) { + return NOT_SUPPORTED; + } + if (ic != 1) { + return NOT_SUPPORTED; + } + on = in; + oc = ic * 16; + oh = ih / 4; + ow = iw / 4; + + if (idf == DF_NCHW) { + if (outputDesc) { + *outputDesc = tensor4df(DT_F16, idf, on, oc, oh, ow); + } + CHECK_STATUS( + infer_gclmem_desc_nchw(iw, ih, ic, 0, 0, 0, 0, 0, DT_U8, DT_U8, gclmemInputDesc, NULL)); + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + 0, 0, 0, 0, 0, ow, oh, oc, DT_F16, DT_F16, NULL, gclmemOutputDesc)); + return SUCCESS; + } + return NOT_SUPPORTED; +} + +EE space2depth_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + EE ret = SUCCESS; + 
CHECK_STATUS(space2depth_checkpara_mali(handle, inputDesc, input, outputDesc, output)); + CHECK_STATUS(space2depth_core_mali_fp16(handle, inputDesc, input, outputDesc, output)); + return ret; +} diff --git a/compute/tensor/src/gpu/mali/squeeze.cpp b/compute/tensor/src/gpu/mali/squeeze.cpp new file mode 100644 index 00000000..e4f980ef --- /dev/null +++ b/compute/tensor/src/gpu/mali/squeeze.cpp @@ -0,0 +1,96 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/squeeze_mali_fp16.h" + +EE squeeze_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + /*tensorDesc record cpu org data format info*/ + /*gclmemDesc record gpu trans data format info*/ + if (outputDesc) { + *outputDesc = inputDesc; + } + + DataType idt; + DataFormat idf; + U32 iw, ih, ic, in; + tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); + if (gclmemInputDesc && gclmemOutputDesc) { + *gclmemOutputDesc = *gclmemInputDesc; + } + return SUCCESS; +} + +inline EE squeeze_checkpara_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + if (handle == nullptr || nullptr == input || nullptr == output) { + return NULL_POINTER; + } + if (input->desc.memFormat != output->desc.memFormat) { + return NOT_SUPPORTED; + } + if (inputDesc.df != outputDesc.df) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[0] != outputDesc.dims[0]) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[1] != outputDesc.dims[1]) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[2] != outputDesc.dims[2]) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[3] != outputDesc.dims[3]) { + return NOT_SUPPORTED; + } + if (outputDesc.df != DF_NCHW) { + return NOT_SUPPORTED; + } + if (output->desc.memFormat != DF_NCWHC4) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +EE squeeze_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + EE ret = SUCCESS; + CHECK_STATUS(squeeze_checkpara_mali(handle, inputDesc, input, outputDesc, output)); + switch (inputDesc.dt) { + case DT_F16: { + ret = squeeze_mali_fp16(handle, inputDesc, input, 
outputDesc, output); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/tensor_computing_mali.h b/compute/tensor/src/gpu/mali/tensor_computing_mali.h new file mode 100644 index 00000000..a750b321 --- /dev/null +++ b/compute/tensor/src/gpu/mali/tensor_computing_mali.h @@ -0,0 +1,785 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_TENSOR_COMPUTING_MALI +#define _H_TENSOR_COMPUTING_MALI +#include "types.h" +#include "tensor_computing_type.h" + +EE pooling_infer_forward_tmp_bytes_mali( + TensorDesc inputDesc, U32 *bytes, ForwardRunInfoMali_t forwardRunInfo); + +EE pooling_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + PoolingParamSpec poolingParamSpec, + const void *scale, + GCLMem_t temp, + TensorDesc outputDesc, + GCLMem_t output); + +EE pooling_infer_output_size_mali(TensorDesc inputDesc, + PoolingParamSpec poolingParamSpec, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE padding_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + PadParamSpec padParamSpec, + TensorDesc outputDesc, + GCLMem_t output); + +EE padding_infer_output_size_mali(TensorDesc inputDesc, + PadParamSpec padParamSpec, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE convolution_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc filterDesc, + ConvolutionParamSpec convParamSpec, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE convolution_infer_forward_algorithm_mali(GCLHandle_t handle, + TensorDesc inputDesc, + TensorDesc filterDesc, + ConvolutionParamSpec convParamSpec, + TensorDesc outputDesc, + GCLMemDesc inputMemDesc, + GCLMemDesc outputMemDesc, + ConvolutionPolicy policy, + ActivationMode activationMode, + ForwardRunInfoMali_t forwardRunInfo); + +EE convolution_transform_filter_bytes_mali(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes); + +EE convolution_transform_filter_mali(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + GCLMem_t tmp, + TensorDesc *fltmemDesc, + GCLMem_t fltmem); + +EE convolution_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + 
ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE convolution_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc scaleDesc, + const GCLMem_t scale, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode); + +EE depthwise_pointwise_convolution_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + ConvolutionParamSpec convParamSpec, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE depthwise_pointwise_convolution_infer_forward_algorithm_mali(GCLHandle_t handle, + TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionPolicy policy, + ActivationMode depthwiseActivationMode, + ActivationMode pointwiseActivationMode, + ForwardRunInfoMali_t forwardRunInfo); + +EE depthwise_pointwise_convolution_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE depthwise_pointwise_convolution_transform_filter_bytes_mali(TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemDwFilterDesc, + GCLMemDesc_t gclmemPwFilterDesc, + U32 *bytes); + +EE depthwise_pointwise_convolution_transform_filter_mali(GCLHandle_t handle, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + GCLMem_t dwFilter, + GCLMem_t pwFilter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *dwfltmemDesc, + TensorDesc *pwfltmemDesc, + GCLMem_t dwfltmem, + GCLMem_t pwfltmem); + +EE depthwise_pointwise_convolution_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc dwFilterDesc, + TensorDesc pwFilterDesc, + const GCLMem_t dwFilter, + const GCLMem_t pwFilter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc dwBiasDesc, + TensorDesc pwBiasDesc, + const GCLMem_t dwBias, + const GCLMem_t pwBias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode, + ActivationMode pointwiseActivationMode); + +EE depthwise_convolution_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc filterDesc, + ConvolutionParamSpec convParamSpec, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE depthwise_convolution_infer_forward_algorithm_mali(GCLHandle_t handle, + TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ConvolutionPolicy policy, + ActivationMode depthwiseActivationMode, + ForwardRunInfoMali_t forwardRunInfo); + +EE depthwise_convolution_transform_filter_bytes_mali(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes); + +EE depthwise_convolution_transform_filter_mali(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem); + +EE depthwise_convolution_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc 
outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE depthwise_convolution_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode depthwiseActivationMode); + +EE deconvolution_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc filterDesc, + ConvolutionParamSpec convParamSpec, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE deconvolution_infer_forward_algorithm_mali(GCLHandle_t handle, + TensorDesc inputDesc, + TensorDesc filterDesc, + ConvolutionParamSpec convParamSpec, + TensorDesc outputDesc, + ConvolutionPolicy policy, + ActivationMode activationMode, + ForwardRunInfoMali_t forwardRunInfo); + +EE deconvolution_transform_filter_bytes_mali(TensorDesc filterDesc, + ForwardRunInfoMali_t forwardRunInfo, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes); + +EE deconvolution_transform_filter_mali(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + GCLMem_t tmp, + TensorDesc *fltmemDesc, + GCLMem_t fltmem); + +EE deconvolution_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE deconvolution_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc scaleDesc, + const GCLMem_t scale, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode); + +EE bilateral_slice_apply_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc guideDesc, + TensorDesc gridDesc, + BilateralSliceApplyParamSpec bilateralSliceApplyParamSpec, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemGuideDesc, + GCLMemDesc_t gclmemGridDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE bilateral_slice_apply_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc guideDesc, + TensorDesc gridDesc, + BilateralSliceApplyParamSpec bilateralSliceApplyParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE bilateral_slice_apply_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc guideDesc, + const GCLMem_t guide, + TensorDesc gridDesc, + const GCLMem_t grid, + BilateralSliceApplyParamSpec bilateralSliceApplyParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output); + +EE eltwise_infer_output_size_mali(std::vector<TensorDesc> inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE eltwise_mali(GCLHandle_t handle, + std::vector<TensorDesc> inputDesc, + std::vector<void *> input, + EltwiseParamSpec eltwiseDesc, + TensorDesc outputDesc, + GCLMem_t output); + +EE softmax_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE softmax_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + SoftmaxParamSpec p, + GCLMem_t tmp, +
TensorDesc outputDesc, + GCLMem_t output); + +EE softmax_infer_forward_tmp_bytes_mali( + TensorDesc inputDesc, U32 *bytes, ForwardRunInfoMali_t forwardRunInfo); + +EE activation_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE activation_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode); + +EE fully_connected_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE fully_connected_infer_forward_algorithm_mali(GCLHandle_t handle, + TensorDesc inputDesc, + TensorDesc filterDesc, + std::vector<TensorDesc> outputDescs, + ForwardRunInfoMali_t forwardRunInfo); + +EE fully_connected_transform_filter_bytes_mali(TensorDesc filterDesc, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo); + +EE fully_connected_transform_filter_mali(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + TensorDesc *fltmemDesc, + std::vector<GCLMem_t> fltmem, + ForwardRunInfoMali_t forwardRunInfo); + +EE fully_connected_infer_forward_tmp_bytes_mali( + TensorDesc inputDesc, TensorDesc filterDesc, U32 *bytes, ForwardRunInfoMali_t forwardRunInfo); + +EE fully_connected_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + std::vector<GCLMem_t> *filter, + TensorDesc biasDesc, + std::vector<GCLMem_t> *bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + std::vector<GCLMem_t> *output, + ForwardRunInfoMali_t forwardRunInfo); + +EE scale_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE scale_mali(GCLHandle_t handle, + GCLMem_t alpha, + GCLMem_t beta, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc outputDesc, + GCLMem_t output); + +EE prelu_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE prelu_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + GCLMem_t weight, + PReLUParamSpec preluDesc, + TensorDesc outputDesc, + GCLMem_t output); + +EE concat_infer_output_size_mali(std::vector<TensorDesc> inputDesc, + ConcatParamSpec p, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE concat_infer_forward_tmp_bytes_mali(std::vector<TensorDesc> inputDesc, U32 *bytes); + +EE concat_mali(GCLHandle_t handle, + std::vector<TensorDesc> inputDesc, + std::vector<void *> input, + GCLMem_t inputScale, + ConcatParamSpec p, + GCLMem_t tmpbuf, + TensorDesc outputDesc, + GCLMem_t output, + GCLMem_t outputScale); + +EE clip_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE clip_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + ClipParamSpec p, + TensorDesc outputDesc, + GCLMem_t output); + +EE squeeze_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE squeeze_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output); + +EE unsqueeze_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE unsqueeze_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input,
TensorDesc outputDesc, GCLMem_t output); + +EE reshape_infer_output_size_mali(TensorDesc inputDesc, + ReshapeParamSpec p, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE reshape_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc, + U32 *bytes); + +EE reshape_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + GCLMem_t tmpbuf, + TensorDesc outputDesc, + GCLMem_t output); + +EE space2depth_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE space2depth_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output); + +EE depth2space_infer_output_size_mali(TensorDesc inputDesc, + Depth2SpaceParamSpec p, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE depth2space_infer_tmpBuf_size_mali( + TensorDesc inputDesc, Depth2SpaceParamSpec p, TensorDesc outputDesc, U32 *bytes); + +EE depth2space_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + Depth2SpaceParamSpec p, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output); + +EE embedding_infer_output_size_mali(TensorDesc inputDesc, + EmbedParamSpec p, + DataType dt, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE embedding_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + TensorDesc weightDesc, + GCLMem_t weight, + EmbedParamSpec p, + TensorDesc outputDesc, + GCLMem_t output); + +EE normalization_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE layer_normalization_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + GCLMem_t alpha, + GCLMem_t beta, + TensorDesc outputDesc, + GCLMem_t output); + +EE matmul_infer_output_size_mali(TensorDesc matrixADesc, + bool transposeA, + TensorDesc matrixBDesc, + bool transposeB, + TensorDesc *matrixCDesc, + GCLMemDesc_t gclmemMatrixADesc, + GCLMemDesc_t gclmemMatrixBDesc, + GCLMemDesc_t gclmemMatrixCDesc); + +EE matmul_infer_forward_algorithm_mali(GCLHandle_t handle, + TensorDesc matrixADesc, + bool TransposeA, + TensorDesc matrixBDesc, + bool TransposeB, + TensorDesc matrixCDesc, + ForwardRunInfoMali_t forwardRunInfo); + +EE matmul_infer_forward_tmp_bytes_mali(TensorDesc matrixADesc, + bool transposeA, + TensorDesc matrixBDesc, + bool transposeB, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo); + +EE matmul_mali(GCLHandle_t handle, + TensorDesc matrixADesc, + bool transposeA, + const GCLMem_t matrixA, + TensorDesc matrixBDesc, + bool transposeB, + const GCLMem_t matrixB, + GCLMem_t tmp, + TensorDesc matrixCDesc, + GCLMem_t matrixC, + ForwardRunInfoMali_t forwardRunInfo); + +EE power_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE power_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + PowerParamSpec p, + TensorDesc outputDesc, + GCLMem_t output); + +EE transpose_infer_output_size_mali(TensorDesc inputDesc, + TransposeParamSpec p, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE transpose_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t 
gclmemOutputDesc, + U32 *bytes); + +EE transpose_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TransposeParamSpec p, + GCLMem_t tmpbuf, + TensorDesc outputDesc, + GCLMem_t output); + +EE slice_infer_output_size_mali(TensorDesc inputDesc, + SliceParamSpec p, + std::vector<TensorDesc> *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE slice_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + SliceParamSpec p, + std::vector<TensorDesc> outputDesc, + std::vector<void *> *output); + +EE rnncell_infer_output_size_mali(TensorDesc inputDesc, + RNNParamSpec rnnParamSpec, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemStateDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE rnncell_infer_forward_algorithm_mali(GCLHandle_t handle, + TensorDesc xDesc, + TensorDesc filterDesc, + TensorDesc biasDesc, + RNNParamSpec rnncellDesc, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + ForwardRunInfoMali_t forwardRunInfo); + +EE rnn_transform_filter_bytes_mali(TensorDesc filterDesc, + RNNParamSpec rnnParamSpec, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo); + +EE rnn_transform_filter_mali(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + RNNParamSpec rnnParamSpec, + TensorDesc *fltmemDesc, + GCLMem_t fltmem, + ForwardRunInfoMali_t forwardRunInfo); + +EE rnncell_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + RNNParamSpec rnncellDesc, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo); + +EE rnncell_mali(GCLHandle_t handle, + TensorDesc xDesc, + const GCLMem_t currentX, + TensorDesc filterDesc, + GCLMem_t filter, + TensorDesc biasDesc, + GCLMem_t bias, + GCLMem_t state, + RNNParamSpec rnncellDesc, + U32 batchStrideX, + U32 batchStrideH, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc hDesc, + GCLMem_t output, + ForwardRunInfoMali_t forwardRunInfo); + +EE argmax_infer_output_size_mali(TensorDesc inputDesc, + ArgMaxParamSpec p, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE argmax_infer_forward_tmp_bytes_mali( + TensorDesc inputDesc, ArgMaxParamSpec p, TensorDesc outputDesc, U32 *bytes); + +EE argmax_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + ArgMaxParamSpec p, + GCLMem_t tmpbuf, + TensorDesc outputDesc, + GCLMem_t output); + +EE preallocated_memory_infer_output_size_mali(TensorDesc *outputDesc, GCLMemDesc_t gclmemOutputDesc); + +EE preallocated_memory_mali(GCLHandle_t handle, TensorDesc outputDesc, GCLMem_t output); + +EE copy_infer_output_size_mali(std::vector<TensorDesc> inputDesc, GCLMemDesc_t gclmemInputDesc); + +EE copy_mali(GCLHandle_t handle, + std::vector<TensorDesc> inputDesc, + std::vector<void *> input, + U32 srcOffset, + U32 dstOffset, + U32 srcStride, + U32 dstStride, + U32 length); + +EE check_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputADesc, + GCLMemDesc_t gclmemInputBDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE check_mali(GCLHandle_t handle, + TensorDesc inputDescA, + GCLMem_t inputA, + TensorDesc inputDescB, + GCLMem_t inputB, + CheckParamSpec p, + TensorDesc outputDesc, + GCLMem_t output); + +EE multihead_attention_infer_output_size_mali(TensorDesc inputDesc, + std::vector<TensorDesc> filterDesc, + TensorDesc *outputDesc, + U32 *firstFCSliceNum, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc, + ForwardRunInfoMali_t forwardRunInfo); + +EE multihead_attention_infer_forward_algorithm_mali(GCLHandle_t
handle, + TensorDesc inputDesc, + std::vector<TensorDesc> filterDesc, + void *multiplyAlpha, + void *multiplyBeta, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + std::vector<bool> eltwiseWithLayerNormIn, + ActivationMode activation, + TensorDesc outputDesc, + ForwardRunInfoMali_t forwardRunInfo); + +EE multihead_attention_transform_filter_bytes_mali(std::vector<TensorDesc> filterDesc, + GCLMemDesc_t gclmemFilterDesc, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo); + +EE multihead_attention_transform_filter_mali(GCLHandle_t handle, + std::vector<TensorDesc> filterDesc, + std::vector<void *> filter, + std::vector<TensorDesc> *fltmemDesc, + std::vector<void *> fltmem, + ForwardRunInfoMali_t forwardRunInfo); + +EE multihead_attention_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + std::vector<TensorDesc> filterDesc, + std::vector<bool> eltwiseWithLayerNormIn, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + U32 *bytes, + ForwardRunInfoMali_t forwardRunInfo); + +EE multihead_attention_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + std::vector<TensorDesc> filterDesc, + std::vector<void *> filter, + std::vector<TensorDesc> biasDesc, + std::vector<void *> bias, + std::vector<void *> layerNormAlpha, + std::vector<void *> layerNormBeta, + void *multiplyAlpha, + void *multiplyBeta, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + std::vector<bool> eltwiseWithLayerNormIn, + ActivationMode activation, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ForwardRunInfoMali_t forwardRunInfo); + +EE channel_resize_infer_output_size_mali(TensorDesc inputDesc, + ChannelResizeParamSpec p, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc); + +EE channel_resize_mali(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + ChannelResizeParamSpec p, + TensorDesc outputDesc, + GCLMem_t output); +#endif diff --git a/compute/tensor/src/gpu/mali/transpose.cpp b/compute/tensor/src/gpu/mali/transpose.cpp new file mode 100644 index 00000000..568cddce --- /dev/null +++ b/compute/tensor/src/gpu/mali/transpose.cpp @@ -0,0 +1,114 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
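+// Editorial note (not part of the original patch): a hypothetical worked example of
+// the dimension bookkeeping in transpose_infer_output_size_mali below. TensorDesc
+// stores dims innermost-first, while p.trans_dims indexes from the outermost axis:
+//   input NCHW = (N=1, C=8, H=4, W=2)  =>  inputDesc.dims = {2, 4, 8, 1}
+//   p.trans_dims = {0, 2, 3, 1}        =>  NCHW -> NHWC, nDims = 4
+//   dimTran[nDims - 1 - i] = dims[nDims - 1 - trans_dims[i]]:
+//     i=0: dimTran[3] = dims[3] = 1 (N)    i=1: dimTran[2] = dims[1] = 4 (H)
+//     i=2: dimTran[1] = dims[0] = 2 (W)    i=3: dimTran[0] = dims[2] = 8 (C)
+//   outputDesc.dims = {8, 2, 4, 1}, i.e. the NHWC shape (1, 4, 2, 8)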
+ +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/transpose_mali_fp16.h" + +EE transpose_infer_output_size_mali(TensorDesc inputDesc, + TransposeParamSpec p, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + if (outputDesc == nullptr || gclmemInputDesc == nullptr || gclmemOutputDesc == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + U32 *dim = p.trans_dims; + U32 dimTran[6] = {1, 1, 1, 1, 1, 1}; + U32 nDims = inputDesc.nDims; + for (U32 i = 0; i < nDims; ++i) { + dimTran[nDims - 1 - i] = inputDesc.dims[nDims - 1 - dim[i]]; + } + *outputDesc = inputDesc; + for (U32 i = 0; i < nDims; ++i) { + (*outputDesc).dims[i] = dimTran[i]; + } + + DataType idt; + DataType odt; + U32 iw, ih, ic, in, it; + U32 ow, oh, oc, on, ot; + tensorSelectGet(inputDesc, &idt, NULL, &in, &ic, &ih, &iw, &it); + tensorSelectGet(*outputDesc, &odt, NULL, &on, &oc, &oh, &ow, &ot); + if (gclmemInputDesc->byteSize == 0 || gclmemInputDesc->memFormat == DF_NCHW) { + CHECK_STATUS( + infer_gclmem_desc_nchw(iw, ih, ic * it, 0, 0, 0, 0, 0, idt, odt, gclmemInputDesc, NULL)); + } else { + ic = ALIGN(ic, 4); + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih, ic * it, 0, 0, 0, 0, 0, idt, odt, gclmemInputDesc, NULL)); + } + CHECK_STATUS( + infer_gclmem_desc_nchw(0, 0, 0, 0, 0, ow, oh, oc * ot, idt, odt, NULL, gclmemOutputDesc)); + return SUCCESS; +} + +inline EE transpose_checkpara_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + if (handle == nullptr || input == nullptr || output == nullptr) { + return NULL_POINTER; + } + + if (inputDesc.df != outputDesc.df) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +EE transpose_infer_forward_tmp_bytes_mali(TensorDesc inputDesc, + TensorDesc outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc, + U32 *bytes) +{ + EE ret = SUCCESS; + switch (inputDesc.dt) { + case DT_F16: { + ret = transpose_infer_forward_tmp_bytes_mali_fp16( + inputDesc, outputDesc, gclmemInputDesc, gclmemOutputDesc, bytes); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE transpose_mali(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TransposeParamSpec p, + GCLMem_t tmpbuf, + TensorDesc outputDesc, + GCLMem_t output) +{ + EE ret = SUCCESS; + CHECK_STATUS(transpose_checkpara_mali(handle, inputDesc, input, outputDesc, output)); + switch (inputDesc.dt) { + case DT_F16: { + ret = transpose_mali_fp16( + handle, inputDesc, input, outputDesc, output, tmpbuf, p.trans_dims); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.cpp b/compute/tensor/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.cpp new file mode 100644 index 00000000..e9c87b61 --- /dev/null +++ b/compute/tensor/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.cpp @@ -0,0 +1,130 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "types.h" +#include "error.h" +#include "gpu/mali/uchar/bilateral_slice_apply_mali_uchar.h" + +inline EE bilateral_slice_apply_checkpara_mali_uchar( + TensorDesc inputDesc, TensorDesc guideDesc, TensorDesc gridDesc, TensorDesc outputDesc) +{ + if (inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_U8) { + return NOT_SUPPORTED; + } + if (gridDesc.dt != guideDesc.dt || gridDesc.dt != DT_F16) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline EE bilateral_slice_apply_core_mali_uchar(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc guideDesc, + const GCLMem_t guide, + TensorDesc gridDesc, + const GCLMem_t grid, + BilateralSliceApplyParamSpec bilateralSliceApplyParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output) +{ + UNUSED(guideDesc); + UNUSED(forwardRunInfo); + U32 iw, ih, ic, in; + U32 gw, gh, gc, gn; + U32 ow, oh, oc, on; + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + tensorSelectGet(gridDesc, NULL, NULL, &gn, &gc, &gh, &gw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + + U32 coe = bilateralSliceApplyParamSpec.coefficient_len; + BilateralSliceApplyMode mode = bilateralSliceApplyParamSpec.mode; + U32 dep = gc / coe; + U32 gcw = gc * gw; + U32 wh = iw * ih; + F32 scale_x = (F32)gw / iw; + F32 scale_y = (F32)gh / ih; + Mem inbuf, gridbuf, guidebuf, outbuf, gridTran; + inbuf = input->mem; + gridbuf = grid->mem; + outbuf = output->mem; + gridTran = tmpBuf->mem; + if (mode == BSliceApply_NULL) { + guidebuf = guide->mem; + } else { + guidebuf = inbuf; + } + + U32 gs0[3] = {gc / 4, gw, ih}; + U32 ls0[3] = {0, 0, 0}; + U32 dim0 = 3; + Kernel kernel; + CHECK_STATUS(gcl_create_kernel(handle, "bilateral_slice_apply_pre", &kernel)); + CHECK_STATUS( + gcl_set_kernelArgs(kernel, gh, gc, gcw, gs0[0], gs0[1], scale_y, gridbuf, gridTran)); + gcl_set_kernelVec(handle, kernel, dim0, gs0, ls0, "bilateral_slice_apply_pre"); + +#ifdef _DEBUG + CHECK_STATUS( + gcl_run_kernel_profiling(handle, kernel, dim0, gs0, ls0, "bilateral_slice_apply_pre")); + CHECK_STATUS(gcl_print_memory(handle, grid, "bilateral_slice_apply_grid")); +#endif + U32 gs[2] = {ow, oh}; + U32 ls[2] = {0, 0}; + U32 dim = 2; + char kernelname[128]; + if (mode == BSliceApply_CONV) { + sprintf(kernelname, "bilateral_slice_apply_c12_conv_uchar"); + } else { + sprintf(kernelname, "bilateral_slice_apply_c12_uchar"); + } + 
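+// Editorial note: the _conv suffix appears to select the kernel variant that derives
+// the guide map from the input inside the kernel (mode BSliceApply_CONV); the plain
+// variant reads the caller-supplied guide buffer. For every mode other than
+// BSliceApply_NULL, guidebuf was aliased to inbuf above.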
CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw, wh, gc, gw, gh, gcw, dep, coe, gs[0], gs[1], + scale_x, scale_y, guidebuf, gridTran, inbuf, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelname); + +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel_profiling(handle, kernel, dim, gs, ls, kernelname)); + CHECK_STATUS(gcl_print_memory(handle, input, "bilateral_slice_apply_input")); + CHECK_STATUS(gcl_print_memory(handle, output, "bilateral_slice_apply_output")); + if (mode == BSliceApply_NULL) { + CHECK_STATUS(gcl_print_memory(handle, guide, "bilateral_slice_apply_guide")); + } +#endif + return SUCCESS; +} + +EE bilateral_slice_apply_mali_uchar(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc guideDesc, + const GCLMem_t guide, + TensorDesc gridDesc, + const GCLMem_t grid, + BilateralSliceApplyParamSpec bilateralSliceApplyParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output) +{ + UNUSED(tmpBytes); + CHECK_STATUS( + bilateral_slice_apply_checkpara_mali_uchar(inputDesc, guideDesc, gridDesc, outputDesc)); + CHECK_STATUS(bilateral_slice_apply_core_mali_uchar(handle, inputDesc, input, guideDesc, guide, + gridDesc, grid, bilateralSliceApplyParamSpec, forwardRunInfo, tmpBuf, outputDesc, output)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.h b/compute/tensor/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.h new file mode 100644 index 00000000..c854698b --- /dev/null +++ b/compute/tensor/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.h @@ -0,0 +1,34 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
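+// Editorial note: the datatype contract enforced by
+// bilateral_slice_apply_checkpara_mali_uchar in the implementation above is
+//   inputDesc.dt == outputDesc.dt == DT_U8   (8-bit image in and out)
+//   guideDesc.dt == gridDesc.dt  == DT_F16   (guide map and bilateral grid)
+// Any other combination returns NOT_SUPPORTED before a kernel is enqueued.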
+ +#ifndef _BILATERAL_SLICE_APPLY_MALI_UCHAR +#define _BILATERAL_SLICE_APPLY_MALI_UCHAR +#include "sys.h" +#include "types.h" +#include "error.h" +#include "tensor_computing_type.h" + +EE bilateral_slice_apply_mali_uchar(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc guideDesc, + const GCLMem_t guide, + TensorDesc gridDesc, + const GCLMem_t grid, + BilateralSliceApplyParamSpec bilateralSliceApplyParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output); +#endif diff --git a/compute/tensor/src/gpu/mali/unsqueeze.cpp b/compute/tensor/src/gpu/mali/unsqueeze.cpp new file mode 100644 index 00000000..ccdc6d9b --- /dev/null +++ b/compute/tensor/src/gpu/mali/unsqueeze.cpp @@ -0,0 +1,102 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
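+// Editorial note on the DF_MKT branch below: NLP tensors carry (m, k, t) instead of
+// NCHW, so they are first mapped onto the GPU NCWHC4 image layout;
+// map_nlp_mkt_to_ncwhc4 reports the channel extent in 4-element slices, which is
+// why ic is multiplied back by 4 before the gclmem descriptors are inferred.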
+ +#include "sys.h" +#include "types.h" +#include "tensor_desc.h" +#include "error.h" +#include "gpu/mali/tensor_computing_mali.h" +#include "gpu/mali/fp16/unsqueeze_mali_fp16.h" + +EE unsqueeze_infer_output_size_mali(TensorDesc inputDesc, + TensorDesc *outputDesc, + GCLMemDesc_t gclmemInputDesc, + GCLMemDesc_t gclmemOutputDesc) +{ + /* tensorDesc records the original CPU data format info */ + /* gclmemDesc records the transformed GPU data format info */ + if (outputDesc) { + *outputDesc = inputDesc; + } + + DataType idt; + DataFormat idf; + U32 iw, ih, ic, in; + if (inputDesc.df == DF_NCHW) { + tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); + } else if (inputDesc.df == DF_MKT) { + U32 m, k, t; + get_nlp_mkt_val(inputDesc, &idt, &m, &k, &t); + map_nlp_mkt_to_ncwhc4(m, k, t, &iw, &ih, &ic); + ic = ic * 4; + in = 1; + idf = DF_MKT; + } else { + return NOT_SUPPORTED; + } + + CHECK_STATUS(infer_gclmem_desc_ncwhc4( + iw, ih, ic, 0, 0, iw, ih, ic, idt, idt, gclmemInputDesc, gclmemOutputDesc)); + return SUCCESS; +} + +inline EE unsqueeze_checkpara_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + if (handle == nullptr || nullptr == input || nullptr == output) { + return NULL_POINTER; + } + if (input->desc.memFormat != output->desc.memFormat) { + return NOT_SUPPORTED; + } + if (inputDesc.df != outputDesc.df) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[0] != outputDesc.dims[0]) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[1] != outputDesc.dims[1]) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[2] != outputDesc.dims[2]) { + return NOT_SUPPORTED; + } + if (inputDesc.dims[3] != outputDesc.dims[3]) { + return NOT_SUPPORTED; + } + if (output->desc.memFormat != DF_NCWHC4) { + return NOT_SUPPORTED; + } + return SUCCESS; +} + +EE unsqueeze_mali( + GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, TensorDesc outputDesc, GCLMem_t output) +{ + EE ret = SUCCESS; + CHECK_STATUS(unsqueeze_checkpara_mali(handle, inputDesc, input, outputDesc, output)); + switch (inputDesc.dt) { + case DT_F16: { + ret = unsqueeze_mali_fp16(handle, inputDesc, input, outputDesc, output); + break; + } + case DT_I8: { + ret = NOT_SUPPORTED; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/l2normalization.cpp b/compute/tensor/src/l2normalization.cpp new file mode 100644 index 00000000..ec3668c1 --- /dev/null +++ b/compute/tensor/src/l2normalization.cpp @@ -0,0 +1,65 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif + +inline EE l2normalization_infer_output_size_cpu(TensorDesc inputDesc, TensorDesc *outputDesc) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + if (tensorIs2d(inputDesc) || tensorIs3d(inputDesc)) { + *outputDesc = inputDesc; + } else if (tensorIs4d(inputDesc) && inputDesc.dims[0] == 1 && inputDesc.dims[1] == 1) { + *outputDesc = inputDesc; + } else { + CHECK_STATUS(NOT_MATCH); + } + return SUCCESS; +} + +EE l2normalization_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + UNUSED(archInfo); + CHECK_STATUS(l2normalization_infer_output_size_cpu(inputDesc, &outputDesc)); + outputTensor->resize(outputDesc); + return SUCCESS; +} + +EE l2normalization(Tensor inputTensor, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = l2normalization_cpu(inputDesc, input, outputDesc, output, arch); +#endif + } + return ret; +} diff --git a/compute/tensor/src/matmul.cpp b/compute/tensor/src/matmul.cpp new file mode 100644 index 00000000..670b3f1e --- /dev/null +++ b/compute/tensor/src/matmul.cpp @@ -0,0 +1,384 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
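+// Editorial note (hypothetical values): the shape rules implemented by
+// matmul_infer_output_size_cpu below, with dims stored innermost-first:
+//   A: M=2 x K=3, transposeA = false  =>  A.dims = {3, 2}, kDimA = 0
+//   B: K=3 x N=4, transposeB = false  =>  B.dims = {4, 3}, kDimB = 1
+//   shared-dim check: A.dims[kDimA] == B.dims[kDimB]  =>  3 == 3, OK
+//   C inherits A's desc with dims[kDimA] = B.dims[1 - kDimB]  =>  C.dims = {4, 2} (a 2x4 matrix)
+// Leading batch dims are matched pairwise; a size-1 dim on either side broadcasts.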
+ +#include "tensor_computing.h" +#include "blas_enhance.h" +#include <string.h> +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +EE matmul_infer_output_size_cpu(TensorDesc matrixADesc, + bool transposeA, + TensorDesc matrixBDesc, + bool transposeB, + TensorDesc *matrixCDesc) +{ + if (matrixCDesc == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + + if (DT_I8 == matrixADesc.dt || DT_I8 == matrixBDesc.dt) { + matrixADesc.dt = DT_I8; + matrixBDesc.dt = DT_I8; + } + + if (matrixADesc.dt != matrixBDesc.dt || matrixADesc.nDims < 2) { + CHECK_STATUS(NOT_MATCH); + } + + if (DF_NCHWC8 == matrixADesc.df && 4 == matrixADesc.nDims) { + CHECK_REQUIREMENT(1 == matrixADesc.dims[1] && 1 == matrixADesc.dims[0]); + } + + if (DF_NCHWC8 == matrixBDesc.df && 4 == matrixBDesc.nDims) { + CHECK_REQUIREMENT(1 == matrixBDesc.dims[1] && 1 == matrixBDesc.dims[0]); + } + + int i = 0; + int j = 0; + int dimA = matrixADesc.nDims; + int dimB = matrixBDesc.nDims; + while (i < dimA - 2 || j < dimB - 2) { + if (matrixADesc.dims[dimA - 1 - i] != matrixBDesc.dims[dimB - 1 - j]) { + if (matrixADesc.dims[dimA - 1 - i] == 1) { + i++; + continue; + } + if (matrixBDesc.dims[dimB - 1 - j] == 1) { + j++; + continue; + } + CHECK_STATUS(NOT_MATCH); + } else { + i++; + j++; + } + } + if (i != dimA - 2 || j != dimB - 2) { + CHECK_STATUS(NOT_MATCH); + } + + U32 kDimA, kDimB; + if (transposeA) { + kDimA = 1; + } else { + kDimA = 0; + } + if (transposeB) { + kDimB = 0; + } else { + kDimB = 1; + } + + if (matrixADesc.dims[kDimA] != matrixBDesc.dims[kDimB]) { + CHECK_STATUS(NOT_MATCH); + } + + *matrixCDesc = matrixADesc; + (*matrixCDesc).dims[kDimA] = matrixBDesc.dims[1 - kDimB]; + if (transposeA) { + U32 tmp = (*matrixCDesc).dims[0]; + (*matrixCDesc).dims[0] = (*matrixCDesc).dims[1]; + (*matrixCDesc).dims[1] = tmp; + } + return SUCCESS; +} + +EE matmul_infer_output_size(Tensor *matrixATensor, + bool transposeA, + Tensor *matrixBTensor, + bool transposeB, + Tensor *matrixCTensor, + ArchInfo_t archInfo) +{ + if (matrixATensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (matrixBTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (matrixCTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc matrixADesc = matrixATensor->get_desc(); + TensorDesc matrixBDesc = matrixBTensor->get_desc(); + TensorDesc matrixCDesc = matrixCTensor->get_desc(); + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemMatrixADesc = ocl_get_desc(*matrixATensor); + GCLMemDesc gclmemMatrixBDesc = ocl_get_desc(*matrixBTensor); + GCLMemDesc gclmemMatrixCDesc = ocl_get_desc(*matrixCTensor); + CHECK_STATUS(matmul_infer_output_size_mali(matrixADesc, transposeA, matrixBDesc, transposeB, + &matrixCDesc, &gclmemMatrixADesc, &gclmemMatrixBDesc, &gclmemMatrixCDesc)); + ocl_set_desc(matrixATensor, gclmemMatrixADesc); + ocl_set_desc(matrixBTensor, gclmemMatrixBDesc); + ocl_set_desc(matrixCTensor, gclmemMatrixCDesc); +#endif + } else { + CHECK_STATUS(matmul_infer_output_size_cpu( + matrixADesc, transposeA, matrixBDesc, transposeB, &matrixCDesc)); + } + matrixCTensor->resize(matrixCDesc); + return SUCCESS; +} + +EE matmul_infer_forward_algorithm(Tensor matrixATensor, + bool transposeA, + Tensor matrixBTensor, + bool transposeB, + Tensor matrixCTensor, + ArchInfo_t archInfo) +{ +#ifdef _USE_MALI + if (IS_MALI_GPU(archInfo->arch)) { + TensorDesc matrixADesc = matrixATensor.get_desc(); + TensorDesc matrixBDesc = matrixBTensor.get_desc(); + TensorDesc matrixCDesc = matrixCTensor.get_desc(); +
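+// Editorial note: algorithm selection is GPU-only in this wrapper; the GCL
+// handle and forwardRunInfo are unpacked from archInfo->archPara, and the
+// chosen algorithm is recorded in forwardRunInfo for the later matmul_mali
+// call. Non-Mali architectures fall through to NOT_SUPPORTED below.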
CHECK_STATUS(matmul_infer_forward_algorithm_mali(((MaliPara_t)(archInfo->archPara))->handle, + matrixADesc, transposeA, matrixBDesc, transposeB, matrixCDesc, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo)); + } else { +#endif + return NOT_SUPPORTED; +#ifdef _USE_MALI + } +#endif + return SUCCESS; +} + +EE matmul_infer_forward_tmp_bytes(Tensor matrixATensor, + bool transposeA, + Tensor matrixBTensor, + bool transposeB, + U32 *bytes, + ArchInfo_t archInfo) +{ + TensorDesc matrixADesc = matrixATensor.get_desc(); + TensorDesc matrixBDesc = matrixBTensor.get_desc(); + + if (bytes == nullptr) { + CHECK_STATUS(NULL_POINTER); + } +#ifdef _USE_MALI + if (IS_MALI_GPU(archInfo->arch)) { + CHECK_STATUS(matmul_infer_forward_tmp_bytes_mali(matrixADesc, transposeA, matrixBDesc, + transposeB, bytes, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo)); + return SUCCESS; + } +#endif + bool quantA = false; + bool quantB = false; + if (DT_I8 == matrixADesc.dt || DT_I8 == matrixBDesc.dt) { + if (DT_F16 == matrixADesc.dt) { + quantA = true; + matrixADesc.dt = DT_I8; + } + + if (DT_F16 == matrixBDesc.dt) { + quantB = true; + matrixBDesc.dt = DT_I8; + } + } + + EE ret = SUCCESS; + U32 kDimA, kDimB; + DataFormat dataFormatA, dataFormatB; + if (transposeA) { + kDimA = 1; + dataFormatA = DF_TRANSPOSE; + } else { + kDimA = 0; + dataFormatA = DF_NORMAL; + } + if (transposeB) { + kDimB = 0; + dataFormatB = DF_TRANSPOSE; + } else { + kDimB = 1; + dataFormatB = DF_NORMAL; + } + if (matrixADesc.dims[1 - kDimA] == 1 || matrixBDesc.dims[1 - kDimB] == 1) { + TensorDesc matrixDesc, vectorDesc; + if (matrixADesc.dims[1 - kDimA] == 1) { + matrixDesc = + tensor2df(matrixBDesc.dt, dataFormatB, matrixBDesc.dims[1], matrixBDesc.dims[0]); + vectorDesc = tensor1d(matrixADesc.dt, matrixADesc.dims[kDimA]); + } else { + matrixDesc = + tensor2df(matrixADesc.dt, dataFormatA, matrixADesc.dims[1], matrixADesc.dims[0]); + vectorDesc = tensor1d(matrixBDesc.dt, matrixBDesc.dims[kDimB]); + } + ret = matrix_vector_multiply_tmp_bytes(matrixDesc, vectorDesc, bytes, archInfo->arch); + } else { + TensorDesc matrixA2DDesc = + tensor2df(matrixADesc.dt, dataFormatA, matrixADesc.dims[1], matrixADesc.dims[0]); + TensorDesc matrixB2Ddesc = + tensor2df(matrixBDesc.dt, dataFormatB, matrixBDesc.dims[1], matrixBDesc.dims[0]); + ret = matrix_matrix_multiply_tmp_bytes(matrixA2DDesc, matrixB2Ddesc, bytes, archInfo->arch); + } + + if (quantA) { + *bytes += tensorNumBytes(matrixADesc); + } + if (quantB) { + *bytes += tensorNumBytes(matrixBDesc); + } + return ret; +} + +EE matmul(Tensor matrixATensor, + bool transposeA, + Tensor matrixBTensor, + bool transposeB, + Tensor tmpTensor, + Tensor matrixCTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + U32 tmpBytes = tmpTensor.bytes(); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + TensorDesc matrixADesc = matrixATensor.get_desc(); + void *matrixA = get_ptr_from_tensor(matrixATensor, arch); + TensorDesc matrixBDesc = matrixBTensor.get_desc(); + void *matrixB = get_ptr_from_tensor(matrixBTensor, arch); + TensorDesc matrixCDesc = matrixCTensor.get_desc(); + void *matrixC = get_ptr_from_tensor(matrixCTensor, arch); + + if (matrixA == nullptr || matrixB == nullptr || matrixC == nullptr) { + CHECK_STATUS(NULL_POINTER); + } +#ifdef _USE_MALI + if (IS_MALI_GPU(arch)) { + CHECK_STATUS(matmul_mali(((MaliPara_t)(archInfo->archPara))->handle, matrixADesc, + transposeA, (GCLMem_t)matrixA, matrixBDesc, transposeB, (GCLMem_t)matrixB, (GCLMem_t)tmp, + matrixCDesc, (GCLMem_t)matrixC, 
((MaliPara_t)(archInfo->archPara))->forwardRunInfo)); + return SUCCESS; + } +#endif + +#ifdef _USE_INT8 + F32 scaleO = 1; + if (DT_I8 == matrixADesc.dt || DT_I8 == matrixBDesc.dt) { + if (DT_F16 == matrixADesc.dt) { + F16 *inD = (F16 *)matrixA; + INT8 *inQ = (INT8 *)tmp; + F16 scale = matrixATensor.get_scale(); + quantize_tensor(matrixADesc, inD, &matrixADesc, inQ, &scale); + scaleO *= scale; + matrixA = (U8 *)tmp; + tmp = (U8 *)tmp + tensorNumBytes(matrixADesc); + } else { + scaleO *= matrixATensor.get_scale(); + } + if (DT_F16 == matrixBDesc.dt) { + F16 *inD = (F16 *)matrixB; + INT8 *inQ = (INT8 *)tmp; + F16 scale = matrixBTensor.get_scale(); + quantize_tensor(matrixBDesc, inD, &matrixBDesc, inQ, &scale); + scaleO *= scale; + matrixB = (U8 *)tmp; + tmp = (U8 *)tmp + tensorNumBytes(matrixBDesc); + } else { + scaleO *= matrixBTensor.get_scale(); + } + matrixCDesc.dt = DT_I32; + matrixC = tmp; + tmp = (U8 *)tmp + tensorNumBytes(matrixCDesc); + } +#endif + + U32 sizeA = tensorNumElements(matrixADesc); + U32 loops = sizeA / (matrixADesc.dims[1] * matrixADesc.dims[0]); + U32 kDimA, kDimB; + if (transposeA) { + kDimA = 1; + } else { + kDimA = 0; + } + if (transposeB) { + kDimB = 0; + } else { + kDimB = 1; + } + + U32 matrixA2DBytes = (matrixADesc.dims[1] * matrixADesc.dims[0]) * bytesOf(matrixADesc.dt); + U32 matrixB2DBytes = (matrixBDesc.dims[1] * matrixBDesc.dims[0]) * bytesOf(matrixBDesc.dt); + U32 matrixC2DBytes = (matrixCDesc.dims[1] * matrixCDesc.dims[0]) * bytesOf(matrixCDesc.dt); + U8 *matrixAPtr = (U8 *)matrixA; + U8 *matrixBPtr = (U8 *)matrixB; + U8 *matrixCPtr = (U8 *)matrixC; + memset(matrixC, 0, tensorNumBytes(matrixCDesc)); + for (U32 i = 0; i < loops; i++) { + if (matrixADesc.dims[1 - kDimA] == 1) { + TensorDesc matrixA1DDesc = tensor1d(matrixADesc.dt, matrixADesc.dims[kDimA]); + TensorDesc matrixB2DDesc = tensor2df(matrixBDesc.dt, + transposeB ? 
DF_NORMAL : DF_TRANSPOSE, matrixBDesc.dims[1], matrixBDesc.dims[0]); + TensorDesc matrixC1DDesc = tensor1d(matrixCDesc.dt, matrixCDesc.dims[0]); + CHECK_STATUS(matrix_vector_multiply(matrixB2DDesc, matrixBPtr, matrixA1DDesc, + matrixAPtr, tmpBytes, tmp, matrixC1DDesc, matrixCPtr, archInfo->arch)); + } else { + if (matrixBDesc.dims[1 - kDimB] == 1) { + TensorDesc matrixA2DDesc; + if (transposeA) { + matrixA2DDesc = tensor2df( + matrixADesc.dt, DF_TRANSPOSE, matrixADesc.dims[1], matrixADesc.dims[0]); + } else { + matrixA2DDesc = tensor2df( + matrixADesc.dt, DF_NORMAL, matrixADesc.dims[1], matrixADesc.dims[0]); + } + TensorDesc matrixB1DDesc = tensor1d(matrixBDesc.dt, matrixBDesc.dims[kDimB]); + TensorDesc matrixC1DDesc = tensor1d(matrixCDesc.dt, matrixCDesc.dims[1]); + CHECK_STATUS(matrix_vector_multiply(matrixA2DDesc, matrixAPtr, matrixB1DDesc, + matrixBPtr, tmpBytes, tmp, matrixC1DDesc, matrixCPtr, archInfo->arch)); + } else { + DataFormat dataFormatA, dataFormatB; + if (transposeA) { + dataFormatA = DF_TRANSPOSE; + } else { + dataFormatA = DF_NORMAL; + } + if (transposeB) { + dataFormatB = DF_TRANSPOSE; + } else { + dataFormatB = DF_NORMAL; + } + TensorDesc matrixA2DDesc = tensor2df( + matrixADesc.dt, dataFormatA, matrixADesc.dims[1], matrixADesc.dims[0]); + TensorDesc matrixB2DDesc = tensor2df( + matrixBDesc.dt, dataFormatB, matrixBDesc.dims[1], matrixBDesc.dims[0]); + TensorDesc matrixC2DDesc = + tensor2df(matrixCDesc.dt, DF_NORMAL, matrixCDesc.dims[1], matrixCDesc.dims[0]); + CHECK_STATUS(matrix_matrix_multiply(matrixA2DDesc, matrixAPtr, matrixB2DDesc, + matrixBPtr, tmpBytes, tmp, matrixC2DDesc, matrixCPtr, archInfo->arch)); + } + } + matrixAPtr += matrixA2DBytes; + matrixBPtr += matrixB2DBytes; + matrixCPtr += matrixC2DBytes; + } +#ifdef _USE_INT8 + if (DT_I8 == matrixADesc.dt || DT_I8 == matrixBDesc.dt) { + if (DT_I8 == matrixCTensor.get_desc().dt) { + CHECK_STATUS(quantize_tensor(matrixCDesc, matrixC, &matrixCDesc, + get_ptr_from_tensor(matrixCTensor, arch), &scaleO)); + matrixCTensor.set_scale(scaleO); + } else { + CHECK_REQUIREMENT(DT_F16 == matrixCTensor.get_desc().dt); + F16 *output = (F16 *)get_ptr_from_tensor(matrixCTensor, arch); + dequantize_int32_to_fp16(tensorNumElements(matrixCDesc), (I32 *)matrixC, scaleO, output); + } + } +#endif + return SUCCESS; +} diff --git a/compute/tensor/src/multihead_attention.cpp b/compute/tensor/src/multihead_attention.cpp new file mode 100644 index 00000000..c05f496a --- /dev/null +++ b/compute/tensor/src/multihead_attention.cpp @@ -0,0 +1,227 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include <vector> + +#include "tensor_computing.h" +#include "blas_enhance.h" +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +EE multihead_attention_infer_output_size(Tensor *inputTensor, + std::vector<Tensor> filterTensor, + Tensor *outputTensor, + U32 *firstFCSliceNum, + ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + std::vector<TensorDesc> filterDesc = get_desc_from_tensors(filterTensor); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = multihead_attention_infer_output_size_mali(inputDesc, filterDesc, &outputDesc, + firstFCSliceNum, &gclmemInputDesc, &gclmemOutputDesc, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + UNUSED(inputDesc); + UNUSED(filterDesc); + UNUSED(outputDesc); + UNUSED(firstFCSliceNum); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE multihead_attention_infer_forward_algorithm(Tensor inputTensor, + std::vector<Tensor> filterTensor, + void *multiplyAlpha, + void *multiplyBeta, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + std::vector<bool> eltwiseWithLayerNormIn, + ActivationMode activation, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + TensorDesc inputDesc = inputTensor.get_desc(); + std::vector<TensorDesc> filterDesc = get_desc_from_tensors(filterTensor); + TensorDesc outputDesc = outputTensor.get_desc(); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + ret = multihead_attention_infer_forward_algorithm_mali( + ((MaliPara_t)(archInfo->archPara))->handle, inputDesc, filterDesc, multiplyAlpha, + multiplyBeta, firstFCSliceNum, matmulSliceLen, eltwiseWithLayerNormIn, activation, + outputDesc, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } else { + UNUSED(inputTensor); + UNUSED(filterTensor); + UNUSED(multiplyAlpha); + UNUSED(multiplyBeta); + UNUSED(firstFCSliceNum); + UNUSED(matmulSliceLen); + UNUSED(eltwiseWithLayerNormIn); + UNUSED(activation); + UNUSED(outputTensor); + UNUSED(archInfo); + } + return ret; +} + +EE multihead_attention_infer_forward_tmp_bytes(Tensor inputTensor, + std::vector<Tensor> filterTensor, + std::vector<bool> eltwiseWithLayerNormIn, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + U32 *bytes, + ArchInfo_t archInfo) +{ + TensorDesc inputDesc = inputTensor.get_desc(); + std::vector<TensorDesc> filterDesc = get_desc_from_tensors(filterTensor); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + ret = multihead_attention_infer_forward_tmp_bytes_mali(inputDesc, filterDesc, + eltwiseWithLayerNormIn, firstFCSliceNum, matmulSliceLen, bytes, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } else { + UNUSED(inputTensor); + UNUSED(filterTensor); + UNUSED(eltwiseWithLayerNormIn); + UNUSED(firstFCSliceNum); + UNUSED(matmulSliceLen); + UNUSED(bytes); + UNUSED(archInfo); + } + return ret; +} + +EE multihead_attention_transform_filter_bytes( +
std::vector<Tensor> filterTensor, U32 *bytes, ArchInfo_t archInfo) +{ + std::vector<TensorDesc> filterDesc = get_desc_from_tensors(filterTensor); + + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + CHECK_STATUS(multihead_attention_transform_filter_bytes_mali(filterDesc, + ((MaliPara_t)(archInfo->archPara))->gclmemFilterDesc, bytes, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo)); +#endif + } else { + UNUSED(filterTensor); + UNUSED(bytes); + UNUSED(archInfo); + } + return SUCCESS; +} + +EE multihead_attention_transform_filter( + std::vector<Tensor> filterTensor, std::vector<Tensor *> ftmTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + std::vector<TensorDesc> filterDesc = get_desc_from_tensors(filterTensor); + std::vector<void *> filter = get_data_from_tensors(filterTensor, arch); + std::vector<TensorDesc> ftmDesc = get_desc_from_tensor_ptrs(ftmTensor); + std::vector<void *> filterTransformed = get_data_from_tensor_ptrs(ftmTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(arch)) { +#ifdef _USE_MALI + ret = multihead_attention_transform_filter_mali(((MaliPara_t)(archInfo->archPara))->handle, + filterDesc, filter, &ftmDesc, filterTransformed, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } else { + UNUSED(filterTensor); + UNUSED(ftmTensor); + UNUSED(archInfo); + } + for (U32 i = 0; i < ftmTensor.size(); i++) { + ftmTensor[i]->resize(ftmDesc[i]); + } + return ret; +} + +EE multihead_attention(Tensor inputTensor, + std::vector<Tensor> filterTensor, + std::vector<Tensor> biasTensor, + std::vector<Tensor> layerNormAlphaTensor, + std::vector<Tensor> layerNormBetaTensor, + void *multiplyAlpha, + void *multiplyBeta, + U32 *firstFCSliceNum, + U32 matmulSliceLen, + std::vector<bool> eltwiseWithLayerNormIn, + ActivationMode activation, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + std::vector<TensorDesc> filterDesc = get_desc_from_tensors(filterTensor); + std::vector<void *> filter = get_data_from_tensors(filterTensor, arch); + std::vector<void *> layerNormAlpha = get_data_from_tensors(layerNormAlphaTensor, arch); + std::vector<void *> layerNormBeta = get_data_from_tensors(layerNormBetaTensor, arch); + std::vector<TensorDesc> biasDesc = get_desc_from_tensors(biasTensor); + std::vector<void *> bias = get_data_from_tensors(biasTensor, arch); + U32 tmpBytes = tmpTensor.bytes(); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(arch)) { +#ifdef _USE_MALI + ret = multihead_attention_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (GCLMem_t)input, filterDesc, filter, biasDesc, bias, layerNormAlpha, layerNormBeta, + multiplyAlpha, multiplyBeta, firstFCSliceNum, matmulSliceLen, eltwiseWithLayerNormIn, + activation, tmpBytes, (GCLMem_t)tmp, outputDesc, (GCLMem_t)output, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } else { + UNUSED(inputDesc); + UNUSED(filterTensor); + UNUSED(biasTensor); + UNUSED(layerNormAlpha); + UNUSED(layerNormBeta); + UNUSED(multiplyAlpha); + UNUSED(multiplyBeta); + UNUSED(firstFCSliceNum); + UNUSED(matmulSliceLen); + UNUSED(eltwiseWithLayerNormIn); + UNUSED(activation); + UNUSED(tmpTensor); + UNUSED(outputTensor); + } + return ret; +} diff --git a/compute/tensor/src/non_max_suppression.cpp b/compute/tensor/src/non_max_suppression.cpp new file mode 100644 index 00000000..cf04825f --- /dev/null +++ b/compute/tensor/src/non_max_suppression.cpp @@ -0,0
+1,84 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif + +inline EE non_max_suppression_infer_output_size_cpu( + std::vector<TensorDesc> inputDesc, NonMaxSuppressionParamSpec p, TensorDesc *outputDesc) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt0, idt1; + DataFormat idf0, idf1; + U32 in0, ic0, ilens0; + U32 in1, ic1, ilens1; + // boxes + CHECK_STATUS(tensor3dGet(inputDesc[0], &idt0, &idf0, &in0, &ic0, &ilens0)); + // scores + CHECK_STATUS(tensor3dGet(inputDesc[1], &idt1, &idf1, &in1, &ic1, &ilens1)); + CHECK_REQUIREMENT(ilens0 == 4); + CHECK_REQUIREMENT(ic0 == ilens1); + CHECK_REQUIREMENT(p.max_output_boxes_per_class != 0); + // output size + U32 oh, ow; + // oh = the first row, which saves the number of available boxes (1), + the maximum number of detected boxes (max_output_boxes_per_class * num_class) + U32 max_output_boxes_per_class = p.max_output_boxes_per_class; + U32 num_class = ic1; + U32 num_detected_max = max_output_boxes_per_class * num_class; + oh = num_detected_max + 1; + // Each row is a 3-element vector, which stores [batch_index, class_index, box_index] -> 3 + // The first row is [ number of available boxes, 0, 0 ] + ow = 3; + *outputDesc = tensor2d(idt0, oh, ow); + return SUCCESS; +} + +EE non_max_suppression_infer_output_size(std::vector<Tensor *> inputTensor, + NonMaxSuppressionParamSpec p, + Tensor *outputTensor, + ArchInfo_t archInfo) +{ + UNUSED(archInfo); + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + std::vector<TensorDesc> inputDesc = get_desc_from_tensor_ptrs(inputTensor); + TensorDesc outputDesc = outputTensor->get_desc(); + CHECK_STATUS(non_max_suppression_infer_output_size_cpu(inputDesc, p, &outputDesc)); + outputTensor->resize(outputDesc); + return SUCCESS; +} + +EE non_max_suppression(std::vector<Tensor> inputTensor, + NonMaxSuppressionParamSpec p, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + std::vector<TensorDesc> inputDesc = get_desc_from_tensors(inputTensor); + std::vector<void *> input = get_data_from_tensors(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = non_max_suppression_cpu(inputDesc, input, p, outputDesc, output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/normalization.cpp
b/compute/tensor/src/normalization.cpp new file mode 100644 index 00000000..8c033065 --- /dev/null +++ b/compute/tensor/src/normalization.cpp @@ -0,0 +1,88 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/tensor_computing_x86.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +EE layer_normalization(Tensor inputTensor, + Tensor alphaTensor, + Tensor betaTensor, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + void *alpha = get_ptr_from_tensor(alphaTensor, arch); + void *beta = get_ptr_from_tensor(betaTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = layer_normalization_general(inputDesc, input, alpha, beta, outputDesc, output); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = layer_normalization_x86(inputDesc, input, alpha, beta, outputDesc, output); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = layer_normalization_arm(inputDesc, input, alpha, beta, outputDesc, output); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = layer_normalization_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (GCLMem_t)input, (GCLMem_t)alpha, (GCLMem_t)beta, outputDesc, (GCLMem_t)output); +#endif + } + return ret; +} + +EE normalization_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + CHECK_STATUS(normalization_infer_output_size_mali( + inputDesc, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc)); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else 
{ + outputDesc = inputDesc; + } + outputTensor->resize(outputDesc); + return SUCCESS; +} diff --git a/compute/tensor/src/padding.cpp b/compute/tensor/src/padding.cpp new file mode 100644 index 00000000..b8ec5192 --- /dev/null +++ b/compute/tensor/src/padding.cpp @@ -0,0 +1,72 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +EE padding_infer_output_size( + Tensor *inputTensor, PadParamSpec padParamSpec, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = padding_infer_output_size_mali( + inputDesc, padParamSpec, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif +#ifdef _USE_CPU + } else { + ret = padding_infer_output_size_cpu(inputDesc, padParamSpec, &outputDesc); +#endif + } + outputTensor->resize(outputDesc); + return ret; +} + +EE padding(Tensor inputTensor, PadParamSpec padParamSpec, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(arch)) { +#ifdef _USE_MALI + ret = padding_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (const GCLMem_t)input, padParamSpec, outputDesc, (GCLMem_t)output); +#endif +#ifdef _USE_CPU + } else { + ret = padding_cpu(inputDesc, input, padParamSpec, outputDesc, output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/pooling.cpp b/compute/tensor/src/pooling.cpp new file mode 100644 index 00000000..d08e133e --- /dev/null +++ b/compute/tensor/src/pooling.cpp @@ -0,0 +1,185 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/tensor_computing_x86.h" +#endif + +inline EE pooling_infer_output_size_cpu( + TensorDesc inputDesc, PoolingParamSpec poolingParamSpec, TensorDesc *outputDesc) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt; + DataFormat idf; + U32 in, ic, ih, iw; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + U32 strideH = poolingParamSpec.stride_h; + U32 strideW = poolingParamSpec.stride_w; + U32 paddingT = poolingParamSpec.padding_top; + U32 paddingB = poolingParamSpec.padding_bottom; + U32 paddingL = poolingParamSpec.padding_left; + U32 paddingR = poolingParamSpec.padding_right; + U32 kernelSizeH = poolingParamSpec.kernel_h; + U32 kernelSizeW = poolingParamSpec.kernel_w; + RoundMode rm = poolingParamSpec.rm; + U32 oh = 0, ow = 0; + switch (rm) { + case CEIL: { + oh = (U32)(ceil((double(ih + paddingT + paddingB - kernelSizeH) / strideH))) + 1; + ow = (U32)(ceil((double(iw + paddingL + paddingR - kernelSizeW) / strideW))) + 1; + break; + } + case FLOOR: { + oh = (U32)(floor((double(ih + paddingT + paddingB - kernelSizeH) / strideH))) + 1; + ow = (U32)(floor((double(iw + paddingL + paddingR - kernelSizeW) / strideW))) + 1; + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + *outputDesc = tensor4df(idt, idf, in, ic, oh, ow); + return SUCCESS; +} + +EE pooling_infer_output_size( + Tensor *inputTensor, PoolingParamSpec poolingParamSpec, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (0 == poolingParamSpec.kernel_h && 0 == poolingParamSpec.kernel_w) { // Global pooling + CHECK_REQUIREMENT(4 == inputDesc.nDims); + poolingParamSpec.kernel_h = inputDesc.dims[1]; + poolingParamSpec.kernel_w = inputDesc.dims[0]; + } + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = pooling_infer_output_size_mali( + inputDesc, poolingParamSpec, 
&outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + ret = pooling_infer_output_size_cpu(inputDesc, poolingParamSpec, &outputDesc); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE pooling(Tensor inputTensor, + PoolingParamSpec poolingParamSpec, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + F32 scale[2] = {inputTensor.get_scale(), -1}; + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + + EE ret = NOT_SUPPORTED; + if (0 == poolingParamSpec.kernel_h && 0 == poolingParamSpec.kernel_w) { // Global pooling + CHECK_REQUIREMENT(4 == inputDesc.nDims); + poolingParamSpec.kernel_h = inputDesc.dims[1]; + poolingParamSpec.kernel_w = inputDesc.dims[0]; + } + TensorDesc inDescCPU = inputDesc; + U8 *inputCPU = (U8 *)input; + TensorDesc outDescCPU = outputDesc; + U8 *outputCPU = (U8 *)output; + if (DF_NCHWC8 != inputDesc.df && !IS_MALI_GPU(arch)) { + U32 paddedC = (inputDesc.dims[2] + 7) / 8 * 8; + inDescCPU.dims[2] = paddedC; + inDescCPU.df = DF_NCHWC8; + outDescCPU.dims[2] = paddedC; + outDescCPU.df = DF_NCHWC8; + inputCPU = (U8 *)tmp; + outputCPU = inputCPU + tensorNumBytes(inDescCPU); + transformNCHWToNCHWC8(inputDesc, input, inDescCPU, inputCPU); + } + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = pooling_general(inDescCPU, inputCPU, poolingParamSpec, outDescCPU, outputCPU); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = pooling_x86(inDescCPU, inputCPU, poolingParamSpec, scale, outDescCPU, outputCPU); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = pooling_arm(inDescCPU, inputCPU, poolingParamSpec, scale, outDescCPU, outputCPU); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = pooling_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (const GCLMem_t)input, poolingParamSpec, scale, (GCLMem_t)tmp, outputDesc, + (GCLMem_t)output); +#endif + } + if (DF_NCHWC8 != outputDesc.df && !IS_MALI_GPU(arch)) { + transformToNCHW(outDescCPU, outputCPU, outputDesc, output); + } + outputTensor.set_scale(scale[1]); + return ret; +} + +EE pooling_infer_forward_tmp_bytes( + Tensor inputTensor, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo) +{ + if (bytes == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor.get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + ret = pooling_infer_forward_tmp_bytes_mali( + inputDesc, bytes, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } else { + *bytes = 0; + if (DF_NCHW == inputDesc.df) { + U32 paddedC = (inputDesc.dims[2] + 7) / 8 * 8; + TensorDesc outputDesc = outputTensor.get_desc(); + inputDesc.dims[2] = paddedC; + outputDesc.dims[2] = paddedC; + *bytes = tensorNumBytes(inputDesc) + tensorNumBytes(outputDesc); + } + ret = SUCCESS; + } + return ret; +} diff --git a/tensor_computing/src/check.cpp b/compute/tensor/src/pooling_bp.cpp similarity index 51% rename from tensor_computing/src/check.cpp rename to compute/tensor/src/pooling_bp.cpp index f2fb8882..746e9fca 100644 --- a/tensor_computing/src/check.cpp +++ b/compute/tensor/src/pooling_bp.cpp @@ -1,50 +1,48 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. 
All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -#include "tensor_computing.h" -#ifdef _USE_GENERAL -#include "cpu/general/tensor_computing_general.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/tensor_computing_arm.h" -#endif - -EE check(TensorDesc inputDescA, const void* inputA, - TensorDesc inputDescB, const void* inputB, - CheckMode checkMode, - TensorDesc outputDesc, void* output, Arch arch) -{ - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = check_general(inputDescA, inputA, inputDescB, inputB, checkMode, outputDesc, output); -#endif -#ifdef _USE_NEON - } else if (arch == ARM_A55 || arch == ARM_A76 || arch == ARM_V8 || arch == ARM_V7) { - ret = check_arm(inputDescA, inputA, inputDescB, inputB, checkMode, outputDesc, output); -#endif - } - return ret; -} - -EE check_infer_output_size(TensorDesc inputDesc, TensorDesc *outputDesc) -{ - if (nullptr == outputDesc) - CHECK_STATUS(NULL_POINTER); - - (*outputDesc).dt = DT_I32; - (*outputDesc).nDims = 1; - (*outputDesc).dims[0] = inputDesc.dims[inputDesc.nDims-1]; - return SUCCESS; -} +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
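
For reference, pooling_infer_output_size_cpu above applies the standard pooling arithmetic in its CEIL/FLOOR branches. A minimal standalone sketch of that rule (plain C++ types are assumed here instead of the library's U32/TensorDesc fields):

// Sketch of the output-extent rule used by pooling_infer_output_size_cpu:
// ceil mode rounds a partial last window up, floor mode discards it.
#include <cmath>
#include <cstdint>

uint32_t pooled_extent(uint32_t in, uint32_t padBegin, uint32_t padEnd,
                       uint32_t kernel, uint32_t stride, bool ceilMode)
{
    double span = double(in + padBegin + padEnd - kernel) / stride;
    return uint32_t(ceilMode ? std::ceil(span) : std::floor(span)) + 1;
}

// Example: 7-wide input, kernel 3, stride 2, no padding -> 3 in both modes
// ((7 - 3) / 2 is exact); with stride 3 the modes differ: floor gives 2,
// ceil gives 3, because the last window only partially covers the input.
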
+ +#include "tensor_computing.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif +// only support average pooling now +EE pooling_bp( + Tensor inputTensor, PoolingParamSpec poolingParamSpec, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + CHECK_REQUIREMENT(POOLING_MEAN == poolingParamSpec.mode); + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = pooling_bp_general(inputDesc, input, poolingParamSpec, outputDesc, output); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + UNI_WARNING_LOG("The x86 pooling_bp operator is not optimized now.\n"); + ret = pooling_bp_general(inputDesc, input, poolingParamSpec, outputDesc, output); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = pooling_bp_arm(inputDesc, input, poolingParamSpec, outputDesc, output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/power.cpp b/compute/tensor/src/power.cpp new file mode 100644 index 00000000..7681b5e2 --- /dev/null +++ b/compute/tensor/src/power.cpp @@ -0,0 +1,78 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +inline EE power_infer_output_size_cpu(TensorDesc inputDesc, TensorDesc *outputDesc) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + *outputDesc = inputDesc; + return SUCCESS; +} + +EE power_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = power_infer_output_size_mali( + inputDesc, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + ret = power_infer_output_size_cpu(inputDesc, &outputDesc); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE power(Tensor inputTensor, PowerParamSpec p, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = power_cpu(inputDesc, input, p, outputDesc, output, arch); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = power_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, p, + outputDesc, (GCLMem_t)output); +#endif + } + return ret; +} diff --git a/tensor_computing/src/set_input.cpp b/compute/tensor/src/preallocated_memory.cpp similarity index 50% rename from tensor_computing/src/set_input.cpp rename to compute/tensor/src/preallocated_memory.cpp index fae848d2..0771e118 100644 --- a/tensor_computing/src/set_input.cpp +++ b/compute/tensor/src/preallocated_memory.cpp @@ -1,39 +1,55 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include "tensor_computing.h" -#ifdef _USE_MALI +#ifdef _USE_MALI #include "gpu/mali/tensor_computing_mali.h" #endif -EE tensor_computing_set_input_infer_tmpBuf_size(void* input, TensorDesc hostDesc, U32* tmpBufSize, Arch arch) +EE preallocated_memory_infer_output_size(Tensor *outputTensor, ArchInfo_t archInfo) { + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc outputDesc = outputTensor->get_desc(); EE ret = NOT_SUPPORTED; - if(arch == MALI){ + if (IS_MALI_GPU(archInfo->arch)) { #ifdef _USE_MALI - ret = tensor_computing_set_input_infer_tmpBuf_size_mali((GCLMem_t)input, hostDesc, tmpBufSize); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = preallocated_memory_infer_output_size_mali(&outputDesc, &gclmemOutputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); #endif } + outputTensor->resize(outputDesc); return ret; } -EE tensor_computing_set_input(void* input, TensorDesc hostDesc, const void* hostPtr, void* tmpBuf, bool blocking, Arch arch, ExtInfo_t extInfo) +EE preallocated_memory(Tensor outputTensor, ArchInfo_t archInfo) { + auto arch = archInfo->arch; + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + EE ret = NOT_SUPPORTED; - if (arch == MALI) { + if (IS_MALI_GPU(arch)) { #ifdef _USE_MALI - ret = tensor_computing_set_input_mali(extInfo->maliInfo.handle, (GCLMem_t)input, hostDesc, (const U8*)hostPtr, (GCLMem_t)tmpBuf, blocking); + ret = preallocated_memory_mali( + ((MaliPara_t)(archInfo->archPara))->handle, outputDesc, (GCLMem_t)output); +#endif +#ifdef _USE_CPU + } else { + memset(output, 0, tensorNumBytes(outputDesc)); + ret = SUCCESS; #endif } return ret; diff --git a/compute/tensor/src/prelu.cpp b/compute/tensor/src/prelu.cpp new file mode 100644 index 00000000..11d53a55 --- /dev/null +++ b/compute/tensor/src/prelu.cpp @@ -0,0 +1,98 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +inline EE prelu_infer_output_size_cpu(TensorDesc inputDesc, TensorDesc *outputDesc) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + if (inputDesc.df != DF_NCHWC8) { + CHECK_STATUS(NOT_SUPPORTED); + } + *outputDesc = inputDesc; + return SUCCESS; +} + +EE prelu_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = prelu_infer_output_size_mali( + inputDesc, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + ret = prelu_infer_output_size_cpu(inputDesc, &outputDesc); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE prelu(Tensor inputTensor, + Tensor weightTensor, + PReLUParamSpec preluDesc, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + void *weight = get_ptr_from_tensor(weightTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = prelu_general(inputDesc, input, weight, preluDesc, outputDesc, output); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + UNI_WARNING_LOG("The x86 prelu operator is not optimized now.\n"); + ret = prelu_general(inputDesc, input, weight, preluDesc, outputDesc, output); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = prelu_arm(inputDesc, input, weight, preluDesc, outputDesc, output); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = prelu_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, + (GCLMem_t)weight, preluDesc, outputDesc, (GCLMem_t)output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/priorbox.cpp b/compute/tensor/src/priorbox.cpp new file mode 100644 index 00000000..d525a3d4 --- /dev/null +++ b/compute/tensor/src/priorbox.cpp @@ -0,0 +1,104 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif + +inline EE priorbox_infer_output_size_cpu( + std::vector<TensorDesc> inputDesc, PriorBoxParamSpec priorBoxParamSpec, TensorDesc *outputDesc) +{ + std::vector<F32> minsizes; + for (int i = 0; i < 2; i++) { + if (priorBoxParamSpec.min_sizes[i] == 0) { + break; + } + minsizes.push_back(priorBoxParamSpec.min_sizes[i]); + } + std::vector<F32> maxsizes; + for (int i = 0; i < 2; i++) { + if (priorBoxParamSpec.max_sizes[i] == 0) { + break; + } + maxsizes.push_back(priorBoxParamSpec.max_sizes[i]); + } + std::vector<F32> ars; + for (int i = 0; i < 2; i++) { + if (priorBoxParamSpec.aspect_ratios[i] == 0) { + break; + } + ars.push_back(priorBoxParamSpec.aspect_ratios[i]); + } + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt; + DataFormat idf; + U32 in, ic, ih, iw; + CHECK_STATUS(tensor4dGet(inputDesc[0], &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_REQUIREMENT(!ars.empty()); + U32 num_priorboxs = ars.size(); + if (priorBoxParamSpec.flip) { + num_priorboxs = num_priorboxs * 2; + } + CHECK_REQUIREMENT(!minsizes.empty()); + U32 num_minsize = minsizes.size(); + num_priorboxs = num_priorboxs * num_minsize + num_minsize; + if (!maxsizes.empty()) { + U32 num_maxsize = maxsizes.size(); + CHECK_REQUIREMENT(num_minsize == num_maxsize); + num_priorboxs = num_priorboxs + num_maxsize; + } + UNI_DEBUG_LOG("Number of priorboxes per pixel: %u\n", num_priorboxs); + // on = 1, oc = 2, ol = 4 * num_priorboxs * ih * iw + if (DT_I8 == idt) { + idt = DT_F16; + } + *outputDesc = tensor3d(idt, 1, 2, 4 * num_priorboxs * ih * iw); + return SUCCESS; +} + +EE priorbox_infer_output_size(std::vector<Tensor *> inputTensor, + PriorBoxParamSpec priorBoxParamSpec, + Tensor *outputTensor, + ArchInfo_t archInfo) +{ + UNUSED(archInfo); + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + std::vector<TensorDesc> inputDesc = get_desc_from_tensor_ptrs(inputTensor); + TensorDesc outputDesc = outputTensor->get_desc(); + CHECK_STATUS(priorbox_infer_output_size_cpu(inputDesc, priorBoxParamSpec, &outputDesc)); + outputTensor->resize(outputDesc); + return SUCCESS; +} + +EE priorbox(std::vector<Tensor> inputTensor, + PriorBoxParamSpec priorBoxParamSpec, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + std::vector<TensorDesc> inputDesc = get_desc_from_tensors(inputTensor); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + EE ret = NOT_SUPPORTED; + if 
(IS_CPU(arch)) { +#ifdef _USE_CPU + ret = priorbox_cpu(inputDesc, priorBoxParamSpec, outputDesc, output, arch); +#endif + } + return ret; +} diff --git a/tensor_computing/src/quantize.cpp b/compute/tensor/src/quantize.cpp similarity index 59% rename from tensor_computing/src/quantize.cpp rename to compute/tensor/src/quantize.cpp index 2d12f60d..913eaefd 100644 --- a/tensor_computing/src/quantize.cpp +++ b/compute/tensor/src/quantize.cpp @@ -1,19 +1,18 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
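
The box-count logic in priorbox_infer_output_size_cpu above follows the Caffe/SSD PriorBox convention. A small sketch of that arithmetic, with example values assumed purely for illustration:

// Prior-box count per pixel, mirroring the computation above.
#include <cstdint>

uint32_t priorbox_count(uint32_t numAspectRatios, bool flip,
                        uint32_t numMin, uint32_t numMax)
{
    uint32_t n = numAspectRatios * (flip ? 2 : 1);
    n = n * numMin + numMin;  // ratio boxes per min_size, plus the min_size box itself
    return n + numMax;        // one extra box per matching max_size
}

// e.g. priorbox_count(2, true, 1, 1) == 6, the classic SSD layout, and the
// layer then emits a (1, 2, 4 * 6 * ih * iw) tensor: one channel of box
// coordinates and one of variances, four floats per box.
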
- - #include "tensor_computing.h" +#ifdef _USE_NEON #include "cpu/arm/tensor_computing_arm.h" #ifdef _USE_FP16 #include "cpu/arm/fp16/arm_functions_fp16.h" @@ -21,16 +20,19 @@ #ifdef _USE_FP32 #include "cpu/arm/fp32/arm_functions_fp32.h" #endif -#define BINS 2048 +#endif -EE quantize_tensor(TensorDesc dDesc, const void* data, TensorDesc* qDesc, void* qData, void *scale) +EE quantize_tensor(TensorDesc dDesc, const void *data, TensorDesc *qDesc, void *qData, void *scale) { - EE ret = quantize_tensor_arm(dDesc, data, qDesc, qData, scale); + EE ret = NOT_SUPPORTED; +#ifdef _USE_NEON + ret = quantize_tensor_arm(dDesc, data, qDesc, qData, scale); +#endif return ret; } -#ifdef _USE_INT8 -void dequantize_int8_to_fp16(U32 len, INT8* q, F32 scale, F16* d) +#if defined(_USE_NEON) && defined(_USE_INT8) +void dequantize_int8_to_fp16(U32 len, INT8 *q, F32 scale, F16 *d) { F16 factor = 1 / scale; int i = 0; @@ -52,69 +54,86 @@ void dequantize_int8_to_fp16(U32 len, INT8* q, F32 scale, F16* d) } } -void dequantize_int32_to_fp16(U32 len, I32* q, F32 scale, F16* d, U32 biasLen, F16* biasPtr) +void dequantize_int32_to_fp16(U32 len, I32 *q, F32 scale, F16 *d, U32 biasLen, F16 *biasPtr) { if (0 != biasLen) { CHECK_REQUIREMENT(nullptr != biasPtr); CHECK_REQUIREMENT(len % biasLen == 0); - CHECK_REQUIREMENT(biasLen % 4 == 0); } float16x4_t bias[4]; F32 factor = 1 / scale; - int i = 0; - for (; i < ((int)len) - 15; i += 16) { - int32x4_t in0 = vld1q_s32(q + i); - int32x4_t in1 = vld1q_s32(q + i + 4); - int32x4_t in2 = vld1q_s32(q + i + 8); - int32x4_t in3 = vld1q_s32(q + i + 12); - if (0 != biasLen) { - U32 offset = i % biasLen; - for (U32 j = 0; j < 4; j++) { - bias[j] = vld1_f16(biasPtr + offset); - offset += 4; - if (offset >= biasLen) { - offset = 0; + if (biasLen % 4 == 0) { + int i = 0; + for (; i < ((int)len) - 15; i += 16) { + int32x4_t in0 = vld1q_s32(q + i); + int32x4_t in1 = vld1q_s32(q + i + 4); + int32x4_t in2 = vld1q_s32(q + i + 8); + int32x4_t in3 = vld1q_s32(q + i + 12); + if (0 != biasLen) { + U32 offset = i % biasLen; + for (U32 j = 0; j < 4; j++) { + bias[j] = vld1_f16(biasPtr + offset); + offset += 4; + if (offset >= biasLen) { + offset = 0; + } } } + float32x4_t f0 = vcvtq_f32_s32(in0); + float32x4_t f1 = vcvtq_f32_s32(in1); + float32x4_t f2 = vcvtq_f32_s32(in2); + float32x4_t f3 = vcvtq_f32_s32(in3); + f0 = vmulq_n_f32(f0, factor); + f1 = vmulq_n_f32(f1, factor); + f2 = vmulq_n_f32(f2, factor); + f3 = vmulq_n_f32(f3, factor); + float16x4_t h0 = vcvt_f16_f32(f0); + float16x4_t h1 = vcvt_f16_f32(f1); + float16x4_t h2 = vcvt_f16_f32(f2); + float16x4_t h3 = vcvt_f16_f32(f3); + if (0 != biasLen) { + h0 = vadd_f16(h0, bias[0]); + h1 = vadd_f16(h1, bias[1]); + h2 = vadd_f16(h2, bias[2]); + h3 = vadd_f16(h3, bias[3]); + } + vst1_f16(d + i, h0); + vst1_f16(d + i + 4, h1); + vst1_f16(d + i + 8, h2); + vst1_f16(d + i + 12, h3); } - float32x4_t f0 = vcvtq_f32_s32(in0); - float32x4_t f1 = vcvtq_f32_s32(in1); - float32x4_t f2 = vcvtq_f32_s32(in2); - float32x4_t f3 = vcvtq_f32_s32(in3); - f0 = vmulq_n_f32(f0, factor); - f1 = vmulq_n_f32(f1, factor); - f2 = vmulq_n_f32(f2, factor); - f3 = vmulq_n_f32(f3, factor); - float16x4_t h0 = vcvt_f16_f32(f0); - float16x4_t h1 = vcvt_f16_f32(f1); - float16x4_t h2 = vcvt_f16_f32(f2); - float16x4_t h3 = vcvt_f16_f32(f3); - if (0 != biasLen) { - h0 = vadd_f16(h0, bias[0]); - h1 = vadd_f16(h1, bias[1]); - h2 = vadd_f16(h2, bias[2]); - h3 = vadd_f16(h3, bias[3]); - } - vst1_f16(d + i, h0); - vst1_f16(d + i + 4, h1); - vst1_f16(d + i + 8, h2); - vst1_f16(d + i + 12, h3); - } 
- for (; i < (int)len; i++) { - d[i] = q[i] * factor; - if (0 != biasLen) { - d[i] += biasPtr[i % biasLen]; + for (; i < (int)len; i++) { + d[i] = q[i] * factor; + if (0 != biasLen) { + d[i] += biasPtr[i % biasLen]; + } + } + } else { + for (int i = 0; i < ((int)len); i += biasLen) { + int j = 0; + for (; j < ((int)biasLen) - 3; j += 4) { + int32x4_t in0 = vld1q_s32(q + i + j); + bias[0] = vld1_f16(biasPtr + j); + float32x4_t f0 = vcvtq_f32_s32(in0); + f0 = vmulq_n_f32(f0, factor); + float16x4_t h0 = vcvt_f16_f32(f0); + h0 = vadd_f16(h0, bias[0]); + vst1_f16(d + i + j, h0); + } + for (; j < (int)biasLen; j++) { + d[i + j] = q[i + j] * factor + biasPtr[j]; + } } } } -void update_histogram(U32 len, const F16* data, int numBins, F32 interval, F32* histo) +void update_histogram(U32 len, const F16 *data, int numBins, F32 interval, F32 *histo) { for (U32 i = 0; i < len; i++) { F32 tmp = data[i]; - int index = std::floor(std::abs(tmp) / interval); + int index = floor(abs(tmp) / interval); if (index >= numBins) { index = numBins - 1; } @@ -125,35 +144,33 @@ std::vector<F32> compress_histogram(std::vector<F32> &histogram, F32 numPerBin, F32 last_max) { std::vector<F32> newhistogram(2048, 0); - for (U32 q = 0; q < std::ceil(2048/numPerBin) ; q++) { - + for (U32 q = 0; q < ceil(2048 / numPerBin); q++) { F32 start = q * numPerBin; - F32 end = start + numPerBin; - int left = std::ceil(start); + F32 end = start + numPerBin; + int left = ceil(start); if (left > start) { - newhistogram[q] += ((F32)left - start) * histogram[left - 1]; + newhistogram[q] += ((F32)left - start) * histogram[left - 1]; } - if( end <= last_max){ - int right = std::floor(end); + if (end <= last_max) { + int right = floor(end); if (right < end) { newhistogram[q] += (end - (F32)right) * histogram[right]; } - + for (int k = left; k < right; k++) { newhistogram[q] += histogram[k]; } - } - else{ + } else { for (int k = left; k < 2048; k++) { newhistogram[q] += histogram[k]; } - } + } } - histogram.assign(newhistogram.begin(), newhistogram.end()); + histogram.assign(newhistogram.begin(), newhistogram.end()); return histogram; } -F32 compute_KLD(U32 len, const F32* p, const F32* q) +F32 compute_KLD(U32 len, const F32 *p, const F32 *q) { F32 kld = 0; @@ -162,7 +179,7 @@ F32 compute_KLD(U32 len, const F32* p, const F32* q) if (0 == q[i]) { kld += 1; } else { - kld += p[i] * std::log(p[i] / q[i]); + kld += p[i] * log(p[i] / q[i]); } } } @@ -175,13 +192,14 @@ std::vector<F32> compute_scale_with_KL(std::vector<F32> &histogram, F32 interval { std::vector<F32> scale; #ifdef _USE_INT8 + const int BINS = 2048; F32 histoSum = array_sum_f32(histogram.data(), BINS); array_scale_f32(histogram.data(), histogram.data(), BINS, 1 / histoSum, 0); F32 minKLD = 2048; int bestThreshold = 128; F32 sumBin = array_sum_f32(histogram.data(), 128); - DEBUG_info("First 128 bins contain " << sumBin << " of values"); + UNI_DEBUG_LOG("First 128 bins contain %f of values", sumBin); F32 sumOver = 1 - sumBin; for (U32 i = 128; i < 2048; i++) { @@ -196,17 +214,17 @@ std::vector<F32> compute_scale_with_KL(std::vector<F32> &histogram, F32 interval for (U32 j = 0; j < 128; j++) { F32 start = j * numPerBin; F32 end = start + numPerBin; - - int left = std::ceil(start); + + int left = ceil(start); if (left > start) { quantDist[j] += ((F32)left - start) * histogram[left - 1]; } - int right = std::floor(end); + int right = floor(end); if (right < end) { quantDist[j] += (end - (F32)right) * histogram[right]; } - + for (int k = left; k < 
right; k++) { quantDist[j] += histogram[k]; } @@ -220,12 +238,12 @@ std::vector<F32> compute_scale_with_KL(std::vector<F32> &histogram, F32 interval F32 count = 0; - int left = std::ceil(start); + int left = ceil(start); if (left > start && 0 != histogram[left - 1]) { count += (F32)left - start; } - int right = std::floor(end); + int right = floor(end); if (right < end && 0 != histogram[right]) { count += end - (F32)right; } @@ -260,7 +278,7 @@ std::vector<F32> compute_scale_with_KL(std::vector<F32> &histogram, F32 interval bestThreshold = i; } } - DEBUG_info(bestThreshold << "/2048"); + UNI_DEBUG_LOG(" %d/2048\n", bestThreshold); F32 threshold = (F32)bestThreshold * interval; F32 quantScale = 127.99 / threshold; scale.push_back(quantScale); diff --git a/compute/tensor/src/reduction.cpp b/compute/tensor/src/reduction.cpp new file mode 100644 index 00000000..98e7ada7 --- /dev/null +++ b/compute/tensor/src/reduction.cpp @@ -0,0 +1,144 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
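
compute_scale_with_KL above implements the usual histogram-based INT8 calibration: scan 2048-bin thresholds, keep the one minimizing the KL divergence between the original and the quantized distribution, then derive a single scale from it. The final step reduces to the following (a sketch restating the two lines of the diff in isolation):

// How the calibration above turns the winning histogram bin into a scale.
// 'interval' is the histogram bin width; bestThreshold is the chosen bin.
float kl_scale(int bestThreshold, float interval)
{
    float threshold = bestThreshold * interval;  // saturation point in value space
    return 127.99f / threshold;                  // INT8 covers [-threshold, +threshold]
}

// Quantization multiplies by this scale and rounds into int8;
// dequantize_int8_to_fp16 above inverts it with factor = 1 / scale.
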
+ +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif + +EE reduction(Tensor inputTensor, + Tensor maskTensor, + ReductionParamSpec p, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc maskDesc = maskTensor.get_desc(); + void *mask = get_ptr_from_tensor(maskTensor, arch); + U32 tmpBytes = tmpTensor.bytes(); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = reduction_cpu( + inputDesc, input, maskDesc, mask, p, tmpBytes, tmp, outputDesc, output, arch); +#endif + } + return ret; +} + +EE reduction_infer_forward_tmp_bytes( + Tensor inputTensor, ReductionParamSpec p, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo) +{ + TensorDesc inputDesc = inputTensor.get_desc(); + int factor = 0; + if (p.axes_num > 1) { + factor = 2; + } + if (inputDesc.df == DF_NCHWC8) { + for (int i = 0; i < p.axes_num; i++) { + // channel dimension + if (p.axes[i] == 1 || p.axes[i] == -3) { + factor = 2; + break; + } + } + } + *bytes = UNI_MAX(inputTensor.bytes(), outputTensor.bytes()) * factor; + return SUCCESS; +} + +EE reduction_infer_output_size( + Tensor *inputTensor, Tensor maskTensor, ReductionParamSpec p, Tensor *outputTensor) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc maskDesc = maskTensor.get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + int start = 0; + TensorDesc tmpDesc = inputDesc; + if (inputDesc.df == DF_NCHWC8) { + for (int i = 0; i < p.axes_num; i++) { + // channel dimension + if (p.axes[i] == 1 || p.axes[i] == -3) { + start = -1; + break; + } + } + for (int i = (int)tmpDesc.nDims - 1; i >= 0; i--) { + tmpDesc.dims[i + 1] = tmpDesc.dims[i]; + } + tmpDesc.dims[3] /= 8; + tmpDesc.dims[0] = 8; + tmpDesc.nDims += 1; + } + outputDesc = tmpDesc; + for (int i = start; i < p.axes_num; i++) { + int axis; + if (i == -1) { + axis = 4; + } else { + axis = p.axes[i]; + } + if (axis < 0) { + axis = tmpDesc.nDims + axis; + } + axis = tmpDesc.nDims - 1 - axis; + if (tensorNumElements(maskDesc) == 0) { + outputDesc.dims[axis] = 0; + } else { + int num = maskDesc.dims[1] > 1 ? 
maskDesc.dims[1] : 0; + outputDesc.dims[axis] = num; + } + } + if (p.keep_dim) { + for (U32 i = 0; i < tmpDesc.nDims; i++) { + if (outputDesc.dims[i] == 0) { + outputDesc.dims[i] = 1; + } + } + outputDesc.nDims = tmpDesc.nDims; + } else { + int index = 0; + for (U32 i = 0; i < tmpDesc.nDims; i++) { + if (outputDesc.dims[i] != 0) { + outputDesc.dims[index++] = outputDesc.dims[i]; + } + } + outputDesc.nDims = index; + } + outputDesc.df = getTensorDefaultDataFormat(outputDesc.nDims); + if (inputDesc.df == DF_NCHWC8) { + if (start == 0) { + outputDesc.df = DF_NCHWC8; + for (int i = 0; i < (int)outputDesc.nDims - 1; i++) { + outputDesc.dims[i] = outputDesc.dims[i + 1]; + } + outputDesc.nDims -= 1; + outputDesc.dims[outputDesc.nDims - 2] *= 8; + } + } + outputTensor->resize(outputDesc); + return SUCCESS; +} diff --git a/compute/tensor/src/reshape.cpp b/compute/tensor/src/reshape.cpp new file mode 100644 index 00000000..24a4f685 --- /dev/null +++ b/compute/tensor/src/reshape.cpp @@ -0,0 +1,157 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +inline EE reshape_infer_output_size_cpu( + TensorDesc inputDesc, ReshapeParamSpec p, TensorDesc *outputDesc) +{ + if (nullptr == outputDesc) { + return NULL_POINTER; + } + + I32 *shape = p.shape_dims; + I32 shape_size = p.shape_size; + + int inputElementNum = tensorNumElements(inputDesc); + int outputElementNum = 1; + for (int i = 0; i < shape_size; i++) { + outputElementNum *= shape[i]; + } + int index_range = ((int)inputDesc.nDims > shape_size) ? 
shape_size : inputDesc.nDims; + if (inputElementNum > 0 && outputElementNum > 0 && inputElementNum != outputElementNum) { + for (int i = 0; i < index_range; i++) { + if ((inputElementNum / (int)inputDesc.dims[inputDesc.nDims - 1 - i]) == + (outputElementNum / shape[i])) { + shape[i] = inputDesc.dims[inputDesc.nDims - 1 - i]; + break; + } + } + } + + *outputDesc = inputDesc; + (*outputDesc).nDims = shape_size; + if (shape_size == 2) { + (*outputDesc).df = DF_NORMAL; + } + if (shape_size >= 4) { + (*outputDesc).df = DF_NCHW; + } + + U32 factor = 1; + I32 count = 0; + for (I32 i = 0; i < shape_size; i++) { + I32 value = shape[i]; + if (value == 0) { + value = inputDesc.dims[inputDesc.nDims - 1 - i]; + } + if (value == -1) { + value = 0; + count++; + } else { + factor *= value; + } + + (*outputDesc).dims[shape_size - 1 - i] = value; + } + if (count > 1) { + return NOT_SUPPORTED; + } + + for (I32 i = 0; i < shape_size; i++) { + if ((*outputDesc).dims[i] == 0) { + (*outputDesc).dims[i] = tensorNumElements(inputDesc) / factor; + } + } + + return SUCCESS; +} + +EE reshape_infer_output_size( + Tensor *inputTensor, ReshapeParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = reshape_infer_output_size_mali( + inputDesc, p, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif +#ifdef _USE_CPU + } else { + ret = reshape_infer_output_size_cpu(inputDesc, p, &outputDesc); +#endif + } + outputTensor->resize(outputDesc); + return ret; +} + +EE reshape_infer_forward_tmp_bytes( + Tensor inputTensor, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo) +{ + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + GCLMemDesc gclmemInputDesc = ocl_get_desc(inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(outputTensor); + ret = reshape_infer_forward_tmp_bytes_mali( + inputDesc, outputDesc, &gclmemInputDesc, &gclmemOutputDesc, bytes); +#endif + } else { + *bytes = UNI_MAX(inputTensor.bytes(), outputTensor.bytes()); + ret = SUCCESS; + } + return ret; +} + +EE reshape(Tensor inputTensor, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(arch)) { +#ifdef _USE_MALI + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + ret = reshape_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, + (GCLMem_t)tmp, outputDesc, (GCLMem_t)output); +#endif +#ifdef _USE_CPU + } else { + ret = reshape_cpu(inputDesc, input, outputDesc, output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/rnn.cpp b/compute/tensor/src/rnn.cpp new file mode 100644 index 00000000..91e53294 --- /dev/null +++ b/compute/tensor/src/rnn.cpp @@ -0,0 +1,298 
@@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +EE rnn_transform_filter(std::vector<Tensor> filterTensors, + RNNParamSpec rnnParamSpec, + std::vector<Tensor *> ftmTensors, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + std::vector<TensorDesc> filterDescs = get_desc_from_tensors(filterTensors); + std::vector<void *> filters = get_data_from_tensors(filterTensors, arch); + std::vector<TensorDesc> ftmDescs(ftmTensors.size()); + std::vector<void *> ftms = get_data_from_tensor_ptrs(ftmTensors, arch); + + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = rnn_transform_filter_cpu(filterDescs.data(), (const void **)filters.data(), + rnnParamSpec, ftmDescs.data(), ftms.data()); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + GCLMem filterArray[2]; + GCLMem filterTranArray[2]; + filterArray[0] = *((GCLMem_t)filters[0]); + filterTranArray[0] = *((GCLMem_t)ftms[0]); + if (rnnParamSpec.numProjection > 0) { + filterArray[1] = *((GCLMem_t)filters[1]); + filterTranArray[1] = *((GCLMem_t)ftms[1]); + } + ret = rnn_transform_filter_mali(((MaliPara_t)(archInfo->archPara))->handle, filterDescs[0], + filterArray, rnnParamSpec, ftmDescs.data(), filterTranArray, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } + for (U32 i = 0; i < ftmTensors.size(); i++) { + ftmTensors[i]->resize(ftmDescs[i]); + } + return ret; +} + +EE rnn_transform_filter_bytes( + std::vector<Tensor> filterTensors, RNNParamSpec rnnParamSpec, U32 *bytes, ArchInfo_t archInfo) +{ + std::vector<TensorDesc> filterDescs = get_desc_from_tensors(filterTensors); + EE ret = NOT_SUPPORTED; + auto arch = archInfo->arch; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = rnn_transform_filter_bytes_cpu(filterDescs.data(), rnnParamSpec, bytes); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = rnn_transform_filter_bytes_mali(filterDescs[0], rnnParamSpec, + ((MaliPara_t)(archInfo->archPara))->gclmemFilterDesc, bytes, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } + return ret; +} + +EE rnn_infer_output_size( + Tensor *inputTensor, RNNParamSpec rnnParamSpec, Tensor *outputTensor, ArchInfo_t archInfo) +{ + UNUSED(archInfo); + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + 
TensorDesc outputDesc = outputTensor->get_desc(); + DataType idt; + DataFormat idf; + U32 batch, step, xDim; + CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &batch, &step, &xDim)); + U32 num = (rnnParamSpec.biDirection) ? 2 : 1; + U32 hDim = num * rnnParamSpec.numOutput; + outputDesc = tensor3df(idt, idf, batch, step, hDim); + outputTensor->resize(outputDesc); + return SUCCESS; +} + +EE rnn_infer_forward_tmp_bytes(Tensor inputTensor, + Tensor filterTensor, + Tensor outputTensor, + RNNParamSpec rnnParamSpec, + U32 *bytes, + ArchInfo_t archInfo) +{ + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc filterDesc = filterTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + + EE ret = NOT_SUPPORTED; +#ifdef _USE_CPU + if (IS_CPU(archInfo->arch)) { + ret = rnn_infer_forward_tmp_bytes_cpu( + inputDesc, filterDesc, outputDesc, rnnParamSpec, bytes, archInfo->arch); + } +#endif + return ret; +} + +EE rnn(Tensor inputTensor, + std::vector<Tensor> filterTensors, + std::vector<Tensor> biasTensors, + RNNParamSpec rnnParamSpec, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + std::vector<TensorDesc> filterDescs = get_desc_from_tensors(filterTensors); + std::vector<void *> filters = get_data_from_tensors<void *>(filterTensors, arch); + std::vector<TensorDesc> biasDescs = get_desc_from_tensors(biasTensors); + std::vector<void *> biases = get_data_from_tensors<void *>(biasTensors, arch); + U32 tmpBytes = tmpTensor.bytes(); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = rnn_cpu(inputDesc, input, filterDescs.data(), (const void **)filters.data(), + biasDescs.data(), (const void **)biases.data(), rnnParamSpec, tmpBytes, tmp, outputDesc, + output, arch); +#endif + } + return ret; +} + +EE rnncell_infer_output_size(std::vector<Tensor *> inputTensor, + RNNParamSpec rnnParamSpec, + Tensor *outputTensor, + ArchInfo_t archInfo) +{ + if (inputTensor[0] == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (inputTensor[1] == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor[0]->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + auto arch = archInfo->arch; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + DataType idt; + DataFormat idf; + U32 batch, xDim; + CHECK_STATUS(tensor2dGet(inputDesc, &idt, &idf, &batch, &xDim)); + U32 hDim = rnnParamSpec.numOutput; + outputDesc = tensor2df(idt, idf, batch, hDim); + ret = SUCCESS; +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor[0]); + GCLMemDesc gclmemStateDesc = ocl_get_desc(*inputTensor[1]); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = rnncell_infer_output_size_mali(inputDesc, rnnParamSpec, &outputDesc, &gclmemInputDesc, + &gclmemStateDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor[0], gclmemInputDesc); + ocl_set_desc(inputTensor[1], gclmemStateDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } + outputTensor->resize(outputDesc); + return ret; +} + +EE rnncell_infer_forward_tmp_bytes(Tensor inputTensor, + Tensor filterTensor, + Tensor outputTensor, + RNNParamSpec rnnParamSpec, + U32 *bytes, + ArchInfo_t archInfo) +{ + TensorDesc inputDesc =
inputTensor.get_desc(); + TensorDesc filterDesc = filterTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + + EE ret = NOT_SUPPORTED; + auto arch = archInfo->arch; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = rnncell_infer_forward_tmp_bytes_cpu( + inputDesc, filterDesc, outputDesc, rnnParamSpec, bytes, archInfo->arch); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = rnncell_infer_forward_tmp_bytes_mali(inputDesc, filterDesc, outputDesc, rnnParamSpec, + bytes, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } + return ret; +} + +EE rnncell_infer_forward_algorithm(Tensor xTensor, + Tensor filterTensor, + Tensor biasTensor, + RNNParamSpec rnncellDesc, + U32 batchStrideX, + U32 batchStrideH, + Tensor hTensor, + ArchInfo_t archInfo) +{ + EE ret = NOT_SUPPORTED; +#ifdef _USE_MALI + if (IS_MALI_GPU(archInfo->arch)) { + TensorDesc filterDesc = filterTensor.get_desc(); + TensorDesc biasDesc = biasTensor.get_desc(); + TensorDesc xDesc = xTensor.get_desc(); + TensorDesc hDesc = hTensor.get_desc(); + ret = rnncell_infer_forward_algorithm_mali(((MaliPara_t)(archInfo->archPara))->handle, + xDesc, filterDesc, biasDesc, rnncellDesc, batchStrideX, batchStrideH, hDesc, + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); + } +#endif + return ret; +} + +EE rnncell(Tensor xTensor, + std::vector<Tensor> filterTensors, + std::vector<Tensor> biasTensors, + Tensor stateTensor, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + U32 tmpOffset, + Tensor tmpTensor, + Tensor hTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc xDesc = xTensor.get_desc(); + void *currentX = get_ptr_from_tensor(xTensor, arch); + std::vector<TensorDesc> filterDescs = get_desc_from_tensors(filterTensors); + std::vector<void *> filters = get_data_from_tensors<void *>(filterTensors, arch); + std::vector<TensorDesc> biasDescs = get_desc_from_tensors(biasTensors); + std::vector<void *> biases = get_data_from_tensors<void *>(biasTensors, arch); + void *state = get_ptr_from_tensor(stateTensor, arch); + U32 tmpBytes = tmpTensor.bytes(); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + TensorDesc hDesc = hTensor.get_desc(); + void *currentH = get_ptr_from_tensor(hTensor, arch); + if (!IS_MALI_GPU(arch)) { + tmp = (U8 *)tmp + tmpOffset; + } + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = rnncell_cpu(xDesc, currentX, filterDescs.data(), (const void **)filters.data(), + biasDescs.data(), (const void **)biases.data(), state, rnnParamSpec, batchStrideX, + batchStrideH, tmpBytes, tmp, hDesc, currentH, archInfo->arch); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + GCLMem filterArray[2]; + filterArray[0] = *((GCLMem_t)filters[0]); + if (rnnParamSpec.numProjection > 0) { + filterArray[1] = *((GCLMem_t)filters[1]); + } + ret = rnncell_mali(((MaliPara_t)(archInfo->archPara))->handle, xDesc, (GCLMem_t)currentX, + filterDescs[0], filterArray, biasDescs[0], (GCLMem_t)biases[0], (GCLMem_t)state, + rnnParamSpec, batchStrideX, batchStrideH, tmpBytes, (GCLMem_t)tmp, hDesc, + (GCLMem_t)currentH, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } + return ret; +} diff --git a/compute/tensor/src/roialign.cpp b/compute/tensor/src/roialign.cpp new file mode 100644 index 00000000..08069c3f --- /dev/null +++ b/compute/tensor/src/roialign.cpp @@ -0,0 +1,80 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
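The shape rule applied by rnn_infer_output_size above is worth stating on its own: batch and step pass through unchanged, and only the hidden dimension depends on the cell configuration. A minimal standalone sketch (plain C++, not the Bolt API; the 256-output bidirectional cell is an invented example):

#include <cstdio>

// Mirrors the hDim computation in rnn_infer_output_size:
// output is (batch, step, hDim), hDim = numOutput, doubled for bidirectional runs.
static unsigned rnnOutputDim(unsigned numOutput, bool biDirection)
{
    unsigned num = biDirection ? 2 : 1;  // one direction, or forward + backward
    return num * numOutput;
}

int main()
{
    printf("hDim = %u\n", rnnOutputDim(256, true));  // prints hDim = 512
    return 0;
}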
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif + +inline EE roialign_infer_output_size_cpu( + std::vector<TensorDesc> inputDesc, RoiAlignParamSpec p, TensorDesc *outputDesc) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + CHECK_REQUIREMENT(inputDesc.size() == 3); + DataType idt0, idt1, idt2; + DataFormat idf0, idf1, idf2; + U32 in0, ic0, ih0, iw0; + U32 ih1, iw1; + U32 ilens2; + // feature map + CHECK_STATUS(tensor4dGet(inputDesc[0], &idt0, &idf0, &in0, &ic0, &ih0, &iw0)); + // rois + CHECK_STATUS(tensor2dGet(inputDesc[1], &idt1, &idf1, &ih1, &iw1)); + // batch indices + CHECK_STATUS(tensor1dGet(inputDesc[2], &idt2, &idf2, &ilens2)); + CHECK_REQUIREMENT(ih1 == ilens2); + CHECK_REQUIREMENT(iw1 == 4); + // output size + U32 on, oc, oh, ow; + // on = num_rois, oc = ic, oh = output_h, ow = output_w + on = ih1; + oc = ic0; + oh = p.output_h; + ow = p.output_w; + *outputDesc = tensor4d(idt0, on, oc, oh, ow); + return SUCCESS; +} + +EE roialign_infer_output_size( + std::vector<Tensor *> inputTensor, RoiAlignParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) +{ + UNUSED(archInfo); + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + std::vector<TensorDesc> inputDesc = get_desc_from_tensor_ptrs(inputTensor); + TensorDesc outputDesc = outputTensor->get_desc(); + + CHECK_STATUS(roialign_infer_output_size_cpu(inputDesc, p, &outputDesc)); + outputTensor->resize(outputDesc); + return SUCCESS; +} + +EE roialign( + std::vector<Tensor> inputTensor, RoiAlignParamSpec p, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + std::vector<TensorDesc> inputDesc = get_desc_from_tensors(inputTensor); + std::vector<void *> input = get_data_from_tensors<void *>(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = roialign_cpu(inputDesc, input, p, outputDesc, output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/scale.cpp b/compute/tensor/src/scale.cpp new file mode 100644 index 00000000..fea48db9 --- /dev/null +++ b/compute/tensor/src/scale.cpp @@ -0,0 +1,98 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
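roialign_infer_output_size_cpu above encodes the usual ROI Align contract: the ROI count replaces the batch dimension, channels are preserved, and the spatial size comes from the pooling parameters. A standalone sketch with made-up sizes (not the Bolt API):

#include <cstdio>

// Mirrors the checks and the output rule above: feature map (n, c, h, w),
// rois (num_rois, 4), batch indices (num_rois) => output (num_rois, c, output_h, output_w).
struct Dims4 {
    unsigned n, c, h, w;
};

static Dims4 roialignOutputDims(Dims4 featureMap, unsigned numRois, unsigned outH, unsigned outW)
{
    return Dims4{numRois, featureMap.c, outH, outW};
}

int main()
{
    Dims4 o = roialignOutputDims(Dims4{1, 256, 38, 50}, 100, 7, 7);
    printf("(%u %u %u %u)\n", o.n, o.c, o.h, o.w);  // (100 256 7 7)
    return 0;
}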
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/tensor_computing_x86.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +inline EE scale_infer_output_size_cpu(TensorDesc inputDesc, TensorDesc *outputDesc) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + + *outputDesc = inputDesc; + return SUCCESS; +} + +EE scale_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = scale_infer_output_size_mali( + inputDesc, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + ret = scale_infer_output_size_cpu(inputDesc, &outputDesc); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE scale(Tensor inputTensor, + void *alpha, + void *beta, + ScaleParamSpec p, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = scale_general(inputDesc, input, alpha, beta, p, outputDesc, output); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = scale_x86(inputDesc, input, alpha, beta, p, outputDesc, output); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = scale_arm(inputDesc, input, alpha, beta, p, outputDesc, output); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = scale_mali(((MaliPara_t)(archInfo->archPara))->handle, (GCLMem_t)alpha, + (GCLMem_t)beta, inputDesc, (GCLMem_t)input, outputDesc, (GCLMem_t)output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/slice.cpp 
b/compute/tensor/src/slice.cpp new file mode 100644 index 00000000..6be69a12 --- /dev/null +++ b/compute/tensor/src/slice.cpp @@ -0,0 +1,128 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +inline EE slice_infer_output_size_cpu( + TensorDesc inputDesc, SliceParamSpec p, std::vector<TensorDesc> *outputDesc) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + U32 num = (*outputDesc).size(); + int axis = (p.axis + inputDesc.nDims) % inputDesc.nDims; + I32 *slice_points = p.slice_points; + + bool splitEqual = true; + for (U32 i = 0; i < num; i++) { + if (0 != slice_points[i]) { + splitEqual = false; + break; + } + } + I32 target_axis = inputDesc.nDims - 1 - axis; + if (splitEqual) { + CHECK_REQUIREMENT(0 == inputDesc.dims[target_axis] % num); + inputDesc.dims[target_axis] /= num; + } + for (U32 i = 0; i < num; i++) { + (*outputDesc)[i] = inputDesc; + if (splitEqual) { + continue; + } + + I32 prev_point = 0; + if (i > 0) { + prev_point = slice_points[i - 1]; + } + I32 next_point = inputDesc.dims[target_axis]; + if (i < num - 1) { + next_point = slice_points[i]; + } + if (i == 0 && num == 1 && p.slice_size == 1) { // Could happen in onnx + next_point = slice_points[0]; + } + if (prev_point < 0) { + prev_point = prev_point + inputDesc.dims[target_axis]; + if (prev_point < 0) { + prev_point = 0; + } + } + if (next_point < 0) { + next_point = next_point + inputDesc.dims[target_axis]; + if (next_point < 0) { + next_point = 0; + } + } + (*outputDesc)[i].dims[target_axis] = next_point - prev_point; + } + return SUCCESS; +} + +EE slice_infer_output_size( + Tensor *inputTensor, SliceParamSpec p, std::vector<Tensor *> outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + std::vector<TensorDesc> outputDesc = get_desc_from_tensor_ptrs(outputTensor); + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + std::vector<GCLMemDesc> gclmemOutputDescs; + for (auto p : outputTensor) { + gclmemOutputDescs.push_back(ocl_get_desc(*p)); + } + CHECK_STATUS(slice_infer_output_size_mali( + inputDesc, p, &outputDesc, &gclmemInputDesc, gclmemOutputDescs.data())); + ocl_set_desc(inputTensor, gclmemInputDesc); + for (U32 i = 0; i < outputTensor.size(); i++)
{ + ocl_set_desc(outputTensor[i], gclmemOutputDescs[i]); + } +#endif + } else { + CHECK_STATUS(slice_infer_output_size_cpu(inputDesc, p, &outputDesc)); + } + for (U32 i = 0; i < outputTensor.size(); i++) { + outputTensor[i]->resize(outputDesc[i]); + } + return SUCCESS; +} + +EE slice(Tensor inputTensor, SliceParamSpec p, std::vector<Tensor> outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + std::vector<TensorDesc> outputDesc = get_desc_from_tensors(outputTensor); + std::vector<void *> output = get_data_from_tensors<void *>(outputTensor, arch); + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = slice_cpu(inputDesc, input, p, outputDesc, &output); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + ret = slice_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, p, + outputDesc, &output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/softmax.cpp b/compute/tensor/src/softmax.cpp new file mode 100644 index 00000000..c025d364 --- /dev/null +++ b/compute/tensor/src/softmax.cpp @@ -0,0 +1,115 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
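The slice_points convention handled by slice_infer_output_size_cpu above (all zeros means an even split, otherwise output i spans [slice_points[i-1], slice_points[i]) with negative points counted from the end) can be restated as a standalone sketch; the clamp-to-zero corner cases and the single-output ONNX case are omitted for brevity:

#include <cstdio>
#include <vector>

// Computes the per-output sizes along the sliced axis, mirroring the CPU shape inference.
static std::vector<int> sliceSizes(int axisLen, std::vector<int> points, int num)
{
    std::vector<int> sizes(num, axisLen / num);
    bool allZero = true;
    for (int p : points) {
        allZero = allZero && (p == 0);
    }
    if (allZero) {
        return sizes;  // even split across num outputs
    }
    for (int i = 0; i < num; i++) {
        int prev = (i > 0) ? points[i - 1] : 0;
        int next = (i < num - 1) ? points[i] : axisLen;
        if (prev < 0) prev += axisLen;  // negative points count from the end
        if (next < 0) next += axisLen;
        sizes[i] = next - prev;
    }
    return sizes;
}

int main()
{
    // axis of length 10 split at points {3, 7} -> pieces of size 3, 4, 3
    for (int s : sliceSizes(10, {3, 7}, 3)) {
        printf("%d ", s);
    }
    printf("\n");
    return 0;
}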
+ +#include "tensor_computing.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/tensor_computing_x86.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +EE softmax( + Tensor inputTensor, SoftmaxParamSpec p, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = softmax_general(inputDesc, input, p, outputDesc, output); +#endif +#ifdef _USE_X86 + } else if (IS_X86_AVX2(arch)) { + ret = softmax_x86(inputDesc, input, p, outputDesc, output); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = softmax_arm(inputDesc, input, p, outputDesc, output); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + ret = softmax_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, + p, (GCLMem_t)tmp, outputDesc, (GCLMem_t)output); +#endif + } + return ret; +} + +inline EE softmax_infer_output_size_cpu(TensorDesc inputDesc, TensorDesc *outputDesc) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + *outputDesc = inputDesc; + if (DF_NCHWC8 == (*outputDesc).df) { + (*outputDesc).df = DF_NCHW; + } + return SUCCESS; +} + +EE softmax_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = softmax_infer_output_size_mali( + inputDesc, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + ret = softmax_infer_output_size_cpu(inputDesc, &outputDesc); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE softmax_infer_forward_tmp_bytes(Tensor inputTensor, U32 *bytes, ArchInfo_t archInfo) +{ + if (bytes == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + TensorDesc inputDesc = inputTensor.get_desc(); + ret = softmax_infer_forward_tmp_bytes_mali( + inputDesc, bytes, ((MaliPara_t)(archInfo->archPara))->forwardRunInfo); +#endif + } else { + *bytes = 0; + ret = SUCCESS; + } + return ret; +} diff --git a/compute/tensor/src/space2depth.cpp b/compute/tensor/src/space2depth.cpp new file mode 100644 index 00000000..85b5c5b8 --- /dev/null +++ b/compute/tensor/src/space2depth.cpp @@ -0,0 +1,60 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
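softmax above only dispatches; the general/x86/arm kernels live in other files of this patch. For reference, the transform they implement is the usual numerically stable softmax, sketched standalone here (the exact loop order and the NCHWC8 handling of the real kernels are not shown in this diff):

#include <cmath>
#include <cstdio>

// Stable 1-D softmax: subtract the max before exponentiating to avoid overflow.
static void softmax1d(const float *x, float *y, int len)
{
    float maxv = x[0];
    for (int i = 1; i < len; i++) {
        maxv = std::fmax(maxv, x[i]);
    }
    float sum = 0;
    for (int i = 0; i < len; i++) {
        y[i] = std::exp(x[i] - maxv);
        sum += y[i];
    }
    for (int i = 0; i < len; i++) {
        y[i] /= sum;
    }
}

int main()
{
    float x[4] = {1.f, 2.f, 3.f, 4.f}, y[4];
    softmax1d(x, y, 4);
    for (float v : y) {
        printf("%.4f ", v);  // 0.0321 0.0871 0.2369 0.6439
    }
    printf("\n");
    return 0;
}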
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +EE space2depth_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = space2depth_infer_output_size_mali( + inputDesc, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } + outputTensor->resize(outputDesc); + return ret; +} + +EE space2depth(Tensor inputTensor, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(arch)) { +#ifdef _USE_MALI + ret = space2depth_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (GCLMem_t)input, outputDesc, (GCLMem_t)output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/split.cpp b/compute/tensor/src/split.cpp new file mode 100644 index 00000000..672337b4 --- /dev/null +++ b/compute/tensor/src/split.cpp @@ -0,0 +1,49 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
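space2depth above is GPU-only in this patch, so the shape transform is visible only inside the Mali kernel. For orientation, the standard space-to-depth rule moves each b x b spatial patch into channels; a worked example with an assumed block size of 2 (the kernel's actual block size and memory layout are not shown in this diff):

#include <cstdio>

int main()
{
    unsigned n = 1, c = 3, h = 224, w = 224, b = 2;
    // (n, c, h, w) -> (n, c*b*b, h/b, w/b)
    printf("(%u %u %u %u) -> (%u %u %u %u)\n", n, c, h, w, n, c * b * b, h / b, w / b);
    return 0;
}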
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include <vector> +#include "tensor_computing.h" +#if defined(_USE_GENERAL) || defined(_USE_NEON) || defined(_USE_X86) +#include "cpu/tensor_computing_cpu.h" +#endif + +EE split_infer_output_size(Tensor *inputTensor, std::vector<Tensor *> output) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + for (auto p : output) { + if (p == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + p->resize(inputDesc); + } + return SUCCESS; +} + +EE split(Tensor inputTensor, std::vector<Tensor> outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + std::vector<TensorDesc> outputDesc = get_desc_from_tensors(outputTensor); + std::vector<void *> output = get_data_from_tensors<void *>(outputTensor, arch); + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#if defined(_USE_GENERAL) || defined(_USE_NEON) || defined(_USE_X86) + ret = split_cpu(inputDesc, input, outputDesc, &output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/squeeze.cpp b/compute/tensor/src/squeeze.cpp new file mode 100644 index 00000000..066dc8b1 --- /dev/null +++ b/compute/tensor/src/squeeze.cpp @@ -0,0 +1,101 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
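Note the naming: split here is Caffe-style replication, not a partition. split_infer_output_size resizes every output to the full input descriptor, so each consumer receives a complete copy (split_cpu itself is not shown in this diff; the sketch below assumes it is a plain per-output copy):

#include <cstdio>
#include <cstring>

// One full copy of the input per output tensor, matching the inferred shapes above.
static void splitCopy(const float *in, float **outs, int numOutputs, size_t count)
{
    for (int i = 0; i < numOutputs; i++) {
        memcpy(outs[i], in, count * sizeof(float));
    }
}

int main()
{
    float in[4] = {1, 2, 3, 4}, o0[4], o1[4];
    float *outs[2] = {o0, o1};
    splitCopy(in, outs, 2, 4);
    printf("%g %g\n", o0[3], o1[3]);  // 4 4
    return 0;
}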
+ +#include "tensor_computing.h" +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif +#include <string.h> + +EE squeeze(Tensor inputTensor, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(arch)) { +#ifdef _USE_MALI + TensorDesc outputDesc = outputTensor.get_desc(); + ret = squeeze_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, + outputDesc, (GCLMem_t)output); +#endif +#ifdef _USE_CPU + } else { + if (output != input) { + memcpy(output, input, tensorNumBytes(inputDesc)); + } + ret = SUCCESS; +#endif + } + return ret; +} + +#ifdef _USE_CPU +EE squeeze_infer_output_size_cpu( + TensorDesc inputDesc, int *axes, int axesNum, TensorDesc *outputDesc) +{ + outputDesc->dt = inputDesc.dt; + for (U32 i = 0; i < inputDesc.nDims; i++) { + outputDesc->dims[i] = inputDesc.dims[i]; + } + for (int i = 0; i < axesNum; i++) { + int axis = axes[i]; + if (axis < 0) { + axis += inputDesc.nDims; + } + outputDesc->dims[inputDesc.nDims - 1 - axis] = 0; + } + U32 index = 0; + for (U32 i = 0; i < inputDesc.nDims; i++) { + if (outputDesc->dims[i] != 0) { + outputDesc->dims[index++] = outputDesc->dims[i]; + } + } + CHECK_REQUIREMENT(index + axesNum == inputDesc.nDims); + outputDesc->nDims = index; + outputDesc->df = getTensorDefaultDataFormat(outputDesc->nDims); + return SUCCESS; +} +#endif + +EE squeeze_infer_output_size( + Tensor *inputTensor, SqueezeParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = squeeze_infer_output_size_mali( + inputDesc, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif +#ifdef _USE_CPU + } else { + ret = squeeze_infer_output_size_cpu(inputDesc, p.axes, p.axes_num, &outputDesc); +#endif + } + outputTensor->resize(outputDesc); + return ret; +} diff --git a/compute/tensor/src/tensor_computing_type.cpp b/compute/tensor/src/tensor_computing_type.cpp new file mode 100644 index 00000000..5e1b6d13 --- /dev/null +++ b/compute/tensor/src/tensor_computing_type.cpp @@ -0,0 +1,174 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
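squeeze_infer_output_size_cpu above works by zero-marking the squeezed positions and compacting the rest; the same idea in plain N..W order (Bolt's dims[] is stored reversed, which the real code accounts for):

#include <cstdio>
#include <vector>

// Drop the size-1 axes named in axes; negative axes count from the back.
static std::vector<int> squeezeShape(std::vector<int> shape, std::vector<int> axes)
{
    for (int a : axes) {
        if (a < 0) {
            a += (int)shape.size();
        }
        shape[a] = 0;  // mark for removal, exactly like the zero-marking above
    }
    std::vector<int> out;
    for (int d : shape) {
        if (d != 0) {
            out.push_back(d);
        }
    }
    return out;
}

int main()
{
    // (1, 3, 1, 5) squeezed on axes {0, 2} -> (3, 5)
    for (int d : squeezeShape({1, 3, 1, 5}, {0, 2})) {
        printf("%d ", d);
    }
    printf("\n");
    return 0;
}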
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "types.h" +#include "tensor_computing_type.h" + +ConvolutionParamSpec createConvolutionParamSpec(U32 group, + U32 kernelH, + U32 kernelW, + U32 strideH, + U32 strideW, + U32 paddingT, + U32 paddingB, + U32 paddingL, + U32 paddingR, + U32 dilateH, + U32 dilateW, + U32 num_outputs, + ConvolutionMode convMode) +{ + ConvolutionParamSpec p; + p.group = group; + p.kernel_h = kernelH; + p.kernel_w = kernelW; + p.stride_h = strideH; + p.stride_w = strideW; + p.padding_top = paddingT; + p.padding_bottom = paddingB; + p.padding_left = paddingL; + p.padding_right = paddingR; + p.dilatedRate_h = dilateH; + p.dilatedRate_w = dilateW; + p.num_outputs = num_outputs; + p.convolution_type = convMode; + return p; +} + +FullyConnectedParamSpec createFullyConnectedParamSpec( + U32 num_outputs, U32 num_slices, I32 *slice_point) +{ + FullyConnectedParamSpec p; + p.num_outputs = num_outputs; + p.num_slices = num_slices; + if (num_slices > 1 && slice_point != nullptr) { + for (int i = 0; i < (int)num_slices; i++) { + p.slice_point[i] = slice_point[i]; + } + } + return p; +} + +PoolingParamSpec createPoolingParamSpec(PoolingMode pm, + U32 ksH, + U32 ksW, + U32 strideH, + U32 strideW, + U32 paddingT, + U32 paddingB, + U32 paddingL, + U32 paddingR, + RoundMode rm) +{ + PoolingParamSpec p; + p.mode = pm; + p.kernel_h = ksH; + p.kernel_w = ksW; + p.stride_h = strideH; + p.stride_w = strideW; + p.padding_top = paddingT; + p.padding_bottom = paddingB; + p.padding_left = paddingL; + p.padding_right = paddingR; + p.rm = rm; + return p; +} + +ReshapeParamSpec createReshapeParamSpec(I32 *shape_dims, I32 shape_size, I32 axis, I32 num_axes) +{ + ReshapeParamSpec p; + p.shape_size = shape_size; + p.axis = axis; + p.num_axes = num_axes; + if (shape_dims != nullptr && shape_size != 0) { + for (int i = 0; i < shape_size; i++) { + p.shape_dims[i] = shape_dims[i]; + } + } + return p; +} + +ClipParamSpec createClipParamSpec(float min, float max) +{ + ClipParamSpec p; + p.min = min; + p.max = max; + return p; +} + +SqueezeParamSpec createSqueezeParamSpec(int *axes, int axes_num) +{ + SqueezeParamSpec p; + p.axes_num = axes_num; + if (axes != nullptr && axes_num != 0) { + for (int i = 0; i < axes_num; i++) { + p.axes[i] = axes[i]; + } + } + return p; +} + +std::vector<TensorDesc> get_desc_from_tensors(std::vector<Tensor> tensors) +{ + int size = tensors.size(); + std::vector<TensorDesc> result(size); + for (int i = 0; i < size; i++) { + result[i] = tensors[i].get_desc(); + } + return result; +} + +std::vector<TensorDesc> get_desc_from_tensor_ptrs(std::vector<Tensor *> tensors) +{ + int size = tensors.size(); + std::vector<TensorDesc> result(size); + for (int i = 0; i < size; i++) { + result[i] = tensors[i]->get_desc(); + } + return result; +} + +std::vector<F32> get_scale_from_tensors(std::vector<Tensor> tensors) +{ + int size = tensors.size(); + std::vector<F32> result(size); + for (int i = 0; i < size; i++) { + result[i] = tensors[i].get_scale(); + } + return result; +} + +template <typename T> +std::vector<T> get_data_from_tensors(std::vector<Tensor> tensors, Arch arch) +{ + int size = tensors.size(); + std::vector<T> result(size); + for (int i = 0; i < size;
i++) { + result[i] = (T)get_ptr_from_tensor(tensors[i], arch); + } + return result; +} + +template <typename T> +std::vector<T> get_data_from_tensor_ptrs(std::vector<Tensor *> tensors, Arch arch) +{ + int size = tensors.size(); + std::vector<T> result(size); + for (int i = 0; i < size; i++) { + result[i] = (T)get_ptr_from_tensor(*tensors[i], arch); + } + return result; +} + +template std::vector<void *> get_data_from_tensors<void *>(std::vector<Tensor> tensors, Arch arch); +template std::vector<void *> get_data_from_tensor_ptrs<void *>(std::vector<Tensor *> tensors, Arch arch); diff --git a/compute/tensor/src/tfslice.cpp b/compute/tensor/src/tfslice.cpp new file mode 100644 index 00000000..e1e49774 --- /dev/null +++ b/compute/tensor/src/tfslice.cpp @@ -0,0 +1,55 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif + +EE tfslice_infer_output_size( + Tensor *inputTensor, TfSliceParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + EE ret = NOT_SUPPORTED; + if (IS_CPU(archInfo->arch)) { +#ifdef _USE_CPU + ret = tfslice_infer_output_size_cpu(inputDesc, p, &outputDesc); +#endif + } + outputTensor->resize(outputDesc); + return ret; +} + +EE tfslice(Tensor inputTensor, TfSliceParamSpec p, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = tfslice_cpu(inputDesc, input, p, outputDesc, output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/tile.cpp b/compute/tensor/src/tile.cpp new file mode 100644 index 00000000..a958687e --- /dev/null +++ b/compute/tensor/src/tile.cpp @@ -0,0 +1,80 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
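The create*ParamSpec factories above exist so call sites can fill the large param structs positionally in one place. A hypothetical call for an ordinary 3x3 stride-1 convolution with 64 outputs might look like the following (the ConvolutionMode value is an assumption; the enum's members are defined elsewhere in the framework):

// ConvolutionParamSpec p = createConvolutionParamSpec(
//     1,           // group
//     3, 3,        // kernel_h, kernel_w
//     1, 1,        // stride_h, stride_w
//     1, 1, 1, 1,  // padding top, bottom, left, right
//     1, 1,        // dilatedRate_h, dilatedRate_w
//     64,          // num_outputs
//     someConvolutionMode);  // hypothetical placeholder for a real ConvolutionMode value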
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +#include "tensor_computing.h" + +EE tile_infer_output_size( + Tensor *inputTensor, TileParamSpec tileParamSpec, Tensor *outputTensor, ArchInfo_t archInfo) +{ + auto inDim = inputTensor->get_desc(); + auto outDim = inDim; + if ((int)inDim.nDims == tileParamSpec.dimsSize) { + for (int i = 0; i < tileParamSpec.dimsSize; i++) { + outDim.dims[tileParamSpec.dimsSize - 1 - i] = + inDim.dims[tileParamSpec.dimsSize - 1 - i] * tileParamSpec.repeatsInfo[i]; + } + } else { + if (tileParamSpec.axis == -1) { + tileParamSpec.axis = 0; + } + outDim.dims[tileParamSpec.axis] = + outDim.dims[tileParamSpec.axis] * tileParamSpec.repeatsInfo[0]; + } + outputTensor->resize(outDim); + return SUCCESS; +} + +EE tile(Tensor inputTensor, TileParamSpec tileParamSpec, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + U8 *ptr = (U8 *)output; + int element_size = bytesOf(inputDesc.dt); + if (tileParamSpec.dimsSize == (int)inputDesc.nDims) { //onnx model support + ret = NOT_SUPPORTED; + } else { //caffe model support + int axis = tileParamSpec.axis; + if (axis == -1) { + axis = 0; + } + int length = 1; + for (U32 i = 0; i < inputDesc.nDims; i++) { + length = length * inputDesc.dims[i]; + } + if (axis == (int)inputDesc.nDims - 1) { + for (int i = 0; i < tileParamSpec.repeatsInfo[0]; i++) { + U8 *srcPtr = (U8 *)input; + U8 *desPtr = ptr + element_size * length * i; + memcpy(desPtr, srcPtr, element_size * length); + } + ret = SUCCESS; + } else if (axis == 0) { + int count = length / inputDesc.dims[axis]; + for (int i = 0; i < count; i++) { + for (int j = 0; j < tileParamSpec.repeatsInfo[0]; j++) { + U8 *srcPtr = (U8 *)input + element_size * inputDesc.dims[axis] * i; + U8 *desPtr = ptr + + element_size * inputDesc.dims[axis] * (tileParamSpec.repeatsInfo[0] * i + j); + memcpy(desPtr, srcPtr, element_size * inputDesc.dims[axis]); + } + } + ret = SUCCESS; + } else { + ret = NOT_SUPPORTED; + } + } + return ret; +} diff --git a/compute/tensor/src/transpose.cpp b/compute/tensor/src/transpose.cpp new file mode 100644 index 00000000..3542e8e3 --- /dev/null +++ b/compute/tensor/src/transpose.cpp @@ -0,0 +1,166 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
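The Caffe-style branch of tile above repeats whole blocks with memcpy; its axis == 0 case (Bolt's dims[] is reversed, so this is the innermost dimension) reduces to the following standalone loop:

#include <cstdio>
#include <cstring>

// Repeat each run of innerLen elements `repeats` times, as in the axis == 0 branch above:
// destination offset innerLen * (repeats * i + j) matches the source's pointer arithmetic.
static void tileInner(const float *in, float *out, int outerCount, int innerLen, int repeats)
{
    for (int i = 0; i < outerCount; i++) {
        for (int j = 0; j < repeats; j++) {
            memcpy(out + (size_t)innerLen * (repeats * i + j),
                in + (size_t)innerLen * i, innerLen * sizeof(float));
        }
    }
}

int main()
{
    float in[4] = {1, 2, 3, 4}, out[8];
    tileInner(in, out, 2, 2, 2);  // (1 2)(3 4) -> (1 2)(1 2)(3 4)(3 4)
    for (float v : out) {
        printf("%g ", v);
    }
    printf("\n");
    return 0;
}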
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include <algorithm> +#include "tensor_computing.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#if defined(_USE_X86) || defined(_USE_NEON) +#include "cpu/tensor_computing_cpu.h" +#endif +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif + +EE transpose(Tensor inputTensor, + TransposeParamSpec p, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + std::vector<U32> tmpDims(p.trans_dims, p.trans_dims + p.trans_size); + if (IS_CPU(arch)) { + // Keep transDims unchanged so that input resize does not lead to error + if (DF_NCHWC8 == inputDesc.df) { + if (4 == p.trans_size) { + auto ptr = std::find(tmpDims.begin(), tmpDims.end(), 1); + tmpDims.insert(ptr + 1, 4); + } + inputDesc.nDims = 5; + for (int i = 3; i >= 0; i--) { + inputDesc.dims[i + 1] = inputDesc.dims[i]; + } + inputDesc.dims[3] /= 8; + inputDesc.dims[0] = 8; + + TensorDesc desc = outputDesc; + desc.nDims = 5; + U32 idx = 4; + for (int i = 3; i >= 0; i--) { + if (1 == tmpDims[3 - i]) { // C + desc.dims[idx] = outputDesc.dims[i] / 8; + idx--; + desc.dims[idx] = 8; + idx--; + } else { + desc.dims[idx] = outputDesc.dims[i]; + idx--; + } + } + outputDesc = desc; + } + } + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = transpose_general(inputDesc, input, tmpDims.data(), outputDesc, output); +#endif +#if defined(_USE_X86) || defined(_USE_NEON) + } else if (IS_CPU(arch)) { + ret = transpose_cpu(inputDesc, input, tmpDims.data(), outputDesc, output); +#endif +#ifdef _USE_MALI + } else if (IS_MALI_GPU(arch)) { + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + ret = transpose_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (const GCLMem_t)input, p, (GCLMem_t)tmp, outputDesc, (GCLMem_t)output); +#endif + } + return ret; +} + +inline EE transpose_infer_output_size_cpu( + TensorDesc inputDesc, TransposeParamSpec p, TensorDesc *outputDesc) +{ + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + + U32 *dim = p.trans_dims; + *outputDesc = inputDesc; + U32 inputDim = inputDesc.nDims; + if (4 == inputDim) { + (*outputDesc).df = DF_NCHW; + } + U32 outputDim = (*outputDesc).nDims; + for (U32 i = 0; i < inputDim; i++) { + CHECK_REQUIREMENT(dim[i] <
inputDim); + // NOTE: TensorDesc.dims array is in [W H C N] order. + // so if you want to transpose [N C H W] format data, we use (dims - 1 - *) + // [5 6 7 8] + [0 3 2 1] = [5 8 7 6] + // [8 7 6 5] + [0 3 2 1] = [6 7 8 5] + (*outputDesc).dims[outputDim - 1 - i] = inputDesc.dims[inputDim - 1 - dim[i]]; + } + if ((*outputDesc).nDims >= 4) { + (*outputDesc).df = DF_NCHW; + } + if ((*outputDesc).nDims == 4 && p.trans_size == 3 && (*outputDesc).dims[0] == 1) { + (*outputDesc) = tensor3df(inputDesc.dt, DF_NCHW, (*outputDesc).dims[3], (*outputDesc).dims[2], (*outputDesc).dims[1]); + } + return SUCCESS; +} + +EE transpose_infer_output_size( + Tensor *inputTensor, TransposeParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = transpose_infer_output_size_mali( + inputDesc, p, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif + } else { + ret = transpose_infer_output_size_cpu(inputDesc, p, &outputDesc); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE transpose_infer_forward_tmp_bytes( + Tensor inputTensor, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo) +{ + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(outputTensor); + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + ret = transpose_infer_forward_tmp_bytes_mali( + inputDesc, outputDesc, &gclmemInputDesc, &gclmemOutputDesc, bytes); +#endif + } else { + *bytes = 0; + ret = SUCCESS; + } + return ret; +} diff --git a/compute/tensor/src/unsqueeze.cpp b/compute/tensor/src/unsqueeze.cpp new file mode 100644 index 00000000..299c8c1e --- /dev/null +++ b/compute/tensor/src/unsqueeze.cpp @@ -0,0 +1,102 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
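The permutation rule in transpose_infer_output_size_cpu above, stripped of the reversed-dims bookkeeping its NOTE explains, is simply outShape[i] = inShape[perm[i]]; a standalone check against the example in that comment:

#include <cstdio>

int main()
{
    unsigned inShape[4] = {5, 6, 7, 8};
    unsigned perm[4] = {0, 3, 2, 1};
    unsigned outShape[4];
    for (int i = 0; i < 4; i++) {
        outShape[i] = inShape[perm[i]];
    }
    for (unsigned d : outShape) {
        printf("%u ", d);  // 5 8 7 6, matching the source comment
    }
    printf("\n");
    return 0;
}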
+ +#include "tensor_computing.h" +#ifdef _USE_MALI +#include "gpu/mali/tensor_computing_mali.h" +#endif +#include <string.h> + +EE unsqueeze(Tensor inputTensor, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(arch)) { +#ifdef _USE_MALI + TensorDesc outputDesc = outputTensor.get_desc(); + ret = unsqueeze_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, + outputDesc, (GCLMem_t)output); +#endif +#ifdef _USE_CPU + } else { + if (output != input) { + memcpy(output, input, tensorNumBytes(inputDesc)); + } + ret = SUCCESS; +#endif + } + return ret; +} + +#ifdef _USE_CPU +EE unsqueeze_infer_output_size_cpu( + TensorDesc inputDesc, int *axes, int axesNum, TensorDesc *outputDesc) +{ + outputDesc->dt = inputDesc.dt; + outputDesc->nDims = inputDesc.nDims + axesNum; + outputDesc->df = getTensorDefaultDataFormat(outputDesc->nDims); + for (U32 i = 0; i < outputDesc->nDims; i++) { + outputDesc->dims[i] = 0; + } + for (int i = 0; i < axesNum; i++) { + int axis = axes[i]; + if (axis < 0) { + axis += outputDesc->nDims; + } + outputDesc->dims[outputDesc->nDims - 1 - axis] = 1; + } + U32 index = 0; + for (U32 i = 0; i < outputDesc->nDims; i++) { + if (outputDesc->dims[i] == 0) { + outputDesc->dims[i] = inputDesc.dims[index++]; + } + } + CHECK_REQUIREMENT(index == inputDesc.nDims); + return SUCCESS; +} +#endif + +EE unsqueeze_infer_output_size( + Tensor *inputTensor, UnsqueezeParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + + EE ret = NOT_SUPPORTED; + if (IS_MALI_GPU(archInfo->arch)) { +#ifdef _USE_MALI + GCLMemDesc gclmemInputDesc = ocl_get_desc(*inputTensor); + GCLMemDesc gclmemOutputDesc = ocl_get_desc(*outputTensor); + ret = unsqueeze_infer_output_size_mali( + inputDesc, &outputDesc, &gclmemInputDesc, &gclmemOutputDesc); + ocl_set_desc(inputTensor, gclmemInputDesc); + ocl_set_desc(outputTensor, gclmemOutputDesc); +#endif +#ifdef _USE_CPU + } else { + ret = unsqueeze_infer_output_size_cpu(inputDesc, p.axes, p.axes_num, &outputDesc); +#endif + } + outputTensor->resize(outputDesc); + return ret; +} diff --git a/compute/tensor/src/yolov3detectionoutput.cpp b/compute/tensor/src/yolov3detectionoutput.cpp new file mode 100644 index 00000000..32affba6 --- /dev/null +++ b/compute/tensor/src/yolov3detectionoutput.cpp @@ -0,0 +1,77 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
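unsqueeze_infer_output_size_cpu above is the inverse of squeeze: it pre-marks the new size-1 positions, then streams the input dims into the remaining slots. The same logic in plain N..W order:

#include <cstdio>
#include <vector>

// Insert a size-1 axis at every position in axes, preserving input dim order.
static std::vector<int> unsqueezeShape(std::vector<int> shape, std::vector<int> axes)
{
    std::vector<int> out(shape.size() + axes.size(), 0);
    for (int a : axes) {
        if (a < 0) {
            a += (int)out.size();  // negative axes count from the back
        }
        out[a] = 1;
    }
    size_t k = 0;
    for (size_t i = 0; i < out.size(); i++) {
        if (out[i] == 0) {
            out[i] = shape[k++];  // fill the unmarked slots in order
        }
    }
    return out;
}

int main()
{
    // (3, 5) unsqueezed on axes {0, 2} -> (1, 3, 1, 5)
    for (int d : unsqueezeShape({3, 5}, {0, 2})) {
        printf("%d ", d);
    }
    printf("\n");
    return 0;
}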
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif + +#define NUM_DETECTED_MAX 200 + +inline EE yolov3detectionoutput_infer_output_size_cpu(std::vector<TensorDesc> inputDesc, + Yolov3DetectionOutputParamSpec yolov3DetectionOutputParamSpec, + TensorDesc *outputDesc) +{ + UNUSED(yolov3DetectionOutputParamSpec); + if (nullptr == outputDesc) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt0; + idt0 = inputDesc[0].dt; + // output size + U32 oh, ow; + // oh = the first box for saving the number of available boxes(1) + the maximum number of detected boxes(NUM_DETECTED_MAX = 200) + oh = 1 + NUM_DETECTED_MAX; + // Each width is a 6 dimension vector, which stores [label, confidence, xmin, ymin, xmax, ymax] -> 6 + // The first box is [ number of available boxes, 0, 0, 0, 0, 0 ] + ow = 6; + *outputDesc = tensor2d(idt0, oh, ow); + return SUCCESS; +} + +EE yolov3detectionoutput_infer_output_size(std::vector<Tensor *> inputTensor, + Yolov3DetectionOutputParamSpec yolov3DetectionOutputParamSpec, + Tensor *outputTensor, + ArchInfo_t archInfo) +{ + UNUSED(archInfo); + if (outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + std::vector<TensorDesc> inputDesc = get_desc_from_tensor_ptrs(inputTensor); + TensorDesc outputDesc = outputTensor->get_desc(); + CHECK_STATUS(yolov3detectionoutput_infer_output_size_cpu( + inputDesc, yolov3DetectionOutputParamSpec, &outputDesc)); + outputTensor->resize(outputDesc); + return SUCCESS; +} + +EE yolov3detectionoutput(std::vector<Tensor> inputTensor, + Yolov3DetectionOutputParamSpec yolov3DetectionOutputParamSpec, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + std::vector<TensorDesc> inputDesc = get_desc_from_tensors(inputTensor); + std::vector<void *> input = get_data_from_tensors<void *>(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = yolov3detectionoutput_cpu( + inputDesc, input, yolov3DetectionOutputParamSpec, outputDesc, output, arch); +#endif + } + return ret; +} diff --git a/compute/tensor/tests/CMakeLists.txt b/compute/tensor/tests/CMakeLists.txt new file mode 100644 index 00000000..63622e8e --- /dev/null +++ b/compute/tensor/tests/CMakeLists.txt @@ -0,0 +1,67 @@ +function(tensor_test name) + add_executable(${name} ${name}.cpp) + link_tensor(${name}) + install(TARGETS ${name} + RUNTIME DESTINATION tests) +endfunction() + +set_test_c_cxx_flags() + +tensor_test(test_activation) +tensor_test(test_argmax) +tensor_test(test_attention) +tensor_test(test_check) +tensor_test(test_clip) +tensor_test(test_concat) +tensor_test(test_convolution) +tensor_test(test_deconvolution) +tensor_test(test_depthwise_convolution) +tensor_test(test_dilated_convolution) +tensor_test(test_detectionoutput) +tensor_test(test_eltwise) +tensor_test(test_fully_connected) +tensor_test(test_rnn) +tensor_test(test_power) +tensor_test(test_reduction) +tensor_test(test_pooling) +tensor_test(test_pooling_bp) +tensor_test(test_padding)
+tensor_test(test_priorbox) +tensor_test(test_reshape) +tensor_test(test_softmax) +tensor_test(test_split) +tensor_test(test_slice) +tensor_test(test_scale) +tensor_test(test_transpose) +tensor_test(test_non_max_suppression) +tensor_test(test_roialign) +tensor_test(test_l2normalization) +tensor_test(test_prelu) +tensor_test(test_normalization) +tensor_test(test_tile) + +tensor_test(test_convolution_int8) +tensor_test(test_depthwise_convolution_int8) +tensor_test(test_concat_int8) +tensor_test(test_pooling_int8) +tensor_test(test_convolution_bnn) + +if (USE_MALI) + if (USE_FP16) + tensor_test(test_convolution_ocl test_convolution_ocl.cpp) + tensor_test(test_deconvolution_ocl test_deconvolution_ocl.cpp) + tensor_test(test_channel_resize_ocl test_channel_resize_ocl.cpp) + tensor_test(test_depthwise_convolution_ocl test_depthwise_convolution_ocl.cpp) + tensor_test(test_depthwise_pointwise_convolution_ocl test_depthwise_pointwise_convolution_ocl.cpp) + tensor_test(test_fully_connected_ocl test_fully_connected_ocl.cpp) + tensor_test(test_multihead_attention_ocl test_multihead_attention_ocl.cpp) + tensor_test(test_padding_ocl test_padding_ocl.cpp) + tensor_test(test_prelu_ocl test_prelu_ocl.cpp) + tensor_test(test_pooling_ocl test_pooling_ocl.cpp) + tensor_test(test_softmax_h1w1_ocl test_softmax_h1w1_ocl.cpp) + tensor_test(test_power_ocl test_power_ocl.cpp) + tensor_test(test_transpose_ocl test_transpose_ocl.cpp) + tensor_test(test_concat_ocl test_concat_ocl.cpp) + tensor_test(test_reshape_ocl test_reshape_ocl.cpp) + endif (USE_FP16) +endif (USE_MALI) diff --git a/compute/tensor/tests/test_activation.cpp b/compute/tensor/tests/test_activation.cpp new file mode 100644 index 00000000..177206fb --- /dev/null +++ b/compute/tensor/tests/test_activation.cpp @@ -0,0 +1,124 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include <string.h> +#include "tensor_computing.h" +#include "ut_util.h" + +int activationFunctionTest(U32 in, + U32 ic, + U32 ih, + U32 iw, + DataType dt, + ActivationParamSpec activationDesc, + const char *activationType) +{ + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + DataFormat df = DF_NCHWC8; + memset(activationDesc.value, 0, sizeof(activationDesc.value)); + + TensorDesc dataDesc = tensor4df(dt, df, in, ic, ih, iw); + U32 len = tensorNumElements(dataDesc); + + U8 *data = ut_input_v(len, dt, UT_INIT_RANDOM); + + Tensor dataTensor = Tensor::alloc_sized(dataDesc); + Tensor dataTensorRef = Tensor::alloc_sized(dataDesc); + memcpy(get_ptr_from_tensor(dataTensor, UT_ARCH), data, tensorNumBytes(dataDesc)); + memcpy(get_ptr_from_tensor(dataTensorRef, UT_ARCH), data, tensorNumBytes(dataDesc)); + + if (UT_CHECK) { + // implementation under test + CHECK_STATUS(activation(dataTensor, activationDesc, dataTensor, &archInfo)); + + // naive implement + CHECK_STATUS(activation(dataTensorRef, activationDesc, dataTensorRef, &archInfo_org)); + + // check + ut_check_v(get_ptr_from_tensor(dataTensor, UT_ARCH), + get_ptr_from_tensor(dataTensorRef, UT_ARCH), dataTensor.length(), dt, 0.01, __FILE__, + __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(activation(dataTensor, activationDesc, dataTensor, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)=(%u %u %u %u)", in, ic, ih, iw, in, ic, ih, iw); + sprintf(buffer, "%20s, %80s", activationType, params); + double ops = 1.0 * in * ic * ih * iw; + ut_log(dt, buffer, ops, time); + + free(data); + + return 0; +} + +int activationTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 5); + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + + ActivationParamSpec activationDesc; + //test relu + activationDesc.mode = ACTIVATION_RELU; + activationFunctionTest(in, ic, ih, iw, dt, activationDesc, "Activation Relu"); + //test relu6 + activationDesc.mode = ACTIVATION_RELU6; + activationFunctionTest(in, ic, ih, iw, dt, activationDesc, "Activation Relu6"); + //test h swish + activationDesc.mode = ACTIVATION_H_SWISH; + activationFunctionTest(in, ic, ih, iw, dt, activationDesc, "Activation h_swish"); + //test h sigmoid + activationDesc.mode = ACTIVATION_H_SIGMOID; + activationFunctionTest(in, ic, ih, iw, dt, activationDesc, "Activation h_sigmoid"); + //test tanh + activationDesc.mode = ACTIVATION_TANH; + activationFunctionTest(in, ic, ih, iw, dt, activationDesc, "Activation tanh"); + //test gelu + activationDesc.mode = ACTIVATION_GELU; + activationFunctionTest(in, ic, ih, iw, dt, activationDesc, "Activation gelu"); + //test mish + activationDesc.mode = ACTIVATION_MISH; + activationFunctionTest(in, ic, ih, iw, dt, activationDesc, "Activation mish"); + //test sigmoid + activationDesc.mode = ACTIVATION_SIGMOID; + activationFunctionTest(in, ic, ih, iw, dt, activationDesc, "Activation sigmoid"); + + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + activationTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + activationTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/tests/test_argmax.cpp b/compute/tensor/tests/test_argmax.cpp similarity index 54% rename from tests/test_argmax.cpp rename to
compute/tensor/tests/test_argmax.cpp index 3b5abc28..49dc7b18 100644 --- a/tests/test_argmax.cpp +++ b/compute/tensor/tests/test_argmax.cpp @@ -1,74 +1,86 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
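The argmax diff below is typical of how every test in this patch migrates from the old TensorDesc/raw-pointer API to Tensor objects. Reduced to a minimal sketch (shape arbitrary; all calls as used in the surrounding tests):

    #include <string.h>
    #include "tensor_computing.h"
    #include "ut_util.h"

    // Minimal sketch of the Tensor lifecycle the rewritten tests follow:
    // attach a descriptor, allocate, then copy data in via get_ptr_from_tensor.
    static void tensorLifecycleSketch(const U8 *src)
    {
        Tensor input;
        input.resize(tensor4df(DT_F32, DF_NCHW, 1, 8, 16, 16));  // descriptor only
        input.alloc();                                           // backing memory
        memcpy(get_ptr_from_tensor(input, UT_ARCH), src, tensorNumBytes(input.get_desc()));
    }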
- #include #include "tensor_computing.h" #include "ut_util.h" -int argmaxTest(int argc, char** argv, DataType dt) { +int argmaxTest(int argc, char **argv, DataType dt) +{ CHECK_REQUIREMENT(argc == 6); + ArgMaxParamSpec p; U32 in = atoi(argv[1]); U32 ic = atoi(argv[2]); U32 ih = atoi(argv[3]); U32 iw = atoi(argv[4]); - I32 axis = atoi(argv[5]); + p.axis = atoi(argv[5]); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; DataFormat df = DF_NCHW; TensorDesc inDesc = tensor4df(dt, df, in, ic, ih, iw); - U8* input = ut_input_v(tensorNumElements(inDesc), dt, UT_INIT_RANDOM); - TensorDesc outDesc; - CHECK_STATUS(argmax_infer_output_size(inDesc, axis, &outDesc)); - U8* output = ut_input_v(tensorNumElements(outDesc), DT_U32, UT_INIT_ZERO); - U8* outputRef = ut_input_v(tensorNumElements(outDesc), DT_U32, UT_INIT_ZERO); + U8 *input = ut_input_v(tensorNumElements(inDesc), dt, UT_INIT_RANDOM); + Tensor inputTensor; + inputTensor.resize(inDesc); + inputTensor.alloc(); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inDesc)); + + Tensor outputTensor; + Tensor outputTensorRef; + CHECK_STATUS(argmax_infer_output_size(&inputTensor, p, &outputTensor, &archInfo)); + outputTensor.alloc(); + outputTensorRef.resize(outputTensor.get_desc()); + outputTensorRef.alloc(); + + Tensor nullTensor; if (UT_CHECK) { - CHECK_STATUS(argmax(inDesc, input, axis, outDesc, output, UT_ARCH)); + CHECK_STATUS(argmax(inputTensor, p, nullTensor, outputTensor, &archInfo)); // naive implement - CHECK_STATUS(argmax(inDesc, input, axis, outDesc, outputRef, CPU_GENERAL)); + CHECK_STATUS(argmax(inputTensor, p, nullTensor, outputTensorRef, &archInfo_org)); // check - ut_check_v(output, outputRef, tensorNumElements(outDesc), DT_U32, 0, __FILE__, __LINE__); + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), outputTensor.length(), DT_U32, 0, + __FILE__, __LINE__); } // benchmark double time_start = ut_time_ms(); for (int iter = 0; iter < UT_LOOPS; iter++) { - CHECK_STATUS(argmax(inDesc, input, axis, outDesc, output, UT_ARCH)); + CHECK_STATUS(argmax(inputTensor, p, nullTensor, outputTensor, &archInfo)); } double time_end = ut_time_ms(); double time = (time_end - time_start) / UT_LOOPS; // log performance data U32 on, oh, ow; - CHECK_STATUS(tensor3dGet(outDesc, &dt, &df, &on, &oh, &ow)); + CHECK_STATUS(tensor3dGet(outputTensor.get_desc(), &dt, &df, &on, &oh, &ow)); char buffer[150]; char params[120]; - sprintf(params, "(%u %u %u %u) %d =(%u %u %u)", - in, ic, ih, iw, axis, - on, oh, ow); + sprintf(params, "(%u %u %u %u) %d =(%u %u %u)", in, ic, ih, iw, p.axis, on, oh, ow); sprintf(buffer, "%20s, %80s", "Argmax", params); double ops = 1.0 * in * ic * ih * iw; - ut_log(dt, buffer, ops, time/UT_LOOPS); - - free(output); - free(outputRef); + ut_log(dt, buffer, ops, time / UT_LOOPS); return 0; } -int main(int argc, char** argv) { +int main(int argc, char **argv) +{ #ifdef _USE_FP16 argmaxTest(argc, argv, DT_F16); #endif diff --git a/tests/test_attention.cpp b/compute/tensor/tests/test_attention.cpp similarity index 50% rename from tests/test_attention.cpp rename to compute/tensor/tests/test_attention.cpp index 50935509..7316b50f 100644 --- a/tests/test_attention.cpp +++ b/compute/tensor/tests/test_attention.cpp @@ -1,49 +1,58 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include #include "tensor_computing.h" #include "ut_util.h" -int attentionTest(int argc, char** argv, DataType dt) { +int attentionTest(int argc, char **argv, DataType dt) +{ CHECK_REQUIREMENT(argc == 5); + AttentionParamSpec p; U32 batch = atoi(argv[1]); - U32 numHeads = atoi(argv[2]); - U32 fromSequenceLength = atoi(argv[3]); - U32 toSequenceLength = atoi(argv[4]); + p.num_heads = atoi(argv[2]); + p.from_sequence_length = atoi(argv[3]); + p.to_sequence_length = atoi(argv[4]); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; DataFormat df = DF_NORMAL; - TensorDesc inputDesc = tensor2df(dt, df, batch, toSequenceLength); - TensorDesc outputDesc; - CHECK_STATUS(attention_infer_output_size(inputDesc, numHeads, fromSequenceLength, toSequenceLength, &outputDesc)); - U32 inputLength = tensorNumElements(inputDesc); - U32 outputLength = tensorNumElements(outputDesc); + TensorDesc inDesc = tensor2df(dt, df, batch, p.to_sequence_length); + U32 inputLength = tensorNumElements(inDesc); + U8 *input = ut_input_v(inputLength, dt, UT_INIT_ZERO); + Tensor inputTensor = Tensor::alloc_sized(inDesc); - U8* input = ut_input_v(inputLength, dt, UT_INIT_ZERO); + Tensor outputTensor; + CHECK_STATUS(attention_infer_output_size(&inputTensor, p, &outputTensor)); + outputTensor.alloc(); + Tensor outputTensorRef = Tensor::alloc_sized(outputTensor.get_desc()); + ; + U32 outputLength = outputTensor.length(); for (U32 i = 0; i < batch; i++) { - U32 threshold = toSequenceLength / 2 + i; - for (U32 j = 0; j < toSequenceLength; j++) { + U32 threshold = p.to_sequence_length / 2 + i; + for (U32 j = 0; j < p.to_sequence_length; j++) { if (j < threshold) { switch (dt) { #ifdef _USE_FP32 case DT_F32: - ((F32*)input)[i * toSequenceLength + j] = 1; + ((F32 *)input)[i * 
p.to_sequence_length + j] = 1; break; #endif #ifdef _USE_FP16 case DT_F16: - ((F16*)input)[i * toSequenceLength + j] = 1; + ((F16 *)input)[i * p.to_sequence_length + j] = 1; break; #endif default: @@ -52,22 +61,24 @@ int attentionTest(int argc, char** argv, DataType dt) { } } } - U8* output = ut_input_v(outputLength, dt, UT_INIT_ZERO); - U8* outputRef = ut_input_v(outputLength, dt, UT_INIT_ZERO); - if(UT_CHECK) { - CHECK_STATUS(attention(inputDesc, input, outputDesc, output, UT_ARCH)); + + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inDesc)); + + if (UT_CHECK) { + CHECK_STATUS(attention(inputTensor, outputTensor, &archInfo)); // naive implement - CHECK_STATUS(attention(inputDesc, input, outputDesc, outputRef, CPU_GENERAL)); + CHECK_STATUS(attention(inputTensor, outputTensorRef, &archInfo_org)); // check - ut_check_v(output, outputRef, outputLength, dt, 0, __FILE__, __LINE__); + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), outputLength, dt, 0, __FILE__, __LINE__); } // benchmark double time_start = ut_time_ms(); - for(int iter=0; iter < UT_LOOPS; iter++) { - CHECK_STATUS(attention(inputDesc, input, outputDesc, output, UT_ARCH)); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(attention(inputTensor, outputTensor, &archInfo)); } double time_end = ut_time_ms(); double time = (time_end - time_start) / UT_LOOPS; @@ -75,21 +86,19 @@ int attentionTest(int argc, char** argv, DataType dt) { // log performance data char buffer[150]; char params[120]; - sprintf(params, "(%u %u)=(%u %u %u %u)", - batch, fromSequenceLength, - batch, numHeads, fromSequenceLength, toSequenceLength); + sprintf(params, "(%u %u)=(%u %u %u %u)", batch, p.from_sequence_length, batch, p.num_heads, + p.from_sequence_length, p.to_sequence_length); sprintf(buffer, "%20s, %80s", "Attention", params); double ops = 3.0 * outputLength; - ut_log(dt, buffer, ops, time/UT_LOOPS); + ut_log(dt, buffer, ops, time / UT_LOOPS); free(input); - free(output); - free(outputRef); return 0; } -int main(int argc, char** argv) { +int main(int argc, char **argv) +{ #ifdef _USE_FP16 attentionTest(argc, argv, DT_F16); #endif diff --git a/compute/tensor/tests/test_axpby.cpp b/compute/tensor/tests/test_axpby.cpp new file mode 100644 index 00000000..70b2f351 --- /dev/null +++ b/compute/tensor/tests/test_axpby.cpp @@ -0,0 +1,75 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
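test_axpby below drives vector_vector_axpby, which updates y in place as y = a*x + b*y over two length-len vectors. As a reference point, a scalar FP32 sketch of the same computation (axpbyRef is a hypothetical name):

    // Scalar reference for vector_vector_axpby: y[i] = a * x[i] + b * y[i].
    static void axpbyRef(float a, const float *x, float b, float *y, int len)
    {
        for (int i = 0; i < len; i++) {
            y[i] = a * x[i] + b * y[i];
        }
    }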
+ +#include +#include "blas_enhance.h" +#include "ut_util.h" + +int axpbyTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 4); + U32 len = atoi(argv[1]); + F32 a = atof(argv[2]); + F32 b = atof(argv[3]); + + TensorDesc xDesc = tensor1d(dt, len); + TensorDesc yDesc = tensor1d(dt, len); + + U8 *x = ut_input_v(len, dt, UT_INIT_RANDOM); + U8 *y = ut_input_v(len, dt, UT_INIT_RANDOM); + U8 *y_ref = ut_input_v(len, dt, UT_INIT_ZERO); + + memcpy(y_ref, y, tensorNumBytes(yDesc)); + // check + if (UT_CHECK) { + CHECK_STATUS(vector_vector_axpby(a, xDesc, x, b, yDesc, y, UT_ARCH)); + + // naive implement + CHECK_STATUS(vector_vector_axpby(a, xDesc, x, b, yDesc, y_ref, CPU_GENERAL)); + + ut_check_v(y, y_ref, len, dt, 0.01, __FILE__, __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + vector_vector_axpby(a, xDesc, x, b, yDesc, y, UT_ARCH); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%.2f * %u) + (%.2f * %u) = (%u)", a, len, b, len, len); + sprintf(buffer, "%20s, %80s", "VectorVectoraXpbY", params); + double ops = 3.0 * len; + ut_log(dt, buffer, ops, time); + + free(x); + free(y); + free(y_ref); + + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + axpbyTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + axpbyTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_channel_resize_ocl.cpp b/compute/tensor/tests/test_channel_resize_ocl.cpp new file mode 100644 index 00000000..6ba31933 --- /dev/null +++ b/compute/tensor/tests/test_channel_resize_ocl.cpp @@ -0,0 +1,166 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "types.h" +#include +#include "tensor_computing.h" +#include "ut_util.h" +#include "gcl.h" +#include "libkernelsource.h" +#include "iostream" + +#ifdef _USE_FP16 +inline GCLMem_t alloc(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_map(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->mapped_alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_bytes(Tensor tensor, U32 size) +{ + auto mem = (OclMemory *)tensor.get_memory(); + GCLMem_t ptr = NULL; + if (size > 0) { + mem->resize(tensor1d(DT_U8, size)); + mem->alloc(); + ptr = (GCLMem_t)mem->get_ptr(); + } + return ptr; +} + +int channelresizeTest(int argc, char *argv[], DataType dt) +{ + CHECK_REQUIREMENT(argc == 8); + // in data + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + + ChannelResizeParamSpec p; + p.channel_before = atoi(argv[5]); + p.channel_after = atoi(argv[6]); + p.group = atoi(argv[7]); + // output + U32 on = in; + U32 oc = p.channel_after; + U32 oh = ih; + U32 ow = iw; + + CHECK_REQUIREMENT(in == 1 && on == 1); + CHECK_REQUIREMENT(p.channel_before == (int)ic); + + ArchInfo archInfo; + archInfo.arch = MALI; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + TensorDesc inputDesc_cpu, inputDesc_gpu, outputDesc; + inputDesc_cpu = tensor4df(dt, DF_NCHW, in, ic, ih, iw); + inputDesc_gpu = tensor4df(dt, DF_NCHW, in, ic, ih, iw); + outputDesc = tensor4df(dt, DF_NCHW, in, oc, oh, ow); + + // setup input + U8 *input_cpu = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM); + U8 *output_cpu = ut_input_v(on * oc * oh * ow, dt, UT_INIT_RANDOM); + U8 *output_gpu = NULL; + F16 *in_val = (F16 *)input_cpu; + U32 len_in = tensorNumElements(inputDesc_cpu); + for (U32 i = 0; i < len_in; i++) { + in_val[i] = i; + } + + U32 len = tensorNumElements(outputDesc); + F16 *out_val = (F16 *)output_cpu; + for (U32 i = 0; i < len; i++) { + out_val[i] = in_val[i]; + } + + std::shared_ptr handleSharedPtr = OCLContext::getInstance().handle; + GCLHandle_t handle = handleSharedPtr.get(); + std::vector kernelVec; + handle->kernelVec = &kernelVec; + Tensor inputTensor = Tensor(OCLMem); + Tensor outputTensor = Tensor(OCLMem); + Tensor tmpTensor = Tensor(OCLMem); + inputTensor.resize(inputDesc_gpu); + + MaliPara maliPara; + maliPara.handle = handle; + archInfo.archPara = &maliPara; + + CHECK_STATUS(channel_resize_infer_output_size(&inputTensor, p, &outputTensor, &archInfo)); + outputDesc = outputTensor.get_desc(); + CHECK_REQUIREMENT(tensorNumElements(outputDesc) == on * oc * oh * ow); + + GCLMem_t output = alloc_map(outputTensor); + GCLMem_t input = alloc(inputTensor); + CHECK_STATUS(gcl_fill_memory_zero(handle, input)); + + U32 maxBytes = 0; + U32 tmpBytes = 0; + tmpBytes = tensorNumBytes(inputDesc_gpu); + maxBytes = (tmpBytes > maxBytes) ? 
tmpBytes : maxBytes; + GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes); + + CHECK_STATUS(ocl_set_input(handle, input, inputDesc_gpu, input_cpu, tmpbuf, true)); + CHECK_STATUS(channel_resize(inputTensor, p, outputTensor, &archInfo)); + /*warm up*/ + UNI_INFO_LOG("Warm up gpu:\n") + for (U32 i = 0; i < 2; i++) { + CHECK_STATUS(gcl_run_kernelVec(handle)); + } + + UNI_INFO_LOG("Run:\n") +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size())); + double time = handle->t_execute * 0.001; +#else + CHECK_STATUS(gcl_run_kernelVec(handle)); +#endif + CHECK_STATUS(ocl_get_output(handle, output, outputDesc, true)); + output_gpu = output->mapPtrArray.back(); + + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)->(%u %u %u %u)", in, ic, ih, iw, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "ChannelResize", params); +#ifdef _DEBUG + double ops = 1.0 * on * oc * oh * ow; + ut_log(dt, buffer, ops, time); +#endif + + ut_check_a(output_gpu, output_cpu, on * oc * ow * oh, dt); + CHECK_STATUS(gcl_finish(handle)); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + free(input_cpu); + free(output_cpu); + return 0; +} +#endif + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + channelresizeTest(argc, argv, DT_F16); +#endif + return 0; +} diff --git a/tests/test_check.cpp b/compute/tensor/tests/test_check.cpp similarity index 52% rename from tests/test_check.cpp rename to compute/tensor/tests/test_check.cpp index e14428bf..93dff6c0 100644 --- a/tests/test_check.cpp +++ b/compute/tensor/tests/test_check.cpp @@ -1,53 +1,72 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
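test_check below compares two tensors of shape (in, ic, ih, iw) under CHECK_EQUAL and produces one I32 result per batch item (the logged output shape is just (in)). A reference sketch under the assumption that each result flags whether every element of that sample matches (checkEqualRef and the per-sample reduction are assumptions of this sketch, not taken from the patch):

    // Assumed CHECK_EQUAL semantics: out[n] = 1 iff all chw elements of
    // sample n are equal in a and b; out has one I32 entry per batch item.
    static void checkEqualRef(const float *a, const float *b, int n, int chw, int *out)
    {
        for (int i = 0; i < n; i++) {
            out[i] = 1;
            for (int j = 0; j < chw; j++) {
                if (a[i * chw + j] != b[i * chw + j]) {
                    out[i] = 0;
                    break;
                }
            }
        }
    }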
- #include #include "tensor_computing.h" #include "ut_util.h" -int checkTest(int argc, char** argv, DataType dt) { +int checkTest(int argc, char **argv, DataType dt) +{ CHECK_REQUIREMENT(argc == 5); U32 in = atoi(argv[1]); U32 ic = atoi(argv[2]); U32 ih = atoi(argv[3]); U32 iw = atoi(argv[4]); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; DataFormat df = DF_NCHW; - CheckMode checkMode = CHECK_EQUAL; + CheckParamSpec p; + p.check_mode = CHECK_EQUAL; TensorDesc inDesc = tensor4df(dt, df, in, ic, ih, iw); - U8* inputA = ut_input_v(tensorNumElements(inDesc), dt, UT_INIT_RANDOM); - U8* inputB = ut_input_v(tensorNumElements(inDesc), dt, UT_INIT_RANDOM); - TensorDesc outDesc; - CHECK_STATUS(check_infer_output_size(inDesc, &outDesc)); - I32* output = (I32*)ut_input_v(tensorNumElements(outDesc), DT_I32, UT_INIT_ZERO); - I32* outputRef = (I32*)ut_input_v(tensorNumElements(outDesc), DT_I32, UT_INIT_ZERO); + U8 *inputA = ut_input_v(tensorNumElements(inDesc), dt, UT_INIT_RANDOM); + U8 *inputB = ut_input_v(tensorNumElements(inDesc), dt, UT_INIT_RANDOM); + + Tensor inputTensorA; + Tensor inputTensorB; + inputTensorA.resize(inDesc); + inputTensorB.resize(inDesc); + inputTensorA.alloc(); + inputTensorB.alloc(); + memcpy(get_ptr_from_tensor(inputTensorA, UT_ARCH), inputA, tensorNumBytes(inDesc)); + memcpy(get_ptr_from_tensor(inputTensorB, UT_ARCH), inputB, tensorNumBytes(inDesc)); + + Tensor outputTensor; + Tensor outputTensorRef; + CHECK_STATUS(check_infer_output_size({&inputTensorA, &inputTensorB}, &outputTensor, &archInfo)); + outputTensor.alloc(); + outputTensorRef.resize(outputTensor.get_desc()); + outputTensorRef.alloc(); if (UT_CHECK) { - CHECK_STATUS(check(inDesc, inputA, inDesc, inputB, checkMode, outDesc, output, UT_ARCH)); + CHECK_STATUS(check(inputTensorA, inputTensorB, p, outputTensor, &archInfo)); // naive implement - CHECK_STATUS(check(inDesc, inputA, inDesc, inputB, checkMode, outDesc, outputRef, CPU_GENERAL)); + CHECK_STATUS(check(inputTensorA, inputTensorB, p, outputTensorRef, &archInfo_org)); // check - ut_check_v(output, outputRef, tensorNumElements(outDesc), DT_I32, 0, __FILE__, __LINE__); + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), outputTensor.length(), DT_I32, 0, + __FILE__, __LINE__); } // benchmark double time_start = ut_time_ms(); for (int iter = 0; iter < UT_LOOPS; iter++) { - CHECK_STATUS(check(inDesc, inputA, inDesc, inputB, checkMode, outDesc, output, UT_ARCH)); + CHECK_STATUS(check(inputTensorA, inputTensorB, p, outputTensor, &archInfo)); } double time_end = ut_time_ms(); double time = (time_end - time_start) / UT_LOOPS; @@ -55,20 +74,16 @@ int checkTest(int argc, char** argv, DataType dt) { // log performance data char buffer[150]; char params[120]; - sprintf(params, "(%u %u %u %u)=(%u)", - in, ic, ih, iw, - in); + sprintf(params, "(%u %u %u %u)=(%u)", in, ic, ih, iw, in); sprintf(buffer, "%20s, %80s", "Check", params); double ops = 1.0 * in * ic * ih * iw; - ut_log(dt, buffer, ops, time/UT_LOOPS); - - free(output); - free(outputRef); + ut_log(dt, buffer, ops, time / UT_LOOPS); return 0; } -int main(int argc, char** argv) { +int main(int argc, char **argv) +{ #ifdef _USE_FP16 checkTest(argc, argv, DT_F16); #endif diff --git a/tests/test_clip.cpp b/compute/tensor/tests/test_clip.cpp similarity index 51% rename from tests/test_clip.cpp rename to compute/tensor/tests/test_clip.cpp index b677224d..32b4d251 100644 --- a/tests/test_clip.cpp +++ 
b/compute/tensor/tests/test_clip.cpp @@ -1,48 +1,61 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
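test_clip below validates the elementwise clamp of every input value into [p.min, p.max]; the scalar reference is a one-liner (clipRef is a hypothetical name):

    // Scalar reference for clip: clamp x into [min_value, max_value].
    static inline float clipRef(float x, float min_value, float max_value)
    {
        return (x < min_value) ? min_value : ((x > max_value) ? max_value : x);
    }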
- #include "tensor_computing.h" #include "ut_util.h" -int clipTest(int argc, char** argv, DataType dt) { +int clipTest(int argc, char **argv, DataType dt) +{ CHECK_REQUIREMENT(argc == 4); U32 len = atoi(argv[1]); - F32 min_value = atof(argv[2]); - F32 max_value = atof(argv[3]); - - TensorDesc input_desc = tensor1d(dt, len); - TensorDesc output_desc; - CHECK_STATUS(clip_infer_output_size(input_desc, &output_desc, UT_ARCH)); - - U8* input = ut_input_v(len, dt, UT_INIT_RANDOM); - U8* output = ut_input_v(len, dt, UT_INIT_ZERO); - U8* output_ref = ut_input_v(len, dt, UT_INIT_ZERO); + ClipParamSpec p; + p.min = atof(argv[2]); + p.max = atof(argv[3]); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + TensorDesc inDesc = tensor1d(dt, len); + U8 *input = ut_input_v(len, dt, UT_INIT_RANDOM); + Tensor inputTensor; + inputTensor.resize(inDesc); + inputTensor.alloc(); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inDesc)); + + Tensor outputTensor; + Tensor outputTensorRef; + CHECK_STATUS(clip_infer_output_size(&inputTensor, &outputTensor, &archInfo)); + outputTensor.alloc(); + outputTensorRef.resize(outputTensor.get_desc()); + outputTensorRef.alloc(); if (UT_CHECK) { - CHECK_STATUS(clip(&min_value, &max_value, input_desc, input, output_desc, output, UT_ARCH)); + CHECK_STATUS(clip(inputTensor, p, outputTensor, &archInfo)); // naive implement - CHECK_STATUS(clip(&min_value, &max_value, input_desc, input, output_desc, output_ref, CPU_GENERAL)); + CHECK_STATUS(clip(inputTensor, p, outputTensorRef, &archInfo_org)); // check - ut_check_v(output, output_ref, len, dt, 0, __FILE__, __LINE__); + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), outputTensor.length(), dt, 0, __FILE__, + __LINE__); } // benchmark double time_start = ut_time_ms(); - for (int iter = 0; iter < UT_LOOPS; iter ++) { - CHECK_STATUS(clip(&min_value, &max_value, input_desc, input, output_desc, output, UT_ARCH)); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(clip(inputTensor, p, outputTensor, &archInfo)); } double time_end = ut_time_ms(); double time = (time_end - time_start) / UT_LOOPS; @@ -50,21 +63,18 @@ int clipTest(int argc, char** argv, DataType dt) { // log performance data char buffer[150]; char params[120]; - sprintf(params, "(%u)=(%u)", - len, len); + sprintf(params, "(%u)=(%u)", len, len); sprintf(buffer, "%20s, %80s", "Clip", params); double ops = 2.0 * len; - ut_log(dt, buffer, ops, time/UT_LOOPS); + ut_log(dt, buffer, ops, time / UT_LOOPS); free(input); - free(output); - free(output_ref); return 0; } - -int main(int argc, char** argv) { +int main(int argc, char **argv) +{ #ifdef _USE_FP16 clipTest(argc, argv, DT_F16); #endif diff --git a/compute/tensor/tests/test_concat.cpp b/compute/tensor/tests/test_concat.cpp new file mode 100644 index 00000000..9f5b0202 --- /dev/null +++ b/compute/tensor/tests/test_concat.cpp @@ -0,0 +1,135 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include <string.h> + +#include "tensor_computing.h" +#include "ut_util.h" + +int concatTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc > 2); + ConcatParamSpec p; + int num = atoi(argv[1]); + p.axis = atoi(argv[2]); + CHECK_REQUIREMENT(p.axis == 0 || p.axis == 1); + CHECK_REQUIREMENT(argc == 1 + 2 + (num + 1) * 4); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + + std::vector<Tensor> inTensors(num); + std::vector<Tensor *> inTensorPtr(num); + Tensor outTensor; + + for (int i = 0; i < num; i++) { + std::vector<U32> in_dim(4); + in_dim[0] = atoi(argv[3 + i * 4]); + in_dim[1] = atoi(argv[3 + i * 4 + 1]); + in_dim[2] = atoi(argv[3 + i * 4 + 2]); + in_dim[3] = atoi(argv[3 + i * 4 + 3]); + TensorDesc inDesc; + if (in_dim[1] % 8 == 0) { + inDesc = tensor4df(dt, DF_NCHWC8, in_dim[0], in_dim[1], in_dim[2], in_dim[3]); + } else { + inDesc = tensor4df(dt, DF_NCHW, in_dim[0], in_dim[1], in_dim[2], in_dim[3]); + } + inTensors[i].resize(inDesc); + inTensorPtr[i] = &inTensors[i]; + } + U32 on = atoi(argv[3 + num * 4]); + U32 oc = atoi(argv[3 + num * 4 + 1]); + U32 oh = atoi(argv[3 + num * 4 + 2]); + U32 ow = atoi(argv[3 + num * 4 + 3]); + + CHECK_STATUS(concat_infer_output_size(inTensorPtr, p, &outTensor, &archInfo)); + + U32 in_len = 0; + for (int i = 0; i < num; i++) { + in_len += inTensors[i].length(); + } + U32 out_len = outTensor.length(); + CHECK_REQUIREMENT(in_len == out_len && out_len == on * oc * oh * ow); + + // setup tmp + U32 tmpBytes; + CHECK_STATUS(concat_infer_forward_tmp_bytes(inTensors, &tmpBytes, &archInfo)); + Tensor tmpTensor; + tmpTensor.resize(tensor1d(DT_U8, tmpBytes)); + tmpTensor.alloc(); + + U8 *outputRef = ut_input_v(in_len, dt, UT_INIT_RANDOM); + U8 *tmp = ut_input_v(tmpBytes, dt, UT_INIT_RANDOM); + U8 *tmpPtr = tmp; + + U32 count = 0; + TensorDesc outDesc = outTensor.get_desc(); + for (int i = 0; i < num; i++) { + inTensors[i].alloc(); + TensorDesc inputDesc = inTensors[i].get_desc(); + U32 bytes = tensorNumBytes(inputDesc); + TensorDesc tmpDesc = inputDesc; + tmpDesc.df = outDesc.df; + U8 *srcPtr = (U8 *)get_ptr_from_tensor(inTensors[i], UT_ARCH); + if (inputDesc.df == DF_NCHW && outDesc.df == DF_NCHWC8) { + transformNCHWToNCHWC8(inputDesc, srcPtr, tmpDesc, tmpPtr); + srcPtr = tmpPtr; + } else if (inputDesc.df == DF_NCHWC8 && outDesc.df == DF_NCHW) { + transformToNCHW(inputDesc, srcPtr, tmpDesc, tmpPtr); + srcPtr = tmpPtr; + } + memcpy(outputRef + count, srcPtr, bytes); + count += bytes; + tmpPtr += bytes; + } + outTensor.alloc(); + + if
(UT_CHECK) { + CHECK_STATUS(concat(inTensors, p, tmpTensor, outTensor, &archInfo)); + + // check + ut_check_v( + get_ptr_from_tensor(outTensor, UT_ARCH), outputRef, in_len, dt, 0, __FILE__, __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(concat(inTensors, p, tmpTensor, outTensor, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "%d (*)/%u=(%u %u %u %u)", num, p.axis, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "Concat", params); + double ops = 1.0 * out_len; + ut_log(dt, buffer, ops, time); + + free(tmp); + free(outputRef); + + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + concatTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + concatTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/tests/test_concat_int8.cpp b/compute/tensor/tests/test_concat_int8.cpp similarity index 53% rename from tests/test_concat_int8.cpp rename to compute/tensor/tests/test_concat_int8.cpp index 31eb9bb3..0c1e7351 100644 --- a/tests/test_concat_int8.cpp +++ b/compute/tensor/tests/test_concat_int8.cpp @@ -1,94 +1,108 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
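test_concat above converts inputs between DF_NCHW and DF_NCHWC8 (via transformNCHWToNCHWC8/transformToNCHW) before byte-comparing against the reference. In the C8 format the channel dimension is blocked by eight, with the block lane innermost; a sketch of the assumed address mapping for C divisible by 8 (nchwc8Offset is a hypothetical helper):

    // Assumed NCHWC8 layout: N x (C/8) x H x W x 8, i.e. channel c splits into
    // block c/8 (outer dimension) and lane c%8 (innermost dimension).
    static inline int nchwc8Offset(int n, int c, int h, int w, int C, int H, int W)
    {
        return (((n * (C / 8) + c / 8) * H + h) * W + w) * 8 + (c % 8);
    }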
- #include #include "tensor_computing.h" #include "ut_util.h" #ifdef _USE_INT8 -int int8ConcatTest(int argc, char** argv, DataType dt){ +int int8ConcatTest(int argc, char **argv, DataType dt) +{ CHECK_REQUIREMENT(argc > 2); + ConcatParamSpec p; int num = atoi(argv[1]); - U32 axis = atoi(argv[2]); - CHECK_REQUIREMENT(axis == 0 || axis == 1); - CHECK_REQUIREMENT(argc == 1 + 2 + (num+1)*4); - - std::vector in_desc(num); - std::vector in_desc_ref(num); - TensorDesc out_desc; - std::vector> in_dims(num); - for (int i = 0; i < num; i++){ + p.axis = atoi(argv[2]); + CHECK_REQUIREMENT(p.axis == 0 || p.axis == 1); + CHECK_REQUIREMENT(argc == 1 + 2 + (num + 1) * 4); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + + std::vector inTensors(num); + std::vector inTensorsRef(num); + std::vector inTensorPtr(num); + Tensor outTensor; + + for (int i = 0; i < num; i++) { std::vector in_dim(4); in_dim[0] = atoi(argv[3 + i * 4]); in_dim[1] = atoi(argv[3 + i * 4 + 1]); in_dim[2] = atoi(argv[3 + i * 4 + 2]); in_dim[3] = atoi(argv[3 + i * 4 + 3]); - in_dims[i] = in_dim; - in_desc[i] = tensor4df(DT_I8, DF_NCHWC8, in_dim[0], in_dim[1], in_dim[2], in_dim[3]); - in_desc_ref[i] = in_desc[i]; - in_desc_ref[i].dt = dt; + TensorDesc inDesc; + if (in_dim[1] % 8 == 0) { + inDesc = tensor4df(DT_I8, DF_NCHWC8, in_dim[0], in_dim[1], in_dim[2], in_dim[3]); + } else { + inDesc = tensor4df(DT_I8, DF_NCHW, in_dim[0], in_dim[1], in_dim[2], in_dim[3]); + } + inTensors[i].resize(inDesc); + inDesc.dt = dt; + inTensorsRef[i].resize(inDesc); + inTensorPtr[i] = &inTensors[i]; } U32 on = atoi(argv[3 + num * 4]); U32 oc = atoi(argv[3 + num * 4 + 1]); U32 oh = atoi(argv[3 + num * 4 + 2]); U32 ow = atoi(argv[3 + num * 4 + 3]); - CHECK_STATUS(concat_infer_output_size(in_desc, &out_desc, axis, UT_ARCH)); + CHECK_STATUS(concat_infer_output_size(inTensorPtr, p, &outTensor, &archInfo)); U32 in_len = 0; - for (int i = 0; i < num; i++){ - in_len += tensorNumElements(in_desc[i]); + for (int i = 0; i < num; i++) { + in_len += inTensors[i].length(); } - U32 out_len = tensorNumElements(out_desc); + U32 out_len = outTensor.length(); CHECK_REQUIREMENT(in_len == out_len && out_len == on * oc * oh * ow); - std::vector input_ref(num); - std::vector input(num); - U8 *tmp = ut_input_v(in_len, dt, UT_INIT_RANDOM); - INT8 *quant = (INT8*)ut_input_v(in_len, DT_I8, UT_INIT_RANDOM); + //INT8 *quant = (INT8 *)ut_input_v(in_len, DT_I8, UT_INIT_ZERO); U32 count = 0; - std::vector scale_i(num); - - for (int i = 0; i < num; i++){ - input_ref[i] = (void *)(tmp + count * bytesOf(dt)); - input[i] = (void *)(quant + count); + for (int i = 0; i < num; i++) { + //input_ref[i] = (void *)(tmp + count * bytesOf(dt)); + inTensorsRef[i].alloc(); + U32 floatBytes = inTensorsRef[i].bytes(); + memcpy(get_ptr_from_tensor(inTensorsRef[i], UT_ARCH), tmp + count, floatBytes); + + inTensors[i].alloc(); + TensorDesc dummy; F16 scale = -1; - quantize_tensor(in_desc_ref[i], input_ref[i], &(in_desc[i]), input[i], &scale); - scale_i[i] = scale; - count += tensorNumElements(in_desc[i]); + quantize_tensor(inTensorsRef[i].get_desc(), tmp + count, &dummy, + get_ptr_from_tensor(inTensors[i], UT_ARCH), &scale); + inTensors[i].set_scale(scale); + count += floatBytes; } - INT8 *output = (INT8*)ut_input_v(out_len, DT_I8, UT_INIT_ZERO); + outTensor.alloc(); U8 *out_d = ut_input_v(out_len, dt, UT_INIT_ZERO); - F32 scale_o; + + Tensor tmpTensor; if (UT_CHECK) { - CHECK_STATUS(concat(in_desc, input, scale_i.data(), out_desc, output, &scale_o, axis, UT_ARCH)); + CHECK_STATUS(concat(inTensors, p, 
tmpTensor, outTensor, &archInfo)); + F32 scale_o = outTensor.get_scale(); + INT8 *output = (INT8 *)get_ptr_from_tensor(outTensor, UT_ARCH); for (U32 i = 0; i < out_len; i++) { switch (dt) { #ifdef _USE_FP16 case DT_F16: - ((F16*)out_d)[i] = output[i] / scale_o; + ((F16 *)out_d)[i] = output[i] / scale_o; break; #endif #ifdef _USE_FP32 case DT_F32: - ((F32*)out_d)[i] = output[i] / scale_o; + ((F32 *)out_d)[i] = output[i] / scale_o; break; #endif default: @@ -102,8 +116,8 @@ int int8ConcatTest(int argc, char** argv, DataType dt){ // benchmark double time_start = ut_time_ms(); - for(int iter = 0; iter < UT_LOOPS; iter++){ - CHECK_STATUS(concat(in_desc, input, scale_i.data(), out_desc, output, &scale_o, axis, UT_ARCH)); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(concat(inTensors, p, tmpTensor, outTensor, &archInfo)); } double time_end = ut_time_ms(); double time = (time_end - time_start) / UT_LOOPS; @@ -111,20 +125,19 @@ int int8ConcatTest(int argc, char** argv, DataType dt){ // log performance data char buffer[150]; char params[120]; - sprintf(params, "%d (*)/%u=(%u %u %u %u)", - num, axis, on, oc, oh, ow); + sprintf(params, "%d (*)/%u=(%u %u %u %u)", num, p.axis, on, oc, oh, ow); sprintf(buffer, "%20s, %80s", "Concat", params); double ops = 1.0 * out_len; ut_log(DT_I8, buffer, ops, time); free(tmp); - free(output); free(out_d); return 0; } #endif -int main(int argc, char** argv) { +int main(int argc, char **argv) +{ #ifdef _USE_INT8 int8ConcatTest(argc, argv, DT_F16); #endif diff --git a/compute/tensor/tests/test_concat_ocl.cpp b/compute/tensor/tests/test_concat_ocl.cpp new file mode 100644 index 00000000..ad3a75c5 --- /dev/null +++ b/compute/tensor/tests/test_concat_ocl.cpp @@ -0,0 +1,187 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
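In test_concat_int8 above, the INT8 result is dequantized as output[i] / scale_o, so a tensor's scale maps a float x to q = round(x * scale). A symmetric-quantization sketch consistent with that convention (quantizeRef is hypothetical, and choosing scale as 127 / max|x| is an assumption of this sketch, not taken from quantize_tensor):

    #include <math.h>

    // Symmetric INT8 quantization; dequantize with x ~= q / scale, matching
    // the output[i] / scale_o step in the test above.
    static float quantizeRef(const float *x, signed char *q, int len)
    {
        float maxabs = 1e-6f;
        for (int i = 0; i < len; i++) {
            maxabs = fmaxf(maxabs, fabsf(x[i]));
        }
        float scale = 127.0f / maxabs;  // assumed scale choice
        for (int i = 0; i < len; i++) {
            q[i] = (signed char)roundf(x[i] * scale);
        }
        return scale;  // stored on the tensor via set_scale() in the test
    }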
+ +#include "tensor_computing.h" +#include "ut_util.h" +#include "libkernelsource.h" +#include +#include "gcl.h" +#include + +#ifdef _USE_FP16 +inline GCLMem_t alloc(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_map(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->mapped_alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_bytes(Tensor tensor, U32 size) +{ + auto mem = (OclMemory *)tensor.get_memory(); + GCLMem_t ptr = NULL; + if (size > 0) { + mem->resize(tensor1d(DT_U8, size)); + mem->alloc(); + ptr = (GCLMem_t)mem->get_ptr(); + } + return ptr; +} + +int concatTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc > 2); + ConcatParamSpec p; + int num = atoi(argv[1]); + p.axis = atoi(argv[2]); + CHECK_REQUIREMENT(argc == 1 + 2 + (num + 1) * 4); + std::vector inputDesc(num); + std::vector inputTensorCpu; + std::vector inputTensor; + for (int i = 0; i < num; i++) { + U32 n, c, h, w; + n = atoi(argv[3 + i * 4]); + c = atoi(argv[3 + i * 4 + 1]); + h = atoi(argv[3 + i * 4 + 2]); + w = atoi(argv[3 + i * 4 + 3]); + inputDesc[i] = tensor4df(dt, DF_NCHW, n, c, h, w); + std::shared_ptr tensorCpu(new Tensor()); + std::shared_ptr tensor(new Tensor(OCLMem)); + tensorCpu->resize(inputDesc[i]); + tensor->resize(inputDesc[i]); + inputTensorCpu.push_back(*tensorCpu.get()); + inputTensor.push_back(*tensor.get()); + } + U32 on = atoi(argv[3 + num * 4]); + U32 oc = atoi(argv[3 + num * 4 + 1]); + U32 oh = atoi(argv[3 + num * 4 + 2]); + U32 ow = atoi(argv[3 + num * 4 + 3]); + + ArchInfo archInfo; + archInfo.arch = MALI; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + std::vector inputTensorCpuPtr; + std::vector inputTensorPtr; + for (int i = 0; i < num; i++) { + inputTensorCpuPtr.push_back(&inputTensorCpu[i]); + } + for (int i = 0; i < num; i++) { + inputTensorPtr.push_back(&inputTensor[i]); + } + + std::shared_ptr handleSharedPtr = OCLContext::getInstance().handle; + GCLHandle_t handle = handleSharedPtr.get(); + std::vector kernelVec; + handle->kernelVec = &kernelVec; + Tensor outputTensor = Tensor(OCLMem); + Tensor tmpTensor = Tensor(OCLMem); + + U32 in_len = 0; + for (int i = 0; i < num; i++) { + in_len += tensorNumElements(inputDesc[i]); + } + std::vector input_cpu(num); + U8 *tmp = ut_input_v(in_len, dt, UT_INIT_RANDOM); + U32 count = 0; + for (int i = 0; i < num; i++) { + input_cpu[i] = (void *)(tmp + count * bytesOf(dt)); + count += tensorNumElements(inputDesc[i]); + } + + MaliPara maliPara; + maliPara.handle = handle; + archInfo.archPara = &maliPara; + + CHECK_STATUS(concat_infer_output_size(inputTensorPtr, p, &outputTensor, &archInfo)); + TensorDesc outputDesc = outputTensor.get_desc(); + + U32 maxBytes = 0; + U32 tmpBytes = 0; + CHECK_STATUS(concat_infer_forward_tmp_bytes(inputTensor, &tmpBytes, &archInfo)); + maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes; + + GCLMem_t output = alloc_map(outputTensor); + for (int i = 0; i < num; i++) { + tmpBytes = tensorNumBytes(inputTensor[i].get_desc()); + maxBytes = (tmpBytes > maxBytes) ? 
tmpBytes : maxBytes; + } + GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes); + + for (int i = 0; i < num; i++) { + GCLMem_t input = alloc(inputTensor[i]); + CHECK_STATUS(gcl_fill_memory_zero(handle, input)); + CHECK_STATUS(ocl_set_input(handle, input, inputDesc[i], (U8 *)input_cpu[i], tmpbuf, true)); + } + + CHECK_STATUS(concat(inputTensor, p, tmpTensor, outputTensor, &archInfo)); + + /*warm up*/ + UNI_INFO_LOG("Warm up gpu:\n") + for (U32 i = 0; i < 2; i++) { + CHECK_STATUS(gcl_run_kernelVec(handle)); + } + + UNI_INFO_LOG("Run:\n") +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size())); + double time = handle->t_execute * 0.001; +#else + CHECK_STATUS(gcl_run_kernelVec(handle)); +#endif + CHECK_STATUS(ocl_get_output(handle, output, outputDesc, true)); + U8 *output_gpu_val = output->mapPtrArray.back(); + + char buffer[150]; + char params[120]; + sprintf(params, "%d (*)/%u=(%u %u %u %u)", num, p.axis, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "Concat", params); +#ifdef _DEBUG + double ops = 1.0 * on * oc * oh * ow; + ut_log(dt, buffer, ops, time); +#endif + for (int i = 0; i < num; i++) { + inputTensorCpu[i].alloc(); + memcpy(get_ptr_from_tensor(inputTensorCpu[i], UT_ARCH), input_cpu[i], + tensorNumBytes(inputDesc[i])); + } + + Tensor outputTensorCpu; + CHECK_STATUS(concat_infer_output_size(inputTensorCpuPtr, p, &outputTensorCpu, &archInfo_org)); + outputTensorCpu.alloc(); + + Tensor tmpTensorCpu; + CHECK_STATUS(concat(inputTensorCpu, p, tmpTensorCpu, outputTensorCpu, &archInfo_org)); + ut_check_a(output_gpu_val, get_ptr_from_tensor(outputTensorCpu, UT_ARCH), on * oc * ow * oh, dt); + + CHECK_STATUS(gcl_finish(handle)); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + free(tmp); + return 0; +} +#endif +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + concatTest(argc, argv, DT_F16); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_convolution.cpp b/compute/tensor/tests/test_convolution.cpp new file mode 100644 index 00000000..7cedf58d --- /dev/null +++ b/compute/tensor/tests/test_convolution.cpp @@ -0,0 +1,172 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
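The *_ocl tests above share one execution model: operator calls such as concat(...) only record kernels into handle->kernelVec, and nothing executes on the GPU until the vector is flushed. Condensed from the tests above into a sketch (runRecordedKernels is a hypothetical name; all other calls appear verbatim in this patch):

    #include "gcl.h"

    // Deferred-execution skeleton of the *_ocl tests: warm up, then run the
    // recorded kernels (with per-kernel timing in _DEBUG builds), then clean up.
    static int runRecordedKernels(GCLHandle_t handle)
    {
        for (U32 i = 0; i < 2; i++) {  // warm up
            CHECK_STATUS(gcl_run_kernelVec(handle));
        }
    #ifdef _DEBUG
        CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size()));
    #else
        CHECK_STATUS(gcl_run_kernelVec(handle));
    #endif
        CHECK_STATUS(gcl_finish(handle));           // drain the command queue
        CHECK_STATUS(gcl_clean_kernelVec(handle));  // drop the recorded kernels
        return 0;
    }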
+ +#include +#include "tensor_computing.h" +#include "ut_util.h" + +int convolutionTest(int argc, char *argv[], DataType dt) +{ + CHECK_REQUIREMENT(argc == 16); + // in data + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + // weight + U32 fn = atoi(argv[5]); + U32 fc = atoi(argv[6]); + U32 fh = atoi(argv[7]); + U32 fw = atoi(argv[8]); + U32 group = atoi(argv[9]); + // stride & padding + U32 stride = atoi(argv[10]); + U32 padding = atoi(argv[11]); + // output + U32 on = atoi(argv[12]); + U32 oc = atoi(argv[13]); + U32 oh = atoi(argv[14]); + U32 ow = atoi(argv[15]); + CHECK_REQUIREMENT(in == 1 && on == 1); + + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + ActivationParamSpec activationDesc; + activationDesc.mode = ACTIVATION_RELU; + activationDesc.value[0] = 0; + + TensorDesc inputDesc, outputDesc; + if (ic % 8 != 0) { + inputDesc = tensor4df(dt, DF_NCHW, in, ic, ih, iw); + } else { + inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); + } + TensorDesc filterDesc = tensor4df(dt, DF_NCHW, fn, fc, fh, fw); + TensorDesc biasDesc = tensor1d(dt, oc); + ConvolutionParamSpec p = createConvolutionParamSpec(group, fh, fw, stride, stride, padding, + padding, padding, padding, 1, 1, fn, Convolution_Depthwise_Pointwise); + + // setup input, filter, bias + U8 *input = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM); + U8 *filter = ut_input_v(fn * fc * fh * fw, dt, UT_INIT_RANDOM); + U8 *bias = ut_input_v(oc, dt, UT_INIT_RANDOM); + Tensor inputTensor; + Tensor inputTensorRef; + Tensor filterTensor; + Tensor filterTensorRef; + Tensor outputTensor; + Tensor outputTensorRef; + Tensor biasTensor; + + inputTensor.resize(inputDesc); + inputTensorRef.resize(inputDesc); + filterTensor.resize(filterDesc); + filterTensorRef.resize(filterDesc); + biasTensor.resize(biasDesc); + + inputTensor.alloc(); + inputTensorRef.alloc(); + filterTensor.alloc(); + filterTensorRef.alloc(); + biasTensor.alloc(); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, bytesOf(dt) * in * ic * ih * iw); + memcpy(get_ptr_from_tensor(inputTensorRef, UT_ARCH), input, bytesOf(dt) * in * ic * ih * iw); + memcpy(get_ptr_from_tensor(filterTensor, UT_ARCH), filter, bytesOf(dt) * fn * fc * fh * fw); + memcpy(get_ptr_from_tensor(filterTensorRef, UT_ARCH), filter, bytesOf(dt) * fn * fc * fh * fw); + memcpy(get_ptr_from_tensor(biasTensor, UT_ARCH), bias, bytesOf(dt) * oc); + + // setup output, bias + CHECK_STATUS( + convolution_infer_output_size(&inputTensor, filterTensor, p, &outputTensor, dt, &archInfo)); + outputDesc = outputTensor.get_desc(); + + outputTensor.alloc(); + outputTensorRef.resize(outputTensor.get_desc()); + outputTensorRef.alloc(); + + // setup alg + ConvolutionPolicy policy = CONVOLUTION_FASTEST; + ConvolutionForwardAlgorithm alg = CONVOLUTION_ALGORITHM_NULL; + CHECK_STATUS(convolution_infer_forward_algorithm( + inputTensor, filterTensor, outputTensor, p, policy, &alg, dt, activationDesc, &archInfo)); + + // setup tmp + U32 tmpBytes; + CHECK_STATUS(convolution_infer_forward_tmp_bytes( + inputTensor, filterTensor, outputTensor, p, alg, &tmpBytes, &archInfo)); + Tensor tmpTensor; + tmpTensor.resize(tensor1d(DT_U8, tmpBytes)); + tmpTensor.alloc(); + + // setup filter trans + U32 ftmBytes; + CHECK_STATUS(convolution_transform_filter_bytes(filterTensor, p, alg, &ftmBytes, &archInfo)); + // trans filter + Tensor ftmTensor; + ftmTensor.resize(tensor1d(DT_U8, ftmBytes)); + ftmTensor.alloc(); + CHECK_STATUS( + 
convolution_transform_filter(filterTensor, p, alg, tmpTensor, &ftmTensor, &archInfo)); + + if (UT_CHECK) { + CHECK_STATUS(convolution(inputTensor, ftmTensor, p, alg, nullptr, biasTensor, tmpTensor, + outputTensor, activationDesc, &archInfo)); + + // naive implement + CHECK_STATUS(convolution(inputTensorRef, filterTensorRef, p, alg, nullptr, biasTensor, + tmpTensor, outputTensorRef, activationDesc, &archInfo_org)); + + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), outputTensor.length(), dt, 5, __FILE__, + __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(convolution(inputTensor, ftmTensor, p, alg, nullptr, biasTensor, tmpTensor, + outputTensor, activationDesc, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + DataFormat df; + CHECK_STATUS(tensor4dGet(outputDesc, &dt, &df, &on, &oc, &oh, &ow)); + sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, fc, + fh, fw, group, stride, padding, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "Convolution", params); + double ops = (1.0 * on * oc * oh * ow) * (2.0 * ic / group * fh * fw + 1); + ut_log(dt, buffer, ops, time); + + free(input); + free(filter); + free(bias); + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + convolutionTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + convolutionTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_convolution_bnn.cpp b/compute/tensor/tests/test_convolution_bnn.cpp new file mode 100644 index 00000000..4ed4658f --- /dev/null +++ b/compute/tensor/tests/test_convolution_bnn.cpp @@ -0,0 +1,179 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
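test_convolution above takes the expected output shape on the command line rather than deriving it; the closed forms it has to satisfy are the standard convolution geometry, and the work estimate mirrors the ops expression in the test. As a sketch (convOutDim and convOps are hypothetical helpers):

    // Output spatial size for a convolution with symmetric padding;
    // applied to ih/fh for oh and to iw/fw for ow.
    static inline int convOutDim(int i, int f, int stride, int padding)
    {
        return (i + 2 * padding - f) / stride + 1;
    }

    // Work estimate matching the test: multiply-adds plus one bias add per output,
    // i.e. (on * oc * oh * ow) * (2 * ic / group * fh * fw + 1).
    static inline double convOps(int on, int oc, int oh, int ow, int ic, int group, int fh, int fw)
    {
        return 1.0 * on * oc * oh * ow * (2.0 * ic / group * fh * fw + 1);
    }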
+ +#include +#include "tensor_computing.h" +#include "ut_util.h" + +int bnnConvolutionTest(int argc, char *argv[], DataType dt) +{ + CHECK_REQUIREMENT(argc == 16); + // in data + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + // weight + U32 fn = atoi(argv[5]); + U32 fc = atoi(argv[6]); + U32 fh = atoi(argv[7]); + U32 fw = atoi(argv[8]); + U32 group = atoi(argv[9]); + // stride & padding + U32 stride = atoi(argv[10]); + U32 padding = atoi(argv[11]); + // output + U32 on = atoi(argv[12]); + U32 oc = atoi(argv[13]); + U32 oh = atoi(argv[14]); + U32 ow = atoi(argv[15]); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + CHECK_REQUIREMENT(in == 1 && on == 1); + + DataType fdt = DT_BIN11; // Use dt to distinguish DoReFa and XNOR + ActivationParamSpec activationDesc; + activationDesc.mode = ACTIVATION_NULL; + + TensorDesc inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); + TensorDesc filterDesc = tensor4df(fdt, DF_NCHW, oc, ic, fh, fw); + TensorDesc biasDesc = tensor1d(dt, oc * 2); // including scale and bias + ConvolutionParamSpec p = createConvolutionParamSpec(group, fh, fw, stride, stride, padding, + padding, padding, padding, 1, 1, oc, Convolution_Depthwise_Pointwise); + + // setup input, filter, bias + U8 *input = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM); + if (fdt == DT_BIN01) { + for (U32 i = 0; i < in * ic * ih * iw; i++) { + switch (dt) { +#ifdef _USE_FP16 + case DT_F16: + ((F16 *)input)[i] += 0.5; + break; +#endif +#ifdef _USE_FP32 + case DT_F32: + ((F32 *)input)[i] += 0.5; + break; +#endif + default: + break; + } + } + } + + BIN8 *filter = (BIN8 *)ut_input_v(fn * fc * fh * fw / 8, fdt, UT_INIT_POS); + U8 *bias = ut_input_v(oc * 2, dt, UT_INIT_RANDOM); + Tensor inputTensor; + Tensor inputTensorRef; + Tensor filterTensor; + Tensor filterTensorRef; + Tensor outputTensor; + Tensor outputTensorRef; + Tensor biasTensor; + + inputTensor.resize(inputDesc); + inputTensorRef.resize(inputDesc); + filterTensor.resize(filterDesc); + filterTensorRef.resize(filterDesc); + biasTensor.resize(biasDesc); + + inputTensor.alloc(); + inputTensorRef.alloc(); + filterTensor.alloc(); + filterTensorRef.alloc(); + biasTensor.alloc(); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, bytesOf(dt) * in * ic * ih * iw); + memcpy(get_ptr_from_tensor(inputTensorRef, UT_ARCH), input, bytesOf(dt) * in * ic * ih * iw); + memcpy(get_ptr_from_tensor(filterTensor, UT_ARCH), filter, tensorNumBytes(filterDesc)); + memcpy(get_ptr_from_tensor(filterTensorRef, UT_ARCH), filter, tensorNumBytes(filterDesc)); + memcpy(get_ptr_from_tensor(biasTensor, UT_ARCH), bias, tensorNumBytes(biasDesc)); + + // setup output, bias + CHECK_STATUS( + convolution_infer_output_size(&inputTensor, filterTensor, p, &outputTensor, dt, &archInfo)); + + outputTensor.alloc(); + outputTensorRef.resize(outputTensor.get_desc()); + outputTensorRef.alloc(); + + // setup alg + ConvolutionPolicy policy = CONVOLUTION_FASTEST; + ConvolutionForwardAlgorithm alg = CONVOLUTION_ALGORITHM_NULL; + CHECK_STATUS(convolution_infer_forward_algorithm( + inputTensor, filterTensor, outputTensor, p, policy, &alg, fdt, activationDesc, &archInfo)); + + // setup tmp + U32 tmpBytes; + CHECK_STATUS(convolution_infer_forward_tmp_bytes( + inputTensor, filterTensor, outputTensor, p, alg, &tmpBytes, &archInfo)); + Tensor tmpTensor; + tmpTensor.resize(tensor1d(DT_U8, tmpBytes)); + tmpTensor.alloc(); + // setup filter trans + U32 ftmBytes; + 
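+    // NOTE: two-step pattern used throughout these tests: first query the
+    // transformed-filter size, then allocate a plain U8 tensor of that size
+    // for convolution_transform_filter to fill with the packed layout.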
CHECK_STATUS(convolution_transform_filter_bytes(filterTensor, p, alg, &ftmBytes, &archInfo)); + // trans filter + Tensor ftmTensor; + ftmTensor.resize(tensor1d(DT_U8, ftmBytes)); + ftmTensor.alloc(); + + CHECK_STATUS( + convolution_transform_filter(filterTensor, p, alg, tmpTensor, &ftmTensor, &archInfo)); + + if (UT_CHECK) { + CHECK_STATUS(convolution(inputTensor, ftmTensor, p, alg, nullptr, biasTensor, tmpTensor, + outputTensor, activationDesc, &archInfo)); + // naive implement + CHECK_STATUS(convolution(inputTensorRef, filterTensorRef, p, alg, nullptr, biasTensor, + tmpTensor, outputTensorRef, activationDesc, &archInfo_org)); + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), outputTensor.length(), dt, 1, __FILE__, + __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(convolution(inputTensor, ftmTensor, p, alg, nullptr, biasTensor, tmpTensor, + outputTensor, activationDesc, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, fc, fh, + fw, stride, padding, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "BNN Convolution", params); + double ops = (1.0 * on * oc * oh * ow) * (2.0 * ic * fh * fw + 1); + ut_log(DT_I8, buffer, ops, time); + + free(input); + free(filter); + free(bias); + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + bnnConvolutionTest(argc, argv, DT_F16); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_convolution_int8.cpp b/compute/tensor/tests/test_convolution_int8.cpp new file mode 100644 index 00000000..45ef38ce --- /dev/null +++ b/compute/tensor/tests/test_convolution_int8.cpp @@ -0,0 +1,249 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
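+// NOTE: the INT8 test below quantizes activations and weights with per-tensor
+// scales (scales[0] input, scales[1] output, scales[2..] filter) and
+// dequantizes the raw int8 output as out_d[i] = output[i] / scales[1] before
+// comparing against the FP reference, at a looser threshold (8) than the FP
+// test (5). Roughly, assuming symmetric quantization:
+//     xq = round(x * scale),  x ~= xq / scale
+// The exact rounding rule inside quantize_tensor is an assumption here.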
+ +#include +#include "tensor_computing.h" +#include "ut_util.h" + +#ifdef _USE_INT8 +int int8ConvolutionTest(int argc, char *argv[], DataType dt, DataType filterDataType) +{ + CHECK_REQUIREMENT(argc == 16); + // in data + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + // weight + U32 fn = atoi(argv[5]); + U32 fc = atoi(argv[6]); + U32 fh = atoi(argv[7]); + U32 fw = atoi(argv[8]); + U32 group = atoi(argv[9]); + // stride & padding + U32 stride = atoi(argv[10]); + U32 padding = atoi(argv[11]); + // output + U32 on = atoi(argv[12]); + U32 oc = atoi(argv[13]); + U32 oh = atoi(argv[14]); + U32 ow = atoi(argv[15]); + CHECK_REQUIREMENT(in == 1 && on == 1); + + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + ActivationParamSpec activationDesc; + activationDesc.mode = ACTIVATION_RELU; + activationDesc.value[0] = 0; + + TensorDesc inputDesc, filterDesc, outputDesc, biasDesc; + ConvolutionParamSpec p = createConvolutionParamSpec(group, fh, fw, stride, stride, padding, + padding, padding, padding, 1, 1, fn, Convolution_Depthwise_Pointwise); + + if (ic % 8 != 0) { + printf("[WARN] can not quantize the first layer\n"); + return 0; + } else { + DataType qdt = DT_I8; + TensorDesc inputDesc_ref = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); + filterDesc = tensor4df(dt, DF_NCHW, oc, ic, fh, fw); + biasDesc = tensor1d(dt, oc); + + // setup input, filter, bias + U8 *input_ref = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM); + U8 *filter = ut_input_v(fn * fc * fh * fw, dt, UT_INIT_RANDOM); + U8 *bias = ut_input_v(oc, dt, UT_INIT_RANDOM); + + INT8 *input = (INT8 *)ut_input_v(in * ic * ih * iw, DT_I8, UT_INIT_ZERO); + F16 scale_i = -1; + quantize_tensor(inputDesc_ref, input_ref, &inputDesc, input, &scale_i); + + Tensor inputTensor; + inputTensor.resize(inputDesc); + inputTensor.alloc(); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inputDesc)); + + Tensor inputTensorRef; + inputTensorRef.resize(inputDesc_ref); + inputTensorRef.alloc(); + memcpy( + get_ptr_from_tensor(inputTensorRef, UT_ARCH), input_ref, tensorNumBytes(inputDesc_ref)); + + Tensor filterTensor; + filterTensor.resize(filterDesc); + filterTensor.alloc(); + memcpy(get_ptr_from_tensor(filterTensor, UT_ARCH), filter, tensorNumBytes(filterDesc)); + + Tensor filterTensorRef; + filterTensorRef.resize(filterDesc); + filterTensorRef.alloc(); + memcpy(get_ptr_from_tensor(filterTensorRef, UT_ARCH), filter, tensorNumBytes(filterDesc)); + + Tensor biasTensor; + biasTensor.resize(biasDesc); + biasTensor.alloc(); + memcpy(get_ptr_from_tensor(biasTensor, UT_ARCH), bias, tensorNumBytes(biasDesc)); + + Tensor outputTensor, outputTensorRef; + + // setup output, bias + CHECK_STATUS(convolution_infer_output_size( + &inputTensor, filterTensor, p, &outputTensor, qdt, &archInfo)); + outputTensor.alloc(); + + outputDesc = outputTensor.get_desc(); + TensorDesc outputDesc_ref = outputTensor.get_desc(); + outputDesc_ref.dt = dt; + outputTensorRef.resize(outputDesc_ref); + outputTensorRef.alloc(); + + // setup alg + ConvolutionPolicy policy = CONVOLUTION_FASTEST; + ConvolutionForwardAlgorithm alg = CONVOLUTION_ALGORITHM_NULL; + CHECK_STATUS(convolution_infer_forward_algorithm(inputTensor, filterTensor, outputTensor, p, + policy, &alg, qdt, activationDesc, &archInfo)); + + F16 *scales; + + // setup filter trans + U32 ftBytes; + Tensor ftmTensor, tmpTensor; + + switch (alg) { + case CONVOLUTION_ALGORITHM_WINOGRAD: { + CHECK_STATUS( + 
convolution_transform_filter_bytes(filterTensor, p, alg, &ftBytes, &archInfo)); + + Tensor tFilter; + tFilter.resize(tensor1d(DT_U8, ftBytes)); + tFilter.alloc(); + + filterDesc.dt = filterDataType; // To label as int8 + filterTensor.resize(filterDesc); + CHECK_STATUS(convolution_transform_filter( + filterTensor, p, alg, tmpTensor, &tFilter, &archInfo)); + + TensorDesc ftmDesc = tFilter.get_desc(); + ftmDesc.dt = DT_I8; + ftmTensor.resize(ftmDesc); + ftmTensor.alloc(); + + scales = (F16 *)ut_input_v( + 38, DT_F16, UT_INIT_ZERO); // 1 for input, 1 for output and 36 for filter + CHECK_STATUS( + quantize_tensor(tFilter.get_desc(), get_ptr_from_tensor(tFilter, UT_ARCH), + &ftmDesc, get_ptr_from_tensor(ftmTensor, UT_ARCH), scales + 2)); + break; + } + default: { + Tensor qFilter; + TensorDesc qDesc = filterDesc; + qDesc.dt = DT_I8; + qFilter.resize(qDesc); + qFilter.alloc(); + scales = (F16 *)ut_input_v(3, DT_F16, UT_INIT_ZERO); + CHECK_STATUS(quantize_tensor( + filterDesc, filter, &qDesc, get_ptr_from_tensor(qFilter, UT_ARCH), scales + 2)); + + CHECK_STATUS( + convolution_transform_filter_bytes(qFilter, p, alg, &ftBytes, &archInfo)); + + ftmTensor.resize(tensor1d(DT_U8, ftBytes)); + ftmTensor.alloc(); + // trans filter + CHECK_STATUS( + convolution_transform_filter(qFilter, p, alg, tmpTensor, &ftmTensor, &archInfo)); + break; + } + } + + scales[0] = scale_i; + + // setup tmp + U32 tmpBytes; + CHECK_STATUS(convolution_infer_forward_tmp_bytes( + inputTensor, ftmTensor, outputTensor, p, alg, &tmpBytes, &archInfo)); + tmpTensor.resize(tensor1d(DT_U8, tmpBytes)); + tmpTensor.alloc(); + + if (UT_CHECK) { + CHECK_STATUS(convolution(inputTensor, ftmTensor, p, alg, scales, biasTensor, tmpTensor, + outputTensor, activationDesc, &archInfo)); + + // naive implement + CHECK_STATUS(convolution(inputTensorRef, filterTensorRef, p, alg, nullptr, biasTensor, + tmpTensor, outputTensorRef, activationDesc, &archInfo_org)); + + U32 output_size = outputTensor.length(); + U8 *out_d = ut_input_v(output_size, dt, UT_INIT_ZERO); + INT8 *output = (INT8 *)get_ptr_from_tensor(outputTensor, UT_ARCH); + for (U32 i = 0; i < output_size; i++) { + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: + ((F32 *)out_d)[i] = output[i] / scales[1]; + break; +#endif +#ifdef _USE_FP16 + case DT_F16: + ((F16 *)out_d)[i] = output[i] / scales[1]; + break; +#endif + default: + break; + } + } + ut_check_v(out_d, get_ptr_from_tensor(outputTensorRef, UT_ARCH), output_size, dt, 8, + __FILE__, __LINE__); + free(out_d); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(convolution(inputTensor, ftmTensor, p, alg, scales, biasTensor, tmpTensor, + outputTensor, activationDesc, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + DataFormat df; + CHECK_STATUS(tensor4dGet(outputDesc, &dt, &df, &on, &oc, &oh, &ow)); + sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, + fc, fh, fw, group, stride, padding, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "Convolution", params); + double ops = (1.0 * on * oc * oh * ow) * (2.0 * ic * fh * fw / group + 1); + ut_log(DT_I8, buffer, ops, time); + + free(input); + free(filter); + free(bias); + free(input_ref); + free(scales); + } + return 0; +} +#endif + +int main(int argc, char **argv) +{ +#ifdef _USE_INT8 + int8ConvolutionTest(argc, argv, DT_F16, DT_F16_8Q); +#endif + return 0; +} diff 
--git a/compute/tensor/tests/test_convolution_ocl.cpp b/compute/tensor/tests/test_convolution_ocl.cpp new file mode 100644 index 00000000..03b9a87a --- /dev/null +++ b/compute/tensor/tests/test_convolution_ocl.cpp @@ -0,0 +1,296 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include "tensor_computing.h" +#include "ut_util.h" +#include "gcl.h" +#include "libkernelsource.h" +inline GCLMem_t alloc(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_map(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->mapped_alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_bytes(Tensor tensor, U32 size) +{ + auto mem = (OclMemory *)tensor.get_memory(); + GCLMem_t ptr = NULL; + if (size > 0) { + mem->resize(tensor1d(DT_U8, size)); + mem->alloc(); + ptr = (GCLMem_t)mem->get_ptr(); + } + return ptr; +} + +inline GCLMem_t alloc_desc(Tensor tensor, GCLMemDesc desc) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->padding(desc); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +int convolutionTest(int argc, char *argv[], DataType dt) +{ + U32 biasNum; + ArchInfo archInfo; + archInfo.arch = MALI; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + U32 in = 1; + U32 ic = 4; + U32 ih = 4; + U32 iw = 4; + U32 fn = 4; + U32 fh = 3; + U32 fw = 3; + U32 group = 1; + U32 strideW = 1; + U32 strideH = 1; + U32 paddingT = 1; + U32 paddingB = 1; + U32 paddingL = 1; + U32 paddingR = 1; + if (argc == 9) { + ic = atoi(argv[1]); + ih = atoi(argv[2]); + iw = atoi(argv[3]); + fn = atoi(argv[4]); + fh = atoi(argv[5]); + fw = atoi(argv[6]); + strideH = atoi(argv[7]); + strideW = atoi(argv[7]); + paddingT = atoi(argv[8]); + paddingB = atoi(argv[8]); + paddingL = atoi(argv[8]); + paddingR = atoi(argv[8]); + } + if (argc == 13) { + ic = atoi(argv[1]); + ih = atoi(argv[2]); + iw = atoi(argv[3]); + fn = atoi(argv[4]); + fh = atoi(argv[5]); + fw = atoi(argv[6]); + strideH = atoi(argv[7]); + strideW = atoi(argv[8]); + paddingT = atoi(argv[9]); + paddingB = atoi(argv[10]); + paddingL = atoi(argv[11]); + paddingR = atoi(argv[12]); + } + U32 fc = ic; + U32 on = 1; + U32 oc = fn; + U32 oh = (ih + paddingT + paddingB - fh) / strideH + 1; + U32 ow = (iw + paddingL + paddingR - fw) / strideW + 1; + ActivationParamSpec activationDesc; + activationDesc.mode = ACTIVATION_NULL; + ConvolutionParamSpec 
convParamSpec = createConvolutionParamSpec(group, fh, fw, strideH, strideW,
+        paddingT, paddingB, paddingL, paddingR, 1, 1, fn, Convolution_Depthwise_Pointwise);
+
+    TensorDesc inputDesc = tensor4df(dt, DF_NCHW, in, ic, ih, iw);
+    TensorDesc filterDesc = tensor4df(dt, DF_NCHW, fn, fc, fh, fw);
+    TensorDesc biasDesc = tensor1d(dt, oc);
+    U8 *input_cpu = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM);
+    U8 *filter_cpu = ut_input_v(fn * fc * fh * fw, dt, UT_INIT_RANDOM);
+    U8 *bias_cpu = ut_input_v(oc, dt, UT_INIT_RANDOM);
+    TensorDesc inputDesc_gpu = tensor4df(dt, DF_NCHW, in, ic, ih, iw);
+
+    std::shared_ptr<GCLHandle> handleSharedPtr = OCLContext::getInstance().handle;
+    GCLHandle_t handle = handleSharedPtr.get();
+    std::vector<GCLKernelInfo> kernelVec;
+    handle->kernelVec = &kernelVec;
+    Tensor inputTensor = Tensor(OCLMem);
+    Tensor outputTensor = Tensor(OCLMem);
+    Tensor filterTensorOrg = Tensor(OCLMem);
+    Tensor filterTensor = Tensor(OCLMem);
+    Tensor biasTensor = Tensor(OCLMem);
+    Tensor tmpTensor = Tensor(OCLMem);
+    inputTensor.resize(inputDesc_gpu);
+    filterTensor.resize(filterDesc);
+    filterTensorOrg.resize(filterDesc);
+    biasTensor.resize(biasDesc);
+    U32 str[3] = {1, 1, 1};
+    U32 off[3] = {0, 0, 0};
+    GCLMemDesc inputMemDesc = gcl_mem_desc(str, off, DT_U8, DF_NCWHC4);
+    ocl_set_desc(&inputTensor, inputMemDesc);
+
+    MaliPara maliPara;
+    ForwardRunInfoMali runInfo;
+    runInfo.algorithm = (I32)(CONVOLUTION_ALGORITHM_NULL);
+    maliPara.handle = handle;
+    maliPara.forwardRunInfo = &runInfo;
+    archInfo.archPara = &maliPara;
+
+    CHECK_STATUS(convolution_infer_output_size(
+        &inputTensor, filterTensor, convParamSpec, &outputTensor, dt, &archInfo));
+
+    ConvolutionPolicy policy = CONVOLUTION_TUNNING;
+    ConvolutionForwardAlgorithm alg = CONVOLUTION_ALGORITHM_NULL;
+    CHECK_STATUS(convolution_infer_forward_algorithm(inputTensor, filterTensor, outputTensor,
+        convParamSpec, policy, &alg, dt, activationDesc, &archInfo));
+
+    U32 maxBytes = 0;
+    U32 tmpBytes;
+    CHECK_STATUS(convolution_infer_forward_tmp_bytes(
+        inputTensor, filterTensor, outputTensor, convParamSpec, alg, &tmpBytes, &archInfo));
+    maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes;
+
+    GCLMemDesc filterMemDesc = gcl_mem_desc(str, off, DT_U8, DF_NCWHC4);
+    maliPara.gclmemFilterDesc = &filterMemDesc;
+    U32 ftmBytes;
+    CHECK_STATUS(
+        convolution_transform_filter_bytes(filterTensor, convParamSpec, alg, &ftmBytes, &archInfo));
+
+    GCLMem_t output = alloc_map(outputTensor);
+    GCLMem_t input = alloc(inputTensor);
+    CHECK_STATUS(gcl_fill_memory_zero(handle, input));
+
+    GCLMemDesc desc = gclmem_build_desc();
+    if ((fh == 1 && fw == 1 && ih == 1 && iw == 1) || fn == 1) {
+        biasNum = oc;
+        desc.memType = GCL_MEM_BUF;
+        desc.byteSize = biasNum * bytesOf(dt);
+    } else {
+        biasNum = (oc + 3) / 4;
+        desc.memType = GCL_MEM_IMG_1D;
+        desc.byteSize = biasNum * 4 * bytesOf(dt);
+    }
+    desc.stride[0] = biasNum;
+    desc.stride[1] = 1;
+    desc.stride[2] = 1;
+    desc.offset[0] = 0;
+    desc.offset[1] = 0;
+    desc.offset[2] = 0;
+    desc.num = biasNum;
+    desc.memFormat = DF_NHWC;
+    desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
+    U8 *bias_cpu_align = NULL;
+    if ((oc & 3) != 0) {
+        // pad the bias to a multiple of 4 channels; copy only the oc valid
+        // entries and leave the zero-initialized tail as padding
+        bias_cpu_align = ut_input_v((oc + 3) / 4 * 4, dt, UT_INIT_ZERO);
+        memcpy(bias_cpu_align, bias_cpu, oc * bytesOf(dt));
+        desc.host_ptr = bias_cpu_align;
+    } else {
+        desc.host_ptr = bias_cpu;
+    }
+    alloc_desc(biasTensor, desc);
+
+    desc = filterMemDesc;
+    alloc_desc(filterTensor, desc);
+    desc.stride[0] = fw * fh;
+    desc.stride[1] = fc;
+    desc.stride[2] = fn;
+    desc.offset[0] = 0;
+    desc.offset[1] = 0;
+    desc.offset[2] = 0;
+    desc.byteSize = fw * fh * fc * fn * bytesOf(dt);
+    desc.num = fw * fh * fc * fn;
+    desc.memType = GCL_MEM_BUF;
+    desc.memFormat = DF_NCHW;
+    desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
+    desc.host_ptr = filter_cpu;
+    alloc_desc(filterTensorOrg, desc);
+
+    tmpBytes = tensorNumBytes(inputDesc_gpu);
+    maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes;
+    GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes);
+
+    CHECK_STATUS(convolution_transform_filter(
+        filterTensorOrg, convParamSpec, alg, tmpTensor, &filterTensor, &archInfo));
+
+    CHECK_STATUS(ocl_set_input(handle, input, inputDesc_gpu, input_cpu, tmpbuf, true));
+
+    CHECK_STATUS(convolution(inputTensor, filterTensor, convParamSpec, alg, nullptr, biasTensor,
+        tmpTensor, outputTensor, activationDesc, &archInfo));
+    /*warm up*/
+    UNI_INFO_LOG("Warm up gpu:\n")
+    for (U32 i = 0; i < 2; i++) {
+        CHECK_STATUS(gcl_run_kernelVec(handle));
+    }
+
+#ifdef _DEBUG
+    CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size()));
+    double time = handle->t_execute * 0.001;
+#else
+    CHECK_STATUS(gcl_run_kernelVec(handle));
+#endif
+    TensorDesc outputDesc = outputTensor.get_desc();
+    CHECK_STATUS(ocl_get_output(handle, output, outputDesc, true));
+    void *output_gpu = output->mapPtrArray.back();
+
+    char buffer[150];
+    char params[120];
+    sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u %u %u %u %u %u)=(%u %u %u %u)", in, ic, ih,
+        iw, fn, fc, fh, fw, group, strideH, strideW, paddingT, paddingB, paddingL, paddingR, on, oc,
+        oh, ow);
+    sprintf(buffer, "%20s, %80s", "Convolution", params);
+#ifdef _DEBUG
+    double ops = (1.0 * on * oc * oh * ow) * (2.0 * ic * fh * fw / group + 1);
+    ut_log(dt, buffer, ops, time);
+#endif
+    Tensor inputTensorCpu;
+    inputTensorCpu.resize(inputDesc);
+    inputTensorCpu.alloc();
+    memcpy(get_ptr_from_tensor(inputTensorCpu, UT_ARCH), input_cpu, tensorNumBytes(inputDesc));
+
+    Tensor filterTensorCpu;
+    filterTensorCpu.resize(filterDesc);
+    filterTensorCpu.alloc();
+    memcpy(get_ptr_from_tensor(filterTensorCpu, UT_ARCH), filter_cpu, tensorNumBytes(filterDesc));
+
+    Tensor biasTensorCpu;
+    biasTensorCpu.resize(biasDesc);
+    biasTensorCpu.alloc();
+    memcpy(get_ptr_from_tensor(biasTensorCpu, UT_ARCH), bias_cpu, tensorNumBytes(biasDesc));
+
+    Tensor outputTensorCpu;
+    outputTensorCpu.resize(outputDesc);
+    outputTensorCpu.alloc();
+
+    Tensor tmpTensorCpu;
+    CHECK_STATUS(
+        convolution(inputTensorCpu, filterTensorCpu, convParamSpec, CONVOLUTION_ALGORITHM_GEMM,
+            nullptr, biasTensorCpu, tmpTensorCpu, outputTensorCpu, activationDesc, &archInfo_org));
+    ut_check_a(output_gpu, get_ptr_from_tensor(outputTensorCpu, UT_ARCH), on * oc * ow * oh, dt);
+
+    CHECK_STATUS(gcl_finish(handle));
+    CHECK_STATUS(gcl_clean_kernelVec(handle));
+    free(input_cpu);
+    free(filter_cpu);
+    free(bias_cpu);
+    if (bias_cpu_align) {
+        free(bias_cpu_align);
+    }
+    return 0;
+}
+
+int main(int argc, char **argv)
+{
+#ifdef _USE_FP16
+    convolutionTest(argc, argv, DT_F16);
+#endif
+    return 0;
+}
diff --git a/compute/tensor/tests/test_deconvolution.cpp b/compute/tensor/tests/test_deconvolution.cpp
new file mode 100644
index 00000000..cd1b93aa
--- /dev/null
+++ b/compute/tensor/tests/test_deconvolution.cpp
@@ -0,0 +1,168 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include "tensor_computing.h" +#include "ut_util.h" + +int deconvolutionTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 16); + // in data + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + // weight + U32 fn = atoi(argv[5]); + U32 fc = atoi(argv[6]); + U32 fh = atoi(argv[7]); + U32 fw = atoi(argv[8]); + U32 group = atoi(argv[9]); + // stride & padding + U32 stride = atoi(argv[10]); + U32 padding = atoi(argv[11]); + // output + U32 on = atoi(argv[12]); + U32 oc = atoi(argv[13]); + U32 oh = atoi(argv[14]); + U32 ow = atoi(argv[15]); + CHECK_REQUIREMENT(in == 1 && on == 1); + CHECK_REQUIREMENT(ic % 8 == 0 && oc % 8 == 0); + + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + ActivationParamSpec activationDesc; + activationDesc.mode = ACTIVATION_NULL; + ConvolutionParamSpec convParamSpec = createConvolutionParamSpec(group, fh, fw, stride, stride, + padding, padding, padding, padding, 1, 1, fn, Convolution_Deconvolution); + + TensorDesc outputDesc; + TensorDesc inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); + TensorDesc filterDesc = tensor4df(dt, DF_NCHW, fn, fc, fh, fw); + TensorDesc biasDesc = tensor1d(dt, oc); + + // setup input, filter, bias + U8 *input = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM); + U8 *filter = ut_input_v(fn * fc * fh * fw, dt, UT_INIT_RANDOM); + U8 *bias = ut_input_v(oc, dt, UT_INIT_RANDOM); + + Tensor inputTensor; + Tensor inputTensorRef; + Tensor filterTensor; + Tensor filterTensorRef; + Tensor outputTensor; + Tensor outputTensorRef; + Tensor biasTensor; + + inputTensor.resize(inputDesc); + inputTensorRef.resize(inputDesc); + filterTensor.resize(filterDesc); + filterTensorRef.resize(filterDesc); + biasTensor.resize(biasDesc); + + inputTensor.alloc(); + inputTensorRef.alloc(); + filterTensor.alloc(); + filterTensorRef.alloc(); + biasTensor.alloc(); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, bytesOf(dt) * in * ic * ih * iw); + memcpy(get_ptr_from_tensor(inputTensorRef, UT_ARCH), input, bytesOf(dt) * in * ic * ih * iw); + memcpy(get_ptr_from_tensor(filterTensor, UT_ARCH), filter, tensorNumBytes(filterDesc)); + memcpy(get_ptr_from_tensor(filterTensorRef, UT_ARCH), filter, tensorNumBytes(filterDesc)); + memcpy(get_ptr_from_tensor(biasTensor, UT_ARCH), bias, tensorNumBytes(biasDesc)); + + // setup output, bias + CHECK_STATUS(deconvolution_infer_output_size( + &inputTensor, filterTensor, convParamSpec, &outputTensor, dt, &archInfo)); + U32 output_size = outputTensor.length(); + + outputTensor.alloc(); + outputTensorRef.resize(outputTensor.get_desc()); + outputTensorRef.alloc(); + + // setup alg + ConvolutionPolicy policy = CONVOLUTION_FASTEST; + ConvolutionForwardAlgorithm alg = CONVOLUTION_ALGORITHM_NULL; + CHECK_STATUS(deconvolution_infer_forward_algorithm(inputTensor, filterTensor, outputTensor, + convParamSpec, policy, &alg, dt, activationDesc, &archInfo)); + + // setup tmp + U32 tmpBytes; + 
CHECK_STATUS(deconvolution_infer_forward_tmp_bytes( + inputTensor, filterTensor, outputTensor, convParamSpec, alg, &tmpBytes, &archInfo)); + Tensor tmpTensor; + tmpTensor.resize(tensor1d(DT_U8, tmpBytes)); + tmpTensor.alloc(); + + // setup filter trans + U32 ftmBytes; + CHECK_STATUS(deconvolution_transform_filter_bytes( + filterTensor, convParamSpec, alg, &ftmBytes, &archInfo)); + Tensor ftmTensor; + ftmTensor.resize(tensor1d(DT_U8, ftmBytes)); + ftmTensor.alloc(); + + // trans filter + CHECK_STATUS(deconvolution_transform_filter( + filterTensor, convParamSpec, alg, tmpTensor, &ftmTensor, &archInfo)); + + if (UT_CHECK) { + CHECK_STATUS(deconvolution(inputTensor, ftmTensor, convParamSpec, alg, nullptr, biasTensor, + tmpTensor, outputTensor, activationDesc, &archInfo)); + + // naive implement + CHECK_STATUS(deconvolution(inputTensorRef, filterTensorRef, convParamSpec, alg, nullptr, + biasTensor, tmpTensor, outputTensorRef, activationDesc, &archInfo_org)); + + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), output_size, dt, 1, __FILE__, __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(deconvolution(inputTensor, ftmTensor, convParamSpec, alg, nullptr, biasTensor, + tmpTensor, outputTensor, activationDesc, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, fc, fh, + fw, stride, padding, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "Deconvolution", params); + double ops = (1.0 * on * oc * ih * iw) * (2.0 * ic * fh * fw + fh * fw); + ut_log(dt, buffer, ops, time); + + free(input); + free(filter); + free(bias); + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + deconvolutionTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + deconvolutionTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_deconvolution_ocl.cpp b/compute/tensor/tests/test_deconvolution_ocl.cpp new file mode 100644 index 00000000..dd1d38c6 --- /dev/null +++ b/compute/tensor/tests/test_deconvolution_ocl.cpp @@ -0,0 +1,268 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
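+// NOTE: the GPU test below mirrors the CPU one but routes through MALI: the
+// CONVOLUTION_TUNNING policy benchmarks candidate kernels to pick an
+// algorithm, the output is map-allocated so ocl_get_output can read it back
+// without an extra copy, and ut_check_a judges it against a CPU_GENERAL run
+// that uses CONVOLUTION_ALGORITHM_GEMM.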
+ +#include +#include "tensor_computing.h" +#include "ut_util.h" +#include "gcl.h" +#include "libkernelsource.h" +#include +inline GCLMem_t alloc(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_map(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->mapped_alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_bytes(Tensor tensor, U32 size) +{ + auto mem = (OclMemory *)tensor.get_memory(); + GCLMem_t ptr = NULL; + if (size > 0) { + mem->resize(tensor1d(DT_U8, size)); + mem->alloc(); + ptr = (GCLMem_t)mem->get_ptr(); + } + return ptr; +} + +inline GCLMem_t alloc_desc(Tensor tensor, GCLMemDesc desc) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->padding(desc); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +int deconvolutionTest(int argc, char *argv[], DataType dt) +{ + U32 biasNum; + ArchInfo archInfo; + archInfo.arch = MALI; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + U32 in = 1; + U32 ic = 4; + U32 ih = 2; + U32 iw = 2; + U32 fn = 4; + U32 fh = 2; + U32 fw = 2; + U32 fc = 4; + U32 stride = 2; + U32 padding = 0; + U32 group = 1; + if (argc == 9) { + ic = atoi(argv[1]); + ih = atoi(argv[2]); + iw = atoi(argv[3]); + fc = atoi(argv[4]); + fh = atoi(argv[5]); + fw = atoi(argv[6]); + stride = atoi(argv[7]); + padding = atoi(argv[8]); + fn = ic; + } + U32 on = 1; + U32 oc = fc; + U32 oh = fh + stride * (ih - 1) - padding - padding; + U32 ow = fw + stride * (iw - 1) - padding - padding; + + ActivationParamSpec activationDesc; + activationDesc.mode = ACTIVATION_NULL; + ConvolutionParamSpec convParamSpec = createConvolutionParamSpec(group, fh, fw, stride, stride, + padding, padding, padding, padding, 1, 1, fn, Convolution_Deconvolution); + + TensorDesc inputDesc = tensor4df(dt, DF_NCHW, in, ic, ih, iw); + TensorDesc filterDesc = tensor4df(dt, DF_NCHW, fn, fc, fh, fw); + TensorDesc biasDesc = tensor1d(dt, oc); + U8 *input_cpu = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM); + U8 *filter_cpu = ut_input_v(fn * fc * fh * fw, dt, UT_INIT_RANDOM); + U8 *bias_cpu = ut_input_v(oc, dt, UT_INIT_RANDOM); + U8 *output_gpu = NULL; + TensorDesc inputDesc_gpu = tensor4df(dt, DF_NCHW, in, ic, ih, iw); + + std::shared_ptr handleSharedPtr = OCLContext::getInstance().handle; + ; + GCLHandle_t handle = handleSharedPtr.get(); + std::vector kernelVec; + handle->kernelVec = &kernelVec; + Tensor inputTensor = Tensor(OCLMem); + Tensor outputTensor = Tensor(OCLMem); + Tensor filterTensorOrg = Tensor(OCLMem); + Tensor filterTensor = Tensor(OCLMem); + Tensor biasTensor = Tensor(OCLMem); + Tensor tmpTensor = Tensor(OCLMem); + inputTensor.resize(inputDesc_gpu); + filterTensor.resize(filterDesc); + filterTensorOrg.resize(filterDesc); + biasTensor.resize(biasDesc); + + MaliPara maliPara; + ForwardRunInfoMali runInfo; + runInfo.algorithm = (I32)(CONVOLUTION_ALGORITHM_NULL); + maliPara.handle = handle; + maliPara.forwardRunInfo = &runInfo; + archInfo.archPara = &maliPara; + + CHECK_STATUS(deconvolution_infer_output_size( + &inputTensor, filterTensor, convParamSpec, &outputTensor, dt, &archInfo)); + + ConvolutionPolicy policy = CONVOLUTION_TUNNING; + ConvolutionForwardAlgorithm alg = CONVOLUTION_ALGORITHM_NULL; + CHECK_STATUS(deconvolution_infer_forward_algorithm(inputTensor, filterTensor, outputTensor, + convParamSpec, policy, &alg, dt, activationDesc, &archInfo)); + + U32 maxBytes = 0; + U32 tmpBytes; + CHECK_STATUS(deconvolution_infer_forward_tmp_bytes( + 
inputTensor, filterTensor, outputTensor, convParamSpec, alg, &tmpBytes, &archInfo));
+    maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes;
+
+    U32 ftmBytes;
+    U32 str[3] = {0, 0, 0};
+    U32 off[3] = {0, 0, 0};
+    GCLMemDesc filterMemDesc = gcl_mem_desc(str, off, DT_U8, DF_NCWHC4);
+    maliPara.gclmemFilterDesc = &filterMemDesc;
+    CHECK_STATUS(deconvolution_transform_filter_bytes(
+        filterTensor, convParamSpec, alg, &ftmBytes, &archInfo));
+
+    GCLMem_t output = alloc_map(outputTensor);
+    GCLMem_t input = alloc(inputTensor);
+    CHECK_STATUS(gcl_fill_memory_zero(handle, input));
+
+    GCLMemDesc desc = gclmem_build_desc();
+    biasNum = (oc + 3) / 4;
+    desc.memType = GCL_MEM_IMG_1D;
+    desc.byteSize = biasNum * 4 * bytesOf(dt);
+    desc.stride[0] = biasNum;
+    desc.stride[1] = 1;
+    desc.stride[2] = 1;
+    desc.offset[0] = 0;
+    desc.offset[1] = 0;
+    desc.offset[2] = 0;
+    desc.num = biasNum;
+    desc.memFormat = DF_NHWC;
+    desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
+    U8 *bias_cpu_align = NULL;
+    if ((oc & 3) != 0) {
+        // pad the bias to a multiple of 4 channels for the image-backed buffer;
+        // only the oc valid entries are copied, the zeroed tail is padding
+        bias_cpu_align = ut_input_v((oc + 3) / 4 * 4, dt, UT_INIT_ZERO);
+        memcpy(bias_cpu_align, bias_cpu, oc * bytesOf(dt));
+        desc.host_ptr = bias_cpu_align;
+    } else {
+        desc.host_ptr = bias_cpu;
+    }
+
+    alloc_desc(biasTensor, desc);
+
+    desc = filterMemDesc;
+    alloc_desc(filterTensor, desc);
+
+    desc.stride[0] = fw * fh;
+    desc.stride[1] = fc;
+    desc.stride[2] = fn;
+    desc.offset[0] = 0;
+    desc.offset[1] = 0;
+    desc.offset[2] = 0;
+    desc.byteSize = fw * fh * fc * fn * bytesOf(dt);
+    desc.num = fw * fh * fc * fn;
+    desc.memType = GCL_MEM_BUF;
+    desc.memFormat = DF_NCHW;
+    desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
+    desc.host_ptr = filter_cpu;
+    alloc_desc(filterTensorOrg, desc);
+
+    tmpBytes = tensorNumBytes(inputDesc_gpu);
+    maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes;
+    GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes);
+
+    CHECK_STATUS(deconvolution_transform_filter(
+        filterTensorOrg, convParamSpec, alg, tmpTensor, &filterTensor, &archInfo));
+    CHECK_STATUS(ocl_set_input(handle, input, inputDesc_gpu, input_cpu, tmpbuf, true));
+
+    CHECK_STATUS(deconvolution(inputTensor, filterTensor, convParamSpec, alg, nullptr, biasTensor,
+        tmpTensor, outputTensor, activationDesc, &archInfo));
+    /*warm up*/
+    UNI_INFO_LOG("Warm up gpu:\n")
+    for (U32 i = 0; i < 2; i++) {
+        CHECK_STATUS(gcl_run_kernelVec(handle));
+    }
+
+#ifdef _DEBUG
+    CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size()));
+    double time = handle->t_execute * 0.001;
+#else
+    CHECK_STATUS(gcl_run_kernelVec(handle));
+#endif
+    TensorDesc outputDesc = outputTensor.get_desc();
+    CHECK_STATUS(ocl_get_output(handle, output, outputDesc, true));
+    output_gpu = output->mapPtrArray.back();
+    char buffer[150];
+    char params[120];
+    sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, fc, fh,
+        fw, stride, padding, on, oc, oh, ow);
+    sprintf(buffer, "%20s, %80s", "Deconvolution", params);
+#ifdef _DEBUG
+    double ops = (1.0 * on * oc * ih * iw) * (2.0 * ic * fh * fw + fh * fw);
+    ut_log(dt, buffer, ops, time);
+#endif
+    Tensor inputTensorCpu;
+    inputTensorCpu.resize(inputDesc);
+    inputTensorCpu.alloc();
+    memcpy(get_ptr_from_tensor(inputTensorCpu, UT_ARCH), input_cpu, tensorNumBytes(inputDesc));
+
+    Tensor filterTensorCpu;
+    filterTensorCpu.resize(filterDesc);
+    filterTensorCpu.alloc();
+    memcpy(get_ptr_from_tensor(filterTensorCpu, UT_ARCH), filter_cpu, tensorNumBytes(filterDesc));
+
+    Tensor biasTensorCpu;
+    biasTensorCpu.resize(biasDesc);
+    biasTensorCpu.alloc();
+    memcpy(get_ptr_from_tensor(biasTensorCpu, UT_ARCH), bias_cpu, tensorNumBytes(biasDesc));
+
+    Tensor outputTensorCpu;
+    outputTensorCpu.resize(outputDesc);
+    outputTensorCpu.alloc();
+
+    Tensor tmpTensorCpu;
+    CHECK_STATUS(
+        deconvolution(inputTensorCpu, filterTensorCpu, convParamSpec, CONVOLUTION_ALGORITHM_GEMM,
+            nullptr, biasTensorCpu, tmpTensorCpu, outputTensorCpu, activationDesc, &archInfo_org));
+    ut_check_a(output_gpu, get_ptr_from_tensor(outputTensorCpu, UT_ARCH), on * oc * ow * oh, dt);
+
+    CHECK_STATUS(gcl_finish(handle));
+    CHECK_STATUS(gcl_clean_kernelVec(handle));
+    free(input_cpu);
+    free(filter_cpu);
+    free(bias_cpu);
+    if (bias_cpu_align) {
+        free(bias_cpu_align);
+    }
+    return 0;
+}
+
+int main(int argc, char **argv)
+{
+#ifdef _USE_FP16
+    deconvolutionTest(argc, argv, DT_F16);
+#endif
+    return 0;
+}
diff --git a/compute/tensor/tests/test_depthwise_convolution.cpp b/compute/tensor/tests/test_depthwise_convolution.cpp
new file mode 100644
index 00000000..6b14c492
--- /dev/null
+++ b/compute/tensor/tests/test_depthwise_convolution.cpp
@@ -0,0 +1,257 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include + +#include "tensor_computing.h" +#include "ut_util.h" + +int depthwiseConvolutionTest(int argc, char *argv[], bool isFusedWithPw, DataType dt) +{ + CHECK_REQUIREMENT(argc == 16); + // in data + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + // weight + U32 fn = atoi(argv[5]); + U32 fc = atoi(argv[6]); + U32 fh = atoi(argv[7]); + U32 fw = atoi(argv[8]); + U32 group = atoi(argv[9]); + // stride & padding + U32 stride = atoi(argv[10]); + U32 padding = atoi(argv[11]); + // output + U32 on = atoi(argv[12]); + U32 oc = atoi(argv[13]); + U32 oh = atoi(argv[14]); + U32 ow = atoi(argv[15]); + CHECK_REQUIREMENT(in == 1 && on == 1); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + ActivationParamSpec dwActivationParamSpec; + ActivationParamSpec pwActivationParamSpec; + dwActivationParamSpec.mode = ACTIVATION_NULL; + pwActivationParamSpec.mode = ACTIVATION_NULL; + + TensorDesc inputDesc, dwFilterDesc, pwFilterDesc, outputDesc, dwBiasDesc, pwBiasDesc; + inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); + dwFilterDesc = tensor4df(dt, DF_NCHW, 1, ic, fh, fw); + dwBiasDesc = tensor1d(dt, ic); + if (isFusedWithPw) { + pwFilterDesc = tensor4df(dt, DF_NCHW, oc, ic, 1, 1); + pwBiasDesc = tensor1d(dt, oc); + } + ConvolutionParamSpec p = createConvolutionParamSpec(group, fh, fw, stride, stride, padding, + padding, padding, padding, 1, 1, fn, Convolution_Depthwise); + + // setup input, filter, bias + U8 *dwFilter = nullptr; + U8 *dwBias = nullptr; + U8 *pwFilter = nullptr; + U8 *pwBias = nullptr; + + U8 *input = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM); + dwFilter = ut_input_v(tensorNumElements(dwFilterDesc), dt, UT_INIT_RANDOM); + dwBias = ut_input_v(tensorNumElements(dwBiasDesc), dt, UT_INIT_RANDOM); + Tensor inputTensor; + Tensor inputTensorRef; + Tensor dwFilterTensor; + Tensor dwFilterTensorRef; + Tensor outputTensor; + Tensor outputTensorRef; + Tensor dwBiasTensor; + + inputTensor.resize(inputDesc); + inputTensorRef.resize(inputDesc); + dwFilterTensor.resize(dwFilterDesc); + dwFilterTensorRef.resize(dwFilterDesc); + dwBiasTensor.resize(dwBiasDesc); + + inputTensor.alloc(); + inputTensorRef.alloc(); + dwFilterTensor.alloc(); + dwFilterTensorRef.alloc(); + dwBiasTensor.alloc(); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, bytesOf(dt) * in * ic * ih * iw); + memcpy(get_ptr_from_tensor(inputTensorRef, UT_ARCH), input, bytesOf(dt) * in * ic * ih * iw); + memcpy(get_ptr_from_tensor(dwFilterTensor, UT_ARCH), dwFilter, bytesOf(dt) * 1 * ic * fh * fw); + memcpy( + get_ptr_from_tensor(dwFilterTensorRef, UT_ARCH), dwFilter, bytesOf(dt) * 1 * ic * fh * fw); + memcpy(get_ptr_from_tensor(dwBiasTensor, UT_ARCH), dwBias, bytesOf(dt) * ic); + Tensor pwFilterTensor; + Tensor pwFilterTensorRef; + Tensor pwBiasTensor; + if (isFusedWithPw) { + pwFilter = ut_input_v(tensorNumElements(pwFilterDesc), dt, UT_INIT_RANDOM); + pwBias = ut_input_v(tensorNumElements(pwBiasDesc), dt, UT_INIT_RANDOM); + 
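+        // NOTE: in the fused path the pointwise (1x1) stage gets its own
+        // filter and bias tensors, and each stage keeps a separate
+        // ActivationParamSpec so an activation can follow either stage.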
pwFilterTensor.resize(pwFilterDesc); + pwFilterTensorRef.resize(pwFilterDesc); + pwBiasTensor.resize(pwBiasDesc); + pwFilterTensor.alloc(); + pwFilterTensorRef.alloc(); + pwBiasTensor.alloc(); + memcpy( + get_ptr_from_tensor(pwFilterTensor, UT_ARCH), pwFilter, bytesOf(dt) * oc * ic * 1 * 1); + memcpy(get_ptr_from_tensor(pwFilterTensorRef, UT_ARCH), pwFilter, + bytesOf(dt) * oc * ic * 1 * 1); + memcpy(get_ptr_from_tensor(pwBiasTensor, UT_ARCH), pwBias, bytesOf(dt) * oc); + } + + // setup output, bias + if (isFusedWithPw) { + CHECK_STATUS(depthwise_pointwise_convolution_infer_output_size( + &inputTensor, dwFilterTensor, pwFilterTensor, p, &outputTensor, dt, &archInfo)); + } else { + CHECK_STATUS(depthwise_convolution_infer_output_size( + &inputTensor, dwFilterTensor, p, &outputTensor, dt, &archInfo)); + } + + outputTensor.alloc(); + outputTensorRef.resize(outputTensor.get_desc()); + outputTensorRef.alloc(); + + // setup alg + ConvolutionPolicy policy = CONVOLUTION_FASTEST; + DepthwiseConvolutionForwardAlgorithm alg = DEPTHWISE_CONVOLUTION_ALGORITHM_NULL; + if (isFusedWithPw) { + CHECK_STATUS(depthwise_pointwise_convolution_infer_forward_algorithm(inputTensor, + dwFilterTensor, pwFilterTensor, outputTensor, p, policy, &alg, dt, + dwActivationParamSpec, pwActivationParamSpec, &archInfo)); + } else { + CHECK_STATUS(depthwise_convolution_infer_forward_algorithm(inputTensor, dwFilterTensor, + outputTensor, p, policy, &alg, dt, dwActivationParamSpec, &archInfo)); + } + + // setup tmp + U32 tmpBytes; + if (isFusedWithPw) { + CHECK_STATUS(depthwise_pointwise_convolution_infer_forward_tmp_bytes(inputTensor, + dwFilterTensor, pwFilterTensor, outputTensor, p, alg, &tmpBytes, &archInfo)); + } else { + CHECK_STATUS(depthwise_convolution_infer_forward_tmp_bytes( + inputTensor, dwFilterTensor, outputTensor, p, alg, &tmpBytes, &archInfo)); + } + Tensor tmpTensor; + tmpTensor.resize(tensor1d(DT_U8, tmpBytes)); + tmpTensor.alloc(); + + // setup filter trans + U32 dwBytes, pwBytes; + if (isFusedWithPw) { + CHECK_STATUS(depthwise_pointwise_convolution_transform_filter_bytes( + dwFilterTensor, pwFilterTensor, p, alg, &dwBytes, &pwBytes, &archInfo)); + } else { + CHECK_STATUS(depthwise_convolution_transform_filter_bytes( + dwFilterTensor, p, alg, &dwBytes, &archInfo)); + } + Tensor dwFtmTensor; + dwFtmTensor.resize(tensor1d(DT_U8, dwBytes)); + dwFtmTensor.alloc(); + Tensor pwFtmTensor; + if (isFusedWithPw) { + pwFtmTensor.resize(tensor1d(DT_U8, pwBytes)); + pwFtmTensor.alloc(); + } + + // trans filter + if (isFusedWithPw) { + CHECK_STATUS(depthwise_pointwise_convolution_transform_filter( + dwFilterTensor, pwFilterTensor, p, alg, &dwFtmTensor, &pwFtmTensor, &archInfo)); + } else { + CHECK_STATUS( + depthwise_convolution_transform_filter(dwFilterTensor, p, alg, &dwFtmTensor, &archInfo)); + } + + if (UT_CHECK) { + if (isFusedWithPw) { + CHECK_STATUS(depthwise_pointwise_convolution(inputTensor, dwFtmTensor, pwFtmTensor, p, + alg, dwBiasTensor, pwBiasTensor, tmpTensor, outputTensor, dwActivationParamSpec, + pwActivationParamSpec, &archInfo)); + + // naive implement + CHECK_STATUS(depthwise_pointwise_convolution(inputTensorRef, dwFilterTensorRef, + pwFilterTensorRef, p, alg, dwBiasTensor, pwBiasTensor, tmpTensor, outputTensorRef, + dwActivationParamSpec, pwActivationParamSpec, &archInfo_org)); + } else { + CHECK_STATUS(depthwise_convolution(inputTensor, dwFtmTensor, p, alg, dwBiasTensor, + tmpTensor, outputTensor, dwActivationParamSpec, &archInfo)); + + // naive implement + 
CHECK_STATUS(depthwise_convolution(inputTensorRef, dwFilterTensorRef, p, alg, + dwBiasTensor, tmpTensor, outputTensorRef, dwActivationParamSpec, &archInfo_org)); + } + + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), outputTensor.length(), dt, 0.1, __FILE__, + __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + if (isFusedWithPw) { + CHECK_STATUS(depthwise_pointwise_convolution(inputTensor, dwFtmTensor, pwFtmTensor, p, + alg, dwBiasTensor, pwBiasTensor, tmpTensor, outputTensor, dwActivationParamSpec, + pwActivationParamSpec, &archInfo)); + } else { + CHECK_STATUS(depthwise_convolution(inputTensor, dwFtmTensor, p, alg, dwBiasTensor, + tmpTensor, outputTensor, dwActivationParamSpec, &archInfo)); + } + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, fc, fh, + fw, stride, padding, on, oc, oh, ow); + double ops = 0; + if (isFusedWithPw) { + sprintf(buffer, "%20s, %80s", "DepthwisePointwise", params); + ops = 2.0 * in * ic * ih * iw * fh * fw + in * ic * oh * ow + 2.0 * on * oc * oh * ow * ic + + on * oc * oh * ow; + } else { + sprintf(buffer, "%20s, %80s", "DepthwiseConvolution", params); + ops = 2.0 * in * ic * ih * iw * fh * fw + in * ic * oh * ow; + } + ut_log(dt, buffer, ops, time); + + free(input); + free(dwFilter); + free(dwBias); + if (isFusedWithPw) { + free(pwFilter); + free(pwBias); + } + return 0; +} + +int main(int argc, char *argv[]) +{ +#ifdef _USE_FP16 + depthwiseConvolutionTest(argc, argv, true, DT_F16); + depthwiseConvolutionTest(argc, argv, false, DT_F16); +#endif +#ifdef _USE_FP32 + depthwiseConvolutionTest(argc, argv, true, DT_F32); + depthwiseConvolutionTest(argc, argv, false, DT_F32); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_depthwise_convolution_int8.cpp b/compute/tensor/tests/test_depthwise_convolution_int8.cpp new file mode 100644 index 00000000..f1dfcd56 --- /dev/null +++ b/compute/tensor/tests/test_depthwise_convolution_int8.cpp @@ -0,0 +1,190 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
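+// NOTE: the FLOP estimate logged by the test below decomposes as
+//   2*in*ic*ih*iw*fh*fw   (depthwise multiply-adds)
+// + in*ic*oh*ow           (depthwise bias adds)
+// + 2*on*oc*oh*ow*ic      (pointwise 1x1 multiply-adds)
+// + on*oc*oh*ow           (pointwise bias adds)
+// with I32 bias/output and RELU6 after both stages.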
+ +#include + +#include "tensor_computing.h" +#include "ut_util.h" + +int main(int argc, char *argv[]) +{ +#ifdef _USE_INT8 + CHECK_REQUIREMENT(argc == 16); + // in data + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + // weight + U32 fn = atoi(argv[5]); + U32 fc = atoi(argv[6]); + U32 fh = atoi(argv[7]); + U32 fw = atoi(argv[8]); + U32 group = atoi(argv[9]); + // stride & padding + U32 stride = atoi(argv[10]); + U32 padding = atoi(argv[11]); + // output + U32 on = atoi(argv[12]); + U32 oc = atoi(argv[13]); + U32 oh = atoi(argv[14]); + U32 ow = atoi(argv[15]); + CHECK_REQUIREMENT(in == 1 && on == 1); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + DataType dt = DT_I8; + DataType odt = DT_I32; + ActivationParamSpec dwActivationParamSpec; + ActivationParamSpec pwActivationParamSpec; + dwActivationParamSpec.mode = ACTIVATION_RELU6; + pwActivationParamSpec.mode = ACTIVATION_RELU6; + + TensorDesc inputDesc, dwFilterDesc, pwFilterDesc, outputDesc, dwBiasDesc, pwBiasDesc; + inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); + dwFilterDesc = tensor4df(dt, DF_NCHW, 1, ic, fh, fw); + pwFilterDesc = tensor4df(dt, DF_NCHW, oc, ic, 1, 1); + dwBiasDesc = tensor1d(odt, ic); + pwBiasDesc = tensor1d(odt, oc); + ConvolutionParamSpec convParamSpec = createConvolutionParamSpec(group, fh, fw, stride, stride, + padding, padding, padding, padding, 1, 1, fn, Convolution_Depthwise); + + // setup input, filter, bias + INT8 *input = (INT8 *)ut_input_v(in * ic * ih * iw, DT_I8, UT_INIT_RANDOM); + INT8 *dwFilter = (INT8 *)ut_input_v(tensorNumElements(dwFilterDesc), DT_I8, UT_INIT_RANDOM); + INT8 *pwFilter = (INT8 *)ut_input_v(tensorNumElements(pwFilterDesc), DT_I8, UT_INIT_RANDOM); + I32 *dwBias = (I32 *)ut_input_v(ic, DT_I32, UT_INIT_RANDOM); + I32 *pwBias = (I32 *)ut_input_v(oc, DT_I32, UT_INIT_RANDOM); + + Tensor inputTensor; + Tensor inputTensorRef; + Tensor dwFilterTensor; + Tensor dwFilterTensorRef; + Tensor outputTensor; + Tensor outputTensorRef; + Tensor dwBiasTensor; + + inputTensor.resize(inputDesc); + inputTensorRef.resize(inputDesc); + dwFilterTensor.resize(dwFilterDesc); + dwFilterTensorRef.resize(dwFilterDesc); + dwBiasTensor.resize(dwBiasDesc); + + inputTensor.alloc(); + inputTensorRef.alloc(); + dwFilterTensor.alloc(); + dwFilterTensorRef.alloc(); + dwBiasTensor.alloc(); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, bytesOf(dt) * in * ic * ih * iw); + memcpy(get_ptr_from_tensor(inputTensorRef, UT_ARCH), input, bytesOf(dt) * in * ic * ih * iw); + memcpy(get_ptr_from_tensor(dwFilterTensor, UT_ARCH), dwFilter, bytesOf(dt) * 1 * ic * fh * fw); + memcpy( + get_ptr_from_tensor(dwFilterTensorRef, UT_ARCH), dwFilter, bytesOf(dt) * 1 * ic * fh * fw); + memcpy(get_ptr_from_tensor(dwBiasTensor, UT_ARCH), dwBias, bytesOf(dt) * ic); + + Tensor pwFilterTensor; + Tensor pwFilterTensorRef; + Tensor pwBiasTensor; + pwFilterTensor.resize(pwFilterDesc); + pwFilterTensorRef.resize(pwFilterDesc); + pwBiasTensor.resize(pwBiasDesc); + pwFilterTensor.alloc(); + pwFilterTensorRef.alloc(); + pwBiasTensor.alloc(); + memcpy(get_ptr_from_tensor(pwFilterTensor, UT_ARCH), pwFilter, bytesOf(dt) * oc * ic * 1 * 1); + memcpy(get_ptr_from_tensor(pwFilterTensorRef, UT_ARCH), pwFilter, bytesOf(dt) * oc * ic * 1 * 1); + memcpy(get_ptr_from_tensor(pwBiasTensor, UT_ARCH), pwBias, bytesOf(dt) * oc); + + // setup output, bias + CHECK_STATUS(depthwise_pointwise_convolution_infer_output_size(&inputTensor, 
dwFilterTensor, + pwFilterTensor, convParamSpec, &outputTensor, odt, &archInfo)); + outputTensor.alloc(); + outputTensorRef.resize(outputTensor.get_desc()); + outputTensorRef.alloc(); + + // setup alg + ConvolutionPolicy policy = CONVOLUTION_FASTEST; + DepthwiseConvolutionForwardAlgorithm alg = DEPTHWISE_CONVOLUTION_ALGORITHM_NULL; + CHECK_STATUS(depthwise_pointwise_convolution_infer_forward_algorithm(inputTensor, + dwFilterTensor, pwFilterTensor, outputTensor, convParamSpec, policy, &alg, dt, + dwActivationParamSpec, pwActivationParamSpec, &archInfo)); + + // setup tmp + U32 tmpBytes; + CHECK_STATUS(depthwise_pointwise_convolution_infer_forward_tmp_bytes(inputTensor, + dwFilterTensor, pwFilterTensor, outputTensor, convParamSpec, alg, &tmpBytes, &archInfo)); + Tensor tmpTensor; + tmpTensor.resize(tensor1d(DT_U8, tmpBytes)); + tmpTensor.alloc(); + + // setup filter trans + U32 dwBytes, pwBytes; + CHECK_STATUS(depthwise_pointwise_convolution_transform_filter_bytes( + dwFilterTensor, pwFilterTensor, convParamSpec, alg, &dwBytes, &pwBytes, &archInfo)); + Tensor dwFtmTensor; + dwFtmTensor.resize(tensor1d(DT_U8, dwBytes)); + dwFtmTensor.alloc(); + Tensor pwFtmTensor; + pwFtmTensor.resize(tensor1d(DT_U8, pwBytes)); + pwFtmTensor.alloc(); + // trans filter + CHECK_STATUS(depthwise_pointwise_convolution_transform_filter( + dwFilterTensor, pwFilterTensor, convParamSpec, alg, &dwFtmTensor, &pwFtmTensor, &archInfo)); + + if (UT_CHECK) { + CHECK_STATUS(depthwise_pointwise_convolution(inputTensor, dwFtmTensor, pwFtmTensor, + convParamSpec, alg, dwBiasTensor, pwBiasTensor, tmpTensor, outputTensor, + dwActivationParamSpec, pwActivationParamSpec, &archInfo)); + + // naive implement + CHECK_STATUS(depthwise_pointwise_convolution(inputTensorRef, dwFilterTensorRef, + pwFilterTensorRef, convParamSpec, alg, dwBiasTensor, pwBiasTensor, tmpTensor, + outputTensorRef, dwActivationParamSpec, pwActivationParamSpec, &archInfo_org)); + + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), outputTensor.length(), DT_I32, 1, + __FILE__, __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(depthwise_pointwise_convolution(inputTensor, dwFtmTensor, pwFtmTensor, + convParamSpec, alg, dwBiasTensor, pwBiasTensor, tmpTensor, outputTensor, + dwActivationParamSpec, pwActivationParamSpec, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, fc, fh, + fw, stride, padding, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "DepthwiseConvolution", params); + double ops = 2.0 * in * ic * ih * iw * fh * fw + in * ic * oh * ow + + 2.0 * on * oc * oh * ow * ic + on * oc * oh * ow; + ut_log(DT_I8, buffer, ops, time); + + free(input); + free(dwFilter); + free(pwFilter); + free(dwBias); + free(pwBias); +#endif + + return 0; +} diff --git a/compute/tensor/tests/test_depthwise_convolution_ocl.cpp b/compute/tensor/tests/test_depthwise_convolution_ocl.cpp new file mode 100644 index 00000000..d135ff50 --- /dev/null +++ b/compute/tensor/tests/test_depthwise_convolution_ocl.cpp @@ -0,0 +1,270 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <string.h>
+#include "tensor_computing.h"
+#include "ut_util.h"
+#include "gcl.h"
+#include "libkernelsource.h"
+
+#ifdef _USE_FP16
+inline GCLMem_t alloc(Tensor tensor)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    mem->alloc();
+    return (GCLMem_t)mem->get_ptr();
+}
+
+inline GCLMem_t alloc_map(Tensor tensor)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    mem->mapped_alloc();
+    return (GCLMem_t)mem->get_ptr();
+}
+
+inline GCLMem_t alloc_bytes(Tensor tensor, U32 size)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    GCLMem_t ptr = NULL;
+    if (size > 0) {
+        mem->resize(tensor1d(DT_U8, size));
+        mem->alloc();
+        ptr = (GCLMem_t)mem->get_ptr();
+    }
+    return ptr;
+}
+
+inline GCLMem_t alloc_desc(Tensor tensor, GCLMemDesc desc)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    mem->padding(desc);
+    mem->alloc();
+    return (GCLMem_t)mem->get_ptr();
+}
+int depthwiseConvolutionTest(int argc, char *argv[], DataFormat filterDataFormat, DataType dt)
+{
+    U32 in, ic, ih, iw;
+    U32 fn, fc, fh, fw;
+    U32 group, stride, padding;
+    U32 on, oc, oh, ow;
+    U32 biasNum;
+    ArchInfo archInfo;
+    archInfo.arch = MALI;
+    ArchInfo archInfo_org;
+    archInfo_org.arch = CPU_GENERAL;
+
+    in = 1;
+    ic = 8;
+    ih = 4;
+    iw = 4;
+    fn = 1;
+    fc = 8;
+    fh = 3;
+    fw = 3;
+    group = 1;
+    stride = 1;
+    padding = 1;
+
+    if (argc == 9) {
+        ic = atoi(argv[1]);
+        ih = atoi(argv[2]);
+        iw = atoi(argv[3]);
+        fc = atoi(argv[4]);
+        fh = atoi(argv[5]);
+        fw = atoi(argv[6]);
+        stride = atoi(argv[7]);
+        padding = atoi(argv[8]);
+    }
+
+    on = 1;
+    oc = fc;
+    oh = (ih + padding * 2 - fh) / stride + 1;
+    ow = (iw + padding * 2 - fw) / stride + 1;
+    ActivationParamSpec dwActivationParamSpec;
+    dwActivationParamSpec.mode = ACTIVATION_NULL;
+    ConvolutionParamSpec convParamSpec = createConvolutionParamSpec(group, fh, fw, stride, stride,
+        padding, padding, padding, padding, 1, 1, fn, Convolution_Depthwise);
+
+    U32 filterLen = fn * fc * fh * fw;
+    U32 biasLen = oc;
+    TensorDesc inputDesc = tensor4df(dt, DF_NCHW, in, ic, ih, iw);
+    TensorDesc filterDesc = tensor4df(dt, filterDataFormat, fn, fc, fh, fw);
+    TensorDesc biasDesc = tensor1d(dt, biasLen);
+    U8 *input_cpu = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM);
+    U8 *filter_cpu = ut_input_v(filterLen, dt, UT_INIT_RANDOM);
+    U8 *bias_cpu = ut_input_v(biasLen, dt, UT_INIT_RANDOM);
+
+    std::shared_ptr<GCLHandle> handleSharedPtr = OCLContext::getInstance().handle;
+    GCLHandle_t handle = handleSharedPtr.get();
+    std::vector<Kernel> kernelVec;
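+    // Every kernel that the setup and run calls below enqueue is recorded in this local
+    // vector through the handle, so gcl_run_kernelVec can replay the identical kernel
+    // sequence for the warm-up and timing loops later in the test.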
handle->kernelVec = &kernelVec; + Tensor inputTensor = Tensor(OCLMem); + Tensor outputTensor = Tensor(OCLMem); + Tensor filterTensorOrg = Tensor(OCLMem); + Tensor filterTensor = Tensor(OCLMem); + Tensor biasTensor = Tensor(OCLMem); + Tensor tmpTensor = Tensor(OCLMem); + inputTensor.resize(inputDesc); + filterTensor.resize(filterDesc); + filterTensorOrg.resize(filterDesc); + biasTensor.resize(biasDesc); + + MaliPara maliPara; + ForwardRunInfoMali runInfo; + runInfo.algorithm = (I32)(DEPTHWISE_CONVOLUTION_ALGORITHM_NULL); + maliPara.handle = handle; + maliPara.forwardRunInfo = &runInfo; + archInfo.archPara = &maliPara; + + CHECK_STATUS(depthwise_convolution_infer_output_size( + &inputTensor, filterTensor, convParamSpec, &outputTensor, dt, &archInfo)); + ConvolutionPolicy policy = CONVOLUTION_TUNNING; + DepthwiseConvolutionForwardAlgorithm alg = DEPTHWISE_CONVOLUTION_ALGORITHM_NULL; + CHECK_STATUS(depthwise_convolution_infer_forward_algorithm(inputTensor, filterTensor, + outputTensor, convParamSpec, policy, &alg, dt, dwActivationParamSpec, &archInfo)); + + U32 maxBytes = 0; + U32 tmpBytes; + CHECK_STATUS(depthwise_convolution_infer_forward_tmp_bytes( + inputTensor, filterTensor, outputTensor, convParamSpec, alg, &tmpBytes, &archInfo)); + maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes; + + U32 str[3] = {0, 0, 0}; + U32 off[3] = {0, 0, 0}; + GCLMemDesc filterMemDesc = gcl_mem_desc(str, off, DT_U8, DF_NCWHC4); + maliPara.gclmemFilterDesc = &filterMemDesc; + U32 ftmBytes; + CHECK_STATUS(depthwise_convolution_transform_filter_bytes( + filterTensor, convParamSpec, alg, &ftmBytes, &archInfo)); + + GCLMem_t output = alloc_map(outputTensor); + GCLMem_t input = alloc(inputTensor); + CHECK_STATUS(gcl_fill_memory_zero(handle, input)); + + GCLMemDesc desc = gclmem_build_desc(); + biasNum = (oc + 3) / 4; + desc.memType = GCL_MEM_IMG_1D; + desc.byteSize = biasNum * 4 * bytesOf(dt); + desc.stride[0] = biasNum; + desc.stride[1] = 1; + desc.stride[2] = 1; + desc.offset[0] = 0; + desc.offset[1] = 0; + desc.offset[2] = 0; + desc.num = biasNum; + desc.memFormat = DF_NHWC; + desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + desc.host_ptr = bias_cpu; + alloc_desc(biasTensor, desc); + + desc = filterMemDesc; + alloc_desc(filterTensor, desc); + + desc.stride[0] = fw * fh; + desc.stride[1] = fc; + desc.stride[2] = fn; + desc.offset[0] = 0; + desc.offset[1] = 0; + desc.offset[2] = 0; + desc.byteSize = fw * fh * fc * fn * bytesOf(dt); + desc.num = fw * fh * fc * fn; + desc.memType = GCL_MEM_BUF; + desc.memFormat = DF_NCHW; + desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + desc.host_ptr = filter_cpu; + alloc_desc(filterTensorOrg, desc); + + tmpBytes = tensorNumBytes(inputDesc); + maxBytes = (tmpBytes > maxBytes) ? 
tmpBytes : maxBytes; + GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes); + + CHECK_STATUS(depthwise_convolution_transform_filter( + filterTensorOrg, convParamSpec, alg, &filterTensor, &archInfo)); + + CHECK_STATUS(ocl_set_input(handle, input, inputDesc, input_cpu, tmpbuf, true)); + + CHECK_STATUS(depthwise_convolution(inputTensor, filterTensor, convParamSpec, alg, biasTensor, + tmpTensor, outputTensor, dwActivationParamSpec, &archInfo)); + + /*warp up*/ + UNI_INFO_LOG("Warp up gpu:\n") + for (U32 i = 0; i < 2; i++) { + CHECK_STATUS(gcl_run_kernelVec(handle)); + } + +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size())); + double time = handle->t_execute * 0.001; +#else + CHECK_STATUS(gcl_run_kernelVec(handle)); +#endif + TensorDesc outputDesc = outputTensor.get_desc(); + CHECK_STATUS(ocl_get_output(handle, output, outputDesc, true)); + void *output_gpu = output->mapPtrArray.back(); + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, fc, fh, + fw, stride, padding, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "DepthwiseConvolution", params); +#ifdef _DEBUG + double ops = 2.0 * in * ic * ih * iw * fh * fw + in * ic * oh * ow; + ut_log(dt, buffer, ops, time); +#endif + Tensor inputTensorCpu; + inputTensorCpu.resize(inputDesc); + inputTensorCpu.alloc(); + memcpy(get_ptr_from_tensor(inputTensorCpu, UT_ARCH), input_cpu, tensorNumBytes(inputDesc)); + + Tensor filterTensorCpu; + filterTensorCpu.resize(filterDesc); + filterTensorCpu.alloc(); + memcpy(get_ptr_from_tensor(filterTensorCpu, UT_ARCH), filter_cpu, tensorNumBytes(filterDesc)); + + Tensor biasTensorCpu; + biasTensorCpu.resize(biasDesc); + biasTensorCpu.alloc(); + memcpy(get_ptr_from_tensor(biasTensorCpu, UT_ARCH), bias_cpu, tensorNumBytes(biasDesc)); + + Tensor outputTensorCpu; + outputTensorCpu.resize(outputDesc); + outputTensorCpu.alloc(); + + Tensor tmpTensorCpu; + // setup tmp + CHECK_STATUS(depthwise_convolution_infer_forward_tmp_bytes(inputTensorCpu, filterTensorCpu, + outputTensorCpu, convParamSpec, alg, &tmpBytes, &archInfo)); + tmpTensorCpu.resize(tensor1d(DT_F16, tmpBytes / bytesOf(DT_F16))); + tmpTensorCpu.alloc(); + + CHECK_STATUS(depthwise_convolution(inputTensorCpu, filterTensorCpu, convParamSpec, + DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT, biasTensorCpu, tmpTensorCpu, outputTensorCpu, + dwActivationParamSpec, &archInfo_org)); + ut_check_a(output_gpu, get_ptr_from_tensor(outputTensorCpu, UT_ARCH), on * oc * ow * oh, dt); + + CHECK_STATUS(gcl_finish(handle)); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + free(input_cpu); + free(filter_cpu); + free(bias_cpu); + return 0; +} +#endif + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + depthwiseConvolutionTest(argc, argv, DF_NCHW, DT_F16); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_depthwise_pointwise_convolution_ocl.cpp b/compute/tensor/tests/test_depthwise_pointwise_convolution_ocl.cpp new file mode 100644 index 00000000..b87b7126 --- /dev/null +++ b/compute/tensor/tests/test_depthwise_pointwise_convolution_ocl.cpp @@ -0,0 +1,361 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include "tensor_computing.h" +#include "ut_util.h" +#include "gcl.h" +#include "libkernelsource.h" + +#ifdef _USE_FP16 +inline GCLMem_t alloc(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_map(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->mapped_alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_bytes(Tensor tensor, U32 size) +{ + auto mem = (OclMemory *)tensor.get_memory(); + GCLMem_t ptr = NULL; + if (size > 0) { + mem->resize(tensor1d(DT_U8, size)); + mem->alloc(); + ptr = (GCLMem_t)mem->get_ptr(); + } + return ptr; +} + +inline GCLMem_t alloc_desc(Tensor tensor, GCLMemDesc desc) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->padding(desc); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +int depthwisePointwiseConvolutionTest( + int argc, char *argv[], DataFormat filterDataFormat, DataType dt) +{ + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 group, stride, padding; + U32 on, oc, oh, ow; + U32 biasNum; + ArchInfo archInfo; + archInfo.arch = MALI; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + in = 1; + ic = 8; + ih = 4; + iw = 4; + fn = 8; + fh = 3; + fw = 3; + group = 1; + stride = 1; + padding = 1; + + if (argc == 9) { + ic = atoi(argv[1]); + ih = atoi(argv[2]); + iw = atoi(argv[3]); + fn = atoi(argv[4]); + fh = atoi(argv[5]); + fw = atoi(argv[6]); + stride = atoi(argv[7]); + padding = atoi(argv[8]); + } + fc = ic; + on = 1; + oc = fn; + oh = (ih + padding * 2 - fh) / stride + 1; + ow = (iw + padding * 2 - fw) / stride + 1; + ActivationParamSpec dwActivationParamSpec; + ActivationParamSpec pwActivationParamSpec; + dwActivationParamSpec.mode = ACTIVATION_NULL; + pwActivationParamSpec.mode = ACTIVATION_NULL; + ConvolutionParamSpec convParamSpec = createConvolutionParamSpec(group, fh, fw, stride, stride, + padding, padding, padding, padding, 1, 1, fn, Convolution_Depthwise_Pointwise); + + U32 dwFilterLen = 1 * fc * fh * fw; + U32 pwFilterLen = fn * fc * 1 * 1; + U32 dwBiasLen = fc; + U32 pwBiasLen = fn; + + TensorDesc inputDesc = tensor4df(dt, DF_NCHW, in, ic, ih, iw); + TensorDesc dwFilterDesc = tensor4df(dt, filterDataFormat, 1, fc, fh, fw); + TensorDesc pwFilterDesc = tensor4df(dt, filterDataFormat, fn, fc, 1, 1); + TensorDesc dwBiasDesc = tensor1d(dt, dwBiasLen); + TensorDesc pwBiasDesc = tensor1d(dt, pwBiasLen); + + U8 *input_cpu = ut_input_v(in * ic * 
ih * iw, dt, UT_INIT_RANDOM); + U8 *dw_filter_cpu = ut_input_v(dwFilterLen, dt, UT_INIT_RANDOM); + U8 *pw_filter_cpu = ut_input_v(pwFilterLen, dt, UT_INIT_RANDOM); + U8 *dw_bias_cpu = ut_input_v(dwBiasLen, dt, UT_INIT_RANDOM); + U8 *pw_bias_cpu = ut_input_v(pwBiasLen, dt, UT_INIT_RANDOM); + + std::shared_ptr handleSharedPtr = OCLContext::getInstance().handle; + ; + GCLHandle_t handle = handleSharedPtr.get(); + std::vector kernelVec; + handle->kernelVec = &kernelVec; + Tensor inputTensor = Tensor(OCLMem); + Tensor outputTensor = Tensor(OCLMem); + Tensor dwFilterTensorOrg = Tensor(OCLMem); + Tensor dwFilterTensor = Tensor(OCLMem); + Tensor pwFilterTensorOrg = Tensor(OCLMem); + Tensor pwFilterTensor = Tensor(OCLMem); + Tensor dwBiasTensor = Tensor(OCLMem); + Tensor pwBiasTensor = Tensor(OCLMem); + Tensor pwBiasTensorBuf = Tensor(OCLMem); + Tensor pwBiasTensorImg = Tensor(OCLMem); + Tensor tmpTensor = Tensor(OCLMem); + inputTensor.resize(inputDesc); + dwFilterTensor.resize(dwFilterDesc); + dwFilterTensorOrg.resize(dwFilterDesc); + pwFilterTensor.resize(pwFilterDesc); + pwFilterTensorOrg.resize(pwFilterDesc); + dwBiasTensor.resize(dwBiasDesc); + pwBiasTensor.resize(pwBiasDesc); + pwBiasTensorBuf.resize(pwBiasDesc); + pwBiasTensorImg.resize(pwBiasDesc); + + MaliPara maliPara; + ForwardRunInfoMali runInfo; + runInfo.algorithm = (I32)(DEPTHWISE_CONVOLUTION_ALGORITHM_NULL); + maliPara.handle = handle; + maliPara.forwardRunInfo = &runInfo; + archInfo.archPara = &maliPara; + + CHECK_STATUS(depthwise_pointwise_convolution_infer_output_size( + &inputTensor, dwFilterTensor, pwFilterTensor, convParamSpec, &outputTensor, dt, &archInfo)); + ConvolutionPolicy policy = CONVOLUTION_TUNNING; + DepthwiseConvolutionForwardAlgorithm alg = DEPTHWISE_CONVOLUTION_ALGORITHM_NULL; + CHECK_STATUS(depthwise_pointwise_convolution_infer_forward_algorithm(inputTensor, + dwFilterTensor, pwFilterTensor, outputTensor, convParamSpec, policy, &alg, DT_F16, + dwActivationParamSpec, pwActivationParamSpec, &archInfo)); + + U32 maxBytes = 0; + U32 tmpBytes; + CHECK_STATUS(depthwise_pointwise_convolution_infer_forward_tmp_bytes(inputTensor, + dwFilterTensor, pwFilterTensor, outputTensor, convParamSpec, alg, &tmpBytes, &archInfo)); + maxBytes = (tmpBytes > maxBytes) ? 
tmpBytes : maxBytes; + + U32 dwBytes; + U32 pwBytes; + U32 str[3] = {0, 0, 0}; + U32 off[3] = {0, 0, 0}; + GCLMemDesc filterMemDesc[2]; + filterMemDesc[0] = gcl_mem_desc(str, off, DT_U8, DF_NCWHC4); + filterMemDesc[1] = gcl_mem_desc(str, off, DT_U8, DF_NCWHC4); + maliPara.gclmemFilterDesc = filterMemDesc; + CHECK_STATUS(depthwise_pointwise_convolution_transform_filter_bytes( + dwFilterTensor, pwFilterTensor, convParamSpec, alg, &dwBytes, &pwBytes, &archInfo)); + + GCLMem_t output = alloc_map(outputTensor); + GCLMem_t input = alloc(inputTensor); + CHECK_STATUS(gcl_fill_memory_zero(handle, input)); + + GCLMemDesc desc = gclmem_build_desc(); + biasNum = (oc + 3) / 4; + desc.memType = GCL_MEM_IMG_1D; + desc.byteSize = biasNum * 4 * bytesOf(dt); + desc.stride[0] = biasNum; + desc.stride[1] = 1; + desc.stride[2] = 1; + desc.offset[0] = 0; + desc.offset[1] = 0; + desc.offset[2] = 0; + desc.num = biasNum; + desc.memFormat = DF_NHWC; + desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + desc.host_ptr = pw_bias_cpu; + alloc_desc(pwBiasTensorImg, desc); + + biasNum = (oc + 7) / 8 * 8; + desc.memType = GCL_MEM_BUF; + desc.byteSize = biasNum * bytesOf(dt); + desc.stride[0] = biasNum; + desc.stride[1] = 1; + desc.stride[2] = 1; + desc.offset[0] = 0; + desc.offset[1] = 0; + desc.offset[2] = 0; + desc.num = biasNum; + desc.memFormat = DF_NHWC; + desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + desc.host_ptr = pw_bias_cpu; + alloc_desc(pwBiasTensorBuf, desc); + + biasNum = (ic + 3) / 4; + desc.memType = GCL_MEM_IMG_1D; + desc.byteSize = biasNum * 4 * bytesOf(dt); + desc.stride[0] = biasNum; + desc.stride[1] = 1; + desc.stride[2] = 1; + desc.offset[0] = 0; + desc.offset[1] = 0; + desc.offset[2] = 0; + desc.num = biasNum; + desc.memFormat = DF_NHWC; + desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + desc.host_ptr = dw_bias_cpu; + alloc_desc(dwBiasTensor, desc); + + desc = filterMemDesc[0]; + alloc_desc(dwFilterTensor, desc); + desc = filterMemDesc[1]; + alloc_desc(pwFilterTensor, desc); + + desc.stride[0] = fw * fh; + desc.stride[1] = fc; + desc.stride[2] = 1; + desc.offset[0] = 0; + desc.offset[1] = 0; + desc.offset[2] = 0; + desc.byteSize = fw * fh * fc * bytesOf(dt); + desc.num = fw * fh * fc; + desc.memType = GCL_MEM_BUF; + desc.memFormat = DF_NCHW; + desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + desc.host_ptr = dw_filter_cpu; + alloc_desc(dwFilterTensorOrg, desc); + + desc.stride[0] = 1; + desc.stride[1] = fc; + desc.stride[2] = fn; + desc.offset[0] = 0; + desc.offset[1] = 0; + desc.offset[2] = 0; + desc.byteSize = fn * fc * bytesOf(dt); + desc.num = fn * fc; + desc.memType = GCL_MEM_BUF; + desc.memFormat = DF_NCHW; + desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + desc.host_ptr = pw_filter_cpu; + alloc_desc(pwFilterTensorOrg, desc); + + tmpBytes = tensorNumBytes(inputDesc); + maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes; + GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes); + + CHECK_STATUS(depthwise_pointwise_convolution_transform_filter(dwFilterTensorOrg, + pwFilterTensorOrg, convParamSpec, alg, &dwFilterTensor, &pwFilterTensor, &archInfo)); + pwBiasTensor = (runInfo.algorithm == (I32)(DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_GEMM)) + ? 
pwBiasTensorBuf + : pwBiasTensorImg; + + CHECK_STATUS(ocl_set_input(handle, input, inputDesc, input_cpu, tmpbuf, true)); + + CHECK_STATUS(depthwise_pointwise_convolution(inputTensor, dwFilterTensor, pwFilterTensor, + convParamSpec, alg, dwBiasTensor, pwBiasTensor, tmpTensor, outputTensor, + dwActivationParamSpec, pwActivationParamSpec, &archInfo)); + /*warp up*/ + UNI_INFO_LOG("Warp up gpu:\n") + for (U32 i = 0; i < 2; i++) { + CHECK_STATUS(gcl_run_kernelVec(handle)); + } + +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size())); + double time = handle->t_execute * 0.001; +#else + CHECK_STATUS(gcl_run_kernelVec(handle)); +#endif + TensorDesc outputDesc = outputTensor.get_desc(); + CHECK_STATUS(ocl_get_output(handle, output, outputDesc, true)); + void *output_gpu = output->mapPtrArray.back(); + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, fc, fh, + fw, stride, padding, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "DepthwisePointwise", params); +#ifdef _DEBUG + double ops = 2.0 * in * ic * ih * iw * fh * fw + in * ic * oh * ow + + 2.0 * on * oc * oh * ow * ic + on * oc * oh * ow; + ut_log(dt, buffer, ops, time); +#endif + Tensor inputTensorCpu; + inputTensorCpu.resize(inputDesc); + inputTensorCpu.alloc(); + memcpy(get_ptr_from_tensor(inputTensorCpu, UT_ARCH), input_cpu, tensorNumBytes(inputDesc)); + + Tensor dwFilterTensorCpu; + dwFilterTensorCpu.resize(dwFilterDesc); + dwFilterTensorCpu.alloc(); + memcpy(get_ptr_from_tensor(dwFilterTensorCpu, UT_ARCH), dw_filter_cpu, + tensorNumBytes(dwFilterDesc)); + + Tensor pwFilterTensorCpu; + pwFilterTensorCpu.resize(pwFilterDesc); + pwFilterTensorCpu.alloc(); + memcpy(get_ptr_from_tensor(pwFilterTensorCpu, UT_ARCH), pw_filter_cpu, + tensorNumBytes(pwFilterDesc)); + + Tensor dwBiasTensorCpu; + dwBiasTensorCpu.resize(dwBiasDesc); + dwBiasTensorCpu.alloc(); + memcpy(get_ptr_from_tensor(dwBiasTensorCpu, UT_ARCH), dw_bias_cpu, tensorNumBytes(dwBiasDesc)); + + Tensor pwBiasTensorCpu; + pwBiasTensorCpu.resize(pwBiasDesc); + pwBiasTensorCpu.alloc(); + memcpy(get_ptr_from_tensor(pwBiasTensorCpu, UT_ARCH), pw_bias_cpu, tensorNumBytes(pwBiasDesc)); + + Tensor outputTensorCpu; + outputTensorCpu.resize(outputDesc); + outputTensorCpu.alloc(); + + Tensor tmpTensorCpu; + // setup tmp + CHECK_STATUS( + depthwise_pointwise_convolution_infer_forward_tmp_bytes(inputTensorCpu, dwFilterTensorCpu, + pwFilterTensorCpu, outputTensorCpu, convParamSpec, alg, &tmpBytes, &archInfo)); + tmpTensorCpu.resize(tensor1d(DT_F16, tmpBytes / bytesOf(DT_F16))); + tmpTensorCpu.alloc(); + + CHECK_STATUS(depthwise_pointwise_convolution(inputTensorCpu, dwFilterTensorCpu, + pwFilterTensorCpu, convParamSpec, DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT, dwBiasTensorCpu, + pwBiasTensorCpu, tmpTensorCpu, outputTensorCpu, dwActivationParamSpec, + pwActivationParamSpec, &archInfo_org)); + ut_check_a(output_gpu, get_ptr_from_tensor(outputTensorCpu, UT_ARCH), on * oc * ow * oh, dt); + + CHECK_STATUS(gcl_finish(handle)); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + free(input_cpu); + free(dw_filter_cpu); + free(pw_filter_cpu); + free(dw_bias_cpu); + free(pw_bias_cpu); + return 0; +} +#endif + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + depthwisePointwiseConvolutionTest(argc, argv, DF_NCHW, DT_F16); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_detectionoutput.cpp b/compute/tensor/tests/test_detectionoutput.cpp new file mode 100644 index 00000000..4ad46012 
--- /dev/null
+++ b/compute/tensor/tests/test_detectionoutput.cpp
@@ -0,0 +1,145 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "tensor_computing.h"
+#include "ut_util.h"
+
+int detectionoutputTest(int argc, char **argv, DataType dt)
+{
+    CHECK_REQUIREMENT(argc == 11);
+    // in0 loc
+    U32 ih0 = atoi(argv[1]);
+    U32 iw0 = atoi(argv[2]);
+    // in1 conf
+    U32 ih1 = atoi(argv[3]);
+    U32 iw1 = atoi(argv[4]);
+    // in2 priorbox
+    U32 in2 = atoi(argv[5]);
+    U32 ic2 = atoi(argv[6]);
+    U32 ilens2 = atoi(argv[7]);
+    // output
+    U32 oh = atoi(argv[8]);
+    U32 ow = atoi(argv[9]);
+    U32 num_class = atoi(argv[10]);
+    ArchInfo archInfo;
+    archInfo.arch = UT_ARCH;
+    ArchInfo archInfo_org;
+    archInfo_org.arch = CPU_GENERAL;
+
+    DetectionOutputParamSpec detectionoutput_desc;
+    detectionoutput_desc.num_class = num_class;
+    detectionoutput_desc.nms_top_k = 400;
+    detectionoutput_desc.nms_threshold = 0.449999988079;
+    detectionoutput_desc.keep_top_k = 200;
+    detectionoutput_desc.confidence_threshold = 0.00999999977648;
+
+    std::vector<Tensor> inputTensors(3);
+    std::vector<Tensor *> inputTensorPtrs(3);
+    Tensor inputTensor_loc, inputTensor_conf, inputTensor_priorbox;
+    TensorDesc inputDesc_loc = tensor2d(dt, ih0, iw0);
+    TensorDesc inputDesc_conf = tensor2d(dt, ih1, iw1);
+    TensorDesc inputDesc_priorbox = tensor3d(dt, in2, ic2, ilens2);
+    inputTensor_loc.resize(inputDesc_loc);
+    inputTensor_conf.resize(inputDesc_conf);
+    inputTensor_priorbox.resize(inputDesc_priorbox);
+    inputTensor_loc.alloc();
+    inputTensor_conf.alloc();
+    inputTensor_priorbox.alloc();
+    U32 input_len_loc = tensorNumElements(inputDesc_loc);
+    U32 input_len_conf = tensorNumElements(inputDesc_conf);
+    U32 input_len_priorbox = tensorNumElements(inputDesc_priorbox);
+    U8 *input_loc = ut_input_v(input_len_loc, dt, UT_INIT_RANDOM);
+    U8 *input_conf = ut_input_v(input_len_conf, dt, UT_INIT_RANDOM);
+    U8 *input_priorbox = ut_input_v(input_len_priorbox, dt, UT_INIT_RANDOM);
+    memcpy(get_ptr_from_tensor(inputTensor_loc, UT_ARCH), input_loc, tensorNumBytes(inputDesc_loc));
+    memcpy(
+        get_ptr_from_tensor(inputTensor_conf, UT_ARCH), input_conf, tensorNumBytes(inputDesc_conf));
+    memcpy(get_ptr_from_tensor(inputTensor_priorbox, UT_ARCH), input_priorbox,
+        tensorNumBytes(inputDesc_priorbox));
+    inputTensors[0] = inputTensor_loc;
+    inputTensors[1] = inputTensor_conf;
+    inputTensors[2] = inputTensor_priorbox;
+    inputTensorPtrs[0] = &inputTensors[0];
+    inputTensorPtrs[1] = &inputTensors[1];
inputTensorPtrs[2] = &inputTensors[2]; + // set output + Tensor outputTensor, outputTensorRef; + CHECK_STATUS(detectionoutput_infer_output_size( + inputTensorPtrs, detectionoutput_desc, &outputTensor, &archInfo)); + outputTensor.alloc(); + TensorDesc outputDesc_ref = outputTensor.get_desc(); + outputTensorRef.resize(outputDesc_ref); + outputTensorRef.alloc(); + U32 output_len = outputTensor.length(); + CHECK_REQUIREMENT(input_len_loc == ih0 * iw0 && input_len_conf == ih1 * iw1 && + input_len_priorbox == in2 * ic2 * ilens2 && output_len == oh * ow); + if (UT_CHECK) { + CHECK_STATUS(detectionoutput(inputTensors, detectionoutput_desc, outputTensor, &archInfo)); + CHECK_STATUS( + detectionoutput(inputTensors, detectionoutput_desc, outputTensorRef, &archInfo_org)); + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), output_len, dt, 0.05, __FILE__, __LINE__); + } + U32 num_detected_max = detectionoutput_desc.keep_top_k; +#ifdef _USE_FP16 + if (dt == DT_F16) { + F16 *output_f16 = (F16 *)get_ptr_from_tensor(outputTensor, UT_ARCH); + int idx = 0; + for (U32 i = 0; i < 1 + num_detected_max; i++) { + if (i >= 1 && output_f16[idx] == 0) { + break; + } + std::cout << " 1 : " << output_f16[idx] << " 2 : " << output_f16[idx + 1] + << " 3 : " << output_f16[idx + 2] << " 4 : " << output_f16[idx + 3] + << " 5 : " << output_f16[idx + 4] << " 6 : " << output_f16[idx + 5] + << std::endl; + idx = idx + 6; + } + } +#endif + if (dt == DT_F32) { + F32 *output_f32 = (F32 *)get_ptr_from_tensor(outputTensorRef, UT_ARCH); + int idx = 0; + for (U32 i = 0; i < 1 + num_detected_max; i++) { + if (i >= 1 && output_f32[idx] == 0) { + break; + } + std::cout << " 1 : " << output_f32[idx] << " 2 : " << output_f32[idx + 1] + << " 3 : " << output_f32[idx + 2] << " 4 : " << output_f32[idx + 3] + << " 5 : " << output_f32[idx + 4] << " 6 : " << output_f32[idx + 5] + << std::endl; + idx = idx + 6; + } + } + + free(input_loc); + free(input_conf); + free(input_priorbox); + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + std::cout << "----- Testing FP16 Detectionoutput -----" << std::endl; + detectionoutputTest(argc, argv, DT_F16); + std::cout << "----- Finished FP16 Detectionoutput -----" << std::endl; +#endif +#ifdef _USE_FP32 + std::cout << "----- Testing FP32 Detectionoutput -----" << std::endl; + detectionoutputTest(argc, argv, DT_F32); + std::cout << "----- Finished FP32 Detectionoutput -----" << std::endl; +#endif + return 0; +} diff --git a/compute/tensor/tests/test_dilated_convolution.cpp b/compute/tensor/tests/test_dilated_convolution.cpp new file mode 100644 index 00000000..8aff5695 --- /dev/null +++ b/compute/tensor/tests/test_dilated_convolution.cpp @@ -0,0 +1,169 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <string.h>
+#include "tensor_computing.h"
+#include "ut_util.h"
+
+int dilatedConvolutionTest(int argc, char **argv, DataType dt)
+{
+    CHECK_REQUIREMENT(argc == 17);
+    // in data
+    U32 in = atoi(argv[1]);
+    U32 ic = atoi(argv[2]);
+    U32 ih = atoi(argv[3]);
+    U32 iw = atoi(argv[4]);
+    // weight
+    U32 fn = atoi(argv[5]);
+    U32 fc = atoi(argv[6]);
+    U32 fh = atoi(argv[7]);
+    U32 fw = atoi(argv[8]);
+    U32 group = atoi(argv[9]);
+    // stride & padding
+    U32 stride = atoi(argv[10]);
+    U32 padding = atoi(argv[11]);
+
+    // dilation rate
+    U32 rate = atoi(argv[12]);
+
+    // output
+    U32 on = atoi(argv[13]);
+    U32 oc = atoi(argv[14]);
+    U32 oh = atoi(argv[15]);
+    U32 ow = atoi(argv[16]);
+    CHECK_REQUIREMENT(in == 1 && on == 1);
+
+    ArchInfo archInfo;
+    archInfo.arch = UT_ARCH;
+    ArchInfo archInfo_org;
+    archInfo_org.arch = CPU_GENERAL;
+    ActivationParamSpec activationDesc;
+    activationDesc.mode = ACTIVATION_RELU;
+    activationDesc.value[0] = 0;
+    TensorDesc outputDesc;
+    TensorDesc inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw);
+    TensorDesc filterDesc = tensor4df(dt, DF_NCHW, oc, ic, fh, fw);
+    TensorDesc biasDesc = tensor1d(dt, oc);
+    ConvolutionParamSpec convParamSpec = createConvolutionParamSpec(group, fh, fw, stride, stride,
+        padding, padding, padding, padding, rate, rate, fn, Convolution_Dilation);
+
+    // setup input, filter, bias
+    U8 *input = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM);
+    U8 *filter = ut_input_v(fn * fc * fh * fw, dt, UT_INIT_RANDOM);
+    U8 *bias = ut_input_v(oc, dt, UT_INIT_RANDOM);
+
+    Tensor inputTensor;
+    Tensor inputTensorRef;
+    Tensor filterTensor;
+    Tensor filterTensorRef;
+    Tensor outputTensor;
+    Tensor outputTensorRef;
+    Tensor biasTensor;
+
+    inputTensor.resize(inputDesc);
+    inputTensorRef.resize(inputDesc);
+    filterTensor.resize(filterDesc);
+    filterTensorRef.resize(filterDesc);
+    biasTensor.resize(biasDesc);
+
+    inputTensor.alloc();
+    inputTensorRef.alloc();
+    filterTensor.alloc();
+    filterTensorRef.alloc();
+    biasTensor.alloc();
+    memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, bytesOf(dt) * in * ic * ih * iw);
+    memcpy(get_ptr_from_tensor(inputTensorRef, UT_ARCH), input, bytesOf(dt) * in * ic * ih * iw);
+    memcpy(get_ptr_from_tensor(filterTensor, UT_ARCH), filter, tensorNumBytes(filterDesc));
+    memcpy(get_ptr_from_tensor(filterTensorRef, UT_ARCH), filter, tensorNumBytes(filterDesc));
+    memcpy(get_ptr_from_tensor(biasTensor, UT_ARCH), bias, tensorNumBytes(biasDesc));
+
+    // setup output
+    CHECK_STATUS(convolution_infer_output_size(
+        &inputTensor, filterTensor, convParamSpec, &outputTensor, dt, &archInfo));
+    outputTensor.alloc();
+    outputTensorRef.resize(outputTensor.get_desc());
+    outputTensorRef.alloc();
+
+    // setup alg
+    ConvolutionPolicy policy = CONVOLUTION_FASTEST;
+    ConvolutionForwardAlgorithm alg = CONVOLUTION_ALGORITHM_NULL;
+    CHECK_STATUS(convolution_infer_forward_algorithm(inputTensor, filterTensor, outputTensor,
+        convParamSpec, policy, &alg, dt, activationDesc, &archInfo));
+
+    // setup tmp
+    U32 tmpBytes;
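+    // The workspace size depends on which algorithm convolution_infer_forward_algorithm
+    // selected above, so it can only be queried after alg has been fixed.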
CHECK_STATUS(convolution_infer_forward_tmp_bytes( + inputTensor, filterTensor, outputTensor, convParamSpec, alg, &tmpBytes, &archInfo)); + Tensor tmpTensor; + tmpTensor.resize(tensor1d(DT_U8, tmpBytes)); + tmpTensor.alloc(); + + // setup filter trans + U32 ftmBytes; + CHECK_STATUS( + convolution_transform_filter_bytes(filterTensor, convParamSpec, alg, &ftmBytes, &archInfo)); + Tensor ftmTensor; + ftmTensor.resize(tensor1d(DT_U8, ftmBytes)); + ftmTensor.alloc(); + // trans filter + CHECK_STATUS(convolution_transform_filter( + filterTensor, convParamSpec, alg, tmpTensor, &ftmTensor, &archInfo)); + + if (UT_CHECK) { + CHECK_STATUS(convolution(inputTensor, ftmTensor, convParamSpec, alg, nullptr, biasTensor, + tmpTensor, outputTensor, activationDesc, &archInfo)); + + // naive implement + CHECK_STATUS(convolution(inputTensorRef, filterTensorRef, convParamSpec, alg, nullptr, + biasTensor, tmpTensor, outputTensorRef, activationDesc, &archInfo_org)); + + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), outputTensor.length(), dt, 1, __FILE__, + __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(convolution(inputTensor, ftmTensor, convParamSpec, alg, nullptr, biasTensor, + tmpTensor, outputTensor, activationDesc, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, fc, + fh, fw, stride, padding, rate, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "DilatedConvolution", params); + double ops = (1.0 * on * oc * oh * ow) * (2.0 * ic * fh * fw + 1); + ut_log(dt, buffer, ops, time); + + free(input); + free(filter); + free(bias); + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + dilatedConvolutionTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + dilatedConvolutionTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/tests/test_eltwise.cpp b/compute/tensor/tests/test_eltwise.cpp similarity index 50% rename from tests/test_eltwise.cpp rename to compute/tensor/tests/test_eltwise.cpp index 585b631e..7452c24a 100644 --- a/tests/test_eltwise.cpp +++ b/compute/tensor/tests/test_eltwise.cpp @@ -1,24 +1,24 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include #include #include "tensor_computing.h" #include "ut_util.h" -int eltwiseTest(int argc, char** argv, DataType dt) { +int eltwiseTest(int argc, char **argv, DataType dt) +{ CHECK_REQUIREMENT(argc == 6); U32 num = atoi(argv[1]); U32 in = atoi(argv[2]); @@ -28,32 +28,54 @@ int eltwiseTest(int argc, char** argv, DataType dt) { U32 len = in * ic * ih * iw; EltwiseMode eltwiseMode = ELTWISE_MAX; - - std::vector inputDesc(num); - std::vector input(num); + EltwiseParamSpec eltwiseDesc; + eltwiseDesc.elt_mode = eltwiseMode; + eltwiseDesc.activation_type = ACTIVATION_NULL; + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + std::vector input(num); + std::vector inTensors(num); + std::vector inTensorPtr(num); + TensorDesc inDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); + Tensor outTensor; for (U32 i = 0; i < num; i++) { - inputDesc[i] = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); - input[i] = (void*)ut_input_v(len, dt, UT_INIT_RANDOM); + input[i] = (void *)ut_input_v(len, dt, UT_INIT_RANDOM); + inTensors[i].resize(inDesc); + inTensors[i].alloc(); + memcpy(get_ptr_from_tensor(inTensors[i], UT_ARCH), input[i], tensorNumBytes(inDesc)); + inTensorPtr[i] = &inTensors[i]; } - TensorDesc outputDesc; - CHECK_STATUS(eltwise_infer_output_size(inputDesc, &outputDesc, UT_ARCH)); - CHECK_REQUIREMENT(len == tensorNumElements(outputDesc)); - U8 *output = ut_input_v(len, dt, UT_INIT_ZERO); - U8 *output_ref = ut_input_v(len, dt, UT_INIT_ZERO); + + CHECK_STATUS(eltwise_infer_output_size(inTensorPtr, &outTensor, &archInfo)); + CHECK_REQUIREMENT(len == outTensor.length()); + outTensor.alloc(); + Tensor outTensorRef; + outTensorRef.resize(outTensor.get_desc()); + outTensorRef.alloc(); + + U32 tmpBytes; + CHECK_STATUS(eltwise_infer_forward_tmp_bytes(inTensors, outTensor, &tmpBytes, &archInfo)); + Tensor tmpTensor; + tmpTensor.resize(tensor1d(DT_U8, tmpBytes)); + tmpTensor.alloc(); if (UT_CHECK) { - CHECK_STATUS(eltwise(inputDesc, input, outputDesc, output, eltwiseMode, UT_ARCH)); + CHECK_STATUS(eltwise(inTensors, eltwiseDesc, tmpTensor, outTensor, &archInfo)); - CHECK_STATUS(eltwise(inputDesc, input, outputDesc, output_ref, eltwiseMode, CPU_GENERAL)); + CHECK_STATUS(eltwise(inTensors, eltwiseDesc, tmpTensor, outTensorRef, &archInfo_org)); // check - ut_check_v(output, output_ref, len, dt, 1, __FILE__, __LINE__); + ut_check_v(get_ptr_from_tensor(outTensor, UT_ARCH), + get_ptr_from_tensor(outTensorRef, UT_ARCH), len, dt, 1, __FILE__, __LINE__); } // benchmark double time_start = ut_time_ms(); for (int iter = 0; iter < UT_LOOPS; iter++) { - CHECK_STATUS(eltwise(inputDesc, input, outputDesc, output, eltwiseMode, 
UT_ARCH)); + CHECK_STATUS(eltwise(inTensors, eltwiseDesc, tmpTensor, outTensor, &archInfo)); } double time_end = ut_time_ms(); double time = (time_end - time_start) / UT_LOOPS; @@ -61,24 +83,20 @@ int eltwiseTest(int argc, char** argv, DataType dt) { // log performance data char buffer[150]; char params[120]; - sprintf(params, "%u (%u %u %u %u)=(%u %u %u %u)", - num, in, ic, ih, iw, - in, ic, ih, iw); + sprintf(params, "%u (%u %u %u %u)=(%u %u %u %u)", num, in, ic, ih, iw, in, ic, ih, iw); sprintf(buffer, "%20s, %80s", "Eltwise", params); double ops = 1.0 * num * in * ic * ih * iw; ut_log(dt, buffer, ops, time); - for(U32 i=0; i +#include "tensor_computing.h" +#include "ut_util.h" +#include "gcl.h" +#include "libkernelsource.h" + +#ifdef _USE_FP16 +inline GCLMem_t alloc(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_map(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->mapped_alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_bytes(Tensor tensor, U32 size) +{ + auto mem = (OclMemory *)tensor.get_memory(); + GCLMem_t ptr = NULL; + if (size > 0) { + mem->resize(tensor1d(DT_U8, size)); + mem->alloc(); + ptr = (GCLMem_t)mem->get_ptr(); + } + return ptr; +} + +inline GCLMem_t alloc_desc(Tensor tensor, GCLMemDesc desc) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->padding(desc); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} +int fullyConnectedTest(int argc, char *argv[], DataType dt) +{ + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + U32 biasNum; + ArchInfo archInfo; + archInfo.arch = MALI; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + in = 1; + ic = 4; + ih = 4; + iw = 4; + fn = 4; + + if (argc == 5) { + ic = atoi(argv[1]); + ih = atoi(argv[2]); + iw = atoi(argv[3]); + fn = atoi(argv[4]); + } + fc = ic; + fh = ih; + fw = iw; + + on = 1; + oc = fn; + oh = 1; + ow = 1; + + TensorDesc inputDesc, filterDesc, outputDesc, biasDesc; + TensorDesc filterDesc_cpu, outputDesc_cpu; + + inputDesc = tensor4df(dt, DF_NCHW, in, ic, ih, iw); + filterDesc = tensor4df(dt, DF_NCHW, fn, fc, fh, fw); + filterDesc_cpu = tensor2df(dt, DF_NORMAL, fn, fc * fh * fw); + outputDesc_cpu = tensor2df(dt, DF_NORMAL, 1, fn); + biasDesc = tensor1d(dt, oc); + + U8 *input_cpu = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM); + U8 *filter_cpu = ut_input_v(fn * fc * fh * fw, dt, UT_INIT_RANDOM); + U8 *bias_cpu = ut_input_v(oc, dt, UT_INIT_RANDOM); + U8 *output_gpu = NULL; + + std::shared_ptr handleSharedPtr = OCLContext::getInstance().handle; + ; + GCLHandle_t handle = handleSharedPtr.get(); + std::vector kernelVec; + handle->kernelVec = &kernelVec; + Tensor inputTensor = Tensor(OCLMem); + Tensor outputTensor = Tensor(OCLMem); + Tensor filterTensorOrg = Tensor(OCLMem); + Tensor filterTensor = Tensor(OCLMem); + Tensor biasTensor = Tensor(OCLMem); + Tensor tmpTensor = Tensor(OCLMem); + inputTensor.resize(inputDesc); + filterTensor.resize(filterDesc); + filterTensorOrg.resize(filterDesc); + biasTensor.resize(biasDesc); + + MaliPara maliPara; + ForwardRunInfoMali runInfo; + runInfo.algorithm = (I32)(CONVOLUTION_ALGORITHM_NULL); + runInfo.best_w[0] = 1; + runInfo.best_c[0] = 1; + runInfo.best_k[0] = 1; + maliPara.handle = handle; + maliPara.gclmemInputDesc = NULL; + maliPara.gclmemOutputDesc = NULL; + maliPara.gclmemFilterDesc = NULL; + maliPara.forwardRunInfo = &runInfo; + archInfo.archPara = &maliPara; + + CHECK_STATUS( + 
fully_connected_infer_output_size(&inputTensor, filterTensor, &outputTensor, &archInfo)); + CHECK_STATUS( + fully_connected_infer_forward_algorithm(inputTensor, filterTensor, outputTensor, &archInfo)); + + U32 maxBytes = 0; + U32 tmpBytes; + CHECK_STATUS( + fully_connected_infer_forward_tmp_bytes(inputTensor, filterTensor, &tmpBytes, &archInfo)); + maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes; + + U32 ftmBytes; + U32 str[3] = {0, 0, 0}; + U32 off[3] = {0, 0, 0}; + GCLMemDesc filterMemDesc = gcl_mem_desc(str, off, DT_U8, DF_NCWHC4); + maliPara.gclmemFilterDesc = &filterMemDesc; + CHECK_STATUS(fully_connected_transform_filter_bytes(filterTensor, &ftmBytes, &archInfo)); + + GCLMem_t output = alloc_map(outputTensor); + GCLMem_t input = alloc(inputTensor); + CHECK_STATUS(gcl_fill_memory_zero(handle, input)); + + GCLMemDesc desc = gclmem_build_desc(); + biasNum = oc; + desc.memType = GCL_MEM_BUF; + desc.byteSize = biasNum * bytesOf(dt); + desc.stride[0] = biasNum; + desc.stride[1] = 1; + desc.stride[2] = 1; + desc.offset[0] = 0; + desc.offset[1] = 0; + desc.offset[2] = 0; + desc.num = biasNum; + desc.memFormat = DF_NHWC; + desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + desc.host_ptr = bias_cpu; + GCLMem_t bias = alloc_desc(biasTensor, desc); + + desc = filterMemDesc; + GCLMem_t filter = alloc_desc(filterTensor, desc); + + desc.stride[0] = fw * fh * fc; + desc.stride[1] = fn; + desc.stride[2] = 1; + desc.offset[0] = 0; + desc.offset[1] = 0; + desc.offset[2] = 0; + desc.byteSize = fw * fh * fc * fn * bytesOf(dt); + desc.num = fw * fh * fc * fn; + desc.memType = GCL_MEM_BUF; + desc.memFormat = DF_NCHW; + desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + desc.host_ptr = filter_cpu; + alloc_desc(filterTensorOrg, desc); + + tmpBytes = tensorNumBytes(inputDesc); + maxBytes = (tmpBytes > maxBytes) ? 
tmpBytes : maxBytes; + GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes); + TensorDesc filterDescTran; + std::vector filterArray; + std::vector outputArray; + std::vector biasArray; + filterArray.push_back(filter); + outputArray.push_back(output); + biasArray.push_back(bias); + + CHECK_STATUS( + fully_connected_transform_filter(inputTensor, filterTensorOrg, &filterTensor, &archInfo)); + + CHECK_STATUS(ocl_set_input(handle, input, inputDesc, input_cpu, tmpbuf, true)); + + CHECK_STATUS( + fully_connected(inputTensor, filterTensor, biasTensor, tmpTensor, outputTensor, &archInfo)); + /*warp up*/ + UNI_INFO_LOG("Warp up gpu:\n") + for (U32 i = 0; i < 2; i++) { + CHECK_STATUS(gcl_run_kernelVec(handle)); + } + +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size())); + double time = handle->t_execute * 0.001; +#else + CHECK_STATUS(gcl_run_kernelVec(handle)); +#endif + outputDesc = outputTensor.get_desc(); + CHECK_STATUS(ocl_get_output(handle, output, outputDesc, true)); + output_gpu = output->mapPtrArray.back(); + + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)+(%u %u %u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, fc, fh, fw, on, + oc, oh, ow); + sprintf(buffer, "%20s, %80s", "InnerProdect", params); +#ifdef _DEBUG + double ops = 2.0 * fn * fc * fh * fw + 1.0 * fn; + ut_log(dt, buffer, ops, time); +#endif + Tensor inputTensorCpu; + inputTensorCpu.resize(inputDesc); + inputTensorCpu.alloc(); + memcpy(get_ptr_from_tensor(inputTensorCpu, UT_ARCH), input_cpu, tensorNumBytes(inputDesc)); + + Tensor filterTensorCpu; + filterTensorCpu.resize(filterDesc_cpu); + filterTensorCpu.alloc(); + memcpy( + get_ptr_from_tensor(filterTensorCpu, UT_ARCH), filter_cpu, tensorNumBytes(filterDesc_cpu)); + + Tensor biasTensorCpu; + biasTensorCpu.resize(biasDesc); + biasTensorCpu.alloc(); + memcpy(get_ptr_from_tensor(biasTensorCpu, UT_ARCH), bias_cpu, tensorNumBytes(biasDesc)); + + Tensor outputTensorCpu; + outputTensorCpu.resize(outputDesc_cpu); + outputTensorCpu.alloc(); + + Tensor tmpTensorCpu; + CHECK_STATUS(fully_connected_infer_forward_tmp_bytes( + inputTensorCpu, filterTensorCpu, &tmpBytes, &archInfo_org)); + tmpTensorCpu.resize(tensor1d(DT_F16, tmpBytes / bytesOf(DT_F16))); + tmpTensorCpu.alloc(); + + CHECK_STATUS(fully_connected(inputTensorCpu, filterTensorCpu, biasTensorCpu, tmpTensorCpu, + outputTensorCpu, &archInfo_org)); + ut_check_a(output_gpu, get_ptr_from_tensor(outputTensorCpu, UT_ARCH), on * oc * ow * oh, dt); + + free(input_cpu); + free(filter_cpu); + free(bias_cpu); + return 0; +} +#endif + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + fullyConnectedTest(argc, argv, DT_F16); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_l2normalization.cpp b/compute/tensor/tests/test_l2normalization.cpp new file mode 100644 index 00000000..1a987616 --- /dev/null +++ b/compute/tensor/tests/test_l2normalization.cpp @@ -0,0 +1,101 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#include "ut_util.h" + +int l2normalizationTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 4); + U32 ic = atoi(argv[1]); + U32 ih = atoi(argv[2]); + U32 iw = atoi(argv[3]); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + DataFormat df = DF_MTK; + TensorDesc inputDesc = tensor3df(dt, df, ic, ih, iw); + U32 input_len = tensorNumElements(inputDesc); + U8 *input = ut_input_v(input_len, dt, UT_INIT_RANDOM); + + Tensor inputTensor; + inputTensor.resize(inputDesc); + inputTensor.alloc(); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inputDesc)); + + // set output + Tensor outputTensor, outputTensorRef; + CHECK_STATUS(l2normalization_infer_output_size(&inputTensor, &outputTensor, &archInfo)); + outputTensor.alloc(); + TensorDesc outputDesc_ref = outputTensor.get_desc(); + outputTensorRef.resize(outputDesc_ref); + outputTensorRef.alloc(); + + U32 output_len = outputTensor.length(); + CHECK_REQUIREMENT(input_len == ic * ih * iw && output_len == ic * ih * iw); + + if (UT_CHECK) { + CHECK_STATUS(l2normalization(inputTensor, outputTensor, &archInfo)); + + // naive implement + CHECK_STATUS(l2normalization(inputTensor, outputTensorRef, &archInfo_org)); + + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), output_len, dt, 0.05, __FILE__, __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(l2normalization(inputTensor, outputTensor, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + //general benchmark + time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(l2normalization(inputTensor, outputTensorRef, &archInfo_org)); + } + time_end = ut_time_ms(); + double general_implement_time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char general_buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u)=(%u %u %u)", ic, ih, iw, ic, ih, iw); + sprintf(buffer, "%20s, %80s", "L2Normalization", params); + sprintf(general_buffer, "%20s, %80s", "General L2Normalization", params); + double ops = input_len; + ut_log(dt, buffer, ops, time); + ut_log(dt, general_buffer, ops, general_implement_time); + + free(input); + + return 0; +} + +int main(int argc, char 
**argv) +{ +#ifdef _USE_FP16 + l2normalizationTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + l2normalizationTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_multihead_attention_ocl.cpp b/compute/tensor/tests/test_multihead_attention_ocl.cpp new file mode 100644 index 00000000..eac6128e --- /dev/null +++ b/compute/tensor/tests/test_multihead_attention_ocl.cpp @@ -0,0 +1,393 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include "tensor_computing.h" +#include "ut_util.h" +#include "gcl.h" +#include "libkernelsource.h" + +#ifdef _USE_FP16 +inline GCLMem_t alloc(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_map(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->mapped_alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_bytes(Tensor tensor, U32 size) +{ + auto mem = (OclMemory *)tensor.get_memory(); + GCLMem_t ptr = NULL; + if (size > 0) { + mem->resize(tensor1d(DT_U8, size)); + mem->alloc(); + ptr = (GCLMem_t)mem->get_ptr(); + } + return ptr; +} + +inline GCLMem_t alloc_desc(Tensor tensor, GCLMemDesc desc) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->padding(desc); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} +int multiheadAttentionTest(int argc, char *argv[], DataType dt) +{ + U32 in, ic, ih, iw; + U32 fn[4]; + U32 fc[4]; + U32 on, oc, oh, ow; + U32 firstFCSliceNum[3]; + U32 matmulSliceLen; + float multiplyAlpha; + float multiplyBeta; + std::vector eltwiseWithLayerNormIn; + ArchInfo archInfo; + archInfo.arch = MALI; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + in = 1; + ic = 312; + ih = 9; + iw = 1; + + fn[0] = 936; + fc[0] = 312; + + fn[1] = 312; + fc[1] = 312; + + fn[2] = 1200; + fc[2] = 312; + + fn[3] = 312; + fc[3] = 1200; + + firstFCSliceNum[0] = 312; + firstFCSliceNum[1] = 312; + firstFCSliceNum[2] = 312; + + matmulSliceLen = 26; + multiplyAlpha = 0.196116134524; + multiplyBeta = 0; + U32 filterNum = 4; + U32 lnNum = 2; + for (U32 i = 0; i < lnNum; ++i) { + eltwiseWithLayerNormIn.push_back(false); + } + + if (argc == 20) { + in = atoi(argv[1]); + ic = atoi(argv[2]); + ih = atoi(argv[3]); + iw = atoi(argv[4]); + fn[0] = atoi(argv[5]); + fc[0] = atoi(argv[6]); + fn[1] = atoi(argv[7]); + fc[1] = atoi(argv[8]); + fn[2] = atoi(argv[9]); + fc[2] = atoi(argv[10]); + fn[3] = atoi(argv[11]); 
+ fc[3] = atoi(argv[12]); + firstFCSliceNum[0] = atoi(argv[13]); + firstFCSliceNum[1] = atoi(argv[14]); + firstFCSliceNum[2] = atoi(argv[15]); + matmulSliceLen = atoi(argv[16]); + multiplyAlpha = atof(argv[17]); + multiplyBeta = atof(argv[18]); + eltwiseWithLayerNormIn[0] = atoi(argv[19]); + eltwiseWithLayerNormIn[1] = atoi(argv[19]); + } + on = 1; + oc = fn[3]; + oh = ih; + ow = 1; + + TensorDesc inputDesc, outputDesc; + std::vector filterDesc; + std::vector biasDesc; + std::vector lnAlphaDesc; + std::vector lnBetaDesc; + + inputDesc = tensor3df(dt, DF_MKT, in, ic, ih); + for (U32 i = 0; i < filterNum; ++i) { + TensorDesc tmpFilterDesc = tensor4df(dt, DF_NCHW, fn[i], fc[i], 1, 1); + TensorDesc tmpBiasDesc = tensor1d(dt, fn[i] + 8); + filterDesc.push_back(tmpFilterDesc); + biasDesc.push_back(tmpBiasDesc); + } + + for (U32 i = 0; i < lnNum; ++i) { + TensorDesc tmpDesc = tensor1d(dt, (ic + 3) / 4 * 4); + if (i == 1) { + tmpDesc = tensor1d(dt, (fn[1] + 3) / 4 * 4); + } + lnAlphaDesc.push_back(tmpDesc); + lnBetaDesc.push_back(tmpDesc); + } + + std::vector filter_cpu; + std::vector bias_cpu; + std::vector lnAlpha_cpu; + std::vector lnBeta_cpu; + + U8 *input_cpu = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM); + + for (U32 i = 0; i < filterNum; i++) { + U8 *fltval = ut_input_v(tensorNumElements(filterDesc[i]), dt, UT_INIT_RANDOM); + U8 *biasval = ut_input_v(tensorNumElements(biasDesc[i]), dt, UT_INIT_RANDOM); + filter_cpu.push_back(fltval); + bias_cpu.push_back(biasval); + } + + for (U32 i = 0; i < lnNum; i++) { + U8 *alphaVal = ut_input_v(tensorNumElements(lnAlphaDesc[i]), dt, UT_INIT_RANDOM); + U8 *betaVal = ut_input_v(tensorNumElements(lnBetaDesc[i]), dt, UT_INIT_RANDOM); + lnAlpha_cpu.push_back(alphaVal); + lnBeta_cpu.push_back(betaVal); + } + + U8 *output_gpu = NULL; + + std::shared_ptr handleSharedPtr = OCLContext::getInstance().handle; + ; + GCLHandle_t handle = handleSharedPtr.get(); + std::vector kernelVec; + handle->kernelVec = &kernelVec; + Tensor inputTensor = Tensor(OCLMem); + inputTensor.resize(inputDesc); + + std::vector filterTensorOrg; + std::vector filterTensor; + std::vector biasTensor; + for (U32 i = 0; i < filterNum; i++) { + Tensor tensor = Tensor(OCLMem); + tensor.resize(filterDesc[i]); + filterTensor.push_back(tensor); + filterTensorOrg.push_back(tensor); + tensor.resize(biasDesc[i]); + biasTensor.push_back(tensor); + } + + std::vector lnAlphaTensor; + std::vector lnBetaTensor; + for (U32 i = 0; i < lnNum; i++) { + Tensor tensor = Tensor(OCLMem); + tensor.resize(lnAlphaDesc[i]); + lnAlphaTensor.push_back(tensor); + tensor.resize(lnBetaDesc[i]); + lnBetaTensor.push_back(tensor); + } + Tensor tmpTensor = Tensor(OCLMem); + Tensor outputTensor = Tensor(OCLMem); + + MaliPara maliPara; + ForwardRunInfoMali runInfo; + runInfo.algorithm = (I32)(CONVOLUTION_ALGORITHM_NULL); + for (U32 i = 0; i < 6; ++i) { + runInfo.best_w[i] = 1; + runInfo.best_c[i] = 1; + runInfo.best_k[i] = 1; + } + maliPara.handle = handle; + maliPara.forwardRunInfo = &runInfo; + archInfo.archPara = &maliPara; + ActivationMode activation = ACTIVATION_GELU; + CHECK_STATUS(multihead_attention_infer_output_size( + &inputTensor, filterTensor, &outputTensor, firstFCSliceNum, &archInfo)); + + CHECK_STATUS(multihead_attention_infer_forward_algorithm(inputTensor, filterTensor, + &multiplyAlpha, &multiplyBeta, firstFCSliceNum, matmulSliceLen, eltwiseWithLayerNormIn, + activation, outputTensor, &archInfo)); + U32 maxBytes = 0; + U32 tmpBytes; + CHECK_STATUS(multihead_attention_infer_forward_tmp_bytes(inputTensor, 
filterTensor, + eltwiseWithLayerNormIn, firstFCSliceNum, matmulSliceLen, &tmpBytes, &archInfo)); + maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes; + + U32 ftmBytes = 0; + GCLMemDesc filterMemDesc[4]; + U32 stride[3] = {0, 0, 0}; + U32 offset[3] = {0, 0, 0}; + for (U32 i = 0; i < filterNum; i++) { + filterMemDesc[i] = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + } + maliPara.gclmemFilterDesc = filterMemDesc; + CHECK_STATUS(multihead_attention_transform_filter_bytes(filterTensor, &ftmBytes, &archInfo)); + + GCLMem_t output = alloc_map(outputTensor); + + for (U32 i = 0; i < 2; ++i) { + U32 biasNum = fn[i] + 8; + GCLMemDesc tmpDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + tmpDesc.stride[0] = biasNum; + tmpDesc.stride[1] = 1; + tmpDesc.stride[2] = 1; + tmpDesc.offset[0] = 0; + tmpDesc.offset[1] = 0; + tmpDesc.offset[2] = 0; + tmpDesc.num = biasNum; + tmpDesc.byteSize = biasNum * bytesOf(dt); + tmpDesc.memFormat = DF_NHWC; + tmpDesc.memType = GCL_MEM_BUF; + tmpDesc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + tmpDesc.host_ptr = bias_cpu[i]; + alloc_desc(biasTensor[i], tmpDesc); + } + for (U32 i = 2; i < filterNum; ++i) { + U32 biasNum = (fn[i] + 3) / 4; + GCLMemDesc tmpDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + tmpDesc.stride[0] = biasNum; + tmpDesc.stride[1] = 1; + tmpDesc.stride[2] = 1; + tmpDesc.offset[0] = 0; + tmpDesc.offset[1] = 0; + tmpDesc.offset[2] = 0; + tmpDesc.num = biasNum; + tmpDesc.byteSize = biasNum * bytesOf(dt) * 4; + tmpDesc.memFormat = DF_NHWC; + tmpDesc.memType = GCL_MEM_IMG_1D; + tmpDesc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + tmpDesc.host_ptr = bias_cpu[i]; + alloc_desc(biasTensor[i], tmpDesc); + } + + for (U32 i = 0; i < lnNum; ++i) { + U32 layerNormNum = (ic + 3) / 4 * 4; + if (i == 1) { + layerNormNum = (fn[1] + 3) / 4 * 4; + } + GCLMemDesc tmpDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + tmpDesc.stride[0] = layerNormNum; + tmpDesc.stride[1] = 1; + tmpDesc.stride[2] = 1; + tmpDesc.offset[0] = 0; + tmpDesc.offset[1] = 0; + tmpDesc.offset[2] = 0; + tmpDesc.num = layerNormNum; + tmpDesc.byteSize = layerNormNum * bytesOf(dt); + tmpDesc.memFormat = DF_NHWC; + tmpDesc.memType = GCL_MEM_BUF; + tmpDesc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + tmpDesc.host_ptr = lnAlpha_cpu[i]; + alloc_desc(lnAlphaTensor[i], tmpDesc); + + tmpDesc.stride[0] = layerNormNum; + tmpDesc.stride[1] = 1; + tmpDesc.stride[2] = 1; + tmpDesc.offset[0] = 0; + tmpDesc.offset[1] = 0; + tmpDesc.offset[2] = 0; + tmpDesc.num = layerNormNum; + tmpDesc.byteSize = layerNormNum * bytesOf(dt); + tmpDesc.memFormat = DF_NHWC; + tmpDesc.memType = GCL_MEM_BUF; + tmpDesc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + tmpDesc.host_ptr = lnBeta_cpu[i]; + alloc_desc(lnBetaTensor[i], tmpDesc); + } + for (U32 i = 0; i < filterNum; ++i) { + GCLMemDesc desc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + desc.stride[0] = fc[i]; + desc.stride[1] = fn[i]; + desc.stride[2] = 1; + desc.offset[0] = 0; + desc.offset[1] = 0; + desc.offset[2] = 0; + desc.byteSize = fc[i] * fn[i] * bytesOf(dt); + desc.num = fc[i] * fn[i]; + desc.memType = GCL_MEM_BUF; + desc.memFormat = DF_NCHW; + desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + desc.host_ptr = filter_cpu[i]; + alloc_desc(filterTensorOrg[i], desc); + } + + for (U32 i = 0; i < filterNum; ++i) { + GCLMemDesc desc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + desc = filterMemDesc[i]; + alloc_desc(filterTensor[i], desc); + } + + auto inputMem = (OclMemory *)inputTensor.get_memory(); + 
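+    // Editor's note: every block above follows one upload pattern -- fill a
+    // GCLMemDesc by hand, point host_ptr at the CPU buffer, and let alloc_desc()
+    // create the device buffer with CL_MEM_COPY_HOST_PTR so the copy happens at
+    // allocation time. A minimal sketch of the pattern (illustrative only; n and
+    // host_buf are hypothetical names, not part of this patch):
+    //     GCLMemDesc d = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+    //     d.stride[0] = n;
+    //     d.stride[1] = 1;
+    //     d.stride[2] = 1;
+    //     d.num = n;
+    //     d.byteSize = n * bytesOf(dt);
+    //     d.memType = GCL_MEM_BUF;
+    //     d.memFormat = DF_NHWC;
+    //     d.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
+    //     d.host_ptr = host_buf;      // host data uploaded at alloc time
+    //     alloc_desc(tensor, d);      // device buffer now holds host_buf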
+    GCLMemDesc inputGclDesc = inputMem->get_desc();
+    inputGclDesc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
+    inputGclDesc.host_ptr = input_cpu;
+    alloc_desc(inputTensor, inputGclDesc);
+
+    tmpBytes = tensorNumBytes(inputDesc);
+    maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes;
+    alloc_bytes(tmpTensor, maxBytes);
+
+    std::vector<Tensor *> filterTensorPtr;
+    for (U32 i = 0; i < filterNum; i++) {
+        filterTensorPtr.push_back(&filterTensor[i]);
+    }
+    CHECK_STATUS(multihead_attention_transform_filter(filterTensorOrg, filterTensorPtr, &archInfo));
+    CHECK_STATUS(multihead_attention(inputTensor, filterTensor, biasTensor, lnAlphaTensor,
+        lnBetaTensor, &multiplyAlpha, &multiplyBeta, firstFCSliceNum, matmulSliceLen,
+        eltwiseWithLayerNormIn, activation, tmpTensor, outputTensor, &archInfo));
+    /* warm up */
+    UNI_INFO_LOG("Warm up gpu:\n")
+    for (U32 i = 0; i < 2; i++) {
+        CHECK_STATUS(gcl_run_kernelVec(handle));
+    }
+    UNI_INFO_LOG("Run:\n")
+#ifdef _DEBUG
+    CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size()));
+// double time = handle->t_execute * 0.001;
+#else
+    CHECK_STATUS(gcl_run_kernelVec(handle));
+#endif
+    outputDesc = outputTensor.get_desc();
+    CHECK_STATUS(ocl_get_output(handle, output, outputDesc, true));
+    // the GPU output is fetched here but not compared against a CPU reference in this test
+    output_gpu = output->mapPtrArray.back();
+
+    CHECK_STATUS(gcl_finish(handle));
+    CHECK_STATUS(gcl_clean_kernelVec(handle));
+    free(input_cpu);
+    for (auto p : filter_cpu) {
+        free((U8 *)p);
+    }
+    for (auto p : bias_cpu) {
+        free((U8 *)p);
+    }
+    for (auto p : lnAlpha_cpu) {
+        free((U8 *)p);
+    }
+    for (auto p : lnBeta_cpu) {
+        free((U8 *)p);
+    }
+    return 0;
+}
+#endif
+
+int main(int argc, char **argv)
+{
+#ifdef _USE_FP16
+    multiheadAttentionTest(argc, argv, DT_F16);
+#endif
+    return 0;
+}
diff --git a/compute/tensor/tests/test_non_max_suppression.cpp b/compute/tensor/tests/test_non_max_suppression.cpp
new file mode 100644
index 00000000..b31e66d3
--- /dev/null
+++ b/compute/tensor/tests/test_non_max_suppression.cpp
@@ -0,0 +1,139 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
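+// Editor's note: this test follows the directory's common harness -- run the
+// operator once on the tested arch (UT_ARCH) and once on the naive CPU_GENERAL
+// reference, then compare the two buffers. A minimal sketch of that flow
+// (simplified names; not part of the original patch):
+//     Tensor out, ref;
+//     CHECK_STATUS(non_max_suppression(ins, param, out, &archInfo));      // tested arch
+//     CHECK_STATUS(non_max_suppression(ins, param, ref, &archInfo_org));  // reference
+//     ut_check_v(get_ptr_from_tensor(out, UT_ARCH),
+//         get_ptr_from_tensor(ref, UT_ARCH), len, dt, 0.05, __FILE__, __LINE__);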
+
+#include "tensor_computing.h"
+#include "ut_util.h"
+
+int nonmaxsuppressionTest(int argc, char **argv, DataType dt)
+{
+    CHECK_REQUIREMENT(argc == 12);
+    // in0 boxes
+    U32 in0 = atoi(argv[1]);
+    U32 ic0 = atoi(argv[2]);
+    U32 ilens0 = atoi(argv[3]);
+    // in1 scores
+    U32 in1 = atoi(argv[4]);
+    U32 ic1 = atoi(argv[5]);
+    U32 ilens1 = atoi(argv[6]);
+    // output
+    U32 oh = atoi(argv[7]);
+    U32 ow = atoi(argv[8]);
+    // nonMaxSuppressionParamSpec
+    U32 max_output_boxes_per_class = atoi(argv[9]);
+    F32 iou_threshold = (F32)atof(argv[10]);
+    F32 score_threshold = (F32)atof(argv[11]);
+    ArchInfo archInfo;
+    archInfo.arch = UT_ARCH;
+    ArchInfo archInfo_org;
+    archInfo_org.arch = CPU_GENERAL;
+
+    NonMaxSuppressionParamSpec nonMaxSuppressionParamSpec;
+    nonMaxSuppressionParamSpec.max_output_boxes_per_class = max_output_boxes_per_class;
+    nonMaxSuppressionParamSpec.iou_threshold = iou_threshold;
+    nonMaxSuppressionParamSpec.score_threshold = score_threshold;
+
+    std::vector<Tensor> inputTensors(2);
+    TensorDesc input_desc_boxes = tensor3d(dt, in0, ic0, ilens0);
+    TensorDesc input_desc_scores = tensor3d(dt, in1, ic1, ilens1);
+    inputTensors[0] = Tensor::alloc_sized(input_desc_boxes);
+    inputTensors[1] = Tensor::alloc_sized(input_desc_scores);
+    U32 input_len_boxes = tensorNumElements(input_desc_boxes);
+    U8 *input_boxes = ut_input_v(input_len_boxes, dt, UT_INIT_RANDOM);
+    memcpy(get_ptr_from_tensor(inputTensors[0], UT_ARCH), input_boxes,
+        tensorNumBytes(input_desc_boxes));
+    U32 input_len_scores = tensorNumElements(input_desc_scores);
+    U8 *input_scores = ut_input_v(input_len_scores, dt, UT_INIT_RANDOM);
+    memcpy(get_ptr_from_tensor(inputTensors[1], UT_ARCH), input_scores,
+        tensorNumBytes(input_desc_scores));
+    std::vector<Tensor *> inputTensorsPtr(2);
+    inputTensorsPtr[0] = &inputTensors[0];
+    inputTensorsPtr[1] = &inputTensors[1];
+    // set output
+    Tensor outputTensor;
+    CHECK_STATUS(non_max_suppression_infer_output_size(
+        inputTensorsPtr, nonMaxSuppressionParamSpec, &outputTensor, &archInfo));
+    outputTensor.alloc();
+    Tensor outputTensorRef = Tensor::alloc_sized(outputTensor.get_desc());
+    U32 output_len = outputTensor.length();
+    CHECK_REQUIREMENT(input_len_boxes == in0 * ic0 * ilens0 &&
+        input_len_scores == in1 * ic1 * ilens1 && output_len == oh * ow);
+    /*
+    You can also modify the code to use the data in the following example.
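+    (Editor's reading, not stated in the original: each expected row below is a
+    (batch_index, class_index, box_index) triple, and the first row stores the
+    number of kept boxes -- 6 for this command line.)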
+ Command: ./test_non_max_suppression 1 6 4 1 2 6 7 3 3 0.5 0 + example: + input_box[24] = { 0.0, 0.0, 1.0, 1.0, + 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, + 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, + 0.0, 100.0, 1.0, 101.0 }; + input_score[12] = { 0.75, 0.9, 0.6, 0.95, 0.5, 0.3, 0.75, 0.9, 0.6, 0.95, 0.5, 0.3 }; + output_expect: + { 6, 0, 0, + 0, 0, 3, + 0, 0, 1, + 0, 0, 5, + 0, 1, 3, + 0, 1, 1, + 0, 1, 5 }; + */ + if (UT_CHECK) { + CHECK_STATUS( + non_max_suppression(inputTensors, nonMaxSuppressionParamSpec, outputTensor, &archInfo)); + CHECK_STATUS(non_max_suppression( + inputTensors, nonMaxSuppressionParamSpec, outputTensorRef, &archInfo_org)); + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), output_len, dt, 0.05, __FILE__, __LINE__); + } + + U32 num_detected_max = max_output_boxes_per_class * ic1; + if (dt == DT_F32) { + F32 *output_f32 = (F32 *)get_ptr_from_tensor(outputTensor, UT_ARCH); + int idx = 0; + for (U32 i = 0; i < 1 + num_detected_max; i++) { + std::cout << " 1 : " << output_f32[idx] << " 2 : " << output_f32[idx + 1] + << " 3 : " << output_f32[idx + 2] << std::endl; + idx = idx + 3; + } + } +#ifdef _USE_FP16 + if (dt == DT_F16) { + F16 *output_f16 = (F16 *)get_ptr_from_tensor(outputTensorRef, UT_ARCH); + int idx = 0; + for (U32 i = 0; i < 1 + num_detected_max; i++) { + std::cout << " 1 : " << output_f16[idx] << " 2 : " << output_f16[idx + 1] + << " 3 : " << output_f16[idx + 2] << std::endl; + idx = idx + 3; + } + } +#endif + free(input_boxes); + free(input_scores); + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + std::cout << "----- Testing FP16 Nonmaxsuppression -----" << std::endl; + nonmaxsuppressionTest(argc, argv, DT_F16); + std::cout << "----- Finished FP16 Nonmaxsuppression -----" << std::endl; +#endif +#ifdef _USE_FP32 + std::cout << "----- Testing FP32 Nonmaxsuppression -----" << std::endl; + nonmaxsuppressionTest(argc, argv, DT_F32); + std::cout << "----- Finished FP32 Nonmaxsuppression -----" << std::endl; +#endif + return 0; +} diff --git a/compute/tensor/tests/test_normalization.cpp b/compute/tensor/tests/test_normalization.cpp new file mode 100644 index 00000000..63457173 --- /dev/null +++ b/compute/tensor/tests/test_normalization.cpp @@ -0,0 +1,111 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
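+// Editor's note: a scalar sketch of the per-position layer normalization this
+// test exercises, assuming the usual y = alpha * (x - mean) / sqrt(var + eps) + beta
+// over the channel axis (sketch only -- the authoritative reference is the
+// CPU_GENERAL path invoked below; the eps value is an assumption):
+//     static void layer_norm_ref(const F32 *x, const F32 *alpha, const F32 *beta,
+//         F32 *y, U32 tokens, U32 ic)
+//     {
+//         const F32 eps = 1e-6;  // assumed epsilon
+//         for (U32 t = 0; t < tokens; t++) {
+//             F32 mean = 0, var = 0;
+//             for (U32 c = 0; c < ic; c++) {
+//                 mean += x[t * ic + c];
+//             }
+//             mean /= ic;
+//             for (U32 c = 0; c < ic; c++) {
+//                 F32 d = x[t * ic + c] - mean;
+//                 var += d * d;
+//             }
+//             var /= ic;
+//             for (U32 c = 0; c < ic; c++) {
+//                 y[t * ic + c] =
+//                     alpha[c] * (x[t * ic + c] - mean) / sqrtf(var + eps) + beta[c];
+//             }
+//         }
+//     }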
+ +#include "tensor_computing.h" +#include "ut_util.h" + +int normalizationTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 6); + U32 alpha = atoi(argv[1]); + U32 beta = atoi(argv[2]); + U32 ic = atoi(argv[3]); + U32 ih = atoi(argv[4]); + U32 iw = atoi(argv[5]); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + DataFormat df = DF_MTK; + Tensor inputTensor; + TensorDesc inputDesc = tensor3df(dt, df, ic, ih, iw); + inputTensor.resize(inputDesc); + inputTensor.alloc(); + U32 input_len = tensorNumElements(inputDesc); + U8 *input = ut_input_v(input_len, dt, UT_INIT_RANDOM); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inputDesc)); + + // set output + Tensor outputTensor, outputTensorRef; + CHECK_STATUS(normalization_infer_output_size(&inputTensor, &outputTensor, &archInfo)); + outputTensor.alloc(); + TensorDesc outputDesc_ref = outputTensor.get_desc(); + outputTensorRef.resize(outputDesc_ref); + outputTensorRef.alloc(); + U32 output_len = outputTensor.length(); + CHECK_REQUIREMENT(input_len == ic * ih * iw && output_len == ic * ih * iw); + + U32 alpha_list[ic]; + U32 beta_list[ic]; + for (int i = 0; i < (int)ic; i++) { + alpha_list[i] = alpha; + beta_list[i] = beta; + } + Tensor alphaTensor, betaTensor; + TensorDesc alphaDesc, betaDesc; + alphaDesc = tensor1d(dt, ic); + betaDesc = tensor1d(dt, ic); + alphaTensor.resize(alphaDesc); + betaTensor.resize(betaDesc); + alphaTensor.alloc(); + betaTensor.alloc(); + memcpy(get_ptr_from_tensor(alphaTensor, UT_ARCH), alpha_list, tensorNumBytes(alphaDesc)); + memcpy(get_ptr_from_tensor(betaTensor, UT_ARCH), beta_list, tensorNumBytes(betaDesc)); + + if (UT_CHECK) { + CHECK_STATUS( + layer_normalization(inputTensor, alphaTensor, betaTensor, outputTensor, &archInfo)); + + // naive implement + CHECK_STATUS(layer_normalization( + inputTensor, alphaTensor, betaTensor, outputTensorRef, &archInfo_org)); + + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), output_len, dt, 0.000001, __FILE__, + __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS( + layer_normalization(inputTensor, alphaTensor, betaTensor, outputTensor, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u)=(%u %u %u)", ic, ih, iw, ic, ih, iw); + sprintf(buffer, "%20s, %80s", "Normalization", params); + double ops = input_len; + ut_log(dt, buffer, ops, time); + + free(input); + + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + normalizationTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + normalizationTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_padding.cpp b/compute/tensor/tests/test_padding.cpp new file mode 100644 index 00000000..8d2540b7 --- /dev/null +++ b/compute/tensor/tests/test_padding.cpp @@ -0,0 +1,115 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "tensor_computing.h"
+#include "ut_util.h"
+
+int paddingTest(int argc, char **argv, DataType dt)
+{
+    CHECK_REQUIREMENT(argc == 14);
+    // input dim
+    U32 in = atoi(argv[1]);
+    U32 ic = atoi(argv[2]);
+    U32 ih = atoi(argv[3]);
+    U32 iw = atoi(argv[4]);
+
+    // padding info
+    U32 n_fir = atoi(argv[5]);
+    U32 c_fir = atoi(argv[6]);
+    U32 h_fir = atoi(argv[7]);
+    U32 w_fir = atoi(argv[8]);
+    U32 n_sec = atoi(argv[9]);
+    U32 c_sec = atoi(argv[10]);
+    U32 h_sec = atoi(argv[11]);
+    U32 w_sec = atoi(argv[12]);
+    U32 mode = atoi(argv[13]);
+    // only spatial (h/w) padding is supported here
+    CHECK_REQUIREMENT(n_fir == 0 && n_sec == 0 && c_fir == 0 && c_sec == 0);
+
+    ArchInfo archInfo;
+    archInfo.arch = UT_ARCH;
+    ArchInfo archInfo_org;
+    archInfo_org.arch = CPU_GENERAL;
+    PadParamSpec padParamSpec;
+
+    padParamSpec.top = h_fir;
+    padParamSpec.bottom = h_sec;
+    padParamSpec.left = w_fir;
+    padParamSpec.right = w_sec;
+    padParamSpec.constant_value = 0.0;
+    switch (mode) {
+        case 0: {
+            padParamSpec.pad_mode = Pad_Constant;
+            break;
+        }
+        case 1: {
+            padParamSpec.pad_mode = Pad_Edge;
+            break;
+        }
+        case 2: {
+            // limitation: for reflect mode, h_fir and h_sec must be smaller than ih
+            padParamSpec.pad_mode = Pad_Reflect;
+            break;
+        }
+        case 3: {
+            padParamSpec.pad_mode = Pad_Symmetric;
+            break;
+        }
+        default: {
+            UNI_ERROR_LOG("unknown padding mode %d\n", mode);
+            break;
+        }
+    }
+
+    Tensor inputTensor;
+    TensorDesc inputDesc = tensor4df(dt, DF_NCHW, in, ic, ih, iw);
+    inputTensor.resize(inputDesc);
+    inputTensor.alloc();
+    U32 input_len = tensorNumElements(inputDesc);
+    U8 *input = ut_input_v(input_len, dt, UT_INIT_RANDOM);
+    memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inputDesc));
+
+    // set output
+    Tensor outputTensor, outputTensorRef;
+    CHECK_STATUS(
+        padding_infer_output_size(&inputTensor, padParamSpec, &outputTensor, &archInfo_org));
+    outputTensor.alloc();
+    TensorDesc outputDesc_ref = outputTensor.get_desc();
+    outputTensorRef.resize(outputDesc_ref);
+    outputTensorRef.alloc();
+    U32 output_len = outputTensor.length();
+
+    if (UT_CHECK) {
+        CHECK_STATUS(padding(inputTensor, padParamSpec, outputTensor, &archInfo));
+
+        CHECK_STATUS(padding(inputTensor, padParamSpec, outputTensorRef, &archInfo_org));
+
+        // check
+        ut_check_a(get_ptr_from_tensor(outputTensor, UT_ARCH),
+            get_ptr_from_tensor(outputTensorRef, UT_ARCH), output_len, dt);
+    }
+
+    free(input);
+    return 0;
+}
+
+int
main(int argc, char **argv)
+{
+#ifdef _USE_FP16
+    paddingTest(argc, argv, DT_F16);
+#endif
+    return 0;
+}
diff --git a/compute/tensor/tests/test_padding_ocl.cpp b/compute/tensor/tests/test_padding_ocl.cpp
new file mode 100644
index 00000000..9c0efc47
--- /dev/null
+++ b/compute/tensor/tests/test_padding_ocl.cpp
@@ -0,0 +1,191 @@
+
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "tensor_computing.h"
+#include "ut_util.h"
+#include "gcl.h"
+#include "libkernelsource.h"
+
+#ifdef _USE_FP16
+inline GCLMem_t alloc(Tensor tensor)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    mem->alloc();
+    return (GCLMem_t)mem->get_ptr();
+}
+
+inline GCLMem_t alloc_map(Tensor tensor)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    mem->mapped_alloc();
+    return (GCLMem_t)mem->get_ptr();
+}
+
+inline GCLMem_t alloc_bytes(Tensor tensor, U32 size)
+{
+    auto mem = (OclMemory *)tensor.get_memory();
+    GCLMem_t ptr = NULL;
+    if (size > 0) {
+        mem->resize(tensor1d(DT_U8, size));
+        mem->alloc();
+        ptr = (GCLMem_t)mem->get_ptr();
+    }
+    return ptr;
+}
+
+int paddingTest(int argc, char **argv, DataType dt)
+{
+    // same 13-argument CLI as test_padding; the n/c pads (argv[5, 6, 9, 10]) are ignored here
+    CHECK_REQUIREMENT(argc == 14);
+    // input dim
+    U32 in = atoi(argv[1]);
+    U32 ic = atoi(argv[2]);
+    U32 ih = atoi(argv[3]);
+    U32 iw = atoi(argv[4]);
+
+    // padding info
+    U32 h_fir = atoi(argv[7]);
+    U32 w_fir = atoi(argv[8]);
+    U32 h_sec = atoi(argv[11]);
+    U32 w_sec = atoi(argv[12]);
+    U32 mode = atoi(argv[13]);
+
+    ArchInfo archInfo;
+    archInfo.arch = MALI;
+    ArchInfo archInfo_org;
+    archInfo_org.arch = CPU_GENERAL;
+    PadParamSpec padParamSpec;
+
+    padParamSpec.top = h_fir;
+    padParamSpec.bottom = h_sec;
+    padParamSpec.left = w_fir;
+    padParamSpec.right = w_sec;
+    padParamSpec.constant_value = 0.0;
+    switch (mode) {
+        case 0: {
+            padParamSpec.pad_mode = Pad_Constant;
+            break;
+        }
+        case 1: {
+            padParamSpec.pad_mode = Pad_Edge;
+            break;
+        }
+        case 2: {
+            // limitation: for reflect mode, h_fir and h_sec must be smaller than ih
+            padParamSpec.pad_mode = Pad_Reflect;
+            break;
+        }
+        case 3: {
+            padParamSpec.pad_mode = Pad_Symmetric;
+            break;
+        }
+        default: {
+            UNI_ERROR_LOG("unknown padding mode %d\n", mode);
+            break;
+        }
+    }
+
+    TensorDesc inputDescCPU, inputDescGPU, outputDescCPU, outputDescGPU;
+    inputDescCPU = tensor4df(dt, DF_NCHW, in, ic, ih, iw);
+    inputDescGPU = tensor4df(dt, DF_NCHW, in, ic, ih, iw);
+
+    U32 input_len = tensorNumElements(inputDescCPU);
+    U8 *inputCPU = ut_input_v(input_len, dt, UT_INIT_RANDOM);
+    U8 *outputGPU = NULL;
+
+    std::shared_ptr<GCLHandle> handleSharedPtr = OCLContext::getInstance().handle;
+    GCLHandle_t handle = handleSharedPtr.get();
+    std::vector<Kernel> kernelVec;
+    handle->kernelVec = &kernelVec;
+    Tensor inputTensor = Tensor(OCLMem);
+    Tensor outputTensor = Tensor(OCLMem);
+    Tensor tmpTensor = Tensor(OCLMem);
+    inputTensor.resize(inputDescGPU);
+
+    MaliPara maliPara;
+    maliPara.handle = handle;
+    archInfo.archPara = &maliPara;
+
+    CHECK_STATUS(padding_infer_output_size(&inputTensor, padParamSpec, &outputTensor, &archInfo));
+
+    GCLMem_t output = alloc_map(outputTensor);
+    GCLMem_t input = alloc(inputTensor);
+    CHECK_STATUS(gcl_fill_memory_zero(handle, input));
+
+    U32 maxBytes = 0;
+    U32 tmpBytes = 0;
+    tmpBytes = tensorNumBytes(inputDescGPU);
+    maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes;
+    GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes);
+
+    CHECK_STATUS(ocl_set_input(handle, input, inputDescGPU, inputCPU, tmpbuf, true));
+    CHECK_STATUS(padding(inputTensor, padParamSpec, outputTensor, &archInfo));
+    /* warm up */
+    UNI_INFO_LOG("Warm up gpu:\n")
+    for (U32 i = 0; i < 2; i++) {
+        CHECK_STATUS(gcl_run_kernelVec(handle));
+    }
+#ifdef _DEBUG
+    CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size()));
+    double time = handle->t_execute * 0.001;
+#else
+    CHECK_STATUS(gcl_run_kernelVec(handle));
+#endif
+    outputDescGPU = outputTensor.get_desc();
+    CHECK_STATUS(ocl_get_output(handle, output, outputDescGPU, true));
+    outputGPU = output->mapPtrArray.back();
+
+    char buffer[150];
+    char params[120];
+    U32 on, oc, oh, ow;
+    on = outputDescGPU.dims[3];
+    oc = outputDescGPU.dims[2];
+    oh = outputDescGPU.dims[1];
+    ow = outputDescGPU.dims[0];
+    sprintf(params, "(%u %u %u %u)->(%u %u %u %u)", in, ic, ih, iw, on, oc, oh, ow);
+    sprintf(buffer, "%20s, %80s", "padding", params);
+#ifdef _DEBUG
+    double ops = on * oc * oh * ow * 4;  // TO DO
+    ut_log(dt, buffer, ops, time);
+#endif
+    Tensor inputTensorCpu;
+    inputTensorCpu.resize(inputDescCPU);
+    inputTensorCpu.alloc();
+    memcpy(get_ptr_from_tensor(inputTensorCpu, UT_ARCH), inputCPU, tensorNumBytes(inputDescCPU));
+
+    Tensor outputTensorCpu;
+    CHECK_STATUS(
+        padding_infer_output_size(&inputTensorCpu, padParamSpec, &outputTensorCpu, &archInfo_org));
+    outputTensorCpu.alloc();
+
+    if (UT_CHECK) {
+        CHECK_STATUS(padding(inputTensorCpu, padParamSpec, outputTensorCpu, &archInfo_org));
+    }
+    TensorDesc desc = outputTensorCpu.get_desc();
+    ut_check_a(
+        outputGPU, get_ptr_from_tensor(outputTensorCpu, UT_ARCH), tensorNumElements(desc), dt);
+
+    CHECK_STATUS(gcl_finish(handle));
+    CHECK_STATUS(gcl_clean_kernelVec(handle));
+    free(inputCPU);
+    return 0;
+}
+#endif
+
+int main(int argc, char **argv)
+{
+#ifdef _USE_FP16
+    paddingTest(argc, argv, DT_F16);
+#endif
+    return 0;
+}
diff --git a/compute/tensor/tests/test_pooling.cpp b/compute/tensor/tests/test_pooling.cpp
new file mode 100644
index 00000000..66b69173
--- /dev/null
+++ b/compute/tensor/tests/test_pooling.cpp
@@ -0,0 +1,117 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#include "ut_util.h" + +int poolingTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 15); + // in data + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + // weight + U32 fn = atoi(argv[5]); + U32 fc = atoi(argv[6]); + U32 fh = atoi(argv[7]); + U32 fw = atoi(argv[8]); + // stride & padding + U32 stride = atoi(argv[9]); + U32 padding = atoi(argv[10]); + // output + U32 on = atoi(argv[11]); + U32 oc = atoi(argv[12]); + U32 oh = atoi(argv[13]); + U32 ow = atoi(argv[14]); + CHECK_REQUIREMENT(in == 1 && fn == 1 && fc == 1); + CHECK_REQUIREMENT(ic == oc && ic % 8 == 0); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + PoolingParamSpec poolingParamSpec; + poolingParamSpec.mode = POOLING_MAX; + + poolingParamSpec.stride_h = stride; + poolingParamSpec.stride_w = stride; + poolingParamSpec.padding_top = padding; + poolingParamSpec.padding_bottom = padding; + poolingParamSpec.padding_left = padding; + poolingParamSpec.padding_right = padding; + poolingParamSpec.kernel_h = fh; + poolingParamSpec.kernel_w = fw; + poolingParamSpec.rm = CEIL; + + Tensor inputTensor; + TensorDesc inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); + inputTensor.resize(inputDesc); + inputTensor.alloc(); + U32 input_len = tensorNumElements(inputDesc); + U8 *input = ut_input_v(input_len, dt, UT_INIT_RANDOM); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inputDesc)); + + // set output + Tensor outputTensor, outputTensorRef; + CHECK_STATUS( + pooling_infer_output_size(&inputTensor, poolingParamSpec, &outputTensor, &archInfo)); + outputTensor.alloc(); + TensorDesc outputDesc_ref = outputTensor.get_desc(); + outputTensorRef.resize(outputDesc_ref); + outputTensorRef.alloc(); + U32 output_len = outputTensor.length(); + CHECK_REQUIREMENT(input_len == in * ic * ih * iw && output_len == on * oc * oh * ow); + Tensor tmpTensor; + if (UT_CHECK) { + CHECK_STATUS(pooling(inputTensor, poolingParamSpec, tmpTensor, outputTensor, &archInfo)); + + CHECK_STATUS( + pooling(inputTensor, poolingParamSpec, tmpTensor, outputTensorRef, &archInfo_org)); + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), output_len, dt, 0.05, __FILE__, __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { 
+ CHECK_STATUS(pooling(inputTensor, poolingParamSpec, tmpTensor, outputTensor, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, fc, fh, + fw, stride, padding, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "Pooling", params); + double ops = 1.0 * on * oc * oh * ow * fh * fw; + ut_log(dt, buffer, ops, time); + + free(input); + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + poolingTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + poolingTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_pooling_bp.cpp b/compute/tensor/tests/test_pooling_bp.cpp new file mode 100644 index 00000000..eb8c3943 --- /dev/null +++ b/compute/tensor/tests/test_pooling_bp.cpp @@ -0,0 +1,111 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
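+// Editor's note: the pooling_bp tested below is the backward pass of mean
+// pooling: each upstream-gradient value is divided by the kernel area and
+// scattered over its pooling window. A scalar sketch under that reading
+// (single channel, no padding; the names are illustrative, not part of this patch):
+//     static void mean_pool_bp_ref(const F32 *grad, F32 *out, U32 ih, U32 iw,
+//         U32 oh, U32 ow, U32 fh, U32 fw, U32 stride)
+//     {
+//         memset(out, 0, oh * ow * sizeof(F32));
+//         for (U32 h = 0; h < ih; h++) {
+//             for (U32 w = 0; w < iw; w++) {
+//                 for (U32 i = 0; i < fh; i++) {
+//                     for (U32 j = 0; j < fw; j++) {
+//                         U32 y = h * stride + i;
+//                         U32 x = w * stride + j;
+//                         if (y < oh && x < ow) {
+//                             out[y * ow + x] += grad[h * iw + w] / (fh * fw);
+//                         }
+//                     }
+//                 }
+//             }
+//         }
+//     }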
+ +#include "tensor_computing.h" +#include "ut_util.h" + +int poolingbpTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 15); + // in data + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + // weight + U32 fn = atoi(argv[5]); + U32 fc = atoi(argv[6]); + U32 fh = atoi(argv[7]); + U32 fw = atoi(argv[8]); + // stride & padding + U32 stride = atoi(argv[9]); + U32 padding = atoi(argv[10]); + // output + U32 on = atoi(argv[11]); + U32 oc = atoi(argv[12]); + U32 oh = atoi(argv[13]); + U32 ow = atoi(argv[14]); + CHECK_REQUIREMENT(fn == 1 && fc == 1); + CHECK_REQUIREMENT(ic == oc && ic % 8 == 0); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + PoolingParamSpec poolingParamSpec; + poolingParamSpec.mode = POOLING_MEAN; + poolingParamSpec.stride_h = stride; + poolingParamSpec.stride_w = stride; + poolingParamSpec.padding_top = padding; + poolingParamSpec.padding_bottom = padding; + poolingParamSpec.padding_left = padding; + poolingParamSpec.padding_right = padding; + poolingParamSpec.kernel_h = fh; + poolingParamSpec.kernel_w = fw; + + Tensor inputTensor; + TensorDesc inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); + inputTensor.resize(inputDesc); + inputTensor.alloc(); + U32 input_len = tensorNumElements(inputDesc); + U8 *input = ut_input_v(input_len, dt, UT_INIT_RANDOM); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inputDesc)); + + // set output + Tensor outputTensor, outputTensorRef; + TensorDesc outputDesc = tensor4df(dt, DF_NCHWC8, on, oc, oh, ow); + U32 output_len = tensorNumElements(outputDesc); + outputTensor.resize(outputDesc); + outputTensorRef.resize(outputDesc); + outputTensor.alloc(); + outputTensorRef.alloc(); + CHECK_REQUIREMENT(input_len == in * ic * ih * iw && output_len == on * oc * oh * ow); + + if (UT_CHECK) { + CHECK_STATUS(pooling_bp(inputTensor, poolingParamSpec, outputTensor, &archInfo)); + + CHECK_STATUS(pooling_bp(inputTensor, poolingParamSpec, outputTensorRef, &archInfo_org)); + + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), output_len, dt, 0.05, __FILE__, __LINE__); + } + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(pooling_bp(inputTensor, poolingParamSpec, outputTensor, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)+(%u %u %u %u)*(%u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, fc, fh, + fw, stride, padding, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "Pooling_bp", params); + double ops = 1.0 * on * oc * oh * ow * fh * fw; + ut_log(dt, buffer, ops, time); + + free(input); + return 0; +} + +int main(int argc, char **argv) +{ +// only support average pooling now +#ifdef _USE_FP32 + poolingbpTest(argc, argv, DT_F32); +#endif + return 0; +} \ No newline at end of file diff --git a/compute/tensor/tests/test_pooling_int8.cpp b/compute/tensor/tests/test_pooling_int8.cpp new file mode 100644 index 00000000..bc2784ec --- /dev/null +++ b/compute/tensor/tests/test_pooling_int8.cpp @@ -0,0 +1,150 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#include "ut_util.h" + +#ifdef _USE_INT8 +int int8PoolingTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 15); + // in data + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + // weight + U32 fn = atoi(argv[5]); + U32 fc = atoi(argv[6]); + U32 fh = atoi(argv[7]); + U32 fw = atoi(argv[8]); + // stride & padding + U32 stride = atoi(argv[9]); + U32 padding = atoi(argv[10]); + // output + U32 on = atoi(argv[11]); + U32 oc = atoi(argv[12]); + U32 oh = atoi(argv[13]); + U32 ow = atoi(argv[14]); + CHECK_REQUIREMENT(in == 1 && fn == 1 && fc == 1); + CHECK_REQUIREMENT(ic == oc && ic % 8 == 0); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + PoolingParamSpec poolingParamSpec; + poolingParamSpec.mode = POOLING_MEAN; + + poolingParamSpec.stride_h = stride; + poolingParamSpec.stride_w = stride; + poolingParamSpec.padding_top = padding; + poolingParamSpec.padding_bottom = padding; + poolingParamSpec.padding_left = padding; + poolingParamSpec.padding_right = padding; + poolingParamSpec.kernel_h = fh; + poolingParamSpec.kernel_w = fw; + poolingParamSpec.rm = CEIL; + + TensorDesc input_desc = tensor4df(DT_I8, DF_NCHWC8, in, ic, ih, iw); + TensorDesc in_desc_ref = input_desc; + in_desc_ref.dt = dt; + + Tensor inputTensor, outputTensor; + inputTensor.resize(input_desc); + + //TensorDesc output_desc; + CHECK_STATUS( + pooling_infer_output_size(&inputTensor, poolingParamSpec, &outputTensor, &archInfo)); + U32 input_len = tensorNumElements(input_desc); + U32 output_len = outputTensor.length(); + CHECK_REQUIREMENT(input_len == in * ic * ih * iw && output_len == on * oc * oh * ow); + + U8 *input_ref = ut_input_v(input_len, dt, UT_INIT_RANDOM); + Tensor inputTensorRef = Tensor::alloc_sized(in_desc_ref); + memcpy(get_ptr_from_tensor(inputTensorRef, UT_ARCH), input_ref, tensorNumBytes(in_desc_ref)); + + inputTensor.alloc(); + F16 inputScale = -1; + quantize_tensor(in_desc_ref, input_ref, &input_desc, get_ptr_from_tensor(inputTensor, UT_ARCH), + &inputScale); + inputTensor.set_scale(inputScale); + + outputTensor.alloc(); + INT8 *output = (INT8 *)get_ptr_from_tensor(outputTensor, UT_ARCH); + U8 *out_d = ut_input_v(output_len, dt, UT_INIT_ZERO); + + TensorDesc outDescRef = outputTensor.get_desc(); + outDescRef.dt = dt; + Tensor outputTensorRef = Tensor::alloc_sized(outDescRef); + + Tensor tmpTensor; + if 
(UT_CHECK) { + CHECK_STATUS(pooling(inputTensor, poolingParamSpec, tmpTensor, outputTensor, &archInfo)); + F32 outputScale = outputTensor.get_scale(); + for (U32 i = 0; i < output_len; i++) { + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: + ((F32 *)out_d)[i] = output[i] / outputScale; + break; +#endif +#ifdef _USE_FP16 + case DT_F16: + ((F16 *)out_d)[i] = output[i] / outputScale; + break; +#endif + default: + break; + } + } + + CHECK_STATUS( + pooling(inputTensorRef, poolingParamSpec, tmpTensor, outputTensorRef, &archInfo_org)); + + // check + ut_check_v(out_d, get_ptr_from_tensor(outputTensorRef, UT_ARCH), output_len, dt, 0.05, + __FILE__, __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(pooling(inputTensor, poolingParamSpec, tmpTensor, outputTensor, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, fc, fh, + fw, stride, padding, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "Pooling", params); + double ops = 1.0 * on * oc * oh * ow * fh * fw; + ut_log(DT_I8, buffer, ops, time); + + free(input_ref); + free(out_d); + + return 0; +} +#endif + +int main(int argc, char **argv) +{ +#ifdef _USE_INT8 + int8PoolingTest(argc, argv, DT_F16); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_pooling_ocl.cpp b/compute/tensor/tests/test_pooling_ocl.cpp new file mode 100644 index 00000000..d507efa9 --- /dev/null +++ b/compute/tensor/tests/test_pooling_ocl.cpp @@ -0,0 +1,210 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
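+// Editor's note: DF_NCHWC8 stores channels in blocks of 8, so element (c, h, w)
+// of a single batch lives at ((c / 8) * ih * iw + h * iw + w) * 8 + c % 8.
+// An index-based sketch with the same intent as the NCHWC8_to_NCHW helper
+// defined below (single batch assumed; not part of the original patch):
+//     static void nchwc8_to_nchw_ref(const F16 *src, F16 *dst, U32 ic, U32 ih, U32 iw)
+//     {
+//         for (U32 c = 0; c < ic; c++) {
+//             for (U32 hw = 0; hw < ih * iw; hw++) {
+//                 dst[c * ih * iw + hw] = src[((c / 8) * ih * iw + hw) * 8 + c % 8];
+//             }
+//         }
+//     }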
+ +#include "tensor_computing.h" +#include "ut_util.h" +#include "libkernelsource.h" +#include +#include "gcl.h" +#include + +#ifdef _USE_FP16 +inline GCLMem_t alloc(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_map(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->mapped_alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_bytes(Tensor tensor, U32 size) +{ + auto mem = (OclMemory *)tensor.get_memory(); + GCLMem_t ptr = NULL; + if (size > 0) { + mem->resize(tensor1d(DT_U8, size)); + mem->alloc(); + ptr = (GCLMem_t)mem->get_ptr(); + } + return ptr; +} +void NCHWC8_to_NCHW(F16 *input_cpu, F16 *input_cpu_nchw, U32 ih, U32 iw, U32 ic) +{ + int index_c = 0; + int index_hw = 0; + int channel_k = 0; + for (int i = 0; i < (int)(ic * ih * iw);) { + index_c = i % (ih * iw); + index_hw = i / (ih * iw); + for (int k = 0; k < 8; k++) { + if (index_hw % 8 == 0) { + channel_k = index_hw * (ih * iw); + } + if (index_c == 0) { + for (int j = 0; j < (int)(ih * iw); j++) { + input_cpu_nchw[i++] = input_cpu[channel_k + k + j * 8]; + } + } + } + } +} + +int poolingTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 15); + // in data + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + // weight + U32 fn = atoi(argv[5]); + U32 fc = atoi(argv[6]); + U32 fh = atoi(argv[7]); + U32 fw = atoi(argv[8]); + // stride & padding + U32 stride = atoi(argv[9]); + U32 padding = atoi(argv[10]); + // output + U32 on = atoi(argv[11]); + U32 oc = atoi(argv[12]); + U32 oh = atoi(argv[13]); + U32 ow = atoi(argv[14]); + CHECK_REQUIREMENT(in == 1 && fn == 1 && fc == 1); + CHECK_REQUIREMENT(ic == oc && ic % 8 == 0); + ArchInfo archInfo; + archInfo.arch = MALI; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + PoolingParamSpec poolingParamSpec; + poolingParamSpec.mode = POOLING_MEAN; + poolingParamSpec.stride_h = stride; + poolingParamSpec.stride_w = stride; + poolingParamSpec.padding_top = padding; + poolingParamSpec.padding_bottom = padding; + poolingParamSpec.padding_left = padding; + poolingParamSpec.padding_right = padding; + poolingParamSpec.kernel_h = fh; + poolingParamSpec.kernel_w = fw; + poolingParamSpec.rm = CEIL; + + TensorDesc input_desc_cpu = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); + TensorDesc input_desc_gpu = tensor4df(dt, DF_NCHW, in, ic, ih, iw); + TensorDesc output_desc_cpu, output_desc_gpu; + U32 input_len = tensorNumElements(input_desc_cpu); + U8 *input_cpu_nchwc8 = ut_input_v(input_len, dt, UT_INIT_RANDOM); + U8 *input_cpu_nchw = ut_input_v(input_len, dt, UT_INIT_ZERO); + NCHWC8_to_NCHW((F16 *)input_cpu_nchwc8, (F16 *)input_cpu_nchw, ih, iw, ic); + Tensor inputTensorCpu; + inputTensorCpu.resize(input_desc_cpu); + inputTensorCpu.alloc(); + memcpy(get_ptr_from_tensor(inputTensorCpu, UT_ARCH), input_cpu_nchwc8, + tensorNumBytes(input_desc_cpu)); + + Tensor outputTensorCpu; + Tensor tmpTensorCpu; + CHECK_STATUS(pooling_infer_output_size( + &inputTensorCpu, poolingParamSpec, &outputTensorCpu, &archInfo_org)); + + outputTensorCpu.alloc(); + CHECK_STATUS( + pooling(inputTensorCpu, poolingParamSpec, tmpTensorCpu, outputTensorCpu, &archInfo_org)); + + U32 output_len = outputTensorCpu.length(); + U8 *output_cpu_nchw = ut_input_v(output_len, dt, UT_INIT_ZERO); + NCHWC8_to_NCHW( + (F16 *)get_ptr_from_tensor(outputTensorCpu, UT_ARCH), (F16 *)output_cpu_nchw, oh, ow, oc); + + std::shared_ptr 
<GCLHandle> handleSharedPtr = OCLContext::getInstance().handle;
+    GCLHandle_t handle = handleSharedPtr.get();
+    std::vector<Kernel> kernelVec;
+    handle->kernelVec = &kernelVec;
+    Tensor inputTensor = Tensor(OCLMem);
+    Tensor outputTensor = Tensor(OCLMem);
+    Tensor tmpTensor = Tensor(OCLMem);
+    inputTensor.resize(input_desc_gpu);
+
+    MaliPara maliPara;
+    maliPara.handle = handle;
+    archInfo.archPara = &maliPara;
+
+    CHECK_STATUS(
+        pooling_infer_output_size(&inputTensor, poolingParamSpec, &outputTensor, &archInfo));
+    U32 maxBytes = 0;
+    U32 tmpBytes = 0;
+    CHECK_STATUS(pooling_infer_forward_tmp_bytes(inputTensor, outputTensor, &tmpBytes, &archInfo));
+    maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes;
+
+    GCLMem_t output = alloc_map(outputTensor);
+    GCLMem_t input = alloc(inputTensor);
+    CHECK_STATUS(gcl_fill_memory_zero(handle, input));
+    tmpBytes = tensorNumBytes(input_desc_gpu);
+    maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes;
+    GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes);
+
+    CHECK_STATUS(ocl_set_input(handle, input, input_desc_gpu, input_cpu_nchw, tmpbuf, true));
+    CHECK_STATUS(pooling(inputTensor, poolingParamSpec, tmpTensor, outputTensor, &archInfo));
+
+    /* warm up */
+    UNI_INFO_LOG("Warm up gpu:\n")
+    for (U32 i = 0; i < 2; i++) {
+        CHECK_STATUS(gcl_run_kernelVec(handle));
+    }
+
+    UNI_INFO_LOG("Run:\n")
+#ifdef _DEBUG
+    CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size()));
+    double time = handle->t_execute * 0.001;
+#else
+    CHECK_STATUS(gcl_run_kernelVec(handle));
+#endif
+
+    TensorDesc outputDesc = outputTensor.get_desc();
+    CHECK_STATUS(ocl_get_output(handle, output, outputDesc, true));
+    void *output_gpu_val = output->mapPtrArray.back();
+
+    char buffer[150];
+    char params[120];
+    sprintf(params, "(%u %u %u %u)+(%u %u %u %u)/(%u %u)=(%u %u %u %u)", in, ic, ih, iw, fn, fc, fh,
+        fw, stride, padding, on, oc, oh, ow);
+    sprintf(buffer, "%20s, %80s", "Pooling", params);
+#ifdef _DEBUG
+    double ops = 1.0 * on * oc * oh * ow * fh * fw;
+    ut_log(dt, buffer, ops, time);
+#endif
+
+    ut_check_a(output_gpu_val, output_cpu_nchw, on * oc * ow * oh, dt);
+    free(input_cpu_nchwc8);
+    free(input_cpu_nchw);
+    free(output_cpu_nchw);
+    CHECK_STATUS(gcl_finish(handle));
+    CHECK_STATUS(gcl_clean_kernelVec(handle));
+    return 0;
+}
+#endif
+
+int main(int argc, char **argv)
+{
+#ifdef _USE_FP16
+    poolingTest(argc, argv, DT_F16);
+#endif
+    return 0;
+}
diff --git a/compute/tensor/tests/test_power.cpp b/compute/tensor/tests/test_power.cpp
new file mode 100644
index 00000000..8e91f4c9
--- /dev/null
+++ b/compute/tensor/tests/test_power.cpp
@@ -0,0 +1,85 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "tensor_computing.h"
+#include "ut_util.h"
+
+int powerTest(int argc, char **argv, DataType dt)
+{
+    CHECK_REQUIREMENT(argc == 5);
+    U32 len = atoi(argv[1]);
+    PowerParamSpec p;
+    p.scale = atof(argv[2]);
+    p.shift = atof(argv[3]);
+    p.power = atof(argv[4]);
+    ArchInfo archInfo;
+    archInfo.arch = UT_ARCH;
+    ArchInfo archInfo_org;
+    archInfo_org.arch = CPU_GENERAL;
+
+    Tensor inputTensor;
+    TensorDesc inputDesc = tensor1d(dt, len);
+    inputTensor.resize(inputDesc);
+    inputTensor.alloc();
+    U8 *input = ut_input_v(len, dt, UT_INIT_RANDOM);
+    memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inputDesc));
+    // set output
+    Tensor outputTensor, outputTensorRef;
+    CHECK_STATUS(power_infer_output_size(&inputTensor, &outputTensor, &archInfo));
+    outputTensor.alloc();
+    TensorDesc outputDesc_ref = outputTensor.get_desc();
+    outputTensorRef.resize(outputDesc_ref);
+    outputTensorRef.alloc();
+
+    if (UT_CHECK) {
+        CHECK_STATUS(power(inputTensor, p, outputTensor, &archInfo));
+
+        // naive reference implementation
+        CHECK_STATUS(power(inputTensor, p, outputTensorRef, &archInfo_org));
+
+        // check
+        ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH),
+            get_ptr_from_tensor(outputTensorRef, UT_ARCH), len, dt, 0.1, __FILE__, __LINE__);
+    }
+
+    // benchmark
+    double time_start = ut_time_ms();
+    for (int iter = 0; iter < UT_LOOPS; iter++) {
+        CHECK_STATUS(power(inputTensor, p, outputTensor, &archInfo));
+    }
+    double time_end = ut_time_ms();
+    double time = (time_end - time_start) / UT_LOOPS;
+
+    // log performance data
+    char buffer[150];
+    char params[120];
+    sprintf(params, "(%u)=(%u)", len, len);
+    sprintf(buffer, "%20s, %80s", "Power", params);
+    double ops = 2.0 * len;
+    ut_log(dt, buffer, ops, time);  // time is already the per-iteration average
+
+    free(input);
+
+    return 0;
+}
+
+int main(int argc, char **argv)
+{
+#ifdef _USE_FP16
+    powerTest(argc, argv, DT_F16);
+#endif
+#ifdef _USE_FP32
+    powerTest(argc, argv, DT_F32);
+#endif
+    return 0;
+}
diff --git a/compute/tensor/tests/test_power_ocl.cpp b/compute/tensor/tests/test_power_ocl.cpp
new file mode 100644
index 00000000..99a210a3
--- /dev/null
+++ b/compute/tensor/tests/test_power_ocl.cpp
@@ -0,0 +1,157 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifdef _USE_FP16 +#include "tensor_computing.h" +#include "ut_util.h" +#include "gcl.h" +#include "libkernelsource.h" +inline GCLMem_t alloc(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_map(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->mapped_alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_bytes(Tensor tensor, U32 size) +{ + auto mem = (OclMemory *)tensor.get_memory(); + GCLMem_t ptr = NULL; + if (size > 0) { + mem->resize(tensor1d(DT_U8, size)); + mem->alloc(); + ptr = (GCLMem_t)mem->get_ptr(); + } + return ptr; +} + +int powerTest(int argc, char **argv, DataType dt) +{ + U32 in = 1; + U32 ic = 4; + U32 ih = 4; + U32 iw = 4; + PowerParamSpec p; + p.scale = 0.5; + p.shift = 0.5; + p.power = 2; + if (argc == 8) { + in = atoi(argv[1]); + ic = atoi(argv[2]); + ih = atoi(argv[3]); + iw = atoi(argv[4]); + p.scale = atof(argv[5]); + p.shift = atof(argv[6]); + p.power = atof(argv[7]); + } + U32 on = in; + U32 oc = ic; + U32 oh = ih; + U32 ow = iw; + + ArchInfo archInfo; + archInfo.arch = MALI; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + U32 len = in * ic * ih * iw; + + TensorDesc input_desc_cpu = tensor1d(dt, len); + TensorDesc output_desc_cpu = tensor1d(dt, len); + TensorDesc input_desc_gpu = tensor4df(dt, DF_NCHW, in, ic, ih, iw); + TensorDesc output_desc_gpu; + + U8 *input_cpu = ut_input_v(len, dt, UT_INIT_RANDOM); + U8 *output_gpu = NULL; + + std::shared_ptr handleSharedPtr = OCLContext::getInstance().handle; + GCLHandle_t handle = handleSharedPtr.get(); + std::vector kernelVec; + handle->kernelVec = &kernelVec; + Tensor inputTensor = Tensor(OCLMem); + Tensor outputTensor = Tensor(OCLMem); + Tensor tmpTensor = Tensor(OCLMem); + inputTensor.resize(input_desc_gpu); + + MaliPara maliPara; + maliPara.handle = handle; + archInfo.archPara = &maliPara; + CHECK_STATUS(power_infer_output_size(&inputTensor, &outputTensor, &archInfo)); + + GCLMem_t output = alloc_map(outputTensor); + GCLMem_t input = alloc(inputTensor); + CHECK_STATUS(gcl_fill_memory_zero(handle, input)); + + U32 maxBytes = 0; + U32 tmpBytes = 0; + tmpBytes = tensorNumBytes(input_desc_gpu); + maxBytes = (tmpBytes > maxBytes) ? 
tmpBytes : maxBytes; + GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes); + + CHECK_STATUS(ocl_set_input(handle, input, input_desc_gpu, input_cpu, tmpbuf, true)); + CHECK_STATUS(power(inputTensor, p, outputTensor, &archInfo)); + /*warp up*/ + UNI_INFO_LOG("Warp up gpu:\n") + for (U32 i = 0; i < 2; i++) { + CHECK_STATUS(gcl_run_kernelVec(handle)); + } + + UNI_INFO_LOG("Run:\n") +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size())); + double time = handle->t_execute * 0.001; +#else + CHECK_STATUS(gcl_run_kernelVec(handle)); +#endif + + output_desc_gpu = outputTensor.get_desc(); + CHECK_STATUS(ocl_get_output(handle, output, output_desc_gpu, true)); + output_gpu = output->mapPtrArray.back(); + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u) = (%u %u %u %u)", in, ic, ih, iw, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "Power", params); +#ifdef _DEBUG + double ops = (2.0 * on * oc * oh * ow); + ut_log(dt, buffer, ops, time); +#endif + Tensor inputTensorCpu; + inputTensorCpu.resize(input_desc_cpu); + inputTensorCpu.alloc(); + memcpy(get_ptr_from_tensor(inputTensorCpu, UT_ARCH), input_cpu, tensorNumBytes(input_desc_cpu)); + + Tensor outputTensorCpu; + outputTensorCpu.resize(output_desc_cpu); + outputTensorCpu.alloc(); + + CHECK_STATUS(power(inputTensorCpu, p, outputTensorCpu, &archInfo_org)); + ut_check_a(output_gpu, get_ptr_from_tensor(outputTensorCpu, UT_ARCH), on * oc * ow * oh, dt); + + CHECK_STATUS(gcl_finish(handle)); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + free(input_cpu); + return 0; +} + +int main(int argc, char **argv) +{ + powerTest(argc, argv, DT_F16); + return 0; +} +#endif diff --git a/compute/tensor/tests/test_prelu.cpp b/compute/tensor/tests/test_prelu.cpp new file mode 100644 index 00000000..097db305 --- /dev/null +++ b/compute/tensor/tests/test_prelu.cpp @@ -0,0 +1,92 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
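+// Editor's note: PReLU keeps positive values and scales negative ones by a
+// learned slope; with propagate_down == 0 (the case tested below) the slope
+// appears to be per-channel. A scalar sketch under that reading (illustrative
+// names only, not part of the original patch):
+//     static void prelu_ref(const F32 *x, const F32 *slope, F32 *y, U32 ic, U32 ihw)
+//     {
+//         for (U32 c = 0; c < ic; c++) {
+//             for (U32 i = 0; i < ihw; i++) {
+//                 F32 v = x[c * ihw + i];
+//                 y[c * ihw + i] = (v > 0) ? v : slope[c] * v;
+//             }
+//         }
+//     }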
+ +#include "tensor_computing.h" +#include "ut_util.h" + +int preluTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 5); + // in data + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + + CHECK_REQUIREMENT(ic % 8 == 0); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + PReLUParamSpec prelu_desc; + prelu_desc.propagate_down = 0; + TensorDesc inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); + TensorDesc weightDesc = tensor1d(dt, ic); + U32 input_len = tensorNumElements(inputDesc); + U8 *input = ut_input_v(input_len, dt, UT_INIT_RANDOM); + U8 *weight = ut_input_v(ic, dt, UT_INIT_RANDOM); + + Tensor inputTensor = Tensor::alloc_sized(inputDesc); + Tensor weightTensor = Tensor::alloc_sized(weightDesc); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inputDesc)); + memcpy(get_ptr_from_tensor(weightTensor, UT_ARCH), weight, tensorNumBytes(weightDesc)); + + // set output + Tensor outputTensor; + CHECK_STATUS(prelu_infer_output_size(&inputTensor, &outputTensor, &archInfo)); + outputTensor.alloc(); + Tensor outputTensorRef = Tensor::alloc_sized(outputTensor.get_desc()); + U32 output_len = outputTensor.length(); + CHECK_REQUIREMENT(input_len == in * ic * ih * iw && output_len == in * ic * ih * iw); + + if (UT_CHECK) { + CHECK_STATUS(prelu(inputTensor, weightTensor, prelu_desc, outputTensor, &archInfo)); + + CHECK_STATUS(prelu(inputTensor, weightTensor, prelu_desc, outputTensorRef, &archInfo_org)); + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), output_len, dt, 0.05, __FILE__, __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(prelu(inputTensor, weightTensor, prelu_desc, outputTensor, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)*(%u)=(%u %u %u %u)", in, ic, ih, iw, ic, in, ic, ih, iw); + sprintf(buffer, "%20s, %80s", "Prelu", params); + double ops = 2.0 * in * ic * ih * iw + 1.0 * in; + ut_log(dt, buffer, ops, time); + + free(input); + free(weight); + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + preluTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + preluTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_prelu_ocl.cpp b/compute/tensor/tests/test_prelu_ocl.cpp new file mode 100644 index 00000000..a61ec0b6 --- /dev/null +++ b/compute/tensor/tests/test_prelu_ocl.cpp @@ -0,0 +1,173 @@ + +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
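The test above exercises PReLU with propagate_down = 0, i.e. one learned negative slope per channel: y = x for x > 0 and y = weight[c] * x otherwise. A minimal plain-NCHW sketch, assuming fp32 (the test itself runs on the blocked DF_NCHWC8 layout, but the per-element math is the same):

    // Per-channel PReLU reference over an NCHW tensor
    static void prelu_ref(const float *x, const float *w, float *y, int n, int c, int hw)
    {
        for (int ni = 0; ni < n; ni++) {
            for (int ci = 0; ci < c; ci++) {
                for (int i = 0; i < hw; i++) {
                    int idx = (ni * c + ci) * hw + i;
                    y[idx] = (x[idx] > 0) ? x[idx] : w[ci] * x[idx];
                }
            }
        }
    }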
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifdef _USE_FP16 +#include "tensor_computing.h" +#include "ut_util.h" +#include "gcl.h" +#include "libkernelsource.h" +inline GCLMem_t alloc(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_map(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->mapped_alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_bytes(Tensor tensor, U32 size) +{ + auto mem = (OclMemory *)tensor.get_memory(); + GCLMem_t ptr = NULL; + if (size > 0) { + mem->resize(tensor1d(DT_U8, size)); + mem->alloc(); + ptr = (GCLMem_t)mem->get_ptr(); + } + return ptr; +} +inline GCLMem_t alloc_desc(Tensor tensor, GCLMemDesc desc) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->padding(desc); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +int preluTest(int argc, char **argv, DataType dt) +{ + // input dim + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + U32 prop = atoi(argv[5]); + U32 weightNum; + + ArchInfo archInfo; + archInfo.arch = MALI; + + TensorDesc inputDescGPU, outputDescGPU, weightDescGPU; + inputDescGPU = tensor4df(dt, DF_NCHW, in, ic, ih, iw); + + U32 input_len = tensorNumElements(inputDescGPU); + U8 *inputCPU = ut_input_v(input_len, dt, UT_INIT_RANDOM); + U8 *weightCPU = NULL; + U8 *outputGPU = NULL; + PReLUParamSpec preluDesc; + if (prop) { + preluDesc.propagate_down = true; + weightCPU = ut_input_v(1, dt, UT_INIT_RANDOM); + weightDescGPU = tensor1d(dt, 1); + } else { + preluDesc.propagate_down = false; + weightCPU = ut_input_v(ic, dt, UT_INIT_RANDOM); + weightDescGPU = tensor1d(dt, ic); + } + std::shared_ptr handleSharedPtr = OCLContext::getInstance().handle; + GCLHandle_t handle = handleSharedPtr.get(); + std::vector kernelVec; + handle->kernelVec = &kernelVec; + Tensor inputTensor = Tensor(OCLMem); + Tensor outputTensor = Tensor(OCLMem); + Tensor tmpTensor = Tensor(OCLMem); + Tensor weightTensor = Tensor(OCLMem); + inputTensor.resize(inputDescGPU); + weightTensor.resize(weightDescGPU); + + MaliPara maliPara; + maliPara.handle = handle; + archInfo.archPara = &maliPara; + + CHECK_STATUS(prelu_infer_output_size(&inputTensor, &outputTensor, &archInfo)); + + GCLMem_t output = alloc_map(outputTensor); + GCLMem_t input = alloc(inputTensor); + CHECK_STATUS(gcl_fill_memory_zero(handle, input)); + + GCLMemDesc desc = gclmem_build_desc(); + if (preluDesc.propagate_down) { + weightNum = 1; + desc.byteSize = weightNum * bytesOf(dt); + } else { + weightNum = (ic + 3) / 4 * 4; + desc.byteSize = weightNum * bytesOf(dt); + } + desc.stride[0] = weightNum; + desc.stride[1] = 1; + desc.stride[2] = 1; + desc.offset[0] = 0; + desc.offset[1] = 0; + desc.offset[2] = 0; + desc.memType = GCL_MEM_BUF; + desc.num = weightNum; + desc.memFormat = DF_NHWC; + desc.flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + if (ic != 1) { + U8 *weight_align = ut_input_v((ic + 3) / 4 * 4, dt, UT_INIT_ZERO); + memcpy(weight_align, weightCPU, tensorNumBytes(weightDescGPU));
+ desc.host_ptr = weight_align; + } else { + desc.host_ptr = weightCPU; + } + alloc_desc(weightTensor, desc); + + U32 tmpBytes; + U32 maxBytes = 0; + tmpBytes = tensorNumBytes(inputDescGPU); + maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes; + GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes); + + CHECK_STATUS(ocl_set_input(handle, input, inputDescGPU, inputCPU, tmpbuf, true)); + CHECK_STATUS(prelu(inputTensor, weightTensor, preluDesc, outputTensor, &archInfo)); + /*warm up*/ + UNI_INFO_LOG("Warm up gpu:\n") + for (U32 i = 0; i < 2; i++) { + CHECK_STATUS(gcl_run_kernelVec(handle)); + } + UNI_INFO_LOG("Run gpu:\n") +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size())); +// double time = handle->t_execute * 0.001; +#else + CHECK_STATUS(gcl_run_kernelVec(handle)); +#endif + outputDescGPU = outputTensor.get_desc(); + CHECK_STATUS(ocl_get_output(handle, output, outputDescGPU, true)); + outputGPU = output->mapPtrArray.back(); + + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)->(%u %u %u %u)", in, ic, ih, iw, in, ic, ih, iw); + sprintf(buffer, "%20s, %80s", "prelu", params); + CHECK_STATUS(gcl_finish(handle)); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + free(inputCPU); + free(weightCPU); + return 0; +} + +int main(int argc, char **argv) +{ + preluTest(argc, argv, DT_F16); + return 0; +} +#endif diff --git a/tests/test_priorbox.cpp b/compute/tensor/tests/test_priorbox.cpp similarity index 50% rename from tests/test_priorbox.cpp rename to compute/tensor/tests/test_priorbox.cpp index 3043dad3..29dea659 100644 --- a/tests/test_priorbox.cpp +++ b/compute/tensor/tests/test_priorbox.cpp @@ -1,22 +1,22 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
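A note on the (ic + 3) / 4 * 4 rounding used when building the GPU weight buffer above: it pads the channel count up to a multiple of 4, presumably so the kernel can read 4-wide vectors, and the zero-initialized tail keeps the padding harmless. For example:

    U32 ic = 6;
    U32 aligned = (ic + 3) / 4 * 4;  // 6 -> 8; the weight buffer becomes [w0..w5, 0, 0]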
- #include "tensor_computing.h" #include "ut_util.h" #include -int priorboxTest(int argc, char **argv, DataType dt){ +int priorboxTest(int argc, char **argv, DataType dt) +{ CHECK_REQUIREMENT(argc == 18 || argc == 19 || argc == 20 || argc == 21); // in0 feature map U32 in0 = atoi(argv[1]); @@ -40,37 +40,47 @@ int priorboxTest(int argc, char **argv, DataType dt){ U32 olens = atoi(argv[16]); // multi param priorbox F32 ar1 = (F32)atof(argv[17]); - F32 ar2; - F32 min_size1; - F32 max_size1; - if(argc == 19 || argc == 21){ + F32 ar2 = 0; + F32 min_size1 = 0; + F32 max_size1 = 0; + if (argc == 19 || argc == 21) { ar2 = (F32)atof(argv[18]); - if(argc == 21){ + if (argc == 21) { min_size1 = (F32)atof(argv[19]); max_size1 = (F32)atof(argv[20]); } } - if(argc == 20){ + if (argc == 20) { min_size1 = (F32)atof(argv[18]); max_size1 = (F32)atof(argv[19]); } - + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + CHECK_REQUIREMENT(in0 == 1 && in1 == 1 && on == 1 && oc == 2); - PriorBoxDesc priorbox_desc; - priorbox_desc.min_sizes.push_back(min_size); - priorbox_desc.max_sizes.push_back(max_size); - priorbox_desc.aspect_ratios.push_back(ar1); - if(argc == 19 || argc == 21){ - priorbox_desc.aspect_ratios.push_back(ar2); - if(argc == 21){ - priorbox_desc.min_sizes.push_back(min_size1); - priorbox_desc.max_sizes.push_back(max_size1); + PriorBoxParamSpec priorbox_desc; + int min_sizes_len = 1; + int max_sizes_len = 1; + int aspect_ratios_len = 1; + priorbox_desc.min_sizes[0] = min_size; + priorbox_desc.max_sizes[0] = max_size; + priorbox_desc.aspect_ratios[0] = ar1; + priorbox_desc.min_sizes[1] = min_size1; + priorbox_desc.max_sizes[1] = max_size1; + priorbox_desc.aspect_ratios[1] = ar2; + if (argc == 19 || argc == 21) { + aspect_ratios_len++; + if (argc == 21) { + min_sizes_len++; + max_sizes_len++; } } - if(argc == 20){ - priorbox_desc.min_sizes.push_back(min_size1); - priorbox_desc.max_sizes.push_back(max_size1); + if (argc == 20) { + min_sizes_len++; + max_sizes_len++; } priorbox_desc.flip = flip; priorbox_desc.clip = clip; @@ -84,70 +94,74 @@ int priorboxTest(int argc, char **argv, DataType dt){ priorbox_desc.variances[3] = 0.20000000298; priorbox_desc.offset = 0.5; - std::vector input_descs; - TensorDesc output_desc; - TensorDesc input_desc_fm = tensor4df(dt, DF_NCHWC8, in0, ic0, ih0, iw0); - TensorDesc input_desc_data = tensor4df(dt, DF_NCHWC8, in1, ic1, ih1, iw1); - input_descs.push_back(input_desc_fm); - input_descs.push_back(input_desc_data); - CHECK_STATUS(priorbox_infer_output_size(input_descs, priorbox_desc, &output_desc, UT_ARCH)); - U32 input_len_fm = tensorNumElements(input_descs[0]); - U32 input_len_data = tensorNumElements(input_descs[1]); - U32 output_len = tensorNumElements(output_desc); - CHECK_REQUIREMENT(input_len_fm == in0*ic0*ih0*iw0 && input_len_data == in1*ic1*ih1*iw1 && output_len == on*oc*olens); + std::vector inputTensors(2); + std::vector inputTensorPtrs(2); + Tensor inputTensor_fm, inputTensor_data; + TensorDesc inputDesc_fm = tensor4df(dt, DF_NCHWC8, in0, ic0, ih0, iw0); + TensorDesc inputDesc_data = tensor4df(dt, DF_NCHWC8, in1, ic1, ih1, iw1); + inputTensor_fm.resize(inputDesc_fm); + inputTensor_data.resize(inputDesc_data); + U32 input_len_fm = tensorNumElements(inputDesc_fm); + U32 input_len_data = tensorNumElements(inputDesc_data); + inputTensors[0] = inputTensor_fm; + inputTensors[1] = inputTensor_data; + inputTensorPtrs[0] = &inputTensors[0]; + inputTensorPtrs[1] = &inputTensors[1]; + // set output + Tensor 
outputTensor, outputTensorRef; + CHECK_STATUS( + priorbox_infer_output_size(inputTensorPtrs, priorbox_desc, &outputTensor, &archInfo)); + outputTensor.alloc(); + TensorDesc outputDesc_ref = outputTensor.get_desc(); + outputTensorRef.resize(outputDesc_ref); + outputTensorRef.alloc(); + U32 output_len = outputTensor.length(); + CHECK_REQUIREMENT(input_len_fm == in0 * ic0 * ih0 * iw0 && + input_len_data == in1 * ic1 * ih1 * iw1 && output_len == on * oc * olens); - U8* output = ut_input_v(output_len, dt, UT_INIT_ZERO); - U8* output_ref = ut_input_v(output_len, dt, UT_INIT_ZERO); - if (UT_CHECK) { - CHECK_STATUS(priorbox(input_descs, - priorbox_desc, - output_desc, output, - UT_ARCH)); + CHECK_STATUS(priorbox(inputTensors, priorbox_desc, outputTensor, &archInfo)); - CHECK_STATUS(priorbox(input_descs, - priorbox_desc, - output_desc, output_ref, - CPU_GENERAL)); + CHECK_STATUS(priorbox(inputTensors, priorbox_desc, outputTensorRef, &archInfo_org)); // check - ut_check_v(output, output_ref, output_len, dt, 0.05, __FILE__, __LINE__); + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), output_len, dt, 0.05, __FILE__, __LINE__); } // benchmark double time_start = ut_time_ms(); - for(int iter = 0; iter < UT_LOOPS; iter++){ - CHECK_STATUS(priorbox(input_descs, priorbox_desc, output_desc, output, UT_ARCH)); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(priorbox(inputTensors, priorbox_desc, outputTensor, &archInfo)); } double time_end = ut_time_ms(); double time = (time_end - time_start) / UT_LOOPS; // log performance data - U32 num_priorboxs = priorbox_desc.aspect_ratios.size(); - if(priorbox_desc.flip){ + U32 num_priorboxs = aspect_ratios_len; + if (priorbox_desc.flip) { num_priorboxs = num_priorboxs * 2; } - U32 num_minsize = priorbox_desc.min_sizes.size(); + U32 num_minsize = min_sizes_len; num_priorboxs = (num_priorboxs + 1) * num_minsize; - if(!priorbox_desc.max_sizes.empty()){ - U32 num_maxsize = priorbox_desc.max_sizes.size(); + if (max_sizes_len != 0) { + U32 num_maxsize = max_sizes_len; num_priorboxs = num_priorboxs + num_maxsize; } U32 ochannel = 2; U32 numperbox = 4; char buffer[150]; char params[120]; - sprintf(params, "(%u %u %u %u) * (%u %u %u) = (%u %u %u)", in0, ic0, ih0, iw0, ochannel, numperbox, num_priorboxs, on, oc, olens); + sprintf(params, "(%u %u %u %u) * (%u %u %u) = (%u %u %u)", in0, ic0, ih0, iw0, ochannel, + numperbox, num_priorboxs, on, oc, olens); sprintf(buffer, "%20s, %80s", "Priorbox", params); double ops = 1.0 * output_len; ut_log(dt, buffer, ops, time); - free(output); - free(output_ref); return 0; } - -int main(int argc, char** argv){ +int main(int argc, char **argv) +{ #ifdef _USE_FP16 priorboxTest(argc, argv, DT_F16); #endif @@ -155,4 +169,4 @@ int main(int argc, char** argv){ priorboxTest(argc, argv, DT_F32); #endif return 0; -} \ No newline at end of file +} diff --git a/compute/tensor/tests/test_reduction.cpp b/compute/tensor/tests/test_reduction.cpp new file mode 100644 index 00000000..f815f543 --- /dev/null +++ b/compute/tensor/tests/test_reduction.cpp @@ -0,0 +1,107 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
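The box count logged by this test follows the usual SSD prior-box convention, which the counting code above mirrors: each aspect ratio contributes one box (doubled when flip is set), plus one extra box per min_size, plus one per max_size. A worked example with one extra aspect ratio off, flip on, one min_size and one max_size:

    // aspect_ratios_len = 1, flip = 1, min_sizes_len = 1, max_sizes_len = 1:
    // (1 * 2 + 1) * 1 + 1 = 4 prior boxes per feature-map location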
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include "tensor_computing.h" +#include "ut_util.h" + +int reductionTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc >= 6); + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + ReductionParamSpec p; + p.axes_num = atoi(argv[5]); + for (int i = 0; i < p.axes_num; i++) { + p.axes[i] = atoi(argv[6 + i]); + } + p.reduction_mode = REDUCTION_MEAN; + p.coeff = 1.0; + p.keep_dim = true; + DataFormat df = DF_NCHW; + TensorDesc maskDesc; + maskDesc.nDims = 0; + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + TensorDesc inDesc = tensor4df(dt, df, in, ic, ih, iw); + U8 *input = ut_input_v(tensorNumElements(inDesc), dt, UT_INIT_RANDOM); + Tensor inputTensor; + inputTensor.resize(inDesc); + inputTensor.alloc(); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inDesc)); + + Tensor maskTensor; + maskTensor.resize(maskDesc); + Tensor outputTensor; + Tensor outputTensorRef; + CHECK_STATUS(reduction_infer_output_size(&inputTensor, maskTensor, p, &outputTensor)); + outputTensor.alloc(); + outputTensorRef.resize(outputTensor.get_desc()); + outputTensorRef.alloc(); + + U32 tmpBytes; + CHECK_STATUS( + reduction_infer_forward_tmp_bytes(inputTensor, p, outputTensor, &tmpBytes, &archInfo)); + Tensor tmpTensor; + tmpTensor.resize(tensor1d(dt, tmpBytes)); + tmpTensor.alloc(); + + if (UT_CHECK) { + CHECK_STATUS(reduction(inputTensor, maskTensor, p, tmpTensor, outputTensor, &archInfo)); + + // naive implement + CHECK_STATUS( + reduction(inputTensor, maskTensor, p, tmpTensor, outputTensorRef, &archInfo_org)); + + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), outputTensor.length(), dt, 1, __FILE__, + __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(reduction(inputTensor, maskTensor, p, tmpTensor, outputTensor, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + U32 on = 1, oc = 1, oh = 1, ow = 1; + CHECK_STATUS(tensor4dGet(outputTensor.get_desc(), &dt, &df, &on, &oc, &oh, &ow)); + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u) %d =(%u %u %u %u)", in, ic, ih, iw, p.axes_num, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "Reduction", params); + double ops = 1.0 
* in * ic * ih * iw; + ut_log(dt, buffer, ops, time); + + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + reductionTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + reductionTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/tests/test_reshape.cpp b/compute/tensor/tests/test_reshape.cpp similarity index 52% rename from tests/test_reshape.cpp rename to compute/tensor/tests/test_reshape.cpp index 7ac1f090..563b1c62 100644 --- a/tests/test_reshape.cpp +++ b/compute/tensor/tests/test_reshape.cpp @@ -1,55 +1,62 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
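As a concrete reading of the REDUCTION_MEAN + keep_dim setup above: reducing over axis 1 of an NCHW input collapses the channel dimension to size 1 and averages across it, e.g. (1, 4, 2, 2) -> (1, 1, 2, 2). A minimal fp32 sketch of that single case (illustrative only):

    // Mean over axis 1 (channels) of NCHW data; output is N x 1 x H x W
    static void reduce_mean_c(const float *in, float *out, int N, int C, int H, int W)
    {
        for (int n = 0; n < N; n++) {
            for (int hw = 0; hw < H * W; hw++) {
                float sum = 0;
                for (int c = 0; c < C; c++) {
                    sum += in[(n * C + c) * H * W + hw];
                }
                out[n * H * W + hw] = sum / C;
            }
        }
    }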
- #include #include #include "tensor_computing.h" #include "ut_util.h" -int reshapeTest(int argc, char** argv, DataType dt) { +int reshapeTest(int argc, char **argv, DataType dt) +{ CHECK_REQUIREMENT(argc > 4); - U32 in = atoi(argv[1]); - U32 ic = atoi(argv[2]); - U32 ih = atoi(argv[3]); - U32 iw = atoi(argv[4]); - I32 shape_size = atoi(argv[5]); - CHECK_REQUIREMENT(argc == 6+shape_size); - std::vector shape(shape_size); - for (I32 i = 0; i < shape_size; i++) { - shape[i] = atoi(argv[6+i]); + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + ReshapeParamSpec p; + p.shape_size = atoi(argv[5]); + CHECK_REQUIREMENT(argc == 6 + p.shape_size); + for (I32 i = 0; i < p.shape_size; i++) { + p.shape_dims[i] = atoi(argv[6 + i]); } + ArchInfo archInfo; + archInfo.arch = UT_ARCH; DataFormat df = DF_NCHW; - TensorDesc in_desc = tensor4df(dt, df, in, ic, ih, iw); - TensorDesc out_desc; - - CHECK_STATUS(reshape_infer_output_size(in_desc, &out_desc, shape.data(), shape_size, UT_ARCH)); - - U32 len = tensorNumElements(in_desc); - U8* input = ut_input_v(len, dt, UT_INIT_RANDOM); - U8* output = ut_input_v(len, dt, UT_INIT_RANDOM); + TensorDesc inDesc = tensor4df(dt, df, in, ic, ih, iw); + U32 len = tensorNumElements(inDesc); + U8 *input = ut_input_v(len, dt, UT_INIT_RANDOM); + Tensor inputTensor; + inputTensor.resize(inDesc); + inputTensor.alloc(); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inDesc)); + + Tensor outputTensor; + CHECK_STATUS(reshape_infer_output_size(&inputTensor, p, &outputTensor, &archInfo)); + outputTensor.alloc(); + TensorDesc outDesc = outputTensor.get_desc(); + Tensor nullTensor; if (UT_CHECK) { - CHECK_STATUS(reshape(in_desc, input, out_desc, output, UT_ARCH)); + CHECK_STATUS(reshape(inputTensor, nullTensor, outputTensor, &archInfo)); - CHECK_REQUIREMENT(tensorNumElements(out_desc) == len); + CHECK_REQUIREMENT(tensorNumElements(outDesc) == len); } double time_start = ut_time_ms(); for (int iter = 0; iter < UT_LOOPS; iter++) { - CHECK_STATUS(reshape(in_desc, input, out_desc, output, UT_ARCH)); + CHECK_STATUS(reshape(inputTensor, nullTensor, outputTensor, &archInfo)); } double time_end = ut_time_ms(); double time = (time_end - time_start) / UT_LOOPS; @@ -58,20 +65,18 @@ int reshapeTest(int argc, char** argv, DataType dt) { char buffer[150]; char params[120]; memset(params, 0, 120); - sprintf(params, "(%u %u %u %u)=(", - in, ic, ih, iw); - for(I32 i = 0; i < shape_size; i++) { + sprintf(params, "(%u %u %u %u)=(", in, ic, ih, iw); + for (I32 i = 0; i < p.shape_size; i++) { I32 index = 0; for (; index < 120; index++) { if (params[index] == '\0') { break; } } - if (i != shape_size-1) { - sprintf(params+index, "%d ", out_desc.dims[out_desc.nDims-1-i]); - } - else { - sprintf(params+index, "%d)", out_desc.dims[out_desc.nDims-1-i]); + if (i != p.shape_size - 1) { + sprintf(params + index, "%d ", outDesc.dims[outDesc.nDims - 1 - i]); + } else { + sprintf(params + index, "%d)", outDesc.dims[outDesc.nDims - 1 - i]); } } sprintf(buffer, "%20s, %80s", "Reshape", params); @@ -79,13 +84,12 @@ int reshapeTest(int argc, char** argv, DataType dt) { ut_log(dt, buffer, ops, time); free(input); - free(output); return 0; } - -int main(int argc, char** argv) { +int main(int argc, char **argv) +{ #ifdef _USE_FP16 reshapeTest(argc, argv, DT_F16); #endif diff --git a/compute/tensor/tests/test_reshape_ocl.cpp b/compute/tensor/tests/test_reshape_ocl.cpp new file mode 100644 index 00000000..694cb4d1 --- /dev/null +++ 
b/compute/tensor/tests/test_reshape_ocl.cpp @@ -0,0 +1,162 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifdef _USE_FP16 +#include "tensor_computing.h" +#include "ut_util.h" +#include "gcl.h" +#include "libkernelsource.h" +inline GCLMem_t alloc(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_map(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->mapped_alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_bytes(Tensor tensor, U32 size) +{ + auto mem = (OclMemory *)tensor.get_memory(); + GCLMem_t ptr = NULL; + if (size > 0) { + mem->resize(tensor1d(DT_U8, size)); + mem->alloc(); + ptr = (GCLMem_t)mem->get_ptr(); + } + return ptr; +} + +int reshapeTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc > 4); + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + ReshapeParamSpec p; + p.shape_size = atoi(argv[5]); + CHECK_REQUIREMENT(argc == 6 + p.shape_size); + for (I32 i = 0; i < p.shape_size; i++) { + p.shape_dims[i] = atoi(argv[6 + i]); + } + + ArchInfo archInfo; + archInfo.arch = MALI; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + DataFormat df = DF_NCHW; + TensorDesc inputDesc = tensor4df(dt, df, in, ic, ih, iw); + U32 len = tensorNumElements(inputDesc); + U8 *input_cpu = ut_input_v(len, dt, UT_INIT_RANDOM); + + Tensor inputTensorCpu; + inputTensorCpu.resize(inputDesc); + inputTensorCpu.alloc(); + memcpy(get_ptr_from_tensor(inputTensorCpu, UT_ARCH), input_cpu, tensorNumBytes(inputDesc)); + + Tensor outputTensorCpu; + Tensor tmpTensorCpu; + CHECK_STATUS(reshape_infer_output_size(&inputTensorCpu, p, &outputTensorCpu, &archInfo_org)); + outputTensorCpu.alloc(); + CHECK_STATUS(reshape(inputTensorCpu, tmpTensorCpu, outputTensorCpu, &archInfo_org)); + + U8 *output_gpu = NULL; + std::shared_ptr handleSharedPtr = OCLContext::getInstance().handle; + GCLHandle_t handle = handleSharedPtr.get(); + std::vector kernelVec; + handle->kernelVec = &kernelVec; + Tensor inputTensor = Tensor(OCLMem); + Tensor outputTensor = Tensor(OCLMem); + Tensor tmpTensor = Tensor(OCLMem); + inputTensor.resize(inputDesc); + + MaliPara maliPara; + maliPara.handle = handle; + archInfo.archPara = &maliPara; + CHECK_STATUS(reshape_infer_output_size(&inputTensor, p, &outputTensor, &archInfo)); + TensorDesc outputDesc = 
outputTensor.get_desc(); + U32 on, oc, oh, ow; + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + + GCLMem_t output = alloc_map(outputTensor); + GCLMem_t input = alloc(inputTensor); + CHECK_STATUS(gcl_fill_memory_zero(handle, input)); + + U32 maxBytes = 0; + U32 tmpBytes = 0; + tmpBytes = tensorNumBytes(inputDesc); + maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes; + GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes); + + CHECK_STATUS(ocl_set_input(handle, input, inputDesc, input_cpu, tmpbuf, true)); + CHECK_STATUS(reshape(inputTensor, tmpTensor, outputTensor, &archInfo)); + + /*warm up*/ + UNI_INFO_LOG("Warm up gpu:\n") + for (U32 i = 0; i < 2; i++) { + CHECK_STATUS(gcl_run_kernelVec(handle)); + } + + UNI_INFO_LOG("Run:\n") +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size())); + double time = handle->t_execute * 0.001; +#else + CHECK_STATUS(gcl_run_kernelVec(handle)); +#endif + outputDesc = outputTensor.get_desc(); + CHECK_STATUS(ocl_get_output(handle, output, outputDesc, true)); + output_gpu = output->mapPtrArray.back(); + + char buffer[150]; + char params[120]; + memset(params, 0, 120); + sprintf(params, "(%u %u %u %u)=(", in, ic, ih, iw); + for (I32 i = 0; i < p.shape_size; i++) { + I32 index = 0; + for (; index < 120; index++) { + if (params[index] == '\0') { + break; + } + } + if (i != p.shape_size - 1) { + sprintf(params + index, "%d ", outputDesc.dims[outputDesc.nDims - 1 - i]); + } else { + sprintf(params + index, "%d)", outputDesc.dims[outputDesc.nDims - 1 - i]); + } + } + sprintf(buffer, "%20s, %80s", "Reshape", params); +#ifdef _DEBUG + double ops = len; + ut_log(dt, buffer, ops, time); +#endif + ut_check_a(output_gpu, get_ptr_from_tensor(outputTensorCpu, UT_ARCH), len, dt); + CHECK_STATUS(gcl_finish(handle)); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + free(input_cpu); + return 0; +} + +int main(int argc, char **argv) +{ + reshapeTest(argc, argv, DT_F16); + return 0; +} +#endif diff --git a/compute/tensor/tests/test_rnn.cpp b/compute/tensor/tests/test_rnn.cpp new file mode 100644 index 00000000..b2b14238 --- /dev/null +++ b/compute/tensor/tests/test_rnn.cpp @@ -0,0 +1,151 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
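Reshape only rewrites the descriptor: the element count and the flat row-major order are preserved, which is exactly what the length checks in these two reshape tests assert. An illustrative case, using the same descriptor helpers as the tests:

    TensorDesc src = tensor4df(DT_F16, DF_NCHW, 1, 4, 2, 2);
    // after reshape with shape_dims = {8, 2}:
    // tensorNumElements(dst) == tensorNumElements(src) == 16,
    // and the 16 values keep their original linear order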
+ +#include + +#include "tensor_computing.h" +#include "ut_util.h" + +int rnnTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 5); + U32 batch = atoi(argv[1]); + U32 step = atoi(argv[2]); + U32 xDim = atoi(argv[3]); + U32 hDim = atoi(argv[4]); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + RNNParamSpec rnnParamSpec; + rnnParamSpec.mode = RNN_LSTM; + rnnParamSpec.biDirection = false; + rnnParamSpec.numOutput = hDim; + rnnParamSpec.numProjection = 1024; + rnnParamSpec.forgetBias = 1.0; + rnnParamSpec.activationMode = ACTIVATION_TANH; + rnnParamSpec.zoneoutCell = 0; + rnnParamSpec.zoneoutOutput = 0; + F32 threshold = 10; + if (rnnParamSpec.numProjection > 0) { + threshold = 40; + } + + U32 column = (rnnParamSpec.numProjection > 0) ? rnnParamSpec.numProjection + : rnnParamSpec.numOutput; + U32 num2 = (rnnParamSpec.numProjection > 0) ? 2 : 1; + TensorDesc inputDesc = tensor3df(dt, DF_MTK, batch, step, xDim); + Tensor inputTensor; + inputTensor.resize(inputDesc); + inputTensor.alloc(); + U32 inputLength = batch * step * xDim; + U8 *input = ut_input_v(inputLength, dt, UT_INIT_RANDOM); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inputDesc)); + + U32 tmpBytes; + std::vector filterDesc(2), biasDesc(2); + filterDesc[0] = tensor2df(dt, DF_NK, 4 * column, xDim + hDim); + filterDesc[1] = tensor2df(dt, DF_NK, rnnParamSpec.numOutput, rnnParamSpec.numProjection); + biasDesc[0] = tensor1d(dt, column * 4); + biasDesc[1] = tensor1d(dt, rnnParamSpec.numOutput); + std::vector filterTensor(num2), biasTensor(num2); + for (U32 i = 0; i < num2; i++) { + filterTensor[i].resize(filterDesc[i]); + filterTensor[i].alloc(); + U8 *filter = ut_input_v(tensorNumBytes(filterDesc[i]) / bytesOf(dt), dt, UT_INIT_RANDOM); + memcpy(get_ptr_from_tensor(filterTensor[i], UT_ARCH), filter, tensorNumBytes(filterDesc[i])); + free(filter); + + biasTensor[i].resize(biasDesc[i]); + biasTensor[i].alloc(); + U8 *bias = ut_input_v(tensorNumBytes(biasDesc[i]) / bytesOf(dt), dt, UT_INIT_RANDOM); + memcpy(get_ptr_from_tensor(biasTensor[i], UT_ARCH), bias, tensorNumBytes(biasDesc[i])); + free(bias); + } + + // set output + Tensor outputTensor, outputTensorRef; + CHECK_STATUS(rnn_infer_output_size(&inputTensor, rnnParamSpec, &outputTensor, &archInfo)); + outputTensor.alloc(); + U32 outputLength = outputTensor.length(); + + TensorDesc outputDesc_ref = outputTensor.get_desc(); + outputTensorRef.resize(outputDesc_ref); + outputTensorRef.alloc(); + + CHECK_STATUS(rnn_infer_forward_tmp_bytes( + inputTensor, filterTensor[0], outputTensor, rnnParamSpec, &tmpBytes, &archInfo)); + std::vector ftmBytes(num2); + CHECK_STATUS(rnn_transform_filter_bytes(filterTensor, rnnParamSpec, ftmBytes.data(), &archInfo)); + std::vector ftmTensor(num2); + std::vector ftmTensorPtr(num2); + for (U32 i = 0; i < num2; i++) { + ftmTensor[i].resize(tensor1d(DT_U8, ftmBytes[i])); + ftmTensor[i].alloc(); + ftmTensorPtr[i] = &ftmTensor[i]; + } + + Tensor tmpTensor; + tmpTensor.resize(tensor1d(DT_U8, tmpBytes)); + tmpTensor.alloc(); + + CHECK_STATUS(rnn_transform_filter(filterTensor, rnnParamSpec, ftmTensorPtr, &archInfo)); + + if (UT_CHECK) { + CHECK_STATUS(rnn( + inputTensor, ftmTensor, biasTensor, rnnParamSpec, tmpTensor, outputTensor, &archInfo)); + + // naive implement + CHECK_STATUS(rnn(inputTensor, ftmTensor, biasTensor, rnnParamSpec, tmpTensor, + outputTensorRef, &archInfo_org)); + + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + 
get_ptr_from_tensor(outputTensorRef, UT_ARCH), outputLength, dt, threshold, __FILE__, + __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(rnn( + inputTensor, ftmTensor, biasTensor, rnnParamSpec, tmpTensor, outputTensor, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, "%u (%u %u %u)=(%u %u)", batch, step, xDim, hDim, batch, hDim); + sprintf(buffer, "%20s, %80s", "RNN", params); + double hxDim = hDim + xDim; + double ops = 1.0 * batch * step * + (2.0 * hxDim * column * 4 + column * 4 + rnnParamSpec.numProjection * rnnParamSpec.numOutput); + ut_log(dt, buffer, ops, time); + + free(input); + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + rnnTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + rnnTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/compute/tensor/tests/test_roialign.cpp b/compute/tensor/tests/test_roialign.cpp new file mode 100644 index 00000000..931f5e7a --- /dev/null +++ b/compute/tensor/tests/test_roialign.cpp @@ -0,0 +1,131 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
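The ops estimate in the RNN test above counts the two matrix products of a projected LSTM step, which may help when reading it: the four gates are computed against the concatenated [x; h] vector, and the cell output is then projected back down to numOutput. Schematically:

    // per (batch, step):
    //   gates:      [4 * column x (xDim + hDim)] * [x; h] -> 2 * (xDim + hDim) * 4 * column flops
    //   bias add:   4 * column
    //   projection: [numOutput x numProjection] * state   -> counted as numProjection * numOutput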
+ +#include "tensor_computing.h" +#include "ut_util.h" + +int roialignTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 16); + // in0 feature map + U32 in0 = atoi(argv[1]); + U32 ic0 = atoi(argv[2]); + U32 ih0 = atoi(argv[3]); + U32 iw0 = atoi(argv[4]); + // in1 rois + U32 ih1 = atoi(argv[5]); + U32 iw1 = atoi(argv[6]); + // in2 batch_indices + U32 ilens2 = atoi(argv[7]); + // output + U32 on0 = atoi(argv[8]); + U32 oc0 = atoi(argv[9]); + U32 oh0 = atoi(argv[10]); + U32 ow0 = atoi(argv[11]); + // p + U32 output_h = atoi(argv[12]); + U32 output_w = atoi(argv[13]); + U32 sampling_ratio = atoi(argv[14]); + F32 spatial_scale = (F32)atof(argv[15]); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + RoiAlignParamSpec p; + p.output_h = output_h; + p.output_w = output_w; + p.sampling_ratio = sampling_ratio; + p.spatial_scale = spatial_scale; + + std::vector inputTensors(3); + std::vector inputTensorPtrs(3); + TensorDesc inputDesc_feat = tensor4d(dt, in0, ic0, ih0, iw0); + TensorDesc inputDesc_rois = tensor2d(dt, ih1, iw1); + TensorDesc inputDesc_batch = tensor1d(dt, ilens2); + Tensor inputTensor_feat = Tensor::alloc_sized(inputDesc_feat); + Tensor inputTensor_rois = Tensor::alloc_sized(inputDesc_rois); + Tensor inputTensor_batch = Tensor::alloc_sized(inputDesc_batch); + U32 input_len_feat = tensorNumElements(inputDesc_feat); + U32 input_len_rois = tensorNumElements(inputDesc_rois); + U32 input_len_batch = tensorNumElements(inputDesc_batch); + U8 *input_feat = ut_input_v(input_len_feat, dt, UT_INIT_RANDOM); + U8 *input_rois = ut_input_v(input_len_rois, dt, UT_INIT_RANDOM); + U8 *input_batch = ut_input_v(input_len_batch, dt, UT_INIT_ZERO); + memcpy( + get_ptr_from_tensor(inputTensor_feat, UT_ARCH), input_feat, tensorNumBytes(inputDesc_feat)); + memcpy( + get_ptr_from_tensor(inputTensor_rois, UT_ARCH), input_rois, tensorNumBytes(inputDesc_rois)); + memcpy(get_ptr_from_tensor(inputTensor_batch, UT_ARCH), input_batch, + tensorNumBytes(inputDesc_batch)); + inputTensors[0] = inputTensor_feat; + inputTensors[1] = inputTensor_rois; + inputTensors[2] = inputTensor_batch; + inputTensorPtrs[0] = &inputTensors[0]; + inputTensorPtrs[1] = &inputTensors[1]; + inputTensorPtrs[2] = &inputTensors[2]; + + // set output + Tensor outputTensor, outputTensorRef; + CHECK_STATUS(roialign_infer_output_size(inputTensorPtrs, p, &outputTensor, &archInfo)); + outputTensor.alloc(); + TensorDesc outputDesc_ref = outputTensor.get_desc(); + outputTensorRef.resize(outputDesc_ref); + outputTensorRef.alloc(); + U32 output_len = outputTensor.length(); + CHECK_REQUIREMENT(ih1 == on0 && ic0 == oc0 && output_h == oh0 && output_w == ow0); + CHECK_REQUIREMENT(input_len_feat == in0 * ic0 * ih0 * iw0 && input_len_rois == ih1 * iw1 && + input_len_batch == ilens2 && output_len == on0 * oc0 * oh0 * ow0); + + if (UT_CHECK) { + CHECK_STATUS(roialign(inputTensors, p, outputTensor, &archInfo)); + CHECK_STATUS(roialign(inputTensors, p, outputTensorRef, &archInfo_org)); + // check + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), output_len, dt, 0.05, __FILE__, __LINE__); + } + + // benchmark + double time_start = ut_time_ms(); + for (int iter = 0; iter < UT_LOOPS; iter++) { + CHECK_STATUS(roialign(inputTensors, p, outputTensor, &archInfo)); + } + double time_end = ut_time_ms(); + double time = (time_end - time_start) / UT_LOOPS; + + // log performance data + char buffer[150]; + char params[120]; + sprintf(params, 
"(%u %u %u %u) * (%u %u) * (%u) * (%u %u) = (%u %u %u %u)", in0, ic0, ih0, iw0, + ih1, iw1, ilens2, output_h, output_w, on0, oc0, oh0, ow0); + sprintf(buffer, "%20s, %80s", "Roialign", params); + double ops = 1.0 * output_len; + ut_log(dt, buffer, ops, time); + + free(input_feat); + free(input_rois); + free(input_batch); + + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + roialignTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + roialignTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/tests/test_scale.cpp b/compute/tensor/tests/test_scale.cpp similarity index 54% rename from tests/test_scale.cpp rename to compute/tensor/tests/test_scale.cpp index 0d8ccd22..0eb788c2 100644 --- a/tests/test_scale.cpp +++ b/compute/tensor/tests/test_scale.cpp @@ -1,53 +1,66 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- #include #include "tensor_computing.h" #include "ut_util.h" -int scaleTest(int argc, char** argv, DataType dt){ +int scaleTest(int argc, char **argv, DataType dt) +{ CHECK_REQUIREMENT(argc == 5); U32 in = atoi(argv[1]); U32 ic = atoi(argv[2]); U32 ih = atoi(argv[3]); U32 iw = atoi(argv[4]); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; - I32 axis = 1; + ScaleParamSpec p; + p.axis = 1; DataFormat df = DF_NCHWC8; - TensorDesc data_desc = tensor4df(dt, df, in, ic, ih, iw); - U32 len = tensorNumElements(data_desc); - - U8* alpha = ut_input_v(ic, dt, UT_INIT_RANDOM); - U8* beta = ut_input_v(ic, dt, UT_INIT_RANDOM); - U8* data = ut_input_v(len, dt, UT_INIT_RANDOM); - U8* data_ref = ut_input_v(len, dt, UT_INIT_ZERO); - memcpy(data_ref, data, len*bytesOf(dt)); + TensorDesc inDesc = tensor4df(dt, df, in, ic, ih, iw); + U32 len = tensorNumElements(inDesc); + U8 *data = ut_input_v(len, dt, UT_INIT_RANDOM); + + Tensor dataTensor; + Tensor dataTensorRef; + dataTensor.resize(inDesc); + dataTensorRef.resize(inDesc); + dataTensor.alloc(); + dataTensorRef.alloc(); + memcpy(get_ptr_from_tensor(dataTensor, UT_ARCH), data, tensorNumBytes(inDesc)); + memcpy(get_ptr_from_tensor(dataTensorRef, UT_ARCH), data, tensorNumBytes(inDesc)); + + U8 *alpha = ut_input_v(ic, dt, UT_INIT_RANDOM); + U8 *beta = ut_input_v(ic, dt, UT_INIT_RANDOM); if (UT_CHECK) { - CHECK_STATUS(scale(data_desc, data, axis, alpha, beta, data_desc, data, UT_ARCH)); + CHECK_STATUS(scale(dataTensor, alpha, beta, p, dataTensor, &archInfo)); // naive implement - CHECK_STATUS(scale(data_desc, data_ref, axis, alpha, beta, data_desc, data_ref, CPU_GENERAL)); + CHECK_STATUS(scale(dataTensorRef, alpha, beta, p, dataTensorRef, &archInfo_org)); // check - ut_check_v(data, data_ref, len, dt, 1.0, __FILE__, __LINE__); + ut_check_v(get_ptr_from_tensor(dataTensor, UT_ARCH), + get_ptr_from_tensor(dataTensorRef, UT_ARCH), len, dt, 1.0, __FILE__, __LINE__); } // benchmark double time_start = ut_time_ms(); for (int iter = 0; iter < UT_LOOPS; iter++) { - CHECK_STATUS(scale(data_desc, data, axis, alpha, beta, data_desc, data, UT_ARCH)); + CHECK_STATUS(scale(dataTensor, alpha, beta, p, dataTensor, &archInfo)); } double time_end = ut_time_ms(); double time = (time_end - time_start) / UT_LOOPS; @@ -55,21 +68,18 @@ int scaleTest(int argc, char** argv, DataType dt){ // log performance data char buffer[150]; char params[120]; - sprintf(params, "(%u %u %u %u)=(%u %u %u %u)", - in, ic, ih, iw, - in, ic, ih, iw); + sprintf(params, "(%u %u %u %u)=(%u %u %u %u)", in, ic, ih, iw, in, ic, ih, iw); sprintf(buffer, "%20s, %80s", "Scale", params); double ops = 2.0 * in * ic * ih * iw; - ut_log(dt, buffer, ops, time/UT_LOOPS); + ut_log(dt, buffer, ops, time); free(data); - free(data_ref); return 0; } - -int main(int argc, char** argv) { +int main(int argc, char **argv) +{ #ifdef _USE_FP16 scaleTest(argc, argv, DT_F16); #endif diff --git a/tests/test_slice.cpp b/compute/tensor/tests/test_slice.cpp similarity index 54% rename from tests/test_slice.cpp rename to compute/tensor/tests/test_slice.cpp index 17736755..e5f1b996 --- a/tests/test_slice.cpp +++ b/compute/tensor/tests/test_slice.cpp @@ -1,62 +1,71 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
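The scale operator checked above is the per-channel affine y = alpha[c] * x + beta[c], which is where the 2 ops-per-element figure comes from. A minimal plain-NCHW sketch, assuming fp32 (the test runs on DF_NCHWC8, but the arithmetic per element is identical):

    // y = alpha[c] * x + beta[c] over flattened NCHW data
    static void scale_ref(const float *x, const float *alpha, const float *beta, float *y,
        int n, int c, int hw)
    {
        for (int i = 0; i < n * c * hw; i++) {
            int ci = (i / hw) % c;
            y[i] = alpha[ci] * x[i] + beta[ci];
        }
    }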
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include #include "tensor_computing.h" #include "ut_util.h" -int sliceTest(int argc, char** argv, DataType dt) { +int sliceTest(int argc, char **argv, DataType dt) +{ CHECK_REQUIREMENT(argc > 2); I32 num = atoi(argv[1]); - CHECK_REQUIREMENT(argc == 2+4+1+num-1); - U32 in = atoi(argv[2]); - U32 ic = atoi(argv[3]); - U32 ih = atoi(argv[4]); - U32 iw = atoi(argv[5]); - I32 axis= atoi(argv[6]); - std::vector slice_point(num); - for (I32 i = 0; i < num-1; i++) { - slice_point[i] = atoi(argv[7+i]); + CHECK_REQUIREMENT(argc == 2 + 4 + 1 + num - 1); + U32 in = atoi(argv[2]); + U32 ic = atoi(argv[3]); + U32 ih = atoi(argv[4]); + U32 iw = atoi(argv[5]); + SliceParamSpec p; + p.axis = atoi(argv[6]); + p.slice_size = num - 1; + for (U32 i = 0; i < p.slice_size; i++) { + p.slice_points[i] = atoi(argv[7 + i]); } + ArchInfo archInfo; + archInfo.arch = UT_ARCH; DataFormat df = DF_NCHW; - TensorDesc in_desc = tensor4df(dt, df, in, ic, ih, iw); - std::vector out_desc(num); - - CHECK_STATUS(slice_infer_output_size(in_desc, &out_desc, axis, slice_point.data(), UT_ARCH)); - std::vector output(num); + TensorDesc inDesc = tensor4df(dt, df, in, ic, ih, iw); + U32 len = tensorNumElements(inDesc); + U8 *input = ut_input_v(len, dt, UT_INIT_RANDOM); + Tensor inputTensor; + inputTensor.resize(inDesc); + inputTensor.alloc(); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inDesc)); + + std::vector outputTensors(num); + std::vector outputTensorsPtr(num); for (I32 i = 0; i < num; i++) { - output[i] = (void*)ut_input_v(tensorNumElements(out_desc[i]), dt, UT_INIT_ZERO); + outputTensorsPtr[i] = &outputTensors[i]; + } + CHECK_STATUS(slice_infer_output_size(&inputTensor, p, outputTensorsPtr, &archInfo)); + for (I32 i = 0; i < num; i++) { + outputTensors[i].alloc(); } - - U32 len = 
tensorNumElements(in_desc); - U8* input = ut_input_v(len, dt, UT_INIT_RANDOM); if (UT_CHECK) { - CHECK_STATUS(slice(in_desc, input, axis, out_desc, &output, UT_ARCH)); + CHECK_STATUS(slice(inputTensor, p, outputTensors, &archInfo)); U32 tmp = 0; for (I32 i = 0; i < num; i++) { - tmp += tensorNumElements(out_desc[i]); + tmp += outputTensors[i].length(); } CHECK_REQUIREMENT(tmp == len); } double time_start = ut_time_ms(); for (int iter = 0; iter < UT_LOOPS; iter++) { - CHECK_STATUS(slice(in_desc, input, axis, out_desc, &output, UT_ARCH)); + CHECK_STATUS(slice(inputTensor, p, outputTensors, &archInfo)); } double time_end = ut_time_ms(); double time = (time_end - time_start) / UT_LOOPS; @@ -64,9 +73,7 @@ int sliceTest(int argc, char** argv, DataType dt) { // log performance data char buffer[150]; char params[120]; - sprintf(params, "(%u %u %u %u)=(%u %u %u %u)/%u", - in, ic, ih, iw, - in, ic, ih, iw, num); + sprintf(params, "(%u %u %u %u)=(%u %u %u %u)/%u", in, ic, ih, iw, in, ic, ih, iw, num); sprintf(buffer, "%20s, %80s", "Slice", params); double ops = num * len; ut_log(dt, buffer, ops, time); @@ -76,7 +83,8 @@ int sliceTest(int argc, char** argv, DataType dt) { return 0; } -int main(int argc, char** argv) { +int main(int argc, char **argv) +{ #ifdef _USE_FP16 sliceTest(argc, argv, DT_F16); #endif diff --git a/tests/test_softmax.cpp b/compute/tensor/tests/test_softmax.cpp similarity index 52% rename from tests/test_softmax.cpp rename to compute/tensor/tests/test_softmax.cpp index a011c446..8bed13af 100644 --- a/tests/test_softmax.cpp +++ b/compute/tensor/tests/test_softmax.cpp @@ -1,47 +1,58 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
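On the slice parameters above: the num - 1 slice_points are absolute offsets along axis, producing num outputs, with the last output taking the remainder; the CHECK_REQUIREMENT that the output lengths sum to the input length is the corresponding invariant. For example:

    // ic = 8, axis = 1, slice_points = {2} -> two outputs with 2 and 6 channels;
    // outputTensors[0].length() + outputTensors[1].length() == tensorNumElements(inDesc)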
- #include "tensor_computing.h" #include "ut_util.h" -int softmaxTest(int argc, char** argv, DataType dt){ +int softmaxTest(int argc, char **argv, DataType dt) +{ CHECK_REQUIREMENT(argc == 2); + SoftmaxParamSpec p; U32 len = atoi(argv[1]); - U32 axis = 1; - - TensorDesc in_desc, out_desc; - in_desc = tensor2df(dt, DF_NORMAL, 1, len); - CHECK_STATUS(softmax_infer_output_size(in_desc, &out_desc, UT_ARCH)); + p.axis = 1; + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + TensorDesc inDesc = tensor2df(dt, DF_NORMAL, 1, len); + U8 *input = ut_input_v(len, dt, UT_INIT_RANDOM); + Tensor inputTensor = Tensor::alloc_sized(inDesc); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inDesc)); - U8* in = ut_input_v(len, dt, UT_INIT_RANDOM); - U8* out = ut_input_v(len, dt, UT_INIT_ZERO); - U8* out_ref = ut_input_v(len, dt, UT_INIT_ZERO); + Tensor outputTensor; + CHECK_STATUS(softmax_infer_output_size(&inputTensor, &outputTensor, &archInfo)); + outputTensor.alloc(); + Tensor outputTensorRef = Tensor::alloc_sized(outputTensor.get_desc()); - if(UT_CHECK){ - CHECK_STATUS(softmax(in_desc, in, axis, out_desc, out, UT_ARCH)); + Tensor blankTensor; + + if (UT_CHECK) { + CHECK_STATUS(softmax(inputTensor, p, blankTensor, outputTensor, &archInfo)); // naive implement - CHECK_STATUS(softmax(in_desc, in, axis, out_desc, out_ref, CPU_GENERAL)); + CHECK_STATUS(softmax(inputTensor, p, blankTensor, outputTensorRef, &archInfo_org)); // check - ut_check_v(out, out_ref, len, dt, 0.1, __FILE__, __LINE__); + ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), + get_ptr_from_tensor(outputTensorRef, UT_ARCH), outputTensor.length(), dt, 0.1, __FILE__, + __LINE__); } // benchmark double time_start = ut_time_ms(); - for(int iter=0; iter +#include "tensor_computing.h" +#include "ut_util.h" +#include "gcl.h" +#include "libkernelsource.h" +inline GCLMem_t alloc(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_map(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->mapped_alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_bytes(Tensor tensor, U32 size) +{ + auto mem = (OclMemory *)tensor.get_memory(); + GCLMem_t ptr = NULL; + if (size > 0) { + mem->resize(tensor1d(DT_U8, size)); + mem->alloc(); + ptr = (GCLMem_t)mem->get_ptr(); + } + return ptr; +} + +int softmaxTest(int argc, char **argv, DataType dt) +{ + U32 in, ic, ih, iw; + in = 1; + ic = 4; + ih = 1; + iw = 1; + + if (argc == 2) { + ic = atoi(argv[1]); + } + + SoftmaxParamSpec p; + p.axis = 1; + + ArchInfo archInfo; + archInfo.arch = MALI; + ArchInfo archInfo_org; + archInfo_org.arch = CPU_GENERAL; + + TensorDesc in_desc, in_desc_gpu, out_desc; + in_desc = tensor4df(dt, DF_NCHW, in, ic, ih, iw); + + U8 *input_cpu = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM); + U8 *output_gpu = NULL; + in_desc_gpu = tensor4df(dt, DF_NCHW, in, ic, ih, iw); + + std::shared_ptr handleSharedPtr = OCLContext::getInstance().handle; + GCLHandle_t handle = handleSharedPtr.get(); + std::vector kernelVec; + handle->kernelVec = &kernelVec; + Tensor inputTensor = Tensor(OCLMem); + Tensor outputTensor = Tensor(OCLMem); + Tensor tmpTensor = Tensor(OCLMem); + inputTensor.resize(in_desc_gpu); + + MaliPara maliPara; + maliPara.handle = handle; + archInfo.archPara = &maliPara; + CHECK_STATUS(softmax_infer_output_size(&inputTensor, &outputTensor, &archInfo)); + U32 maxBytes = 0; + U32 
tmpBytes; + CHECK_STATUS(softmax_infer_forward_tmp_bytes(inputTensor, &tmpBytes, &archInfo)); + maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes; + + GCLMem_t output = alloc_map(outputTensor); + GCLMem_t input = alloc(inputTensor); + CHECK_STATUS(gcl_fill_memory_zero(handle, input)); + tmpBytes = tensorNumBytes(in_desc_gpu); + maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes; + GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes); + + CHECK_STATUS(ocl_set_input(handle, input, in_desc_gpu, input_cpu, tmpbuf, true)); + CHECK_STATUS(softmax(inputTensor, p, tmpTensor, outputTensor, &archInfo)); + /*warm up*/ + UNI_INFO_LOG("Warm up gpu:\n") + for (U32 i = 0; i < 2; i++) { + CHECK_STATUS(gcl_run_kernelVec(handle)); + } + + UNI_INFO_LOG("Run:\n") +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size())); + double time = handle->t_execute * 0.001; +#else + CHECK_STATUS(gcl_run_kernelVec(handle)); +#endif + out_desc = outputTensor.get_desc(); + CHECK_STATUS(ocl_get_output(handle, output, out_desc, true)); + output_gpu = output->mapPtrArray.back(); + + char buffer[150]; + char params[120]; + sprintf(params, "(%u %u %u %u)", in, ic, ih, iw); + sprintf(buffer, "%20s, %80s", "softmax_h1w1", params); +#ifdef _DEBUG + double ops = 1; + ut_log(dt, buffer, ops, time); +#endif + Tensor inputTensorCpu; + inputTensorCpu.resize(in_desc); + inputTensorCpu.alloc(); + memcpy(get_ptr_from_tensor(inputTensorCpu, UT_ARCH), input_cpu, tensorNumBytes(in_desc)); + + Tensor outputTensorCpu; + outputTensorCpu.resize(out_desc); + outputTensorCpu.alloc(); + + Tensor tmpTensorCpu; + CHECK_STATUS(softmax(inputTensorCpu, p, tmpTensorCpu, outputTensorCpu, &archInfo_org)); + + ut_check_a(output_gpu, get_ptr_from_tensor(outputTensorCpu, UT_ARCH), in * ih * iw * ic, dt); + CHECK_STATUS(gcl_finish(handle)); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + + free(input_cpu); + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + softmaxTest(argc, argv, DT_F16); +#endif + return 0; +} diff --git a/tests/test_split.cpp b/compute/tensor/tests/test_split.cpp similarity index 56% rename from tests/test_split.cpp rename to compute/tensor/tests/test_split.cpp index 6f73e3bc..a75f6431 100644 --- a/tests/test_split.cpp +++ b/compute/tensor/tests/test_split.cpp @@ -1,53 +1,63 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include #include "tensor_computing.h" #include "ut_util.h" -int splitTest(int argc, char** argv, DataType dt) { +int splitTest(int argc, char **argv, DataType dt) +{ CHECK_REQUIREMENT(argc == 6); I32 num = atoi(argv[1]); - U32 in = atoi(argv[2]); - U32 ic = atoi(argv[3]); - U32 ih = atoi(argv[4]); - U32 iw = atoi(argv[5]); + U32 in = atoi(argv[2]); + U32 ic = atoi(argv[3]); + U32 ih = atoi(argv[4]); + U32 iw = atoi(argv[5]); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; DataFormat df = DF_NCHWC8; - TensorDesc in_desc = tensor4df(dt, df, in, ic, ih, iw); - std::vector<TensorDesc> out_desc(num); - - CHECK_STATUS(split_infer_output_size(in_desc, &out_desc)); - std::vector<void *> output(num); + TensorDesc inDesc = tensor4df(dt, df, in, ic, ih, iw); + U32 len = tensorNumElements(inDesc); + U8 *input = ut_input_v(len, dt, UT_INIT_RANDOM); + Tensor inputTensor; + inputTensor.resize(inDesc); + inputTensor.alloc(); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inDesc)); + + std::vector<Tensor> outputTensors(num); + std::vector<Tensor *> outputTensorsPtr(num); + for (I32 i = 0; i < num; i++) { + outputTensorsPtr[i] = &outputTensors[i]; + } + CHECK_STATUS(split_infer_output_size(&inputTensor, outputTensorsPtr)); for (I32 i = 0; i < num; i++) { - output[i] = (void*)ut_input_v(tensorNumElements(out_desc[i]), dt, UT_INIT_ZERO); + outputTensors[i].alloc(); } - U32 len = tensorNumElements(in_desc); - U8* input = ut_input_v(len, dt, UT_INIT_RANDOM); if (UT_CHECK) { - CHECK_STATUS(split(in_desc, input, out_desc, &output, UT_ARCH)); + CHECK_STATUS(split(inputTensor, outputTensors, &archInfo)); for (I32 i = 0; i < num; i++) { - ut_check_v(output[i], input, len, dt, 0, __FILE__, __LINE__); + ut_check_v(get_ptr_from_tensor(outputTensors[i], UT_ARCH), input, len, dt, 0, __FILE__, + __LINE__); } } double time_start = ut_time_ms(); for (int iter = 0; iter < UT_LOOPS; iter++) { - CHECK_STATUS(split(in_desc, input, out_desc, &output, UT_ARCH)); + CHECK_STATUS(split(inputTensor, outputTensors, &archInfo)); } double time_end = ut_time_ms(); double time = (time_end - time_start) / UT_LOOPS; @@ -55,9 +65,7 @@ int splitTest(int argc, char** argv, DataType dt) { // log performance data char buffer[150]; char params[120]; - sprintf(params, "(%u %u %u %u)=(%u %u %u %u)*%u", - in, ic, ih, iw, - in, ic, ih, iw, num); + sprintf(params, "(%u %u %u %u)=(%u %u %u %u)*%u", in, ic, ih, iw, in, ic, ih, iw, num); sprintf(buffer, "%20s, %80s", "Split", params); double ops = num * len; ut_log(dt, buffer, ops, time); @@ -67,7 +75,8 @@ int splitTest(int argc, char** argv, DataType dt) { return 0; } -int main(int argc, char** argv) { +int main(int argc, char **argv) +{ #ifdef _USE_FP16 splitTest(argc, argv, DT_F16); #endif diff --git a/compute/tensor/tests/test_tile.cpp b/compute/tensor/tests/test_tile.cpp new file mode 100644 index 00000000..657e6b3b --- /dev/null +++
b/compute/tensor/tests/test_tile.cpp @@ -0,0 +1,63 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + +#include "tensor_computing.h" +#include "ut_util.h" + +int tileTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 7); + // input dim + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + // input axis and tiles + TileParamSpec tileParamSpec; + tileParamSpec.axis = atoi(argv[5]); + tileParamSpec.dimsSize = 0; + tileParamSpec.repeatsInfo[0] = atoi(argv[6]); + + // set input + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + DataFormat df = DF_NCHW; + TensorDesc inDesc = tensor4df(dt, df, in, ic, ih, iw); + U32 len = tensorNumElements(inDesc); + U8 *input = ut_input_v(len, dt, UT_INIT_RANDOM); + Tensor inputTensor = Tensor::alloc_sized<CPUMem>(inDesc); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, inputTensor.bytes()); + + // set output + Tensor outputTensor; + CHECK_STATUS(tile_infer_output_size(&inputTensor, tileParamSpec, &outputTensor, &archInfo)); + outputTensor.alloc(); + if (UT_CHECK) { + CHECK_STATUS(tile(inputTensor, tileParamSpec, outputTensor, &archInfo)); + + CHECK_REQUIREMENT(outputTensor.length() == (len * tileParamSpec.repeatsInfo[0])); + } + + free(input); + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + tileTest(argc, argv, DT_F16); +#endif +#ifdef _USE_FP32 + tileTest(argc, argv, DT_F32); +#endif + return 0; +} diff --git a/tests/test_transpose.cpp b/compute/tensor/tests/test_transpose.cpp similarity index 50% rename from tests/test_transpose.cpp rename to compute/tensor/tests/test_transpose.cpp index 587a4a50..48f9decc 100644 --- a/tests/test_transpose.cpp +++ b/compute/tensor/tests/test_transpose.cpp @@ -1,61 +1,69 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
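Before moving on to the transpose tests, note what the tile test above pins down: tiling along tileParamSpec.axis multiplies the element count by repeatsInfo[0] and leaves every other axis alone. A concrete instance (sketch, not part of the patch; only names visible in the hunk above are used):

#include "tensor_computing.h"
#include "ut_util.h"

// Sketch only: the length identity asserted by CHECK_REQUIREMENT above,
// on a fixed shape instead of command-line arguments.
void tileSketch(DataType dt)
{
    TileParamSpec q;       // local name; the test above calls it tileParamSpec
    q.axis = 1;            // tile along the channel axis
    q.dimsSize = 0;        // a single repeat factor, as in the test above
    q.repeatsInfo[0] = 3;  // repeat the channel block three times

    TensorDesc inDesc = tensor4df(dt, DF_NCHW, 1, 4, 2, 2);  // 16 elements
    U32 len = tensorNumElements(inDesc);
    // After tile, (1, 4, 2, 2) becomes (1, 12, 2, 2), so the output holds
    // len * q.repeatsInfo[0] = 48 elements.
    CHECK_REQUIREMENT(len * q.repeatsInfo[0] == 48);
}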
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #include <vector> #include "tensor_computing.h" #include "ut_util.h" -int transposeTest(int argc, char** argv, DataType dt) { +int transposeTest(int argc, char **argv, DataType dt) +{ CHECK_REQUIREMENT(argc == 9); - U32 in = atoi(argv[1]); - U32 ic = atoi(argv[2]); - U32 ih = atoi(argv[3]); - U32 iw = atoi(argv[4]); - std::vector<U32> dim(4, 0); - std::vector<U32> inv_dim(4, 0); + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + TransposeParamSpec p, p_inv; + p.trans_size = 4; + p_inv.trans_size = 4; for (int i = 0; i < 4; i++) { - I32 value = atoi(argv[5+i]); - dim[i] = value; - inv_dim[value] = i; + I32 value = atoi(argv[5 + i]); + p.trans_dims[i] = value; + p_inv.trans_dims[value] = i; } + ArchInfo archInfo; + archInfo.arch = UT_ARCH; DataFormat df = DF_NCHW; - TensorDesc in_desc = tensor4df(dt, df, in, ic, ih, iw); - TensorDesc out_1_desc; - TensorDesc out_2_desc; - - CHECK_STATUS(transpose_infer_output_size(in_desc, &out_1_desc, dim.data(), UT_ARCH)); - CHECK_STATUS(transpose_infer_output_size(out_1_desc, &out_2_desc, inv_dim.data(), UT_ARCH)); - - U32 len = tensorNumElements(in_desc); - U8* input = ut_input_v(len, dt, UT_INIT_RANDOM); - U8* out_1 = ut_input_v(len, dt, UT_INIT_RANDOM); - U8* out_2 = ut_input_v(len, dt, UT_INIT_RANDOM); + TensorDesc inDesc = tensor4df(dt, df, in, ic, ih, iw); + U32 len = tensorNumElements(inDesc); + U8 *input = ut_input_v(len, dt, UT_INIT_RANDOM); + Tensor inputTensor; + inputTensor.resize(inDesc); + inputTensor.alloc(); + memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inDesc)); + + Tensor outputTensor1; + Tensor outputTensor2; + CHECK_STATUS(transpose_infer_output_size(&inputTensor, p, &outputTensor1, &archInfo)); + CHECK_STATUS(transpose_infer_output_size(&outputTensor1, p_inv,
&outputTensor2, &archInfo)); + outputTensor1.alloc(); + outputTensor2.alloc(); + Tensor blankTensor; if (UT_CHECK) { - CHECK_STATUS(transpose(in_desc, input, out_1_desc, out_1, dim.data(), UT_ARCH)); + CHECK_STATUS(transpose(inputTensor, p, blankTensor, outputTensor1, &archInfo)); - CHECK_STATUS(transpose(out_1_desc, out_1, out_2_desc, out_2, inv_dim.data(), UT_ARCH)); + CHECK_STATUS(transpose(outputTensor1, p_inv, blankTensor, outputTensor2, &archInfo)); // check - ut_check_v(input, out_2, len, dt, 0.0001, __FILE__, __LINE__); + ut_check_v(input, get_ptr_from_tensor(outputTensor2, UT_ARCH), len, dt, 0.0001, __FILE__, + __LINE__); } double time_start = ut_time_ms(); for (int iter = 0; iter < UT_LOOPS; iter++) { - CHECK_STATUS(transpose(in_desc, input, out_1_desc, out_1, dim.data(), UT_ARCH)); + CHECK_STATUS(transpose(inputTensor, p, blankTensor, outputTensor1, &archInfo)); } double time_end = ut_time_ms(); double time = (time_end - time_start) / UT_LOOPS; @@ -64,26 +72,22 @@ int transposeTest(int argc, char** argv, DataType dt) { U32 oc = 0; U32 oh = 0; U32 ow = 0; - CHECK_STATUS(tensor4dGet(out_1_desc, &dt, &df, &on, &oc, &oh, &ow)); + CHECK_STATUS(tensor4dGet(outputTensor1.get_desc(), &dt, &df, &on, &oc, &oh, &ow)); // log performance data char buffer[150]; char params[120]; - sprintf(params, "(%u %u %u %u)=(%u %u %u %u)", - in, ic, ih, iw, - on, oc, oh, ow); + sprintf(params, "(%u %u %u %u)=(%u %u %u %u)", in, ic, ih, iw, on, oc, oh, ow); sprintf(buffer, "%20s, %80s", "Transpose", params); double ops = len; ut_log(dt, buffer, ops, time); free(input); - free(out_1); - free(out_2); return 0; } - -int main(int argc, char** argv) { +int main(int argc, char **argv) +{ #ifdef _USE_FP16 transposeTest(argc, argv, DT_F16); #endif diff --git a/compute/tensor/tests/test_transpose_ocl.cpp b/compute/tensor/tests/test_transpose_ocl.cpp new file mode 100644 index 00000000..0f1368fc --- /dev/null +++ b/compute/tensor/tests/test_transpose_ocl.cpp @@ -0,0 +1,156 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
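The CPU transpose test above checks the operator by round-tripping: it applies the permutation p, then the inverse permutation p_inv built with p_inv.trans_dims[p.trans_dims[i]] = i, and requires the result to match the raw input. The identity it relies on, spelled out on a fixed permutation (sketch, not part of the patch):

#include "tensor_computing.h"
#include "ut_util.h"

// Sketch only: building the inverse permutation the way the test above does.
void inversePermutationSketch()
{
    TransposeParamSpec p, p_inv;
    p.trans_size = 4;
    p_inv.trans_size = 4;

    U32 dims[4] = {0, 2, 3, 1};         // e.g. NCHW -> NHWC
    for (int i = 0; i < 4; i++) {
        p.trans_dims[i] = dims[i];      // output axis i reads input axis dims[i]
        p_inv.trans_dims[dims[i]] = i;  // so input axis dims[i] maps back to i
    }
    // p_inv.trans_dims is {0, 3, 1, 2}; applying p and then p_inv restores the
    // original layout, which is why ut_check_v compares against the raw input.
    CHECK_REQUIREMENT(p_inv.trans_dims[2] == 1);
}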
+ +#include +#include +#include "tensor_computing.h" +#include "ut_util.h" +#include "gcl.h" +#include "libkernelsource.h" +inline GCLMem_t alloc(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_map(Tensor tensor) +{ + auto mem = (OclMemory *)tensor.get_memory(); + mem->mapped_alloc(); + return (GCLMem_t)mem->get_ptr(); +} + +inline GCLMem_t alloc_bytes(Tensor tensor, U32 size) +{ + auto mem = (OclMemory *)tensor.get_memory(); + GCLMem_t ptr = NULL; + if (size > 0) { + mem->resize(tensor1d(DT_U8, size)); + mem->alloc(); + ptr = (GCLMem_t)mem->get_ptr(); + } + return ptr; +} + +int transposeTest(int argc, char **argv, DataType dt) +{ + CHECK_REQUIREMENT(argc == 9); + U32 in = atoi(argv[1]); + U32 ic = atoi(argv[2]); + U32 ih = atoi(argv[3]); + U32 iw = atoi(argv[4]); + TransposeParamSpec p; + p.trans_size = 4; + for (int i = 0; i < 4; i++) { + I32 value = atoi(argv[5 + i]); + p.trans_dims[i] = value; + } + + ArchInfo archInfo; + ArchInfo archInfo_org; + archInfo.arch = MALI; + archInfo_org.arch = CPU_GENERAL; + + TensorDesc inputDesc_cpu, inputDesc_gpu, outputDesc; + inputDesc_cpu = tensor4df(dt, DF_NCHW, in, ic, ih, iw); + inputDesc_gpu = tensor4df(dt, DF_NCHW, in, ic, ih, iw); + + U32 len = tensorNumElements(inputDesc_cpu); + U8 *input_cpu = ut_input_v(len, dt, UT_INIT_RANDOM); + + Tensor inputTensorCpu; + inputTensorCpu.resize(inputDesc_cpu); + inputTensorCpu.alloc(); + memcpy(get_ptr_from_tensor(inputTensorCpu, UT_ARCH), input_cpu, tensorNumBytes(inputDesc_cpu)); + Tensor outputTensorCpu; + Tensor tmpTensorCpu; + // run on cpu + CHECK_STATUS(transpose_infer_output_size(&inputTensorCpu, p, &outputTensorCpu, &archInfo_org)); + outputTensorCpu.alloc(); + CHECK_STATUS(transpose(inputTensorCpu, p, tmpTensorCpu, outputTensorCpu, &archInfo_org)); + // run on gpu + std::shared_ptr<GCLHandle> handleSharedPtr = OCLContext::getInstance().handle; + GCLHandle_t handle = handleSharedPtr.get(); + std::vector<Kernel> kernelVec; + handle->kernelVec = &kernelVec; + Tensor inputTensor = Tensor(OCLMem); + Tensor outputTensor = Tensor(OCLMem); + Tensor tmpTensor = Tensor(OCLMem); + inputTensor.resize(inputDesc_gpu); + U8 *output_gpu = NULL; + MaliPara maliPara; + maliPara.handle = handle; + archInfo.archPara = &maliPara; + + CHECK_STATUS(transpose_infer_output_size(&inputTensor, p, &outputTensor, &archInfo)); + + U32 maxBytes = 0; + U32 tmpBytes; + CHECK_STATUS(transpose_infer_forward_tmp_bytes(inputTensor, outputTensor, &tmpBytes, &archInfo)); + maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes; + + GCLMem_t output = alloc_map(outputTensor); + GCLMem_t input = alloc(inputTensor); + CHECK_STATUS(gcl_fill_memory_zero(handle, input)); + + tmpBytes = tensorNumBytes(inputDesc_gpu); + maxBytes = (tmpBytes > maxBytes) ?
tmpBytes : maxBytes; + GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes); + + CHECK_STATUS(ocl_set_input(handle, input, inputDesc_gpu, input_cpu, tmpbuf, true)); + CHECK_STATUS(transpose(inputTensor, p, tmpTensor, outputTensor, &archInfo)); + /*warm up*/ + UNI_INFO_LOG("Warm up gpu:\n") + for (U32 i = 0; i < 2; i++) { + CHECK_STATUS(gcl_run_kernelVec(handle)); + } + + UNI_INFO_LOG("Run:\n") +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size())); + double time = handle->t_execute * 0.001; +#else + CHECK_STATUS(gcl_run_kernelVec(handle)); +#endif + outputDesc = outputTensor.get_desc(); + CHECK_STATUS(ocl_get_output(handle, output, outputDesc, true)); + output_gpu = output->mapPtrArray.back(); + + char buffer[150]; + char params[120]; + U32 on = outputDesc.dims[3]; + U32 oc = outputDesc.dims[2]; + U32 oh = outputDesc.dims[1]; + U32 ow = outputDesc.dims[0]; + sprintf(params, "(%u %u %u %u)=(%u %u %u %u)", in, ic, ih, iw, on, oc, oh, ow); + sprintf(buffer, "%20s, %80s", "Transpose", params); +#ifdef _DEBUG + double ops = len; + ut_log(dt, buffer, ops, time); +#endif + ut_check_a(output_gpu, get_ptr_from_tensor(outputTensorCpu, UT_ARCH), len, dt); + + CHECK_STATUS(gcl_finish(handle)); + CHECK_STATUS(gcl_clean_kernelVec(handle)); + free(input_cpu); + return 0; +} + +int main(int argc, char **argv) +{ +#ifdef _USE_FP16 + transposeTest(argc, argv, DT_F16); +#endif + return 0; +} diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md new file mode 100644 index 00000000..1cb2adb1 --- /dev/null +++ b/docs/ARCHITECTURE.md @@ -0,0 +1,21 @@ +# Architecture + +![bolt_framework](images/Framework.PNG) + +- [common](../common) + - [uni](../common/uni) hosts the common headers that are used in bolt. The model representation of bolt is [ModelSpec](../common/uni/include/type), the rigorous model format defined by bolt. + - [gcl](../common/gcl) hosts the setup of the MALI GPU environment. + - [memory](../common/memory) hosts the memory data structures that bolt needs. +- [model_tools](../model_tools) + - [X2bolt](../model_tools/tools/X2bolt) : a general converter that transforms models from different deep learning frameworks into bolt models. + - [model_optimizer](../model_tools/include/model_optimizer.hpp) : a static computing graph optimizer that fuses operators and simplifies the computation graph. +- [compute](../compute) + - [blas_enhance](../compute/blas_enhance) hosts the fast implementations of matrix-matrix and matrix-vector multiplication for FP32, FP16 and INT8. It is referenced by some of the operators in [tensor](../compute/tensor). + - [tensor](../compute/tensor) hosts the implementations of all kinds of operators defined by bolt. + - [image](../compute/image) hosts common preprocessing routines for image inputs (e.g. bilinear interpolation). +- [inference](../inference) + - [engine](../inference/engine) hosts the inference engine of neural networks. + - [flow](../inference/flow) hosts the multi-backend (CPU+GPU) heterogeneous device scheduling for time-series data. + - [examples](../inference/examples) gives some application examples (Network Benchmark, ImageNet classification). + +For API, Flow and operator development, please refer to [DEVELOPER.md](DEVELOPER.md).
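The *_ocl tests in this patch show how the gcl layer described above is driven from the tensor API. Reduced to its control flow, the harness looks like this (a sketch, not part of the patch; it assumes the inline alloc/alloc_map/alloc_bytes helpers defined at the top of those tests are in scope, and the shape here is illustrative):

#include <vector>
#include "tensor_computing.h"
#include "ut_util.h"
#include "gcl.h"
#include "libkernelsource.h"

// Sketch only: the GPU test flow shared by test_softmax_ocl.cpp and
// test_transpose_ocl.cpp, with all calls taken from those hunks.
int oclSoftmaxSketch(DataType dt)
{
    SoftmaxParamSpec p;
    p.axis = 1;
    ArchInfo archInfo;
    archInfo.arch = MALI;

    TensorDesc inDesc = tensor4df(dt, DF_NCHW, 1, 4, 1, 1);
    U8 *input_cpu = ut_input_v(tensorNumElements(inDesc), dt, UT_INIT_RANDOM);

    // one shared handle; operator calls only *record* kernels into kernelVec
    std::shared_ptr<GCLHandle> handleSharedPtr = OCLContext::getInstance().handle;
    GCLHandle_t handle = handleSharedPtr.get();
    std::vector<Kernel> kernelVec;
    handle->kernelVec = &kernelVec;
    MaliPara maliPara;
    maliPara.handle = handle;
    archInfo.archPara = &maliPara;

    Tensor inputTensor = Tensor(OCLMem);
    Tensor outputTensor = Tensor(OCLMem);
    Tensor tmpTensor = Tensor(OCLMem);
    inputTensor.resize(inDesc);
    CHECK_STATUS(softmax_infer_output_size(&inputTensor, &outputTensor, &archInfo));

    U32 tmpBytes = 0;
    CHECK_STATUS(softmax_infer_forward_tmp_bytes(inputTensor, &tmpBytes, &archInfo));
    U32 maxBytes = (tmpBytes > tensorNumBytes(inDesc)) ? tmpBytes : tensorNumBytes(inDesc);

    GCLMem_t output = alloc_map(outputTensor);  // mapped: host-readable after run
    GCLMem_t input = alloc(inputTensor);
    GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes);

    CHECK_STATUS(ocl_set_input(handle, input, inDesc, input_cpu, tmpbuf, true));
    CHECK_STATUS(softmax(inputTensor, p, tmpTensor, outputTensor, &archInfo));  // record
    CHECK_STATUS(gcl_run_kernelVec(handle));                                    // replay
    TensorDesc outDesc = outputTensor.get_desc();
    CHECK_STATUS(ocl_get_output(handle, output, outDesc, true));
    U8 *output_gpu = output->mapPtrArray.back();  // host view of the result
    (void)output_gpu;  // a real test compares this against a CPU_GENERAL run

    CHECK_STATUS(gcl_finish(handle));
    CHECK_STATUS(gcl_clean_kernelVec(handle));
    free(input_cpu);
    return 0;
}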
\ No newline at end of file diff --git a/docs/BENCHMARK.md b/docs/BENCHMARK.md index ac877563..2d488639 100644 --- a/docs/BENCHMARK.md +++ b/docs/BENCHMARK.md @@ -1,165 +1,340 @@ # Benchmark Report -We have tested kinds of neural network models with Bolt(v0.2.0) on HUAWEI 810 mobile phone and HUAWEI 990 mobile phone respectively. The benchmark data are given under different operating systems, different computing cores, and different inference accuracy. For more detailed evaluation data, please refer to the following table. +We have tested various kinds of neural network models with Bolt(v1.0.0) on HUAWEI Kirin 810 and Kirin 990 mobile phones. The benchmark data are given for different operating systems, computing cores, and inference precisions. For more detailed evaluation data, please refer to the following table. -| Model | Framework | Os | Compiler | Kirin Soc Version | Core | Precision | Cpu or Gpu | Thread | Input Size | Performance1 | Performance2 | Performance3 | Accuracy1 | Accuracy2 | -| -------------------- | --------- | ----------- | -------- | ----------------- | ------------- | --------- | ---------- | ------ | ----------- | --------------------------- | ------------------------ | ------------------------ | ------------- | ------------- | -| tinybert | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 32+32+32 | avg_time:17.5701ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 32+32+32 | avg_time:9.3479ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | 32+32+32 | avg_time:4.48315ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 32+32+32 | avg_time:2.6499ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 32+32+32 | avg_time:16.9138ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 32+32+32 | avg_time:8.96313ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 32+32+32 | avg_time:3.69189ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 32+32+32 | avg_time:2.2041ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | 32+32+32 | avg_time:16.96ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 32+32+32 | avg_time:8.88281ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 32+32+32 | avg_time:4.39697ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 32+32+32 | avg_time:2.84619ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 32+32+32 | avg_time:15.3071ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 32+32+32 | avg_time:8.33203ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 32+32+32 | avg_time:3.84497ms/sequence | | | | | -| tinybert | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 32+32+32 | avg_time:2.3479ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 32+32+32 | avg_time:875.392ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 32+32+32 | avg_time:497.91ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 |
32+32+32 | avg_time:295.943ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 32+32+32 | avg_time:156.86ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 32+32+32 | avg_time:975.889ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 32+32+32 | avg_time:492.725ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 32+32+32 | avg_time:285.426ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 32+32+32 | avg_time:136.338ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | 32+32+32 | avg_time:874.251ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 32+32+32 | avg_time:457.736ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 32+32+32 | avg_time:300.887ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 32+32+32 | avg_time:160.95ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 32+32+32 | avg_time:854.466ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 32+32+32 | avg_time:455.878ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 32+32+32 | avg_time:246.937ms/sequence | | | | | -| nmt | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 32+32+32 | avg_time:128.898ms/sequence | | | | | -| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:69.29ms/image | min_time:58.073ms/image | avg_time:59.0912ms/image | top5:0.973684 | top1:0.861842 | -| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:34.3389ms/image | min_time:27.093ms/image | avg_time:29.1388ms/image | top5:0.973684 | top1:0.875 | -| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:20.813ms/image | min_time:18.3352ms/image | avg_time:18.4675ms/image | top5:0.973684 | top1:0.861842 | -| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:9.94312ms/image | min_time:8.79199ms/image | avg_time:8.85982ms/image | top5:0.973684 | top1:0.875 | -| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:74.1409ms/image | min_time:56.3591ms/image | avg_time:59.8203ms/image | top5:0.973684 | top1:0.861842 | -| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:36.093ms/image | min_time:25.9141ms/image | avg_time:28.1741ms/image | top5:0.973684 | top1:0.875 | -| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:13.7258ms/image | min_time:13.3469ms/image | avg_time:13.4824ms/image | top5:0.973684 | top1:0.861842 | -| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:7.14282ms/image | min_time:6.92798ms/image | avg_time:7.01458ms/image | top5:0.973684 | top1:0.875 | -| ghostnet | onnx | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:74.3708ms/image | min_time:66.72ms/image | avg_time:67.9283ms/image | top5:0.973684 | top1:0.861842 | -| ghostnet | onnx | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:36.603ms/image | min_time:32.616ms/image | avg_time:33.2657ms/image | top5:0.973684 | top1:0.868421 | -| ghostnet | 
onnx | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:27.4438ms/image | min_time:18.6541ms/image | avg_time:18.8378ms/image | top5:0.973684 | top1:0.861842 | -| ghostnet | onnx | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:15.4668ms/image | min_time:9.32104ms/image | avg_time:9.45095ms/image | top5:0.973684 | top1:0.868421 | -| ghostnet | onnx | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:70.2061ms/image | min_time:55.4148ms/image | avg_time:61.4079ms/image | top5:0.973684 | top1:0.861842 | -| ghostnet | onnx | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:37.0811ms/image | min_time:27.947ms/image | avg_time:29.9928ms/image | top5:0.973684 | top1:0.868421 | -| ghostnet | onnx | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:15.6489ms/image | min_time:13.8831ms/image | avg_time:14.0345ms/image | top5:0.973684 | top1:0.861842 | -| ghostnet | onnx | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:8.07397ms/image | min_time:7.29102ms/image | avg_time:7.38781ms/image | top5:0.973684 | top1:0.868421 | -| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:190.538ms/image | min_time:160.101ms/image | avg_time:168.335ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:71.125ms/image | min_time:63.7568ms/image | avg_time:67.3027ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:50.1008ms/image | min_time:40.907ms/image | avg_time:41.3074ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:24.7939ms/image | min_time:19.5669ms/image | avg_time:19.6895ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:174.117ms/image | min_time:151.17ms/image | avg_time:154.917ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:74.842ms/image | min_time:59.8052ms/image | avg_time:62.1738ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:34.542ms/image | min_time:33.5129ms/image | avg_time:33.7169ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:16.0791ms/image | min_time:15.8879ms/image | avg_time:15.9935ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:178.423ms/image | min_time:167.58ms/image | avg_time:169.656ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:77.3501ms/image | min_time:66.8999ms/image | avg_time:68.3253ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:52.7412ms/image | min_time:41.7322ms/image | avg_time:42.0848ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:26.1299ms/image | 
min_time:19.927ms/image | avg_time:20.0323ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:173.724ms/image | min_time:151.815ms/image | avg_time:154.701ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:74.7651ms/image | min_time:60.425ms/image | avg_time:62.6472ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:36.054ms/image | min_time:33.9338ms/image | avg_time:34.2018ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:17.3879ms/image | min_time:16.575ms/image | avg_time:16.7297ms/image | top5:0.894737 | top1:0.638158 | -| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:154.487ms/image | min_time:141.754ms/image | avg_time:145.152ms/image | top5:0.940789 | top1:0.756579 | -| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:64.2239ms/image | min_time:58.8081ms/image | avg_time:59.9808ms/image | top5:0.934211 | top1:0.756579 | -| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:49.458ms/image | min_time:38.323ms/image | avg_time:38.5056ms/image | top5:0.940789 | top1:0.756579 | -| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:23.1702ms/image | min_time:17.5068ms/image | avg_time:17.6611ms/image | top5:0.934211 | top1:0.756579 | -| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:150.025ms/image | min_time:128.345ms/image | avg_time:132.484ms/image | top5:0.940789 | top1:0.756579 | -| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:68.334ms/image | min_time:53.7939ms/image | avg_time:55.3328ms/image | top5:0.934211 | top1:0.756579 | -| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:30.573ms/image | min_time:29.303ms/image | avg_time:29.476ms/image | top5:0.940789 | top1:0.756579 | -| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:14.3711ms/image | min_time:13.9141ms/image | avg_time:14.0406ms/image | top5:0.934211 | top1:0.756579 | -| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:165.334ms/image | min_time:142.623ms/image | avg_time:145.197ms/image | top5:0.940789 | top1:0.756579 | -| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:70.1091ms/image | min_time:63.4839ms/image | avg_time:65.2859ms/image | top5:0.934211 | top1:0.756579 | -| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:52.321ms/image | min_time:39.3108ms/image | avg_time:39.6574ms/image | top5:0.940789 | top1:0.756579 | -| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:24.9519ms/image | min_time:17.696ms/image | avg_time:18.0272ms/image | top5:0.934211 | top1:0.756579 | -| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:154.559ms/image | min_time:121.211ms/image | avg_time:130.884ms/image | top5:0.940789 | top1:0.756579 | -| 
mobilenet_v2 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:77.2429ms/image | min_time:52.5979ms/image | avg_time:57.1493ms/image | top5:0.934211 | top1:0.756579 | -| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:34.0449ms/image | min_time:29.9241ms/image | avg_time:30.6575ms/image | top5:0.940789 | top1:0.756579 | -| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:15.9011ms/image | min_time:14.261ms/image | avg_time:14.4966ms/image | top5:0.934211 | top1:0.756579 | -| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:112.09ms/image | min_time:83.571ms/image | avg_time:93.3668ms/image | top5:0.875 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:53.822ms/image | min_time:35.8259ms/image | avg_time:41.434ms/image | top5:0.881579 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | int8 | cpu | 1 | 1/3/224/224 | max_time:44.7251ms/image | min_time:31.103ms/image | avg_time:35.5491ms/image | top5:0.875 | top1:0.592105 | -| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:31.9202ms/image | min_time:22.28ms/image | avg_time:22.4415ms/image | top5:0.875 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:16.2849ms/image | min_time:11.3989ms/image | avg_time:11.5547ms/image | top5:0.881579 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | int8 | cpu | 1 | 1/3/224/224 | max_time:13.2561ms/image | min_time:9.74902ms/image | avg_time:9.84788ms/image | top5:0.875 | top1:0.592105 | -| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:99.656ms/image | min_time:81.1941ms/image | avg_time:83.3297ms/image | top5:0.875 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:47.0339ms/image | min_time:34.7532ms/image | avg_time:37.4053ms/image | top5:0.881579 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | int8 | cpu | 1 | 1/3/224/224 | max_time:38.6331ms/image | min_time:30.9619ms/image | avg_time:31.3992ms/image | top5:0.875 | top1:0.592105 | -| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:18.9709ms/image | min_time:17.4961ms/image | avg_time:17.6799ms/image | top5:0.875 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:10.2009ms/image | min_time:9.35303ms/image | avg_time:9.4497ms/image | top5:0.881579 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | int8 | cpu | 1 | 1/3/224/224 | max_time:8.61816ms/image | min_time:7.65796ms/image | avg_time:7.70149ms/image | top5:0.875 | top1:0.592105 | -| squeezenet | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:107.665ms/image | min_time:89.8459ms/image | avg_time:91.3227ms/image | top5:0.875 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:52.929ms/image | min_time:41.8279ms/image | avg_time:42.9839ms/image | top5:0.868421 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | llvm | 810 | A55 | int8 | cpu | 1 | 1/3/224/224 | max_time:39.2102ms/image | min_time:35.2351ms/image | 
avg_time:36.3468ms/image | top5:0.868421 | top1:0.664474 | -| squeezenet | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:30.657ms/image | min_time:22.5791ms/image | avg_time:22.897ms/image | top5:0.875 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:13.2539ms/image | min_time:11.6641ms/image | avg_time:12.1377ms/image | top5:0.868421 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | llvm | 810 | A76 | int8 | cpu | 1 | 1/3/224/224 | max_time:12.3049ms/image | min_time:9.78296ms/image | avg_time:10.041ms/image | top5:0.868421 | top1:0.664474 | -| squeezenet | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:98.7239ms/image | min_time:81.147ms/image | avg_time:83.2495ms/image | top5:0.875 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:45.7671ms/image | min_time:33.8918ms/image | avg_time:36.3651ms/image | top5:0.868421 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | llvm | 990 | A55 | int8 | cpu | 1 | 1/3/224/224 | max_time:40.23ms/image | min_time:32.166ms/image | avg_time:33.0669ms/image | top5:0.868421 | top1:0.664474 | -| squeezenet | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:19.312ms/image | min_time:17.6091ms/image | avg_time:17.8038ms/image | top5:0.875 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:10.54ms/image | min_time:9.58398ms/image | avg_time:9.73297ms/image | top5:0.868421 | top1:0.618421 | -| squeezenet | caffe | ubuntu16_04 | llvm | 990 | A76 | int8 | cpu | 1 | 1/3/224/224 | max_time:8.69287ms/image | min_time:7.7019ms/image | avg_time:7.80628ms/image | top5:0.868421 | top1:0.664474 | -| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:39.6121ms/image | min_time:30.645ms/image | avg_time:31.2452ms/image | top5:0.809211 | top1:0.539474 | -| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:20.4729ms/image | min_time:15.2979ms/image | avg_time:15.8074ms/image | top5:0.822368 | top1:0.546053 | -| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:9.58984ms/image | min_time:7.26196ms/image | avg_time:7.37196ms/image | top5:0.809211 | top1:0.539474 | -| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:6.87085ms/image | min_time:4.26001ms/image | avg_time:4.38828ms/image | top5:0.822368 | top1:0.546053 | -| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:36.8108ms/image | min_time:26.8101ms/image | avg_time:28.4539ms/image | top5:0.809211 | top1:0.539474 | -| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:15.5039ms/image | min_time:13.3491ms/image | avg_time:14.5589ms/image | top5:0.822368 | top1:0.546053 | -| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:5.90894ms/image | min_time:5.72192ms/image | avg_time:5.80782ms/image | top5:0.809211 | top1:0.539474 | -| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:3.52197ms/image | min_time:3.30493ms/image | avg_time:3.34958ms/image | top5:0.822368 | top1:0.546053 | -| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | 
cpu | 1 | 1/3/224/224 | max_time:36.0649ms/image | min_time:31.5652ms/image | avg_time:32.6065ms/image | top5:0.809211 | top1:0.539474 | -| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:19.0042ms/image | min_time:16.8479ms/image | avg_time:17.7286ms/image | top5:0.815789 | top1:0.539474 | -| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:15.012ms/image | min_time:7.82007ms/image | avg_time:7.97568ms/image | top5:0.809211 | top1:0.539474 | -| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:6.84717ms/image | min_time:4.59595ms/image | avg_time:4.69553ms/image | top5:0.815789 | top1:0.539474 | -| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:38.3628ms/image | min_time:27.134ms/image | avg_time:29.9821ms/image | top5:0.809211 | top1:0.539474 | -| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:20.6179ms/image | min_time:15.635ms/image | avg_time:16.1516ms/image | top5:0.815789 | top1:0.539474 | -| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:6.75513ms/image | min_time:6.01196ms/image | avg_time:6.08447ms/image | top5:0.809211 | top1:0.539474 | -| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:4.0332ms/image | min_time:3.573ms/image | avg_time:3.6405ms/image | top5:0.815789 | top1:0.539474 | -| fingerprint_resnet18 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | UNKNOWN | avg_time:6.20508ms/image | | | | | -| fingerprint_resnet18 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | UNKNOWN | avg_time:3.82397ms/image | | | | | -| fingerprint_resnet18 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | UNKNOWN | avg_time:6.42603ms/image | | | | | -| fingerprint_resnet18 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | UNKNOWN | avg_time:3.63501ms/image | | | | | -| fingerprint_resnet18 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | UNKNOWN | avg_time:5.70605ms/image | | | | | -| fingerprint_resnet18 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | UNKNOWN | avg_time:3.38989ms/image | | | | | -| fingerprint_resnet18 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | UNKNOWN | avg_time:5.6731ms/image | | | | | -| fingerprint_resnet18 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | UNKNOWN | avg_time:3.521ms/image | | | | | -| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:769.38ms/image | min_time:726.458ms/image | avg_time:738.653ms/image | top5:0.934211 | top1:0.730263 | -| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:333.544ms/image | min_time:298.813ms/image | avg_time:311.864ms/image | top5:0.934211 | top1:0.730263 | -| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:194.501ms/image | min_time:185.442ms/image | avg_time:186.798ms/image | top5:0.934211 | top1:0.730263 | -| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:97.8091ms/image | min_time:92.2888ms/image | avg_time:93.0512ms/image | top5:0.934211 | top1:0.730263 | -| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:716.164ms/image | min_time:620.755ms/image | 
avg_time:676.802ms/image | top5:0.934211 | top1:0.730263 | -| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:298.58ms/image | min_time:252.569ms/image | avg_time:274.926ms/image | top5:0.934211 | top1:0.730263 | -| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:153.967ms/image | min_time:152.562ms/image | avg_time:153.01ms/image | top5:0.934211 | top1:0.730263 | -| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:76.0752ms/image | min_time:75.5671ms/image | avg_time:75.844ms/image | top5:0.934211 | top1:0.730263 | -| resnet50 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | max_time:956.05ms/image | min_time:669.802ms/image | avg_time:692.174ms/image | top5:0.934211 | top1:0.730263 | -| resnet50 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | max_time:308.14ms/image | min_time:283.158ms/image | avg_time:287.35ms/image | top5:0.934211 | top1:0.730263 | -| resnet50 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | max_time:157.702ms/image | min_time:154.775ms/image | avg_time:155.15ms/image | top5:0.934211 | top1:0.730263 | -| resnet50 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | max_time:79.03ms/image | min_time:77.3899ms/image | avg_time:77.7673ms/image | top5:0.934211 | top1:0.730263 | -| birealnet18 | onnx | ubuntu16_04 | gcc8.3 | 810 | A55 | binary | cpu | 1 | 1/3/224/224 | max_time:103.304ms/image | min_time:86.573ms/image | avg_time:88.4649ms/image | top5:0.782895 | top1:0.460526 | -| birealnet18 | onnx | ubuntu16_04 | gcc8.3 | 810 | A76 | binary | cpu | 1 | 1/3/224/224 | max_time:32.9131ms/image | min_time:30.6541ms/image | avg_time:32.1737ms/image | top5:0.782895 | top1:0.460526 | -| birealnet18 | onnx | ubuntu16_04 | gcc8.3 | 990 | A55 | binary | cpu | 1 | 1/3/224/224 | max_time:99.9858ms/image | min_time:82.6409ms/image | avg_time:85.7396ms/image | top5:0.782895 | top1:0.460526 | -| birealnet18 | onnx | ubuntu16_04 | gcc8.3 | 990 | A76 | binary | cpu | 1 | 1/3/224/224 | max_time:25.5642ms/image | min_time:23.6038ms/image | avg_time:24.7936ms/image | top5:0.782895 | top1:0.460526 | -| birealnet18 | onnx | ubuntu16_04 | llvm | 810 | A55 | binary | cpu | 1 | 1/3/224/224 | max_time:89.7971ms/image | min_time:77.2161ms/image | avg_time:78.3607ms/image | top5:0.782895 | top1:0.473684 | -| birealnet18 | onnx | ubuntu16_04 | llvm | 810 | A76 | binary | cpu | 1 | 1/3/224/224 | max_time:27.375ms/image | min_time:25.1851ms/image | avg_time:25.892ms/image | top5:0.782895 | top1:0.473684 | -| birealnet18 | onnx | ubuntu16_04 | llvm | 990 | A55 | binary | cpu | 1 | 1/3/224/224 | max_time:86.949ms/image | min_time:73.6069ms/image | avg_time:75.5283ms/image | top5:0.782895 | top1:0.473684 | -| birealnet18 | onnx | ubuntu16_04 | llvm | 990 | A76 | binary | cpu | 1 | 1/3/224/224 | max_time:20.3091ms/image | min_time:19.636ms/image | avg_time:19.7468ms/image | top5:0.782895 | top1:0.473684 | -| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 990 | MALI-G76-MP16 | fp16 | gpu | 1 | 1/3/224/224 | max_time:11.2231ms/image | min_time:7.74097ms/image | avg_time:8.5096ms/image | top5:0.894737 | top1:0.644737 | -| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 990 | MALI-G76-MP16 | fp16 | gpu | 1 | 1/3/224/224 | max_time:13.521ms/image | min_time:8.36597ms/image | avg_time:9.10277ms/image | top5:0.940789 | top1:0.756579 | -| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 990 | 
MALI-G76-MP16 | fp16 | gpu | 1 | 1/3/224/224 | max_time:12.2771ms/image | min_time:5.12305ms/image | avg_time:7.50856ms/image | top5:0.671053 | top1:0.342105 | -| squeezenet | caffe | ubuntu16_04 | llvm | 990 | MALI-G76-MP16 | fp16 | gpu | 1 | 1/3/224/224 | max_time:11.2141ms/image | min_time:6.36304ms/image | avg_time:7.38169ms/image | top5:0.875 | top1:0.618421 | -| ghostnet | onnx | ubuntu16_04 | llvm | 990 | MALI-G76-MP16 | fp16 | gpu | 1 | 1/3/224/224 | max_time:15.1619ms/image | min_time:8.94604ms/image | avg_time:11.566ms/image | top5:0.973684 | top1:0.868421 | -| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 810 | MALI-G52-MP6 | fp16 | gpu | 1 | 1/3/224/224 | max_time:16.575ms/image | min_time:11.979ms/image | avg_time:13.0283ms/image | top5:0.894737 | top1:0.644737 | -| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 810 | MALI-G52-MP6 | fp16 | gpu | 1 | 1/3/224/224 | max_time:17.498ms/image | min_time:13.28ms/image | avg_time:14.2388ms/image | top5:0.940789 | top1:0.756579 | -| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 810 | MALI-G52-MP6 | fp16 | gpu | 1 | 1/3/224/224 | max_time:12.8831ms/image | min_time:7.37183ms/image | avg_time:9.25686ms/image | top5:0.671053 | top1:0.342105 | -| squeezenet | caffe | ubuntu16_04 | llvm | 810 | MALI-G52-MP6 | fp16 | gpu | 1 | 1/3/224/224 | max_time:14.5859ms/image | min_time:9.90796ms/image | avg_time:10.5907ms/image | top5:0.875 | top1:0.618421 | -| ghostnet | onnx | ubuntu16_04 | llvm | 810 | MALI-G52-MP6 | fp16 | gpu | 1 | 1/3/224/224 | max_time:15.1641ms/image | min_time:12.0188ms/image | avg_time:13.2528ms/image | top5:0.973684 | top1:0.868421 | +| Model | Framework | Os | Compiler | Kirin Soc Version | Core | Precision | Cpu or Gpu | Thread | Input Size | avg_time (ms) | +| ------------ | --------- | ----------- | -------- | ----------------- | ---- | --------- | ---------- | ------ | ----------- | --------------------------- | +| tinybert | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 32+32+32 | 16.605225 | +| tinybert | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 32+32+32 | 9.114014 | +| tinybert | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | int8 | cpu | 1 | 32+32+32 | 7.108154 | +| tinybert | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | 32+32+32 | 4.708984 | +| tinybert | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 32+32+32 | 3.208984 | +| tinybert | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | int8 | cpu | 1 | 32+32+32 | 2.205811 | +| tinybert | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 32+32+32 | 17.630127 | +| tinybert | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 32+32+32 | 9.800049 | +| tinybert | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | int8 | cpu | 1 | 32+32+32 | 7.642090 | +| tinybert | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 32+32+32 | 4.029053 | +| tinybert | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 32+32+32 | 2.835938 | +| tinybert | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | int8 | cpu | 1 | 32+32+32 | 1.875977 | +| tinybert | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | 32+32+32 | 16.586914 | +| tinybert | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 32+32+32 | 9.926025 | +| tinybert | caffe | ubuntu16_04 | llvm | 810 | A55 | int8 | cpu | 1 | 32+32+32 | 8.780029 | +| tinybert | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 32+32+32 | 4.817139 | +| tinybert | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 32+32+32 | 3.411133 | +| tinybert | caffe |
ubuntu16_04 | llvm | 810 | A76 | int8 | cpu | 1 | 32+32+32 | 2.603027 | +| tinybert | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 32+32+32 | 17.857910 | +| tinybert | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 32+32+32 | 10.166016 | +| tinybert | caffe | ubuntu16_04 | llvm | 990 | A55 | int8 | cpu | 1 | 32+32+32 | 9.524902 | +| tinybert | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 32+32+32 | 4.157959 | +| tinybert | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 32+32+32 | 2.849854 | +| tinybert | caffe | ubuntu16_04 | llvm | 990 | A76 | int8 | cpu | 1 | 32+32+32 | 2.161865 | +| nmt | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 32+32+32 | 642.424072 | +| nmt | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 32+32+32 | 356.841064 | +| nmt | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | 32+32+32 | 232.143066 | +| nmt | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 32+32+32 | 116.806885 | +| nmt | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 32+32+32 | 710.650879 | +| nmt | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 32+32+32 | 373.492920 | +| nmt | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 32+32+32 | 188.436035 | +| nmt | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 32+32+32 | 95.020996 | +| nmt | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | 32+32+32 | 656.012939 | +| nmt | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 32+32+32 | 355.530029 | +| nmt | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 32+32+32 | 233.974854 | +| nmt | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 32+32+32 | 120.966797 | +| nmt | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 32+32+32 | 694.150146 | +| nmt | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 32+32+32 | 370.792969 | +| nmt | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 32+32+32 | 191.239014 | +| nmt | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 32+32+32 | 96.389893 | +| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 59.0912 | +| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 29.1388 | +| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 18.4675 | +| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 8.85982 | +| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 59.8203 | +| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 28.1741 | +| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 13.4824 | +| ghostnet | onnx | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 7.01458 | +| ghostnet | onnx | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 67.9283 | +| ghostnet | onnx | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 33.2657 | +| ghostnet | onnx | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 18.8378 | +| ghostnet | onnx | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 9.45095 | +| ghostnet | onnx | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 61.4079 | +| ghostnet | onnx | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 29.9928 | +| ghostnet | onnx | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 14.0345 | +| 
ghostnet | onnx | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 7.38781 | +| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 157.002292 | +| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 58.171620 | +| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 39.825202 | +| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 19.578837 | +| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 162.878805 | +| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 60.626796 | +| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 34.055518 | +| mobilenet_v1 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 16.632785 | +| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 146.566687 | +| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 56.444002 | +| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 41.474450 | +| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 19.824521 | +| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 154.871895 | +| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 58.995204 | +| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 35.317029 | +| mobilenet_v1 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 17.012493 | +| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 128.224905 | +| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 52.524629 | +| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 37.341060 | +| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 17.874094 | +| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 133.358493 | +| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 54.601522 | +| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 30.195150 | +| mobilenet_v2 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 14.732285 | +| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 128.164528 | +| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 52.918264 | +| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 39.283749 | +| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 18.219124 | +| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 135.029091 | +| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 53.469070 | +| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 31.188000 | +| mobilenet_v2 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 14.823740 | +| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 81.188454 | +| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 
| fp16 | cpu | 1 | 1/3/224/224 | 41.068093 | +| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | int8 | cpu | 1 | 1/3/224/224 | 40.926114 | +| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 22.351720 | +| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 12.302643 | +| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | int8 | cpu | 1 | 1/3/224/224 | 12.866001 | +| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 83.545513 | +| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 38.876539 | +| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | int8 | cpu | 1 | 1/3/224/224 | 42.012501 | +| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 18.350247 | +| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 10.457777 | +| squeezenet | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | int8 | cpu | 1 | 1/3/224/224 | 10.791769 | +| squeezenet | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 87.881422 | +| squeezenet | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 41.555842 | +| squeezenet | caffe | ubuntu16_04 | llvm | 810 | A55 | int8 | cpu | 1 | 1/3/224/224 | 42.674965 | +| squeezenet | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 23.652636 | +| squeezenet | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 12.840966 | +| squeezenet | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 83.131129 | +| squeezenet | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 40.010485 | +| squeezenet | caffe | ubuntu16_04 | llvm | 990 | A55 | int8 | cpu | 1 | 1/3/224/224 | 42.526015 | +| squeezenet | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 18.508339 | +| squeezenet | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 10.512915 | +| squeezenet | caffe | ubuntu16_04 | llvm | 990 | A76 | int8 | cpu | 1 | 1/3/224/224 | 10.664495 | +| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 671.152633 | +| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 285.339836 | +| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | int8 | cpu | 1 | 1/3/224/224 | 392.863447 | +| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | int8 | cpu | 1 | 1/3/224/224 | 238.454521 | +| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 704.473506 | +| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 305.267536 | +| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | int8 | cpu | 1 | 1/3/224/224 | 415.956299 | +| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 163.979900 | +| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 84.558313 | +| resnet50 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | int8 | cpu | 1 | 1/3/224/224 | 107.986291 | +| resnet50 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 659.161729 | +| resnet50 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 291.172460 | +| resnet50 | caffe | ubuntu16_04 | llvm | 810 | A55 | int8 | cpu | 1 | 1/3/224/224 | 364.732873 | +| resnet50 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 
| 192.280511 | +| resnet50 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 104.545466 | +| resnet50 | caffe | ubuntu16_04 | llvm | 810 | A76 | int8 | cpu | 1 | 1/3/224/224 | 106.067036 | +| resnet50 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 696.839442 | +| resnet50 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 305.819625 | +| resnet50 | caffe | ubuntu16_04 | llvm | 990 | A55 | int8 | cpu | 1 | 1/3/224/224 | 384.351028 | +| resnet50 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 162.355430 | +| resnet50 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 85.032211 | +| resnet50 | caffe | ubuntu16_04 | llvm | 990 | A76 | int8 | cpu | 1 | 1/3/224/224 | 87.458490 | +| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 26.322640 | +| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 13.424403 | +| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 6.937206 | +| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 4.120585 | +| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 27.295058 | +| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 14.080605 | +| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 5.702913 | +| mobilenet_v3 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 3.402296 | +| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 27.603806 | +| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 13.785805 | +| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 7.579590 | +| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 4.360988 | +| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | 1/3/224/224 | 29.233688 | +| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | 1/3/224/224 | 14.552715 | +| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | 1/3/224/224 | 6.072735 | +| mobilenet_v3 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | 1/3/224/224 | 3.545582 | +| asr_rnnt | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | / | 1394.038574 | +| asr_rnnt | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | / | 647.142578 | +| asr_rnnt | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | / | 491.428833 | +| asr_rnnt | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | / | 253.110596 | +| asr_rnnt | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | / | 1431.238037 | +| asr_rnnt | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | / | 717.116455 | +| asr_rnnt | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | / | 406.063965 | +| asr_rnnt | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | / | 201.946533 | +| asr_rnnt | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | / | 1373.467163 | +| asr_rnnt | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | / | 645.445068 | +| asr_rnnt | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | / | 513.895874 | +| asr_rnnt | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | / | 258.136353 | +| asr_rnnt | caffe | 
ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | / | 1390.146973 | +| asr_rnnt | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | / | 689.771851 | +| asr_rnnt | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | / | 407.601929 | +| asr_rnnt | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | / | 204.034424 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | / | 73.672119 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | / | 38.642822 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | / | 20.154053 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | / | 11.269043 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | / | 80.286865 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | / | 43.187988 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | / | 17.701904 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | / | 9.718018 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | / | 70.837158 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | / | 38.571045 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | / | 20.748047 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | / | 11.482910 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | / | 78.658203 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | / | 41.130127 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | / | 17.866943 | +| asr_convolution_transformer_encoder | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | / | 9.725830 | +| asr_convolution_transformer_joint_net | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | / | 1.088867 | +| asr_convolution_transformer_joint_net | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | / | 0.776123 | +| asr_convolution_transformer_joint_net | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | / | 0.474121 | +| asr_convolution_transformer_joint_net | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | / | 0.277832 | +| asr_convolution_transformer_joint_net | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | / | 1.187012 | +| asr_convolution_transformer_joint_net | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | / | 0.599854 | +| asr_convolution_transformer_joint_net | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | / | 0.366211 | +| asr_convolution_transformer_joint_net | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | / | 0.197021 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | / | 12.655029 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | / | 5.782227 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | / | 3.854980 | +| asr_convolution_transformer_prediction_net | 
caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | / | 2.052002 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | / | 12.645996 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | / | 6.840088 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | / | 3.276123 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | / | 1.738037 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | / | 11.480957 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | / | 6.652100 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | / | 3.854004 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | / | 2.174072 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | / | 12.539062 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | / | 6.432129 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | / | 3.377930 | +| asr_convolution_transformer_prediction_net | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | / | 1.842041 | +| birealnet18 | onnx | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | / | 68.122700 | +| birealnet18 | onnx | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | / | 27.371033 | +| birealnet18 | onnx | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | / | 72.439991| +| birealnet18 | onnx | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | / | 22.747310 | +| birealnet18 | onnx | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | / | 64.079100 | +| birealnet18 | onnx | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | / | 23.102961| +| birealnet18 | onnx | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | / | 68.148505 | +| birealnet18 | onnx | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | / | 19.146468 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 |/ | 10.235107 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 |/ | 5.970947 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 |/ | 2.722900 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 |/ | 1.936035 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 |/ | 10.719971 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 |/ | 6.263916 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 |/ | 2.169189 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 |/ | 1.608887 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 |/ | 9.856934 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 |/ | 6.613037 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 |/ | 2.770996 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 |/ | 1.895996 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 |/ | 9.671875 
| +| fingerprint_resnet18 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 |/ | 6.834961 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 |/ | 2.219971 | +| fingerprint_resnet18 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 |/ | 1.579102 | +| tinybert384 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | / | 22.581055 | +| tinybert384 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | / | 11.834961 | +| tinybert384 | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | int8 | cpu | 1 | / | 9.236084 | +| tinybert384 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | / | 6.566895 | +| tinybert384 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | / | 4.331055 | +| tinybert384 | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | int8 | cpu | 1 | / | 2.608154 | +| tinybert384 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | / | 25.033936 | +| tinybert384 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | / | 12.445801 | +| tinybert384 | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | int8 | cpu | 1 | / | 8.791992 | +| tinybert384 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | / | 5.834229 | +| tinybert384 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | / | 4.145020 | +| tinybert384 | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | int8 | cpu | 1 | / | 2.248047 | +| tinybert384 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | / | 23.499023 | +| tinybert384 | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | / | 11.804932 | +| tinybert384 | caffe | ubuntu16_04 | llvm | 810 | A55 | int8 | cpu | 1 | / | 10.481934 | +| tinybert384 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | / | 6.758057 | +| tinybert384 | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | / | 4.487061 | +| tinybert384 | caffe | ubuntu16_04 | llvm | 810 | A76 | int8 | cpu | 1 | / | 2.980957 | +| tinybert384 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | / | 24.321045 | +| tinybert384 | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | / | 11.989014 | +| tinybert384 | caffe | ubuntu16_04 | llvm | 990 | A55 | int8 | cpu | 1 | / | 11.645020 | +| tinybert384 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | / | 5.864990 | +| tinybert384 | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | / | 3.717041 | +| tinybert384 | caffe | ubuntu16_04 | llvm | 990 | A76 | int8 | cpu | 1 | / | 2.541992 | +| tts_postnet | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | / | 190.645020 | +| tts_postnet | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | / | 76.032959 | +| tts_postnet | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | / | 50.892822 | +| tts_postnet | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | / | 26.031006 | +| tts_postnet | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | / | 196.128906 | +| tts_postnet | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | / | 78.461182 | +| tts_postnet | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | / | 45.419922 | +| tts_postnet | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | / | 23.227051 | +| tts_postnet | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | / | 177.791016 | +| tts_postnet | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | / | 76.929932 | +| tts_postnet | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | / | 50.316162 | +| tts_postnet | caffe | ubuntu16_04 | llvm 
| 810 | A76 | fp16 | cpu | 1 | / | 26.363037 | +| tts_postnet | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | / | 187.880859 | +| tts_postnet | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | / | 77.622070 | +| tts_postnet | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | / | 45.151123 | +| tts_postnet | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | / | 23.705078 | +| tinybert_onnx | onnx | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | / | 16.797119 | +| tinybert_onnx | onnx | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | / | 9.298096 | +| tinybert_onnx | onnx | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | / | 4.791992 | +| tinybert_onnx | onnx | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | / | 3.198975 | +| tinybert_onnx | onnx | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | / | 18.088135 | +| tinybert_onnx | onnx | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | / | 9.281006 | +| tinybert_onnx | onnx | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | / | 4.185059 | +| tinybert_onnx | onnx | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | / | 2.751953 | +| tinybert_onnx | onnx | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | / | 17.781006 | +| tinybert_onnx | onnx | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | / | 10.011963 | +| tinybert_onnx | onnx | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | / | 5.114014 | +| tinybert_onnx | onnx | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | / | 3.356934 | +| tinybert_onnx | onnx | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | / | 19.456055 | +| tinybert_onnx | onnx | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | / | 10.182861 | +| tinybert_onnx | onnx | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | / | 4.295898 | +| tinybert_onnx | onnx | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | / | 2.818848 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | / | 1678.862793 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | / | 776.404053 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | / | 525.206055 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | / | 263.708008 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | / | 1764.489014 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | / | 813.339844 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | / | 417.965088 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | / | 215.749023 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | / | 1681.593994 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | / | 745.455078 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | / | 493.489014 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | / | 261.107910 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | / | 1768.612061 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | / | 784.126953 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | / | 410.212891 | +| tts_melgan_vocoder | onnx | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | / | 211.166016 | +| tts_encoder_decoder | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | 
cpu | 1 | / | 420.667969 | +| tts_encoder_decoder | caffe | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | / | 216.756104 | +| tts_encoder_decoder | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | / | 150.831055 | +| tts_encoder_decoder | caffe | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | / | 76.963867 | +| tts_encoder_decoder | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | / | 452.342041 | +| tts_encoder_decoder | caffe | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | / | 227.301025 | +| tts_encoder_decoder | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | / | 126.700195 | +| tts_encoder_decoder | caffe | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | / | 64.525879 | +| tts_encoder_decoder | caffe | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | / | 418.912109 | +| tts_encoder_decoder | caffe | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | / | 210.114990| +| tts_encoder_decoder | caffe | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | / | 150.854004 | +| tts_encoder_decoder | caffe | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | / | 76.160156| +| tts_encoder_decoder | caffe | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | / | 449.791016 | +| tts_encoder_decoder | caffe | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | / | 222.086914 | +| tts_encoder_decoder | caffe | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | / | 124.972168 | +| tts_encoder_decoder | caffe | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | / | 65.025146 | +| vad | tflite | ubuntu16_04 | gcc8.3 | 810 | A55 | fp32 | cpu | 1 | / | 10.746826 | +| vad | tflite | ubuntu16_04 | gcc8.3 | 810 | A55 | fp16 | cpu | 1 | / | 8.893066 | +| vad | tflite | ubuntu16_04 | gcc8.3 | 810 | A76 | fp32 | cpu | 1 | / | 2.525146 | +| vad | tflite | ubuntu16_04 | gcc8.3 | 810 | A76 | fp16 | cpu | 1 | / | 2.598145 | +| vad | tflite | ubuntu16_04 | gcc8.3 | 990 | A55 | fp32 | cpu | 1 | / | 10.649902 | +| vad | tflite | ubuntu16_04 | gcc8.3 | 990 | A55 | fp16 | cpu | 1 | / | 8.578857 | +| vad | tflite | ubuntu16_04 | gcc8.3 | 990 | A76 | fp32 | cpu | 1 | / | 2.264893 | +| vad | tflite | ubuntu16_04 | gcc8.3 | 990 | A76 | fp16 | cpu | 1 | / | 2.222168 | +| vad | tflite | ubuntu16_04 | llvm | 810 | A55 | fp32 | cpu | 1 | / | 11.018066 | +| vad | tflite | ubuntu16_04 | llvm | 810 | A55 | fp16 | cpu | 1 | / | 11.024170 | +| vad | tflite | ubuntu16_04 | llvm | 810 | A76 | fp32 | cpu | 1 | / | 3.061035 | +| vad | tflite | ubuntu16_04 | llvm | 810 | A76 | fp16 | cpu | 1 | / | 2.907959 | +| vad | tflite | ubuntu16_04 | llvm | 990 | A55 | fp32 | cpu | 1 | / | 11.968994 | +| vad | tflite | ubuntu16_04 | llvm | 990 | A55 | fp16 | cpu | 1 | / | 11.719971 | +| vad | tflite | ubuntu16_04 | llvm | 990 | A76 | fp32 | cpu | 1 | / | 2.645020 | +| vad | tflite | ubuntu16_04 | llvm | 990 | A76 | fp16 | cpu | 1 | / | 2.597900 | diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 79ef083b..ff974e3d 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -7,6 +7,18 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](). +## [1.0.0] - 2020-11-20 + +### Added + +- Support fp32 on X86 AVX2 CPU +- Support partial fp32 operator(convolution, lstm) multi-threads parallel +- Support Tensorflow model +- Support more networks(Pointnet, ...) 
+- Support int8 inference for more networks (TinyBert, NMT, ASR)
+- Support time-series data acceleration
+- Support Apple iOS phones
+
 ## [0.3.0] - 2020-06-01
diff --git a/docs/DEVELOPER.md b/docs/DEVELOPER.md
index 8eabde05..3df69f1e 100644
--- a/docs/DEVELOPER.md
+++ b/docs/DEVELOPER.md
@@ -1,613 +1,418 @@
-# Customize Models
+Before working through this developer guide of bolt, you are strongly recommended to read the [code architecture](ARCHITECTURE.md) in advance. It will give you a deep understanding of the whole design of bolt, which will help you develop bolt more efficiently.
-- ### model-tools customization
+If you want to verify your model quickly, you can use the out-of-the-box C API or Java API to infer your model and check the inference result. If your model runs on time series data, you can use “Flow” to accelerate the inference. What's more, if you encounter unsupported operators during the conversion or inference of your model, you can customize the unsupported operators step by step, as described in detail in this document.
-  ​ In model-tools, you can define any operator for model conversion.
+[Use out-of-the-box API to infer your model](#use-out-of-the-box-api-to-infer-your-model)
+&nbsp;&nbsp;&nbsp;&nbsp;[C API](#c-api)
+&nbsp;&nbsp;&nbsp;&nbsp;[Java API](#java-api)
+[Accelerate time series model by Flow](#accelerate-time-series-model-by-flow)
+[Customize models with unsupported operators step by step](#customize-models-with-unsupported-operators-step-by-step)
+&nbsp;&nbsp;&nbsp;&nbsp;[model conversion customization](#model-conversion-customization)
+&nbsp;&nbsp;&nbsp;&nbsp;[tensor computing customization](#tensor-computing-customization)
+&nbsp;&nbsp;&nbsp;&nbsp;[inference's engine customization](#inference's-engine-customization)
+[How to contribute](#how-to-contribute)
+&nbsp;&nbsp;&nbsp;&nbsp;[submit issue](#submit-issue)
+&nbsp;&nbsp;&nbsp;&nbsp;[pull request](#pull-request)
-  1. Switch to code of the specific framework (caffe/onnx/tflite) you are working on;
-  2. Judge the op whether it is a weight-op or non-weight-op;
-  3. Define the Operator parameter format;
-  4. Extract the meta information of the op;
-  5. Extract the weight data if the op is a weight-op, otherwise skip this step.
-
+# Use out-of-the-box API to infer your model
-
+## C API
-  - [ ] caffe converter
+Bolt provides a C API document generated by doxygen to help you use the [C API](../inference/engine/api/c/bolt.h), together with a detailed [example](../inference/examples/c_api/test_api_c.c). You can compile it and link the *libbolt.so* library with your C/C++ project.
-    - add `pooling` in caffe converter
+## Java API
-      (1) Switch to bolt/model-tools/src/caffe, which is the caffe converter for bolt;
+Bolt provides a Java API document generated by doxygen to help you use the [Java API](../inference/engine/api/java), together with a detailed [example](../inference/examples/java_api/test_api_java.java). You can compile bolt and load *libBoltModel.so* through the Java Native Interface (JNI) in your Java project.
-      (2) Pooling is non-weight-op.
+# Accelerate time series model by Flow
-      (3) Define `pooling` parameter format.
+Flow provides an API document generated by doxygen to help you use the [Flow C++ header](../inference/flow/include), together with examples ([tinybert](../inference/examples/bert/flow_tinybert.cpp), [faceSR](../inference/examples/facesr/flow_facesr.cpp), [ASR](../inference/examples/automatic_speech_recognition/flow_asr.cpp)).
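+Before diving into the individual steps below, a minimal C++ sketch of a Flow client may help to fix ideas. It only mirrors the flow described in this section: the exact signatures of *init*, *enqueue*, *dequeue* and *size*, the precision/affinity constants, and the `prepareInputs` helper are illustrative assumptions, so check the [Flow C++ header](../inference/flow/include) and the examples above for the authoritative interface.
+
+```
+// Minimal Flow client sketch (names and signatures are assumptions; see flow.h).
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include "flow.h"  // Flow C++ header under inference/flow/include (path assumed)
+
+// Hypothetical helper that fills the graph's input tensors; not part of Flow.
+std::map<std::string, std::shared_ptr<Tensor>> prepareInputs();
+
+int main()
+{
+    std::string graphPath = "./flow_facesr.prototxt";  // graph defined via flow.proto
+
+    // Declare a Flow object; set the CPU core count and whether to use the GPU.
+    Flow flow;
+    flow.init({graphPath}, DT_F32, AFFINITY_CPU_HIGH_PERFORMANCE,
+        /*cpuThreads=*/2, /*useGPU=*/false);
+
+    // Describe one inference request in the Task format and enqueue it.
+    Task task(graphPath, prepareInputs());
+    flow.enqueue(task);
+
+    // Results come back in FIFO order; blocking waits for all enqueued
+    // tasks. flow.size() reports how many tasks are still unfinished.
+    std::vector<Task> results = flow.dequeue(/*block=*/true);
+    return results.empty() ? 1 : 0;
+}
+```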
-      Add `pooling` definition of bolt in bolt/model-tools/include/model_tools.h
+- ## Generally, it includes the following steps:
-      ```
-      // Addition begin
-      typedef struct {
-          U32 kernel_size_h;
-          U32 kernel_size_w;
-          U32 stride_h;
-          U32 stride_w;
-          U32 padding_top;
-          U32 padding_bottom;
-          U32 padding_left;
-          U32 padding_right;
-          RoundMode rm;
-          PoolingMode mode;
-      } PoolingParamSpec;
-      // Addition end
-      ```
+  - ### Use [predefined flow protobuf standard](../inference/flow/src/flow.proto) to define a graph
-      Modified "OperatorType" data structure in bolt/uni/include/op_type.h
+    Here is an example graph file for the CV application faceSR: [flow_facesr.prototxt](../inference/examples/facesr/flow_facesr.prototxt). This graph has one input, one input node, one inference node and one output. Input nodes need to be marked as *Input*, and inference nodes need to be marked as *Inference*. Each node can have multiple input or output tensors, and each node type has its own typical fields.
-      ```
-      typedef enum {
-          ...
-          OT_Pooling, // Addition
-          ...
-      } OperatorType
-      ```
+  - ### Add output tensor size infer function for each node, and register function to Flow function manager (optional)
-      Modified "inline const char* const* OperatorTypeName()" function in bolt/uni/include/op_type.h
+    Because faceSR has no post-processing function, the node's output tensor can use the model inference result tensor directly.
-      ```
-      inline const char* const* OperatorTypeName() {
-          static const char* const names[] = {
-              ...
-              "OT_Pooling", // Addition, please corresponds to the OperatorType
-              ...
-          }
-      }
-      ```
-
-      (4) Extract the meta information of `pooling` operator in caffe.
-
-      Modified the function named "OperatorType convert_caffe_type(std::string inputType)" in bolt/model-tools/caffe/caffe_adaptee.h .
-
-      Add the caffe type mapping code as following:
-
-      ```
-      OperatorType convert_caffe_type(std::string inputType) {
-          // Addition begin
-          if (inputType == "Pooling") {
-              return OT_Pooling;
-          } // Addition end
-          else if (inputType == "Convolution") {
-              ...
-          }
-      }
-      ```
-
-      Extract the meta information of pooling operator from caffe model, add the function "ParameterSpec adapt_Pooling() override" in bolt/model-tools/caffe/caffe_adaptee.h
-
-      ```
-      // Addition begin
-      ParameterSpec adapt_Pooling() override {
-          ParameterSpec curPs;
-          PoolingParamSpec pps;
-          if (layer.pooling_param().has_kernel_w() && layer.pooling_param().has_kernel_h()) {
-              pps.kernel_size_w = layer.pooling_param().kernel_w();
-              pps.kernel_size_h = layer.pooling_param().kernel_h();
-          } else {
-              pps.kernel_size_h = layer.pooling_param().kernel_size();
-              pps.kernel_size_w = pps.kernel_size_h;
-          }
-          if (layer.pooling_param().has_stride_w() && layer.pooling_param().has_stride_h()) {
-              pps.stride_w = layer.pooling_param().stride_w();
-              pps.stride_h = layer.pooling_param().stride_h();
-          } else {
-              pps.stride_h = layer.pooling_param().stride();
-              pps.stride_w = pps.stride_h;
-          }
-          bool global_pooling = layer.pooling_param().global_pooling();
-          if (global_pooling) {
-              pps.kernel_size_h = 0;
-              pps.kernel_size_w = 0;
-              pps.stride_h = 1;
-              pps.stride_w = 1;
-          }else {
-              CHECK_REQUIREMENT(pps.kernel_size_h > 0);
-          }
-          if (layer.pooling_param().has_pad_w() && layer.pooling_param().has_pad_h()) {
-              pps.padding_left = layer.pooling_param().pad_w();
-              pps.padding_right = pps.padding_left;
-              pps.padding_top = layer.pooling_param().pad_h();
-              pps.padding_bottom = pps.padding_top;
-          } else {
-              pps.padding_top = layer.pooling_param().has_pad() ?
layer.pooling_param().pad() : 0; - pps.padding_bottom = pps.padding_top; - pps.padding_left = pps.padding_top; - pps.padding_right = pps.padding_top; - } - - if (layer.pooling_param().has_round_mode() && layer.pooling_param().round_mode() == 1) { - pps.rm = FLOOR; - }else{ - pps.rm = CEIL; - } - switch (layer.pooling_param().pool()) { - case caffe::PoolingParameter_PoolMethod_MAX: { - pps.mode = POOLING_MAX; - break; - } - case caffe::PoolingParameter_PoolMethod_AVE: { - pps.mode = POOLING_MEAN; - break; - } - default: { - std::cerr << "[ERROR] encounter unsupported Pooling method " << layer.pooling_param().pool() << std::endl; - break; - } - } - curPs.pooling_spec = pps; - return curPs; - } - // Addition end - ``` - - (5) Pooling is non-weight op, skip this step. - - - - - [ ] onnx converter - - - add `pooling` in onnx converter - - (1) Switch to bolt/model-tools/src/onnx, which is the onnx converter for bolt; - - (2) Pooling is non-weight-op; - - (3) Define `pooling` parameter format. - - Note: Definition actions same with add pooling in caffe converter step(3) . Please refer the former content. - - (4) Extract the meta information of `pooling` operator in onnx. - - Modified the function named "OperatorType convert_onnx_type(std::string inputType)" in bolt/model-tools/onnx/onnx_adaptee.h . - - Add the onnx type mapping code as following: - - ``` - OperatorType convert_onnx_type(std::string inputType) { - // Addition begin - if (inputType == "AveragePool" || inputType == "MaxPool") { - return OT_Pooling; - } // Addition end - else if (inputType == "Conv") { - ... - } - } - ``` - - Extract the meta information of pooling operator from onnx model, add the function "ParameterSpec adapt_Pooling() override" in bolt/model-tools/onnx/onnx_adaptee.h - - ``` - // Addition begin - ParameterSpec adapt_Pooling() override - { - ParameterSpec curPs; - PoolingParamSpec pps; - std::string autoPad = get_node_str_attribute_by_name(node, "auto_pad"); - std::vector kernelShape = get_node_vector_ints_attribute_by_name(node, "kernel_shape"); - std::vector strides = get_node_vector_ints_attribute_by_name(node, "strides"); - std::vector pads = get_node_vector_ints_attribute_by_name(node, "pads"); - - if (op == "AveragePool" || op == "ReduceMean") { - pps.mode = POOLING_MEAN; - } else { - pps.mode = POOLING_MAX; - } - - if (autoPad == "SAME_UPPER") { - pps.rm = CEIL; - } else { - pps.rm = FLOOR; - } - - if (kernelShape.size() == 2) { - pps.kernel_size_h = kernelShape[0]; - pps.kernel_size_w = kernelShape[1]; - } else { - pps.kernel_size_h = 0; - pps.kernel_size_w = 0; - std::cerr << "[Info] pooling: kernel_size unknown. This could be global pooling." << std::endl; - } - - if (strides.size() == 2) { - pps.stride_h = strides[0]; - pps.stride_w = strides[1]; - } else { - pps.stride_h = 0; - pps.stride_w = 0; - std::cerr << "[Info] pooling: stride unknown. This could be global pooling." << std::endl; - } - - if (pads.size() == 4) { - pps.padding_top = pads[0]; - pps.padding_bottom = pads[2]; - pps.padding_left = pads[1]; - pps.padding_right = pads[3]; - } else { - pps.padding_top = 0; - pps.padding_bottom = 0; - pps.padding_left = 0; - pps.padding_right = 0; - } - curPs.pooling_spec = pps; - return curPs; - } - // Addition end - ``` - - (5) Pooling is non-weight op, skip this step. - - - - - [ ] tflite converter - - - add `pooling` in tflite converter - - (1) Switch to bolt/model-tools/src/onnx, which is the onnx converter for bolt; - - (2) Pooling is non-weight-op; - - (3) Define `pooling` parameter format. 
-
-      Note: Definition actions same with add pooling in caffe converter step(3) . Please refer the former content.
-
-      (4) Extract the meta information of `pooling` operator in tflite.
-
-      Modified the function named "OperatorType convert_tflite_type(std::string inputType)" in bolt/model-tools/tflite/tflite_adaptee.h .
-
-      Add the tflite type mapping code as following:
-
-      ```
-      OperatorType convert_tflite_type(tflite::BuiltinOperator tfliteType) {
-          // Addition begin
-          if (tfliteType == tflite::BuiltinOperator_MAX_POOL_2D) {
-              return OT_Pooling;
-          } // Addition end
-          else if (tfliteType == tflite::BuiltinOperator_CONCATENATION) {
-              ...
-          }
-      }
-      ```
-
-      Extract the meta information of pooling operator from tflite model, add the function "ParameterSpec adapt_Pooling() override" in bolt/model-tools/tflite/tflite_adaptee.h
-
-      ```
-      // Addition begin
-      ParameterSpec adapt_Pooling() override
-      {
-          ParameterSpec curPs;
-          const auto& tflitePoolOption = ops[curIndex]->builtin_options.AsPool2DOptions();
-          PoolingParamSpec poolingPs;
-          poolingPs.kernel_size_h = tflitePoolOption->filter_height;
-          poolingPs.kernel_size_w = tflitePoolOption->filter_width;
-          poolingPs.stride_h = tflitePoolOption->stride_h;
-          poolingPs.stride_w = tflitePoolOption->stride_w;
-          poolingPs.padding_top = 0;
-          poolingPs.padding_bottom = 0;
-          poolingPs.padding_left = 0;
-          poolingPs.padding_right = 0;
-          poolingPs.rm = CEIL;
-          if (opCode == tflite::BuiltinOperator_MAX_POOL_2D) {
-              poolingPs.mode = POOLING_MAX;
-          } else if (opCode == tflite::BuiltinOperator_AVERAGE_POOL_2D) {
-              poolingPs.mode = POOLING_MEAN;
-          }
-          curPs.pooling_spec = poolingPs;
-          return curPs;
-      }
-      // Addition end
-      ```
-
-      (5) Pooling is non-weight op, skip this step.
+    If you have a post-processing function, you can refer to [flow_tinybert](../inference/examples/bert/flow_tinybert.cpp), which defines the *tinybertInferOutputSize* function and registers it with the *flowRegisterFunction* API.
+
+  - ### Add input tensor preprocess function for each node, and register function to Flow function manager (optional)
+
+    *(same as output tensor size infer function)*
+
+  - ### Add output tensor postprocess function for each node, and register function to Flow function manager (optional)
+
+    *(same as output tensor size infer function)*
+
+  - ### Define a *Flow* object, and add task
+
+    Declare a *Flow* object, then set the number of CPU cores to use and whether to use the GPU. Use the *Task* format to describe a task. Use the *enqueue* API to add a task into the Flow heterogeneous executor, as in the sketch above.
+
+  - ### Get Flow process result
+
+    Use the *dequeue* API to get the results of already finished tasks, in FIFO order. You can choose to block until all enqueued tasks have finished. The *size* function can be used to query the number of unfinished tasks.
+
+# Customize models with unsupported operators step by step
+
+## model conversion customization
+
+In [model_tools](../model_tools), you can define any operator for model conversion.
+
+1. Switch to code of the specific framework (caffe/onnx/tflite) you are working on;
+2. Judge whether the op is a weight-op or a non-weight-op;
+3. Define the Operator parameter format;
+4. Extract the meta information of the op;
+5. Extract the weight data if the op is a weight-op, otherwise skip this step.
+
+- Example: support `pooling` in caffe converter
+  1. Switch to [model_tools/src/caffe](../model_tools/src/caffe), which is the caffe converter for bolt;
+
+  2. Judgment: pooling is non-weight-op.
+
+  3. Define `pooling` parameter format.
+ + 3.1 Add `pooling` definition of bolt in [model_tools/include/model_tools.h](../common/uni/include/types.h) + + ``` + // Addition ======> + typedef struct { + U32 kernel_h; + U32 kernel_w; + U32 stride_h; + U32 stride_w; + U32 padding_top; + U32 padding_bottom; + U32 padding_left; + U32 padding_right; + RoundMode rm; + PoolingMode mode; + } PoolingParamSpec; + // <====== Addition end + ``` + + 3.2 Modify "OperatorType" data structure in [common/uni/include/op_type.h](../common/uni/include/op_type.h) + + ``` + typedef enum { + ... + OT_Pooling, // Addition + ... + } OperatorType + ``` + + 3.3 Modify "inline const char* const* OperatorTypeName()" function in [common/uni/include/op_type.h](../common/uni/include/op_type.h) + + ``` + inline const char* const* OperatorTypeName() { + static const char* const names[] = { + ... + "OT_Pooling", // Addition, please corresponds to the OperatorType + ... + } + } + ``` + + 3.4 Modify "int get_operator_parameter_size(OperatorType operatorType)" function in [model_tools/src/model_deserialize.cpp](../model_tools/src/model_deserialize.cpp) + + ``` + std::map operatorParameterSizeMap = { + ... + {OT_Pooling, sizeof(PoolingParamSpec)}}; + ``` + + 4. Extract the meta information of `pooling` operator in caffe. + + 4.1 Modify the function named "OperatorType convert_caffe_type(std::string inputType)" in [model_tools/caffe/caffe_adaptee.h](../model_tools/src/caffe/caffe_adaptee.h). + + Add the caffe type mapping code as following: + + ``` + OperatorType convert_caffe_type(std::string inputType) { + // Addition ======> + if (inputType == "Pooling") { + return OT_Pooling; + } // <====== Addition + else if (inputType == "Convolution") { + ... + } + } + ``` + + 4.2 Extract the meta information of pooling operator from caffe model, add the function "ParameterSpec adapt_Pooling() override" in [model_tools/caffe/caffe_adaptee.h](../model_tools/src/caffe/caffe_adaptee.h). + + ``` + // Addition ======> + ParameterSpec adapt_Pooling() override { + ParameterSpec curPs; + PoolingParamSpec pps; + if (layer.pooling_param().has_kernel_w() && layer.pooling_param().has_kernel_h()) { + pps.kernel_w = layer.pooling_param().kernel_w(); + pps.kernel_h = layer.pooling_param().kernel_h(); + } else { + pps.kernel_h = layer.pooling_param().kernel_size(); + pps.kernel_w = pps.kernel_h; + } + if (layer.pooling_param().has_stride_w() && layer.pooling_param().has_stride_h()) { + pps.stride_w = layer.pooling_param().stride_w(); + pps.stride_h = layer.pooling_param().stride_h(); + } else { + pps.stride_h = layer.pooling_param().stride(); + pps.stride_w = pps.stride_h; + } + bool global_pooling = layer.pooling_param().global_pooling(); + if (global_pooling) { + pps.kernel_h = 0; + pps.kernel_w = 0; + pps.stride_h = 1; + pps.stride_w = 1; + }else { + CHECK_REQUIREMENT(pps.kernel_h > 0); + } + if (layer.pooling_param().has_pad_w() && layer.pooling_param().has_pad_h()) { + pps.padding_left = layer.pooling_param().pad_w(); + pps.padding_right = pps.padding_left; + pps.padding_top = layer.pooling_param().pad_h(); + pps.padding_bottom = pps.padding_top; + } else { + pps.padding_top = layer.pooling_param().has_pad() ? 
layer.pooling_param().pad() : 0; + pps.padding_bottom = pps.padding_top; + pps.padding_left = pps.padding_top; + pps.padding_right = pps.padding_top; + } + + if (layer.pooling_param().has_round_mode() && layer.pooling_param().round_mode() == 1) { + pps.rm = FLOOR; + }else{ + pps.rm = CEIL; + } + switch (layer.pooling_param().pool()) { + case caffe::PoolingParameter_PoolMethod_MAX: { + pps.mode = POOLING_MAX; + break; + } + case caffe::PoolingParameter_PoolMethod_AVE: { + pps.mode = POOLING_MEAN; + break; + } + default: { + std::cerr << "[ERROR] encounter unsupported Pooling method " << layer.pooling_param().pool() << std::endl; + break; + } + } + curPs.pooling_spec = pps; + return curPs; + } + // <====== Addition + ``` + + 5. Pooling is non-weight op, skip this step. + +- Example: support `pooling` in onnx converter + + 1. Switch to [model_tools/src/onnx](../model_tools/src/onnx), which is the onnx converter for bolt; + + 2. Judgment: pooling is non-weight-op; + + 3. Define `pooling` parameter format. + + Note: Definition actions same with add pooling in caffe converter step 3 . Please refer the former content. + + 4. Extract the meta information of `pooling` operator in onnx. + + 4.1 Modify the function named "OperatorType convert_onnx_type(std::string inputType)" in [model_tools/onnx/onnx_adaptee.h](../model_tools/src/onnx/onnx_adaptee.h). + + Add the onnx type mapping code as following: + + ``` + OperatorType convert_onnx_type(std::string inputType) { + // Addition ======> + if (inputType == "AveragePool" || inputType == "MaxPool") { + return OT_Pooling; + } // <====== Addition + else if (inputType == "Conv") { + ... + } + } + ``` + + 4.2 Extract the meta information of pooling operator from onnx model, add the function "ParameterSpec adapt_Pooling() override" in [model_tools/onnx/onnx_adaptee.h](../model_tools/src/onnx/onnx_adaptee.h). + + ``` + // Addition ======> + ParameterSpec adapt_Pooling() override + { + ParameterSpec curPs; + PoolingParamSpec pps; + std::string autoPad = get_node_str_attribute_by_name(node, "auto_pad"); + std::vector kernelShape = get_node_vector_ints_attribute_by_name(node, "kernel_shape"); + std::vector strides = get_node_vector_ints_attribute_by_name(node, "strides"); + std::vector pads = get_node_vector_ints_attribute_by_name(node, "pads"); + + if (op == "AveragePool" || op == "ReduceMean") { + pps.mode = POOLING_MEAN; + } else { + pps.mode = POOLING_MAX; + } + if (autoPad == "SAME_UPPER") { + pps.rm = CEIL; + } else { + pps.rm = FLOOR; + } + + if (kernelShape.size() == 2) { + pps.kernel_h = kernelShape[0]; + pps.kernel_w = kernelShape[1]; + } else { + pps.kernel_h = 0; + pps.kernel_w = 0; + std::cerr << "[Info] pooling: kernel_size unknown. This could be global pooling." << std::endl; + } + + if (strides.size() == 2) { + pps.stride_h = strides[0]; + pps.stride_w = strides[1]; + } else { + pps.stride_h = 0; + pps.stride_w = 0; + std::cerr << "[Info] pooling: stride unknown. This could be global pooling." << std::endl; + } + + if (pads.size() == 4) { + pps.padding_top = pads[0]; + pps.padding_bottom = pads[2]; + pps.padding_left = pads[1]; + pps.padding_right = pads[3]; + } else { + pps.padding_top = 0; + pps.padding_bottom = 0; + pps.padding_left = 0; + pps.padding_right = 0; + } + curPs.pooling_spec = pps; + return curPs; + } + // <======= Addition + ``` + + 5. Pooling is non-weight op, skip this step. + +- Example: support `pooling` in tflite converter + + 1. 
Switch to [model_tools/src/tflite](../model_tools/src/tflite), which is the tflite converter for bolt;
+
+  2. Judgment: pooling is non-weight-op;
+
+  3. Define `pooling` parameter format;
+
+     Note: The definition actions are the same as step 3 of adding pooling in the caffe converter. Please refer to the former content.
+
+  4. Extract the meta information of `pooling` operator in tflite.
+
+     4.1 Modify the function named "OperatorType convert_tflite_type(tflite::BuiltinOperator tfliteType)" in [model_tools/tflite/tflite_adaptee.h](../model_tools/src/tflite/tflite_adaptee.h).
+
+     Add the tflite type mapping code as follows:
+
+     ```
+     OperatorType convert_tflite_type(tflite::BuiltinOperator tfliteType) {
+         // Addition ======>
+         if (tfliteType == tflite::BuiltinOperator_MAX_POOL_2D) {
+             return OT_Pooling;
+         } // <====== Addition
+         else if (tfliteType == tflite::BuiltinOperator_CONCATENATION) {
+             ...
+         }
+     }
+     ```
+
+     4.2 Extract the meta information of pooling operator from tflite model, add the function "ParameterSpec adapt_Pooling() override" in [model_tools/tflite/tflite_adaptee.h](../model_tools/src/tflite/tflite_adaptee.h).
+
+     ```
+     // Addition ======>
+     ParameterSpec adapt_Pooling() override
+     {
+         ParameterSpec curPs;
+         const auto& tflitePoolOption = ops[curIndex]->builtin_options.AsPool2DOptions();
+         PoolingParamSpec poolingPs;
+         poolingPs.kernel_h = tflitePoolOption->filter_height;
+         poolingPs.kernel_w = tflitePoolOption->filter_width;
+         poolingPs.stride_h = tflitePoolOption->stride_h;
+         poolingPs.stride_w = tflitePoolOption->stride_w;
+         poolingPs.padding_top = 0;
+         poolingPs.padding_bottom = 0;
+         poolingPs.padding_left = 0;
+         poolingPs.padding_right = 0;
+         poolingPs.rm = CEIL;
+         if (opCode == tflite::BuiltinOperator_MAX_POOL_2D) {
+             poolingPs.mode = POOLING_MAX;
+         } else if (opCode == tflite::BuiltinOperator_AVERAGE_POOL_2D) {
+             poolingPs.mode = POOLING_MEAN;
+         }
+         curPs.pooling_spec = poolingPs;
+         return curPs;
+     }
+     // <====== Addition
+     ```
+
+  5. Pooling is non-weight op, skip this step.
+
+## tensor computing customization
-- ### tensor_computing customization
-
-  In tensor_computing, you can define any operator for operator computing process.
-
-- ### inference customization
-
-  In inference, you can define any operator for the inference of your model.
-
-  1. Add the definition of the specific operator in bolt/inference/include;
-  2. If the specific operator implement in CPU is different from its implement in GPU, implement should be divided into CPU and GPU version. If the specific operator implement in CPU is same with its implement in GPU, skip this step.
-
-
-
-  - [ ] Example: add `pooling` operator in bolt/inference
-
-    1.
Create `pooling.hpp` in bolt/inference/include , add the definition of `pooling` operator; - - ``` - // Addition begin - #ifndef _POOLING_H - #define _POOLING_H - #include "operator.hpp" - #include "tensor_computing.h" - - class Pooling: public Operator { - public: - Pooling(PoolingMode mode, U32 ksH, U32 ksW, U32 strideH, U32 strideW, - U32 paddingT, U32 paddingB, U32 paddingL, U32 paddingR, RoundMode rm) - { - this->mode = mode; - this->kernelSizeH = ksH; - this->kernelSizeW = ksW; - this->strideH = strideH; - this->strideW = strideW; - this->paddingT = paddingT; - this->paddingB = paddingB; - this->paddingL = paddingL; - this->paddingR = paddingR; - this->rm = rm; - } - - PoolingDesc create_PoolingDesc(PoolingMode pm, U32 ksH, U32 ksW, U32 strideH, U32 strideW, - U32 paddingT, U32 paddingB, U32 paddingL, U32 paddingR, RoundMode rm) - { - PoolingDesc poolingDesc; - poolingDesc.pm = pm; - poolingDesc.kernelSize_h = ksH; - poolingDesc.kernelSize_w = ksW; - poolingDesc.stride_h = strideH; - poolingDesc.stride_w = strideW; - poolingDesc.padding_top = paddingT; - poolingDesc.padding_bottom = paddingB; - poolingDesc.padding_left = paddingL; - poolingDesc.padding_right = paddingR; - poolingDesc.rm = rm; - return poolingDesc; - } - - void set_kernelSize(U32 globalKernelSizeH, U32 globalKernelSizeW) { - this->kernelSizeH = globalKernelSizeH; - this->kernelSizeW = globalKernelSizeW; - } - - void set_stride(U32 globalStrideH, U32 globalStrideW) { - this->strideH = globalStrideH; - this->strideW = globalStrideW; - } - - virtual void run() = 0; - virtual EE infer_output_tensors_size(Vec, Vec*) = 0; - #ifdef _USE_MALI - virtual EE infer_output_tensors_size(Vec, Vec*, Vec*, Vec*){return NOT_SUPPORTED;} - #endif - - protected: - PoolingMode mode; - RoundMode rm; - U32 kernelSizeH; - U32 kernelSizeW; - U32 strideH; - U32 strideW; - U32 paddingT; - U32 paddingB; - U32 paddingL; - U32 paddingR; - }; - - #endif //_POOLING_H - // Addition end - ``` - - 2. `pooling` operator implement in CPU is different from its implement in GPU. 
So `pooling` implement should be two version: CPU and GPU - - Create `pooling_cpu.hpp` and add `pooling` CPU implement in bolt/inference/include/cpu : - - ``` - #ifndef _POOLING_CPU_H - #define _POOLING_CPU_H - #include - #include "operator.hpp" - #include "tensor_computing.h" - #include "tensor_desc.h" - #include "model_tools.h" - #include "pooling.hpp" - - class PoolingCPU: public Pooling { - public: - - /** - * @param mode - * @param ks - * @param stride - * @param padding - * @param name - */ - PoolingCPU(PoolingMode mode, U32 ksH, U32 ksW, U32 strideH, U32 strideW, U32 paddingT, U32 paddingB, U32 paddingL, U32 paddingR, RoundMode rm): - Pooling(mode, ksH, ksW, strideH, strideW, paddingT, paddingB, paddingL, paddingR, rm){} - - virtual void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - Tensor inputTensor = this->inputTensors[0]; - TensorDesc inputDesc = inputTensor.get_desc(); - PoolingDesc poolingDesc = Pooling::create_PoolingDesc(this->mode, this->kernelSizeH, this->kernelSizeW, this->strideH, this->strideW, - this->paddingT, this->paddingB, this->paddingL, this->paddingR, this->rm); - Tensor outputTensor = this->outputTensors[0]; - TensorDesc outputDesc = outputTensor.get_desc(); - F16 scales[2]; - if (DT_I8 == inputDesc.dt) { - scales[0] = inputTensor.get_scale(); - } - CHECK_STATUS(pooling(inputDesc, inputTensor.get_val(), - poolingDesc, scales, - outputDesc, outputTensor.get_val(), - this->schedule)); - if (DT_I8 == inputDesc.dt) { - outputTensor.set_scale(scales[1]); - } - - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - - virtual EE infer_output_tensors_size(Vec inDims, Vec* outDims) override - { - auto inDim = inDims[0]; - DataType dt; - DataFormat df; - U32 width ; - U32 height; - U32 numChannels; - U32 num; - CHECK_STATUS(tensor4dGet(inDim, &dt, &df, &num, &numChannels, &height, &width)); - - TensorDesc inputDesc = tensor4df(dt, df, num, numChannels, height, width); - if (this->kernelSizeH == 0 && this->kernelSizeW == 0) { - Pooling::set_kernelSize(height, width); - Pooling::set_stride(1, 1); - } - PoolingDesc poolingDesc = Pooling::create_PoolingDesc(this->mode, this->kernelSizeH, this->kernelSizeW, this->strideH, this->strideW, - this->paddingT, this->paddingB, this->paddingL, this->paddingR, this->rm); - CHECK_STATUS(pooling_infer_output_size(inputDesc, poolingDesc, &((*outDims)[0]), this->schedule)); - return SUCCESS; - } - - }; - - #endif //_POOLINGCPU_H - ``` - - Create `pooling_ocl.hpp` and add `pooling` GPU implement in bolt/inference/include/ocl : - - ``` - #ifndef _POOLING_OCL_H - #define _POOLING_OCL_H - #include - #include "operator.hpp" - #include "tensor_computing.h" - #include "tensor_desc.h" - #include "model_tools.h" - #include "pooling.hpp" - - class PoolingOCL: public Pooling { - public: - - /** - * @param mode - * @param ks - * @param stride - * @param padding - * @param name - */ - PoolingOCL(PoolingMode mode, U32 ksH, U32 ksW, U32 strideH, U32 strideW, U32 paddingT, U32 paddingB, U32 paddingL, U32 paddingR, RoundMode rm): - Pooling(mode, ksH, ksW, strideH, strideW, paddingT, paddingB, paddingL, paddingR, rm){} - - virtual void run() override - { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - Tensor inputTensor = this->inputTensors[0]; - TensorDesc inputDesc = inputTensor.get_desc(); - PoolingDesc poolingDesc = Pooling::create_PoolingDesc(this->mode, this->kernelSizeH, this->kernelSizeW, this->strideH, this->strideW, - this->paddingT, this->paddingB, this->paddingL, this->paddingR, this->rm); - Tensor outputTensor = this->outputTensors[0]; - TensorDesc 
outputDesc = outputTensor.get_desc(); - F16 scales[2]; - if (DT_I8 == inputDesc.dt) { - scales[0] = inputTensor.get_scale(); - } - CHECK_STATUS(pooling(inputDesc, inputTensor.get_val(), - poolingDesc, scales, - outputDesc, outputTensor.get_val(), - this->schedule, &this->oclExtInfo)); - if (DT_I8 == inputDesc.dt) { - outputTensor.set_scale(scales[1]); - } - - UTIL_TIME_TOC(__CLASS_FUNCTION__) - } - - virtual EE infer_output_tensors_size(Vec inDims, Vec* outDims) override - { - auto inDim = inDims[0]; - DataType dt; - DataFormat df; - U32 width ; - U32 height; - U32 numChannels; - U32 num; - CHECK_STATUS(tensor4dGet(inDim, &dt, &df, &num, &numChannels, &height, &width)); - - TensorDesc inputDesc = tensor4df(dt, df, num, numChannels, height, width); - if (this->kernelSizeH == 0 && this->kernelSizeW == 0) { - Pooling::set_kernelSize(height, width); - Pooling::set_stride(1, 1); - } - PoolingDesc poolingDesc = Pooling::create_PoolingDesc(this->mode, this->kernelSizeH, this->kernelSizeW, this->strideH, this->strideW, - this->paddingT, this->paddingB, this->paddingL, this->paddingR, this->rm); - CHECK_STATUS(pooling_infer_output_size(inputDesc, poolingDesc, &((*outDims)[0]), this->schedule)); - return SUCCESS; - } - - virtual EE infer_output_tensors_size(Vec inDims, Vec* outDims, Vec* gclmemInputDesc, Vec* gclmemOutputDesc) override - { - auto inDim = inDims[0]; - DataType dt; - DataFormat df; - U32 width ; - U32 height; - U32 numChannels; - U32 num; - CHECK_STATUS(tensor4dGet(inDim, &dt, &df, &num, &numChannels, &height, &width)); - - this->oclExtInfo.maliInfo.gclmemInputDesc = &((*gclmemInputDesc)[0]); - this->oclExtInfo.maliInfo.gclmemOutputDesc = &((*gclmemOutputDesc)[0]); - TensorDesc inputDesc = tensor4df(dt, df, num, numChannels, height, width); - if (this->kernelSizeH == 0 && this->kernelSizeW == 0) { - Pooling::set_kernelSize(height, width); - Pooling::set_stride(1, 1); - } - - PoolingDesc poolingDesc = Pooling::create_PoolingDesc(this->mode, this->kernelSizeH, this->kernelSizeW, this->strideH, this->strideW, - this->paddingT, this->paddingB, this->paddingL, this->paddingR, this->rm); - CHECK_STATUS(pooling_infer_output_size(inputDesc, poolingDesc, &((*outDims)[0]), this->schedule, &this->oclExtInfo)); - return SUCCESS; - } - - - private: - }; - - #endif //_POOLING_OCL_H - ``` +In [tensor](../compute/tensor), you can define any operator for operator computing process. + +1. Create a new operator file in [compute/tensor/src](../compute/tensor/src); +2. The implementation of the operator is related to the backends(x86 CPU, ARM CPU, GPU), for a specific backend, you need to add the corresponding operator implementation to the specific folder in [compute/tensor/src](../compute/tensor/src). + +- Example: add `pooling` operator in [tensor](../compute/tensor) + + 1. Create `pooling.cpp` in [compute/tensor/src](../compute/tensor/src), the complete implementation refers to [compute/tensor/src/pooling.cpp](../compute/tensor/src/pooling.cpp) + + 2. For ARM CPU, create `pooling.cpp` in [compute/tensor/src/arm/pooling.cpp](../compute/tensor/src/arm/pooling.cpp), and dispatch to implementations of different data type(bnn/fp16/fp32/int8). + + 3. 
For ARM GPU, create `pooling.cpp` in [compute/tensor/src/gpu/mali/pooling.cpp](../compute/tensor/src/gpu/mali/pooling.cpp); only fp16 is supported now ([compute/tensor/src/gpu/mali/fp16/pooling_mali_fp16.cpp](../compute/tensor/src/gpu/mali/fp16/pooling_mali_fp16.cpp)). Put your cl file in [compute/tensor/src/gpu/mali/cl/pooling_max.cl](../compute/tensor/src/gpu/mali/cl/pooling_max.cl); the file name of the cl file must be the same as the kernel name. If your kernel has compile options, create a .sh file in [common/gcl/tools/kernel_lib_compile/sh/compile](../common/gcl/tools/kernel_lib_compile/sh/compile); the file name of the .sh file must also be the same as the kernel name.
+
+## inference engine customization
+
+In [engine](../inference/engine), you can define any operator for the inference of your model.
+
+1. Add the definition of the specific operator in [inference/engine/include](../inference/engine/include);
+2. If the CPU implementation of the operator differs from its GPU implementation, the implementation should be divided into CPU and GPU versions. If the two implementations are the same, skip this step.
+
+- Example: add the `pooling` operator in [inference/engine](../inference/engine)
+
+  1. Create `pooling.hpp` in [inference/engine/include](../inference/engine/include) and add the definition of the `pooling` operator; see [inference/engine/include/pooling.hpp](../inference/engine/include/pooling.hpp) for the complete implementation.
+
+  2. The CPU implementation of the `pooling` operator differs from its GPU implementation, so `pooling` needs two versions: CPU and GPU.
+
+     (1) Create `pooling_cpu.hpp` and add the `pooling` CPU implementation in [inference/engine/include/cpu](../inference/engine/include/cpu); see [inference/engine/include/cpu/pooling_cpu.hpp](../inference/engine/include/cpu/pooling_cpu.hpp) for the complete implementation.
+
+     (2) Create `pooling_ocl.hpp` and add the `pooling` GPU implementation in [inference/engine/include/ocl](../inference/engine/include/ocl); see [inference/engine/include/ocl/pooling_ocl.hpp](../inference/engine/include/ocl/pooling_ocl.hpp) for the complete implementation.

# How to contribute

-- ### submit issue
+## submit issue

-  - [ ] question
+- question

-    Submit any question you have encountered when you use Bolt. You can give feedback to us through committing issues. Refer to https://github.com/huawei-noah/bolt/issues, create your new issue and submit it. The issue can be a bug in Bolt, a suggestion for Bolt, or anything you don't understand about Bolt.
+  Submit any question you have encountered when you use Bolt. You can give feedback to us by submitting issues: refer to https://github.com/huawei-noah/bolt/issues, create your new issue and submit it. The issue can be a bug in Bolt, a suggestion for Bolt, or anything you don't understand about Bolt.
-
-  - [ ] feature request
+- feature request
+
+  Submit any feature that you want but has not been implemented in Bolt. We have created a [special issue](https://github.com/huawei-noah/bolt/issues/5) and you can leave a comment under this issue. We will seriously consider the needs of all users and continue to enrich the functions of Bolt.
-    Submit any feature that you want but it has not been implemented in Bolt.
+## pull request

-- ### pull request

-  - [ ] add a license
-
-    Add the license at the head of your source codes indicating your codes will be open to all.
-
-  - [ ] provide an executable unit test
-
-    Fork [Bolt](https://github.com/huawei-noah/bolt) on your github account. Modify your code and make sure your code pass all testing cases. Commit the code and initiate a pull request on github.
-
-
-
+- add a license
+
+  Add the license at the head of your source codes, indicating that your codes will be open to all.
+
+- provide an executable unit test
+
+  Fork [Bolt](https://github.com/huawei-noah/bolt) on your github account. Modify your code and make sure your code passes all testing cases. Commit the code and initiate a pull request on github.
\ No newline at end of file
diff --git a/docs/FAQ.md b/docs/FAQ.md
new file mode 100644
index 00000000..df27e835
--- /dev/null
+++ b/docs/FAQ.md
@@ -0,0 +1,25 @@
+# FAQ on Bolt
+
+1. Why does configuring bolt.cmake not take effect?
+
+   The [install.sh](install.sh) serves as an example of compilation setup, and it overwrites some settings in [bolt.cmake](common/cmakes/bolt.cmake). Please check install.sh first.
+
+2. More details about dependency libraries for cross-compilation?
+
+   The major dependency is Protobuf. protoc should match your build platform, but protobuf should be the ARM version.
+
+3. Restrictions for 1-bit BNN?
+
+   For BNN convolution layers, the number of input channels must be divisible by 32, and the output channels must be divisible by 16.
+
+4. Restrictions on quantization (int8)?
+
+   For the time being, Bolt only supports post-training int8 quantization. The quantization method is symmetrical for both activation and weight. We have added a calibration tool for image CNN pipelines. Please feel free to report cases of usage failure.
+
+5. Requirements for float16 and int8?
+
+   Only an ARMv8.2 CPU supports the float16 and int8 dot-product instructions.
+
+6. Restrictions for ARM Mali GPU?
+
+   Only *arm_llvm* compilation supports ARM Mali computing.
\ No newline at end of file
diff --git a/docs/INSTALL.md b/docs/INSTALL.md
index 6b27b0cf..4842bedc 100644
--- a/docs/INSTALL.md
+++ b/docs/INSTALL.md
@@ -1,172 +1,249 @@
-# Prerequisites
-
-- CMake
+This document will help you compile and install bolt on your server. Generally, you only need to be concerned about two parts. The first part is how to build bolt, which is described in the "Download and Build Bolt" section. The second part applies when you fail to build bolt: check the "Prerequisites" section, which lists what bolt requires.

-  We use [cmake v3.15.1](https://cmake.org/files/v3.15/cmake-3.15.1-Linux-x86_64.tar.gz) to build Bolt. After installing the cmake, you need to set shell environment **PATH** to find it. You can use this simple test to confirm you have installed it successfully.
-
-  ```shell
-  cmake -version
-  ```
+[Download and Build Bolt](#download-and-build-bolt)
+&nbsp;&nbsp;&nbsp;&nbsp;[Options](#options)
+&nbsp;&nbsp;&nbsp;&nbsp;[Environment variables](#environment-variables)
+[Prerequisites](#prerequisites)
+&nbsp;&nbsp;&nbsp;&nbsp;[Compilation Tools](#compilation-tools)
+&nbsp;&nbsp;&nbsp;&nbsp;[Android Tools](#android-tools)
+&nbsp;&nbsp;&nbsp;&nbsp;[Other Dependency Libraries](#other-dependency-libraries)
+&nbsp;&nbsp;&nbsp;&nbsp;[Optional Software](#optional-software)

-- GNU make
-
-  We use [GNU make v3.81](http://ftp.gnu.org/gnu/make/make-3.81.tar.gz) to build Bolt. After installing the make, you also need to set shell environment **PATH**.
   Simple test:
-
-  ```shell
-  make -version
-  ```
-
-- Cross compiler
-
-  If you plan to directly compile Bolt on ARM platform and run on ARM, you can use gcc and skip this section.
-
-  NDK compiler uses Android NDK toolchains to build Bolt for Java APIs required by Android applications and ARM MALI GPU Bolt. GNU compiler uses gcc to build Bolt for simple ARM CPU tests. Please choose according to your scenario.
-
-  - Android NDK compiler
-
-    We use Android NDK [android-ndk-r20](https://dl.google.com/android/repository/android-ndk-r20b-linux-x86_64.zip?hl=zh-cn) to build Bolt. After installing the Android NDK, you need to set shell environment **PATH** to find *aarch64-linux-android21-clang++*. Simple test:
-
-    ```shell
-    aarch64-linux-android21-clang++ --version
-    ```
-
-  - GNU compiler
-
-    We use GNU compiler [gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu](https://developer.arm.com/-/media/Files/downloads/gnu-a/8.3-2019.03/binrel/gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz?revision=2e88a73f-d233-4f96-b1f4-d8b36e9bb0b9&la=en&hash=167687FADA00B73D20EED2A67D0939A197504ACD) to build Bolt. You need to set shell environment **PATH** to find *aarch64-linux-gnu-g++*. Simple test:
-
-    ```shell
-    aarch64-linux-android21-clang++ -version
-    ```
-
-- ADB
-
-  We use [ADB](https://developer.android.com/studio/command-line/adb.html) tool to transfer the executables to android mobile phones and run the program. You also need to set shell environment **PATH**. Simple test:
-
-  ```shell
-  # this will list all available android devices
-  adb devives
-  ```
-
-- Optional
-  - Java SDK
-
-    If you want to compile Java programs, you need to download and install [Java SE SDK](https://www.oracle.com/java/technologies/oracle-java-archive-downloads.html). After installing the SDK, you need to set shell environment **PATH** to find it. Simple test:
-    ```shell
-    java --version
-    ```
-
-  - Android dx
-
-    If you want to directly run *jar* file on Android device, you can use [Android dx tool](https://developer.android.com/studio/releases/build-tools). We currently use Android *v28.0.3* build tools. After installing the *dx* tool, you also need to set shell environment **PATH**. Simple test:
-    ```shell
-    dx --version
-    ```
-
-- Third party library
-
-  We provide a simple [install shell script](../third_party/install.sh) to install third party libraries(*protoc, protobuf, flatbuffers, tensorflow-lite, jpeg, ARM GPU OpenCL*) to the [third_party](third_party) directory and generate a shell script to set up compilation environment. You can choose between LLVM and GCC. Here is an example of installation for LLVM.
-
-  ```shell
-  ./third_party/install.sh -c llvm -t 33
-  ```

# Download and Build Bolt

-We provide a simple shell script [install.sh](../install.sh) to build and install the Bolt library, and you can modify it according to your scenario and environment. Please refer to the options section of [bolt.cmake](../bolt.cmake) and configure accordingly. Here we give an example of building Bolt with LLVM.
+A simple shell script [install.sh](../install.sh) is provided to build and install the Bolt library, and you should modify it according to your scenario and environment. Set all the options correctly in [bolt.cmake](../common/cmakes/bolt.cmake). Use the help message to find more useful information.
+
+NOTE: Some build options set in [bolt.cmake](common/cmakes/bolt.cmake) are overwritten by the defaults in [install.sh](../install.sh). You should check these two files meticulously before installation.
-NOTE: Some build options are turned on or off by default in the given install.sh, which overwrites the settings in bolt.cmake. Be sure to check install.sh first.
+Here are examples of installation on different platforms (*arm_gnu*, *arm_llvm*, *arm_ndkv7* and *x86_gnu*).

```shell
git clone https://github.com/huawei-noah/bolt.git
cd bolt
-./install.sh -c llvm -t 33
+
+# build for ARM V8+ GNU CPU platform
+./install.sh -c arm_gnu -t 33
+
+# build for Android ARM V8+ CPU platform
+./install.sh -c arm_llvm -t 33 -g OFF
+
+# build for Android ARM V8+ CPU + MALI GPU platform
+./install.sh -c arm_llvm -t 33 -g ON
+
+# build for X86 GNU CPU platform
+./install.sh -c x86_gnu -t 33
+
+# build for Android ARM V7 CPU platform
+./install.sh -c arm_ndkv7 -t 33
```

-We will install Bolt to *install_llvm* directory, you will find these subdirectories in it.
+We will install Bolt to the *install_* directory, where you will find these subdirectories:
+
+- include
+  - [C API](inference/engine/api/c) header file
+  - [Java API](inference/engine/api/java) class file
+- lib
+  - libBoltModel.so: built for Java applications
+  - libbolt.so: built for C/C++ applications
+  - libflow.so: flow sub-project library
+  - libinference.so: inference sub-project library
+  - libtensor.so: tensor computing sub-project library
+  - libimage.so: image sub-project library
+  - libblas_enhance.so: blas_enhance sub-project library
+  - libmodel_tools.so: model_tools sub-project library
+  - libuni.so: uni sub-project library
+- tools
+  - *X2bolt* for generally converting deep learning (caffe/onnx/tflite) models to bolt models
+  - *tensorflow2caffe* for converting tensorflow models to caffe models
+  - *pytorch2caffe* for converting pytorch models to caffe models
+  - *tensor_computing_library_search* for performance tuning of the operator library
-- kits
+- examples
+  - *benchmark* for measuring any model (.bolt) inference performance
   - *tinybert* for intention identification
-  - *nmt* for machine translation
-
-  - *classification* for computer vision classification task
-
+  - *classification* for imagenet classification task
   - *asr_rnnt* for automatic speech recognition task RNNT model
-
   - *asr_convolution_transformer* for automatic speech recognition task Convolution+Transformer model
-
   - *tts* for text to speech
-
-  - *super_resolution* for super resolution task
-
-  - *hdr* for high dynamic range task
-
-- include
-  - C API
-  - Java API
-
-- lib: all static and shared library
-- tools
-  - *caffe2bolt* for converting caffe model to bolt model
-
-  - *onnx2bolt* for converting onnx model to bolt model
-
-  - *tflite2bolt* for converting tflite model to bolt model
-
-  - *tensorflow2caffe* for converting tensorflow model to caffe model
-
-  - *pytorch2caffe* for converting pytorch model to caffe model
-
-  - *tensor_computing_library_search* for performance tuning of the operator library
+- docs
+  - API/html: doxygen html document for C/Java/Flow API

-If you want to build operator and API tests, please turn on the *BUILD_TEST* option and rebuild Bolt. These programs will be installed to *tests/bin* directory.
+If you want to build [operator unit tests](compute/tensor/tests) and [C](inference/examples/c_api/test_api.c)/[Java](inference/examples/java_api/test_api_java.java)/Flow API tests, please turn on the *BUILD_TEST* option and rebuild Bolt. These executables will be installed to the ***install_/tests*** directory.
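+As a hedged sketch, the unit-test workflow could look like the following (the test binary name below is illustrative, not an actual executable name; list your own ***install_/tests*** directory for the real ones):
+
+```shell
+# rebuild with unit tests enabled (turn on the BUILD_TEST/USE_TEST option first)
+./install.sh -c arm_llvm -t 33
+
+# push one operator test to an Android device and run it (name is illustrative)
+adb push install_arm_llvm/tests/test_pooling /data/local/tmp/bolt/tests/test_pooling
+adb shell "/data/local/tmp/bolt/tests/test_pooling"
+```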
## Options

-Here we list all options in [bolt.cmake](../bolt.cmake).
+Here are all the options in [bolt.cmake](../common/cmakes/bolt.cmake).

| options                | default | note                                          |
| ---------------------- | ------- | --------------------------------------------- |
-| USE_CROSS_COMPILE      | OFF     | use cross compile or not                      |
+| USE_CROSS_COMPILE      | ON      | use cross compile or not                      |
| USE_GNU_GCC            | OFF     | use GNU gcc compiler or not                   |
| USE_LLVM_CLANG         | OFF     | use LLVM clang compiler or not                |
-| USE_DEBUG              | OFF     | use debug information or not                  |
+| USE_IOS_CLANG          | OFF     | use ios compiler or not                       |
| USE_DYNAMIC_LIBRARY    | OFF     | use dynamic library or not                    |
+| USE_MINSIZEREL         | OFF     | use cmake library storage size optimization   |
+| USE_ANDROID_LOG        | OFF     | use Android log or not                        |
+| USE_DEBUG              | OFF     | use debug information or not                  |
+| USE_PROFILE            | OFF     | print each layer performance information or not |
+| USE_PROFILE_STATISTICS | OFF     | print performance statistics information or not |
+| USE_THREAD_SAFE        | OFF     | use thread safe function or not               |
| USE_CAFFE              | ON      | use caffe model as input or not               |
| USE_ONNX               | ON      | use onnx model as input or not                |
| USE_TFLITE             | ON      | use tflite model as input or not              |
-| USE_NEON               | ON      | use ARM NEON instruction or not               |
+| USE_TENSORFLOW         | ON      | use tensorflow model as input or not          |
+| USE_GENERAL            | ON      | use serial CPU code for debug or not          |
+| USE_X86                | OFF     | use X86 AVX2 instruction or not               |
+| USE_NEON               | OFF     | use ARM NEON instruction or not               |
+| USE_ARMV7              | OFF     | use ARMv7 CPU or not                          |
+| USE_ARMV8              | ON      | use ARMv8 CPU or not                          |
+| USE_MALI               | ON      | use MALI GPU for parallel or not              |
| USE_FP32               | OFF     | use FP32 implementation or not                |
| USE_FP16               | ON      | use FP16 implementation or not                |
| USE_F16_MIX_PRECISION  | ON      | use ARM NEON mixed-precision (F16/F32) or not |
| USE_INT8               | ON      | use INT8 implementation or not                |
-| BUILD_TEST             | OFF     | build unit test or not                        |
-| USE_MALI               | ON      | use MALI GPU for parallel or not              |
-| USE_ARMV7              | OFF     | use ARMv7 CPU or not                          |
-| USE_ARMV8              | ON      | use ARMv8 CPU or not                          |
-| USE_GENERAL            | ON      | use serial CPU code for debug or not          |
+| USE_OPENMP             | OFF     | use OpenMP to run operator multi-thread or not; currently only supports part of the float32 operators |
+| USE_LIBRARY_TUNING     | ON      | use algorithm tuning or not                   |
+| USE_FLOW               | ON      | use flow or not                               |
+| USE_TEST               | OFF     | build unit test or not                        |

## Environment variables

-We reserve some shell environment variable for Bolt.
+Some Linux shell environment variables are reserved for Bolt.

-- *Bolt_ROOT*: Bolt project home directory, set by user or Bolt.
-- *BOLT_MEMORY_REUSE_OPTIMIZATION*: whether to use memory reuse optimization(default is ON), you can set it to *OFF* to disable memory reuse optimization.
+- *BOLT_ROOT*: Bolt project home directory, set by user or Bolt.
+- *BOLT_MEMORY_REUSE_OPTIMIZATION*: whether to use memory reuse optimization. The default value is ON, and you can set it to *OFF* to disable memory reuse optimization.
+- *Bolt_TensorComputing_LibraryAlgoritmMap*: a path on the target device set by the user to save the tensor_computing library performance tuning result.
-
-## How to build Bolt MALI GPU
-For compile bolt MALI GPU,
-- Ensure your ADB works well, and connected with your target device with mali gpu.
-  NOTE: Bolt need to precompile all GPU kernels to bins on your target device, and they will be packaged to libkernelbin.a/.so
-  If you change your target device, these kernel bins may be not adaptive, you should recompile them.
-  Bolt support mult devices precompiling for GPU Kernels, you can connect all the target devices you need with ADB, and the kernel bins for them will be built and packged together.
-- LLVM Compiler must be used and version of andriod NDK is more than r19.
-- OpenCL headfiles and lib are provided in "/cheetah/third_party/llvm/opencl", if the OpenCL lib we provided are not matching with your target device, you can replace it with the Opencl lib on your device.
-- When you compile bolt MALI GPU, please set these options ON:
-  USE_CROSS_COMPILE
-  USE_LLVM_CLANG
-  USE_FP16
-  USE_MALI
-  They can be set in install.sh, options of compiler_arch llvm.
-- After open these options, run "./install.sh -c llvm -t 33" to build bolt MALI GPU
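+Before moving on to the prerequisites, here is a minimal sketch of setting the reserved environment variables described above in a Linux shell (all values are illustrative and depend on your own paths):
+
+```shell
+export BOLT_ROOT=/home/user/bolt             # illustrative clone path
+export BOLT_MEMORY_REUSE_OPTIMIZATION=ON     # set OFF to disable memory reuse
+# on-device path for saving tensor_computing tuning results (illustrative)
+export Bolt_TensorComputing_LibraryAlgoritmMap=/data/local/tmp/bolt/algomap
+```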
+# Prerequisites
+
+## Compilation Tools
+
+- ### Cmake
+
+  <1> Installation
+
+  ```
+  sudo apt-get install cmake
+  ```
+
+  <2> Verification
+
+  ```
+  cmake -version
+  ```
+
+  If cmake has been installed successfully, you can see the uniquely determined cmake version number (for example, 3.15.1). If you fail to see the version number, or the number you see is lower than 3.2.0, please reinstall the cmake tool on your server. You can refer to the [cmake official docs](https://cmake.org/install/) to implement the installation of cmake and set the environment **PATH** to find it.
+
+- ### GNU make
+
+  <1> Installation
+
+  ```
+  sudo apt-get install make
+  ```
+
+  <2> Verification
+
+  ```
+  make -version
+  ```
+
+  If GNU make has been installed successfully, you can see the uniquely determined GNU make version number (for example, 4.1). If you fail to see the version number, or the number you see is lower than 3.81, please reinstall the GNU make tool on your server. You can refer to the [GNU make installation example](https://stackoverflow.com/questions/35568016/install-make-3-75-on-ubuntu-15-10) to implement the installation of GNU make and set the environment **PATH** to find it.
+
+- ### Cross compiler
+
+  The NDK compiler uses Android NDK toolchains to build Bolt for the Java APIs required by Android applications and for ARM MALI GPU Bolt. The ARM GNU compiler uses gcc to build Bolt for simple ARM CPU tests. Choose **one of them** according to your scenario.
+
+  - Android NDK compiler
+
+    <1> Installation
+
+    Refer to the [NDK installation example](https://askubuntu.com/questions/837847/how-to-install-android-ndk) to install [android-ndk-r20](https://dl.google.com/android/repository/android-ndk-r20b-linux-x86_64.zip?hl=zh-cn) and set the environment **PATH** to find *aarch64-linux-android21-clang++*.
+
+    <2> Verification
+
+    ```
+    aarch64-linux-android21-clang++ --version
+    ```
+
+    If the Android NDK has been installed successfully, you can see InstalledDir, which reports the storage path of the NDK compilers. If you fail to see InstalledDir, please reinstall the NDK and set the environment **PATH** to find it.
+
+  - ARM GNU compiler
+
+    <1> Installation
+
+    ```
+    sudo apt-get install gcc-arm-linux-gnueabi
+    ```
+
+    Alternatively, install [gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu](https://developer.arm.com/-/media/Files/downloads/gnu-a/8.3-2019.03/binrel/gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz?revision=2e88a73f-d233-4f96-b1f4-d8b36e9bb0b9&la=en&hash=167687FADA00B73D20EED2A67D0939A197504ACD) and set the Linux shell environment **PATH** to find *aarch64-linux-gnu-g++*.
+
+    <2> Verification
+
+    ```
+    aarch64-linux-gnu-g++ --version
+    ```
+
+    If the ARM GNU compiler has been installed successfully, you can see the uniquely determined GNU compiler version number (for example, 8.3.0). If you fail to see the version number, or the number you see is lower than 8.3.0, please reinstall the ARM GNU compiler on your server. You can refer to the [GNU compiler installation example](https://askubuntu.com/questions/472219/how-to-install-gcc-4-7-arm-linux-gnueabihf-on-ubuntu-12-04) to implement the installation of the GNU compiler and set the environment **PATH** to find it.
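+Once one of the cross compilers above is on your **PATH**, a quick sanity check (a hedged sketch; substitute *aarch64-linux-gnu-g++* if you use the ARM GNU toolchain) is to cross-compile a trivial program and confirm that the produced binary targets AArch64:
+
+```shell
+cat > /tmp/hello.cpp << 'EOF'
+#include <cstdio>
+int main() { printf("bolt toolchain OK\n"); return 0; }
+EOF
+aarch64-linux-android21-clang++ /tmp/hello.cpp -o /tmp/hello_arm
+file /tmp/hello_arm   # expect an AArch64 ELF executable
+```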
+
+## Android Tools
+
+- ### ADB
+
+  <1> Installation
+
+  Refer to the [ADB installation example](https://unix.stackexchange.com/questions/378041/how-to-install-adb-on-ubuntu-from-download) to install the [ADB](https://developer.android.com/studio/command-line/adb.html) tool, which helps you transfer the executables to android mobile phones.
+
+  ```
+  unzip platform-tools-latest-linux.zip
+  cd platform-tools
+  mv adb /usr/bin/adb
+  ```
+
+  <2> Verification
+
+  ```
+  # list all available android devices
+  adb devices
+  ```
+
+  If ADB has been installed successfully, you can see all the android devices connected to your server.
+
+## Other Dependency Libraries
+
+Use the [install script](../third_party/install.sh) to install the dependency libraries (*protoc, protobuf, flatbuffers, tensorflow-lite, jpeg, ARM GPU OpenCL*) to the [third_party](../third_party) directory and generate a shell script to set up the compilation environment. You can choose between ARM and X86, and between LLVM and GCC. Here is an example of installation for an ARM NDK LLVM build.
+
+```
+./third_party/install.sh -c arm_llvm -t 33
+```
+
+## Optional Software
+
+- ### JDK
+
+If you want to compile Java programs, you need to download and install the [Java SE Development Kit](https://www.oracle.com/java/technologies/oracle-java-archive-downloads.html) and set the Linux shell environment **PATH**. Run the command "java -version" to verify whether the JDK has been installed.
+
+You can see the uniquely determined JDK version number (for example, 1.8.0_265). If you fail to see the version number, or the number you see is lower than 1.8.0_265, please reinstall the JDK on your server. You can refer to the [JDK installation example](https://stackoverflow.com/questions/14788345/how-to-install-the-jdk-on-ubuntu-linux?page=2&tab=Votes) to implement the installation of the JDK and set the environment **PATH** to find it.
+
+- ### Android dx
+
+If you want to directly run a *jar* file on an Android device, you can use the [Android dx tool](https://developer.android.com/studio/releases/build-tools). Install the Android *v28.0.3* build tools and set the Linux shell environment **PATH**. Run the command "dx --version" to verify the dx tool version.
+
+You can see the uniquely determined dx version number (for example, 1.16). If you fail to see the version number, or the number you see is lower than 1.16, please reinstall the dx tool on your server.
\ No newline at end of file
diff --git a/docs/IOS_USAGE.md b/docs/IOS_USAGE.md
new file mode 100644
index 00000000..94827ce2
--- /dev/null
+++ b/docs/IOS_USAGE.md
@@ -0,0 +1,78 @@
+# How to Use Bolt on iOS Devices
+
+## Overview
+
+Bolt can be used on iOS, and you can use the option <-c arm_ios> in our shell script [install.sh](../install.sh) to finish compilation on a Linux platform. Before using the script, you need to make an ARM-iOS cross compiler toolchain first. A tutorial is given below.
+
+After compilation, you will find libbolt.a and libbolt.dylib in the install_arm_ios/lib directory. We have tested the development using the Objective-C language, in which you can directly use our [C API](DEVELOPER.md). You can also try our C++ API as in the [examples](../inference/examples). For the time being you need to include more headers than when using the C API, and some compilation flags need to be managed.
+
+You can also find demo projects as described in [KIT.md](KIT.md). Note that the demo cannot be used until bolt has been compiled, because the built libraries and headers are installed into the project dynamically by [install.sh](../install.sh). The demo is in the experimental stage, based on our new feature [Flow](DEVELOPER.md).
+
+## Call for Contribution
+
+- Xcode simulator support. So far we haven't supported compilation for MAC-x86, so bolt can only be tested on real devices.
+- Swift/Objective-C API based on our C API.
+- Compilation on platforms other than Linux.
+
+## Related links
+
+In addition to our tutorial, you can also refer to the following two links.
+
+- https://heroims.github.io/2017/09/10/Linux%20%E6%9E%84%E5%BB%BA:%E7%BC%96%E8%AF%91IOS:Mac%E7%A8%8B%E5%BA%8F/
+- https://medium.com/@fyf786452470/%E5%9C%A8linux%E7%9A%84%E7%9A%84%E7%9A%84%E4%B8%8B%E4%BA%A4%E5%8F%89%E7%BC%96%E8%AF%91%E7%94%9F%E6%88%90%E7%9A%84ios%E7%89%88%E7%9A%84%E5%B7%A5%E5%85%B7%E9%93%BE%E7%9A%84%E6%8C%87%E5%AF%BC%E6%89%8B%E5%86%8C-b87b472cbe14
+
+## Preparations
+
+- llvm clang: You can download and install llvm clang from the [llvm website](https://releases.llvm.org/).
+- openssl: Generally this tool is installed by default. **Note that if you want to copy your created iOS cross compiler toolchain to another computer for use, you need to confirm that the versions of openssl on these two machines are the same; otherwise your created toolchain cannot be used.**
+- iPhoneOSSDK: If you don't have your own iPhoneOS SDK, you can download and choose one iPhoneOS SDK from [iPhoneOSSDK](https://github.com/okanon/iPhoneOS.sdk), which contains iPhoneOS SDKs from version 8.4 to 13.2.
+- cctools: This open-source tool can help us make the ARM-iOS cross compiler toolchain, and you can clone the tool from [cctools-port](https://github.com/tpoechtrager/cctools-port).
+
+### Our versions of these tools:
+
+- llvm clang: 3.9.1
+- openssl: 1.0.2g
+- iPhoneOSSDK: 10.0
+- cctools: 949.0.1, ld64: 530 (the latest version on github)
+
+## Step by Step
+
+**1.** First, make sure that you have the required tools available, including llvm clang and openssl.
+
+**2.** Clone the iPhoneOS SDK from [iPhoneOSSDK](https://github.com/okanon/iPhoneOS.sdk), and then place the archive **in the user home directory ~/**. For example, we place it in */data/home/test*. We tried to put it in other directories, but it failed for us.
+
+**3.** Clone cctools-port from [cctools-port](https://github.com/tpoechtrager/cctools-port).
+
+```
+test@ubuntu:~$ pwd
+/data/home/test
+test@ubuntu:~$ mkdir ioscompile
+test@ubuntu:~$ cd ioscompile
+test@ubuntu:~/ioscompile$ git clone https://github.com/tpoechtrager/cctools-port.git
+test@ubuntu:~/ioscompile$ ls
+cctools-port-master
+test@ubuntu:~$ cd ..
+```
+
+**4.** Use the shell script build.sh of cctools-port in the directory *cctools-port-master/usage_examples/ios_toolchain/* to make an aarch64/arm64-ios cross compiler toolchain. The commands are:
+```
+test@ubuntu:~$ cd ioscompile/cctools-port-master/
+test@ubuntu:~$ ./usage_examples/ios_toolchain/build.sh /data/home/test/iPhoneOS10.0.sdk.tar.gz arm64
+```
+After a while, a folder **target** is created in the directory cctools-port-master/usage_examples/ios_toolchain/, and this folder **target** is the created aarch64-ios cross compiler toolchain. Now you have successfully made an ARM-iOS cross compiler toolchain on Linux.
In this folder, the sub-folder */bin* contains the cross compilers and related tools like *arm-apple-darwin-clang/clang++*, and the sub-folder */lib* contains the dependent libraries. By the way, if you want to make an armv7-ios cross compiler toolchain, you can change these commands as follows:
+```
+test@ubuntu:~$ cd ioscompile/cctools-port-master/
+test@ubuntu:~$ ./usage_examples/ios_toolchain/build.sh /data/home/test/iPhoneOS10.0.sdk.tar.gz armv7
+```
+
+**5.** When you use your created ARM-iOS cross compiler toolchain to build bolt, you need to configure the toolchain in your environment with the following commands, or you can configure the toolchain permanently in your environment.
+```
+test@ubuntu:~$ export PATH=/data/home/test/ioscompile/cctools-port-master/usage_examples/ios_toolchain/target/bin:$PATH
+test@ubuntu:~$ export LD_LIBRARY_PATH=/data/home/test/ioscompile/cctools-port-master/usage_examples/ios_toolchain/target/lib:$LD_LIBRARY_PATH
+```
+
+**6.** Simply go to the root directory of bolt, and run:
+
+```
+test@ubuntu:~/bolt$ ./install.sh -c arm_ios
+```
diff --git a/docs/KIT.md b/docs/KIT.md
new file mode 100644
index 00000000..09980d29
--- /dev/null
+++ b/docs/KIT.md
@@ -0,0 +1,50 @@
+# Kit
+
+Kit is an experimental feature based on [Flow](DEVELOPER.md), which aims to simplify the integration of bolt into applications. At this stage we are still rapidly exploring different designs. In the long run we want to provide symmetrical APIs for different platforms including iOS, Android, etc.
+
+In the [kit](../kit) directory, you can find the available demo project. In order to use the demo, bolt should be compiled first, and some [headers and libraries](../kit/iOS/setup_lib_iOS.sh) need to be installed into the project, which is also taken care of in [install.sh](../install.sh). Currently we have uploaded an iOS project for image classification.
+
+## iOS Overview
+
+Our demo uses the Objective-C language and the C++ API of Flow. The main body of the code is in [ViewController.mm](../kit/iOS/image_classification/ImageClassificationDemo/ViewController.mm). There are some notes regarding iOS kits:
+
+- Compilation flags. The C++ API of Flow requires quite a few headers, and some compilation flags need to be set. For convenience, you can include [kit_flags.h](../kit/iOS/image_classification/ImageClassificationDemo/libbolt/headers/kit_flags.h) before including flow.h.
+- Model path in flow prototxt. Flow reads the model paths in the prototxt in order to locate the models. On iOS, however, the exact storage path for model files is dynamically determined. [ViewController.mm](../kit/iOS/image_classification/ImageClassificationDemo/ViewController.mm) demonstrates how to update the prototxt with the new model path.
+
+### Image Classification
+
+The demo takes video input from the camera, and uses a GhostNet model trained on ImageNet. You can easily switch to other models trained on other datasets, following the steps below. As a tutorial, we will show how to change the model to the FP16 GhostNet that is also included in the project. You may need to stay with FP32 models if your device is older than iPhone X and thus not on the ARMv8.2 architecture.
+
+**0.** In [image_classification.prototxt](../kit/iOS/image_classification/ImageClassificationDemo/libbolt/image_classification.prototxt), you can see that the Inference node includes a path to ghostnet_f32.bolt. It is actually not necessary to change this path to ghostnet_f16.bolt, because this path will be dynamically overwritten as explained above.
We will show how to switch to FP16 in Step 1.
+
+In the following steps, if the file name is not specified, please check [ViewController.mm](../kit/iOS/image_classification/ImageClassificationDemo/ViewController.mm).
+
+**1.** Switch to the FP16 model. Change Line 78 to:
+
+```
+NSString *boltPath=[[NSBundle mainBundle]pathForResource:@"ghostnet_f16" ofType:@"bolt"];
+```
+
+Please also change the variable inferencePrecision to DT_F16.
+
+**2.** Adjust the pixelProcess function, which is registered as the preprocessing function for the Inference node. For FP16 inference, the actual input to the model should be in FP16:
+
+```
+F16 *oneArr = (F16 *)((CpuMemory *)outputs["input:0"]->get_memory())->get_ptr();
+```
+
+If you are using your own model, change "input:0" to the name of your model input tensor.
+
+The provided GhostNet requires input pixels organized as BGRBGRBGR... Adjust accordingly if your other model is trained with different preprocessing (e.g. normalizing each channel).
+
+**3.** Adjust the postProcess function, which is registered as the postprocessing function for the Inference node. For FP16 inference, the output score is also in FP16:
+
+```
+F16 *score1000 = (F16 *)((CpuMemory *)inputs[boltModelOutputName]->get_memory())->get_ptr();
+```
+
+If necessary, change boltModelOutputName to the name of your model output tensor. If your model is not trained on ImageNet, there may not be 1000 scores. You may also change the topK variable.
+
+**4.** If necessary, replace imagenet_classes.txt. Add codes to handle the class index numbers that Flow outputs.
+
+**5.** Please run it under the file path "/data/local/tmp" for android devices to ensure the program gets full permissions.
diff --git a/docs/QUANTIZATION.md b/docs/QUANTIZATION.md
new file mode 100644
index 00000000..4d2ec3ab
--- /dev/null
+++ b/docs/QUANTIZATION.md
@@ -0,0 +1,48 @@
+# Quantization Toolchain
+
+So far bolt supports various modes of post-training quantization, including quantized storage, dynamic quantization inference, calibration, etc. In the future, we will also provide quantization training tools.
+
+## post_training_quantization
+
+Please refer to [model_tools/tools/quantization/post_training_quantization.cpp](../model_tools/tools/quantization/post_training_quantization.cpp). All post-training quantization utilities are covered in this tool, except the calibration, which will also be merged into this tool in the future.
+
+Before using this tool, you need to first produce the input model with X2bolt using the "-i PTQ" option. Later, you can use the tool:
+
+```
+./post_training_quantization -p model_ptq_input.bolt
+```
+
+Different options of the tool are explained below. The default setting will produce model_int8_q.bolt, which will be executed with dynamic int8 quantization.
+
+Here is the list of covered utilities:
+
+1. **Quantized Storage**: If you would like to compress your model, use the -q option. Choose from {FP16, INT8, MIX}. INT8 storage could lead to an accuracy drop, so we provide the MIX mode, which tries to avoid quantizing accuracy-critical layers. Note that this option is independent of the -i option, which sets the inference precision.
+2. **Global Clipping of GEMM Inputs**: In some cases of quantization-aware training, GEMM inputs will be clipped so that they can be better quantized symmetrically. Please use the -c option if necessary.
+3. **Ad-Hoc Clipping of Feature Maps**: In some other cases, the clip value is a trainable parameter for individual layers. Please use the -s option. The parameter **scaleFileDirectory** is the directory of your scale table file (.txt). Note that the text format of the file is as follows, where **clipvalue** is the clip value of each feature map in your model. The tool calculates the true scale of each tensor as clipvalue/127.0 and stores it in the created int8 model.
+
+```
+tensor_name_0 clipvalue
+tensor_name_1 clipvalue
+tensor_name_2 clipvalue
+```
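+As a hedged illustration of combining these utilities (the flags are the ones described above; file and directory names are illustrative):
+
+```shell
+# compress the model weights with mixed-precision storage
+./post_training_quantization -p model_ptq_input.bolt -q MIX
+
+# apply ad-hoc clipping, reading clip values from a scale table directory
+./post_training_quantization -p model_ptq_input.bolt -s ./scale_tables/
+```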
+## Calibration tool
+
+The post-training quantization calibration tool is in [inference/engine/tools/ptq_calibration/ptq_calibration.cpp](../inference/engine/tools/ptq_calibration/ptq_calibration.cpp). The command to use this tool is:
+```
+./ptq_calibration modelPath dataDirectory dataFormat scaleValue affinityPolicyName algorithmMapPath
+```
+The parameters are:
+
+1. **modelPath**: the path of your int8 Bolt model. Make sure that you get your int8 Bolt model with our converter tool X2bolt; then you can use this post-training quantization calibration tool with your own related calibration datasets.
+2. **dataDirectory**: the directory of your calibration datasets. Note that the structure of the folder is:
+```
+HWSEA:/data/local/tmp/test # cd calibration_dataset
+HWSEA:/data/local/tmp/test # ls
+XXXXX.JPEG XXXXX.JPEG XXXXX.JPEG XXXXX.JPEG XXXXX.JPEG
+```
+3. **dataFormat**: the image format: BGR/RGB/RGB_SC/BGR_SC_RAW/BGR_SC_R
+4. **scaleValue**: the scale value for image classification; the default value is 1
+5. **affinityPolicyName**: the running mode: CPU_AFFINITY_HIGH_PERFORMANCE/CPU_AFFINITY_LOW_POWER/GPU; the default value is CPU_AFFINITY_HIGH_PERFORMANCE.
+6. **algorithmMapPath**: the file path to read or write the algorithm auto-tuning result
+
+After running this post-training quantization calibration tool, you will get an int8-KL Bolt model named **_int8_q_KL.bolt** in the folder which stores your original int8 model.
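+A concrete invocation might look like the following sketch (all paths and values are illustrative and follow the parameter order above):
+
+```shell
+./ptq_calibration ./model_int8_q.bolt ./calibration_dataset BGR 1 CPU_AFFINITY_HIGH_PERFORMANCE ./algomap
+```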
diff --git a/docs/REDUCE_GPU_PREPARE_TIME.md b/docs/REDUCE_GPU_PREPARE_TIME.md
new file mode 100644
index 00000000..b0dd34fd
--- /dev/null
+++ b/docs/REDUCE_GPU_PREPARE_TIME.md
@@ -0,0 +1,62 @@
+# How to reduce GPU initial time
+
+Bolt supports the ARM Mali GPU, but considerable additional preparation time is spent on algorithm selection and on building kernels from source code.
+
+- ## Build extra resources for reducing prepare time on GPU
+
+  Bolt provides an offline tool, [preprocess_ocl](../inference/engine/tools/preprocess_ocl/build_preprocess_ocl.sh), to reduce GPU prepare time. We have tested mobilenet_v1 on a MALI G76 GPU: prepare time can be reduced from 2-3 s to 60 ms after building the algorithm file and the OpenCL kernel binary. Here we give an example:
+
+  - ### Step By Step
+
+    <1> Connect the target device via Android ADB;
+
+    <2> Convert your models to .bolt with X2bolt;
+
+    <3> Make a readable/writable folder on the target device, and copy all your needed .bolt models into it, e.g.:
+
+    ```
+    adb shell "mkdir /data/local/tmp/preprocess_bolt_models"
+    adb shell "cp ${boltModelDir}/*.bolt /data/local/tmp/preprocess_bolt_models"
+    ```
+
+    <4> Set the essential variables in the file *tools/preprocess_ocl/build_preprocess_ocl.sh*:
+
+    - dNum: the device serial number, which can be acquired by using the command
+
+      ```
+      adb devices
+      ```
+
+    - device_bolt_models: the folder created in step <3>;
+
+    - device_work_local: the "preprocess_ocl" work path on the target device; /data/local/tmp/preprocess is suggested;
+
+    <5> Run *build_preprocess_ocl.sh* on the host;
+
+    After running build_preprocess_ocl.sh successfully, these extra resources will be produced:
+
+    - algorithm file: records the best algorithms for your model on the target device, such as: *${BOLT_ROOT}/tools/preprocess_ocl/algoFiles/algorithmInfo_Mali_G76p_GPUSR_p-16-1-p-8-1-p_1input_2_4*
+
+    - OpenCL kernel bin dynamic library: all the kernels needed by your model have been compiled from source to binaries and packaged into a .so, such as: *${BOLT_ROOT}/tools/preprocess_ocl/lib/libMali_G76p_map.so*
+
+- ## Use the algorithm file and kernel binary dynamic library to reduce GPU prepare time for your model
+
+  - ### Reduce Imagenet classification prepare time
+    ```
+    adb shell "mkdir /data/local/tmp/kits"
+    adb push install_arm_llvm/kits/classification /data/local/tmp/kits
+    adb push tools/preprocess_ocl/algoFiles/algorithmInfo_Mali_G76p_GPUSR_p-16-1-p-8-1-p_1input_2_4 /data/local/tmp/kits
+    adb push tools/preprocess_ocl/lib/libMali_G76p_map.so /data/local/tmp/kits
+    adb shell "cd /data/local/tmp/kits && export LD_LIBRARY_PATH=./ && ./classification -m ./mobilenet_v1_f16.bolt -a GPU -p ./"
+    ```
+
+  - ### Reduce C project prepare time
+
+    - The argument *algoPath* of the C API *ModelHandle CreateModel(const char *modelPath, AFFINITY_TYPE affinity, const char *algoPath)* is used to set your algo file;
+    - The argument *algoFileStream* of the C API *ModelHandle CreateModelWithFileStream(const char *modelFileStream, AFFINITY_TYPE affinity, const char *algoFileStream)* is used to set your algo file's filestream;
+    - Package the kernel binary dynamic library into your project;
+
+- ## Note
+  - AlgoFiles are bound to a specific model and target device;
+  - The kernel binary dynamic library is bound to a specific GPU type;
+  - The *preprocess_ocl* tool can produce algoFiles for several bolt models at once, and package all the kernels they need into one single kernel binary dynamic library;
+  - Please run it under the file path "/data/local/tmp" for android devices to ensure the program gets full permissions;
diff --git a/docs/THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.md b/docs/THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.md
new file mode 100644
index 00000000..b57d845d
--- /dev/null
+++ b/docs/THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.md
@@ -0,0 +1,208 @@
+Please note we provide an open source software notice for the third party open source software along with this software and/or this software component contributed by Huawei (in the following just “this SOFTWARE”). The open source software licenses are granted by the respective right holders.
+
+
+
+Warranty Disclaimer
+
+THE OPEN SOURCE SOFTWARE IN THIS SOFTWARE IS DISTRIBUTED IN THE HOPE THAT IT WILL BE USEFUL, BUT WITHOUT ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
SEE THE APPLICABLE LICENSES FOR MORE DETAILS. + + + +Copyright Notice and License Texts + +Software: caffe 1.0 () + +Copyright notice: + +All contributions by the University of California: + +Copyright (c) 2014-2017 The Regents of the University of California(Regents) + +All right reserved. + +All other contributions: + +Copyright (c) 2014-2017, the respective contributors + +All rights reserved. + +Caffe uses a shared copyright model: each contributor holds copyright over their contributions to Caffe. The project versioning records all such contribution and copyright details. If a contributor want to further mark their specific copyright on a particular contribution, they should indicate their copyright solely in commit message of the change when it is committed. + +License: + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the followingdisclaimer in the documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + +Copyright Notice and License Texts + +Software: onnx 1.6.0 () + +Copyright notice: + +Copyright (c) 2017 ONNX Project Contributors + +All rights reserved. + +License: + +Permission is hereby granted, free of charge, to any person obtaining a copy + +of this software and associated documentation files (the "Software"), to deal + +in the Software without restriction, including without limitation the rights + +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + +copies of the Software, and to permit persons to whom the Software is + +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all + +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + +SOFTWARE. + + + +Copyright Notice and License Texts + +Software: protobuf 2.7.0 () + +Copyright (c) 2008 Google Inc. + +All rights reserved. 
+ +License: + +Redistribution and use in source and binary forms, with or without + +modification, are permitted provided that the following conditions are + +met: + + \* Redistributions of source code must retain the above copyright + +notice, this list of conditions and the following disclaimer. + + \* Redistributions in binary form must reproduce the above + +copyright notice, this list of conditions and the following disclaimer + +in the documentation and/or other materials provided with the + +distribution. + + \* Neither the name of Google Inc. nor the names of its + +contributors may be used to endorse or promote products derived from + +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Code generated by the Protocol Buffer compiler is owned by the owner + +of the input file used when generating it. This code is not + +standalone and requires a support library to be linked with it. This + +support library is itself covered by the above license. + + + +Copyright Notice and License Texts + +Software: tensorflow 1.15.0 () + +Copyright (c) 2019 The TensorFlow Authors. + +All rights reserved. + +License: + +Licensed under the Apache License, Version 2.0 (the "License"); + +you may not use this file except in compliance with the License. + +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software + +distributed under the License is distributed on an "AS IS" BASIS, + +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and + +limitations under the License. + + + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files(the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + + + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + + + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
\ No newline at end of file
diff --git a/docs/USER_HANDBOOK.md b/docs/USER_HANDBOOK.md
index 34b227f6..51b5a791 100644
--- a/docs/USER_HANDBOOK.md
+++ b/docs/USER_HANDBOOK.md
@@ -1,338 +1,322 @@
Before you try any step described in this document, please make sure you have installed Bolt correctly. You can refer to [INSTALL.md](INSTALL.md) for more details.
-
+[Basic Usage](#basic-usage)
+&nbsp;&nbsp;&nbsp;&nbsp;[Model Conversion](#model-conversion)
+&nbsp;&nbsp;&nbsp;&nbsp;[Model Inference](#model-inference)
+&nbsp;&nbsp;&nbsp;&nbsp;[API](#api)
+&nbsp;&nbsp;&nbsp;&nbsp;[Performance Profiling](#performance-profiling)
+[Advanced Features](#advanced-features)
+&nbsp;&nbsp;&nbsp;&nbsp;[INT8 Post Training Quantization](#int8-post-traning-quantization)
+&nbsp;&nbsp;&nbsp;&nbsp;[BNN Network Support](#bnn-network-support)
+&nbsp;&nbsp;&nbsp;&nbsp;[Algorithm Tuning for Key Layers](#algorithm-tuning-for-key-layers)
+&nbsp;&nbsp;&nbsp;&nbsp;[Time-Series Data Acceleration](#time-series-data-acceleration)
+[Feedback](#feedback)

# Basic Usage

-### Model Conversion
-
-
+It's quite easy for users to get started with bolt by learning the following two steps: "Model conversion" and "Model inference". After successfully running bolt with your model, you can further explore the "API" section to customize your application.

-1. **Caffe model to Bolt model**
+## Model Conversion

-   <1> Push the `caffe2bolt` executable file to the phone;
+![ModelConversion](images/ModelConversion.PNG)

-   <2> Push the caffe model to the phone;
+[X2bolt](model_tools/tools/X2bolt/X2bolt.cpp) is a general converter, which focuses on converting different deep learning models to bolt models. Currently, X2bolt supports caffe/onnx/tflite/tensorflow model conversion. Here we list examples of two typical model conversions for the ARM backend. For the X86 backend the ADB tool is not required, and bolt X86 only supports FP32 precision inference now.

-   <3> Use `caffe2bolt` to transform model of caffe format to model of bolt format
+### resnet50(caffe) Model Conversion

-        Parameters: caffe_model_path caffe_model_name precision
+The resnet50(caffe) model contains two model files: *resnet50.prototxt* and *resnet50.caffemodel*. Prepare these two model files in */home/resnet50/* in advance.

-       Note: Your should make sure the .prototxt file and the .caffemodel file have the same model name
+<1> Push your model to the phone;

-Example: Transform mobilenet_v1 of caffe format into bolt format
-
-```shell
-<1> adb push /home/bolt/install_llvm/tools/caffe2bolt /data/local/bolt/tools/caffe2bolt
-<2> adb push /home/bolt/models/caffe/mobilenet_v1/ /data/local/bolt_model/caffe/mobilenet_v1
-<3> adb shell "./data/local/bolt/tools/caffe2bolt ./data/local/bolt_model/caffe/mobilenet_v1/ mobilenet_v1 FP16"
```
+adb push /home/resnet50/ /data/local/tmp/models/resnet50
+<2> Push the ***X2bolt*** to the phone and get the help information of ***X2bolt*** ; +``` +adb push /home/bolt/install_arm_gnu/tools/X2bolt /data/local/tmp/bolt/tools/X2bolt + +adb shell "ls /data/local/tmp/bolt/tools/" +# command output$ X2bolt + +adb shell "./X2bolt --help" +``` +<3> Execute ***X2bolt*** to convert a model from caffe model to bolt model. Here shows the example of float16 model conversion. -2. **Onnx model to Bolt model** +``` +adb shell "/data/local/tmp/bolt/tools/X2bolt -d /data/local/tmp/models/resnet50/ -m resnet50 -i FP16" + +adb shell "ls /data/local/tmp/models/resnet50" +# command output$ resnet50_fp16.bolt +``` - <1> Push the `onnx2bolt` executable file to the phone; +Note : Model conversion procedure of onnx and tflite is similar to caffe. - <2> Push the onnx model to the phone; +### mobilenet_v1(tensorflow) Model Conversion - <3> Use `onnx2bolt` to transform model of onnx format to model of bolt format +Save your mobilenet_v1 to frozen .pb model. And preprocess your model using [tf2json](model_tools/tools/tensorflow2json/tf2json.py) which can convert the .pb to .json. Then use **X2bolt** to convert .json to .bolt model. - ​ Parameters: onnx_model_path onnx_model_name remove_op_number precision inputN inputC inputH inputW +Here is the example of mobilenet_v1_frozen.pb converted to mobilenet_v1.bolt. -Example: Transform ghostnet of onnx format into bolt format +<1> Prepare mobilenet_v1 model(frozen .pb) on the server; -```shell -<1> adb push /home/bolt/tools/onnx2bolt /data/local/bolt/tools/onnx2bolt -<2> adb push /home/bolt/models/onnx/ghostnet/ /data/local/bolt_model/caffe/ghostnet -<3> adb shell "./data/local/bolt/tools/onnx2bolt ./data/local/bolt_model/onnx/ghostnet/ ghostnet 3 FP16 1 3 224 224" +``` +file /home/mobilenet_v1/mobilenet_v1_frozen.pb ``` - After running, you can see the ghostnet_f16.bolt file in /data/local/bolt_model/onnx/ghostnet/ on the phone. - - Since onnx models may not specify the input dimensions, onnx2bolt accepts 4 more parameters. If they are not provided, the .bolt model will specify 1x3x224x224 by default, which is the typical input size for ImageNet networks. - - - -3. **TensorFlow model to Bolt model** +<2> Convert mobilenet_v1_frozen.pb to mobilenet_v1.json; - The process flow is : TensorFlow model to Caffe model, and then to Bolt model. +``` +python3 model_tools/tools/tensorflow2json/tf2json.py /home/mobilenet_v1/mobilenet_v1_frozen.pb /home/mobilenet_v1/mobilenet_v1.json - <1> Tensorflow model to Caffe model +ls /home/mobilenet_v1 +# command output$ mobilenet_v1.json +``` - Refer to the [tensorflow2caffe README.md](../model-tools/tools/tensorflow2caffe/README.md) for more details on transforming TensorFlow model to Caffe model. +<3> Push the mobilenet_v1.json to the phone; - <2> Caffe model to Bolt model +``` +adb push /home/mobilenet_v1/mobilenet_v1.json /data/local/tmp/models/mobilenet_v1/mobilenet_v1.json + +adb shell "ls /data/local/tmp/models/mobilenet_v1" +# command output$ mobilenet_v1_frozen.pb mobilenet_v1.json +``` - Refer to the former steps in "Caffe model to Bolt model" section in this chapter. +<4> Push the ***X2bolt*** to the phone and get the help information of ***X2bolt*** ; +``` +adb push /home/bolt/install_arm_gnu/tools/X2bolt /data/local/tmp/bolt/tools/X2bolt + +adb shell "ls /data/local/tmp/bolt/tools/" +# command output$ X2bolt + +adb shell "./X2bolt --help" +``` +<5> Execute ***X2bolt*** to convert model from .json(converted from .pb) to bolt model. Here shows the example of float32 model conversion. 
-   <2> Push the onnx model to the phone;
+### mobilenet_v1(tensorflow) Model Conversion
-   <3> Use `onnx2bolt` to transform model of onnx format to model of bolt format
+Save your mobilenet_v1 as a frozen .pb model, and preprocess it with [tf2json](../model_tools/tools/tensorflow2json/tf2json.py), which converts the .pb file to a .json file. Then use **X2bolt** to convert the .json file to a .bolt model.
-       Parameters: onnx_model_path onnx_model_name remove_op_number precision inputN inputC inputH inputW
+Here is an example of converting mobilenet_v1_frozen.pb to mobilenet_v1.bolt.
-Example: Transform ghostnet of onnx format into bolt format
+<1> Prepare the mobilenet_v1 model (frozen .pb) on the server;
-```shell
-<1> adb push /home/bolt/tools/onnx2bolt /data/local/bolt/tools/onnx2bolt
-<2> adb push /home/bolt/models/onnx/ghostnet/ /data/local/bolt_model/caffe/ghostnet
-<3> adb shell "./data/local/bolt/tools/onnx2bolt ./data/local/bolt_model/onnx/ghostnet/ ghostnet 3 FP16 1 3 224 224"
+```
+file /home/mobilenet_v1/mobilenet_v1_frozen.pb
```
-   After running, you can see the ghostnet_f16.bolt file in /data/local/bolt_model/onnx/ghostnet/ on the phone.
-
-   Since onnx models may not specify the input dimensions, onnx2bolt accepts 4 more parameters. If they are not provided, the .bolt model will specify 1x3x224x224 by default, which is the typical input size for ImageNet networks.
-
-
-
-3. **TensorFlow model to Bolt model**
+<2> Convert mobilenet_v1_frozen.pb to mobilenet_v1.json;
-   The process flow is : TensorFlow model to Caffe model, and then to Bolt model.
+```
+python3 model_tools/tools/tensorflow2json/tf2json.py /home/mobilenet_v1/mobilenet_v1_frozen.pb /home/mobilenet_v1/mobilenet_v1.json
+
+ls /home/mobilenet_v1
+# command output$ mobilenet_v1.json
+```
-   <1> Tensorflow model to Caffe model
+<3> Push the mobilenet_v1.json to the phone;
-   Refer to the [tensorflow2caffe README.md](../model-tools/tools/tensorflow2caffe/README.md) for more details on transforming TensorFlow model to Caffe model.
+```
+adb push /home/mobilenet_v1/mobilenet_v1.json /data/local/tmp/models/mobilenet_v1/mobilenet_v1.json
+
+adb shell "ls /data/local/tmp/models/mobilenet_v1"
+# command output$ mobilenet_v1_frozen.pb mobilenet_v1.json
+```
-   <2> Caffe model to Bolt model
+<4> Push ***X2bolt*** to the phone and check its help information;
+```
+adb push /home/bolt/install_arm_gnu/tools/X2bolt /data/local/tmp/bolt/tools/X2bolt
+
+adb shell "ls /data/local/tmp/bolt/tools/"
+# command output$ X2bolt
+
+adb shell "./X2bolt --help"
+```
-   Refer to the former steps in "Caffe model to Bolt model" section in this chapter.
+<5> Execute ***X2bolt*** to convert the .json model (converted from .pb) to a bolt model. Here is an example of float32 model conversion.
-4. **PyTorch model to Bolt model**
+```
+adb shell "/data/local/tmp/bolt/tools/X2bolt -d /data/local/tmp/models/mobilenet_v1/ -m mobilenet_v1 -i FP32"
-   PyTorch should have native support for onnx format. For your own convenience, you can try that first.
+adb shell "ls /data/local/tmp/models/mobilenet_v1"
+# command output$ mobilenet_v1.json mobilenet_v1_f32.bolt
+```
-   The process flow is: PyTorch model to Caffe model, and then to Bolt model
+## Model Inference
-   <1> PyTorch model to Caffe model
+### General Benchmark
-   Refer to the [pytorch2caffe README.md](../model-tools/tools/pytorch2caffe/README.md) for more details on transforming Pytorch model to Caffe model.
+[*benchmark*](../inference/examples/benchmark/benchmark.cpp) is a general tool for measuring the inference performance of any .bolt model.
-   <2> Caffe model to Bolt model
+<1> Push ***benchmark*** to the phone and check its usage;
-   Refer to the former steps in "Caffe model to Bolt model" section in this chapter.
+```
+adb push /home/bolt/install_arm_gnu/kits/benchmark /data/local/tmp/bolt/bin/benchmark
+adb shell "./benchmark --help"
+```
-### Model Inference
+<2> Execute ***benchmark*** to measure your model's inference performance.
-We provide several demo programs, and here we will explain the usage of two typical programs: image classification and tinybert.
+```
+# running with fake data
+adb shell "/data/local/tmp/bolt/bin/benchmark -m /data/local/tmp/bolt_model/caffe/resnet/resnet_f16.bolt"
+
+# running with real data
+adb shell "/data/local/tmp/bolt/bin/benchmark -m /data/local/tmp/bolt_model/caffe/resnet/resnet_f16.bolt -i /data/local/tmp/data/1_3_224_224_fp16.bin"
+```
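+For instance, to benchmark the resnet50 model converted in the "Model Conversion" section above, reusing the paths from that section:
+
+```
+adb shell "/data/local/tmp/bolt/bin/benchmark -m /data/local/tmp/models/resnet50/resnet50_fp16.bolt"
+```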
+### ImageNet classification
+Example: Run mobilenet_v1 for image classification with the CPU
-1. **Classification**
+<1> Push classification to the phone;
-   <1> Push classification to the phone;
-
-   <2> Push the testing image data to the phone;
-
-   <3> Run classification and get the result.
-
-       Parameters: bolt_model image_directory image_format scale_value TopK correct_label archInfo algorithmMapPath
+```
+adb push /home/bolt/install_arm_gnu/kits/classification /data/local/tmp/bolt/bin/classification
+```
-Example: Run mobilenet_v1 for image classification
+<2> Push the testing image data to the phone;
-```shell
-<1> adb push /home/bolt/install_llvm/kits/classification /data/local/bolt/bin/classification
-<2> adb push /home/bolt/data/ILSVRC/n02085620/ /data/local/bolt_data/cv/ILSVRC/n02085620
-<3> adb shell "./data/local/bolt/bin/classification /data/local/bolt_model/caffe/mobilenet_v1/mobilenet_v1_f16.bolt /data/local/bolt_data/cv/ILSVRC/n02085620 BGR 0.017 5 151 CPU_AFFINITY_HIGH_PERFORMANCE"
+```
+adb push /home/bolt/data/ILSVRC/n02085620/ /data/local/tmp/bolt_data/cv/ILSVRC/n02085620
```
-   After running, you should be able to see the TopK labels for each image calculated according to the model, the Top1 and TopK accuracy, and the execution time.
+<3> Run CPU classification and get the result.
-   Here we explain a little more for some of the parameters.
+```
+adb shell "/data/local/tmp/bolt/bin/classification -m /data/local/tmp/bolt_model/caffe/mobilenet_v1/mobilenet_v1_f16.bolt -i /data/local/tmp/bolt_data/cv/ILSVRC/n02085620 -f BGR -s 0.017 -t 5 -c 151 -a CPU_AFFINITY_HIGH_PERFORMANCE -p ./"
+```
-   - image_format: The image format requested by the model. For example, caffe models usually require BGR format. You can refer to [image_processing.cpp](../image/src/image_processing.cpp) for more details.
-   - scale_value: The scale value requested in the input preprocessing. This value is also used in [image_processing.cpp](../image/src/image_processing.cpp). If your network required normalized inputs, the typical scale value is 0.017.
-   - TopK: The number of predictions that you are interested in for each image. Typical choice is 5.
-   - correct_label: The correct label number for the whole image directory.
-   - archInfo:
-     -- CPU_AFFINITY_HIGH_PERFORMANCE, Bolt will look for a high-frequency core and bind to it.
-     -- CPU_AFFINITY_LOW_POWER, Bolt will look for a low-frequency core.
-     -- GPU, Bolt will run the model on MALI GPU.
-     If the parameter is missing, the default value is "CPU_AFFINITY_HIGH_PERFORMANCE".
-   - algorithmMapPath The file path to save algorithm selection result info, it is strongly recommended to be set when use GPU.
+After running, you should be able to see the TopK labels for each image calculated according to the model, the Top1 and TopK accuracy, and the execution time.
-   More Details for using GPU on classification nets:
-   Example for running with bolt GPU:
-   /*bolt_model*/ /*image_directory*/ /*image_format*/ /*scale*/ /*TopK*/ /*correct_lable*/ /*archInfo*/ /*algorithMapPath*/
-   ./classification /data/local/tmp/model/mobilenet_v1_f16.bolt /data/local/tmp/data BGR 0.017 5 151 GPU /data/local/tmp
+**Detailed explanation of the parameters:**
-   When you first running program, GPU will take lots of time to do algorithm selected and save the results to the algorithmMapPath you set.
-   After algorithm selected results been saved successfully, this step will be skipped.
-   If you want to get the best performance, please set the algorithmMapPath, and running your model after algorithm selected results been produced.
-   NOTE:
-     -- The file name of algorithm selected results are constitute with "modelname + archInfo + dataType", such as "algorithmInfo_MOBILENET_2_4".
-     -- If you modified your model, please delete the old algorithm selected results and run it again, or it may cause unpredicted errors.
+- -f/--imageFormat: The image format requested by the model. For example, caffe models usually require BGR format. You can refer to [image_processing.cpp](../compute/image/src/image_processing.cpp) for more details.
-
-
-2. **Tinybert**
+- -s/--scaleValue: The scale value requested in the input preprocessing. This value is also used in [image_processing.cpp](../compute/image/src/image_processing.cpp). If your network requires normalized inputs, the typical scale value is 0.017 (see the note after this list).
-   <1> Push tinybert to the phone;
+- -t/--topK: The number of predictions that you are interested in for each image. A typical choice is 5.
-   <2> Push the testing sequence data to the phone;
+- -c/--correctLabels: The correct label number for the whole image directory.
-   <3> Run tinybert and get the result.
+- -a/--archinfo:
-       Parameters: bolt_model sequence_directory thread_affinity
+  The default value is "CPU_AFFINITY_HIGH_PERFORMANCE".
-Example:
+  -- CPU_AFFINITY_HIGH_PERFORMANCE, Bolt will look for a high-frequency core and bind to it.
-```shell
-<1> adb push /home/bolt/install_llvm/kits/tinybert /data/local/bolt/bin/tinybert
-<2> adb mkdir /data/local/bolt_data/nlp/tinybert/data
-<3> adb mkdir /data/local/bolt_data/nlp/tinybert/data/input
-<4> adb mkdir /data/local/bolt_data/nlp/tinybert/data/result
-<5> adb push /home/bolt/model-tools/tools/tensorflow2caffe/tinybert/sequence.seq /data/local/bolt_data/nlp/tinybert/data/input/0.seq
-<6> adb shell "./data/local/bolt/bin/tinybert /data/local/bolt_model/caffe/tinybert/tinybert_f16.bolt /data/local/bolt_data/nlp/tinybert/data CPU_AFFINITY_HIGH_PERFORMANCE"
-```
+  -- CPU_AFFINITY_LOW_POWER, Bolt will look for a low-frequency core.
-   After running, you should be able to see the labels for each sequence calculated according to the model, and the execution time.
+  -- GPU, Bolt will run the model on the MALI GPU.
-   Here we explain a little more for some of the parameters.
+- -p/--algoPath: The file path for saving the algorithm selection results; it is strongly recommended to set this when using the GPU.
-   - thread_affinity: When it is set to be CPU_AFFINITY_HIGH_PERFORMANCE, Bolt will look for a high-frequency core and bind to it. When it is set to be CPU_AFFINITY_LOW_POWER, Bolt will look for a low-frequency core. If the parameter is missing, the default value is "CPU_AFFINITY_HIGH_PERFORMANCE".
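+As a side note on where the typical scale value comes from: 0.017 ≈ 1 / (0.229 × 255) ≈ 1 / 58.4, i.e. roughly the reciprocal of a commonly used ImageNet channel standard deviation expressed in 0-255 pixel units. This is an observation about common preprocessing pipelines, not a bolt requirement.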
+<4> Run GPU classification and get the result.
+```
+adb shell "/data/local/tmp/bolt/bin/classification -m /data/local/tmp/bolt_model/caffe/mobilenet_v1/mobilenet_v1_f16.bolt -i /data/local/tmp/bolt_data/cv/ILSVRC/n02085620 -f BGR -s 0.017 -t 5 -c 151 -a GPU -p /data/local/tmp/tmp"
+```
-3. **Neural Machine Translation**
+When you run the program for the first time, the GPU pipeline takes a long time to select the best algorithms and saves the results to the *-p/--algoPath* you set. After the algorithm selection results have been saved successfully, this step is skipped on subsequent runs.
-   <1> Push nmt to the phone;
+If you want to get the best performance, please set *-p/--algoPath*, and run your model after the algorithm selection results have been produced.
-   <2> Push the testing sequence data to the phone;
+NOTE:
-   <3> Run nmt and get the result.
+- The file name of the algorithm selection results is composed of "modelName + archInfo + dataType", such as "algorithmInfo_MOBILENET_2_4".
+- If you modify your model, please delete the old algorithm selection results (see the example below) and run again, or unpredictable errors may occur.
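+For example, with the illustrative *-p* directory and result file name used above, clearing a stale result could look like this (the actual file name depends on your model, target device and data type):
+
+```
+adb shell "rm /data/local/tmp/tmp/algorithmInfo_MOBILENET_2_4"
+```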
-       Parameters: bolt_model sequence_directory thread_affinity
+### tinybert
-Example:
+<1> Push tinybert to the phone;
-```shell
-<1> adb push /home/bolt/install_llvm/kits/nmt /data/local/bolt/bin/nmt
-<2> adb mkdir /data/local/bolt_data/nlp/machine_translation/data
-<3> adb mkdir /data/local/bolt_data/nlp/machine_translation/data/input
-<4> adb mkdir /data/local/bolt_data/nlp/machine_translation/data/result
-<5> adb push /home/bolt/model-tools/tools/tensorflow2caffe/nmt/0.seq /data/local/bolt_data/nlp/machine_translation/data/input/0.seq
-<6> adb shell "./data/local/bolt/bin/nmt /data/local/bolt_model/caffe/nmt/nmt_f16.bolt /data/local/bolt_data/nlp/machine_translation/data CPU_AFFINITY_HIGH_PERFORMANCE"
+```
+adb push /home/bolt/install_arm_gnu/kits/tinybert /data/local/tmp/bolt/bin/tinybert
```
-   After running, you should be able to see the machine translation result, and the execution time.
-
-   Here we explain a little more for some of the parameters.
+<2> Push the testing sequence data to the phone;
-   - thread_affinity: When it is set to be CPU_AFFINITY_HIGH_PERFORMANCE, Bolt will look for a high-frequency core and bind to it. When it is set to be CPU_AFFINITY_LOW_POWER, Bolt will look for a low-frequency core. If the parameter is missing, the default value is "CPU_AFFINITY_HIGH_PERFORMANCE".
+```
+adb shell "mkdir -p /data/local/tmp/bolt_data/nlp/tinybert/data"
+adb shell "mkdir -p /data/local/tmp/bolt_data/nlp/tinybert/data/input"
+adb shell "mkdir -p /data/local/tmp/bolt_data/nlp/tinybert/data/result"
+adb push /home/bolt/model_tools/tools/tensorflow2caffe/tinybert/sequence.seq /data/local/tmp/bolt_data/nlp/tinybert/data/input/0.seq
+```
-4. **Automatic Speech Recognition RNNT**
+<3> Run tinybert and get the result.
-   <1> Push asr_rnnt to the phone;
+```
+adb shell "/data/local/tmp/bolt/bin/tinybert -m /data/local/tmp/bolt_model/caffe/tinybert/tinybert_f16.bolt -i /data/local/tmp/bolt_data/nlp/tinybert/data -a CPU_AFFINITY_HIGH_PERFORMANCE"
+```
-   <2> Push the testing sequence data to the phone;
+After running, you should be able to see the labels for each sequence calculated according to the model, and the execution time.
-   <3> Run asr_rnnt and get the result.
+### neural machine translation (nmt)
-       Parameters: bolt_model sequence_directory thread_affinity
+<1> Push nmt to the phone;
-Example:
+```
+adb push /home/bolt/install_llvm/kits/nmt /data/local/tmp/bolt/bin/nmt
+```
-```shell
-<1> adb push /home/bolt/install_llvm/kits/asr_rnnt /data/local/bolt/bin/asr_rnnt
-<2> adb mkdir /data/local/bolt_data/nlp/asr/asr_rnnt/data
-<3> adb mkdir /data/local/bolt_data/nlp/asr/asr_rnnt/data/input
-<4> adb mkdir /data/local/bolt_data/nlp/asr/asr_rnnt/data/result
-<5> adb push /home/bolt/model-tools/tools/tensorflow2caffe/asr/asr_rnnt.seq /data/local/bolt_data/nlp/asr/asr_rnnt/data/input/0.seq
-<6> adb shell "./data/local/bolt/bin/asr_rnnt /data/local/bolt_model/caffe/asr_rnnt/asr_rnnt_f16.bolt /data/local/bolt_data/nlp/asr/asr_rnnt/data CPU_AFFINITY_HIGH_PERFORMANCE"
+<2> Push the testing sequence data to the phone;
+```
+adb shell "mkdir -p /data/local/tmp/bolt_data/nlp/machine_translation/data"
+adb shell "mkdir -p /data/local/tmp/bolt_data/nlp/machine_translation/data/input"
+adb shell "mkdir -p /data/local/tmp/bolt_data/nlp/machine_translation/data/result"
+adb push /home/bolt/model_tools/tools/tensorflow2caffe/nmt/0.seq /data/local/tmp/bolt_data/nlp/machine_translation/data/input/0.seq
```
-   After running, you should be able to see the speech recognition result, and the execution time.
-
-   Here we explain a little more for some of the parameters.
+<3> Run nmt and get the result.
-   - thread_affinity: When it is set to be CPU_AFFINITY_HIGH_PERFORMANCE, Bolt will look for a high-frequency core and bind to it. When it is set to be CPU_AFFINITY_LOW_POWER, Bolt will look for a low-frequency core. If the parameter is missing, the default value is "CPU_AFFINITY_HIGH_PERFORMANCE".
+```
+adb shell "/data/local/tmp/bolt/bin/nmt -m /data/local/tmp/bolt_model/caffe/nmt/nmt_f16.bolt -i /data/local/tmp/bolt_data/nlp/machine_translation/data -a CPU_AFFINITY_HIGH_PERFORMANCE"
+```
+After running, you should be able to see the machine translation result, and the execution time.
-5. **Automatic Speech Recognition Convolution+Transformer**
+## API
-   <1> Push asr_convolution_transformer to the phone;
+Please refer to [DEVELOPER.md](DEVELOPER.md#api-usage) for more details.
-   <2> Push the testing sequence data to the phone;
+## Performance Profiling
-   <3> Run asr_convolution_transformer and get the result.
+Bolt provides a program performance visualization interface to help users identify performance bottlenecks.
-       Parameters: bolt_model sequence_directory thread_affinity
+- ### Visualize an inference program's performance
-Example:
+<1> Edit the [common/cmakes/bolt.cmake](../common/cmakes/bolt.cmake) file to turn on the performance profiling switch *USE_PROFILE*, and recompile the bolt library.
-```shell
-<1> adb push /home/bolt/install_llvm/kits/asr_convolution_transformer /data/local/bolt/bin/asr_convolution_transformer
-<2> adb mkdir /data/local/bolt_data/nlp/asr/asr_convolution_transformer/data
-<3> adb push /home/bolt/model-tools/tools/tensorflow2caffe/asr /data/local/bolt_data/nlp/asr/asr_rnnt/data
-<4> adb shell "./data/local/bolt/bin/asr_convolution_transformer /data/local/bolt_model/caffe/asr_rnnt/asr_convolution_transformer_encoder_f16.bolt /data/local/bolt_data/nlp/asr/asr_convolution_transformer/data CPU_AFFINITY_HIGH_PERFORMANCE"
-<5> adb shell "./data/local/bolt/bin/asr_convolution_transformer /data/local/bolt_model/caffe/asr_rnnt/asr_convolution_transformer_prediction_net.f16.bolt /data/local/bolt_data/nlp/asr/asr_convolution_transformer/data CPU_AFFINITY_HIGH_PERFORMANCE"
-<6> adb shell "./data/local/bolt/bin/asr_convolution_transformer /data/local/bolt_model/caffe/asr_rnnt/asr_convolution_transformer_joint_net_f16.bolt /data/local/bolt_data/nlp/asr/asr_convolution_transformer/data CPU_AFFINITY_HIGH_PERFORMANCE"
+<2> Use the newly generated executable program or library to do inference. Bolt will print performance logs to the command line window or the Android log. Collect the log lines that start with *[PROFILE]*. Here is an example.
+```
+[PROFILE] thread 7738 {"name": "deserialize_model_from_file", "cat": "prepare", "ph": "X", "pid": "0", "tid": "7738", "ts": 1605748035860637, "dur": 9018},
+[PROFILE] thread 7738 {"name": "ready", "cat": "prepare", "ph": "X", "pid": "0", "tid": "7738", "ts": 1605748035889436, "dur": 8460},
+[PROFILE] thread 7738 {"name": "conv1", "cat": "OT_Conv::run", "ph": "X", "pid": "0", "tid": "7738", "ts": 1605748035898106, "dur": 764},
+[PROFILE] thread 7738 {"name": "conv2_1/dw", "cat": "OT_Conv::run", "ph": "X", "pid": "0", "tid": "7738", "ts": 1605748035898876, "dur": 2516},
```
-   After running, you should be able to see the result of each sub network(encoder, prediction net, joint net), and the execution time.
-
-   Here we explain a little more for some of the parameters.
-
-   - thread_affinity: When it is set to be CPU_AFFINITY_HIGH_PERFORMANCE, Bolt will look for a high-frequency core and bind to it. When it is set to be CPU_AFFINITY_LOW_POWER, Bolt will look for a low-frequency core. If the parameter is missing, the default value is "CPU_AFFINITY_HIGH_PERFORMANCE".
-
-
-### API
-
-Currently, we provide C and Java API. After installation, you can find the API documents docs/API/html/index.html.
-
+<3> Remove the thread-private prefix (*[PROFILE] thread 7738*) and the trailing comma from each log line, then add *[* at the beginning of the file and *]* at the end of the file. Save it as a JSON file.
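+With GNU sed, this conversion can be scripted in one line (a minimal sketch; *profile.log* is an assumed file name for the collected log):
+
+```
+sed -e 's/^\[PROFILE\] thread [0-9]* //' -e '$s/,$//' -e '1i [' -e '$a ]' profile.log > profile.json
+```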
+Here is a JSON file example.
+```
+[
+  {"name": "deserialize_model_from_file", "cat": "prepare", "ph": "X", "pid": "0", "tid": "7738", "ts": 1605748035860637, "dur": 9018},
+  {"name": "ready", "cat": "prepare", "ph": "X", "pid": "0", "tid": "7738", "ts": 1605748035889436, "dur": 8460},
+  {"name": "conv1", "cat": "OT_Conv::run", "ph": "X", "pid": "0", "tid": "7738", "ts": 1605748035898106, "dur": 764},
+  {"name": "conv2_1/dw", "cat": "OT_Conv::run", "ph": "X", "pid": "0", "tid": "7738", "ts": 1605748035898876, "dur": 2516}
+]
+```
+<4> Open the tracing tool in the Google Chrome browser (the file is in Chrome's trace event format) and load the JSON file. You can then see the program execution timeline.
+![](images/PerformanceProfiling.PNG)
 # Advanced Features
-### Graph Optimization
-
-  By default, all graph optimizers that we have implemented are activated during model conversion. In the converters (caffe2bolt, onnx2bolt), you can find a function call:
-  ```c++
-  ms_optimizer.suggest();
-  ```
-  If you wish to turn them off, you can adjust the suggest() function, or simply call:
-  ```c++
-  ms_optimizer.empty();
-  ```
-  However, some of the optimizers are essential, which will be marked with * below.
-
-  - *DeprecatedOpOptimizer: This optimizer removes the deprecated layers from the model
-  - *ConvBNOptimizer: This optimizer folds BN parameters into the weight and bias of convolution.
-  - *BNScaleOptimizer: When a BN layer is not precedented by a convolution layer, we will fold it into the following scale layer.
-  - *ConvScaleOptimizer: This optimizer folds scale parameters into the weight and bias of convolution.
-  - InPlaceOptimizer: If the input and output of a layer are identical in dimensions, they might share the same tensor name. Typical layers include the Activation Layer.
-  - ConvActivationOptimizer: This optimizer fuses convolution and activation layers
-  - *ChannelPaddingOptimizer: This optimizer will pad the output channels to a multiple of 8 for convolution layers. This increases the model compatibility.
-  - DepthwisePointwiseOptimizer: This optimizers fuses depthwise conv and pointwise conv for computation efficiency.
-  - TransposeMulToScaleOptimizer: This is useful for some NLP models.
-  - *MemoryReuseOptimizer: When a feature map tensor is no longer needed as input or output, the storage that it occupies can be reused by other feature maps. This saves on average **two-thirds** of feature map storage for networks that we have tested.
-
-
-
-### INT8 Post-Training Quantization
-
-  If quantization is activated, the second convolution layer will quantize the tensors to 8-bit integers. For now, int8 operators include Convolution, Pooling and Concatenation (end-to-end support for Squeezenet). If your network includes other operators, you may need to add type casting in the front of those operators. The quantization method is symmetrical for both activation and weight.
+## INT8 Post Training Quantization
-  If you want to activate the quantization, pass "INT8_Q" as the precision parameter to caffe2bolt or onnx2bolt during model conversion.
+Operations are smartly quantized, avoiding layers that are critical to accuracy. When possible, gemm layers (e.g. conv, FC) will directly output int8 tensors so as to save dequantization time. The quantization method is symmetrical for both activation and weight. Please refer to [QUANTIZATION.md](QUANTIZATION.md) for more details.
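+A minimal sketch of what "symmetrical" means here, assuming per-tensor scaling (illustrative only; see QUANTIZATION.md for the exact scheme bolt uses):
+
+```
+scale = 127 / max(|x|)                 # one scale per tensor, zero-point fixed at 0
+q     = clamp(round(scale * x), -127, 127)
+```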
+## BNN Network Support
+Bolt supports both XNOR-style and DoReFa-style BNN networks. Just save the binary weights as FP32 in an Onnx model, and X2bolt will automatically convert the storage to 1-bit representations. So far, the floating-point portion of a BNN network can only use FP16 operations, so pass "FP16" as the precision parameter to X2bolt. The number of output channels for BNN convolution layers should be divisible by 32.
-### BNN Network Support
+## Algorithm Tuning for Key Layers
-  Bolt supports both XNOR-style and DoReFa-style BNN networks. Just save the binary weights as FP32 in an Onnx model, and onnx2bolt will automatically convert the storage to 1-bit representations. So far, the floating-point portion of the BNN network can only be FP16 operations, so pass "FP16" as the precision parameter to onnx2bolt. The number of output channels for BNN convolution layers should be divisible by 32.
+Bolt provides the tensor_computing_library_search program for performance tuning of the operator library. Bolt currently supports convolution layer algorithm tuning.
-
-
-### Layer Performance Benchmark
-
-  If you target device is an Android phone connected to your compilation server, you can call "make test" to run a quick verification test, which runs the [quick_benchmark.sh](../quick_benchmark.sh). For more details, please refer to the individual unit test programs under [tests](../tests).
+<1> Push tensor_computing_library_search to the phone;
+```
+adb push /home/bolt/install_arm_gnu/tools/tensor_computing_library_search /data/local/tmp/bolt/tools/tensor_computing_library_search
+```
-### Algorithm Tuning for Key Layers
+<2> Set the Bolt_TensorComputing_LibraryAlgoritmMap shell environment variable;
-  Bolt provides tensor_computing_library_search program for performance tuning of the operator library. Bolt currently supports convolution layer algorithm tuning.
+<3> Run the library tuning program;
-  <1> Push tensor_computing_library_search to the phone;
+```
+adb shell "export Bolt_TensorComputing_LibraryAlgoritmMap=/data/local/tmp/bolt/tensor_computing_library_algorithm_map.txt && /data/local/tmp/bolt/tools/tensor_computing_library_search"
+```
-  <2> Set Bolt_TensorComputing_LibraryAlgoritmMap shell environment variable
+After running, you should be able to find the algorithm map file on the device.
-  <3> Run library tuning program.
-
-  <4> Use *CONVOLUTION_LIBRARY_SEARCH* convolution policy during model inference.
+<4> Use the *CONVOLUTION_LIBRARY_SEARCH* convolution policy during model inference.
-Example:
+Modify the convolution algorithm search policy in [inference/engine/include/cpu/convolution_cpu.hpp](../inference/engine/include/cpu/convolution_cpu.hpp).
-```shell
-<1> adb push /home/bolt/inference/tools/tensor_computing_library_search /data/local/bolt/tools/tensor_computing_library_search
-<2> adb shell "export Bolt_TensorComputing_LibraryAlgoritmMap=/data/local/bolt/tensor_computing_library_algorithm_map.txt && ./data/local/bolt/tools/tensor_computing_library_search"
-```
+## Time-Series Data Acceleration
-   After running, you should be able to get algorithm map file on device.
+Flow is the time-series data acceleration module of Bolt. Flow simplifies the application development process. Flow uses a graph as the abstraction of an application deployment, and each stage (function) is viewed as a node. A node can do data preprocessing, deep learning inference or result postprocessing. Separate feature extraction can also be abstracted as a node. The bridging entity between functions is data (a tensor), which is represented as an edge.
+Flow provides flexible CPU multi-core parallelism and heterogeneous scheduling (CPU + GPU). Users don't need to pay excessive attention to heterogeneous management or write lots of non-reusable code to implement a heterogeneous application. Users can get the best end-to-end performance with the help of Flow. Flow supports data parallelism and subgraph parallelism, with a simple API.
+More usage information can be found in [DEVELOPER.md](DEVELOPER.md#time-series-data-acceleration-by-using-flow).
 # Feedback
-   If you have encountered any difficulty, feel free to reach out to us by summitting issues. You are also encouraged to contribute your implementations. Please refer to [DEVELOPER.md](DEVELOPER.md).
+If you have encountered any difficulty, feel free to reach out to us by submitting issues. You are also encouraged to contribute your implementations. Please refer to [DEVELOPER.md](DEVELOPER.md).
diff --git a/docs/images/ADB.PNG b/docs/images/ADB.PNG
new file mode 100644
index 0000000000000000000000000000000000000000..a7a0778656c472f72d86054980c78bc1068a7535
GIT binary patch
literal 3875
[binary PNG data omitted]
literal 0
HcmV?d00001

diff --git a/docs/images/Framework.PNG b/docs/images/Framework.PNG
new file mode 100644
index 0000000000000000000000000000000000000000..f51360fa60171e695c687f59d3487623ec632b8b
GIT binary patch
literal 47027
[binary PNG data omitted]
literal 0
HcmV?d00001

diff --git a/docs/images/GNU.PNG b/docs/images/GNU.PNG
new file mode 100644
index 0000000000000000000000000000000000000000..5729d3806e6c92122e515ade318619ef2d9305d3
GIT binary patch
literal 5877
[binary PNG data omitted]
z1~|*ZHpoCUvx0Z}>ee>Lhz%(pFtOvPQAu6*Nuse;CAU4}a(8v@&+5&TXqtvN3&{`# z6GefZ>VHV}tm|%#zyN)0m0h|*JOM+_b~7*dh5`DDOosS5ag{X$`;n#8(^XKW{v)ds znZElpgN=sR&d++PSjB5f$ma|lQMo31cbH`4T}FmCU-(gNJ5}&n**feh>{Ko5Q?8b( z|3brvE-W!8;%}~3jipCJT$S}X3))9=ZA3>Adtha1#JU!vLh4cQ29A>MpqH1y6YB)@ z6Cc#hi12*)#%N9C6)bp3vLkHv4;%~bpy7j&KXNx9GZhizXI2^sPy^b}JVfaKAcHjl zbNv62A)9LU1rmwuM@~S$biM3~EY)Mhv{l1*a09+;l8yD!ee^4=azz7;l0;iY8l_NHh<;L^ zl&Ef{h36(3raoFJkai)LR*9HbEv7bqI>6k+Yhn$YlT^o&Y2x9FKW^!NQ&K`askNyd z)f%*Y>#rFzJ-~7xSSP0?%HwVJLZS^59#ER89va8Tn9JjkmtlFR*o!ay9y0WXxD`Ogn|rt56Vx_U)@y1 z|CYgXGV~MI%18lF^^TfgtVb05nGs~$nShCSL7vfBm1-@e{)Z9F6B$sLht|a}3rL8m z4E3xGUPHXb%soG-`$1sA$N+3!xNcy~J!EHTYs!Mcs#TTtO3ux1y>6-hUe5B+$@9zU zdpW5l?}BuZLN&vu?{nLt8V%bnnbb9Gd5d|~=7WccQ`-v9+4pGR@ryo#!?Q;2S!kx( z=4Dhl;b{!DnwEmPHv^8>=SWi%iG2H%)N<$@A`awu_=9wnkJA_p+allbDQI{>`Sj1m z3tQgqE$6>I3Lg(#JK4<+#2-JI|e5PtaO+lNENEP0EIrza2B{-UV}yjsf0 zG?k4h3vjR!x;=8N*9N|r@YgTKH1NA}fafJC@$3)mesk{I>TeL|-!y*m?k?Fck3ZSu zLxwTSZlQ5;Rc7k@~>;TlhoPZHz3VZUZ9q0Nl%-qJvFb zX2Zy4wBG9zOJNG(A1Is%i-HSB1}NRy`IR|kijJvojj1_WCA5G0iMNR$V(+b_<{H-+ z7k$a)CWw?qRbG>Eb;;=rk~j2R{X|u6H&R}`7VW^~jj-t+a?%gRhBI@we3xWwhFU@o zd#3Go_$N2qduLaxSzW1-tz-K*`RLEAzC-_jzNH6UE}?COa10DtZA!qkIV=)UuH> zTVQ$XjkZddWBx=Jgk|-R%%2@8LbvTONMENcezieg&-#G=NwWw#X;qa;@4=lf4SUua zm}5(a{qUwDr%?uW3#2KMSLhxr!*@e`uynmtJ1}xGDFbnIjgO;8)X^RF47K5I-=1*{ zupW7M{XIO2Qhn|3r94F7>V&A_RyA12b!_oM6&d5hXS|*@4h+AqP=E58wpY!*Z=W9# zV+`+xNhpe!JHTME<}8x05aGndg|B{ zS06=j6)W_C9dfxNH!K>i1&J=sr|@K*NL?YQX)Xz>r~ZbQ)EoLiJ4UFdgx?!C8$>6H zzrS*w;BbP=aRdIxTG+gCX{TK+1MZbmiI5_DuN={?*i!C#Iqznk&HnJ+>gnt&#}lE_ zw&uaqcdU8C@FUtL8%VzP0`}nwM_IQlsgr-4Q1@~ z6Vj;8WNQ-L@Ojvqg~)zIJI^mP4ZCV~DC)XGj^8b;m z3l??Yk2aREJn~K!QQUs|N2~n0=wrL!wwP3&BrM2-TqyBz>Z;5O@$&a@k2^Ng#?xqq zZ21sojah&mdoi0=XBw$3%3M5 zT}6OqcHJ4jL%29MhHW5089GfI6Tc+qxSnEtS+=e7F2}qs`UDuzoC6qrpU>V_Dny&C z^W@>A8;7a(U7tr$)IDUNIaz4^!2KW*`ie!}+9A)A7Y&~d(c|`?%|?lVK-bhNr8!kU z2N7}BRX2rf_A<{m4^T{)#zG)a_^&`=*z^Ih{dWfc{T3~?x$MH8-8)3%LSn&@ga4dX z^2{#gKLF0@_RkL(x0ANEO+$SAG0QHl^sB1_CwJM&F_|NlAP`JQv0_kEswo^$Vg-n;zneUls=Y_@KZ-vR=Gw%VSt zx&Q)+7m3zpn>LD`oSxw<(Mv4!f{g{J^oQb{XtTk~+}<1nD#wBOo|2;d=AbjzLO~#z z=D$;{75U~S2((+n*2?^1r2FDaNU5q}T4HFC1G&auRRVD!%lgBLQFN@5^e?9k;Dy}B zEBoTDAE`7H&>gkr;XAF*zb??^4HL1D^|8S7uMa#t*c0-M|BUCd)Z7{J*)_8Z4?~q^ z_A#1n&ib7Z@;RSf9cmRtI)Fgy7LIaa|5nw8`zL8aR{;-;b-@UMwHX|os!K2A57xfO zq|J7Q^xjO%Tn*!(jfy{52C)#mON*>;;bFXLTb^`IEQX!d^&NoJ1_Wu-#VJq_Vnb9s zKS431Z68)EM8qBfH+v5DwEHPgNfZ(B)|e9n@KT%?2DVqv8ImGp_e#Q6GrXg|t@g3>s;Eloi!7n~eoZI^28)KJe=CNnm z5^bk}dieWss|b(zAY+{$MdrAQfP?svi0?(;PdT@VfpDi^sR;iWk@^sAsFQ1pn1f3i z_WR@0jq5Rqdf z@&nqYN+*I&S*-bgZn!TF-QDfz_*-SemJq!I7|a z;<}sTw8XLANpYomOG>hQ)`ftzIK4K6$DDN zv^Q1e`X)lXn#jJM-RiT3Jlo3)pP6GwNs&*5Z!AJaVUbXj2P1$9sGc^-Qbm^jr*0J*ZSl#x4+!jeQ)Pi5{-mCVK zMYmLpcGa>{y7q89sh4wk(Q(D7;@2~hqz6hsU_ILn1$e{{fcy)*I)*pJPmAvqKg|Z4?V8w zrJIK;ZHaaa&5c>pE1^gRwxoqV3p2fX>fBnsk|)o^e&7xy)A8DVQ1~jyA671`&Uw=)587V8=ji4nADNEOPvt6I6JAw5ek5!$zt-j2TN_A+uNZDnSXH`>w^Uec zxR8<&vW5Rtsc%1`1U-+nivXa}n)vx+==~{f@~SJB>i3X6=)*5L!DGn}Gj{*GMQ=_% zz5|AY-@|QiaCV1FJJ(`%-!*vE0gpn3>xNo1SXI0KsTOPx}Q6=ET+aUEZ zu(*>*CUWobqc%0I%*z)g{%%EK*K%GibZJjk?~h-3CL(MRjfrcHa-o6BSuMmM3emR8 z6y*J7RM&UHQs(b*yEjSy4uQrLj&G~oPBOdyrz(mDn21L|(BTbEH1JA~^LUkEnV%|N zxxD0p#+ADSW1v^4NdF~$Gv@N@y2OCT_T$??%kti0)Vf_QkE3O?;@w5lh4Eljb?97l z-wB<0hiMbTg?07di;socXiWd3Uq{@p(2>7)+wAL6&sSKO=R_-amnRFPf{1h#sT5-L z^pL7UZ9gThLBHmsm$}k3>??9 z>+n4xZfv^+xPuANKT+fQX34+4y^|Q~@XZp|5bQov4)m?I{}7PFc_#d~D6|?7NMEPU zaC2V5j9W+w1NA0AS5E-ew`R4u^9(4`M#!MFpCUKvGkaIl3-YBFjzxti8vdaXj14wt zJem?=`u%c-Pc0+g>lI4B z!=62L$+bZgjNXh~_%D7s#~`Y@J!p{39wDB?+Ra|%UOPD8WJ`cPJHvXjICZOS938({ 
zhONTj6sE#l$F6oM)6_->tUo3NM%qTuRLet z?_)YKmw6_g@GfWu<{la?<;ap3c=ejpPkRMo!dcQ)RC&BAa83an1Rm?GL1yAHoNDw$ z{2EmoFIaD7>=wQf?|Y0tuv6|@?XrPao=U7+q-4|Mwrj>Z{9{I=DSc-EW>QBLwobMr zDUXD8hHaRaoMK6#BBS~nb;pYU?f2nfOdRRp08#r+$>#600*ofY@)nRT@Tx)*0U1TA z+8E(PxgcCRXK7n$k|-O9gul;;JI_LRY)P2_w*Mc-U&J8ug`iGehi zia9awk17}zP6~d00^G@35wSvt)SN={x%zydoX_NY&gsfdnOIoUPnN>6bDlULsE-6D zoXjHdnD|=n&MZMKzriecTH-2ZfBQ7%hyo-2c|*TZ5wFr(D1&$;;zzc?j|^FR8igZ; zm6%Qa4RMq%X5en3Cl*ZA85epaTtdKw>%!wG>-P;8@hRb4iaU?Rgca_c8nx=PO<}^* zkNp@eJDFR5pP;-&Q#PuTrC|hadsuzTIv=uTc z(V(8`=;91PxPMf+6ccMZfTGcGBTtFLGtSE+f}K5g;#^MYo?OckRdK0}9~7tVbllQ| z&Fq}1NIE~QP6#y4Ea-;n#1C*v*e)Tj)K)i(tU>dNCM8q;DgHxnkA0VT-nT}hq~Xnr zldn&4b2}wIu<{90lqnTr&vWqxW|{0RTHfZu(syB=6bYvuaxtnle27RAnjuhV?TZSL zb*TauLAy8ny*ZAZXHh{eqFPmqo*mn%Fu_=rzHkx(WD-iMv9WGS$ZjZ&R3UHoz^y5{_%!eSMB zjZHx;w_e0IDgl0P+&mz2_EU;|>z4#WP8_e5eotD{0UM%ec@{%$?_35WTQIKDIJ+B=3?lg+bGp!A4EY+ zPv7-+p8D>N(>(SFa&2AaRG4jDIfm2kbNNRVVIKdIof*w2pD-Q^dp1XpU8ovV(CBD2 z*01}qkzaZLdfKvnN4mdKPJ3d+hI=u0h5iAus|o|K$RDD#Ft{#C>?l4(9~jWchzOfKBe36>V15cdW%KM>^FS z2+RV0ex&CENg5y6^!;^!_ysng2yc({56bK@fAR zvBN9e(#E5zl+_mCN%m77v72I{(;=>;sSG_(TjF&V%lDDkg4XR{c0>w zV+lFX)^Nw=>U#x!DQ+-wKqDMbUi(>hit;V@JMlOCUd*;a$CHZ@!b<9 z@0OqUEIj8fjH7DYnx(Q25mxFi@r3iU*VuzY4 zbb<~O?QiVs$d!B&j@k%&?y`tjwHj?pI<7PQt~P^sy0vLck^F#O;Zn;n?`*t%tKqVa zEQ&dLH`oijmlTD@hDSAJokukb6L*obm2P}Co%pnjb-zDyvC@!TV^I1%_)d)T;_sGt zXv0bKC8~*pVO4!qMmZ8w>za$2eAxUL z_of@xgijy<`uA(*NwX06s=OG;bo0{8Pf{nGjCM5(YUI%~;8^|*PBIoAH`fM#^6=gl z_%5_wo~_z6g4QVuww6Bp7L0n4g3dj9OrwzDF#9p+S-PIRpcu|+@>Lln4(XnElimDX z#_jGse7JNMA@eoDDwH3oEWdlkZ)i+8&lIB1o;TbEzHqzsreTMrnWd2MvF+$%r?f_X7yt21BNwSZ88#e*^^>HHFq1-!Lz@~~zw9a51e7CKG^7_Gl?Fc^1F07r zp>kGm0qw-%Q=PDn{K;tOCqXy|Zo@kcsIo+#zNck7XxYCyv$TOZJCn%jBdX@)TB~Tx z`2XtdJa=2%DZ9;e;*i?3&UMZ_(wbm;{q;rl(_5(+0`5HT3su@q--P)~jrU8r?^0k2 zqF{LTx~a?{vME7HFkU{A+B>JkS6ThIE5Q48DRj9Z!vf9{6;drW9V%)cS0WQkqzVjf4f=a)mNXGq4LxXa#_x zLA{|!u&4|V)^&?3KH-6fOJnqc((?|aC%3;gD3APNWfkQzGxOz;`cZI2OzBCT+)3k3 zYt_TP#q{Uzy|FQ^>$l^4?q6LmSxNQ=&$JlawuZ{|AS(v7TGK~U>()5$%--a3Y_(tV za0f;DVm}JR_>@AWne0XX+DG^y!dtXgD=x$F^us@SiDza5 z%J}#mMXWu}#^;JPHhDbvsrSbU3pa8wpdmegOvPO%zZGn*$zBe=#jT>{wZb}3gc2H( zAj}d0|5}^NA~kE}9H^mr+qgcy16tqOZ!b$&4GA`XnCS=B#7FW`>NvS{yz6?wnUKrl z(V8&3n>WlIw!F1?xk%iel&5mGl1k6om2A`C9{FX%c!E~u$(GHmX0g-pKzH`o)3Y}`ES()uN03jd97W4{O+!D5WbL({j2*@*&lq7u@?`JHGT>&mzI-h#%b>4vo-RHTB4t2jI_b`BITC3O=uY zf7oS~1=qqgXJj4UNz?-bRgyzIkr0NjlA@|h~n=J^_Zp?FHL zG8-M@60B4zY=&PVd{D^21jiTyBSr;&3Kx0)(dU42e-@WkLbm8yzsb^qT+yJE-A2%{ zb>Z+YY)*Mm^mWc+bQ#c0$zj=Uq=?ck1?#*>K!>0^S@~|{lh~V!qTc-2e_#=DsB$8l zJ{YI_pv6I}+Gt1hZ)i9O1b_Iya0nXdPW$;MP(y@}N&lgUmSf7IX7lfeHDL@egj=tm U^b{UBANxzOwRW&7weX1lA8U(tk^lez literal 0 HcmV?d00001 diff --git a/docs/images/ModelConversion.PNG b/docs/images/ModelConversion.PNG new file mode 100644 index 0000000000000000000000000000000000000000..26e96bcc9e556526cd7dc72097ae6bdd40ddde3b GIT binary patch literal 47962 zcmaI7bySpJ)HaNOgv1Ob-AYI!f^>uaC^6^|kdp49rH2;5p#%gGM7oFWZcu7~p}V_= zdGGOgzO~+Ot#5t)A%o6+?z7|UeeG-S6ZuM0mE-~411u~ol9w-(USnZl!?CdL-4Wmc z|Ki2%-vRu%=k{7v0jp$~VGH;L*IHgf9t*2Hp6J%>KJYuC^9y}9EG$ysi=-X*dYpcJ zz{09Oc&Q}+7Gko80wpo(`fo%(k$))?>6mvABm86aiT-lV4QzpNNbMwZKX)o>fnGW% z{nU5DOSzfrX;UzVR=f@Lg@|`lDTi+ISnmC0)?g`UIo1)c1zB`7!|L}=;obz|*a?~$ z_uX#W-EIQlQ?I5)D7zv~@Snnw{cFa(%6A@jQ{wM7{ABG=lj7BdA9erAc>HcYD@5My zStfuL2v|eG{-L-f|2}QXvC_7tgxS{*!)xQrthsoUS8hZ z@rcxk`s;CtA$u{$DY>V-6py})|6zJlZK{^{@g!}kv71le zcS0+b0wrC;)um1@y*7au{kf2a$ZM;fS3B#L<_!z3wk-DgrDQ>f&E`CPPA1$?n~18D z@uJj|iAmel`en4(75c-HY3)b8WIXe)XLMHwmBa&igmdy}5cTWD*~qa?jllq=G1@eT zK-S}S{?ofU8?h8NU*72!kue*=mdU_DrH_T^HccPrI`N1T+Z(ny2J!BImY319&ivsD 
zqe`Yf&{|iWa;X+x947awt@209)%9s|ZMV zV(07s5Q`T!?@5_@$*IKdeCNWHH_yVwb1a@#jl`P*Ihxpt2|7x68atd_>a~p7y>0T> zXb^vRkK(a1{AKh)0tfEioXy+7>ErW2yVbc0Gz~glWIEsTP7?E4M1e<Ri zm6b;s=>#6aI@5Scu!XBuyx?xPmt37)O?6T<7{|N<>9FmH8_1$|oRHNK(mM+@b+eIx${)OSn_YJ? zdFJD&?KlvQX3k5czVrIHhjTfVe+X~)Kv`Hw@@6SFp`D9SIVO|VjOo^v0fu1?Jzng4 zvJ8$-=khg2Yg*6f|B?>p>-0#9kF9OV}eF=>3BCvWD_W3wvPI zwy<)Epo2U2CVxw2{q8~2+R1<`2b$p?d-N}XZ=~^EhlqiIr-g>Z9CzZD-QMI1btu!_ zG_#GS3~SMr1h)ZMQxqC^y2>~!#^9#a8$ByR?UPMNNe9rBtS3BLMc{cvBv()Pv@ zIk9e`1G=Hmytx^c3h^(mAwo1wye%9PEl*_Y>@OzUUA%N$7lZRTaV9_Gnm)dZ@4Gnc z$(Jd_KcOE*2Myky`Rh$)_0-(f+Z%df3keiu*@mqB>X>ccCe0^7(+IxzPtV(Trn^Pm zJ0F7`A!aoN5i%$3iNo8+Qxt4YyMbTtpV$XLNIo4(k)j7|CwQHv0}=1(_0jjX%h61) z;$;~X=HDYuOFe2@?wTtbi!I7Dcusb(0=;EYnbX=X3~uO{(p#ACJc$EaE{PGmyoF+W z9a6+L^yl-kjzW-Jlz_PE?TLf~ZC){NqmGEHW7tM^ONql2`rr=&A66~VYMOaq&n|+H zowQ8l{fLU#*`q$uM{;78SwTm1cpiv|0h%adp6wyUTKP`WoS(sRckEZ|ZPf~*3aIVQ zH8GqFV5zY{76-@#dF#vK>Y8Dann2!_12Mso(x*mK`3ubzf%+$Qd%BnN1OwX| zAt)HlNn0GE_2|5_{?m1hcXGQ0HU!x>_=S5+ zH&^<>TawIb1nvUo)@Lpb1MN?~FEoeutSd_EV;s3=FZpk}722MDu!Cg}-nF=d7Vzg< zxy8Cc-dyg4O+~2;^GSl*U>ku7K&(d=RD<-Vo*-nKBhXL#?a?Vc?ho$*Ld#4h17QE8k_sZTtn|^j zCmUg9-%Pz;!1ZOsld^@(8r-o@GkvYCeW7X4G6b^~wY!E~MZ zM60(YS|FWSb4sH2w*J$#UQf@zGZ5EiiroGoU@_PJ4RQojkctj@DT zMFcri78~3kpD?}?*v&=b(CTO^(lIbtdcN}f4CZ$lpj+~6@BVpo%bay}SM}FgC;G65 zkg)c?`Wt~8X21u~5k=23bi^#f|2iPbDZZYY(G?Bmwe??~P6^#sRQZE*(x=h$V#C@Y z=;*JAz@cHYr`rSrgO^V4186sz{5NhdYTMl5TK35!WpikxJe(1Rzg>%NYT+Ba3pAS# zp#LNrj{=oa-T8G06xNnb*0Ac|P}P_j=r`Gx*$&Rm%=SBZ2|+G5RBZ09K(B*B~@4MWdp&K^WJ`eH4)yU5vO5dCyrGOTd^#&yJctxv?SHW0 za>=}YOF95xq@=8%H@pP!Dtd!70N|X0mO6yiWm6uohHf(BX)d}d5_R$B=wj<)fL4wy z%?ps@X4gyKp925TRexmglm9Qkh564h-4~0%Dqsa@v%>#8u5e044*!SPa!Nk~@A`+` z!v24-TU5>c{o#xY<(;V(v3kf+MfyVHmEu^1C4Np{pSMP`^Y!IXPfiFva}>X^PoD-6 z=4DB-dH44R2dOxthBF#g^T^-?xrv-v&J%kNcr`9OeFk6+^QWm#-5h<)mJs`I{t3k^ z!smRk>)lHDol){`PGe-mGow)D0?FE9L-^8oNYN%npD!}yLYr?{D z_x~0~@Be+q;!et#_GOV4>u{Ttr0MBw*%Ioy38NHky6p#ZJ5zx)4zupbC$pK)g%XpZ z0q((&OIc$AXkZ)F0}A7XR7@Z3W~)2Z;ROx zm-Y+S@1+<{&aKz_z2GNdT6x^HmU@|IdRacd;(ZECft73`)Pp1?Z*ojtMyInFg(VE< z z7#bYV-a%x0Z|Ee=s!$jCFg6Z&vf-lfq>=4YUsbzJ_^M_%&339?M3~r7pU7Gb@jxZh z&KuBKY^;B$x6edPG^?Mg%D(#mkGcx5^utu_!9`nJS?@@PCPoWlr&Tv+?@nckj^ z^PAq)JSm5MnycOkpoYErbNDFqpw2DXs9s{(8-?l_Y#49>oSeo;_&&c0qxY5)9ptZ| zHq&9VBjSAQiT1$G9iQR8Co8tw<5(x< ze`1J{i_o388-dC(86AzE8(zz{7aZ>I&v~1Eo&H%#$4X51U5S7d570V~Kt`{^0?J$w z(w4wZU1+WonZNcI1-R9hpV#!uE`(i&7SUIlDtML=WD3wbRUvqet-2u8JSgJJ)S05< zLD8*>`$Oq~+w!12-hP*p?{U;qv!Unmo4;|2!3w7WQEgNj><0lyi+>w`o_9ctf2z+d zwuf08iQ?I@OB9f`1N+WFzalOHKMsFxF#h2xN29$)l)zXDF=^$l@@6_1ScCe1*AQaS zD(adUFNm7j-+u9zri#L)U-Wxd+Oq?l?)yN1VW$k?N$Zv}&G9-swB>Gr3Rv1Wj7jCv zBH%})Q*l+CIT&ZT(k3pkq4U1*C7J>6vW94P|Iz;~eYn1lxOo3A9nd|O81S+jCBa~7 zv|3{5L|*OW2hug)KkKpv8rcE$?73fYu;^JYY6*_E~iI1ozW z|JHZBOW9;1&R0ucF?&SxsTX9rxH_;KOsLa`My4b|>*IGKBu5BueU-((sA+xmZWa3S}Ht%ByK`V4tJQN|!|*fbE) z9{&W6cK;`HXXOd!jvl3EqdM0il0`W~(?*CmH`V+HN5{6XC+|;AbQ50#A}Hsd3c*W# z=Nl~U9p=4#Iu6TLuL^IxnQt>mc$#ESmYUu`&iLF})CVYv$unfpZ(q*_Iy0_%y$Zl= z1LRK@a??&EpjtcUI!HK`oh`lJP4Bu>%YAv2GU%$PJl%X0 z8~7y&>iLS;(Hic)_4;_v>czXTUzlL{@}*FDE&C9G(M@SM?EJj}rKm$a&w{GuTOc!3 zjEV`t^|7Zq_l&)yzX8uLsl-Q~PnTP$ibzQ@WQeT?v^Sm!*6FNJ`bUP zz9t%f=J#tt;fXzM5wA;3RNEdg!JOp)OdL^V`$j}uJasTKXDD5kGMO1VoH1~I{4mJs zK0jLdj@>dukSriRAwe>c9Ak?-0dr?oO8A^}`qpi*p7PwR-Rr`5VY%G2?_F^W-eqi_ zifwiXCBdgd0heX*9GT_LO~THry$LpZv(*lWdXF7*q1rn38vh$lPC7(&lkbHJ1qFoz zVyPoy1wV))HT>7JAiHiC7WbD#-RSts5w;Z~qdRudA#rAz=^*Lbat3sE1;n-Qd+UL` zrujcR2v|5DksnF#;icn=S;IgUSVQSMzMoRN;o>+u(>_U+{>+6PrRiI*1}jYTarrY* zOG`(W&wqEN7P+1O)O7Lcu>N_XM?0)Xlst=l?Qn4xC#&s>G4le?lJ_9u_!5+$+=;t?yoMgRbknbLK!`?S 
z{8iAOHY9puGL%23GPAcj-5qwdJk`G)q}%U!oD%q>_A{>Ip{0+8oyhXMsgY64`o;#L zBAQyDe(J24F0-a;^pn-@Vp~YT5`HY*5~-~jm}ZGKyUe)UT)7Gm1hOExyoKW;N$Kkx zRE7V%3~f;yAv)~K#IGNRwo5?ctFb;1GtVmr{%p=x+x3wHhcrqF*}WLfCPplKcr;ar zKVYuE7^wh%&CDjIhBfUvXdxTD-$tH+pFsZr!J`WsezWtzbE*BQWBsoxc4rb_Fz<2T zd(U&_3*Xz_k-PY#4Uz;khH52%t)h%?j%j;#NT5&5%O=djk!%T0kCijlqnq3sY$>oN z#KHlU5FS-P=PJA}q90(3QPy-f`-8eSl)NWGpRqf8Uw+V^aXk#pOy?ba$+y3U!dBgmaY4gm=*_^O3HQ%p6n@VD>*ikddZS zVieWExBRL}g%h{L+c}ypMD2IY+5pmX;#E^m9Aosr7BNI|b$Dp#u^W(sPro!n+t#{m z#0-=^8Z`2;eZ8{cJ&4vf#WE!3%Yt~3NEO$pbA54OLRd3-yvgsHc@3Iq6*)r$q_vVt zH<*{rRBi-aHa-VC%=>7)el3+o2KUptaXM~!f0ff7BGaNci-X>p$S+jd-~L&l)0gDVk$CB3^c)ITK!oV`a`}0}O4jR1tW<<(u3lgC)9Y;_w^t~_ED#g)`FC-do z&!N2bjA;{KE`KZWBe^Mn$Laf-Lwr|X96z=i$-GPFX940FlWr-$irwTQi8NS4U7h1+ z4!z!b=)1xNDn4F6N$w#$`G${we?Li3Q^g>U`TScUILCvVvp2+2)*~`|chQX;wI;W0 zroFQ2z7zXHRpLjrp%pdtGO`{YMN`=&?q##A z*Ey)p(Lv{ymX@9*_#VC&cQpnIrWx))0FU6{<_sLKm=mudO28D_?wg};AFsofAd-Lc zfIUTvI?Pf|`6{joi&O%c$8Rg1#%SyvfD5EmS?rb_gbOtoN`+Q#-Da_RiE$#;5MOKIL{AnR#wurSB*-20d? z89U~DB@rCfC^zVcs2Pehe42;xGB)P_#SzPL0h8tB@y`>&9p)er65sy@#eIM@6%QRQ zF82-5ckZ6=Au{04fLu{{a)>Fzcro|})Jku=U}V2D>6-(DJ5IOzpYQB$&g0*iJM_Vv zZ!azS=Jw@@O9Wga$G-MbDL{vF0O>@I_o)-Rx_0=(-GP!q@?OMJ!H4g`@}$Pk-K=DhriWHc#95oSM0^EuKLfi{@3nbdySiXs_FMJP;m#SD@V0sOU|K>gQn>TNc4$6dON**T+c=ZDTSwt(Zb*;Po^Vj|YZod-a zn@mYDs4XQ5=cs^~A?V`iP=nn%k!j;Es8)2JjMx5`le;gO#O0Ctb39o(-F`3&-0bS+#qjf0o3mze1Cs# zF8JhkQuRsXrc`5QcSHDvJvcNRP8;AA$`MQvzmq8qdTU;>)v3~~Z*D=BpWGYanbO8em?^uLw%vlS!0V8TMhubno_-E$zI zJ=T8OdicG=Znfg#HFZvmQG@3Y$=XizoV4%xBTV|p)z)2PbHs20{JM1!wbFX$n|!_8Q9gl~<`GnL>=iLTIHz^`9O6b*GKwC}-d*kEobzIGeL?gtN$56= zMhRSrsBtJHuT#e)zs?~MUT2Nfn3%b(JTl2#P`hkqBqL^zVbot>dyPm9yYYQP+hEyud6@ zlJ*eZ4JeW*oD<#gso&J8606;mJ(n#NEtN__yUP(?pH^)KPEhu;A4L0%bnd`yIh z2pPx0xnaC3%19Bpwg9jSj&h%Y*aZL9M=#naYtV2m?f6J(x#{Xdnk!)8frD9(7DIpm zIGC$b`|srxwk?~Nr>7|9+=gIy95xC(PkCo1an~_I?Qz_+_d|$jQUv=OB7N!MOn(EM zp+9CHKRz+2bG4icP-9CzNFrqI{ElhV{SuVCF?T;)R2<9G@akILWPBGqA%^GAsw@0s zZpmNy)3X0#C*e%v7&0(fF3jNb|II~!M)=S29jGl@z|FfGgE10s!vN{~`hMDT1LAW5 z;tRD3nlskL0R#^VCT2ZhjZCGcI@lZg5igwx+L*@JjGKYbE>gv&x122fd3Usp1^ei(90n>IEQ zzf#zEhK7S1AbEt5G;d!}tIP^g)X~^>0Y?!DG#qnAQZoYqEQ@o zhZ4+^qKpnRu3l?;;`_yh2QCT_u-PI$0HFUVREas*`eZOwoVxa-f)fz>8>);##)YCu z-9Efugcp;DIySOPczmpJ&)oJjBQOm=K2zpbL;*W(VGH6>%96L{Y3Z2${q7*@`~dx@ zbse?_?;BmTw)*Z5t`F&~wwrncWHX!)z~27YOL)2a!epxUe5E^I_1_cDE&+_(Z*vbq z%anA(7qHF1ht$+R91u0)E_U6gXFC$ApApE|xqZ-(SN5#KBEn;An8M0!YW0Ha94tJ4%8} zQNPvAuAqxRpC&gVtRn)X8)@`9D#1ViS2^seRRx}CA=XN52c&y9j16$xZz(!V!fv7?# zSNxXR!!A>FbpYI#pO=RTgxaJSQSKpX{QpGqz1HVmuaO)bh6Trx+JxW+>j0b6wwiqi zUrqSXnF}C=23yD0IWTa>pBcB?k}sZIr)!rh39;jP36&~Opqk!)xPda)Piq$-iPJ+N;(vL8UCgp)v)pxzR3t7 zn<{RYM{%ge%cLbmIR=;gMD>YF*>o^b1nxu|ExTdBkL7pUAUhY^hxkxcTSrIMde1#y zM}s8#KuJ&k8;I5gpxZV$vyfM+dWW3y#xs zyx6NN?_|+I>bhJi3I!Rz+ zO)Xi0Y=kmIvK<{45hLu*JMd<4c*r=Cl;L^0`WfDW)uHd;aC1r$@k1gaB1bM$3}MtD zh6#mTIjkW5_J;zQ%R>0nCSObSj?Qg>kddpjO5&seejB5sgJ*Iff%{$t`vPVdcV@IW z%VpizM)$VFxWO~AcfK-Tw&OLxTjk}oc7=S%c+$TpPkt4|>Gk)wmok8(6gCu+%l^Ed z4658*ee4|1;63^mVHc$4C(+U~J+1oXOB-%N15itIIu3RB^sMZ(zxQaRP@qgt!%k*8 z2Po|xOHFXFp8c3OWX&)dAT=_+KWRrFEch}!P-F;9bFoeR5muDVUfA zV>=0Srkd3D4{BStrEAEVzVnn$C*LXQ44a6Sm2dfbND7HE{{(rR0yO6{w{v85uHtBO)R)@OMYE6~|JaTIZ-G!(Jci4|3M&`StN$EBbtw(YUeC-}6=VN->lRit4r zIlNUg)p`k}Oa|x2yQoDQD=+S+qTUOuVtKou+*1AlbEruIxARLP75qwh`n1=7tj*0U zgui6`A}ReCSvX(kX8rebes#^(Gc2>YB7ACl zlQCTWjzoc*pR&IQ<`N?A1XI~#C^$9Y3B$WaCJmm&lF0x9K>!^FDmL7)ut;|T>F1ui zdLj}M+Y$*^%YVhz9cC(iexz+Q;UI`(OphDzNyndREC?n2(?MPQ@W+5PT`YkS!V0hg z_c09hC-67H+*;t#_2#Tcm`m6<8UOnKR%db2!3yq)>Os*FnkEtncsPMJZyP?+kx1)R ze|&e_GP|Vo$)l5&p5E=GUAGX)_`g4-^W9~7DHN9(NX8wg3&vsUBs=cHAc 
zVJxn1c`0pjawMj!v|mTfthN3i%WY%$U01T(f|dr!J{GM4bRV$!+D{6$bSnkWrgO;U zP<yA$#ecP{>Ahx1Irt|)2yCpmtwudfJ&SxE<3fGmii?3_`V;`}T zuX!nh&h+BEWjK8FVF&tVAa7xS*nu#UcA5h?YrC!>s&)Q071c-17Q3~z^&4MKiQfrm z=&?CrO1E%&X;FeI3ZGFzaP|4e#$?bpf_L-|FtrqW(2U* zptJGg?RCP8Tbn>U$oGl8+gj5if;p-Sl!`m*An-FU_c+VlBsEZ8{0GQ&tb*as96%10Rz>c?(2%q^Paf3xck`x zDE-$@KFY_`3AhTmZqN;K4y6FzgWa_lmA8d|5KyWJO1nS8$7Iy`aO-!!CM_=iRi^C$ z70&|G&=B9FpK1*0f#2DjX=`*v@YrCaU+)6#k8M-5g(X&+t!Eie#fp{2a|uaRuw zOdfJvd_k!sPh5_Cv=rEa$x$Yt_Zpgciq7v6O~$BjrU>|67um1va>M z(dnWNzi*Lc+1c6s7kzzw{}yG$EXp12)k2(&KV;6BS=JE|%}r{jU8rcF%`#b&fFJoT z>#{hiH{(jjOS1VZqf*IQAZe@^2T=U+1N!m5ZyU+M!I2Y7FQzjuqR8{#raaH*-~D%4 zp**LG>WeyrwRFe)IVBI~4#Oj1%19VM;}#fhonIFAekK&ZdsE-X=~n}E&R8$Wfn}h)Kv+Veqd-DZ|@5# zpHZgDN&y{jx1$$qtA7X`2xo&1aav+A(7hf~?xD5RmM?2S2A8y6NCvV~<2yO$@s}W0 z31L_3$4PlEH>l}5k5@WKp!`DAoya6xuI|p{&))&vL%?CziC5Nml_Wn-zCEN*BT=cv+RXYAV+suMIM zgYPikHs!jPEw*iA*oF`sp~1=S?6LKjPE|dE1>ND7-?+Y&LOg{Ol)$;SR;`>9NeE%y z5uYM0=|2(HQ?eV1|68&RQt8%$b6WbI;a%KbBJ5hmgJuO#eaG^$#|8aT*O}9CW0VueaMhq7(Dm%OU@9?1nJM?usMg2XHi#y$ihEQ3S`iaIKsnha6 zNSdSvKfwAcmk7b3?>ddR3?@2xkz|3&j5EKe4kz;>_2Q}rv1|1yeMSwz?2zGIS&bro zCIKn4d2*(C?4S;p*m$I2q+wcAT$;and zAjw_y`5tq_%fkvZ2FmZXNZ33WlU=0?iT=Fe;rAk~HO%+Ge=yelkH8=Da?6MCSP0-0 zucRPf)$Ny~sGQ;vcikjR0=lWD^R7njqRY?yv6Tdy%}+8$}TgaDyT>rVc)fn#2NmeyIY z+rnx3Wyg;C^n;DqKqjF>d<8uMZB5tMwxJwY>$Ai4PPO4)ddaN5k%Et(N|U>6YVh z8E50$X9I#Q0f8Bw%xr@VrK2IoA$lL)irnqwQdiP9(gr6x<28CTE_cz-7N;mnIY6xY zaD!@NQ3Ut-!oA3}_-?E-66^Z_boZJJK7d|68>`+~^Bc;5Sq9^?49CiErF=byg{af_ ztFS3u^*+NBBcd?S$G(^}l-&+}tUwO_!0QxDS>7+K1Hz`b*W=2u3NFA~24~wWk^iw9 zxJr3d{ro-bskZ%28kVJa|5Y5;Gx?8<{hw@2KV0Vy{cx=5X-1S+8pEC)5#I%jk%kv* ziQiDRH+psz3(3eZIdK|~nm>@{Vy-rszYh0`zUt)r5;*0WU=GpocF@@7ld?!5G?yuk zx#CszD=W5ZT0UZU-EtsJ9DAkgqa3#}sl?ATU(t53?nSd=7J;lWG&#&^b8+LpPNlVB z4lM!aFaTbbejl4yjRKw|G=iIH);@F)YISyL36ZUb5r%0!i4I=_D+Ri-SomfhejP2e=~8Z_K>LewPGG`8JVSp+#}67E*5k>w9|_>no7tQ?wss(8h90v_c) z*c%VK+b}PD2(PxkI$j$CntjIkl|$)bk_Y>l(!NHjBH=ro6O!WMv`A-xI#wy=@+G{H z9eetOs5o({%SHdkIuso%$}fV>>NgT?`yFUu*3y&tOxudDPJYAHOGCA!#om#@(`74V zx%ospxee;zzfH?ZBl1Ip@bi`#u%)h3eBiOl3C=|32;@lNtyM#wDD`@hNsDro_ zx5PBx;l~?nu_47h3&crN+4{_y9x;q)Ks>L=$1n^Ws@rn?_sG+cV4N8>#vF_3xS)m) zZZ-WT=JrFmX(_fX-)$z5kdTNA2qES%{S9(BM1;jcAU+&HQqp##{J0x7Kt>JIJi3$ z2&CnVmc6Cj)?+9#2wo{rfq#53UyS7RtdJ}#PDn_wad-c-*Rjy#`y&J&RPVn+ zGp&1mwf!#ETHs^l$`OJG`(iZtj+dK{G&i8qnM{T5M`e}`qf{--Am;OZXocK{N0Ui`FOc2de|anQ_6kwRh1J`hEs*? 
zCnScffJe8a(e}oh20_Qf~pa{ubG?3KePG7;}K&<1n zmpwN>k4Jq=HG{rc!{?VwHo5ZoUkq(~&{k=)4Etl6b22JE&UE|rKXs`2=oyWXFO{hp zBvp(`F_-oB@>KaZ;fukG;WV1uxz4gkUEG{V-5SOcJIjCw-pQXbrh0E8B70uFrPHC_ z316_V!&$$|nL&E(JXfJbyw9>nD%u;GaaI%^OD(;VVdJOtfk+>9J|vMEbbqU8lPTfx zM8b17h2p!r%?}Iu_M|w+`6%}FAS?l)Ok2EBxfBY#;0-$Y7|wK0B`{AM98x4x`-9EZ zy7?Wd<=}p2xPcya)17j_V$;tI?-Z7nN^nlCJ0ISZl2zJIZ+S8Z2OF6`^jSw}x)XG< z$aPyRfdLk=_)JB8))o3Ou&!w3@MU`ZJp{=B%HpZKokFs|RR9crAJ;xDLZYzInC@sl z5ht0{D*Kb-bC`JM*XQQ1Q(2*eFXh~?T{;QOJN;^xp{P^4PQgl0!1Xipj2T_r^6}EO zk;_vD0JW?R0Aor`K}=}(M~d!iXQG;K73(Eu(<2^qJtrD7^dS%7=_jFfE@BELUi2o8 zI9jWgwr$3{7tOA6IAKdN_L5UnHZDce5&FlVgVAgJQ03$>h~{FVbg8L3I+jJ}x_VOX zPg<+;?m0PL?_%J8Pk6fKJn?5|UYCvC(*T;||CZONT=2(bDn%$}hxL#-{elNEtEwA3 z#T617BzPqoMxL2cll@lgHWbh%if7_ zH2XQQ+j8KqvlGE#cfqF#l3sgtxmCW4${(tpmmvRNY-f>KeAp(5adaCMN>LKnG>K4- zncov^Vy*AJ%&$|fTsm`~D}ob`2k|xmS4NBVPfWvpnx3p+w7=^{J6;+qeS99(`q)FS zR_K6_trnV1c*=cvqdU6M+TR{P2E!BL2~kg=WxD%x!$`9X(9iz=3iW8wNz%sg?hfh) zjJPlfMJghR`^yf>9{}SLM+MP5{*#Mv>4f}1nSDFbVTS@ge7^?7$}_1i3STPL&cKi2 z;IUCxJ`6&3jHX+-1672^GB+<5GsT~71mH7?03H8-bAu9V^PDCW^5orLgf${KwKWOE zSj*&>>B;vC=+pI^sf+YGTo|EN)~4$J0dG?PO0SK>Sy}-juKx}nkQ+X^ zW)6MKq$^cQpb0qAJz;R3yZep~LpjOHZocVBgbQGpFx!KNftM(oF8>y7vO3*n#Ehni z)U72-0Kz!zkv`V%6ST&u)iM4&FLK=AWSqsp+~9@cSef=P4FOOo8)H<+A#flQLc{XJ zFI&EdzdeUpl09;ADI3a~K$d!k#R&L9!D#(6IsEl})0_MrQ&Tc{3o1MEzw2sE;uYY6 z0h(wzNzuijpbb+Ya&Iw(GU$;Wi2!MbPZ#nu7_R$G`ljkey1QHEOA;{E5cY4{udVy| z(^N;J1gT^;tKU%?;n)MhN_X z^KRl#Qw!67p56@%+oZhl`LH?+zm;n=@seC23wZBR)yyiV(U_N-z~M|*5z5MvmI{K} z1hz6gO$RN#7cUyQ!w|OmEq6^XBf&X_z}TXQhFIebq#inPdjFmZlaAx1!fGVRsJs!s z(?jq#Vgqv=_e;QWm5D|sEYTZ*K8lM>S1m}Xmiv{bcD4u$Qv$=X6fM{Cb8L!lwttSt z^M))$)J_O)upd^Kdy=&h7}9Tl`%GB)gPJXQZ1qz51+^z%fF7BGJBBe)2q|lwSVIoh zwM~fIjN(&)yV@`(V;SVPIlQSlGWVLS4!vzEKr$NCBPx|8X{naEftIO3zC8bZ!}6$H zB#jFG>DA9h?I1Pp>AoR}+NG<#`*W(beDtw0jA#|p=u$&~u=|dwr037~6GdZ&RhCFH zAgWUK3X}=4Sqnhzx?$oc#ocsIkrnaJ%8H!8&*CmJw5ut^Imou3o(v9gs@%?J==g!q zI9zlz5Z&)Ao{2tk;tTC%{zn~s^~xG6?e(Wy7$w6sv!-YD8Oodi0VZqaJkb2lFap0R zf;r$a?*v&Jcp85vaAbX9;Y>pD#dej?*2RUh)e zs9K#M;otb}gxY~jX>ep1F+SyDbt!U21#033RFMFo)vFkol2k+L(pKs{WcPdcW>nPS z`p4yoocs1$r}3TC&+6<>L578QCN2P(0?8s(vS{&POPi@GU~|paDT!&87E9)R9NLQm z2=Q8>abxwu85h5rl?qW!|8guv){`3rNc@qXP#<{&S~T~5-|UgB<{K?+Ww z&Kui!zocvSi+48H?gAg9c0G4}MwM_g^SAKF!PobjwygdC<+Wy{uq@S^ zJ9iCH_B8}vpKhg_Qy%S;4@~bv-zroMT;m6pvbl0n&tCjK#tU&n$RdXzm1edDI>kP^ zjrJF+!iw+B|LN;9VFgESTZJuRm?C^@?IH!}hiV9PgVqTQDe4Fon7Mjt!F3p8Q+nb& zX4KYZxr60iWLrlw?GU#;Q7WB2yRrb1+2IyI#uY2Z8KD`(R_1E&J?gFN53HgleJ~(x zH+nPxUxUHdompes9~?`SS=Co8 zJNAV^**g{rFdzT6k4&c93b^T5D!tb@nq`@LZ)%@%5rYC*7WRYSe2&AX$3vHG>BCkW zu32;dk4?ZT0WitZkVDps4W-ow+7BUNzNedGmmFf0SuJ9nSQEKsesfj3=It;XJH z$Un+<(3WA?mTCOTj!Bu?x^LqSc+0yM$W_4hqSISU)26k-=FU7g{pN67)03NnD|Kyg9}@%M{xK# ze1RsK&B;1QT=6|Hj+AMGxtKsNou&T9aiC0Ur(EXj74zBaK-uzhKi(?x1j&=)_0Wf@ z6r0btvN~ecRqNz;`Pim=8sWeI{GVry1D|hTG&ZwYnt|1qA8$Va^Njpj&d9r zZ)=%%RkaVTb}zr{ny6OkDUR-C z9#)go+MsK+9&g(vGOPu<(Tt-aU_e=KrhsAH@F~7+dr>%xCgf|39QRWwo;kJu*r(Xo z*sdPYEZ%3&jhgB=&F^11kQFz7?o{t3S7UA_i!Bs{g@#r&a_7OGc+7 zYOd0?As&$SLs;~12hRHj5IOPnA_(`Wcea|2KmjU>8FT_}4}DX`k0(Sb2PzISnFz_f z1?zgQ{fF)>5-yQFifQic61AURl(IqLf%0#|n}4l6+Xki$CCA=OqD2%Q$_M3($AA{< z&zPtmXyX!S2nMP3D$q$muFOO}+@)-f7JhvFPQT&M`FqOTaJdsQQ7Te$AeTkAc%7=lNS?gyOj`I< zi+^{e3RTFD__l6ThR?ucf4vn{O${dRP^8>@*}x=wrze~*Z4$>^hS`yo`&-P8uz?+Q zI7UJIFD(o+jiP0j&sg-zn(_voX~cUhUMSOmJEji6uurY;p{AxEz}?Qrt9>5|j^!Zt zG9SrWeXSL^?G9o42xbm7zbxlBdrvJP#K3hQ*#T!H?-uaN}z{ zq<%<4SinPyS(GYVKNz<@zl6nW9>-Q%7vcq9F2`M$BNU(?|7X}rp$bFux1H(-znoZ0 zS-%%XBx>T_`=92*$mp28ySs>EGQBwtqS4!}r^(j)6XpU9$NMs1z~DeiE#%S{yJx6w 
zs3WbKa4%$Z&njK86$eK;rMKxe!>JQt>dyZ8YJ)C=hO+7b6&E^5HNA7=P?fk5(mTof?3wvB^t<0~Tw? zx?{cEs}Bg52swaQ#^NVWpv_EZ#QG6Pfc_IUuW354HmEpfxK<;ppBqZNJDN3a&lR%a zTuyS3Ba(9Swl+#JN{NLvUFhamqCGr{F4{uy6CeBFFBr<2q@J^o2EJ0HhB&X@9=;hH zlgZeAF-hz$=xaUzi|YM}L&rE43a-YC*^ojeUsP$$a+szF`tME)iM1fWy4yYXmLn;K z6mK6fiO(jS+lvjo;gh-2O&qJpx-Rd(>Ocgo^u%+z`Dz(73WKm3qXZ00mKs?SiG^wc z7$mc0SL^mH86*qZU(i~yCBHStobg#;_GgSGSs+oZrv43BR9JXfs%gGmg1{UaFTW=@ z$$hwfU8in4p{Z!(^$r$CHShxsxd_x;e+X)S&ijrf{E{<(R?A6SHO7vr@ejk)^? z2$h&1hZp&}lr*P7ZOrPc!d#(v!C*FeK!0hWV|kO^Vsxgf%dzZM7Y4&GXI_pw1>=-{ z*<|R+}C1>np>ee8PSuBm`s;DUn(PlrBNKO9TXiT0k14yFsLu78MW>5F}K(yO)rVQaYE8 zg{5=h4Ep}B>pkau*^j&P%+51&-@lsK-$-Yb^grC6nJ-RuH2g0t6OoRWzBYop0}QOH z6b!p;AKu1n-pv?eF{C1cg)*g`sA0|rsIQm;@oZwR%%+DADKLK`C_srAQlS4392^g> z<`8eOlVn)J*woaNyU0GbIPgU^4QcBfZ36=E#io_mPUB+ko8Ls)Lc_wo=JEt&zh}2S z0b1}{_Xgk8YDi?3wyC`MP~?dTnH>7oFX_phbp{8ks#Ys>Es)pIYK#${k=@XXqShC$ z*ktH@CUha>4?o0zdpuNV@bfNHFpwR}<9{iGFsGlq4gI95sBBY3ey8CK)zM3$yBI5# zZ&xN=)jSWa&cU^TOU&A* zRjI{Ie`E{*4212a0vX#)dvLH(VP6W^#Ko)|-zmB-{6qm*eNw#4!y&$_!5JyfuS9V5 zykRoEg+@)1jq@*wrc=c{I%L1s80(N#OjWDA_+2EtmiacZUkgrVsu!!8NUc_Q>XEx( z-1U7WhaEd{un#Y!rsle^750oif`@M@SOkFJrdWzIy7a4@^1=U?AX7%oN><9#B4z34 zF;+J&@HEVF4oC~i?%x8d+4)*59pQPNmR!3cbzAst9-`C{^tH2r`F7ELSC_?mPB`*3 zhB8s3i@NVv?GM)fyl6TA#;6Z9e_$tDfBtugN|x!h`bwzG`WevKK@b&5NoIwRphz&0 zBC<+lgM)+Y^Fd@ERPY~~US41Zfu8=uca}{G5xJiUsWmkmgMm1fUjb6hHF&b8!IHgn z&4H;&?xjC0)ZUOL^2V9n&wQu#=cs|8ij~>tWHnSx-=nBAC6p-?4PoRp#JDA6E$W59 ztqq>0_S)O9+a`@&3It5HmKN0UwhyMq78kSh)3gglu$leP_BSF}3(%T;yECjG6LB2a z7`#26oUZfGCWQ8;vxa^^<9X*W|5)W6=9%)|P+Bl$Pj9}ChdXzK7GQ_Gox6+)zAq@6 zIlWVqRzxsW5KI5~afnr{wuPm^KQXI+wNB%6S0I77dH`am)j~(~3uKqDiHAyd@E2s6 zQFEmJ%fOy*2Iurb$2h5-=d)4x{CH- zFh+@MJSYT0K3T4ZUE2F+c^4s-IwW; z0J16LkxKb^V^ESOO%xC&V~Nj7y$A-kueJu})#uAZEVC~Qryv3JjyvfJAu*=GbP#=_ zqnWc$8}gxWKa>rrVyyTsg|_`MHkBz&j&7J+Br(nH1TG)dbM($^N4l^-n(d>7$A0s%jZw5{N?%O0mskJBS|MC z+Tgt`HmsH2;R&B z!@5IOt)VUY2~~)BMKZZj+~9eDy;c zg+=ZQ^x0sAmNDNJA8e880RUQohLT<5^=yz1td%k%lyHiijlMAJ6uEaeIV{#WKge>| zc)Av2uZytL#P|HtfVwoE(XYNZlKOVX;wWt;=Z7MT&C73TK07y_-=syNh%^-^>mCZQQqY*%xXtPRV1cgwmfaG*W};;ME!#pyz>#r}w|C_!y`hgsj@VSTw9_)X@z) zHp~fG#dGC7OBo!zsxPnhIE<_uc47)=Q6W}_%j&>K-2Q8Cv^{{CZ4mg2Mz^QsJXfWy zc9PHc*VsM`(Yq`v-`;+*-;84fmuxubi+VDj!?8D!DmgOwQFoCcr@{KDT*&e%o>4FV zbW9grT78@;Rcs+-`Yq^X?WT0dYDkOpvI80Ajh<@H?Izm~=hRJwTp+@evYr#)0Bkrn zv|29N?iH%aPps1pWJhOu8}%{@P_3H zugS{Tj=GO@N4}>gVH}yReIP%9ylNn#i^b~~zQz3%ueEoJ3PDBK{ zO=UFW6j2rmTk+e51)NIYaPED>m>7dW6EW{c`al?sCWOFcZ=S_{T~7C+EpDREfy zyB>{ky}Fq+adj@se(~lv{nq;5YcbW5ia4pwne3fW2r+jnO;f|{ugOv`OCVhCdlN}u zxI(ClMq=qw5`?P=yQcea#8I7WOg9SNjg0r^YR35>$2DFqJNWHC)nb=-IPPxrdja6} z{^0xMN!T91XWa0fQdp+?26HM!CR>vuom6Lsq}GnKJrKhCr--5{v5TrIXy~Wh=@-jH zQ%-<2*v|ny&^h}KoWWvgcNmHYz$@G6RQajRnkY^&r`i12fcwJx;DP*3%YWv>+lsfE z+s1)#DGxF6o~vcorwRx8Ft030gBeN{XuD_8cn}Y^t{G|kvVMmd7+H<0w9g5$-K-GO z&iZWbJ!RBQiv$H3f}lqV0jpY?#K%Rulka}@1#_o8)~Y9QUd3)xpL|zGjVc4M2A!YW zi>oh|h!d0a)6YI%)s{@V8mBUaP=m3Y!lJ+X!RxCvD+6YbN%qNoxJhgCU4kpG;!g47 z&T<(-G1*N9khigM-^Nb*;#F_~32aDZ-A!okJBxEHz`GvEZ31|OdLPT!272VOF=_)= zqbEr5pclPL@gP3W&_qR$LSwbjl=f!OSXc*f+mwT$K}^`OsXS+=b!SwU!saqY=!c=D zKsslfs?J+A@>&1Rd8m9EtBaupmIxGc~Nqm~(i?@O5xpofi(%?w{jHc}(*n;OP7)GcSTsL|0dx0*KVH+XD&B|gZQHO%J->eH=f3xk?=h> zW|J=s&UYZ7LBhukXPhef_yI>7@hH1&JbqhXF3j_ zz!`yI@_t+KJka$tL7%43I18&T3bDVBj0#E6@r+u6-i*elmDO-8L^lVG%Rxxx#i@aW zxEh~DjOy4dc^BRCbN!3tI?3$wPvb$fD|g&(Ka|f^hWMNF1Pli$fRAtstARcU_GqOuWIFz3tc)w6NNJEtdTdR4$>tC}-)crzZ{9 z)2zL=c7;T>7I|)6%ECTnVYta(-prjGB0KejBel;LQgex6yY!3iB2g0OU^lB^oC*$0 zeJXfM?=3!+hW(%{_W7?!0>gkNk(>O^jPSj#448XkQ}Ns`GP#SzXbXAn%dZWe$++U; 
zcDIN$Q(l67M;sDqVl-!>Kw0Cd;DM@bVt2n|3?DFmL$;hqC`-TkspI0T6XCHH>0+=e zlz1K2?hyoWkauUfluhAdLpOHjR*jt1sXK(8bGe{W4~wMjqWv4_-$C zzTd2eXxpo}oi*`_zSbQwo-H3YJ83d91)fwf?S~ND)GNsQ-`AK@xiiTNpT8cexKc~H#CfeszTf3ZT{s`yQ&ik~zAD=rPtL}UO~su03DO|_Z=q9V zN`jZ3DB$eGOx^VJ$tiQZ+vSqw#e&*6MJ<w0G9;|1Y?y!9)lUZz z%N+w(HeOa(0 zR0HF0j`{8&9-qu6tP$5E(QMczFhsrbwk!0oglydNOWTITH;9jzn9xhIYQ(a?&1?1~ z4*84wcvaFy2xy2oh%^yis+z* z@<}qw7`t}KFzP#aOD5$^B^Y~+T#2b0y}#x#`G$L#d4*!QMExzx+7f@=%Z@JD*z089 z&{L|ghr#qs^zSjIUTgJ-+ElC=*8#M<3wRF*IhR&I*uGAI%2j zhMUxks4Q+by9J~aA&RuVxe%n%p>bcnXdHu}qad5GwG_~b8m|?Y{o(&^JFAO@6N^U# z#p*OEqWmDVI@@5)SLhoJCAI&Q~;2MJ^bvpO8ih90bair}; z1l^aMdO{QOv)-8+`VRkgeL04bR(XQ8zK&RQF490WkF0fo!-?;TV8Q~wS_a4r!)NvG zj~9_-xdr#^#YmIxbHa?@rQ#4-{(4S=yYt)gK%bCFR3*dIFdIby872T^PPi0R@aw%ODk@?8Pu$t`6OLk1CeN=0JBmH(G)u1Hz}o;6^eFO;3kzR zC#55Lcy22RjD|XE)goP^{VTC_c?7V{qy`_*mboT?|KPM$5E0vUJ)wZ*h8je_*Zw}c z-;-~LDMwMxbK^-=&*>Ejv;vN3)Uq#oTYXEG_(1fYF#or7F$vg-}QrHYrQd z28Jv_$ksE!6?Buo>O|m{?bICjxK?uf`pQ*D=O^+8PJ@&0Yulh@~mKsJ5hs!qYC51no6EnA_>-@BmXh{gG zloNUTCk&PR$=sk#{{CB9onHtpH{SlE5p3wE+We_Qmi==r(j{wXJ=NSe1D&7G{+`t3 z-r)B6uWbZ!%~Mc7fEyX?9!`~|RY9|-Wj%<0zWF7itKP1PP{EN%rhu4LS?Mx_bn}!mU5JFJ(vcv18-99bvA00~}D)YNv=IfBzO;z(vH274>J5Vvj zBnav@T74)kAd#i&udJX+9jqm*obwQxycmniDwhPj>T_LE4M(n0d2VFF_LXIWv*x!< z>;ju%(xXB9Hhr#3rT|E@sl9`ag$3S9wcP5fLqq}>MVuXP*vB!_H?o{5SvS3!7OFpWIqn^n33(Vg&M5i5zQ)5O|DuM8>I>I!4x*IZ+|s9@zm0z9SRy3-Qcblc~GFn71vYTgkacv zir}MSu|$87R?@IJlcBrT`f4C7(q;q@!OS+Jps^ezdrTEHNlbYbQLM0td_W*ks+;-V z`~m58jxo_j;YVQN@Tp1+V*gFts~Jq5c2P&V7}76~vbgEIr|GWU^qpBaur*s7CNOym zJIvfg0dub?h#T>3`@?al_l~56UY5q(_NmxAw;S~}XNQA3dx|N`A+zJQZMptZ4+=<3 z;sfbz8#rd|@`K)oummfL7<1m`c(5pc#N;4=>Q{anbklIMpP1x@{FW)L@YR5Yz-qL^ zYm%dej*@EgmddmAXr)o=z1le51Bl!2TRjA?_Sh{^L}u|vz;MtpUS|3A;SPBmk8vDx4ac91AVa8)kz1k^!Sy>xC(hqb)0x-<+4yG??0xP09n5z&PKNJtAU z0kqLbv3@XWJ9Vf1enbpIXA0S6($So51d`U|w%ts<*~W;%PV`}D0qkerNhlwxzth$L zIGfh9{Xv;9#yx2%b^7K>p#C8>tgXIUN3S~2OUG{qUNu^ow0w}Q-i@g)(Ze1^@NXQS zHJ|2N9waKv<#-*H)z`Q_s-qMS_mJJ+nr)I2byV}O26Bz z2eUUswr$&vC_mx4>wEe&-c6^85^-31j$UMVK0=79XgW!s<~#f$2OX9Aj0>ZZ%m2JV z@6i28y~j!}DI~K*d~^cst@c`37O{D{R`g-Hd6m+Y_bX;erU@(Q_7lL?$Oj+L!j(ja znb?m|%+V^h?WW*t`FdXtC&tp}jfea>QEX>E^VmBNe+J%8KfXUS9~(x2Wt0`lSAM|D zV?S2Y+sPdRBdD26a;!j=7REEl>W<8kRvJ_F3i9yneQ%2iB;xAuXrk|lRyAn!Jj5?U z_e2u3x&abor*Q_7z1&xDU6!IH!jX=aHu7bG-isGXT&B7sYaW9(^bF!rW>}Du`AC>3 zu3Ivwu;X6#j>uFCb!TfgZXvj9JnZ~788Bh99-X6+$NGg|VK%7R4zQy>*Ru{CV`wuywvv zrG0efLmPX7F2S~q2un&Hjvi+8PBFb?l%@4|?yYQU2f>vU2Oa96mU5}|B|C#)_Jeji z)Vie@fo^4;zn@g%fN?qUjs`tf$8C16hz>WGkn-_tB!|dSL>u1$|~b5j~YW zLyn)*V}pyzzKuJ21XM8YeA}=Sm@UTp3Hr2SCpTW$Wz*|ve_d>tyZ*T)VTe_-a1~`L zq=9@VoocX8GM>i!$VC7_?fi0)^u&n^#zgAiRhHUEU?)vHDH?6@3Y}3laglrBF{+7D z+KSkSh%HELdzjL-KcBo3srzbzr_AO%p>Eq2&2QtehsjGAVXNgw=tLB2aMQqr)EW6@ zo)6t6^82$vQY!qqwxpdU2_FzwGrB{cfaNcRJ6mX=D*JHeD8=?*JsT33D)x1CAR~L= z&Dt6=T>dZ&1C`=C`^i@_s|!tq5X3xaw4A+H=_6mSjUl0uz6>`kk0mVdh6vL}3@O&v zRDk?e@9-X$=Yj98H(lr{-p6Gk3s(H7vJaK!q|l+Ck&laZTV@W?$@Qr_lS2zljN1^>vKfDfEDG37N5v)-9Dhx_0rPotlo4f9d082;&FXlP#`2`*mjI~%pAELK-AVqg z{VVJvk{%@v*_fD@MiC*wyDf@?A0{hbOoRY_A<&T%T3;&V9%UG$S#Hu!(h@cw*Ur)H zh#+Y9_p0l1Vl*Q(klqj7X)4EAih4TQ_Ab;@W9WevvAx(_su~e3q+mG8qbWhyqeN~d zLOpzjh>E*f2bpOsB0%#m8WW1pf}^Iq_KK@ej;K?U7Z$;aqA4h!-L39jUk7~JgTh3) z4iF8nmydKhkn12O((Q^!uMp+h|MCrcRQC485yRzp#JfGzpxZ`fQO9WZ_Mvsx3yFUu zVYUP{0dgtwBnNGCBbYX|Tvk(zzjS|{etU(HkK6`3M(m<5^EfbTh`yY3W|rRdR%}(; z%>eQIsId=qUJh+US*w!PGgt9nbjuj2F*@766&u&i)fcb(gf@(-mjbpsE}l*^#a z3I7R?Vf~{|!nKA1!c}v!6vgV1Ee7dQFUj(wU4_JskPGokgicmrAZBv$SDs;qOD-Wy z`s0LnFI5?3sC2c?oys&X-$-`HpZaQNmr{mjzpVUCS_$=x=1x1{M%wD1sjGkMjxN}2 
zuOQEYE7~lzi((pwaWbA?-^oYhpF-_zZ6D-REM_=@mc_#N(v5C4KlM0ky7u_Y{Kga% zF<1H4O|baWM1HRCplE^j(QtiD&%y`#ew$pdz6ca}zUH5tY4!LHs=^fnN)V`5MO`$H zU+&-h<>4QuzS5c6==~k46JFGwr%!~m$1H{S*irO!4fY5c>tc^ef7bh4*brW3V+UsC8uGTFYcvAmi#s z^P+{FC+@n%@wX-{kG5#j>?PL(N3Zi=x7VG|46SbW{v%d7JD(X<@wlp?Mi;h1G1mJ8 z)TvHX{3T8c9XaybASWslHCffUG9G8e-pE(M?p-#raBzH;U{m#Q%ycA-Q3pNa_UzL; z>o>Dl4&J$@FNOWR9TduMCF+w(9Z)4v=(Xc{kNp+noqeJ{AS{}!4=;;p^sS5tc`;`rFCW^X7{KpAtbXZ&Xk6R#wk-O z`1y;eYr>!m*MhzY8#N>#4`ysq$`xQgPVIx#9iZ_Y88O;@&FzxooWs_s%(q{Y=c}#KnjnEX+Df*F5_n~7L(;ohq6E=TjKt$7u*wF1Fi*RiOk>b=> zw`CbmUQ|;@yDyfF`LfJjpCQklwOi?3df~m|kjW05&Qgf?il+q1XZe1apP7U{s|F{Z z)Qoy)6H2hpRfswJ6Blq9p_Wz}e3M2os`Ltq4jZiJq38ZV*gKQu0^r5!b2GF>7WlC? zzC6{%_hk;oD&m=$5ex#XotS3v09e4PibLHYP+Oya0hWpV*EHYA#!}~e zQ?__iZuL5ykgNQN2|^e9o%m1XBCZjFZlJk~{h;);?o-F7nid)8ccV z=c<-`=UYPXlkGjw>(|X5>j3U+v~l|8b>re-06|33Gthj$Ro$rOFb47C@}Jw%^&h7+ z4gfL)XPOhP!bzI8{s|A!G(K3tH_{9`VqIIim4N;@UmGevLMrQg{VXq+a0ixD?IaOe zsXk{;xVvZ~AB>m&bx+nnL0rxdF8AAEs;DQ-icgB6PkjTESRIFPEp;3wDw8Y?XE=3o zC$UGP#Ve4pnG|d@PstKNXINujZ+rC@*dEl-#hb4bN+be{1pRBV75HUgV&e}A0+o4> zR?k;5i-%(6U)zTTHeJ$~lnxjzxoce3-YgTr1Pl-QW13veK)<4OE=RmdCj)N-ELexM0KET7D1i5w(JZir@QCtlrkzO zmj@{mt@86({$`y{bH~}b+K^jDhx&~AChnkSApPiC?-zS{5aS@k)5suF#H~wr%ITue z*4xLGt<@%{>Z3CWNdURjT6Hu+JYbovkOC6u-cWr_))af4uy8ccj~_AIrN5BYdQWN% z9X8SPi?cR_i91ko*klpH3>8fMkFgF)?jFcZRB|~yFoXr1!Oc@F{J{q0CAg;jX zdewS`1j`-JW4b#zdIs0>pS|Pm#j9Y11@1i+><1bK4r9Xeu*iG%esJ@KTSnSa43W1$ zsdH$>x!CB9ekECSDTd3}>EB(>R%N#VQ>v|&v}F1ntCTo!UCVIXnL5+n*H>jsP3=Ww zo&TGKq%=d(#$%ZW{?I~6e_U?a#6y)cI@_ANKl4-Ne*^T|dVivBHg;x8(9`@$E`$9$ zubY2tpu?iJOi;2!>ly96!W=@CZr!laU!ERn_ZCnJizdEDArB^vX@D7?`k8%4c|oUd z_c<>fhw9{ty7Uhr#q-^;^#h4-sZpH=Asc^9IWxqQWlJMzP2)JTH}8JUCIwdqUC3(~ z6(oj(2-ZO$&!0Y#o!oi#JfTwmbGir6d2bNGX4By9cdY?q_4>}&nv=7~Y_&0v(WQLK za(reN)RZn=>#NYY1vkvr^iKNus#X|>c$xxL<0iwI%3ZFrz6$~ApzVx@`V}_c=flk>Jvh^)d~z=+-uSoijA>K;uns=wWlIbi%fwO5%$=S7X#L3O`;EX) zej+}+Z!%(M;rNXJHoljg8Z>ip_)ekgEjYtkY;2ea0gbHZ`IL)!HaR&WEv8-XB>~Qo zQB-^H3+H4?0+T2V=AM${B;w$a4c?YqFyALeVu|`j;7fIACKu%^*-vKZXUn{`?QlM) zsm&`ybfC$@bU#bS)WdM_Q$?UW0DOMO;+^ z#=L1fAm*zJ5;G8lf)b<3e`ZeYm%c_V+gs`xYTQg0#fPk}8vasiRIE z>vNvB@9AVXMS+SbdCyznmz; zm(nJ(nWerc-*D0!5~L~$qE=6^hlE8v$=C;aAb2JYWQ&y=1TNx3T;!wkoIG&54)Peb z+obHWw|Lk~8km_?GhNO3VUD@P^N$-m6Uo|aNgV_Lgo9pRrg5=5ivwftZ`VM<26nr{ z?swx;*~!1ulz6nax3_cY>T;gG#^b;(F-b@`kJsH-HmpMs7;-K6I0M_O{(^@;LBtpc zZ)$#N>mKh^n@N46csXz%5J-6L%57Awoaw@zEH!uotkkIk;9eEyl~u-;gYB-k$gSf7 zE&?@1dE=+Yb!*nbX52W6$JnD$Sgc(`=%E@H37Ur_fqjtnN>D`mrsxd4gpYcR*jOpG z-jHRs!NC?0>>f_wX~h&1*-bn}xjd#|HtFi@4*5(AixctQw>mGb9)ktjClG8YJ(@+uyEF)qE;x~;pgIEfXD`37%fCIIuAi4V`u%;J zJ)`Go6C#r8;WO#nxnt!?OF3Fo5TQEj<#hJPMxPYHA$Q5|Y7gE7FOQ~D2cr$_G6iaa zw`jruDZM+rP{Yxr)w>2(!^stUA0O9JU-Y`%s)1T~pPpzT%#XebCsIqZ0XIsTr5?S;Xc4HvtX*T`)N%oiBudho@W{Q!mdMrs$;<-gi`P^ zb7~*!viggJ@v{(8mh9;e{I@$hJ{a`HT4t38vIENw?1-Hx6@+kxQxJzXn5u?5f=_&ONWvZ4CB>r^a?(T_Y8l+U%h> zN?h)vqoE1!UvX}*Sga~Jucj8Bc^Ula8=jMCYv?9lL8W!>g42eZ(AO=Hapx_Z-tTT2 zAa~ zh`5P%9^`3%1%;VQ5~Xk8#-tG`Y26bNuJ-PcK(NB}5!Z#~>+}CjAjzmbG%ON}(Vd#Z zsOa@6zP#no>ozv)+NH``HF{nTa%Rc{q@?S8mK%8#i&u{u@wFML&>qXn#$R%>6H& z`R&tI4$>eHLuyoCGxgbvq<1^qY40U|`NTjRv8K6Q=qPXO=h@VV&I7TOY`1&NK5zSG zchT}%dXnhY-FGcPLPu1hRVJTj_p#=L+i;?E$8i3}eYfe`Y&ffZl@u?F$lww(6Rvhf z25;bkN_AGU%HMDQPZ0j}7xDsGSaB0cuv=dsBhviy-=+A6hZQf)e`gsPE$G0VGczMB z=Mcc(rjyTAlNv0WU#$ZrY_!v}RfP5Mx zV$-O+-TLN}BB#EAt7}j1Fm1mFtEdvy#Pg<45WYli1LsT->u0{djP&Z{*!Ea&$!kwZ z0MYWzK9+J@Cgr4Q4favZf4+N89gtoaX2(41pWd|FUHRZA1NX<;W@ow)%no*MVp2|~ zo7yxDd4#K7%ej1VPr_0sVWK6TY5&&hX?w7;bQjc$tg>-{3+#gGUkD7Eo{d!-b@#i_F_-MQG3YS&-EZgj_f-c) zB>^Qh)5X%0@Nla*CTrSjMFk9JJ<_*@Ahivhn%Al-WyBuoYIs()+w 
z+1F@suq5o)L(KA@B~Q4NBH}bP0a62>Hta~KqK&OXHzeA)3yGanXlO6n`3W{YAR6qP zvt~)9X8v%3Jt{mD=Z87osbx-_KRLKAfSpWC>dW?0xDl>A=bp@6{@tICNo$qOp-^Up z?kev-)_LfX>FwUC*&h;g$B@c<&D_?s<9jGJ@eBglM)IBNM&+AgDSg1N+bxb40x|h_ z6g~}>x>w{t%*$8hSJyN0?%`XSwE1#3vPLPm)B@4t(yUbi1FL5k6WQO4eJQG8f}%O` z6ry_b^JcEBM#Lr``HH$`4EBbGdL-=-ob`XA8kKT_GY9R(xkc>lrptd>*z`Aj=Wj|S z`yG6r8Ll0AfoUBnNMa&hoHlyfit>Q-W(Mog`2O2o=~K^FF4t_{bf(P z#VeT+Uv{sc3^)#7g>@!{kFObmtF1Ix+;^GbqV4xm<6V8<5|!7K|NS!Fnvz2y>Hin1 zPy28kM3(A4I4U+v^FjjB<50^t3?-E)*a(g`z!u8HJ{Ry%4 zTLB(Wwo(m76R>W6VF=YES#SztyPV5$4B~w4CbUo&PhQ9vT#dy<3?Qu)MSf;(VYOb1 zaSu~+y5Pg!8nuH+!a;1z7GQp`$uq*|5DVRpE`?s-P95tKR<+HK<~IMQ2ICAHul+6O zhq-5FL)NkIemPImmlXXshamAwP*UI5g(L{lqR^;eouBiaHIv#vs5^_$po~@j=gF&A znTua}1{OzZEN&0164n4{ymDqdbf$UGYD@pUEf_LB_XpEjbm{%$+rzWGv9t&-?X8BR z&{ctrdzU?zS_Vfvm(Kjma|I3t6N&Xd7pQS|3k={`uRtg~Kn!Cp^MY3TC{W}b3x?iH z8$Ul!fbUnAx^&dKqoCC2Ym{A2Q&bEI>TXZJJb7-sG5N%{X0L^G2IB8!yRGI>tLc&4 z+~CL*;eVRT;uxJOw(UYEY1`{Vd-KE1d&%6hf)eYb^uG6NUDsHIx$fXNYyta8x$ntt zwZu{%MfC7%@j}4H0!M<{PM?I0~@?ifLO3_6KeZvvt-$L zp{&Io{Ki2|O#wK?Ib0UY2H-6iW7`_?8?fM&)zu*nS$qP%=HWZLW2V9hZIC?RV;7JB zD9U7(J;zb>(q3z295cxu|AX~RA2$*Lx?RJ^=zS?qH=5IrpL0(eQrLP@d^SHJp$ejX z9SeVj_q%|42SV}qU0@DzHJ2sk2Of+E$$)(I25Rx=Wcod6a4rYFWrFL4gnGe+@gW)4 zDTRB)<+*ywh=(tiXYUcQ{Jx=C>}>l~ygEt`;ko5$r!vcoM^wKTbpVda+V8_)cj)Zs z$m~f>M2>{;>VOniB+mA7D#&-~+qL2*FafMC_* zCO4?AESx1dy7RBMgwqDV>FrmUe}z#R!DW!3YfcsLKL8oOBT??__2U)hi7(2}dqth< z$K2j%({beYo=XE;uSh;6uE34RXM4RCUhWugNC&t8lH}Z2s7~af{yx2@vvz@ZA}#%k z6&TW=vq>9$0cQC)X z?^eFTa{Q*Dv~=y<}SA`1?P5&6^WONf0&j>EZ5nVN|`4x$^Q$>Z&=k!>~I&ZgZ?#2M&4R25{B?xCUAGpjwl^-zY2<1(o zOK4#vj1|6^@PrwT8;1=7~xBRec+&5gGFBzYJ z*7A6M%lDl3h)nn!@zLjcyMy(~DIlj{b}VHWNz&U-vk?#&6xhKDXsFhc#V<6nQ%i>| zzo{Bomy;&%Pf@9lDyt{FUTCgb>@074UvhYNZsMDnF+FT|U2hsltJY-4*JX&$_Qay? zH~>7&At z8d_vms-3z<$J7?(XW<4QHDj;!S0C4-&d39jc`$)V{mzv!e`NfA+ZfIcj4>R7EJC{O z+rtLv&;BU%{Ay8HSRT-DBft(dC$r*Zhc9~*()E&uiPWv_4sz-95($#RshjP@1~czM z_)xkxM@D4NP8Y#}{ipM2=G21GLT$0MFRn?grM7QQgR)46=b`pe{l~25y8%rrt<2Q z>8#Z>EpeERw!HK0lQ#qR3J=4%^j~-U@%QRaTiwpd{ZKIx6l0ooqwI8?o96cVEuM{k z9g78uvhAKc+)GKm5$thQWG*k6m!emVT0N%(i~{Z@xdauLC2gN5+*|nxZVt zw5>T~9aJ8prH6^vDEJ6i(Oa1Klht4NyWEXh4rg3Gmfv9HH4^psWzNRvKSEUoIMn0~ zTFd_0B{4!TXN+B3bOsaB{9hy5mp^?mB@3T?$@+s<#m+rp^`%0^L1gYi zlRL+#__ZrlRq1of7LHmFmCTK3rV1sPW>(^^Ju!d#4qr5<cZMxI-NWN#*?3)@!*O7S^~i`ci@?z+}-kE@Nx+OMTq_RzG33g z2!U8i!&%SKiv)$c9=CQ)9KdyK@l^@dvgD^M0ivBPPnU?F(bMOa{xp+=aq*Q6(+WK& zRGs~Fw3#6{K29Wx*z1gWVVBrTtVUL}@OOGawLB>39!NC}_CYLT@vXG+^_xsr>bj&O zA4ArVKP$=gC6TF#tT|ZwtkWLuQ-c{Ss;)*ritPC1Yw{0SQQPViDFNr1J3C2bn4_~=1b_Lm= zf^_4|(FTyhIIt^WGs7HPP9NL5Gb9qTP*q7yQ{%!YM+J#Q^xP6oq0m;n!;Q56X*0Q2 zJ)!rGuBAWY?6^zWh6%D$`%8XhRW`+9#6Ca|WHYv;06Xc#aR9r+)|dAbv8r9zts~2r zT_cP`{5DVyqKAS09YTcAV|!nEgoB+Y`k~U4rxF0KK<(U3m{<7LAWc zY8Tu;L1O0Uj8BiKwEq_<^s5FXqHkC!W@ zVPXF@hVsK>3b5N@KYyxtx!`}y_k=0(m;JIRI^Dah;SazBHLWZQW2@ivA?a5(Ma)n} zLkVC(SK@QD`@#XYSQMoqtU&6KtQB0P^j1B;SW5mMF*Ie^*N+vcO2UPm{}8a=36S@% z+f`@uWU-G9R*G~|A-6DZBw!tuwN;+S?f~VO-+`@I0;?n;wrI z>na`c6!!f0%c%Yb;QrX%<(sV5R|i8`fa7dap|2F?yzQr+a1b@~CC6xhiK^IIMZ-p6 z7)NT;H6|P6aQE_ZVYNA=8ve8wmt!MrtX{Bmd0Z$T48SIaQB5aBN)Uz2@Mc>Bvnql0ws;-b-W{W*V?8nX8PS?lefOcVpz#IsgsJVH;i zIv0Qs*5^fPUvF!<2_7=R_8P1Gis&bbYgj3)T05Dit+zC|h8wjLy*`%MDuXn@-D2)Z zycwjr%-L0RQ}s-C+*HJ)O`ekPz-Ibz;Wfi#^5qoPHIVkCzxm3xUP|~b7h_qM#_{G1&#*<(y9AP+a z79M&n(cqFx^x{4d5cqr(#cRDSstzmqx6G3&_LLCd#-zadnjksq*t6RH^W5(8@vAE; z5@M#FmczE!PSQuwWdus{_RNg71vu2$iN$y$+riChM1iY#SNZX~1RldOjt?(pne{NA ziyN8T$r|9ZneBf@7P4Jdejr4dzb)~JYSL%1FS+dB zOdY&DT}+o(N*AZXv|UaHMD+*xhESPM7>9cnT%X6{jBL&hqH*PyZ;@pj63t(VzGRHE zOoO%FAmBl}w1g^FVzu6mxT~#CgqMs~dcb^XZ;$gBaUe#X`t=Gi$R|11ww%9SGg4pL 
zcKr_d(sc*!8E_pTfNG7vC^)tmif&cEI8+UoLptmsz{SHtp^kRt&OH`$Mm%B<9JyTN zURL8yb`RI7PVg3{1IY3o!3)usgG{|trJziWx@m4l;{Eg*1Wi2is`CKwHw9PAYe(q6 zCEl=UDC*O04kcd@lN08TCDIYoK>zN$PHU@g8zgGew}B6AX{6{eWX;v3K@Rqp5CCPx9@K$YBte0~gpWp97wpeqc3hBB{|@|M`z^Y8Bh)YES1|CDx?aZN^j+*b@*K$Nb5gc75X5M-1= zh(UJ=(%m338bkz10U3fpjvOIfGD1X&jT%U|G$Wofyzl38zk1$0yc*ZG>pIujxz72Y zfBe3^t6GG5o(1|7x949!7mfIzYE*z#{X5fv1r?rw?P>AAZa7+43z}`DR-Tjsrf;A2QUX2w+1GS&C4VYA z6!{*%vT(sufv1J`vj2Wq#r^L~yt9h4D<(BfosPezmK z$q!p2w1!nrUtf>kk*eLHW~;Ujc&j{s%#$idXlP~`z39i`K60>w{3}Kg*Y7GwHlu|b zE{3^16M@>ypmD@yH{K{hG=3fKrUy$U6hdmh6I9wfw}IvT@sCY||AE_fWov{n@h#J? z!;a;{5;i#o!9D4NbL`3Y z&S3s$Bk9KzIq}@~kxp*=!Xn>^oye$g5^}&EVLa#-9t zMYCCzyDt?reH;Z1|4uPLT|vL9iXV8L$rj5M_OCUddWgFuD?M!3ltj~Tc-m(W#`?n7 z&c4@@nI~Pu|4v60oR0}$M*l9n4N<(74Om-y8Abbx+1+%tdq@2PHSV)(+NL@SW*v8u za}s=yH8qE8Rw3&ghB zf%t_@-If z+VrT3R(8CMP|A=Y<-8Uh_xiPrvvswD^U+g~Z80Dr>pHF6y4$e#t-tac)D3BR@XjZ~{6RE*ARycC@RrKywo$(0vJ zMX2+>E;Oi99J|w(2fo*tvW%u*xT7}>%oqY%oXKVSc8tankpOqXjSPp3OcT@6cYM&a zefT3vAjmAnYps6s?#ltE@==I%CMb1IrfhtoSk56tdjGcDpXQHNQH6(q)}fCh0hx?d zF+Fd+gWVl|Bh+{jXP+x6DSe>|)~EBE6#&i~f1Rsjmx z*79}#`i^YQbUKrXb+rOXL{rj?&(?6>qtM_Ds%iJba6JRY$=Gz#)qCWW-&==@3{#fI zS1JTe8|uH_iFT?$_n)ln?&I0pPZ>Rv7cbEvHqtBCy}+jPW^omrRp^G2;0^XuF~ z(h~(9-E^VL+y&!fj0WTj+^ZQ*EH+A z*I%=*U5`kn*PUht?gis?4GoR)Z9D}PmD=;Z_fhM{!!yJ$soJfH5#;n)_g~PDTLCA> z@QH^g?L!W$%^`p}Qx`~gW6u-NqK^Ee1&tcr52<_0#*f~Bm$8%_c(3SPediNQ6W6w{ zsVNjj1P?FO4EGe-QqlEy zKj+fhC0hPKM`qbw`o`vx&=h0GPnkr>OEi6GfgI|U{obPO@2}4`4_uU#aRY6wf_~(j zfK)CIv`?t}WdVeGp?c{O(8*rHnD)KmUQkwhUlfHcj(a@hxM0;DoG^^Hf^*n-kJmC! zBky$tA%p9@1XrQ>@pes%)vN4^GS=HaA=hXUNP7Gx9xE5c8CbTA(4*!&+Z`~aK9eBL zO`;zTHL(b;U|F)pTX!QKNv=0^(fhi`kP=C0j4*v9Rx&$7L2=f82g$oe_+gNa^@u}m zhM-^BCYXut`XKBwMcOXi6il~%Yr?P|HD}yzAMGts@vOj!?i0hT9r_7X%R9eKAuV+Q z$9T!-_!TvFcu$+)8$xcl0|#4CC_@X)z>Dc5t$doChORdM7t@Ic9->MdPr^6H24vHK)4ugTh-tdCYMX@okxF;PKf z-mYsXn3cB9dcy*n{=^tSMe;n1W0N zOhsy3gj@U){gwL2CPT-W&qY7Ew#Wo4)g2%Maj{dwBzldbHm{^)IZ{I| zHnZo&Qith25Vw8F#}1O4nx>GsESYctUZM4 zz8;NmKL00N5IAu2Pp=@`M{-LKF=a*^Sz=;uBTN#)Y%zTew_rKTbAu*ZJt9l@?uICp z7&z@6l4Zr)pMKlpUv`ezs!f^H`_x=b76!1okn7b}sy=Sms%Q{4aZ3^%@Fbi+oLrJe z`O_PzYP}7n@t+aqfQO0rqPx6GW>o)Ko%X?-@Za&BLl>PHOw5G)=_{(7k`K^ToGw5b zsFTWeew@U$g4LWolqva~=|uUb2V*AkZa31nlp6@+Hm)AeNkwS+_AYNZ5iIUanCiD0 zaNdmFZ(rnf6wqlTJ7Hh6ZkG&|L3+W}s_5RnY$6Cy&|~F3%lR)ndWQ<+If0T72!dw9 z`L2N{3Xt`zNCz~47%T#CHu?A9ppXZga9KT>Te&F6!a)OmwIoATk1ET=0?wSH9m4GA zNP6Ea38gsq;7DP&ef7!A3=k@;4gJrc`O9;bUP@y;Y^!EW481XNFMM023D`)%j{`ddTwH%UQ9LH&{X-$Dso0VK*UgM>Zi~cuiy|}2r!wXf0!8hERNND z)$Yz3DO1T!iYh*%j?I0i;dBNWpslT~13j^94gTj%?$;-JQGTGse#WKA*Ad~E^eqed#88s(UY zhjjCzLrC?z1&`^itrS178D81g2XzNVRWWJzI;czLaCukw)sD(ovdw!+pzlZ7I;%lB z!q_GT={&L#xi;H*NBBrh2F`>zcv@{)KiZr7A$gqrLS^j&6!t}2A6Lxf&I$v!O+hovD=joH9 z8jnINwcGACLiz?AXW0{+PBgV7CjS_oukQgbVK`Xtb$j~zJ6kN=1%$z+ufm050U~VC zDl8!|#N4hX{*7U^dTArCmOSqD_p*Db3Gx~_B)_R&-Q{}~WeB@Q@{Ibd-Qw*>N6)+y zOw03;;%h=mN=xSJpC*H(ImRU`v?Ol_T@|uD@sg(|HY%0_Fz? 
zA=4mfnV=)2EkG^4TOqgcQK4zB^BP~5v8P%3Aqa(HA=%Jj;VLaMafN(3%V0G%g_UGn zj)|4hA(Xay1Z0r#068?lG@=w<$CCw+<_PDz^>>br{by7rP|vQ*eq+%?pV&}jup>l2 z_%K33u9?IN#^Uo?HEG&K(ZwPe5v#>-DUU|VEt`YzHi?eHvZ8JMJBw32?>A~}_Lna^ zZL|wskR{Q+_K#SP_V0RvqP`0?_tTV;b@)W>%jN7;!%@UwagFNcaH2T$g$bQsJWZQ* z42=*GMa^=IVfdfxhO$xQcCWw+E*nAd3HXqjbajQ zI`9l~adB<7*1EZq4Um=e^y$Ps2AW5qsyAuF@`{^O#zyQ}-A?cSp1IyfDm)l%m#`Oqkj5uY**--CA=|#&o>AZub`o*L`i2ZK zfZUJt3BLncmV~CU!UbB_(oumvzi)x(luWCYXju7*;?c_yh>s~(?KYKF&y@FN8M~=U z3D8ZWH6Dv^X)7DLKwvEr14b*b?7(PkkdS!Sa~**a7XY+_yC12Q&go|v{iPJu=u@0} zGB}%&UoEnMe=8UQ!kODZ!t0`7%Mg9--#Ic@F%LF8$Sb+-`f$E1Ozb(pTwlw`zf>#& zs_1^A-pe0<%y4>o*RD2oQ0~K<1(p&yy{n#(Lz=e2`Wo1;x&K&Bz#Jjy2oWTd;ndcu z6Go7-f5uLqmO2o2=OYGMjFBOP<6oPffHmwt5=e4DZW3~t4RnFtEA8&15Eepz@q^Ej zBs@ZA05xVMuZzJP#AUp;9K_l>K--VHg$td5en~^Y0#R0g{d11D3HmxWHhy@&ho~o< zPMHjBqA$mo{UCuHl3sQ*vl6APj@8>8M|ZnIyW@cLAk2!xxp={wM3LYNQ&t3XFacps z0$CE4m1U-+BCyCoD0cJ!c#FZzr^BAKJh@16@=333SmYrHsi~={<)R zCiFYw7|1Ehw+pEQkO0kdt0SB9PJTfrFT??3z!-?XNeOoMj+SfHO?tU*)|N}yPpsze z;K>GV|H}NhKs-}4IdR{V}X`M?dZ#v_p;pm%gRF5*Ca75{WHby#CYmL`yKn*4PgHC%EkOpy5B zZwcU#At2VThZ0R3@arZ|SDXp>T}RyCm;-{m4G##8PKFd{+|#tw)L5_8+R1O) zAO|#(ykr{Z4{S6PvFn<4fT)Hv5ElL?<-fr{A-5~*!FDEtjmmQ${f~mwsra8}l@Yh* zM<5l#?QlDBfIy4v=v>zTl%2RT6p8^z8R$-oQMh~rvLXwIZY$&d6FZmmDwm8kE9R^t z&6S1Cy~k!i&@yP7oZ#VV6Wj$w&R{%T4h0bLPe#+iIoT)s2wz~>j8TGmO|1nZYJ`fA zNj;yl6H%H$*GQS`4C@-MCX1x~g8!c`6<*cNmaDKZzFBz{2G))5;)Et@^KSZTraW~DPKirG^mHbBE=cMYCr()85Tli|(B9i)yu$3`hvCz1 zK;kHu5VvjPZb{mJ%Wbx@k2nplABhn>hWQ!yX^N}zzZ<`d21VVag%#FV!Jg5(kp(inh1U2!`XB~0cz1im0q zSzn|3KzQjIjH-Da#2meLGIHTFI8-#J2mSm!!R7+8j$kfVR?ly$JbHB|ycZ*jbO)DA z{7-}!P+6>^EJCJo6LlKNv2Vn4V&-cz7R71GDC+E|>2lDe^2~9{UPs9j*=TZ@GkoL8vEhmqN_D38*@L;$Gpz}6Q}YkE_O_M1Lxq;#nV|VeQx1> z@-XuwnfO59Zi_w}+kIIYb7T;XCsUa0rz8;Nb(GhtK?TU{*!K%i`eX6!?;NKUO~n{H zHHyTT^299^gw~5mPH{M@KcZ%Us8{_fx5cln5> zwJFS%T>bss3*; zHdE$RStCTNqp%M*YdGNd&OZHToF1^;uo>tDV;-j{!Rv9Bp)(cyZj5<7>|G(+-QsUT zsYeMGDdfSuYVboA8@Z>Qzu{!qsVtO6CuHn*^QI$98Q1;=={I+95ix&1kZY3@$U_O< zrjgTcK1rnvuP{qZ;MzukvpVtd02$o#$LqGapT%+ivm%w4{ywJ;`*lWhM%@E#w|J8x zm6Bt3P*P&(U1#Bhy{L^o_n=}P$b0hURho00DX6pwUXZp;5`{3GeT{GvsbF8O+lg3k zAeVl8-&v)WKNiTj$%A}(!p!2$S#6Rb`X*H20i%f#VX)1d-P2*Dfb-gGR#`-WMm=Vt zVo}JOYNT#2;4!C6b8{$FjTg6?6J6Wz3veCG|GmyWND3;q(RkNoEweqxcE|5mTMMKdYd|&1K=(WTT)=O%isP$CjN*sYX<>P}A4gCA(ezid6_sBYKouq5#fHH{d`Aa6YjobuT3eTpKwb5|uA|P)E zfis)~PlpZ3M0-x8NmFh7!0l)F(iOi%(YN2k>4*}HP89YwN>-;*(r>&=@;x9%4 zocxA;q+DuR%{1mfcX-SXSQQ^hXkis<1P<^3NQjU@q8vsu-j90@w|*bPlol6Rsv1i| zsX}5L79bucw3|`S4ubfY?^TZ;J?{adr8-=TbOSXkc)*kE6uy1~&>#FRARe!!&CzZAD8v)`` zcyarb+j$(K`S~5F5dC!_<*It)mD)?#?Mnr|i8WJsiU%B|M&RLAHERkHM?O#H+3nHH zmN(}98z38Wty%F2{aQIeWC$b9wBZnVqD#_%&1FpKWFenBbIxCW#0| zAIInEZpEh!sE-s|>#RN1ZtS!TxE)M=15UE{zB#%)uFyV^S8-}u`OrD}IB0vMBW?ir z?!n5g%)4MfAw*A|_4%N1zP*yysqWE^!4yU%<@iA;d66&8s~qu&}4dh~D1b zyy!b(KX2!e%CB@KaDB1p7tmNq1Gpb>IQHnXokf3aYOhDg4gOq5OvO!uZ|T}Uz7Fbo zGP#?4G>0YO!mr^TAZA zjA>PGd)kcIq<-h?urk%%OEV0ut0In_mv#^P=toYFV(~KemEPqZn7VAx(ZE)I+|K5O zYYDrRKo1-p5#YZn@AV17f60P|k1w&SuKf^4MugRasOBS|H7xAfW zi^Oa7z)gH`WxhPOM_&svuVt^2cZhIs4wh88%qQR8z1l6YQ)RnXLT$WtC$ zk=LX2n@zh1P7_o^NUz1ht~=VEi6$WPaw(405TuCJA1^U~Ej*2*Df`(+ELIOAGpu=Jw-n6_)(toLrm6l>FL)gu{w{%$4jAfs%)b{%|To+QCs@#`a!tsof$u? 
z;lei+{&3?h*h=k+rM1_%c;Q6pSm{oHc_uFqL+r#FKA`1+FUT3kHD7{5=&K6J<9zQx zmSJJGxnqszUKux0do-Mr(C!KdmAScP8hV~nxX5X#{J8mc_@j@9`}B4|t}d&{`4N>h z&J`nHR%l9Wow_QEpu12Nv9N=*L^#Tx&pO88Hc@>UDshEXkln#Hwhkp&`iK}aJA32( zm9*7~Z?ex;(!yZZ z8OM6DiR`tHmb4bsi!UgO&0!Q7Nzh$6P`Qy7`+IQyT0@-xoW-c@o!(X^19oak&SKovSV(-bNw_}Dd z!!_MG!ycM_XVBniPy3>%Xot+@r zm^gL)lksp#u(J~t6)bYU@qNIYCPt;@qCj@e9-Tu_`n`&sfEC2NX!$$rf=ScPU2ez8-er4wx4I@!on&Jn%|8|rcpsZ>gPV>eg&d|VF3#tI%-y-z zBppOxS0F$%$d2vMLVslshP{fan$rP4=>(ta)?GZIc<95VOBzrqe_(YC(q z6fab;H~l;*cRd_TK!8tYE5}qSw{=_o^fwbeODC8g%&Jg09}n=$b2enLxRm_bVyms5 zY&Ntv*<8gcBqxzU{v!BAP`gWV8f~1l)Bz0)7fg>1SqoqK@jbtD>X{rhB2flCxFo9Xc)ngAW z8ce5PaioyT7s6M$>2x`(ebvW)20ibW&m{CozC8%!t16HKD}YSI2ROW}RQ~*}&1@!9 zmTVha8~1PEbh1-`DDN#ipjhMh*$-Srpltpp11{G80 z8Ww2hH-g9sD+mxWX39up^nhR!89}ZYZp$cF?c=hV74n1mVh%Hzp#ywBM8;8D2rJST(4LE7oHDTy`mrc}&&*CwwBGT2W57#`i0IFzb=+_WCf;%KL?lj?NBRDLO?2wsmM#;DgP6}M zV~5iT0kwoZ7kmSVTgfE&?kri9*~q_XS)Oi3JNXkF<5kWF_Pr!(X4xaoyWl3+{@uh0 zpbg)O)N8pu1fJ@5pSLNYw`#2Q*EVnwnS-+crD3#xLDfh=pIl&4;D2D2AZ$F4QD5F5 zQWBYE4N-RgPml^0srgD+<L#Xa-n{p23>CmC9X;`Be$JGr}(DcRm*Q zMS_DzuGC}O;*WUS0sxH_Z!rPxjQ@Lm+w<8vHs~p&^_`(lY;1jta`|sAKn^q1L+t5^ zATY4$gMmGE0#KK>C7Z@>B>lLrOag&ZBOGT$pV$%-5Ba!w)&`YNU|$CG0zkdQdnsKI6{lkDW>6&Y=`t7p04R{aDGAOI z3CyP^i3BlGw@(?WD=XN&4_iyoCof|ekNSjx$o)LQ+fBHXHXRckDx8!9aQS@Q7>82q z!uK!9OBgpSDn;avi8n$=uXerXIQt;a|8NBnAb#yJ7)QpZLGPYUd@X#kou3M<#N(d` zjaMGE)h9e4gtIFX9JhpHrhwB$I088&BRF5ds{vhtM;W||Y=9A5#NgHTb%J{uyh4-^ ze8PlRfnfyiE_k)T|NnC;ii-rSFhb#EMnN;*snlV#EP9p_TkSZy_vDi%x4UesSijuN z((IE+9lQClfUS&4mSarGv4JSJvNNG2Ntc~aJk&KIoYOy*?McvOnkWIpFx(6ifYnKt zC^Kf?&16a<#_-2E58H_A#Kxi=|fADBKGif*cOygwCf z!WF__8*QvElA50WRoz{&jp^HW$#Q`mZ_E>(+L-aC)j5ax*TI&>Dhth(hfB>exL%CY z^siCA_XTll_OkHRYZhCrmZI8;l_pTqy=PXfz8&ifCd_HO*aW*1C9_lRQzk}DDJ7I( z-xrr)co*R94M-XR4{5sfcr47BaXIa7DS4r>i)_?5%HFt8qjV$)$$qu1{naGbwWRl2 zanY(nJ1Ld!#cYL%UYg|!UJc?zp4Swo%Gk|inO02MPMf{DUIZt^87HnXB@cz;cH$0B zYF0`n1%Ayq$1kd$^1?j8k32wUbg`A3cQ6S*qA?s1^Bm=5@~c)a0v0j*LU7VPnh#bR za;#a|$0*z16Ob?uu%72^DwhhmOIzH>`&fIFtx#GFoffmZhL}z5zS;mPm5_IqCkKnp zp~tg>3SO;M=BtmW3dD>$U(>c6oXBI4`m&?)Sb2*eY`Son%u%gyLCL_5bzNyoQmcv!=b6BeKimk_ya-VfrVB#U;_kMmDYvjPh{f>yS^BL$N zuUaK1%C0yz3ocJymtaAEm_P-Zw`%d><%bnOr#xjjdGU+CRNcGCW(C*T$Pe#;>+-!Ur~M;zRE>|0e_X*yzKekT+@d1YuQ2#;5lh50m$I6tNu`u@Tm?>G{E zDw~ef2t7F2J$Ap~5bO}_D4%_DFvC0#V&e0~jV6(q%T>JsdVQ1LX3RP^I}GNAqLa>I z`EeU^9S-_<=mFfiw^*U;2VF({NREAU%DlenzJ@$9%y2*2iA>6I4Jwcl>jYlIJNKFp!REsD?3g}N%*Zn zr(AGk8G`{?849^O1|<*=_A(twe@)FaS*SofR8J;#=HjwX{(phye@1*Q^SL| zreEnTBT~I`%U9v}j*4JpaTsfc@xCLb@jiYK*UsV)gsJr!t11~%oq}ec-H92Z+Zl=x zJ<2?C?>IGX*=ol$vhDgGJnEe7>TEf}ZTQPX65OeBD2X6~{YwlKtwfcd4s# zKB<+jVr>*M3k(j{?@*9$z)Ct~s2Ug=qy_hw;pf?_1rv8oE!eER3c&!g{h^9;Ca=)f zV>_{_$jubPInP&1ul7_3q9u>LsDg>}MTV+n;9VJqy511cDIz)(4$8R$`rAS;?C75a zGfojX(EqMDkUuFivi&ldJTF#%8`~=lpKXaLZfw-4Zj4pP^m<|~znoWu-qq`zJHT68 z>o4cOqa>@K?d)CGWH!T5W*kUT<$h~2?Zix)zFrhRvW7jB&Hjv?!#0dM&l#e!*Zn8) zU-Qdfw?en7*PVx7_YQq;ai%qdwq-kGX9l5Ed%q(bOPwJ>cx*6l5k+CpX|d?-|=BRkp7XW>`CV!Tg(h$e0o;`Y9LV({S6C zrp5&KP0)tc+I`KmBIco)RySM&lC@DjIM9%72DS3aX&-OVc{j~``8+yo86$$J#l^LE zAIT6_Y1h!Dx>{;MGtS-~`W!Q&AKC1F@zn7_X;D(N9Vc$oLiQFvdgjXOO|y9128Z_A z1xsUl%h!42q}duNN2<(s4!2qZH|E2phX9SbDgc8#Thak#uSz5&Lp7ORhw%Ra0oR%F zZ{cqT%YLNf-7y0OMLswCFvdT<1iv~6U06McJedc&KT$w$M@6^s^uc3JWeL-fT8Uao zu|)N#AT{tkd8<`SrRG!TiF$pQF2zLxf}&!jzgks^JvbCzd~bGeQ-UV(2P*|I8Wxn) zr_%2)Li1Pn14j0s`^ql3&hTw?5l@5r$TzL!osdVCfDM^KSk zy5Lnt$P!c-LJhB(^GAomSIr2$#cn~Q+oD53#^b{6ph9=GDwJRm1$F@+-VKNLyRHCA pm|S{YBB7L3;QwJDsVNVfvbQGAd~tfsMtTnX-B;06E{2)C`ai_2a0mbZ literal 0 HcmV?d00001 diff --git a/docs/images/NDK.PNG b/docs/images/NDK.PNG new file mode 100644 index 
0000000000000000000000000000000000000000..22fd45df2b51b25521b010a52c1e2b4a66911d94
GIT binary patch
literal 14189
[binary data omitted]

literal 0
HcmV?d00001

diff --git a/docs/images/QuickStart.PNG b/docs/images/QuickStart.PNG
new file mode 100644
index 0000000000000000000000000000000000000000..5bd4bfc0923239efb541c12ac8ce25f510d119ae
GIT binary patch
literal 11037
[binary PNG data omitted]

literal 0
HcmV?d00001

diff --git a/docs/images/cmake.PNG b/docs/images/cmake.PNG
new file mode 100644
index 0000000000000000000000000000000000000000..079e603aa754b759da3fb2ce18d12720d7af31e2
GIT binary patch
literal 3959
[binary PNG data omitted]
zPV#uhp23OEApb>&Nj$y(!5dXvG2N!I4NQXG5?Qa-q!Mafx#{}kW@590oOvO*)28Fc zMc9SpCoEqQ;}_dVO|Vj43GTT=A7U>0f3vWxj0c~Ge|+8pka}&CzbPvyO?~c9ns?J~ zJm27Ju*!{J>wiN+D2{udIsP4`^kehZ%G+&dT>1P^W&e&FdcObLsE`=h;~`Bf`2z9U z!DvSB$dpV)V9qkA0S_0eXT)lL^(6o0_|{EVBfwmoW4rVH6Y@{m!-4Nkj}l)kg5>RM zrZ2YS_V}f%?{tkUnzY;dRD#!3`1^bGzB-+jUK?dUdYIQm(jUO$3ty~Ly5K`Qm!bv- z!oe*9ws}F@of`Gt<<-09AAe9vwtEzs1OE_3Y*SkyyXT|!V;XPt=^beK{+cu6ap|B8|{6`5Sng7h|dL3eC za{dYHl$b5S^b8P}wqn~w69bFn^vDR^_v&2YPw%4ejjsyWkd z>OI3A0QEu}pPSO25$fg{{?$VQ8u$bjV%_A0uN&R@ZFDuOCh2C{X?kFhl6G9n1U}B#E;#I9&Vl!J7zy zuE202PEI&NItE^BL4la%!c8>X~@ylXwO)d81eUp zoo}5)SaX$?YG;G;aUb`-_$lz*6dOJ=gC_&FZSGx0J#9%H!^Z*rwiJ&7G_3E`h2;i- zo>y_PjO8y_kYuAK#Z;Myf1%dv_@Hb1tCTul++_>t@0?vNkZ6UQ zsyBHI$)XC?MiGX}-vwN}{4wBUhZ;0gQJ3}=8M{V)iTzjZ6?taWfak(3v7*V6YeapH z)6AxXaNkX+p4sSW-eg`LCaYI(6jnX&-s+WGtZIAZtEVpojrXG%xcp+fvY*O*-_d?w z6{`fQdR8la7Y&QR-ftYr&0k3Lo?b`Wi zT_kaRm%m9Xvzl)n*L^kzh0ob9*fp;;?WPp@GPku#akV(N5$j8be;NEUK;=ES(c>DE zk2+s3GFP2Y-tGyS4+sx4C;Rd#=5d&4c8lQG6`0gg)yVB%R(UmLuOHZPdD(F%-Ou0hwezn>_<@|k zTw;j+tl!S}*Mle>zkBYuA;JoR<9y~2$LPo{|IbygfH3qX*17qge{f2inkUlXW6DHz@7SFZ?sHxy4wCDPI$bho>Hj6dT`-eY*fV?`0#zq+Q*ak>S+} zDcLoC+~{|CWD^vF^ZeuQV4|54=krb}F2l~Y!L%S^r6WGDMjQI-7st-@oEmnG_MK^&r(>f&dt>9wVebWX3 z?bB!t{)$HvTxla$MwjBU`DI&y_PLk-7^oo6gu03Gvfrl;K6{rsZ~RQY>LaQPlI`e~ zbj$}fx)Af?@3kzC}#TGNSJ7#{gYown#>rgRkqcg3{ZDRsV^i4qsZ zzkiepcTikg!wRy^_hN-EpOOv%4u-4#GTAJHO|uuV5r4NatNyX6cMoInc%zg7eq=P+ z2`fyz2^@NbzUAc7gHI|yFzu~&& z%!zTk*Xq9-G=g2t+00mqT*k`~v3y;I7CtE3KGVrG1R(C-O7N>oa6I7SfVg!wqUoL^GC24hwQPUHnlo>cncr| z)VyBw80X=CVJ&g!V$PkE8TCP!O~;gn^v>4xz6K45nBrq6I1Z4Z;*YFr5eh2MZ&ZgF%pYu4sD2JOE&%XQo@O?G*k00g3~B#sB~S literal 0 HcmV?d00001 diff --git a/gcl/include/context.h b/gcl/include/context.h deleted file mode 100644 index 91d3bdcb..00000000 --- a/gcl/include/context.h +++ /dev/null @@ -1,167 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-
-
-
-#ifndef _H_CONTEXT
-#define _H_CONTEXT
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-    /**
-     * @brief create an OpenCL Context on the given platform
-     *
-     * @param platform    input, the context will be created on this platform
-     * @param num_devices input, the context will be created on num_devices Devices
-     * @param devices     input, the devices contained in the created context
-     * @param context     output, returns the created context
-     *
-     * @return
-     *
-     */
-    inline EE create_context(Platform platform,
-        U32 num_devices, Device *devices,
-        Context *context) {
-        if(NULL == context) return NULL_POINTER;
-
-        I32 ret;
-        cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
-        *context = clCreateContext(properties, num_devices, devices, NULL, NULL, &ret);
-        map_cl_error_2_ee(ret);
-    }
-
-    /**
-     * @brief get context information
-     *
-     * @warning please free the memory allocated by this function
-     **/
-    inline EE get_context_info(Context context, cl_context_info info,
-        void** value, U32 *len) {
-        if(NULL == value) return NULL_POINTER;
-
-        size_t size;
-        I32 ret = clGetContextInfo(context, info, 0, NULL, &size);
-        if(CL_SUCCESS == ret) {
-            if(NULL != len) *len = size;
-            void* data = malloc(size);
-            if(NULL == data) return ALLOC_FAILED;
-            ret = clGetContextInfo(context, info, size, data, NULL);
-            if(CL_SUCCESS == ret) { *value = data; } else { free(data); }
-        }
-
-        map_cl_error_2_ee(ret);
-    }
-
-    inline EE retain_context(Context context) {
-        I32 ret = clRetainContext(context);
-        map_cl_error_2_ee(ret);
-    }
-
-    inline EE release_context(Context context) {
-        I32 ret = clReleaseContext(context);
-        map_cl_error_2_ee(ret);
-    }
-
-    inline EE create_command_queue_properties(Context context, Device device,
-        cl_queue_properties* properties, CommandQueue* queue) {
-        if(NULL == queue) return NULL_POINTER;
-        I32 ret;
-        *queue = clCreateCommandQueueWithProperties(context, device, properties, &ret);
-        map_cl_error_2_ee(ret);
-    }
-/*
-    inline EE create_command_queue(Context context, Device device,
-        cl_command_queue_properties properties, CommandQueue* queue) {
-        if(NULL == queue) return NULL_POINTER;
-        I32 ret;
-        *queue = clCreateCommandQueue(context, device, properties, &ret);
-        map_cl_error_2_ee(ret);
-    }
-*/
-    /**
-     * @brief get information about a command queue
-     *
-     * @warning please free the memory associated with value
-     *
-     **/
-    inline EE get_command_queue_info(CommandQueue queue,
-        cl_command_queue_info info,
-        void** value, size_t *len) {
-        if(NULL == value) return NULL_POINTER;
-
-        size_t size;
-        I32 ret = clGetCommandQueueInfo(queue, info, 0, NULL, &size);
-        if(CL_SUCCESS == ret) {
-            if(NULL != len) *len = size;
-            void* data = malloc(size);
-            if(NULL == data) return ALLOC_FAILED;
-            ret = clGetCommandQueueInfo(queue, info, size, data, NULL);
-            if(CL_SUCCESS == ret) { *value = data; } else { free(data); }
-        }
-
-        map_cl_error_2_ee(ret);
-    }
-
-    /**
-     * @brief get the context of a command queue
-     *
-     **/
-    inline EE command_queue_get_context(CommandQueue queue, Context *context) {
-        if(NULL == context) return NULL_POINTER;
-        I32 ret = clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT, sizeof(Context), context, NULL);
-        map_cl_error_2_ee(ret);
-    }
-
-    /**
-     * @brief get the device of a command queue
-     *
-     **/
-    inline EE command_queue_get_device(CommandQueue queue, Device *device) {
-        if(NULL == device) return NULL_POINTER;
-        I32 ret = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(Device), device, NULL);
-        map_cl_error_2_ee(ret);
-    }
-
-    inline EE retain_command_queue(CommandQueue queue) {
-        I32 ret = clRetainCommandQueue(queue);
-        map_cl_error_2_ee(ret);
-    }
-
-    inline EE release_command_queue(CommandQueue queue) {
-        I32 ret = clReleaseCommandQueue(queue);
-        map_cl_error_2_ee(ret);
-    }
-
-    /**
-     * @brief flush the command queue, issuing all queued commands to execute
-     **/
-    inline EE flush(CommandQueue queue) {
-        I32 ret = clFlush(queue);
-        map_cl_error_2_ee(ret);
-    }
-
-    /**
-     * @brief wait until all queued commands finish
-     **/
-    inline EE finish(CommandQueue queue) {
-        I32 ret = clFinish(queue);
-        map_cl_error_2_ee(ret);
-    }
-
-#ifdef __cplusplus
-}
-#endif
-#endif
diff --git a/gcl/include/event.h b/gcl/include/event.h
deleted file mode 100644
index b9931755..00000000
--- a/gcl/include/event.h
+++ /dev/null
@@ -1,120 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-
-#ifndef EVENT_H_
-#define EVENT_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-    /**
-     * @brief wait for events to complete
-     **/
-    inline EE wait_events(U32 num_events, const Event *event_list) {
-        I32 ret = clWaitForEvents(num_events, event_list);
-        map_cl_error_2_ee(ret);
-    }
-
-    /**
-     * @brief get information about an event
-     *
-     * @warning please free the memory associated with value
-     **/
-    inline EE get_event_info(cl_event event, cl_event_info info, void* *value, size_t *size) {
-        size_t len;
-        I32 ret = clGetEventInfo(event, info, 0, NULL, &len);
-        if(CL_SUCCESS == ret) {
-            if(NULL != size) *size = len;
-            void* data = malloc(len);
-            if(NULL == data) return ALLOC_FAILED;
-            ret = clGetEventInfo(event, info, len, data, NULL);
-            if(CL_SUCCESS == ret) { *value = data; } else { free(data); }
-        }
-
-        map_cl_error_2_ee(ret);
-    }
-
-    /**
-     * @brief increase the reference count of an event
-     **/
-    inline EE retain_event(Event event) {
-        I32 ret = clRetainEvent(event);
-        map_cl_error_2_ee(ret);
-    }
-
-    inline EE release_event(Event event) {
-        I32 ret = clReleaseEvent(event);
-        map_cl_error_2_ee(ret);
-    }
-
-    inline EE enqueue_barrier_wait_lists(CommandQueue queue,
-        U32 num_wait_events,
-        const Event *wait_events, Event *event) {
-        I32 ret = clEnqueueBarrierWithWaitList(queue, num_wait_events, wait_events, event);
-        map_cl_error_2_ee(ret);
-    }
-
-    inline EE event_counting_time(Event* event, double* t_queue, double* t_submit, double* t_start, double* t_end, double* t_execute) {
-        cl_ulong queued, submit, start, end;
-        CHECK_STATUS(wait_events(1, event));
-        I32 ret;
-        ret = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &queued, NULL);
-        if(ret) map_cl_error_2_ee(ret);
-        ret = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &submit, NULL);
-        if(ret) map_cl_error_2_ee(ret);
-        ret = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
-        if(ret) map_cl_error_2_ee(ret);
-        ret = clGetEventProfilingInfo(*event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
-        if(ret) map_cl_error_2_ee(ret);
-
-        double t0, t1, t2, t3, t4;
-        t0 = (double)(queued) * 1e-03;
-        t1 = (double)(submit) * 1e-03;
-        t2 = (double)(start) * 1e-03;
-        t3 = (double)(end) * 1e-03;
-        t4 = ((double)(end) - (double)(start)) * 1e-03;
-
-        if(t_queue) *t_queue = t0;
-        if(t_submit) *t_submit = t1;
-        if(t_start) *t_start = t2;
-        if(t_end) *t_end = t3;
-        if(t_execute) *t_execute = t4;
-        return SUCCESS;
-    }
-
-    /**
-     * @brief get profiling information
-     **/
-    inline EE event_get_profiling_info(Event event, cl_profiling_info info,
-        void* *value, size_t *size) {
-        size_t len;
-        I32 ret = clGetEventProfilingInfo(event, info, 0, NULL, &len);
-        if(CL_SUCCESS == ret) {
-            if(NULL != size) *size = len;
-            void* data = malloc(len);
-            if(NULL == data) return ALLOC_FAILED;
-            ret = clGetEventProfilingInfo(event, info, len, data, NULL);
-            if(CL_SUCCESS == ret) { *value = data; } else { free(data); }
-        }
-
-        map_cl_error_2_ee(ret);
-    }
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/gcl/include/gcl_common.h b/gcl/include/gcl_common.h
deleted file mode 100644
index d95035ad..00000000
--- a/gcl/include/gcl_common.h
+++ /dev/null
@@ -1,257 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-
-
-#ifndef H_GCL_COMMON
-#define H_GCL_COMMON
-#define CL_TARGET_OPENCL_VERSION 200
-
-#include "type.h"
-#include "error.h"
-#include "tensor_desc.h"
-#include "CL/cl.h"
-#include <iostream>
-#include <string>
-#include <map>
-#include <unordered_map>
-#include <vector>
-/**
- * @file
- */
-#define ERROR_CASE(x) case x: return(#x)
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-    typedef cl_platform_id Platform;
-    typedef cl_device_id Device;
-    typedef cl_context Context;
-    typedef cl_command_queue CommandQueue;
-    typedef cl_program Program;
-    typedef cl_mem Mem;
-    typedef cl_sampler Sampler;
-    typedef cl_kernel Kernel;
-    typedef cl_event Event;
-    typedef cl_mem_flags MemFlags;
-    typedef cl_image_format ImgFormat;
-
-    inline CI8* map_cl_error_2_string(cl_int err) {
-        switch(err)
-        {
-            ERROR_CASE(CL_SUCCESS);
-            ERROR_CASE(CL_DEVICE_NOT_FOUND);
-            ERROR_CASE(CL_DEVICE_NOT_AVAILABLE);
-            ERROR_CASE(CL_COMPILER_NOT_AVAILABLE);
-            ERROR_CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE);
-            ERROR_CASE(CL_OUT_OF_RESOURCES);
-            ERROR_CASE(CL_OUT_OF_HOST_MEMORY);
-            ERROR_CASE(CL_PROFILING_INFO_NOT_AVAILABLE);
-            ERROR_CASE(CL_MEM_COPY_OVERLAP);
-            ERROR_CASE(CL_IMAGE_FORMAT_MISMATCH);
-            ERROR_CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED);
-            ERROR_CASE(CL_BUILD_PROGRAM_FAILURE);
-            ERROR_CASE(CL_MAP_FAILURE);
-#ifdef CL_VERSION_1_1
-            ERROR_CASE(CL_MISALIGNED_SUB_BUFFER_OFFSET);
-            ERROR_CASE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
-#endif
-#ifdef CL_VERSION_1_2
-            ERROR_CASE(CL_COMPILE_PROGRAM_FAILURE);
-            ERROR_CASE(CL_LINKER_NOT_AVAILABLE);
-            ERROR_CASE(CL_LINK_PROGRAM_FAILURE);
-            ERROR_CASE(CL_DEVICE_PARTITION_FAILED);
-            ERROR_CASE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE);
-#endif
-            ERROR_CASE(CL_INVALID_VALUE);
-            ERROR_CASE(CL_INVALID_DEVICE_TYPE);
-            ERROR_CASE(CL_INVALID_PLATFORM);
-            ERROR_CASE(CL_INVALID_DEVICE);
-            ERROR_CASE(CL_INVALID_CONTEXT);
-            ERROR_CASE(CL_INVALID_QUEUE_PROPERTIES);
-            ERROR_CASE(CL_INVALID_COMMAND_QUEUE);
-            ERROR_CASE(CL_INVALID_HOST_PTR);
-            ERROR_CASE(CL_INVALID_MEM_OBJECT);
-            ERROR_CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
-            ERROR_CASE(CL_INVALID_IMAGE_SIZE);
-            ERROR_CASE(CL_INVALID_SAMPLER);
-            ERROR_CASE(CL_INVALID_BINARY);
-            ERROR_CASE(CL_INVALID_BUILD_OPTIONS);
-            ERROR_CASE(CL_INVALID_PROGRAM);
-            ERROR_CASE(CL_INVALID_PROGRAM_EXECUTABLE);
-            ERROR_CASE(CL_INVALID_KERNEL_NAME);
-            ERROR_CASE(CL_INVALID_KERNEL_DEFINITION);
-            ERROR_CASE(CL_INVALID_KERNEL);
-            ERROR_CASE(CL_INVALID_ARG_INDEX);
-            ERROR_CASE(CL_INVALID_ARG_VALUE);
-            ERROR_CASE(CL_INVALID_ARG_SIZE);
-            ERROR_CASE(CL_INVALID_KERNEL_ARGS);
-            ERROR_CASE(CL_INVALID_WORK_DIMENSION);
-            ERROR_CASE(CL_INVALID_WORK_GROUP_SIZE);
-            ERROR_CASE(CL_INVALID_WORK_ITEM_SIZE);
-            ERROR_CASE(CL_INVALID_GLOBAL_OFFSET);
-            ERROR_CASE(CL_INVALID_EVENT_WAIT_LIST);
-            ERROR_CASE(CL_INVALID_EVENT);
-            ERROR_CASE(CL_INVALID_OPERATION);
-            ERROR_CASE(CL_INVALID_GL_OBJECT);
-            ERROR_CASE(CL_INVALID_BUFFER_SIZE);
-            ERROR_CASE(CL_INVALID_MIP_LEVEL);
-            ERROR_CASE(CL_INVALID_GLOBAL_WORK_SIZE);
-#ifdef CL_VERSION_1_1
-            ERROR_CASE(CL_INVALID_PROPERTY);
-#endif
-#ifdef CL_VERSION_1_2
-            ERROR_CASE(CL_INVALID_IMAGE_DESCRIPTOR);
-            ERROR_CASE(CL_INVALID_COMPILER_OPTIONS);
-            ERROR_CASE(CL_INVALID_LINKER_OPTIONS);
-            ERROR_CASE(CL_INVALID_DEVICE_PARTITION_COUNT);
-#endif
-#ifdef CL_VERSION_2_0
-            ERROR_CASE(CL_INVALID_PIPE_SIZE);
-            ERROR_CASE(CL_INVALID_DEVICE_QUEUE);
-#endif
-#ifdef CL_VERSION_2_2
-            ERROR_CASE(CL_INVALID_SPEC_ID);
-            ERROR_CASE(CL_MAX_SIZE_RESTRICTION_EXCEEDED);
-#endif
-
-            default:
-                return "CL_UNKNOWN_ERROR";
-        }
-    }
-
-#define map_cl_error_2_ee(err)\
-    {\
-        if(err == 0) return SUCCESS;\
-        std::cout << "GCLAPI error in: " << std::endl;\
-        std::cout << "File: " << __FILE__ << std::endl;\
-        std::cout << "Line: " << __LINE__ << std::endl;\
-        std::cout << "Func name is: " << __func__ << std::endl;\
-        std::cout << "GCLERROR = " << map_cl_error_2_string(err) << std::endl;\
-        return GCL_ERROR;\
-    }
-
-    inline EE has_dedicated_local(Device device, I32 *b) {
-        cl_device_local_mem_type type;
-        I32 ret = clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(type), &type, nullptr);
-        if(CL_SUCCESS == ret) *b = (type == CL_LOCAL);
-        map_cl_error_2_ee(ret);
-    }
-
-/**
- * @brief enum defines
- **/
-typedef enum{
-    GCL_MEM_BUF    = 0,
-    GCL_MEM_IMG_1D = 1,
-    GCL_MEM_IMG_2D = 2,
-    GCL_MEM_IMG_3D = 3
-}GCLMemType;
-
-typedef enum{
-    HOST_TO_DEVICE_BUF = 0,
-    HOST_TO_DEVICE_IMG = 1,
-    DEVICE_BUF_TO_HOST = 2,
-    DEVICE_IMG_TO_HOST = 3,
-    DEVICE_BUF_TO_BUF  = 4,
-    DEVICE_BUF_TO_IMG  = 5,
-    DEVICE_IMG_TO_BUF  = 6,
-    DEVICE_IMG_TO_IMG  = 7
-}GCLMemTransType;
-
-/**
- * @brief struct defines
- **/
-struct GCLKernelInfo{
-    Kernel kernel = NULL;
-    U32 dim = 0;
-    U32 gs[3] = {0};
-    U32 ls[3] = {0};
-    std::string name;
-};
-
-struct GCLKernelBin{
-    CU8* data;
-    CU32 len;
-};
-
-struct GCLHandle{
-    Platform* platforms;
-    U32 numPlatform;
-    U32 platformId;
-
-    Device* devices;
-    U32 numDevice;
-    U32 deviceId;
-    cl_device_type deviceType;
-
-    Context context;
-    CommandQueue queue;
-    CommandQueue queue_profiling;
-
-    cl_command_queue_properties queueProperties;
-    Event eventObj;
-    Event* eventPtr;
-    U32 numWaitEvents;
-    Event* waitEvents;
-    double t_execute;
-    double t_total;
-
-    std::string deviceBinmapName;
-    std::unordered_map<std::string, GCLKernelBin>* binMapPtr;
-    std::map<std::string, Kernel> kernelMap;
-    std::vector<GCLKernelInfo> kernelVec;
-    std::string curOpName;
-};
-
-typedef struct GCLHandle* GCLHandle_t;
-
-struct GCLHandleConfig{
-    CI8* deviceBinmapName;
-};
-
-typedef GCLHandleConfig* GCLHandleConfig_t;
-
-struct GCLMemDesc{
-    U32 stride[3];
-    U32 offset[3];
-    GCLMemType memType;
-    DataFormat memFormat;
-    U32 byteSize;
-    U32 num;
-    MemFlags flags;
-    ImgFormat imgFormat;
-    void* host_ptr;
-    U8* map_ptr;
-    bool use_map;
-    bool has_alloc;
-};
-typedef struct GCLMemDesc* GCLMemDesc_t;
-
-struct GCLMem{
-    Mem mem;
-    GCLMemDesc desc;
-    std::vector<Mem> subMem;
-    std::vector<U8*> mapPtrArray;
-};
-typedef struct GCLMem* GCLMem_t;
-
-
-
-
-#ifdef __cplusplus
-}
-#endif
-#endif
diff --git a/gcl/include/gcl_func.h b/gcl/include/gcl_func.h
deleted file mode 100644
index 90e9f579..00000000
--- a/gcl/include/gcl_func.h
+++ /dev/null
@@ -1,1232 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-
-#ifndef H_GCL_FUNC
-#define H_GCL_FUNC
-
-#include <iostream>
-#include "gcl_common.h"
-#include "platform.h"
-#include "context.h"
-#include "program.h"
-#include "memory.h"
-#include "kernel.h"
-#include "event.h"
-#include "gcl_kernel_binmap.h"
-#include <cfloat>
-#ifdef __cplusplus
-extern "C" {
-#endif
-    inline EE gcl_get_device_name(GCLHandle_t handle) {
-        cl_device_id device = handle->devices[handle->deviceId];
-        U32 len;
-        I8* data;
-        CHECK_STATUS(get_device_info(device, CL_DEVICE_NAME, (void**)&data, &len));
-        I8 devName[64];
-        for(U32 i = 0; i < len - 1; i++) {
-            if(data[i] == '-') {
-                data[i] = '_';
-            }
-            devName[i] = data[i];
-        }
-        U32 version_len;
-        free(data);
-        CHECK_STATUS(get_device_info(device, CL_DEVICE_VERSION, (void**)&data, &version_len));
-        std::string deviceV = std::string(data);
-        U32 be = deviceV.find("r");
-        U32 end = deviceV.find("p", be + 1);
-        std::string numV = deviceV.substr(be + 1, end - be - 1);
-        U32 i = atoi(numV.c_str());
-        if(i >= 14) {
-            devName[len - 1] = 'p';
-            devName[len] = '\0';
-        } else {
-            devName[len - 1] = '\0';
-        }
-        free(data);
-        handle->deviceBinmapName = devName;
-        return SUCCESS;
-    }
-
-    inline EE gcl_create_handle(GCLHandle_t* handlePtr) {
-
-        if(handlePtr == NULL) {
-            printf("the handlePtr set to gcl_create_handle is NULL\n");
-            return NULL_POINTER;
-        }
-        GCLHandle_t handle = new GCLHandle();
-        handle->platformId = 0;
-        handle->deviceId = 0;
-        handle->deviceType = CL_DEVICE_TYPE_GPU;
-        handle->eventPtr = NULL;
-        handle->numWaitEvents = 0;
-        handle->waitEvents = NULL;
-        handle->t_execute = 0;
-        handle->t_total = 0;
-        handle->curOpName = "unknown";
-        U32 platformId = handle->platformId;
-        U32 deviceId = handle->deviceId;
-        CHECK_STATUS(get_platforms(&handle->numPlatform, &handle->platforms));
-        CHECK_STATUS(platform_get_devices(handle->platforms[platformId],
-            handle->deviceType,
-            &handle->numDevice,
-            &handle->devices));
-        CHECK_STATUS(create_context(handle->platforms[platformId],
-            handle->numDevice,
-            handle->devices,
-            &handle->context));
-        cl_queue_properties props[] = {CL_QUEUE_PROPERTIES, 0, 0};
-#ifdef _DEBUG
-        handle->queueProperties = CL_QUEUE_PROFILING_ENABLE;
-        handle->eventPtr = &handle->eventObj;
-        props[1] = props[1] | CL_QUEUE_PROFILING_ENABLE;
-#endif
-        CHECK_STATUS(create_command_queue_properties(handle->context,
-            handle->devices[deviceId],
-            props,
-            &handle->queue));
-        CHECK_STATUS(gcl_get_device_name(handle));
-        *handlePtr = handle;
-        return SUCCESS;
-    }
-
-    inline EE gcl_create_handle_profiling(GCLHandle_t* handlePtr) {
-
-        if(handlePtr == NULL) {
-            printf("the handlePtr set to gcl_create_handle_profiling is NULL\n");
-            return NULL_POINTER;
-        }
-        GCLHandle_t handle = new GCLHandle();
-        handle->platformId = 0;
-        handle->deviceId = 0;
-        handle->deviceType = CL_DEVICE_TYPE_GPU;
-        handle->eventPtr = NULL;
-        handle->numWaitEvents = 0;
-        handle->t_execute = 0;
-        handle->t_total = 0;
-        handle->curOpName = "unknown";
-        U32 platformId = handle->platformId;
-        U32 deviceId = handle->deviceId;
-        CHECK_STATUS(get_platforms(&handle->numPlatform, &handle->platforms));
-        CHECK_STATUS(platform_get_devices(handle->platforms[platformId],
-            handle->deviceType,
-            &handle->numDevice,
-            &handle->devices));
-        CHECK_STATUS(create_context(handle->platforms[platformId],
-            handle->numDevice,
-            handle->devices,
-            &handle->context));
-        cl_queue_properties props[] = {CL_QUEUE_PROPERTIES, 0, 0};
-        handle->queueProperties = CL_QUEUE_PROFILING_ENABLE;
-        handle->eventPtr = &handle->eventObj;
-        props[1] = props[1] | CL_QUEUE_PROFILING_ENABLE;
-        CHECK_STATUS(create_command_queue_properties(handle->context,
-            handle->devices[deviceId],
-            props,
-            &handle->queue));
-        CHECK_STATUS(gcl_get_device_name(handle));
-        *handlePtr = handle;
-        return SUCCESS;
-    }
-
-    inline void gcl_destroy_handle(GCLHandle_t handle) {
-        U32 deviceId = handle->deviceId;
-        CHECK_STATUS(finish(handle->queue));
-        for(auto k : handle->kernelMap) CHECK_STATUS(release_kernel(k.second));
-        for(auto k : handle->kernelVec) CHECK_STATUS(release_kernel(k.kernel));
-        handle->kernelMap.clear();
-        handle->kernelVec.clear();
-        CHECK_STATUS(release_command_queue(handle->queue));
-        CHECK_STATUS(release_context(handle->context));
-        CHECK_STATUS(release_device(handle->devices[deviceId]));
-        free(handle->devices);
-        free(handle->platforms);
-        delete handle;
-    }
-
-    inline EE gcl_create_queue_profiling(GCLHandle_t handle) {
-
-        cl_queue_properties props[] = {CL_QUEUE_PROPERTIES, 0, 0};
-        handle->eventPtr = &handle->eventObj;
-        props[1] = props[1] | CL_QUEUE_PROFILING_ENABLE;
-        CHECK_STATUS(create_command_queue_properties(handle->context,
-            handle->devices[handle->deviceId],
-            props,
-            &handle->queue_profiling));
-        return SUCCESS;
-    }
-
-    inline EE gcl_destroy_queue_profiling(GCLHandle_t handle) {
-        CHECK_STATUS(finish(handle->queue_profiling));
-        CHECK_STATUS(release_command_queue(handle->queue_profiling));
-        handle->eventPtr = NULL;
-        return SUCCESS;
-    }
-
-    inline EE gcl_regist_binMap(GCLHandle_t handle) {
-        gcl_kernel_binmap_factory::instance()->create_gcl_kernel_binmap(handle->deviceBinmapName);
-        gcl_kernel_binmap* kernel_binmap;
-        U32 ret = gcl_kernel_binmap_container::instance()->get(handle->deviceBinmapName, &kernel_binmap);
-        if(ret == NULL_POINTER) {
-            DEBUG_info("warning: the kernel binmap is not found");
-        } else {
-            handle->binMapPtr = &kernel_binmap->binMap();
-        }
-        return SUCCESS;
-    }
-
-    inline GCLMemDesc gcl_mem_desc(U32 stride[], U32 offset[], DataType dt, DataFormat memFormat) {
-        GCLMemDesc desc;
-        U32 s0, s1, s2;
-        s0 = stride[0];
-        s1 = stride[1];
-        s2 = stride[2];
-        desc.stride[0] = s0;
-        desc.stride[1] = s1;
-        desc.stride[2] = s2;
-        desc.offset[0] = offset[0];
-        desc.offset[1] = offset[1];
-        desc.offset[2] = offset[2];
-        desc.memFormat = memFormat;
-        desc.memType = GCL_MEM_BUF;
-        desc.num = s0 * s1 * s2;
-        desc.byteSize = s0 * s1 * s2 * bytesOf(dt);
-        desc.flags = CL_MEM_READ_WRITE;
-        desc.host_ptr = NULL;
-        desc.imgFormat.image_channel_order = CL_RGBA;
-        desc.imgFormat.image_channel_data_type = CL_HALF_FLOAT;
-        desc.use_map = false;
-        desc.map_ptr = NULL;
-        desc.has_alloc = false;
-        return desc;
-    }
-
-
-    inline GCLMem_t gcl_create_gclmem() {
-        GCLMem_t ret = new GCLMem;
-        ret->mem = NULL;
-        U32 str[3] = {0, 0, 0};
-        U32 off[3] = {0, 0, 0};
-        ret->desc = gcl_mem_desc(str, off, DT_U8, DF_NCWHC4);
-        return ret;
-    }
-
-    inline EE gcl_release_memory(GCLMem_t gclMem) {
-        if(gclMem->mem) {
-            if(gclMem->subMem.size()) {
-                for(auto p : gclMem->subMem) CHECK_STATUS(release_memory(p));
-                gclMem->subMem.clear();
-            }
-            CHECK_STATUS(release_memory(gclMem->mem));
-            gclMem->mem = NULL;
-            gclMem->desc.has_alloc = false;
-        }
-        return SUCCESS;
-    }
-
-    inline void gcl_destroy_gclmem(GCLMem_t mem) {
-        CHECK_STATUS(gcl_release_memory(mem));
-        delete mem;
-    }
-
-    inline EE gcl_finish(GCLHandle_t handle) {
-        CHECK_STATUS(finish(handle->queue));
-        return SUCCESS;
-    }
-
-
-    inline EE gcl_unmap_memory(GCLHandle_t handle, GCLMem_t gclMem)
-    {
-        for(auto p : gclMem->mapPtrArray) {
-            CHECK_STATUS(enqueue_unmap_memory(handle->queue, gclMem->mem, (void*)p,
-                handle->numWaitEvents, handle->waitEvents, handle->eventPtr));
-#ifdef _DEBUG
-            DEBUG_info_s("DATAUNMAP>>> enqueue_unmap_memory runInfo:");
-            double executeTime = 0;
-            CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &executeTime));
-            CHECK_STATUS(release_event(handle->eventObj));
-            DEBUG_info("executeTime = " << executeTime << " us");
-            CHECK_STATUS(gcl_finish(handle));
-#endif
-        }
-        gclMem->mapPtrArray.clear();
-        return SUCCESS;
-
-    }
-
-    inline EE gcl_produce_program_kernel_with_source(GCLHandle_t handle,
-        U32* len,
-        CI8* src,
-        CI8* option,
-        Program* program,
-        U32 numKernel,
-        Kernel* kernels) {
-        U32 deviceId = handle->deviceId;
-        CHECK_STATUS(create_build_program_from_source(handle->context, len, src, handle->devices[deviceId], option, program));
-        CHECK_STATUS(create_kernels_in_program(*program, numKernel, kernels));
-        return SUCCESS;
-    }
-
-    inline EE gcl_get_program_info(Program program,
-        U8** binary,
-        U32* len) {
-        CHECK_STATUS(get_program_binary(program, binary, len));
-        return SUCCESS;
-    }
-
-    inline EE gcl_kernelmap_put(GCLHandle_t handle,
-        std::string kernelName,
-        Kernel kernel) {
-        handle->kernelMap.insert(std::pair<std::string, Kernel>(kernelName, kernel));
-        return SUCCESS;
-    }
-
-    inline Kernel gcl_kernelmap_get(GCLHandle_t handle,
-        std::string kernelName) {
-        auto it = handle->kernelMap.find(std::string(kernelName));
-        if(it == handle->kernelMap.end()) CHECK_STATUS(NOT_MATCH);
-        return it->second;
-    }
-
-    inline EE gcl_create_kernel_binary(GCLHandle_t handle,
-        CI8* kernelName,
-        Kernel* kernel) {
-
-        std::string binmapname = handle->deviceBinmapName;
-        std::string binmap_kernelname = binmapname + "_" + std::string(kernelName);
-        auto binMapPtr = handle->binMapPtr;
-        auto it = binMapPtr->find(binmap_kernelname);
-        if(it == binMapPtr->end()) {
-            DEBUG_info("get kernel " << kernelName << " failed");
-            return NULL_POINTER;
-        }
-
-        U32 length = it->second.len;
-        CU8* data = it->second.data;
-        I32 binsta;
-        Program program;
-        CI8* options = "";
-        Device device = handle->devices[handle->deviceId];
-        CHECK_STATUS(create_program_from_binary(handle->context, device, &length, &data, &binsta, &program));
-        CHECK_STATUS(build_program(program, device, options));
-        CHECK_STATUS(create_kernel(program, kernelName, kernel));
-        CHECK_STATUS(release_program(program));
-        return SUCCESS;
-    }
-
-    inline EE gcl_get_kernel_from_map(GCLHandle_t handle,
-        CI8* kernelName,
kernelName,
-                                    Kernel* kernel) {
-    std::string binmapname = handle->deviceBinmapName;
-    std::string binmap_kernelname = binmapname + "_" + std::string(kernelName);
-    if(handle->kernelMap.find(binmap_kernelname) == handle->kernelMap.end()) {
-        CHECK_STATUS(gcl_create_kernel_binary(handle, kernelName, kernel));
-        CHECK_STATUS(gcl_kernelmap_put(handle, binmap_kernelname, *kernel));
-    } else {
-        *kernel = gcl_kernelmap_get(handle, binmap_kernelname);
-    }
-    return SUCCESS;
- }
-
-
- inline EE gcl_set_kernelVec(GCLHandle_t handle,
-                             Kernel kernel,
-                             U32 work_dim,
-                             U32 global_work_size[],
-                             U32 local_work_size[],
-                             CI8* kernelName = NULL) {
-    GCLKernelInfo kernelInfo;
-    kernelInfo.kernel = kernel;
-    kernelInfo.dim = work_dim;
-    kernelInfo.name = handle->curOpName + "_" + std::string(kernelName);
-    switch(work_dim) {
-        case 1: {
-            kernelInfo.gs[0] = global_work_size[0];
-            kernelInfo.gs[1] = 1;
-            kernelInfo.gs[2] = 1;
-            kernelInfo.ls[0] = local_work_size[0];
-            kernelInfo.ls[1] = 0;
-            kernelInfo.ls[2] = 0;
-            break;}
-        case 2: {
-            kernelInfo.gs[0] = global_work_size[0];
-            kernelInfo.gs[1] = global_work_size[1];
-            kernelInfo.gs[2] = 1;
-            kernelInfo.ls[0] = local_work_size[0];
-            kernelInfo.ls[1] = local_work_size[1];
-            kernelInfo.ls[2] = 0;
-            break;}
-        case 3: {
-            kernelInfo.gs[0] = global_work_size[0];
-            kernelInfo.gs[1] = global_work_size[1];
-            kernelInfo.gs[2] = global_work_size[2];
-            kernelInfo.ls[0] = local_work_size[0];
-            kernelInfo.ls[1] = local_work_size[1];
-            kernelInfo.ls[2] = local_work_size[2];
-            break;}
-        default:
-            return NOT_SUPPORTED;
-    }
-    handle->kernelVec.push_back(kernelInfo);
-    return SUCCESS;
- }
-
- inline EE gcl_run_kernelVec(GCLHandle_t handle) {
-    U32 len = handle->kernelVec.size();
-    CommandQueue queue = handle->queue;
-    U32 numWaitEvents = handle->numWaitEvents;
-    Event* waitEvents = handle->waitEvents;
-    Event* eventPtr = handle->eventPtr;
-    for(U32 i = 0; i < len; ++i) {
-        auto kernelInfo = handle->kernelVec[i];
-        CHECK_STATUS(enqueue_ndrange_kernel(queue, kernelInfo.kernel, kernelInfo.dim, NULL,
-                                            kernelInfo.gs, kernelInfo.ls, numWaitEvents, waitEvents, eventPtr));
-#ifdef _DEBUG
-        DEBUG_info_s("KERNEL>>> " << kernelInfo.name << " runInfo:");
-        double executeTime = 0;
-        CHECK_STATUS(event_counting_time(eventPtr, NULL, NULL, NULL, NULL, &executeTime));
-        CHECK_STATUS(release_event(*eventPtr));
-        handle->t_execute = executeTime;
-        DEBUG_info("executeTime = " << executeTime << " us");
-        CHECK_STATUS(gcl_finish(handle));
-#endif
-    }
-    return SUCCESS;
- }
-
- inline EE gcl_run_kernelVec_timing(GCLHandle_t handle, U32 be, U32 end, std::vector<double>* kernelArrayTime = NULL) {
-    if(handle->queueProperties & CL_QUEUE_PROFILING_ENABLE) {
-        double executeTime = 0;
-        double totalTime = 0;
-        CommandQueue queue = handle->queue;
-        U32 numWaitEvents = handle->numWaitEvents;
-        Event* waitEvents = handle->waitEvents;
-        Event* eventPtr = handle->eventPtr;
-        for(U32 i = be; i < end; ++i) {
-            auto kernelInfo = handle->kernelVec[i];
-            CHECK_STATUS(enqueue_ndrange_kernel(queue, kernelInfo.kernel, kernelInfo.dim, NULL,
-                                                kernelInfo.gs, kernelInfo.ls, numWaitEvents, waitEvents, eventPtr));
-            CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &executeTime));
-            CHECK_STATUS(release_event(handle->eventObj));
-            totalTime += executeTime;
-            if(kernelArrayTime) (*kernelArrayTime).push_back(executeTime);
-        }
-        handle->t_execute = totalTime;
-        return SUCCESS;
-    }
-    return NOT_SUPPORTED;
- }
-
- inline EE gcl_clean_kernelVec(GCLHandle_t handle) {
-    for(auto k : handle->kernelVec)
CHECK_STATUS(release_kernel(k.kernel)); - handle->kernelVec.clear(); - return SUCCESS; - } - - inline EE gcl_run_kernel(GCLHandle_t handle, Kernel kernel, U32 work_dim, U32* gs, U32* ls, CI8* kernelName = NULL) { -#ifdef _DEBUG - std::string name = "unknown kernel"; - if(kernelName) name = handle->curOpName + "_" + std::string(kernelName); - DEBUG_info_s("KERNEL>>> " << name.c_str() << " runInfo:"); -#endif - CHECK_STATUS(enqueue_ndrange_kernel(handle->queue, kernel, work_dim, - NULL, gs, ls, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); -#ifdef _DEBUG - double executeTime = 0; - CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &executeTime)); - CHECK_STATUS(release_event(handle->eventObj)); - handle->t_execute = executeTime; - DEBUG_info("executeTime = " << executeTime << " us"); - CHECK_STATUS(gcl_finish(handle)); -#else - UNUSED(kernelName); -#endif - return SUCCESS; - } - - inline U32 get_next_ls_size(U32 ls_size) { - return (ls_size << 1); - } - inline EE gcl_run_kernel_select_ls(GCLHandle_t handle, GCLKernelInfo* kernelInfo) { - auto kernel = kernelInfo->kernel; - auto work_dim = kernelInfo->dim; - auto gs = kernelInfo->gs; - double minTime = DBL_MAX; - double time; - U32 test_ls[3]; - U32 best_ls[3]; - U32 test_gs[3]; - U32 maxSize = 384; - U32 gs_x = 256; - U32 gs_y = (work_dim > 1) ? 256 : 1; - U32 gs_z = (work_dim > 2) ? gs[2] : 1; - for(U32 z = 1; z <= gs_z; z = get_next_ls_size(z)) { - if(0 != gs_z % z) continue; - for(U32 y = 1; y <= gs_y; y = get_next_ls_size(y)) { - if(0 != gs_y % y) continue; - for(U32 x = 1; x <= gs_x; x = get_next_ls_size(x)) { - if(0 != gs_x % x) continue; - U32 total = x * y * z; - if(total <= maxSize) { - test_gs[0] = (gs[0] + x - 1) / x * x; - test_gs[1] = (gs[1] + y - 1) / y * y; - test_gs[2] = (gs[2] + z - 1) / z * z; - test_ls[0] = x; - test_ls[1] = y; - test_ls[2] = z; - CHECK_STATUS(enqueue_ndrange_kernel(handle->queue_profiling, kernel, work_dim, NULL, test_gs, test_ls, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); - CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &time)); - if(minTime > time){ - minTime = time; - best_ls[0] = test_ls[0]; - best_ls[1] = test_ls[1]; - best_ls[2] = test_ls[2]; - } - CHECK_STATUS(release_event(handle->eventObj)); - } - } - } - } - test_ls[0] = 0; - test_ls[1] = 0; - test_ls[2] = 0; - CHECK_STATUS(enqueue_ndrange_kernel(handle->queue_profiling, kernel, work_dim, NULL, gs, test_ls, handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); - CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &time)); - if(minTime > time){ - minTime = time; - best_ls[0] = test_ls[0]; - best_ls[1] = test_ls[1]; - best_ls[2] = test_ls[2]; - } - CHECK_STATUS(release_event(handle->eventObj)); - if(best_ls[0] != 0 && best_ls[1] != 0 && best_ls[2] != 0) { - kernelInfo->gs[0] = (gs[0] + best_ls[0] - 1) / best_ls[0] * best_ls[0]; - kernelInfo->gs[1] = (gs[1] + best_ls[1] - 1) / best_ls[1] * best_ls[1]; - kernelInfo->gs[2] = (gs[2] + best_ls[2] - 1) / best_ls[2] * best_ls[2]; - } - kernelInfo->ls[0] = best_ls[0]; - kernelInfo->ls[1] = best_ls[1]; - kernelInfo->ls[2] = best_ls[2]; - handle->t_execute = minTime; -#ifdef _DEBUG - DEBUG_info_s("SELECT LS KERNEL>>> " << kernelInfo->name.c_str() << " runInfo:"); - DEBUG_info_s("best ls = " << best_ls[0] << " " << best_ls[1] << " " << best_ls[2] << " "); - DEBUG_info(" executeTime = " << minTime << " us"); -#endif - return SUCCESS; - } - - inline EE 
gcl_run_kernelVec_select_ls(GCLHandle_t handle, std::vector<U32> kernelIndex) {
-    if(kernelIndex.size() == 0) return SUCCESS;
-    CHECK_STATUS(gcl_create_queue_profiling(handle));
-    for(auto index : kernelIndex) {
-        auto kernelInfo = handle->kernelVec[index];
-        CHECK_STATUS(gcl_run_kernel_select_ls(handle, &kernelInfo));
-        handle->kernelVec[index].gs[0] = kernelInfo.gs[0];
-        handle->kernelVec[index].gs[1] = kernelInfo.gs[1];
-        handle->kernelVec[index].gs[2] = kernelInfo.gs[2];
-        handle->kernelVec[index].ls[0] = kernelInfo.ls[0];
-        handle->kernelVec[index].ls[1] = kernelInfo.ls[1];
-        handle->kernelVec[index].ls[2] = kernelInfo.ls[2];
-    }
-    CHECK_STATUS(gcl_destroy_queue_profiling(handle));
-    return SUCCESS;
- }
-
-#ifdef _DEBUG
- inline EE gcl_run_kernel_profiling(GCLHandle_t handle, Kernel kernel, U32 work_dim, U32* gs, U32* ls, CI8* kernelName = NULL) {
-    std::string name = "unknown kernel";
-    if(kernelName) name = kernelName;
-    DEBUG_info_s("KERNEL>>> " << name.c_str() << " runInfo:");
-    double totalTime = 0;
-    double executeTime = 0;
-    U32 loop = 10;
-    for(U32 i = 0; i < loop; i++) {
-        double t;
-        CHECK_STATUS(enqueue_ndrange_kernel(handle->queue, kernel, work_dim,
-                                            NULL, gs, ls,
-                                            handle->numWaitEvents, handle->waitEvents, handle->eventPtr));
-        CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &t));
-        CHECK_STATUS(release_event(handle->eventObj));
-        DEBUG_info("loop " << i << " executeTime = " << t << " us");
-        totalTime += t;
-    }
-    executeTime = totalTime / loop;
-    DEBUG_info("executeTime = " << executeTime << " us averaged over " << loop << " runs");
-    CHECK_STATUS(gcl_finish(handle));
-    return SUCCESS;
- }
-#endif
-
- inline EE gcl_create_memory(GCLHandle_t handle, GCLMem_t gclMem) {
-    GCLMemDesc_t desc = &gclMem->desc;
-    if(!desc->has_alloc) {
-        switch(desc->memType) {
-            case GCL_MEM_BUF: {
-                CHECK_STATUS(create_buffer(handle->context, desc->flags, desc->byteSize, desc->host_ptr, &gclMem->mem));
-                desc->has_alloc = true;
-                break;
-            }
-            case GCL_MEM_IMG_1D: {
-                CHECK_STATUS(create_image1D(handle->context, desc->flags, &desc->imgFormat, desc->stride[0], 0, desc->host_ptr, &gclMem->mem));
-                desc->has_alloc = true;
-                break;
-            }
-            case GCL_MEM_IMG_2D: {
-                CHECK_STATUS(create_image2D(handle->context, desc->flags, &desc->imgFormat, desc->stride[0], desc->stride[1], 0, desc->host_ptr, &gclMem->mem));
-                desc->has_alloc = true;
-                break;
-            }
-            case GCL_MEM_IMG_3D: {
-                CHECK_STATUS(create_image3D(handle->context, desc->flags, &desc->imgFormat, desc->stride[0], desc->stride[1], desc->stride[2], 0, 0, desc->host_ptr, &gclMem->mem));
-                desc->has_alloc = true;
-                break;
-            }
-            default: return NOT_SUPPORTED;
-        }
-    } else {
-        //std::cout << "warning: trying to alloc the same gpu mem again without release" << std::endl;
-    }
-    return SUCCESS;
- }
-
- inline EE gcl_trans_memory(GCLHandle_t handle, void* src, void* dst, U32* size, GCLMemTransType type, cl_bool blocking, U32* offset = NULL)
- {
-    DEBUG_info_s("DATATRANS>>>");
-    switch(type) {
-        case HOST_TO_DEVICE_BUF: {
-            U8* hostPtr = (U8*)src;
-            GCLMem_t gclMem = (GCLMem_t)dst;
-            U32 dstOff = (offset) ?
offset[0] : 0; - CHECK_STATUS(enqueue_write_buffer(handle->queue, gclMem->mem, blocking, dstOff, *size, hostPtr, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); - DEBUG_info_s("enqueue_write_buffer runInfo: "); - break; - } - case HOST_TO_DEVICE_IMG: { - U8* hostPtr = (U8*)src; - GCLMem_t gclMem = (GCLMem_t)dst; - U32 origin[3] = {0, 0, 0}; - if(offset) { - origin[0] = offset[0]; - origin[1] = offset[1]; - origin[2] = offset[2]; - } - CHECK_STATUS(enqueue_write_image(handle->queue, gclMem->mem, blocking, origin, size, 0, 0, hostPtr, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); - DEBUG_info_s("enqueue_write_image runInfo: "); - break; - } - case DEVICE_BUF_TO_HOST: { - U8* hostPtr = (U8*)dst; - GCLMem_t gclMem = (GCLMem_t)src; - U32 srcOff = (offset) ? offset[0] : 0; - CHECK_STATUS(enqueue_read_buffer(handle->queue, gclMem->mem, blocking, srcOff, *size, hostPtr, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); - DEBUG_info_s("enqueue_read_buffer runInfo: "); - break; - } - case DEVICE_IMG_TO_HOST: { - U8* hostPtr = (U8*)dst; - GCLMem_t gclMem = (GCLMem_t)src; - U32 origin[3] = {0, 0, 0}; - if(offset) { - origin[0] = offset[0]; - origin[1] = offset[1]; - origin[2] = offset[2]; - } - CHECK_STATUS(enqueue_read_image(handle->queue, gclMem->mem, blocking, origin, size, 0, 0, hostPtr, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); - DEBUG_info_s("enqueue_read_image runInfo: "); - break; - } - case DEVICE_BUF_TO_BUF: { - GCLMem_t srcBuf = (GCLMem_t)src; - GCLMem_t dstBuf = (GCLMem_t)dst; - U32 srcOff = 0; - U32 dstOff = 0; - if(offset) { - srcOff = offset[0]; - dstOff = offset[1]; - } - CHECK_STATUS(enqueue_copy_buffer(handle->queue, srcBuf->mem, dstBuf->mem, srcOff, dstOff, *size, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); - DEBUG_info_s("enqueue_copy_buffer runInfo: "); - break; - } - case DEVICE_BUF_TO_IMG: { - GCLMem_t srcBuf = (GCLMem_t)src; - GCLMem_t dstImg = (GCLMem_t)dst; - U32 origin[3] = {0, 0, 0}; - U32 srcOff = 0; - if(offset) { - srcOff = offset[0]; - origin[0] = offset[1]; - origin[1] = offset[2]; - origin[2] = offset[3]; - } - CHECK_STATUS(enqueue_copy_buffer_to_image(handle->queue, srcBuf->mem, dstImg->mem, srcOff, origin, size, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr)) - DEBUG_info_s("enqueue_copy_buffer_to_image runInfo: "); - break; - } - case DEVICE_IMG_TO_BUF: { - GCLMem_t srcImg = (GCLMem_t)src; - GCLMem_t dstBuf = (GCLMem_t)dst; - U32 origin[3] = {0, 0, 0}; - U32 dstOff = 0; - if(offset) { - origin[0] = offset[0]; - origin[1] = offset[1]; - origin[2] = offset[2]; - dstOff = offset[3]; - } - CHECK_STATUS(enqueue_copy_image_to_buffer(handle->queue, srcImg->mem, dstBuf->mem, origin, size, dstOff, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr)) - DEBUG_info_s("enqueue_copy_image_to_buffer runInfo: "); - break; - } - case DEVICE_IMG_TO_IMG: { - return NOT_SUPPORTED; - break; - } - default: return NOT_SUPPORTED; - } -#ifdef _DEBUG - double executeTime = 0; - CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &executeTime)); - CHECK_STATUS(release_event(handle->eventObj)); - DEBUG_info("executeTime = " << executeTime << " us"); - CHECK_STATUS(gcl_finish(handle)); -#endif - return SUCCESS; - } - - inline EE gcl_trans_buffer_rect(GCLHandle_t handle, void* src, void* dst, U32* host_org, U32* buf_org, U32* region, U32 host_row_pitch, U32 host_slice_pitch, - U32 buf_row_pitch, U32 buf_slice_pitch, GCLMemTransType type, cl_bool 
blocking) { - DEBUG_info_s("DATATRANS>>>"); - switch(type) { - case HOST_TO_DEVICE_BUF: { - GCLMem_t dstBuf = (GCLMem_t)dst; - CHECK_STATUS(enqueue_write_buffer_rect(handle->queue, dstBuf->mem, blocking, buf_org, host_org, region, buf_row_pitch, buf_slice_pitch, - host_row_pitch, host_slice_pitch, src, handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); - DEBUG_info_s("enqueue_write_buffer_rect runInfo: "); - break; - } - case DEVICE_BUF_TO_HOST: { - return NOT_SUPPORTED; - break; - } - default: return NOT_SUPPORTED; - } -#ifdef _DEBUG - double executeTime = 0; - CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &executeTime)); - CHECK_STATUS(release_event(handle->eventObj)); - DEBUG_info("executeTime = " << executeTime << " us"); - CHECK_STATUS(gcl_finish(handle)); -#endif - return SUCCESS; - } - - inline EE gcl_map_memory(GCLHandle_t handle, GCLMem_t gclMem, U32*offset, U32* size, cl_map_flags flags, cl_bool blocking) - { - DEBUG_info_s("DATAMAP>>> enqueue_map_buffer runInfo:"); - GCLMemDesc_t desc = &gclMem->desc; - if (gclMem->desc.memType == GCL_MEM_BUF) { - CHECK_STATUS(enqueue_map_buffer(handle->queue, gclMem->mem, blocking, flags, *offset, *size, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr, (void**)&desc->map_ptr)); - gclMem->mapPtrArray.push_back(desc->map_ptr); - } else { - return NOT_SUPPORTED; - } -#ifdef _DEBUG - double executeTime = 0; - CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &executeTime)); - CHECK_STATUS(release_event(handle->eventObj)); - DEBUG_info("executeTime = " << executeTime << " us"); - CHECK_STATUS(gcl_finish(handle)); -#endif - return SUCCESS; - } - - - inline EE gcl_fill_memory_zero(GCLHandle_t handle, GCLMem_t gclMem) { - if(gclMem->desc.memType == GCL_MEM_BUF) { - DEBUG_info_s("FILLMEM>>> enqueue_fill_buffer runInfo:"); - U8 pat_val = 0; - CHECK_STATUS(enqueue_fill_buffer(handle->queue, gclMem->mem, &pat_val, sizeof(pat_val), 0, gclMem->desc.byteSize, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); - } else { - DEBUG_info_s("FILLMEM>>> enqueue_fill_image runInfo:"); - F32 color[4] = {0.0f, 0.0f, 0.0f, 0.0f}; - U32 origin[3] = {0, 0, 0}; - U32 region[3]; - region[0] = gclMem->desc.stride[0]; - region[1] = gclMem->desc.stride[1]; - region[2] = gclMem->desc.stride[2]; - CHECK_STATUS(enqueue_fill_image(handle->queue, gclMem->mem, color, origin, region, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); - } -#ifdef _DEBUG - double executeTime = 0; - CHECK_STATUS(event_counting_time(handle->eventPtr, NULL, NULL, NULL, NULL, &executeTime)); - CHECK_STATUS(release_event(handle->eventObj)); - DEBUG_info("executeTime = " << executeTime << " us"); - CHECK_STATUS(gcl_finish(handle)); -#endif - return SUCCESS; - } - - inline EE gcl_get_mem_size(GCLMem_t gclMem, U32* size) { - CHECK_STATUS(get_memory_size(gclMem->mem, size)); - return SUCCESS; - } - - inline EE gcl_create_sub_buffer(U32 size, U32* offset, GCLMem_t src, Mem* subbuf){ - CHECK_STATUS(create_sub_buffer(src->mem, CL_MEM_READ_WRITE, *offset, size, subbuf)); - src->subMem.push_back(*subbuf); - *offset += (size + 1023) / 1024 * 1024; - return SUCCESS; - } - #ifdef __cplusplus - } - #endif - template - struct DummpyWrapper{ - static void set_kernel_arg_wrapper(Kernel kernel, const Tuple& t) { - DummpyWrapper::set_kernel_arg_wrapper(kernel, t); - auto arg = std::get(t); - set_kernel_arg(kernel, N-1, sizeof(arg), (void*)&arg); - } - }; - - template - struct DummpyWrapper{ - static void 
set_kernel_arg_wrapper(Kernel kernel, const Tuple& t) { - UNUSED(kernel); - UNUSED(t); - } - }; - - template - inline EE gcl_set_kernelArgs(Kernel kernel, Args ... args) { - std::tuple t = std::make_tuple(args...); - DummpyWrapper::set_kernel_arg_wrapper(kernel, t); - return SUCCESS; - } - - inline std::string gclMemDesc2Str(GCLMemDesc desc) { - char buff[128]; - snprintf(buff, sizeof(buff), "memFormat: %d, ", desc.memFormat); - std::string descStr = buff; - descStr += "stride("; - for(U32 i = 0; i < 3; i++) { - descStr += std::to_string(desc.stride[i]); - if(i < 2) descStr += ","; - } - descStr += "), "; - descStr += "offset("; - for(U32 i = 0; i < 3; i++) { - descStr += std::to_string(desc.offset[i]); - if(i < 2) descStr += ","; - } - descStr += ")"; - return descStr; - } -#ifdef _DEBUG - template - inline EE gcl_print_memory(GCLHandle_t handle, GCLMem_t gclMem, CI8* gclMemName = NULL) { - UNUSED(handle); - UNUSED(gclMem); - UNUSED(gclMemName); -/* GCLMemDesc_t desc = &gclMem->desc; - if(gclMemName) std::cout << "MEMORY>>>"<< gclMemName << " info:"<stride[0]; - U32 s1 = desc->stride[1]; - U32 s2 = desc->stride[2]; - switch(desc->memType) { - case GCL_MEM_BUF: { - U32 size = desc->byteSize; - hostPtr = new U8[(size_t)size]; - gcl_trans_memory(handle, (void*)gclMem, (void*)hostPtr, &size, DEVICE_BUF_TO_HOST, CL_TRUE); - break; - } - case GCL_MEM_IMG_1D: { - U32 dim[3]; - dim[0] = s0; - dim[1] = s1; - dim[2] = s2; - U32 size = desc->byteSize; - hostPtr = new U8[(size_t)size]; - gcl_trans_memory(handle, (void*)gclMem, (void*)hostPtr, dim, DEVICE_IMG_TO_HOST, CL_TRUE); - s0 = s0 * 4; - break; - } - case GCL_MEM_IMG_2D: { - break; - } - case GCL_MEM_IMG_3D: { - break; - } - default: return NOT_SUPPORTED; - } - - T* data = (T*)hostPtr; - if(desc->memFormat == DF_NCHW) { - std::cout << "Format: NCHW" << std::endl; - std::cout << "s0 = " << s0 << std::endl; - std::cout << "s1 = " << s1 << std::endl; - std::cout << "s2 = " << s2 << std::endl; - U32 num = 0; - for(U32 i = 0; i < s2; i++) { - U32 ii = i * s1 * s0; - for(U32 j = 0; j < s1; j++) { - U32 jj = j * s0; - for(U32 k = 0; k < s0; k++) { - std::cout << 0.0 + data[ii + jj + k] << " "; - if(num >= 63) {std::cout << std::endl; goto end;} - num++; - } - std::cout << std::endl; - } - std::cout << std::endl; - } - } - - if(desc->memFormat == DF_NCWHC4) { - std::cout << "Format: NCWHC4" << std::endl; - std::cout << "s0 * 4 = " << s0 * 4 << std::endl; - std::cout << "s1 = " << s1 << std::endl; - std::cout << "s2 = " << s2 << std::endl; - U32 num = 0; - for(U32 i = 0; i < s2; i++) { - U32 ii = i * s1 * s0 * 4; - for(U32 j = 0; j < s1; j++) { - U32 jj = j * s0 * 4; - for(U32 k = 0; k < s0 * 4; k++) { - std::cout << 0.0 + data[ii + jj + k] << " "; - if(num >= 63) {std::cout << std::endl; goto end;} - num++; - } - std::cout << std::endl; - } - std::cout << std::endl; - } - } - - if(desc->memFormat == DF_NHWC || desc->memFormat == DF_HWCN) { - if(desc->memFormat == DF_NHWC) std::cout << "Format: NHWC" << std::endl; - if(desc->memFormat == DF_HWCN) std::cout << "Format: HWCN" << std::endl; - std::cout << "s0 = " << s0 << std::endl; - std::cout << "s1 = " << s1 << std::endl; - std::cout << "s2 = " << s2 << std::endl; - U32 num = 0; - for(U32 i = 0; i < s2; i++) { - U32 ii = i * s1 * s0; - for(U32 j = 0; j < s1; j++) { - U32 jj = j * s0; - for(U32 k = 0; k < s0; k++) { - std::cout << 0.0 + data[ii + jj + k] << " "; - if(num >= 63) {std::cout << std::endl; goto end;} - num++; - } - std::cout << std::endl; - } - std::cout << std::endl; - } - } - - 
if(desc->memFormat == DF_NCHWN4C4) { - std::cout << "Format: NCHWN4C4" << std::endl; - std::cout << "s0 * 16 = " << s0 * 16 << std::endl; - std::cout << "s1 = " << s1 << std::endl; - std::cout << "s2 = " << s2 << std::endl; - U32 num = 0; - for(U32 i = 0; i < s2; i++) { - U32 ii = i * s1 * s0 * 16; - for(U32 j = 0; j < s1; j++) { - U32 jj = j * s0 * 16; - for(U32 k = 0; k < s0 * 16; k++) { - std::cout << 0.0 + data[ii + jj + k] << " "; - if(num >= 63) {std::cout << std::endl; goto end;} - num++; - } - std::cout << std::endl; - } - std::cout << std::endl; - } - } - - if(desc->memFormat == DF_NCWHN4C4) { - std::cout << "Format: NCWHN4C4" << std::endl; - std::cout << "s0 * 16 = " << s0 * 16 << std::endl; - std::cout << "s1 = " << s1 << std::endl; - std::cout << "s2 = " << s2 << std::endl; - U32 num = 0; - for(U32 i = 0; i < s2; i++) { - U32 ii = i * s1 * s0 * 16; - for(U32 j = 0; j < s1; j++) { - U32 jj = j * s0 * 16; - for(U32 k = 0; k < s0 * 16; k++) { - std::cout << 0.0 + data[ii + jj + k] << " "; - if(num >= 63) {std::cout << std::endl; goto end;} - num++; - } - std::cout << std::endl; - } - std::cout << std::endl; - } - } - - if(desc->memFormat == DF_NCHWN4C4) { - std::cout << "Format: NCHWN4C4" << std::endl; - std::cout << "s0 * 4 = " << s0 * 4 << std::endl; - std::cout << "s1 = " << s1 << std::endl; - std::cout << "s2 = " << s2 << std::endl; - U32 num = 0; - for(U32 i = 0; i < s2; i++) { - U32 ii = i * s1 * s0 * 4; - for(U32 j = 0; j < s1; j++) { - U32 jj = j * s0 * 4; - for(U32 k = 0; k < s0 * 4; k++) { - std::cout << 0.0 + data[ii + jj + k] << " "; - if(num >= 63) {std::cout << std::endl; goto end;} - num++; - } - std::cout << std::endl; - } - std::cout << std::endl; - } - } - if(desc->memFormat == DF_NHWCN4) { - std::cout << "Format: NHWCN4" << std::endl; - std::cout << "s0 * 4 = " << s0 * 4 << std::endl; - std::cout << "s1 = " << s1 << std::endl; - std::cout << "s2 = " << s2 << std::endl; - U32 num = 0; - for(U32 i = 0; i < s2; i++) { - U32 ii = i * s1 * s0 * 4; - for(U32 j = 0; j < s1; j++) { - U32 jj = j * s0 * 4; - for(U32 k = 0; k < s0 * 4; k++) { - std::cout << 0.0 + data[ii + jj + k] << " "; - if(num >= 63) {std::cout << std::endl; goto end;} - num++; - } - std::cout << std::endl; - } - std::cout << std::endl; - } - } -end: - delete[] hostPtr;*/ - return SUCCESS; - } - - template - inline EE gcl_print_buffer(GCLHandle_t handle, Mem mem, U32 num, CI8* bufferName = NULL) { - UNUSED(handle); - UNUSED(mem); - UNUSED(num); - UNUSED(bufferName); -/* if(bufferName) std::cout << "BUFFER>>> "<< bufferName << " info:"<>> unknown info: " << std::endl; - std::cout << "Element number = " << num << std::endl; - U8* hostPtr = new U8[(size_t)num * sizeof(T)]; - CHECK_STATUS(enqueue_read_buffer(handle->queue, mem, CL_TRUE, 0, num * sizeof(T), (void*)hostPtr, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); - T* val = (T*)hostPtr; - for(U32 i = 0; i < num; i++){ - std::cout << val[i] << " "; - if(i >= 63) break; - } - std::cout << std::endl; - delete[] hostPtr;*/ - return SUCCESS; - } - - template - inline EE gcl_write_buf_to_bin(GCLHandle_t handle, Mem buf, U32 size, CI8* dataName) { - U32 num = size / sizeof(T); - U8* hostPtr = new U8[size]; - F32* hostPtrTran = new F32[num]; - CHECK_STATUS(enqueue_read_buffer(handle->queue, buf, CL_TRUE, 0, size, hostPtr, - handle->numWaitEvents, handle->waitEvents, handle->eventPtr)); - T* val = (T*)hostPtr; - for(U32 i = 0; i < num; i++) hostPtrTran[i] = (F32)val[i]; - - FILE* outfile; - std::string fileName = dataName; - 
replace(fileName.begin(), fileName.end(), '/', '_'); - replace(fileName.begin(), fileName.end(), '.', '_'); - replace(fileName.begin(), fileName.end(), ' ', '_'); - fileName += "_gpu"; - fileName +=".out"; - outfile = fopen(fileName.c_str(), "wb"); - if(outfile == NULL) { - DEBUG_info("waring fopen outfile " << fileName <<" failed"); - delete[] hostPtr; - delete[] hostPtrTran; - return SUCCESS; - } - fwrite(hostPtrTran, sizeof(float), num, outfile); - fclose(outfile); - delete[] hostPtr; - delete[] hostPtrTran; - return SUCCESS; - } - template - inline EE gcl_write_data_to_bin(GCLHandle_t handle, TensorDesc tensorDesc, void* ptr, U32 ptrType, CI8* dataName = NULL) { - /*ptrType: - *GPU: 0 - *CPU: 1 - */ - DataFormat tdf; - DataType tdt; - U32 tn, tc, th, tw; - U32 dims; - tn = 1; tc = 1; th = 1; tw = 1; - dims = tensorDesc.nDims; - switch(dims) { - case 1: - tensor1dGet(tensorDesc, &tdt, &tw); - break; - case 2: - tensor2dfGet(tensorDesc, &tdt, &tdf, &th, &tw); - break; - case 3: - tensor3dGet(tensorDesc, &tdt, &tdf, &tc, &th, &tw); - break; - case 4: - tensor4dGet(tensorDesc, &tdt, &tdf, &tn, &tc, &th, &tw); - break; - default: CHECK_STATUS(NOT_SUPPORTED); - } - U32 num = tn * tc * th * tw; - F32* hostPtrTran = new F32[num]; - - if(ptrType == 0) { - GCLMem_t mem = (GCLMem_t)ptr; - GCLMemDesc desc = mem->desc; - GCLMemType type = desc.memType; - DataFormat df = desc.memFormat; - U8* hostPtr = nullptr; - U32 s0 = desc.stride[0]; - U32 s1 = desc.stride[1]; - U32 off0 = desc.offset[0]; - U32 off1 = desc.offset[1]; - U32 byteSize = desc.byteSize; - hostPtr = new U8[(size_t)byteSize]; - - GCLMemTransType tranType = DEVICE_BUF_TO_HOST; - U32 size[3] = {byteSize, 1, 1}; - if(type == GCL_MEM_IMG_1D) { - tranType = DEVICE_IMG_TO_HOST; - size[0] = s0; - } - gcl_trans_memory(handle, (void*)mem, (void*)hostPtr, size, tranType, CL_TRUE); - - T* val = (T*) hostPtr; - if(df == DF_NCWHC4) { - if(tdf == DF_NCHW) { - for(U32 i = 0; i < num; i++) { - U32 iw = i % tw; - U32 ih = (i / tw) % th; - U32 ic = i / (tw * th); - hostPtrTran[i] = (float)(val[((ic / 4) * s1 + iw + off1) * s0 * 4 + (ih + off0) * 4 + (ic & 3)]); - } - } - if(tdf == DF_MKT) { - for(U32 i = 0; i < num; i++) { - U32 ih = i % tw; - U32 ic = i / tw; - U32 in_off = ((ic / 4) * s1 + off1) * s0 * 4 + (ih + off0) * 4 + (ic & 3); - hostPtrTran[i] = (float)val[in_off]; - } - } - } else if(df == DF_NCHW || df == DF_NHWC) { - for(U32 i = 0; i < num; i++) { - U32 iw = i % tw; - U32 ih = (i / tw) % th; - U32 ic = i / (tw * th); - hostPtrTran[i] = (float)(val[(ic * s1 + ih + off1) * s0 + (iw + off0)]); - } - } else if(df == DF_NORMAL) { - for(U32 i = 0; i < num; i++) hostPtrTran[i] = (float)val[i]; - } else { - DEBUG_info("warning write GPU memory " << dataName <<" to bin, format not support: " << df); - delete[] hostPtrTran; - delete[] hostPtr; - return SUCCESS; - } - - delete[] hostPtr; - } - - if(ptrType == 1) { - T* val = (T*) ptr; - if(tdf == DF_NCHWC8) { - for(U32 i = 0; i < num; i++) { - U32 iw = i % tw; - U32 ih = (i / tw) % th; - U32 ic = i / (tw * th); - hostPtrTran[i] = (float)(val[((ic / 8) * th + ih) * tw * 8 + iw * 8 + (ic & 7)]); - } - } else if(tdf == DF_NORMAL || tdf == DF_NCHW) { - for(U32 i = 0; i < num; i++) { - hostPtrTran[i] = (float)(val[i]); - } - } else if(tdf == DF_MTK) { - for(U32 i = 0; i < num; i++) { - U32 it = i % th; - U32 ik = i / th; - U32 in_off = it * tw + ik; - hostPtrTran[i] = (float)(val[in_off]);//write as MKT, for compare with gpu - } - } else { - DEBUG_info("warning write CPU memory" << dataName <<" to bin, 
format not supported: " << tdf);
-            delete[] hostPtrTran;
-            return SUCCESS;
-        }
-    }
-
-    FILE* outfile;
-    std::string fileName = dataName;
-    replace(fileName.begin(), fileName.end(), '/', '_');
-    replace(fileName.begin(), fileName.end(), '.', '_');
-    replace(fileName.begin(), fileName.end(), ' ', '_');
-    if(ptrType == 0) fileName += "_gpu";
-    if(ptrType == 1) fileName += "_cpu";
-    fileName += ".out";
-
-    outfile = fopen(fileName.c_str(), "wb");
-    if(outfile == NULL) {
-        DEBUG_info("warning: fopen outfile " << fileName << " failed");
-        delete[] hostPtrTran;
-        return SUCCESS;
-    }
-    fwrite(hostPtrTran, sizeof(float), num, outfile);
-    fclose(outfile);
-    delete[] hostPtrTran;
-    return SUCCESS;
- }
-#endif
-#endif
diff --git a/gcl/include/gcl_kernel_binmap.h b/gcl/include/gcl_kernel_binmap.h
deleted file mode 100644
index a214cd06..00000000
--- a/gcl/include/gcl_kernel_binmap.h
+++ /dev/null
@@ -1,121 +0,0 @@
-// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
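The gcl_kernel_binmap.h header deleted below implements a self-registering factory: each device-specific map of precompiled kernel binaries registers a creator function at static-initialization time, and gcl_regist_binMap later looks the map up by device name. For orientation, a minimal sketch of how a generated binmap source would plug into it; the class name `MyDeviceBinmap` and the entry shown in the comment are hypothetical, not taken from this patch:

```cpp
// Sketch only. Assumes the gcl_kernel_binmap class and the
// REGISTER_GCLKERNELMAP macro defined in the header below.
class MyDeviceBinmap : public gcl_kernel_binmap {
public:
    MyDeviceBinmap()
    {
        // A generated source would call put() once per precompiled kernel,
        // e.g. put("MyDeviceBinmap_conv_direct_s1", someKernelBin);
    }
};
// Static registration: the expanded loader object's constructor registers
// the creator with gcl_kernel_binmap_factory before main() runs.
REGISTER_GCLKERNELMAP(MyDeviceBinmap)
```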
-
-
-
-#ifndef GCL_KERNELMAP
-#define GCL_KERNELMAP
-
-#include "gcl_common.h"
-#include <unordered_map>
-#include <mutex>
-typedef GCLKernelBin kernelBin;
-
-class gcl_kernel_binmap {
-    public:
-    gcl_kernel_binmap() {}
-    std::unordered_map<std::string, kernelBin>& binMap() {return binMap_;}
-    EE put(std::string kernelname, kernelBin kernelbin) {
-        std::lock_guard<std::mutex> lock(mtx_);
-        auto it = binMap_.find(kernelname);
-        if(it == binMap_.end()) binMap_.insert({kernelname, kernelbin});
-        return SUCCESS;
-    }
-    EE get(std::string kernelname, kernelBin** kernelbin_ptr) {
-        std::lock_guard<std::mutex> lock(mtx_);
-        auto it = binMap_.find(kernelname);
-        if(it == binMap_.end()) {
-            printf("the kernel %s doesn't exist in binMap\n", kernelname.c_str());
-            return NULL_POINTER;
-        }
-        *kernelbin_ptr = &it->second;
-        return SUCCESS;
-    }
-    private:
-    std::unordered_map<std::string, kernelBin> binMap_;
-    std::mutex mtx_;
-};
-
-
-class gcl_kernel_binmap_container {
-    public:
-    static gcl_kernel_binmap_container* instance() {
-        static gcl_kernel_binmap_container sInst;
-        return &sInst;
-    }
-    EE put(std::string kernel_binmap_name, std::unique_ptr<gcl_kernel_binmap> kernel_binmap) {
-        std::lock_guard<std::mutex> lock(mtx_);
-        auto it = kernel_binmap_container_.find(kernel_binmap_name);
-        if(it == kernel_binmap_container_.end()) kernel_binmap_container_.insert(std::make_pair(kernel_binmap_name, std::move(kernel_binmap)));
-        return SUCCESS;
-    }
-    EE get(std::string kernel_binmap_name, gcl_kernel_binmap** kernel_binmap_ptr) {
-        std::lock_guard<std::mutex> lock(mtx_);
-        auto it = kernel_binmap_container_.find(kernel_binmap_name);
-        if(it == kernel_binmap_container_.end()) {
-            printf("the kernel_binmap %s doesn't exist in kernel_binmap container\n", kernel_binmap_name.c_str());
-            return NULL_POINTER;
-        }
-        *kernel_binmap_ptr = it->second.get();
-        return SUCCESS;
-    }
-    private:
-    gcl_kernel_binmap_container() {}
-    std::unordered_map<std::string, std::unique_ptr<gcl_kernel_binmap>> kernel_binmap_container_;
-    std::mutex mtx_;
-};
-
-class gcl_kernel_binmap_factory {
-    public:
-    static gcl_kernel_binmap_factory* instance() {
-        static gcl_kernel_binmap_factory sInst;
-        return &sInst;
-    }
-    typedef gcl_kernel_binmap* (*PFN_GCLKERNELMAP_CREATOR)();
-    EE register_gcl_kernel_binmap(const std::string& kernel_binmap_name, PFN_GCLKERNELMAP_CREATOR pfnCreator) {
-        std::lock_guard<std::mutex> lock(mtx_);
-        auto it = creators_.find(kernel_binmap_name);
-        if(it == creators_.end()) creators_.insert({kernel_binmap_name, pfnCreator});
-        return SUCCESS;
-    }
-    EE create_gcl_kernel_binmap(const std::string& kernel_binmap_name) {
-        std::lock_guard<std::mutex> lock(mtx_);
-        auto it = creators_.find(kernel_binmap_name);
-        if(it == creators_.end()) {
-            printf("the kernel_binmap creator %s doesn't exist in kernel_binmap factory\n", kernel_binmap_name.c_str());
-            return NULL_POINTER;
-        }
-        PFN_GCLKERNELMAP_CREATOR pfn = it->second;
-        gcl_kernel_binmap_container::instance()->put(kernel_binmap_name, std::unique_ptr<gcl_kernel_binmap>(pfn()));
-        return SUCCESS;
-    }
-    private:
-    gcl_kernel_binmap_factory() {}
-    std::unordered_map<std::string, PFN_GCLKERNELMAP_CREATOR> creators_;
-    std::mutex mtx_;
-};
-
-#define REGISTER_GCLKERNELMAP_CREATOR_IMPL(kernel_binmap_name)\
-    namespace{\
-        static gcl_kernel_binmap* kernel_binmap_name ## _gcl_kernel_binmap_pfn() {return new kernel_binmap_name();}\
-        class kernel_binmap_name ## _gcl_kernel_binmap_loader{\
-            public:\
-            kernel_binmap_name ## _gcl_kernel_binmap_loader() {\
-                gcl_kernel_binmap_factory::instance()->register_gcl_kernel_binmap(#kernel_binmap_name, kernel_binmap_name ## _gcl_kernel_binmap_pfn);\
-            }\
-        };\
-        static kernel_binmap_name ## _gcl_kernel_binmap_loader kernel_binmap_name ## _sLoader;\
-    }
-
-#define REGISTER_GCLKERNELMAP(kernel_binmap_name)
REGISTER_GCLKERNELMAP_CREATOR_IMPL(kernel_binmap_name) -#endif diff --git a/gcl/include/kernel.h b/gcl/include/kernel.h deleted file mode 100644 index adf29fb4..00000000 --- a/gcl/include/kernel.h +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - -#ifndef KERNEL_H_ -#define KERNEL_H_ - -#ifdef __cplusplus -extern "C" { -#endif - - /** - * @brief get information of kernel - * @warning please free memory associate with value - **/ - inline EE get_kernel_info(Kernel kernel, cl_kernel_info info, void* *value, size_t *size) { - if(NULL == value) return NULL_POINTER; - - size_t len; - cl_int ret = clGetKernelInfo(kernel, info, 0, NULL, &len); - if(CL_SUCCESS == ret) { - if(NULL != size) *size = len; - void* data = malloc(len); - if(NULL == data) return ALLOC_FAILED; - ret = clGetKernelInfo(kernel, info, len, data, NULL); - if(CL_SUCCESS == ret) { *value = data; } else { free(data); } - } - - map_cl_error_2_ee(ret); - } - - /** - * @brief get workgroup information of kernel - * @warning please free memory associate with value - **/ - inline EE get_kernel_workgroup_info(Kernel kernel, Device device, cl_kernel_work_group_info info, void* *value, size_t *size) { - size_t len; - cl_int ret = clGetKernelWorkGroupInfo(kernel, device, info, 0, NULL, &len); - if(CL_SUCCESS == ret) { - if(NULL != size) *size = len; - void* data = malloc(len); - if(NULL == data) return ALLOC_FAILED; - *value = data; - } - - map_cl_error_2_ee(ret); - } - - inline EE create_kernels_in_program(Program program, U32 num_kernel, Kernel* kernels){ - if(kernels == nullptr) return NULL_POINTER; - I32 ret = clCreateKernelsInProgram(program, num_kernel, kernels, NULL); - map_cl_error_2_ee(ret); - } - - inline EE create_kernel(Program program, CI8 *name, Kernel* kernel) { - if(kernel == nullptr) return NULL_POINTER; - I32 ret; - *kernel = clCreateKernel(program, name, &ret); - map_cl_error_2_ee(ret); - } - - inline EE retain_kernel(Kernel kernel) { - cl_int ret = clRetainKernel(kernel); - map_cl_error_2_ee(ret); - } - - inline EE release_kernel(Kernel kernel) { - cl_int ret = clReleaseKernel(kernel); - map_cl_error_2_ee(ret); - } - - inline EE set_kernel_arg(Kernel kernel, - U32 arg_index, U32 arg_size, - const void *arg_value) { - cl_int ret = clSetKernelArg(kernel, arg_index, arg_size, arg_value); - map_cl_error_2_ee(ret); - } -/* - inline EE clone_kernel(Kernel src_kernel, Kernel* dst_kernel) { - //TODO - I32 ret; - dst_kernel = 
clCloneKernel(src_kernel, &ret); - map_cl_error_2_ee(ret); - } -*/ - inline EE enqueue_ndrange_kernel(CommandQueue queue, Kernel kernel, U32 work_dim, CU32* global_work_offset, CU32* global_work_size, CU32* local_work_size, - U32 num_events_in_wait_list, const Event* event_in_wait_list, Event* event){ - I32 ret; - UNUSED(global_work_offset); - UNUSED(local_work_size); - switch(work_dim){ - case 1:{ - size_t gs = global_work_size[0]; - size_t ls = local_work_size[0]; - size_t* ls_ptr = (ls == 0) ? NULL : &ls; - ret = clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, &gs, ls_ptr, num_events_in_wait_list, event_in_wait_list, event); - break;} - case 2:{ - size_t gs[2] = {global_work_size[0], global_work_size[1]}; - size_t ls[2] = {local_work_size[0], local_work_size[1]}; - size_t* ls_ptr = (ls[0] == 0 || ls[1] == 0) ? NULL : ls; - ret = clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, gs, ls_ptr, num_events_in_wait_list, event_in_wait_list, event); - break;} - case 3:{ - size_t gs[3] = {global_work_size[0], global_work_size[1], global_work_size[2]}; - size_t ls[3] = {local_work_size[0], local_work_size[1], local_work_size[2]}; - size_t* ls_ptr = (ls[0] == 0 || ls[1] == 0 || ls[2] == 0) ? NULL : ls; - ret = clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, gs, ls_ptr, num_events_in_wait_list, event_in_wait_list, event); - break;} - default: - return NOT_SUPPORTED; - } - map_cl_error_2_ee(ret); - } - -#ifdef __cplusplus -} -#endif -#endif diff --git a/gcl/include/memory.h b/gcl/include/memory.h deleted file mode 100644 index a4003074..00000000 --- a/gcl/include/memory.h +++ /dev/null @@ -1,487 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
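Taken together, the kernel.h wrappers deleted just above reduce a launch to create / set-arg / enqueue / release, with zeroed local sizes handing the choice back to the driver. A minimal usage sketch under those wrappers; the `program`, `queue`, `inBuf`, and `outBuf` variables and the kernel name are assumed to exist and are not taken from this patch:

```cpp
// Sketch only: driving a kernel through the deleted kernel.h wrappers.
Kernel kernel;
CHECK_STATUS(create_kernel(program, "sample_kernel", &kernel));
CHECK_STATUS(set_kernel_arg(kernel, 0, sizeof(cl_mem), (void*)&inBuf));
CHECK_STATUS(set_kernel_arg(kernel, 1, sizeof(cl_mem), (void*)&outBuf));
U32 gs[2] = {256, 256}; // global work size, padded by the caller
U32 ls[2] = {16, 16};   // local work size; zeros would let the driver pick
CHECK_STATUS(enqueue_ndrange_kernel(queue, kernel, 2, NULL, gs, ls,
                                    0, NULL, NULL));
CHECK_STATUS(release_kernel(kernel));
```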
- - - -#ifndef _H_BUFFER -#define _H_BUFFER - -#include "event.h" - -#ifdef __cplusplus -extern "C" { -#endif - - /** - * @brief get memory information - * - **/ - inline EE get_mememory_info(Mem mem, cl_mem_info info, void* *value, U32 *len) { - if(NULL == value) return NULL_POINTER; - - size_t size; - I32 ret = clGetMemObjectInfo(mem, info, 0, NULL, &size); - if(CL_SUCCESS == ret) { - if(NULL != len) *len = size; - void* data = malloc(size); - if(NULL == data) return NULL_POINTER; - ret = clGetMemObjectInfo(mem, info, size, data, NULL); - if(CL_SUCCESS == ret) *value = data; - } - - map_cl_error_2_ee(ret); - } - -#if defined(CL_VERSION_1_2) - - inline EE create_image1D(Context context, cl_mem_flags flags, const cl_image_format *format, U32 len, U32 pitch, void* host_ptr, Mem *image) { - cl_image_desc image_desc; - image_desc.image_type = CL_MEM_OBJECT_IMAGE1D; - image_desc.image_width = len; - image_desc.image_height = 1; - image_desc.image_depth = 1; - image_desc.image_array_size = 1; - image_desc.image_row_pitch = pitch; - image_desc.image_slice_pitch = 0; - image_desc.num_mip_levels = 0; - image_desc.num_samples = 0; - image_desc.buffer = NULL; - - I32 ret; - Mem temp = clCreateImage(context, flags, format, &image_desc, host_ptr, &ret); - *image = temp; - map_cl_error_2_ee(ret); - } - - /** - * @brief create 1d image buffer - * - **/ - inline EE create_image1D_buffer(Context context, cl_mem_flags flags, const cl_image_format *format, U32 len, const cl_mem buffer, Mem *image) { - cl_image_desc image_desc; - image_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; - image_desc.image_width = len; - image_desc.image_height = 1; - image_desc.image_depth = 1; - image_desc.image_array_size = 1; - image_desc.image_row_pitch = len; - image_desc.image_slice_pitch = len; - image_desc.num_mip_levels = 0; - image_desc.num_samples = 0; - image_desc.buffer = buffer; - - I32 ret; - Mem temp = clCreateImage(context, flags, format, &image_desc, NULL, &ret); - if(CL_SUCCESS == ret) *image = temp; - map_cl_error_2_ee(ret); - } -#endif - - /** - * @brief create 2d image object - * - **/ - inline EE create_image2D(Context cont, cl_mem_flags flags, cl_image_format *format, U32 width, U32 height, U32 pitch, void* host_ptr, Mem *mem) { - I32 ret; -#if defined(CL_VERSION_1_2) - cl_image_desc image_desc; - image_desc.image_type = CL_MEM_OBJECT_IMAGE2D; - image_desc.image_width = width; - image_desc.image_height = height; - image_desc.image_depth = 1; - image_desc.image_array_size = 1; - image_desc.image_row_pitch = pitch; - image_desc.image_slice_pitch = 0; - image_desc.num_mip_levels = 0; - image_desc.num_samples = 0; - image_desc.buffer = NULL; - - Mem temp = clCreateImage(cont, flags, format, &image_desc, host_ptr, &ret); -#else - Mem temp = clCreateImage2D(cont, flags, format, width, height, pitch, host_ptr, &ret); -#endif - if(CL_SUCCESS == ret) *mem = temp; - - map_cl_error_2_ee(ret); - } - -#if defined(CL_VERSION_1_2) - /** - * @brief create 2d image buffer object - * - **/ - inline EE create_image2D_array(Context cont, cl_mem_flags flags, cl_image_format *format, U32 width, U32 height, U32 pitch, U32 arraySize, void* host_ptr, Mem *mem) { - I32 ret; - cl_image_desc image_desc; - image_desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY; - image_desc.image_width = width; - image_desc.image_height = height; - image_desc.image_depth = 1; - image_desc.image_array_size = arraySize; - image_desc.image_row_pitch = pitch; - image_desc.image_slice_pitch = 0; - image_desc.num_mip_levels = 0; - image_desc.num_samples = 0; 
- image_desc.buffer = NULL; - - *mem = clCreateImage(cont, flags, format, &image_desc, host_ptr, &ret); - map_cl_error_2_ee(ret); - } -#endif - - /** - * @brief create 3d image object - * - **/ - inline EE create_image3D(Context cont, cl_mem_flags flags, cl_image_format *format, U32 width, U32 height, U32 depth, U32 rowPitch, U32 slicePitch, void* host_ptr, Mem *mem) { - I32 ret; -#if defined(CL_VERSION_1_2) - cl_image_desc image_desc; - image_desc.image_type = CL_MEM_OBJECT_IMAGE3D; - image_desc.image_width = width; - image_desc.image_height = height; - image_desc.image_depth = depth; - image_desc.image_array_size = 1; - image_desc.image_row_pitch = rowPitch; - image_desc.image_slice_pitch = slicePitch; - image_desc.num_mip_levels = 0; - image_desc.num_samples = 0; - image_desc.buffer = NULL; - - Mem temp = clCreateImage(cont, flags, format, &image_desc, host_ptr, &ret); -#else - Mem temp = clCreateImage3D(cont, flags, format, width, height, depth, rowPitch, slicePitch, host_ptr, &ret); -#endif - if(CL_SUCCESS == ret) *mem = temp; - - map_cl_error_2_ee(ret); - } - - /** - * @brief get image information - * - **/ - inline EE get_image_info(Mem mem, cl_mem_info info, void* *value, U32 *len) { - size_t size; - I32 ret = clGetImageInfo(mem, info, 0, NULL, &size); - if(CL_SUCCESS == ret) { - if(NULL != len) *len = size; - - void* data = malloc(size); - if(NULL == data) return NULL_POINTER; - ret = clGetImageInfo(mem, info, size, data, NULL); - if(CL_SUCCESS == ret) *value = data; - } - - map_cl_error_2_ee(ret); - } - - /** - * @brief get supported image format - * - * @warning please free memory associated with format - **/ - inline EE get_supported_image_formats(Context cont, cl_mem_flags flags, cl_mem_object_type type, cl_image_format **format, U32 *num) { - if(NULL == format) return NULL_POINTER; - - U32 len; - I32 ret = clGetSupportedImageFormats(cont, flags, type, 0, NULL, &len); - if(CL_SUCCESS == ret) { - if(NULL != num) *num = len; - cl_image_format *data = (cl_image_format*) malloc(len); - if(NULL == data) return NULL_POINTER; - ret = clGetSupportedImageFormats(cont, flags, type, len, data, 0); - if(CL_SUCCESS == ret) *format = data; - } - - map_cl_error_2_ee(ret); - } - - inline EE retain_memory(Mem mem) { - I32 ret = clRetainMemObject(mem); - map_cl_error_2_ee(ret); - } - - inline EE release_memory(Mem mem) { - I32 ret = clReleaseMemObject(mem); - map_cl_error_2_ee(ret); - } - - inline EE enqueue_unmap_memory(CommandQueue queue, Mem mem, void* mapped_ptr, - I32 num_wait_events, const Event *wait_events, Event *event) { - I32 ret = clEnqueueUnmapMemObject(queue, mem, mapped_ptr, - num_wait_events, wait_events, event); - - map_cl_error_2_ee(ret); - } - - inline EE create_buffer(Context context, cl_mem_flags flags, U32 size, - void* host_ptr, Mem* buffe) { - I32 ret; - size_t len = size; - *buffe = clCreateBuffer(context, flags, len, host_ptr, &ret); - map_cl_error_2_ee(ret); - } - - inline EE create_sub_buffer(Mem buffer, cl_mem_flags flags, - U32 offset, U32 size, Mem* sub) { - I32 ret; - cl_buffer_region region = { offset, size}; - *sub = clCreateSubBuffer(buffer, flags, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &ret); - map_cl_error_2_ee(ret); - } - - inline EE enqueue_read_buffer(CommandQueue queue, Mem buffer, cl_bool blocking, - U32 offset, U32 size, void* ptr, - U32 num_wait_events, const Event* wait_events, Event* event) { - I32 ret = clEnqueueReadBuffer(queue, buffer, blocking, - offset, size, ptr, num_wait_events, wait_events, event); - map_cl_error_2_ee(ret); - } - - /* - inline 
EE enqueue_read_buffer_rect(CommandQueue queue, Mem buffer, cl_bool blocking, - const U32 *buffer_origin, const U32 *host_origin, const U32 *region, - U32 buffer_row_pitch, U32 buffer_slice_pitch, U32 host_row_pitch, - U32 host_slice_pitch, void *ptr, U32 num_wait_events, - const Event *wait_events, Event *event) { - - I32 ret = clEnqueueReadBufferRect(queue, buffer, blocking, - buffer_origin, host_origin, region, - buffer_row_pitch, buffer_slice_pitch, host_row_pitch, - host_slice_pitch, ptr, num_wait_events, wait_events, event); - map_cl_error_2_ee(ret); - } -*/ - inline EE enqueue_write_buffer(CommandQueue queue, Mem buffer, cl_bool blocking, - U32 offset, U32 size, const void *ptr, U32 num_wait_events, - const Event *wait_events, Event *event) { - - I32 ret = clEnqueueWriteBuffer(queue, buffer, blocking, - offset, size, ptr, num_wait_events, - wait_events, event); - map_cl_error_2_ee(ret); - } - - inline EE enqueue_fill_buffer(CommandQueue queue, Mem buffer, const void *pattern, - U32 pattern_size, U32 offset, U32 size, U32 num_wait_events, - const Event *wait_events, Event *event) { - size_t pat_size = pattern_size; - size_t off = offset; - size_t si = size; - I32 ret = clEnqueueFillBuffer(queue, buffer, pattern, pat_size, off, si, num_wait_events, wait_events, event); - map_cl_error_2_ee(ret); - } - - inline EE enqueue_write_buffer_rect(CommandQueue queue, Mem buffer, cl_bool blocking_write, const U32 *buffer_origin, const U32 *host_origin, - const U32 *region, U32 buffer_row_pitch, U32 buffer_slice_pitch, U32 host_row_pitch, U32 host_slice_pitch, const void *ptr, - U32 num_wait_events, const Event *wait_events, Event *event) { - size_t b_ori[3]; - size_t h_ori[3]; - size_t reg[3]; - size_t b_rp = buffer_row_pitch; - size_t b_sp = buffer_slice_pitch; - size_t h_rp = host_row_pitch; - size_t h_sp = host_slice_pitch; - for(U32 i = 0; i < 3; i++) { - b_ori[i] = buffer_origin[i]; - h_ori[i] = host_origin[i]; - reg[i] = region[i]; - } - I32 ret = clEnqueueWriteBufferRect(queue, buffer, blocking_write, b_ori, h_ori, reg, b_rp, b_sp, h_rp, h_sp, - ptr, num_wait_events, wait_events, event); - map_cl_error_2_ee(ret); - } - - inline EE enqueue_copy_buffer(CommandQueue queue, Mem src_buffer, Mem dst_buffer, - U32 src_offset, U32 dst_offset, U32 size, U32 num_wait_events, - const Event *wait_events, Event *event){ - I32 ret = clEnqueueCopyBuffer(queue, src_buffer, dst_buffer, - src_offset, dst_offset, size, - num_wait_events, wait_events, event); - map_cl_error_2_ee(ret); - } - - /* - EE enqueue_copy_buffer_rect(CommandQueue queue, Mem src_buffer, Mem dst_buffer, - const U32 *src_origin, const U32 *dst_origin, const U32 *region, - U32 src_row_pitch, U32 src_slice_pitch, U32 dst_row_pitch, - U32 dst_slice_pitch, U32 num_wait_events, - const Event *wait_events, Event *event) { - I32 ret = clEnqueueCopyBufferRect(queue, src_buffer, dst_buffer, - const size_t *src_origin, const size_t *dst_origin, const size_t *region, - src_row_pitch, src_slice_pitch, dst_row_pitch, - dst_slice_pitch, num_wait_events, wait_events, event); - map_cl_error_2_ee(ret); - } - */ - - inline EE enqueue_map_buffer(CommandQueue queue, Mem buffer, cl_bool blocking_map, - cl_map_flags map_flags, U32 offset, U32 size, - U32 num_wait_events, const Event *wait_events, Event *event, - void* *ptr) { - I32 ret; - *ptr = clEnqueueMapBuffer(queue, buffer, blocking_map, map_flags, offset, size, - num_wait_events, wait_events, event, &ret); - map_cl_error_2_ee(ret); - } - - inline EE create_image(Context context, cl_mem_flags flags, 
const cl_image_format *image_format, - const cl_image_desc *image_desc, void *host_ptr, Mem* mem) { - I32 ret; - *mem = clCreateImage(context, flags, image_format, image_desc, host_ptr, &ret); - map_cl_error_2_ee(ret); - } - - inline EE enqueue_read_image(CommandQueue queue, Mem image, cl_bool blocking_read, - const U32 *origin, const U32 *region, U32 row_pitch, U32 slice_pitch, - void *ptr, U32 num_wait_events, const Event *wait_events, - Event *event) { - size_t org [3]; - size_t reg [3]; - for(U32 i = 0; i < 3; ++i){ - org[i] = (size_t)origin[i]; - reg[i] = (size_t)region[i]; - } - I32 ret = clEnqueueReadImage(queue, image, blocking_read, org, reg, row_pitch, slice_pitch, - ptr, num_wait_events, wait_events, event); - map_cl_error_2_ee(ret); - } - - inline EE enqueue_write_image(CommandQueue queue, Mem image, cl_bool blocking_write, - const U32 *origin, const U32 *region, U32 input_row_pitch, - U32 input_slice_pitch, const void *ptr, U32 num_wait_events, - const Event *wait_events, Event *event) { - size_t org [3]; - size_t reg [3]; - for(U32 i = 0; i < 3; ++i){ - org[i] = (size_t)origin[i]; - reg[i] = (size_t)region[i]; - } - I32 ret = clEnqueueWriteImage(queue, image, blocking_write, org, reg, input_row_pitch, - input_slice_pitch, ptr, num_wait_events, wait_events, event); - map_cl_error_2_ee(ret); - } - - inline EE enqueue_fill_image(CommandQueue queue, Mem image, const void *fill_color, - const U32 *origin, const U32 *region,U32 num_wait_events, - const Event *wait_events, Event *event) { - size_t org [3]; - size_t reg [3]; - for(U32 i = 0; i < 3; ++i){ - org[i] = (size_t)origin[i]; - reg[i] = (size_t)region[i]; - } - I32 ret = clEnqueueFillImage(queue, image, fill_color, - org, reg, num_wait_events, wait_events, event); - map_cl_error_2_ee(ret); - } - - inline EE enqueue_copy_image_to_buffer(CommandQueue queue, Mem src_image, Mem dst_buffer, - const U32 *src_origin, const U32 *region, U32 dst_offset, - U32 num_wait_events, const cl_event *wait_events, cl_event *event) { - size_t org [3]; - size_t reg [3]; - for(U32 i = 0; i < 3; ++i){ - org[i] = (size_t)src_origin[i]; - reg[i] = (size_t)region[i]; - } - I32 ret = clEnqueueCopyImageToBuffer(queue, src_image, dst_buffer, org, reg, dst_offset, - num_wait_events, wait_events, event); - map_cl_error_2_ee(ret); - } - - inline EE enqueue_copy_buffer_to_image(CommandQueue queue, Mem src_buffer, Mem dst_image, - U32 src_offset, const U32 *dst_origin, const U32 *region, - U32 num_wait_events, const cl_event *wait_events, cl_event *event) { - size_t org [3]; - size_t reg [3]; - for(U32 i = 0; i < 3; ++i){ - org[i] = (size_t)dst_origin[i]; - reg[i] = (size_t)region[i]; - } - I32 ret = clEnqueueCopyBufferToImage(queue, src_buffer, dst_image, - src_offset, org, reg, num_wait_events, wait_events, event); - map_cl_error_2_ee(ret); - } -/* - - EE enqueue_copy_image(CommandQueue queue, Mem src_image, Mem dst_image, - const U32 *src_origin, const U32 *dst_origin, const U32 *region, - U32 num_wait_events, const cl_event *wait_events, cl_event *event) { - I32 ret = clEnqueueCopyImage(queue, src_image, dst_image, - const size_t *src_origin, const size_t *dst_origin, const size_t *region, - num_wait_events, wait_events, event); - map_cl_error_2_ee(ret); - } - - - - EE enqueue_map_image(CommandQueue queue, Mem image, cl_bool blocking_map, - cl_map_flags map_flags, const U32 *origin, const U32 *region, - U32 *image_row_pitch, U32 *image_slice_pitch, U32 num_wait_events, - const cl_event *wait_events, cl_event *event, void* *ptr) { - I32 ret; - *ptr = 
clEnqueueMapImage(queue, image, blocking_map, - map_flags, const size_t *origin, const size_t *region, - size_t *image_row_pitch, size_t *image_slice_pitch, - num_wait_events, wait_events, event, &ret); - map_cl_error_2_ee(ret); - } -*/ - - inline EE create_sampler(Context context, const cl_sampler_properties* properties, Sampler *s) { - I32 ret; - *s = clCreateSamplerWithProperties(context, properties, &ret); - map_cl_error_2_ee(ret); - } - - inline EE retain_sampler(Sampler s) { - I32 ret = clRetainSampler(s); - map_cl_error_2_ee(ret); - } - - inline EE release_sampler(Sampler s) { - I32 ret = clReleaseSampler(s); - map_cl_error_2_ee(ret); - } - - inline EE get_sampler_info(Sampler s, - cl_sampler_info info, - void** value, size_t *len) { - if(NULL == value) return NULL_POINTER; - - size_t size; - I32 ret = clGetSamplerInfo(s, info, 0, NULL, &size); - if(CL_SUCCESS == ret) { - if(NULL != len) *len = size; - void* data = malloc(size); - if(NULL == data) return NULL_POINTER; - ret = clGetSamplerInfo(s, info, size, data, NULL); - if(CL_SUCCESS == ret) *value = data; - } - - map_cl_error_2_ee(ret); - } - - inline EE get_memory_size(Mem memory, U32* size){ - size_t len; - int ret = clGetMemObjectInfo(memory, CL_MEM_SIZE, sizeof(len), &len, NULL); - *size = len; - map_cl_error_2_ee(ret); - } -#ifdef __cplusplus -} -#endif - -#endif diff --git a/gcl/include/platform.h b/gcl/include/platform.h deleted file mode 100644 index d2726514..00000000 --- a/gcl/include/platform.h +++ /dev/null @@ -1,397 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
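The memory.h helpers deleted above wrap buffer and image management behind EE-returning calls so callers never touch raw cl_int codes. A minimal host-to-device round trip under those wrappers; the `context` and `queue` handles are assumed to exist elsewhere, and the size is illustrative:

```cpp
// Sketch only: buffer round trip with the deleted memory.h wrappers.
float hostData[1024] = {0};
U32 size = sizeof(hostData);
Mem buf;
CHECK_STATUS(create_buffer(context, CL_MEM_READ_WRITE, size, NULL, &buf));
CHECK_STATUS(enqueue_write_buffer(queue, buf, CL_TRUE, 0, size, hostData,
                                  0, NULL, NULL));
// ... enqueue kernels that consume buf ...
CHECK_STATUS(enqueue_read_buffer(queue, buf, CL_TRUE, 0, size, hostData,
                                 0, NULL, NULL));
CHECK_STATUS(release_memory(buf));
```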
-
-
-
-#ifndef _H_PLATFORM
-#define _H_PLATFORM
-
-#include <string>
-#include <string.h>
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-    typedef enum {
-        VENDOR_ARM = 0,
-    } PlatformVendor;
-
-    inline EE get_platforms(U32 *numPlatforms, Platform** platforms) {
-        if(NULL == platforms || NULL == numPlatforms) return NULL_POINTER;
-        U32 num;
-        I32 ret = clGetPlatformIDs(0, NULL, &num);
-        if(CL_SUCCESS == ret) {
-            *numPlatforms = num;
-            Platform *p = (Platform*)malloc(num * sizeof(Platform));
-            if(NULL == p) return ALLOC_FAILED;
-
-            ret = clGetPlatformIDs(num, p, NULL);
-            if(CL_SUCCESS != ret) { free(p); } else { *platforms = p; }
-        }
-
-        map_cl_error_2_ee(ret);
-    }
-
-    static cl_bool stringContains(char* big, const char* s) {
-        for(unsigned int i = 0; i < strlen(big); i++) big[i] = tolower(big[i]);
-        std::string str(big);
-        return std::string::npos != str.find(s);
-    }
-
-    /**
-     * @brief get information from platform
-     *
-     * @param value value associated with info, memory is allocated by this
-     * function
-     * @param len the length of value, returned by this function
-     *
-     **/
-
-    inline EE get_platform_info(Platform platform,
-                                cl_platform_info info,
-                                void** value, U32 *len) {
-        if(NULL == len || NULL == value) return NULL_POINTER;
-        size_t sizeRet;
-        I32 ret = clGetPlatformInfo(platform, info, 0, NULL, &sizeRet);
-        if(CL_SUCCESS == ret) {
-            if(len) *len = (U32)sizeRet;
-            void* data = malloc(sizeRet + 1);
-            if(NULL == data) { return ALLOC_FAILED; }
-
-            ret = clGetPlatformInfo(platform, info, sizeRet + 1, data, NULL);
-            if(CL_SUCCESS != ret) { free(data); } else { *value = data; }
-        }
-
-        map_cl_error_2_ee(ret);
-    }
-
-    /**
-     * @brief select platform by vendor
-     *
-     * @param vendor the vendor of the platform we want
-     * @param platform output, the selected platform
-     *
-     **/
-    inline EE select_platform(PlatformVendor vendor, Platform* platform) {
-        if(NULL == platform) return NULL_POINTER;
-
-        const static char* key[] = {"arm", "qualcomm"};
-        U32 num_platforms;
-        Platform* platforms;
-        EE ret = get_platforms(&num_platforms, &platforms);
-        if(SUCCESS == ret) {
-            const char* platform_vendor = key[vendor];
-            for(U32 i = 0; i < num_platforms; i++) {
-                Platform p = platforms[i];
-                U32 nameLen;
-                char *name;
-                ret = get_platform_info(p, CL_PLATFORM_NAME, (void**)&name, &nameLen);
-                if(SUCCESS == ret) {
-                    if(stringContains(name, platform_vendor)) *platform = p;
-                    free(name);
-                }
-            }
-        }
-        free(platforms);
-
-        map_cl_error_2_ee(ret);
-    }
-
-#define CHAR_PLATFORM_INFO(info, str) {\
-    EE ret = get_platform_info(p, info, &value, &len); \
-    if(SUCCESS == ret) {\
-        char* tmp = (char*) value;\
-        tmp[len] = '\0';\
-        printf(str": %s\n", tmp);\
-        free(value);\
-    } else { map_cl_error_2_ee(ret); }\
-}
-
-/**
- * @brief list information about platform
- *
- */
-inline EE list_platform_info(Platform p) {
-    void* value;
-    U32 len;
-
-    CHAR_PLATFORM_INFO(CL_PLATFORM_PROFILE, "\t Profile");
-    CHAR_PLATFORM_INFO(CL_PLATFORM_VERSION, "\t Version ");
-    CHAR_PLATFORM_INFO(CL_PLATFORM_NAME, "\t Name ");
-    CHAR_PLATFORM_INFO(CL_PLATFORM_VENDOR, "\t Vendor ");
-    CHAR_PLATFORM_INFO(CL_PLATFORM_EXTENSIONS, "\t Extensions ");
-
-    return SUCCESS;
-}
-
-/**
- * @brief get devices in platform, and allocate space for storing devices
- * @warning please free space of devices allocated in this function
- *
- * @param p input, specify platform, device will be retrieved from this platform
- * @param type input, specify device type
- * @param num_devices output, return
device number with type in platform p - * @param devices output, return devices - * - * @return - * 0 means sucess - * -1 means fail - * - */ -inline EE platform_get_devices(Platform platform, - cl_device_type type, U32 *num_devices, Device **devices){ - if(NULL == devices || NULL == num_devices) return NULL_POINTER; - - U32 num; - I32 ret = clGetDeviceIDs(platform, type, 0, NULL, &num); - if(CL_SUCCESS == ret) { - *num_devices = num; - - Device *did = (Device*) malloc(num*sizeof(Device)); - if(NULL == did) return ALLOC_FAILED; - - ret = clGetDeviceIDs(platform, type, num, did, NULL); - if(CL_SUCCESS != ret){ free(did);} else { *devices = did;} - } - map_cl_error_2_ee(ret); -} - -inline EE create_sub_device(Device device, - const cl_device_partition_property* properties, - U32* num_devices, Device** devices) { - U32 len; - I32 ret = clCreateSubDevices(device, properties, 0, NULL, &len); - if(CL_SUCCESS == ret) { - if(NULL != num_devices) *num_devices = len; - Device *d = (Device*) malloc(sizeof(Device)*len); - if(NULL == d) return ALLOC_FAILED; - ret = clCreateSubDevices(device, properties, len, d, NULL); - if(CL_SUCCESS == ret) { *devices = d; } else { free(d); } - } - map_cl_error_2_ee(ret); -} - -inline EE retain_device(Device device) { - I32 ret = clRetainDevice(device); - map_cl_error_2_ee(ret); -} - -inline EE release_device(Device device) { - I32 ret = clReleaseDevice(device); - map_cl_error_2_ee(ret); -} - -/** - * - *@brief get device information - * - * @warning please free memory space allocated for value - * - **/ - -inline EE get_device_info(Device device, cl_device_info info, - void** value, U32 *len) { - if(NULL == value) return NULL_POINTER; - - size_t size; - I32 ret = clGetDeviceInfo(device, info, 0, NULL, &size); - if(CL_SUCCESS == ret) { - if(NULL != len) *len = (U32)(size); - void* data = malloc(size); - if(NULL == data) return ALLOC_FAILED; - ret = clGetDeviceInfo(device, info, size, data, NULL); - if(CL_SUCCESS != ret) { free(data); } else { *value = data; } - } - - map_cl_error_2_ee(ret); -} - -#define V_Q_Info(device, info, type, str, modifier) {\ - type v;\ - I32 ret = clGetDeviceInfo(device, info, sizeof(type), &v, NULL);\ - if(CL_SUCCESS != ret){\ - map_cl_error_2_ee(ret);\ - }\ - \ - printf(str "%" modifier "\n", v);\ -} - -#define B_Q_Info(device, info, str) {\ - cl_bool v;\ - I32 ret = clGetDeviceInfo(device, info, sizeof(cl_bool), &v, NULL);\ - if(CL_SUCCESS != ret){\ - map_cl_error_2_ee(ret);\ - }\ - \ - printf(str "%s\n", v? 
"Yes" : "NO");\ -} - -#define STR_Q_Info(device, info, str) {\ - size_t len;\ - I32 ret = clGetDeviceInfo(device, info, 0, NULL, &len);\ - if(SUCCESS != ret){\ - map_cl_error_2_ee(ret);\ - }\ - \ - char* v = (char*) malloc(len+1);\ - ret = clGetDeviceInfo(device, info, len, v, NULL);\ - if(SUCCESS != ret){\ - map_cl_error_2_ee(ret);\ - }\ - \ - v[len] = '\0';\ - printf(str"%s\n", v);\ - free(v);\ - \ -} - - -/** - * @brief list all attributes of device - * - * @param device input - * - * @return - * 0 : success - * -1: error - */ -inline EE list_device_info(Device device){ - printf("..........Device Info..............\n"); - STR_Q_Info(device, CL_DEVICE_NAME, "Device name : "); - V_Q_Info(device, CL_DEVICE_ADDRESS_BITS, U32, "Address Bits : ", "u"); - B_Q_Info(device, CL_DEVICE_AVAILABLE, "Device Available : "); - B_Q_Info(device, CL_DEVICE_COMPILER_AVAILABLE, "Device Compiler Available : "); - B_Q_Info(device, CL_DEVICE_ENDIAN_LITTLE, "Device is little Endian : "); - B_Q_Info(device, CL_DEVICE_ERROR_CORRECTION_SUPPORT, "ECC Supported : "); - STR_Q_Info(device, CL_DEVICE_EXTENSIONS, "Device Extensions : "); - STR_Q_Info(device, CL_DEVICE_OPENCL_C_VERSION, "OpenCL C Version : "); - STR_Q_Info(device, CL_DEVICE_PROFILE, "Device Profile : "); - V_Q_Info(device, CL_DEVICE_PROFILING_TIMER_RESOLUTION, size_t, "Timer Resolution : ", "ld"); - { cl_device_fp_config v; - I32 ret = clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(cl_device_fp_config), &v, NULL); - if(CL_SUCCESS != ret){ - map_cl_error_2_ee(ret); - } - - if(v & CL_FP_DENORM){printf("Device Support Denorm Single Float \n");} - if(v & CL_FP_INF_NAN){printf("Device Support Single Float INF NAN\n");} - if(v & CL_FP_ROUND_TO_NEAREST){printf("Device Support Single Float Round to Nearest\n");} - if(v & CL_FP_ROUND_TO_ZERO){printf("Device Support Single Float Round to Zero \n");} - if(v & CL_FP_ROUND_TO_INF){printf("Device Support Single Float Round to Inf\n");} - if(v & CL_FP_FMA){printf("Device Support Single Float FMA\n");} - if(v & CL_FP_SOFT_FLOAT){printf("Device does not Support Hardware Single Float\n");} - } - - STR_Q_Info(device, CL_DEVICE_VENDOR, "Device Vendor : "); - V_Q_Info(device, CL_DEVICE_VENDOR_ID, U32, "Device Vendor ID : ", "u"); - STR_Q_Info(device, CL_DEVICE_VERSION, "Device Version : "); - STR_Q_Info(device, CL_DRIVER_VERSION, "Driver Version : "); - B_Q_Info(device, CL_DEVICE_HOST_UNIFIED_MEMORY, "Unified Memory Supported : "); - V_Q_Info(device, CL_DEVICE_MAX_PARAMETER_SIZE, size_t, "Max Parameter Size : ", "ld"); - - printf("..............Global Memory Configuration.............\n"); - V_Q_Info(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong, "Max Memory Allocate Size : ", "lu"); - V_Q_Info(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, U32, "Max Base Address Align Size : ", "u"); - V_Q_Info(device, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, U32, "Min Data Type align Size :", "u"); - - V_Q_Info(device, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong, "Global Memory Cache Size : ", "lu"); - { cl_device_mem_cache_type v; - I32 ret = clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof(cl_device_mem_cache_type), &v, NULL); - if(CL_SUCCESS != ret){ - map_cl_error_2_ee(ret); - } - switch(v) { - case CL_NONE: printf("Global Memory does not have Cache \n"); break; - case CL_READ_ONLY_CACHE : printf("Global Memory has Readonly Cache \n"); break; - case CL_READ_WRITE_CACHE : printf("Global Memory has Read Write Cache \n"); break; - } - } - - V_Q_Info(device, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, U32, "Global Memory, Cacheline 
Size : ", "u"); - V_Q_Info(device, CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong, "Global Memory Size : ", "lu"); - //CL_DEVICE_HALF_FP_CONFIG - - printf("..................Image Information...................\n"); - B_Q_Info(device, CL_DEVICE_IMAGE_SUPPORT, "Image Supported : "); - V_Q_Info(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, size_t, "2D Image Max Height : ", "ld"); - V_Q_Info(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, size_t, "2D Image Max Width : ", "ld"); - V_Q_Info(device, CL_DEVICE_IMAGE3D_MAX_DEPTH, size_t, "3D Image Max Depth : ", "ld"); - V_Q_Info(device, CL_DEVICE_IMAGE3D_MAX_HEIGHT, size_t, "3D Image Max Height : ", "ld"); - V_Q_Info(device, CL_DEVICE_IMAGE3D_MAX_WIDTH, size_t, "3D Image Max Width : ", "ld"); - V_Q_Info(device, CL_DEVICE_MAX_READ_IMAGE_ARGS, U32, "Max Read Image Args : ", "u"); - V_Q_Info(device, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, U32, "Max Write Image Args : ", "u"); - V_Q_Info(device, CL_DEVICE_MAX_SAMPLERS, U32, "Max Samples : ", "u"); - - printf(".................Local Memory...............................\n"); - V_Q_Info(device, CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong, "Local Memory Size : ", "lu"); - { cl_device_local_mem_type v; - I32 ret = clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(cl_device_local_mem_type), &v, NULL); - if(CL_SUCCESS != ret){ - map_cl_error_2_ee(ret); - } - switch(v) { - case CL_LOCAL: printf("Device has Dedicate Local Memory\n"); break; - case CL_GLOBAL : printf("Local Memory uses Global Memory\n"); break; - default: - printf("%d\n", __LINE__); - } - } - - printf("...................CU Information...........................\n"); - V_Q_Info(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, U32, "Max Clock Frequency : ", "u"); - V_Q_Info(device, CL_DEVICE_MAX_COMPUTE_UNITS, U32, "Max Compute Units : ", "u"); - - printf(".................Constant Memory Information.............\n"); - V_Q_Info(device, CL_DEVICE_MAX_CONSTANT_ARGS, U32, "Max Constant Args : ", "u"); - V_Q_Info(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, cl_ulong, "Max Constant Buffer Size : ", "lu"); - - printf("...................ND Range Information........................\n"); - V_Q_Info(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, size_t, "Max Work Group Size : ", "ld"); - V_Q_Info(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, U32, "Work Item Dimensions : ", "u"); - - { size_t v[3]; - I32 ret = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, &v, NULL); - if(CL_SUCCESS != ret){ - map_cl_error_2_ee(ret); - } - printf("Max Work Item size : %ld %ld %ld\n", v[0], v[1], v[2]); - } - - printf(".....................Vector Information..................\n"); - V_Q_Info(device, CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, U32, "Native Vector Width Char : ", "u"); - V_Q_Info(device, CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, U32, "Native Vector Width Short : ", "u"); - V_Q_Info(device, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, U32, "Native Vector Width Int : ", "u"); - V_Q_Info(device, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, U32, "Native Vector Width Long : ", "u"); - V_Q_Info(device, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, U32, "Native Vector Width Float : ", "u"); - V_Q_Info(device, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, U32, "Native Vector Width Double : ", "u"); - V_Q_Info(device, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, U32, "Native Vector Width Half : ", "u"); - - V_Q_Info(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, U32, "Preferred Vector Width Char : ", "u"); - V_Q_Info(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, U32, "Preferred Vector Width Short : ", "u"); - V_Q_Info(device, 
CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, U32, "Preferred Vector Width Int : ", "u"); - V_Q_Info(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, U32, "Preferred Vector Width Long : ", "u"); - V_Q_Info(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, U32, "Preferred Vector Width Float : ", "u"); - V_Q_Info(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, U32, "Preferred Vector Width Double : ", "u"); - V_Q_Info(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, U32, "Preferred Vector Width Half : ", "u"); - - return SUCCESS; - -} - -#if defined(__cplusplus) -} -#endif -#endif diff --git a/gcl/include/program.h b/gcl/include/program.h deleted file mode 100644 index 39ae2e22..00000000 --- a/gcl/include/program.h +++ /dev/null @@ -1,238 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
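The platform.h wrappers deleted above compose into a simple discovery flow: select a vendor platform, enumerate its devices, then dump each device's capability report. A sketch under the header's own conventions; the helper name probe_arm_gpus is hypothetical caller code, not part of the patch, and note that platform_get_devices allocates the device array for the caller to free:

// Sketch (hypothetical): pick the ARM platform and print a report per GPU device.
inline EE probe_arm_gpus() {
    Platform plat;
    EE ret = select_platform(VENDOR_ARM, &plat);
    if (SUCCESS != ret) return ret;
    list_platform_info(plat);

    U32 num = 0;
    Device* devs = NULL;
    ret = platform_get_devices(plat, CL_DEVICE_TYPE_GPU, &num, &devs);
    if (SUCCESS != ret) return ret;
    for (U32 i = 0; i < num; i++) {
        list_device_info(devs[i]);   // prints the capability report implemented above
    }
    free(devs);                      // allocated by platform_get_devices; caller frees
    return SUCCESS;
}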
- - - -#ifndef PROGRAM_H_ -#define PROGRAM_H_ - -#ifdef __cplusplus -extern "C" { -#endif -#define check_build_program_error(ret, program, device) {\ - if(SUCCESS != ret){\ - void* buildLog; \ - U32 buildLogSize;\ - ret = get_program_build_info(program, device, CL_PROGRAM_BUILD_LOG, &buildLog, &buildLogSize);\ - if(SUCCESS == ret) { \ - printf("build log: %s\n", (char*)buildLog);\ - free(buildLog);\ - }\ - }\ -}\ - /** - * @brief get build information of program - * @warning please free memory associated with value - **/ - - inline EE get_program_build_info(Program program, - Device device, - cl_program_build_info info, - void* *value, U32 *size) { - if(NULL == value) return NULL_POINTER; - - size_t len; - I32 ret = clGetProgramBuildInfo(program, device, info, 0, NULL, &len); - if(CL_SUCCESS == ret) { - if(NULL != size) *size = (U32)len; - void* data = malloc(len); - if(NULL == data) return ALLOC_FAILED; - ret = clGetProgramBuildInfo(program, device, info, len, data, NULL); - if(CL_SUCCESS == ret) { *value = data; } else { free(data); } - } - - map_cl_error_2_ee(ret); - } - - /** - * @brief create program from source code - * - * @param context input, specify associated context - * @param len input, length of the source string - * @param str input, source code - * @param program output, created program - * - **/ - - inline EE create_program_from_source(Context context, U32* len, CI8* str, Program *program) { - I32 ret; - size_t length = (size_t)(*len); - *program = clCreateProgramWithSource(context, 1, &str, &length, &ret); - map_cl_error_2_ee(ret); - } - - /** - * @brief create program from binary code - * - * @param context input, specify associated context - * @param device input, device to compile the code for - * @param length input, length of the binary - * @param binary input, binary code - * @param binary_status output, compile status for the device - * @param program output, created program - * - **/ - - inline EE create_program_from_binary(Context context, const Device device, - U32* length, CU8 **binary, I32 *binary_status, Program *program) { - I32 ret; - size_t len = *length; - *program = clCreateProgramWithBinary(context, 1, &device, &len, binary, binary_status, &ret); - map_cl_error_2_ee(ret); - } - - /** - * @brief build program - * - **/ - - inline EE build_program(Program program, Device device, CI8 *options) { - I32 ret = clBuildProgram(program, 1, &device, options, NULL, NULL); - if(CL_SUCCESS != ret) check_build_program_error(ret, program, device); - map_cl_error_2_ee(ret); - } - - /** - * @brief create program from source then build it - * - * @param context input, specify associated context - * @param source input, source code - * @param device input, source will be built on this device - * @param options input, options for compiling source - * @param program output, created and built program - * - */ - - inline EE create_build_program_from_source(Context context, U32* length, CI8* source, Device device, CI8* options, Program *program) { - if(NULL == program) return NULL_POINTER; - Program prog; - EE ret = create_program_from_source(context, length, source, &prog); - if(SUCCESS == ret) { - ret = build_program(prog, device, options); - *program = prog; - } - return ret; - } - - /** - * @brief create program from binary then build it - * - **/ - - inline EE create_build_program_from_binary(Context context, Device device, U32* length, CU8** binary, CI8* options, I32 *binary_status, Program *program) { - if(NULL == program) return NULL_POINTER; - Program prog; - EE
ret = create_program_from_binary(context, device, length, binary, binary_status, &prog); - if(SUCCESS == ret) { - ret = build_program(prog, device, options); - *program = prog; - } - return ret; - } - - /** - * @brief get information of program - * @warning please free memory associated with value - **/ - - inline EE get_program_info(Program program, cl_program_info info, void* *value, U32 *size) { - if(NULL == value) return NULL_POINTER; - size_t len; - I32 ret = clGetProgramInfo(program, info, 0, NULL, &len); - if(CL_SUCCESS == ret) { - if(NULL != size) *size = len; - void* data = malloc(len); - if(NULL == data) return ALLOC_FAILED; - ret = clGetProgramInfo(program, info, len, data, NULL); - if(CL_SUCCESS == ret) { *value = data;} else { free(data); } - } - map_cl_error_2_ee(ret); - } - - /** - * @brief get binary of program - * @warning please free memory associated with binary - **/ - inline EE get_program_binary(Program program, U8* *binary, U32 *len) { - size_t size; - I32 ret = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL); - if(CL_SUCCESS == ret){ - *len = (U32)(size); - void* data = malloc(size); - if(NULL == data) return ALLOC_FAILED; - ret = clGetProgramInfo(program, CL_PROGRAM_BINARIES, size, &data, NULL);// warning: CL_PROGRAM_BINARIES expects an array of pointers, hence &data - if(CL_SUCCESS == ret ){*binary = (U8*)(data);} - else{free(data);} - } - map_cl_error_2_ee(ret); - } - - /** - * @brief get binary of source code - * - * @warning binary is allocated inside get_program_binary; free it when it is no longer needed - * - **/ - - inline EE get_program_binary_from_source(Context context, U32* length, CI8* str, Device device, CI8* options, U8* *binary, U32 *len) { - if(NULL == binary) return NULL_POINTER; - - Program program; - EE ret = create_build_program_from_source(context, length, str, device, options, &program); - if(SUCCESS == ret) { ret = get_program_binary(program, binary, len); } - return ret; - } - -/* -inline EE create_program_from_il(Context context, - const void *il, U32 length, Program *program) { -//TODO - I32 ret; - *program = clCreateProgramWithIL(context, il, length, &ret); - map_cl_error_2_ee(ret); -} -*/ - - inline EE release_program(Program program) { - map_cl_error_2_ee(clReleaseProgram(program)); - } - - inline EE compile_program(Program program, - const Device device, - CI8 *options, U32 num_input_headers, const Program *input_headers, - CI8 **header_include_names) { - I32 ret = clCompileProgram(program, 1, &device, - options, num_input_headers, input_headers, header_include_names, - NULL, NULL); - map_cl_error_2_ee(ret); - } - - inline EE link_program(Context context, - const Device device, - CI8* options, U32 num_input_programs, - const Program *input_programs, Program* program) { - I32 ret; - *program = clLinkProgram(context, 1, &device, options, num_input_programs, input_programs, - NULL, NULL, &ret); - map_cl_error_2_ee(ret); - } - - inline EE unload_platform_compiler(Platform p) { - I32 ret = clUnloadPlatformCompiler(p); - map_cl_error_2_ee(ret); - } - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/gcl/tools/gcl_sample/sample.cpp b/gcl/tools/gcl_sample/sample.cpp deleted file mode 100644 index 2102b25f..00000000 --- a/gcl/tools/gcl_sample/sample.cpp +++ /dev/null @@ -1,152 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - -#include"gcl.h" -#include"libkernelbin.h" - - -void setMemDesc(GCLMem_t mem, DataType dt, DataFormat ft, GCLMemType mt, - U32 s0, U32 s1, U32 s2, U32 off0, U32 off1, U32 off2){ - mem->desc.stride[0] = s0 + 2 * off0; - mem->desc.stride[1] = s1 + 2 * off1; - mem->desc.stride[2] = s2; - mem->desc.offset[0] = off0; - mem->desc.offset[1] = off1; - mem->desc.offset[2] = off2; - mem->desc.num = s0 * s1 * s2; - mem->desc.byteSize = s0 * s1 * s2 * bytesOf(dt); - mem->desc.memFormat = ft; - mem->desc.memType = mt; -} - -int main(){ -while(1) { - GCLHandle_t handle; - CHECK_STATUS(gcl_create_handle(&handle)); - CHECK_STATUS(gcl_regist_binMap(handle)); - U32 iw, ih, ic, in; - U32 fw, fh, fc, fn; - U32 sv, pv; - U32 ow, oh, oc, on; - - iw = 4; - ih = 4; - ic = 4; - in = 1; - - fw = 3; - fh = 3; - fc = 4; - fn = 4; - - ow = iw; - oh = ih; - oc = fn; - on = in; - - sv = 1; - pv = 1; - - GCLMem_t input = gcl_create_gclmem(); - GCLMem_t flt = gcl_create_gclmem(); - GCLMem_t bias = gcl_create_gclmem(); - GCLMem_t output = gcl_create_gclmem(); - setMemDesc(input, DT_F16, DF_NCHW, GCL_MEM_BUF, iw, ih, ic, pv, pv, 0); - setMemDesc(flt, DT_F16, DF_NCHW, GCL_MEM_BUF, fw * fh, fc, fn, 0, 0, 0); - setMemDesc(bias, DT_F16, DF_NCHW, GCL_MEM_BUF, fn, 1, 1, 0, 0, 0); - setMemDesc(output,DT_F16, DF_NCHW, GCL_MEM_BUF, ow, oh, oc, 0, 0, 0); - CHECK_STATUS(gcl_create_memory(handle, input)); - CHECK_STATUS(gcl_create_memory(handle, flt)); - CHECK_STATUS(gcl_create_memory(handle, bias)); - CHECK_STATUS(gcl_create_memory(handle, output)); - - U8* iptr = new U8[input->desc.byteSize]; - U8* fptr = new U8[flt->desc.byteSize]; - U8* bptr = new U8[bias->desc.byteSize]; - - F16* ival = (F16*)iptr; - F16* fval = (F16*)fptr; - F16* bval = (F16*)bptr; - for(U32 i = 0; i < input->desc.num; i++){ - ival[i] = (rand() & 1023) / 1024.0 - 0.5; - U32 s0 = input->desc.stride[0]; - U32 s1 = input->desc.stride[1]; - U32 j = i % (s0 * s1); - if((j % s0) == 0 || (j % s0) == s0 - 1) ival[i] = 0; - if( j / s0 == 0 || j / s0 == s1 - 1) ival[i] = 0; - } - - for(U32 i = 0; i < flt->desc.num; i++){ - fval[i] = (rand() & 1023) / 1024.0 - 0.5; - } - - for(U32 i = 0; i < bias->desc.num; i++){ - bval[i] = (rand() & 1023) / 1024.0 - 0.5; - } - - CHECK_STATUS(gcl_trans_memory(handle, (void*)iptr, (void*)input, &input->desc.byteSize, HOST_TO_DEVICE_BUF, CL_TRUE)); - CHECK_STATUS(gcl_trans_memory(handle, (void*)fptr, (void*)flt, &flt->desc.byteSize, HOST_TO_DEVICE_BUF, CL_TRUE)); - 
CHECK_STATUS(gcl_trans_memory(handle, (void*)bptr, (void*)bias, &bias->desc.byteSize, HOST_TO_DEVICE_BUF, CL_TRUE)); -#ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, input, "input")); - CHECK_STATUS(gcl_print_memory(handle, flt, "flt")); - CHECK_STATUS(gcl_print_memory(handle, bias, "bias")); -#endif - - - Kernel kernel; - char kernelname[128]; - for(int i = 0; i < 1; i++){ - sprintf(kernelname, "sample"); - CHECK_STATUS(gcl_create_kernel_binary(handle, kernelname, &kernel)); - U32 iw_str = input->desc.stride[0]; - U32 ih_str = input->desc.stride[1]; - U32 iwh_str = iw_str * ih_str; - - U32 fwh_str = flt->desc.stride[0]; - U32 fc_str = flt->desc.stride[1]; - U32 flt_str = fwh_str * fc_str; - - U32 ow_str = output->desc.stride[0]; - U32 oh_str = output->desc.stride[1]; - U32 oc_str = output->desc.stride[2]; - U32 gs[3]; - gs[0] = ow_str; - gs[1] = oh_str; - gs[2] = oc_str; - U32 dim = 3; - U32 ls[3] = {0, 0, 0}; - CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, iwh_str, fc_str, flt_str, ow_str, oh_str, gs[0], gs[1], input->mem, flt->mem, bias->mem, output->mem)); - CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, "sample")); - CHECK_STATUS(gcl_run_kernelVec(handle)); - - } - -#ifdef _DEBUG - CHECK_STATUS(gcl_print_memory(handle, output, "output")); -#endif - delete[] iptr; - delete[] fptr; - delete[] bptr; - gcl_destroy_gclmem(input); - gcl_destroy_gclmem(flt); - gcl_destroy_gclmem(bias); - gcl_destroy_gclmem(output); - gcl_destroy_handle(handle); - } -} - - - diff --git a/gcl/tools/kernel_lib_compile/sh/adbDeviceNum.sh b/gcl/tools/kernel_lib_compile/sh/adbDeviceNum.sh deleted file mode 100644 index 0522b27e..00000000 --- a/gcl/tools/kernel_lib_compile/sh/adbDeviceNum.sh +++ /dev/null @@ -1,11 +0,0 @@ -adbDeviceNum=($(adb devices | grep ".device$")) -i=0 -length=${#adbDeviceNum[@]} -while [ "$i" -lt "$length" ];do - if - ((i%2!=0)) - then - unset adbDeviceNum[i] - fi - ((i++)) -done diff --git a/gcl/tools/kernel_lib_compile/sh/compile/concat.sh b/gcl/tools/kernel_lib_compile/sh/compile/concat.sh deleted file mode 100644 index 3e701a10..00000000 --- a/gcl/tools/kernel_lib_compile/sh/compile/concat.sh +++ /dev/null @@ -1,18 +0,0 @@ -for file in * - do - if [ "${file##*.}"x = "cl"x ];then - if [[ "${file}" == "concat.cl" ]];then - echo ./gcl_binary --input=$file --output=${file%.*}_11.bin --options=\"${copt} -D A=1 -D N=1\" - echo ./gcl_binary --input=$file --output=${file%.*}_12.bin --options=\"${copt} -D A=1 -D N=2\" - echo ./gcl_binary --input=$file --output=${file%.*}_13.bin --options=\"${copt} -D A=1 -D N=3\" - echo ./gcl_binary --input=$file --output=${file%.*}_14.bin --options=\"${copt} -D A=1 -D N=4\" - echo ./gcl_binary --input=$file --output=${file%.*}_15.bin --options=\"${copt} -D A=1 -D N=5\" - echo ./gcl_binary --input=$file --output=${file%.*}_16.bin --options=\"${copt} -D A=1 -D N=6\" - echo ./gcl_binary --input=$file --output=${file%.*}_17.bin --options=\"${copt} -D A=1 -D N=7\" - echo ./gcl_binary --input=$file --output=${file%.*}_18.bin --options=\"${copt} -D A=1 -D N=8\" - fi - fi - done - - - diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1.sh b/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1.sh deleted file mode 100644 index e9554c64..00000000 --- a/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s1.sh +++ /dev/null @@ -1,102 +0,0 @@ -for file in * - do - if [ "${file##*.}"x = "cl"x ];then - if [[ "${file}" == "conv_direct_s1.cl" ]];then - echo ./gcl_binary --input=$file --output=${file%.*}_111.bin --options=\"${copt} -D F=1 
-D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_131.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_151.bin --options=\"${copt} -D F=1 -D ON=5 -D IN=5 -D LN=5 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_161.bin --options=\"${copt} -D F=1 -D ON=6 -D IN=6 -D LN=6 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_171.bin --options=\"${copt} -D F=1 -D ON=7 -D IN=7 -D LN=7 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_181.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_111.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_131.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_151.bin --options=\"${copt} -D F=1 -D ON=5 -D IN=5 -D LN=5 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_161.bin --options=\"${copt} -D F=1 -D ON=6 -D IN=6 -D LN=6 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_171.bin --options=\"${copt} -D F=1 -D ON=7 -D IN=7 -D LN=7 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_181.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -D KN=1 -DUSE_HALF -DUSE_RELU\" - - echo ./gcl_binary --input=$file --output=${file%.*}_112.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_132.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_112.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_132.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\" - -# echo 
./gcl_binary --input=$file --output=${file%.*}_114.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=4 -DUSE_HALF\" -# echo ./gcl_binary --input=$file --output=${file%.*}_124.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=4 -DUSE_HALF\" -# echo ./gcl_binary --input=$file --output=${file%.*}_relu_114.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU\" -# echo ./gcl_binary --input=$file --output=${file%.*}_relu_124.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU\" - - echo ./gcl_binary --input=$file --output=${file%.*}_311.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_321.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_331.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_341.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -D KN=1 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_351.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -D KN=1 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_361.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -D KN=1 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_371.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -D KN=1 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_381.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -D KN=1 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_311.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_321.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_331.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_341.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_351.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_361.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_371.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_381.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - - echo ./gcl_binary --input=$file --output=${file%.*}_312.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_322.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D 
UN=3 -D Fsq=9 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_332.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_342.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=6 -D LN=6 -D UN=5 -D Fsq=9 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_312.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_322.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_332.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_342.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=6 -D LN=6 -D UN=5 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU\" - -# echo ./gcl_binary --input=$file --output=${file%.*}_314.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=4 -DUSE_HALF\" -# echo ./gcl_binary --input=$file --output=${file%.*}_324.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -D KN=4 -DUSE_HALF\" -# echo ./gcl_binary --input=$file --output=${file%.*}_relu_314.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=4 -DUSE_HALF -DUSE_RELU\" -# echo ./gcl_binary --input=$file --output=${file%.*}_relu_324.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=4 -D LN=4 -D UN=3 -D Fsq=9 -D KN=4 -DUSE_HALF -DUSE_RELU\" - - echo ./gcl_binary --input=$file --output=${file%.*}_511.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_521.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_531.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=1 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_541.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=1 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_551.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_561.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_571.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -D KN=1 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_581.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -D KN=1 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_511.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU \" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_521.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU \" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_531.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_541.bin 
--options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_551.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_561.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_571.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_581.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - - echo ./gcl_binary --input=$file --output=${file%.*}_512.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_522.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_532.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_542.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=2 -DUSE_HALF -D BASICE_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_512.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_522.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_532.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_542.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU -D BASICE_REG\" - -# echo ./gcl_binary --input=$file --output=${file%.*}_514.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=4 -DUSE_HALF\" -# echo ./gcl_binary --input=$file --output=${file%.*}_524.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -D KN=4 -DUSE_HALF\" -# echo ./gcl_binary --input=$file --output=${file%.*}_relu_514.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=4 -DUSE_HALF -DUSE_RELU\" -# echo ./gcl_binary --input=$file --output=${file%.*}_relu_524.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=6 -D LN=6 -D UN=5 -D Fsq=25 -D KN=4 -DUSE_HALF -DUSE_RELU\" - fi - fi - done - - - diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s2.sh b/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s2.sh deleted file mode 100644 index 2ff47db7..00000000 --- a/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_s2.sh +++ /dev/null @@ -1,102 +0,0 @@ -for file in * - do - if [ "${file##*.}"x = "cl"x ];then - if [[ "${file}" == "conv_direct_s2.cl" ]];then - echo ./gcl_binary --input=$file --output=${file%.*}_111.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_131.bin 
--options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_151.bin --options=\"${copt} -D F=1 -D ON=5 -D IN=5 -D LN=5 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_161.bin --options=\"${copt} -D F=1 -D ON=6 -D IN=6 -D LN=6 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_171.bin --options=\"${copt} -D F=1 -D ON=7 -D IN=7 -D LN=7 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_181.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_111.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_121.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_131.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_141.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_151.bin --options=\"${copt} -D F=1 -D ON=5 -D IN=5 -D LN=5 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_161.bin --options=\"${copt} -D F=1 -D ON=6 -D IN=6 -D LN=6 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_171.bin --options=\"${copt} -D F=1 -D ON=7 -D IN=7 -D LN=7 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_181.bin --options=\"${copt} -D F=1 -D ON=8 -D IN=8 -D LN=8 -D Fsq=1 -D KN=1 -DUSE_RELU -DUSE_HALF\" - - echo ./gcl_binary --input=$file --output=${file%.*}_112.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_132.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_112.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_122.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_132.bin --options=\"${copt} -D F=1 -D ON=3 -D IN=3 -D LN=3 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_142.bin --options=\"${copt} -D F=1 -D ON=4 -D IN=4 -D LN=4 -D Fsq=1 -D KN=2 -DUSE_HALF -DUSE_RELU\" - -# echo ./gcl_binary --input=$file --output=${file%.*}_114.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=4 -DUSE_HALF\" -# echo ./gcl_binary --input=$file --output=${file%.*}_124.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=4 
-DUSE_HALF\" -# echo ./gcl_binary --input=$file --output=${file%.*}_relu_114.bin --options=\"${copt} -D F=1 -D ON=1 -D IN=1 -D LN=1 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU\" -# echo ./gcl_binary --input=$file --output=${file%.*}_relu_124.bin --options=\"${copt} -D F=1 -D ON=2 -D IN=2 -D LN=2 -D Fsq=1 -D KN=4 -DUSE_HALF -DUSE_RELU\" - - echo ./gcl_binary --input=$file --output=${file%.*}_311.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=1 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_321.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=9 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_331.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=9 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_341.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_351.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_361.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_371.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_381.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_311.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_321.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_331.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_341.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_351.bin --options=\"${copt} -D F=3 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_361.bin --options=\"${copt} -D F=3 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_371.bin --options=\"${copt} -D F=3 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_381.bin --options=\"${copt} -D F=3 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=9 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - - echo ./gcl_binary --input=$file --output=${file%.*}_312.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_322.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_332.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file 
--output=${file%.*}_342.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -D KN=2 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_312.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_322.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_332.bin --options=\"${copt} -D F=3 -D ON=3 -D IN=7 -D LN=7 -D UN=6 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_342.bin --options=\"${copt} -D F=3 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=9 -D KN=2 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - -# echo ./gcl_binary --input=$file --output=${file%.*}_314.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=4 -DUSE_HALF\" -# echo ./gcl_binary --input=$file --output=${file%.*}_324.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=4 -DUSE_HALF\" -# echo ./gcl_binary --input=$file --output=${file%.*}_relu_314.bin --options=\"${copt} -D F=3 -D ON=1 -D IN=3 -D LN=3 -D UN=2 -D Fsq=9 -D KN=4 -DUSE_HALF -DUSE_RELU\" -# echo ./gcl_binary --input=$file --output=${file%.*}_relu_324.bin --options=\"${copt} -D F=3 -D ON=2 -D IN=5 -D LN=5 -D UN=4 -D Fsq=9 -D KN=4 -DUSE_HALF -DUSE_RELU\" - - echo ./gcl_binary --input=$file --output=${file%.*}_511.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_521.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_531.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_541.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_551.bin --options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_561.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_571.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_581.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -D KN=1 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_511.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=1 -D LN=0 -D UN=0 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_521.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=2 -D LN=1 -D UN=1 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_531.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_541.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_551.bin 
--options=\"${copt} -D F=5 -D ON=5 -D IN=5 -D LN=4 -D UN=4 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_561.bin --options=\"${copt} -D F=5 -D ON=6 -D IN=6 -D LN=5 -D UN=5 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_571.bin --options=\"${copt} -D F=5 -D ON=7 -D IN=7 -D LN=6 -D UN=6 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_581.bin --options=\"${copt} -D F=5 -D ON=8 -D IN=8 -D LN=7 -D UN=7 -D Fsq=25 -D KN=1 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - - echo ./gcl_binary --input=$file --output=${file%.*}_512.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_522.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -D KN=2 -DUSE_HALF\" - echo ./gcl_binary --input=$file --output=${file%.*}_532.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=2 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_542.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=2 -DUSE_HALF -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_512.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_522.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_532.bin --options=\"${copt} -D F=5 -D ON=3 -D IN=3 -D LN=2 -D UN=2 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_542.bin --options=\"${copt} -D F=5 -D ON=4 -D IN=4 -D LN=3 -D UN=3 -D Fsq=25 -D KN=2 -DUSE_HALF -DUSE_RELU -DBASIC_REG\" - -# echo ./gcl_binary --input=$file --output=${file%.*}_514.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=4 -DUSE_HALF\" -# echo ./gcl_binary --input=$file --output=${file%.*}_524.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -D KN=4 -DUSE_HALF\" -# echo ./gcl_binary --input=$file --output=${file%.*}_relu_514.bin --options=\"${copt} -D F=5 -D ON=1 -D IN=5 -D LN=5 -D UN=4 -D Fsq=25 -D KN=4 -DUSE_HALF -DUSE_RELU\" -# echo ./gcl_binary --input=$file --output=${file%.*}_relu_524.bin --options=\"${copt} -D F=5 -D ON=2 -D IN=7 -D LN=7 -D UN=6 -D Fsq=25 -D KN=4 -DUSE_HALF -DUSE_RELU\" - fi - fi - done - - - diff --git a/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_spe_fwhs1.sh b/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_spe_fwhs1.sh deleted file mode 100644 index 67663d6e..00000000 --- a/gcl/tools/kernel_lib_compile/sh/compile/conv_direct_spe_fwhs1.sh +++ /dev/null @@ -1,19 +0,0 @@ -for file in * - do - if [ "${file##*.}"x = "cl"x ];then - if [[ "${file}" == "conv_direct_spe_fwhs1.cl" ]];then - echo ./gcl_binary --input=$file --output=${file%.*}_1.bin --options=\"${copt} -D OC=1\" - echo ./gcl_binary --input=$file --output=${file%.*}_2.bin --options=\"${copt} -D OC=2\" - echo ./gcl_binary --input=$file --output=${file%.*}_3.bin --options=\"${copt} -D OC=3\" - echo ./gcl_binary --input=$file --output=${file%.*}_4.bin --options=\"${copt} -D OC=4\" - echo ./gcl_binary --input=$file --output=${file%.*}_8.bin --options=\"${copt} -D OC=8\" - echo ./gcl_binary --input=$file 
--output=${file%.*}_16.bin --options=\"${copt} -D OC=16\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_4.bin --options=\"${copt} -D OC=4 -D USE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_8.bin --options=\"${copt} -D OC=8 -D USE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_16.bin --options=\"${copt} -D OC=16 -D USE_RELU\" - fi - fi - done - - - diff --git a/gcl/tools/kernel_lib_compile/sh/compile/eltwise.sh b/gcl/tools/kernel_lib_compile/sh/compile/eltwise.sh deleted file mode 100644 index 4b5da516..00000000 --- a/gcl/tools/kernel_lib_compile/sh/compile/eltwise.sh +++ /dev/null @@ -1,36 +0,0 @@ -for file in * - do - if [ "${file##*.}"x = "cl"x ];then - if [[ "${file}" == "eltwise.cl" ]];then - echo ./gcl_binary --input=$file --output=${file%.*}_max1.bin --options=\"${copt} -D N=1 -D TP=max -DUSE_MAX\" - echo ./gcl_binary --input=$file --output=${file%.*}_max2.bin --options=\"${copt} -D N=2 -D TP=max -DUSE_MAX\" - echo ./gcl_binary --input=$file --output=${file%.*}_max3.bin --options=\"${copt} -D N=3 -D TP=max -DUSE_MAX\" - echo ./gcl_binary --input=$file --output=${file%.*}_max4.bin --options=\"${copt} -D N=4 -D TP=max -DUSE_MAX\" - echo ./gcl_binary --input=$file --output=${file%.*}_max5.bin --options=\"${copt} -D N=5 -D TP=max -DUSE_MAX\" - echo ./gcl_binary --input=$file --output=${file%.*}_max6.bin --options=\"${copt} -D N=6 -D TP=max -DUSE_MAX\" - echo ./gcl_binary --input=$file --output=${file%.*}_max7.bin --options=\"${copt} -D N=7 -D TP=max -DUSE_MAX\" - echo ./gcl_binary --input=$file --output=${file%.*}_max8.bin --options=\"${copt} -D N=8 -D TP=max -DUSE_MAX\" - - echo ./gcl_binary --input=$file --output=${file%.*}_sum1.bin --options=\"${copt} -D N=1 -D TP=sum -DUSE_SUM\" - echo ./gcl_binary --input=$file --output=${file%.*}_sum2.bin --options=\"${copt} -D N=2 -D TP=sum -DUSE_SUM\" - echo ./gcl_binary --input=$file --output=${file%.*}_sum3.bin --options=\"${copt} -D N=3 -D TP=sum -DUSE_SUM\" - echo ./gcl_binary --input=$file --output=${file%.*}_sum4.bin --options=\"${copt} -D N=4 -D TP=sum -DUSE_SUM\" - echo ./gcl_binary --input=$file --output=${file%.*}_sum5.bin --options=\"${copt} -D N=5 -D TP=sum -DUSE_SUM\" - echo ./gcl_binary --input=$file --output=${file%.*}_sum6.bin --options=\"${copt} -D N=6 -D TP=sum -DUSE_SUM\" - echo ./gcl_binary --input=$file --output=${file%.*}_sum7.bin --options=\"${copt} -D N=7 -D TP=sum -DUSE_SUM\" - echo ./gcl_binary --input=$file --output=${file%.*}_sum8.bin --options=\"${copt} -D N=8 -D TP=sum -DUSE_SUM\" - - echo ./gcl_binary --input=$file --output=${file%.*}_prod1.bin --options=\"${copt} -D N=1 -D TP=prod -DUSE_PROD\" - echo ./gcl_binary --input=$file --output=${file%.*}_prod2.bin --options=\"${copt} -D N=2 -D TP=prod -DUSE_PROD\" - echo ./gcl_binary --input=$file --output=${file%.*}_prod3.bin --options=\"${copt} -D N=3 -D TP=prod -DUSE_PROD\" - echo ./gcl_binary --input=$file --output=${file%.*}_prod4.bin --options=\"${copt} -D N=4 -D TP=prod -DUSE_PROD\" - echo ./gcl_binary --input=$file --output=${file%.*}_prod5.bin --options=\"${copt} -D N=5 -D TP=prod -DUSE_PROD\" - echo ./gcl_binary --input=$file --output=${file%.*}_prod6.bin --options=\"${copt} -D N=6 -D TP=prod -DUSE_PROD\" - echo ./gcl_binary --input=$file --output=${file%.*}_prod7.bin --options=\"${copt} -D N=7 -D TP=prod -DUSE_PROD\" - echo ./gcl_binary --input=$file --output=${file%.*}_prod8.bin --options=\"${copt} -D N=8 -D TP=prod -DUSE_PROD\" - fi - fi - done - - - diff --git 
a/gcl/tools/kernel_lib_compile/sh/compile/gemm_tn.sh b/gcl/tools/kernel_lib_compile/sh/compile/gemm_tn.sh deleted file mode 100644 index 762c9eff..00000000 --- a/gcl/tools/kernel_lib_compile/sh/compile/gemm_tn.sh +++ /dev/null @@ -1,114 +0,0 @@ -for file in * - do - if [ "${file##*.}"x = "cl"x ];then - if [[ "${file}" == gemm_tn.cl ]];then - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_41.bin --options=\"${copt} -D LM=4 -D LN=1 -D UN=0 -DUSE_NCWHC4\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_42.bin --options=\"${copt} -D LM=4 -D LN=2 -D UN=1 -DUSE_NCWHC4\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_43.bin --options=\"${copt} -D LM=4 -D LN=3 -D UN=2 -DUSE_NCWHC4\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_44.bin --options=\"${copt} -D LM=4 -D LN=4 -D UN=3 -DUSE_NCWHC4\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_45.bin --options=\"${copt} -D LM=4 -D LN=5 -D UN=4 -DUSE_NCWHC4\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_46.bin --options=\"${copt} -D LM=4 -D LN=6 -D UN=5 -DUSE_NCWHC4\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_47.bin --options=\"${copt} -D LM=4 -D LN=7 -D UN=6 -DUSE_NCWHC4\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_48.bin --options=\"${copt} -D LM=4 -D LN=8 -D UN=7 -DUSE_NCWHC4\" - - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_81.bin --options=\"${copt} -D LM=8 -D LN=1 -D UN=0 -DUSE_NCWHC4\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_82.bin --options=\"${copt} -D LM=8 -D LN=2 -D UN=1 -DUSE_NCWHC4\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_83.bin --options=\"${copt} -D LM=8 -D LN=3 -D UN=2 -DUSE_NCWHC4\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_84.bin --options=\"${copt} -D LM=8 -D LN=4 -D UN=3 -DUSE_NCWHC4\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_85.bin --options=\"${copt} -D LM=8 -D LN=5 -D UN=4 -DUSE_NCWHC4\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_86.bin --options=\"${copt} -D LM=8 -D LN=6 -D UN=5 -DUSE_NCWHC4\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_87.bin --options=\"${copt} -D LM=8 -D LN=7 -D UN=6 -DUSE_NCWHC4\" - echo ./gcl_binary --input=$file --output=${file%.*}_ncwhc4_88.bin --options=\"${copt} -D LM=8 -D LN=8 -D UN=7 -DUSE_NCWHC4\" - - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_41.bin --options=\"${copt} -D LM=4 -D LN=1 -D UN=0 -DUSE_NCWHC4 -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_42.bin --options=\"${copt} -D LM=4 -D LN=2 -D UN=1 -DUSE_NCWHC4 -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_43.bin --options=\"${copt} -D LM=4 -D LN=3 -D UN=2 -DUSE_NCWHC4 -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_44.bin --options=\"${copt} -D LM=4 -D LN=4 -D UN=3 -DUSE_NCWHC4 -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_45.bin --options=\"${copt} -D LM=4 -D LN=5 -D UN=4 -DUSE_NCWHC4 -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_46.bin --options=\"${copt} -D LM=4 -D LN=6 -D UN=5 -DUSE_NCWHC4 -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_47.bin --options=\"${copt} -D LM=4 -D LN=7 -D UN=6 -DUSE_NCWHC4 -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_48.bin --options=\"${copt} -D LM=4 -D LN=8 -D UN=7 -DUSE_NCWHC4 -DUSE_RELU\" - - echo ./gcl_binary 
--input=$file --output=${file%.*}_relu_ncwhc4_81.bin --options=\"${copt} -D LM=8 -D LN=1 -D UN=0 -DUSE_NCWHC4 -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_82.bin --options=\"${copt} -D LM=8 -D LN=2 -D UN=1 -DUSE_NCWHC4 -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_83.bin --options=\"${copt} -D LM=8 -D LN=3 -D UN=2 -DUSE_NCWHC4 -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_84.bin --options=\"${copt} -D LM=8 -D LN=4 -D UN=3 -DUSE_NCWHC4 -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_85.bin --options=\"${copt} -D LM=8 -D LN=5 -D UN=4 -DUSE_NCWHC4 -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_86.bin --options=\"${copt} -D LM=8 -D LN=6 -D UN=5 -DUSE_NCWHC4 -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_87.bin --options=\"${copt} -D LM=8 -D LN=7 -D UN=6 -DUSE_NCWHC4 -DUSE_RELU\" - echo ./gcl_binary --input=$file --output=${file%.*}_relu_ncwhc4_88.bin --options=\"${copt} -D LM=8 -D LN=8 -D UN=7 -DUSE_NCWHC4 -DUSE_RELU\" - - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_13.bin --options=\"${copt} -D LM=1 -D LN=3 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_14.bin --options=\"${copt} -D LM=1 -D LN=4 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_15.bin --options=\"${copt} -D LM=1 -D LN=5 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_16.bin --options=\"${copt} -D LM=1 -D LN=6 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_17.bin --options=\"${copt} -D LM=1 -D LN=7 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_18.bin --options=\"${copt} -D LM=1 -D LN=8 -D NO_BIAS\" - - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_22.bin --options=\"${copt} -D LM=2 -D LN=2 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_23.bin --options=\"${copt} -D LM=2 -D LN=3 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_24.bin --options=\"${copt} -D LM=2 -D LN=4 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_25.bin --options=\"${copt} -D LM=2 -D LN=5 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_26.bin --options=\"${copt} -D LM=2 -D LN=6 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_27.bin --options=\"${copt} -D LM=2 -D LN=7 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_28.bin --options=\"${copt} -D LM=2 -D LN=8 -D NO_BIAS\" - - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_31.bin --options=\"${copt} -D LM=3 -D LN=1 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_32.bin --options=\"${copt} -D LM=3 -D LN=2 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_33.bin --options=\"${copt} -D LM=3 -D LN=3 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_34.bin --options=\"${copt} -D LM=3 -D LN=4 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_35.bin --options=\"${copt} -D LM=3 -D LN=5 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_36.bin --options=\"${copt} -D LM=3 -D LN=6 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_37.bin --options=\"${copt} -D LM=3 -D LN=7 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_38.bin 
--options=\"${copt} -D LM=3 -D LN=8 -D NO_BIAS\" - - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_41.bin --options=\"${copt} -D LM=4 -D LN=1 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_42.bin --options=\"${copt} -D LM=4 -D LN=2 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_43.bin --options=\"${copt} -D LM=4 -D LN=3 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_44.bin --options=\"${copt} -D LM=4 -D LN=4 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_45.bin --options=\"${copt} -D LM=4 -D LN=5 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_46.bin --options=\"${copt} -D LM=4 -D LN=6 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_47.bin --options=\"${copt} -D LM=4 -D LN=7 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_48.bin --options=\"${copt} -D LM=4 -D LN=8 -D NO_BIAS\" - - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_51.bin --options=\"${copt} -D LM=5 -D LN=1 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_52.bin --options=\"${copt} -D LM=5 -D LN=2 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_53.bin --options=\"${copt} -D LM=5 -D LN=3 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_54.bin --options=\"${copt} -D LM=5 -D LN=4 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_55.bin --options=\"${copt} -D LM=5 -D LN=5 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_56.bin --options=\"${copt} -D LM=5 -D LN=6 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_57.bin --options=\"${copt} -D LM=5 -D LN=7 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_58.bin --options=\"${copt} -D LM=5 -D LN=8 -D NO_BIAS\" - - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_61.bin --options=\"${copt} -D LM=6 -D LN=1 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_62.bin --options=\"${copt} -D LM=6 -D LN=2 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_63.bin --options=\"${copt} -D LM=6 -D LN=3 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_64.bin --options=\"${copt} -D LM=6 -D LN=4 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_65.bin --options=\"${copt} -D LM=6 -D LN=5 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_66.bin --options=\"${copt} -D LM=6 -D LN=6 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_67.bin --options=\"${copt} -D LM=6 -D LN=7 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_68.bin --options=\"${copt} -D LM=6 -D LN=8 -D NO_BIAS\" - - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_71.bin --options=\"${copt} -D LM=7 -D LN=1 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_72.bin --options=\"${copt} -D LM=7 -D LN=2 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_73.bin --options=\"${copt} -D LM=7 -D LN=3 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_74.bin --options=\"${copt} -D LM=7 -D LN=4 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_75.bin --options=\"${copt} -D LM=7 -D LN=5 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_76.bin --options=\"${copt} -D LM=7 -D LN=6 
-D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_77.bin --options=\"${copt} -D LM=7 -D LN=7 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_78.bin --options=\"${copt} -D LM=7 -D LN=8 -D NO_BIAS\" - - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_81.bin --options=\"${copt} -D LM=8 -D LN=1 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_82.bin --options=\"${copt} -D LM=8 -D LN=2 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_83.bin --options=\"${copt} -D LM=8 -D LN=3 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_84.bin --options=\"${copt} -D LM=8 -D LN=4 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_85.bin --options=\"${copt} -D LM=8 -D LN=5 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_86.bin --options=\"${copt} -D LM=8 -D LN=6 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_87.bin --options=\"${copt} -D LM=8 -D LN=7 -D NO_BIAS\" - echo ./gcl_binary --input=$file --output=${file%.*}_nobias_88.bin --options=\"${copt} -D LM=8 -D LN=8 -D NO_BIAS\" - fi - fi - done - - - diff --git a/image/src/CMakeLists.txt b/image/src/CMakeLists.txt deleted file mode 100644 index bc70c095..00000000 --- a/image/src/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -if (USE_GENERAL) - file(GLOB general_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/general/*.cpp) -endif (USE_GENERAL) - -if (USE_NEON) - file(GLOB arm_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/*.cpp) -endif (USE_NEON) - -file(GLOB srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) -set(srcs "${srcs};${general_srcs};${arm_srcs}") - -include_directories(${CMAKE_CURRENT_SOURCE_DIR}) - -# shared library -ADD_LIBRARY(${PROJECT_NAME} SHARED ${srcs}) - -# static library -ADD_LIBRARY(${PROJECT_NAME}_static STATIC ${srcs}) - -SET_TARGET_PROPERTIES(${PROJECT_NAME}_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}") -SET_TARGET_PROPERTIES(${PROJECT_NAME} PROPERTIES CLEAN_DIRECT_OUTPUT 1) -SET_TARGET_PROPERTIES(${PROJECT_NAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) diff --git a/image/src/resize.cpp b/image/src/resize.cpp deleted file mode 100644 index 290c1821..00000000 --- a/image/src/resize.cpp +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -#include "image.h" -#ifdef _USE_GENERAL -#include "cpu/general/image_general.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/image_arm.h" -#endif -#include - -// params is a pointer to either the target size or the resize ratios -// When resizeDesc specifies DT_U32, params should point to target sizes (height and width) -// When resizeDesc specifies DT_F32, params should point to resize ratios -EE resize_infer_output_size(TensorDesc inputDesc, ResizeDesc resizeDesc, void* params, - TensorDesc* outputDesc, U32* outputBytes) -{ - if (nullptr == outputDesc || nullptr == outputBytes) { - CHECK_STATUS(NULL_POINTER); - } - DataType idt; - DataFormat idf; - U32 in, ic, ih, iw; - U32 oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - - switch(resizeDesc.paramDT) { - case DT_F32: { - F32 *scales = (F32*)params; - oh = ih * scales[0]; - ow = iw * scales[1]; - break; - } - case DT_U32: { - U32 *len = (U32*)params; - oh = len[0]; - ow = len[1]; - break; - } - default: { - return NOT_SUPPORTED; - } - } - - *outputDesc = tensor4df(idt, idf, in, ic, oh, ow); - *outputBytes = tensorNumBytes(*outputDesc); - return SUCCESS; -} - -EE resize(TensorDesc inputDesc, void* input, - TensorDesc outputDesc, void* output, - Arch arch) -{ - DataType idt, odt; - DataFormat idf, odf; - U32 in, ic, ih, iw; - U32 on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - - CHECK_REQUIREMENT(in == on && ic == oc); - - if (ih == oh && iw == ow) { - memcpy(output, input, tensorNumBytes(inputDesc)); - return SUCCESS; - } - - EE ret = NOT_SUPPORTED; - if (arch == CPU_GENERAL) { -#ifdef _USE_GENERAL - ret = resize_bilinear_general(inputDesc, input, - outputDesc, output); -#endif -#ifdef _USE_NEON - } else { - ret = resize_bilinear_arm(inputDesc, input, - outputDesc, output); -#endif - } - return ret; -} diff --git a/inference/CMakeLists.txt b/inference/CMakeLists.txt index 9599b2f6..b1e7ea08 100644 --- a/inference/CMakeLists.txt +++ b/inference/CMakeLists.txt @@ -1,33 +1,19 @@ cmake_minimum_required(VERSION 3.2) -file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/bolt.cmake ${BOLT_ROOT}/bolt.cmake) +file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake) if (BOLT_CONFIGURE_FILE) include(${BOLT_CONFIGURE_FILE}) else (BOLT_CONFIGURE_FILE) message(FATAL_ERROR " -FATAL: can not find bolt.cmake in directory, +FATAL: can not find bolt.cmake in /common/cmakes directory, please set shell or cmake environment variable BOLT_ROOT. 
") endif (BOLT_CONFIGURE_FILE) project(inference) -set_policy() - -SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${BOLT_ROOT}/cmakes") -find_package(Uni) -find_package(ModelTools) -find_package(Image) -find_package(TensorComputing) -if(USE_MALI) - find_package(Gcl) -endif(USE_MALI) -if(BUILD_TEST) - find_package(jpeg) -endif(BUILD_TEST) - -set_project_install_directory() - -set_c_cxx_flags() - -add_subdirectory(src) +add_subdirectory(engine) +if (USE_FLOW) + add_subdirectory(flow) +endif (USE_FLOW) +add_subdirectory(examples) diff --git a/tensor_computing/CMakeLists.txt b/inference/engine/CMakeLists.txt similarity index 50% rename from tensor_computing/CMakeLists.txt rename to inference/engine/CMakeLists.txt index d5cebaf4..ac1be1a5 100644 --- a/tensor_computing/CMakeLists.txt +++ b/inference/engine/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.2) -file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/bolt.cmake ${BOLT_ROOT}/bolt.cmake) +file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake) if (BOLT_CONFIGURE_FILE) include(${BOLT_CONFIGURE_FILE}) else (BOLT_CONFIGURE_FILE) @@ -10,20 +10,19 @@ FATAL: can not find bolt.cmake in directory, ") endif (BOLT_CONFIGURE_FILE) -project(tensor_computing) +project(engine) -set_policy() - -SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${BOLT_ROOT}/cmakes") -find_package(Uni) -find_package(BlasEnhance) -find_package(TensorComputing) -if(USE_MALI) - find_package(Gcl) -endif(USE_MALI) - -set_project_install_directory() +if (BUILD_TEST) + find_package(jpeg) +endif (BUILD_TEST) set_c_cxx_flags() +include_engine() + add_subdirectory(src) +add_subdirectory(tools) + +install(DIRECTORY api/java + api/c + DESTINATION include) diff --git a/inference/engine/api/c/bolt.h b/inference/engine/api/c/bolt.h new file mode 100644 index 00000000..4e4428bc --- /dev/null +++ b/inference/engine/api/c/bolt.h @@ -0,0 +1,341 @@ +/** + * @file + * @brief C API Document + * + * @copyright + * @code + * Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE + * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * @endcode
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** inference pipeline handle */
+typedef void *ModelHandle;
+
+/** result data memory handle */
+typedef void *ResultHandle;
+
+/** CPU affinity policy */
+typedef enum {
+    CPU_HIGH_PERFORMANCE = 0,  ///< performance is high priority (use big cores)
+    CPU_LOW_POWER = 1,  ///< low power is high priority (use small cores)
+    GPU = 2  ///< use GPU
+} AFFINITY_TYPE;
+
+/** heterogeneous device type */
+typedef enum {
+    CPU_SERIAL = 0,  ///< CPU serial
+    CPU_ARM_V7 = 1,  ///< ARMv7 CPU
+    CPU_ARM_V8 = 2,  ///< ARMv8 CPU
+    CPU_ARM_A55 = 3,  ///< ARM A55 CPU
+    CPU_ARM_A76 = 4,  ///< ARM A76 CPU
+    CPU_X86_AVX2 = 5,  ///< X86_64 AVX2 CPU
+    GPU_MALI = 10  ///< ARM MALI GPU
+} DEVICE_TYPE;
+
+/** data precision */
+typedef enum {
+    FP_32 = 0,  ///< 32 bit float
+    FP_16 = 1,  ///< 16 bit float
+    INT_32 = 2,  ///< 32 bit integer
+    UINT_32 = 3  ///< 32 bit unsigned integer
+} DATA_TYPE;
+
+/** multi-dimension data format */
+typedef enum {
+    NCHW = 0,  ///< batch->channel->height->width data order
+    NHWC = 1,  ///< batch->height->width->channel data order
+    NCHWC8 = 2,  ///< batch->channel/8->height->width->8-channel-element data order
+    MTK = 3,  ///< batch->time->unit data order
+    NORMAL = 4  ///< batch->unit data order
+} DATA_FORMAT;
+
+/**
+ * @brief create model from file
+ * @param modelPath model file path
+ * @param affinity CPU affinity setting
+ * @param algoPath file path used to save and load the algorithm selection result
+ *
+ * @return inference pipeline handle
+ *
+ * @note destroy the model when the pipeline ends
+ * @code
+ *     ModelHandle handle = CreateModel(...);
+ *     ...
+ *     DestroyModel(handle);
+ * @endcode
+ * a valid algoPath can reduce the time spent in PrepareModel significantly:
+ * algorithm selection, which is usually time consuming, then only needs to run once;
+ * the selection result is saved to the file path you set and loaded on the next run,
+ * so the selection does not have to be repeated.
+ * setting a valid algoPath is strongly suggested, especially when running on the GPU
+ * @note
+ * if your input size or your model changes, please delete the previously saved algorithm file;
+ * if any unexpected error happens, you can try deleting the algorithm file and running again
+ */
+ModelHandle CreateModel(const char *modelPath, AFFINITY_TYPE affinity, const char *algoPath);
+
+/**
+ * @brief create model from a file stream
+ * Other parameters are the same as CreateModel
+ **/
+ModelHandle CreateModelWithFileStream(
+    const char *modelFileStream, AFFINITY_TYPE affinity, const char *algoFileStream);
+
+/**
+ * @brief get the number of model inputs from a ModelHandle
+ * @param ih inference pipeline handle
+ *
+ * @return the number of inputs
+ */
+int GetNumInputsFromModel(ModelHandle ih);
+
+/**
+ * @brief get the input data info stored in the model handle, as read from the .bolt file
+ * @param ih inference pipeline handle
+ * @param number_inputs the number of inputs
+ * @param inputNames the array of all input data's names
+ * @param n the array of all input data's n dimensions
+ * @param c the array of all input data's c dimensions
+ * @param h the array of all input data's h dimensions
+ * @param w the array of all input data's w dimensions
+ * @param dt the array of all input data's data types
+ * @param df the array of all input data's data formats
+ *
+ * @return
+ * @note
+ * the arrays behind inputNames/n/c/h/w must be allocated and managed by the user,
+ * with space for at least number_inputs elements each
+ */
+void GetInputDataInfoFromModel(ModelHandle ih,
+    const int number_inputs,
+    char **inputNames,
+    int *n,
+    int *c,
+    int *h,
+    int *w,
+    DATA_TYPE *dt,
+    DATA_FORMAT *df);
+
+/**
+ * @brief finish preparing the model inference engine
+ * @param ih model inference handle
+ * @param num_input the number of input data
+ * @param name the array of all input data's names in string format
+ * @param n the array of all input data's n dimensions
+ * @param c the array of all input data's c dimensions
+ * @param h the array of all input data's h dimensions
+ * @param w the array of all input data's w dimensions
+ * @param dt_input the array of all input data's data types
+ * @param df_input the array of all input data's data formats
+ *
+ * @return
+ */
+void PrepareModel(ModelHandle ih,
+    const int num_input,
+    char **name,
+    const int *n,
+    const int *c,
+    const int *h,
+    const int *w,
+    const DATA_TYPE *dt_input,
+    const DATA_FORMAT *df_input);
+
+/**
+ * @brief clone a model
+ * @param ih inference pipeline handle of the model to clone
+ *
+ * @return inference pipeline handle
+ **/
+ModelHandle CloneModel(ModelHandle ih);
+
+/**
+ * @brief resize model input size
+ * @param ih model inference handle
+ * @param num_input the number of input data
+ * @param name the array of all input data's names in string format
+ * @param n the array of all input data's n dimensions
+ * @param c the array of all input data's c dimensions
+ * @param h the array of all input data's h dimensions
+ * @param w the array of all input data's w dimensions
+ * @param dt_input the array of all input data's data types
+ * @param df_input the array of all input data's data formats
+ *
+ * @return
+ *
+ * @code
+ *     // ResizeModelInput must be called after PrepareModel
+ *     PrepareModel(...);
+ *     ResizeModelInput(...);
+ *     RunModel(...);
+ * @endcode
+ */
+void ResizeModelInput(ModelHandle ih,
+    const int num_input,
+    char **name,
+    const int *n,
+    const int *c,
+    const int *h,
+    const int *w,
+    const DATA_TYPE *dt_input,
+    const DATA_FORMAT *df_input);
+
+/**
+ * @brief malloc result data memory
+ * @param ih inference pipeline handle
+ *
+ * @return result data memory handle
+ */
+ResultHandle AllocAllResultHandle(ModelHandle ih);
+
+/**
+ * @brief malloc result data memory according to user specification
+ * @param ih inference pipeline handle
+ * @param num_outputs the number of tensors needed
+ * @param outputNames the array of needed tensor names
+ *
+ * @return result data memory handle
+ */
+ResultHandle AllocSpecificResultHandle(ModelHandle ih, const int num_outputs, char **outputNames);
+
+/**
+ * @brief clone result handle
+ * @param ir a result data handle
+ *
+ * @return result data memory handle
+ **/
+ResultHandle CloneResultHandle(ResultHandle ir);
+
+/**
+ * @brief set the process to run on a specified CPU core
+ * @param ih inference pipeline handle
+ * @param cpu_id cpu core id (0, 1, 2...)
+ * @param device cpu core architecture (e.g. CPU_ARM_A76)
+ *
+ * @return
+ */
+void SetRuntimeDevice(ModelHandle ih, int cpu_id, DEVICE_TYPE device);
+
+/**
+ * @brief set process CPU affinity automatically according to average CPU occupancy
+ * @param ih inference pipeline handle
+ *
+ * @return
+ */
+void SetRuntimeDeviceDynamic(ModelHandle ih);
+
+/**
+ * @brief run model inference on the given inputs
+ * @param ih inference pipeline handle
+ * @param ir result data memory handle
+ * @param num_input the number of input data
+ * @param inputNames the array of all input data's names in string format
+ * @param mem the array of all input data
+ *
+ * @return
+ */
+void RunModel(ModelHandle ih, ResultHandle ir, const int num_input, char **inputNames, void **mem);
+
+/**
+ * @brief get the number of model outputs from a ResultHandle
+ * @param ir result data memory handle
+ *
+ * @return the number of outputs
+ */
+int GetNumOutputsFromResultHandle(ResultHandle ir);
+
+/**
+ * @brief get output data info from a ResultHandle
+ * @param ir result data memory handle
+ * @param num_outputs the number of output data
+ * @param outputNames the array of all output data's names
+ * @param n the array of all output data's n dimensions
+ * @param c the array of all output data's c dimensions
+ * @param h the array of all output data's h dimensions
+ * @param w the array of all output data's w dimensions
+ * @param dt_output the array of all output data's data types
+ * @param df_output the array of all output data's data formats
+ *
+ * @return
+ * @note
+ * the arrays behind outputNames/n/c/h/w must be allocated and managed by the user,
+ * with space for at least num_outputs elements each
+ */
+void GetOutputDataInfoFromResultHandle(ResultHandle ir,
+    int num_outputs,
+    char **outputNames,
+    int *n,
+    int *c,
+    int *h,
+    int *w,
+    DATA_TYPE *dt_output,
+    DATA_FORMAT *df_output);
+
+/**
+ * @brief get data from a ResultHandle; by default the internal output pointers are
+ * passed back. If you need the data copied into your own buffers, use
+ * CopyOutputsFromResultHandle instead
+ * @param ir result data memory handle
+ * @param num_outputs the number of output data
+ * @param outputNames the array of all output data's names
+ * @param data the array of all output data's content
+ * @param n the array of all output data's n dimensions
+ * @param c the array of all output data's c dimensions
+ * @param h the array of all output data's h dimensions
+ * @param w the array of all output data's w dimensions
+ * @param dt_output the array of all output data's data types
+ * @param df_output the array of all output data's data formats
+ *
+ * @return
+ */
+void GetPtrFromResultHandle(ResultHandle ir,
+    int num_outputs,
+    char **outputNames,
+    void **data,
+    int *n,
+    int *c,
+    int *h,
+    int *w,
+    DATA_TYPE *dt_output,
+    DATA_FORMAT *df_output);
+
+/**
+ * @brief copy output data out of a ResultHandle with memcpy
+ * @param ir result data memory handle
+ * @param num_outputs the number of output data
+ * @param size the array of output sizes
+ * @param data the array of all output data's content
+ *
+ * @return
+ * @note
+ * the buffers in data must be managed by the user; each must hold at least size[i] bytes
+ */
+void CopyOutputsFromResultHandle(ResultHandle ir, int num_outputs, const int *size, void **data);
+
+/**
+ * @brief free result data memory
+ * @param ir result data memory handle
+ *
+ * @return
+ */
+void FreeResultHandle(ResultHandle ir);
+
+/**
+ * @brief destroy model
+ * @param ih inference pipeline handle
+ *
+ * @return
+ */
+void DestroyModel(ModelHandle ih);
+#ifdef __cplusplus
+}
+#endif
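Note (editor's addition): the following sketch, which is not part of the patch itself, shows the intended call order of the C API above. The model path, the input name "input", and the 1x3x224x224 FP32 NCHW shape are placeholder assumptions for a single-input network.

    #include "bolt.h"

    void infer_once(const char *boltPath, float *inputData)
    {
        // Create and prepare the pipeline. algoPath is omitted here; pass a
        // writable file path instead to cache the algorithm selection result.
        ModelHandle model = CreateModel(boltPath, CPU_HIGH_PERFORMANCE, NULL);
        char *names[1] = {(char *)"input"};  // hypothetical input name
        int n[1] = {1}, c[1] = {3}, h[1] = {224}, w[1] = {224};
        DATA_TYPE dt[1] = {FP_32};
        DATA_FORMAT df[1] = {NCHW};
        PrepareModel(model, 1, names, n, c, h, w, dt, df);

        // Run inference and collect every output.
        ResultHandle result = AllocAllResultHandle(model);
        void *inputs[1] = {inputData};
        RunModel(model, result, 1, names, inputs);

        // Query outputs with GetOutputDataInfoFromResultHandle /
        // GetPtrFromResultHandle before freeing the handles.
        FreeResultHandle(result);
        DestroyModel(model);
    }

diff --git a/inference/engine/api/dllite/Bolt.h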
b/inference/engine/api/dllite/Bolt.h new file mode 100644 index 00000000..e212fc9e --- /dev/null +++ b/inference/engine/api/dllite/Bolt.h @@ -0,0 +1,101 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef DLLITE_BOLT_H
+#define DLLITE_BOLT_H
+
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace bolt {
+
+/** inference pipeline handle */
+using ModelHandle = void *;
+
+/** result data memory handle */
+using ResultHandle = void *;
+
+/** CPU affinity policy */
+enum class AffinityType {
+    CPU_HIGH_PERFORMANCE = 0,  ///< performance is high priority (use big cores)
+    CPU_LOW_POWER = 1,  ///< low power is high priority (use small cores)
+    GPU = 2  ///< use GPU
+};
+
+/** data precision */
+enum class TensorType {
+    FP32 = 0,  ///< 32 bit float
+    FP16 = 1,  ///< 16 bit float
+    INT32 = 2,  ///< 32 bit integer
+    UINT32 = 3  ///< 32 bit unsigned integer
+};
+
+/** multi-dimension data format */
+enum class TensorLayout {
+    NCHW = 0,  ///< batch->channel->height->width data order
+    NHWC = 1,  ///< batch->height->width->channel data order
+    NCHWC8 = 2,  ///< batch->channel/8->height->width->8 data order
+    ROW_MAJOR = 3,  ///< batch->unit data order
+    RNN_MTK = 4  ///< batch->time->unit data order
+};
+
+// IOTensor
+struct IOTensor {
+    std::string name;
+    TensorType type;
+    TensorLayout layout;
+    std::vector<int> shape;
+    std::pair<void *, size_t> buffer;  // <data pointer, byte size>
+};
+
+// For model and algo config, either both use stream (default) or both use path
+struct ModelConfig {
+    AffinityType affinity;
+    std::pair<void *, size_t> modelStream;
+    std::pair<void *, size_t> algoStream;
+    std::string modelPath;
+    std::string algoPath;
+};
+
+// Return status
+enum class ReturnStatus {
+    SUCCESS = 0,  ///< SUCCESS
+    FAIL = -1,  ///< FAIL
+    NULLPTR = -2  ///< NULLPTR
+};
+
+ModelHandle CreateModel(const ModelConfig &modelConfig);
+
+ReturnStatus GetIOFormats(
+    ModelHandle modelHandle, std::vector<IOTensor> &inputs, std::vector<IOTensor> &outputs);
+
+ReturnStatus PrepareModel(ModelHandle modelHandle, const std::vector<IOTensor> &inputs);
+
+ReturnStatus GetInputTensors(ModelHandle modelHandle, std::vector<IOTensor> &inputs);
+
+ReturnStatus ResizeInput(ModelHandle modelHandle, const std::vector<IOTensor> &inputs);
+
+ResultHandle AllocResult(ModelHandle modelHandle, const std::vector<IOTensor> &outputs);
+
+ReturnStatus RunModel(
+    ModelHandle modelHandle, ResultHandle resultHandle, const std::vector<IOTensor> &inputs);
+
+ReturnStatus GetOutputTensors(ResultHandle resultHandle, std::vector<IOTensor> &outputs);
+
+ReturnStatus FreeResult(ResultHandle resultHandle);
+
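+// Editor's sketch (illustrative only, not part of the original header): a
+// typical DLLite call sequence, assuming the model is loaded by path;
+// "model.bolt" is a placeholder.
+//
+//     ModelConfig cfg;
+//     cfg.affinity = AffinityType::CPU_HIGH_PERFORMANCE;
+//     cfg.modelPath = "model.bolt";
+//     ModelHandle handle = CreateModel(cfg);
+//     std::vector<IOTensor> inputs, outputs;
+//     GetIOFormats(handle, inputs, outputs);
+//     PrepareModel(handle, inputs);  // fill inputs[i].buffer before running
+//     ResultHandle result = AllocResult(handle, outputs);
+//     RunModel(handle, result, inputs);
+//     GetOutputTensors(result, outputs);
+//     FreeResult(result);
+//     DestroyModel(handle);
+
+ReturnStatus DestroyModel(ModelHandle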
    modelHandle);
+
+}  // namespace bolt
+
+#endif  // DLLITE_BOLT_H
diff --git a/inference/engine/api/java/BoltModel.java b/inference/engine/api/java/BoltModel.java new file mode 100644 index 00000000..bd043611 --- /dev/null +++ b/inference/engine/api/java/BoltModel.java @@ -0,0 +1,440 @@
+/**
+ * @file
+ * @brief Java BoltModel Class Document
+ *
+ * @copyright
+ * @code
+ * Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+ * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * @endcode
+ */
+
+import java.io.File;
+import java.io.FileNotFoundException;
+
+/** affinity policy */
+enum AffinityType {
+    CPU_HIGH_PERFORMANCE,  ///< performance is high priority (use CPU big cores)
+    CPU_LOW_POWER,  ///< low power is high priority (use CPU small cores)
+    GPU  ///< use ARM MALI GPU
+}
+
+/** heterogeneous device type */
+enum DeviceType {
+    CPU_ARM_V7,  ///< ARMv7 CPU
+    CPU_ARM_V8,  ///< ARMv8 CPU
+    CPU_ARM_A55,  ///< ARM A55 CPU
+    CPU_ARM_A76,  ///< ARM A76 CPU
+    GPU_MALI,  ///< ARM MALI GPU
+    CPU_X86_AVX2,  ///< X86_64 AVX2 CPU
+    CPU_SERIAL  ///< CPU serial
+}
+
+/** data precision */
+enum DataType {
+    FP32,  ///< 32 bit float
+    FP16,  ///< 16 bit float
+    INT32,  ///< 32 bit integer
+    UINT32  ///< 32 bit unsigned integer
+}
+
+/** multi-dimension data format */
+enum DataFormat {
+    NCHW,  ///< batch->channel->height->width data order
+    NHWC,  ///< batch->height->width->channel data order
+    MTK,  ///< batch->time->unit data order
+    NORMAL  ///< vectorized row-major input
+}
+
+public final class BoltModel implements Cloneable {
+    private static void loadLibrary(String libraryAbsolutePath, boolean optional)
+    {
+        File file = new File(libraryAbsolutePath);
+        if (file.exists()) {
+            System.load(libraryAbsolutePath);
+        } else {
+            if (!optional) {
+                System.err.println("[ERROR] unable to load " + libraryAbsolutePath);
+            }
+        }
+    }
+
+    static
+    {
+        String dir = System.getProperty("user.dir");
+        loadLibrary(dir + "/libc++_shared.so", true);
+        loadLibrary("/system/lib64/libOpenCL.so", true);
+        loadLibrary(dir + "/libkernelsource.so", true);
+        loadLibrary(dir + "/libBoltModel.so", false);
+    }
+
+    private long modelAddr;
+
+    private long IResult;
+
+    private native long createModel(String modelPath, String affinity);
+
+    private native void prepareModel(long modelAddr,
+        int inputNum,
+        String[] inputName,
+        int[] inputN,
+        int[] inputC,
+        int[] inputH,
+        int[] inputW,
+        String[] inputDataType,
+        String[] inputDataFormat);
+
+    private native long cloneModel(long modelAddr);
+
+    private native long cloneResult(long IResult);
+
+    private native void resizeModelInput(long modelAddr,
+        int inputNum,
+        String[] inputName,
+        int[] inputN,
+        int[] inputC,
+        int[] inputH,
+        int[] inputW,
+        String[] inputDataType,
+        String[] inputDataFormat);
+
+    private native long allocAllResultHandle(long modelAddr);
+
+    private native long allocSpecificResultHandle(long modelAddr, int outputNum, String[] outputName);
+
+    private native void setRuntimeDeviceJNI(int cpuId, String device);
+
+    private native void setRuntimeDeviceDynamicJNI();
+
+    private native void runModel(
+        long modelAddr, long IResult, int inputNum, String[] inputName, float[][] inputData);
+
+    private native BoltResult getOutput(long IResult);
+
+    private native void freeResultHandle(long IResult);
+
+    private native void destroyModel(long modelAddr);
+
+    public String affinityMapping(AffinityType affinity)
+    {
+        String ret = "CPU_AFFINITY_HIGH_PERFORMANCE";
+        if (affinity == AffinityType.CPU_HIGH_PERFORMANCE) {
+            ret = "CPU_AFFINITY_HIGH_PERFORMANCE";
+        } else if (affinity == AffinityType.CPU_LOW_POWER) {
+            ret = "CPU_AFFINITY_LOW_POWER";
+        } else if (affinity == AffinityType.GPU) {
+            ret = "GPU";
+        } else {
+            System.err.println("[ERROR] unsupported CPU affinity in " + this.getClass().getName());
+        }
+        return ret;
+    }
+
+    public String deviceMapping(DeviceType device)
+    {
+        String ret = "CPU_ARM_V8";
+        if (device == DeviceType.CPU_ARM_V7) {
+            ret = "CPU_ARM_V7";
+        } else if (device == DeviceType.CPU_ARM_V8) {
+            ret = "CPU_ARM_V8";
+        } else if (device == DeviceType.CPU_ARM_A55) {
+            ret = "CPU_ARM_A55";
+        } else if (device == DeviceType.CPU_ARM_A76) {
+            ret = "CPU_ARM_A76";
+        } else if (device == DeviceType.GPU_MALI) {
+            ret = "GPU_MALI";
+        } else if (device == DeviceType.CPU_X86_AVX2) {
+            ret = "CPU_X86_AVX2";
+        } else if (device == DeviceType.CPU_SERIAL) {
+            ret = "CPU_SERIAL";
+        } else {
+            System.err.println("[ERROR] unsupported device in " + this.getClass().getName());
+        }
+        return ret;
+    }
+
+    public String dataTypeMapping(DataType data_type)
+    {
+        String ret = "FP32";
+        if (data_type == DataType.FP32) {
+            ret = "FP32";
+        } else if (data_type == DataType.FP16) {
+            ret = "FP16";
+        } else if (data_type == DataType.INT32) {
+            ret = "INT32";
+        } else if (data_type == DataType.UINT32) {
+            ret = "UINT32";
+        } else {
+            System.err.println("[ERROR] unsupported data type in " + this.getClass().getName());
+        }
+        return ret;
+    }
+
+    private String dataFormatMapping(DataFormat data_format)
+    {
+        String ret = "NCHW";
+        if (data_format == DataFormat.NCHW) {
+            ret = "NCHW";
+        } else if (data_format == DataFormat.NHWC) {
+            ret = "NHWC";
+        } else if (data_format == DataFormat.MTK) {
+            ret = "MTK";
+        } else if (data_format == DataFormat.NORMAL) {
+            ret = "NORMAL";
+        } else {
+            System.err.println("[ERROR] unsupported data format in " + this.getClass().getName());
+        }
+        return ret;
+    }
+
+    BoltModel()
+    {
+        this.modelAddr = 0;
+        this.IResult = -1;
+    }
+
+    /**
+     * @brief initialize the model and allocate memory
+     * @param modelPath model file path of String type
+     * @param affinity CPU affinity setting of AffinityType(enum) type
+     * @param inputNum the number of input data of int type
+     * @param inputName the array of all input data's names of String type
+     * @param inputN the array of all input data's n dimensions of int type
+     * @param inputC the array of all input data's c dimensions of int type
+     * @param inputH the array of all input data's h dimensions of int type
+     * @param inputW the array of all input data's w dimensions of int type
+     * @param inputDataType the array of all input data's data types of DataType(enum) type
+     * @param inputDataFormat the array of all input data's data formats of DataFormat(enum) type
+     *
+     * @return
+     *
+     * @note destroy the model when the pipeline ends
+     * @code
+     *     BoltModel example = BoltModel(...);
+     *     ...
+     *     example.destructor();
+     * @endcode
+     */
+    BoltModel(String modelPath,
+        AffinityType affinity,
+        int inputNum,
+        String[] inputName,
+        int[] inputN,
+        int[] inputC,
+        int[] inputH,
+        int[] inputW,
+        DataType[] inputDataType,
+        DataFormat[] inputDataFormat)
+    {
+        String affinityString = affinityMapping(affinity);
+        String[] inputDataTypeString = new String[inputNum];
+        String[] inputDataFormatString = new String[inputNum];
+        for (int i = 0; i < inputNum; i++) {
+            inputDataTypeString[i] = dataTypeMapping(inputDataType[i]);
+            inputDataFormatString[i] = dataFormatMapping(inputDataFormat[i]);
+        }
+
+        this.modelAddr = createModel(modelPath, affinityString);
+        if (0 != this.modelAddr) {
+            prepareModel(this.modelAddr, inputNum, inputName, inputN, inputC, inputH, inputW,
+                inputDataTypeString, inputDataFormatString);
+            this.IResult = allocAllResultHandle(this.modelAddr);
+        } else {
+            this.IResult = -1;
+            System.err.println("[ERROR] model cannot be created in " + this.getClass().getName());
+        }
+    }
+
+    /**
+     * @brief initialize the model and allocate memory, with the outputs decided by the user
+     * @param modelPath model file path of String type
+     * @param affinity CPU affinity setting of AffinityType(enum) type
+     * @param inputNum the number of input data of int type
+     * @param inputName the array of all input data's names of String type
+     * @param inputN the array of all input data's n dimensions of int type
+     * @param inputC the array of all input data's c dimensions of int type
+     * @param inputH the array of all input data's h dimensions of int type
+     * @param inputW the array of all input data's w dimensions of int type
+     * @param inputDataType the array of all input data's data types of DataType(enum) type
+     * @param inputDataFormat the array of all input data's data formats of DataFormat(enum) type
+     * @param outputNum the number of output data of int type
+     * @param outputName the array of all output data's names of String type
+     *
+     * @return
+     *
+     * @note destroy the model when the pipeline ends
+     * @code
+     *     BoltModel example = BoltModel(...);
+     *     ...
+     *     example.destructor();
+     * @endcode
+     */
+    BoltModel(String modelPath,
+        AffinityType affinity,
+        int inputNum,
+        String[] inputName,
+        int[] inputN,
+        int[] inputC,
+        int[] inputH,
+        int[] inputW,
+        DataType[] inputDataType,
+        DataFormat[] inputDataFormat,
+        int outputNum,
+        String[] outputName)
+    {
+        String affinityString = affinityMapping(affinity);
+        String[] inputDataTypeString = new String[inputNum];
+        String[] inputDataFormatString = new String[inputNum];
+        for (int i = 0; i < inputNum; i++) {
+            inputDataTypeString[i] = dataTypeMapping(inputDataType[i]);
+            inputDataFormatString[i] = dataFormatMapping(inputDataFormat[i]);
+        }
+
+        this.modelAddr = createModel(modelPath, affinityString);
+        if (0 != this.modelAddr) {
+            prepareModel(this.modelAddr, inputNum, inputName, inputN, inputC, inputH, inputW,
+                inputDataTypeString, inputDataFormatString);
+            this.IResult = allocSpecificResultHandle(this.modelAddr, outputNum, outputName);
+        } else {
+            this.IResult = -1;
+            System.err.println("[ERROR] model cannot be created in " + this.getClass().getName());
+        }
+    }
+
+    /**
+     * @brief clone this BoltModel
+     *
+     * @return the cloned model; it shares weights with the original model but owns separate tensor space
+     */
+    protected Object clone() {
+        BoltModel cloneModel = new BoltModel();
+        if (0 != this.modelAddr) {
+            cloneModel.modelAddr = cloneModel(this.modelAddr);
+        } else {
+            cloneModel.modelAddr = 0;
+        }
+        if (-1 != this.IResult) {
+            cloneModel.IResult = cloneResult(this.IResult);
+        } else {
+            cloneModel.IResult = -1;
+        }
+        return cloneModel;
+    }
+
+    /**
+     * @brief set the process to run on a specified CPU core
+     * @param cpuId cpu core id (0, 1, 2...)
+     * @param device cpu core architecture (e.g. CPU_ARM_A76)
+     *
+     * @return
+     */
+    public void setRuntimeDevice(int cpuId, DeviceType device) throws FileNotFoundException
+    {
+        if (0 == this.modelAddr) {
+            throw new FileNotFoundException();
+        }
+        String deviceString = deviceMapping(device);
+        setRuntimeDeviceJNI(cpuId, deviceString);
+    }
+
+    /**
+     * @brief set process CPU affinity automatically according to average CPU occupancy
+     *
+     * @return
+     */
+    public void setRuntimeDeviceDynamic() throws FileNotFoundException
+    {
+        if (0 == this.modelAddr) {
+            throw new FileNotFoundException();
+        }
+        setRuntimeDeviceDynamicJNI();
+    }
+
+    /**
+     * @brief run inference on the given inputs
+     * @param inputNum the number of input data of int type
+     * @param inputName the array of all input data's names of String type
+     * @param inputData the 2D array of all input data of float type
+     *
+     * @return BoltResult : the result class of the bolt model after inference
+     */
+    public BoltResult run(int inputNum, String[] inputName, float[][] inputData)
+    {
+        if (0 == this.modelAddr) {
+            return null;
+        }
+        runModel(this.modelAddr, this.IResult, inputNum, inputName, inputData);
+        BoltResult boltResult = getOutput(this.IResult);
+        return boltResult;
+    }
+
+    /**
+     * @brief resize the model inputs and run inference
+     * @param inputNum the number of input data of int type
+     * @param inputName the array of all input data's names of String type
+     * @param inputN the array of all input data's n dimensions of int type
+     * @param inputC the array of all input data's c dimensions of int type
+     * @param inputH the array of all input data's h dimensions of int type
+     * @param inputW the array of all input data's w dimensions of int type
+     * @param inputDataType the array of all input data's data types of DataType(enum) type
+     * @param inputDataFormat the array of all input data's data formats of DataFormat(enum) type
+     * @param inputData the 2D array of all input data of float type
+     *
+     * @return BoltResult : the result class of the bolt model after inference
+     */
+    public BoltResult run(int inputNum,
+        String[] inputName,
+        int[] inputN,
+        int[] inputC,
+        int[] inputH,
+        int[] inputW,
+        DataType[] inputDataType,
+        DataFormat[] inputDataFormat,
+        float[][] inputData)
+    {
+        if (0 == this.modelAddr) {
+            return null;
+        }
+        String[] inputDataTypeString = new String[inputNum];
+        String[] inputDataFormatString = new String[inputNum];
+        for (int i = 0; i < inputNum; i++) {
+            inputDataTypeString[i] = dataTypeMapping(inputDataType[i]);
+            inputDataFormatString[i] = dataFormatMapping(inputDataFormat[i]);
+        }
+
+        resizeModelInput(this.modelAddr, inputNum, inputName, inputN, inputC, inputH, inputW,
+            inputDataTypeString, inputDataFormatString);
+        runModel(this.modelAddr, this.IResult, inputNum, inputName, inputData);
+        BoltResult boltResult = getOutput(this.IResult);
+        return boltResult;
+    }
+
+    /**
+     * @brief release the result and model memory and destroy the model
+     *
+     * @return
+     */
+    public void destructor()
+    {
+        if (-1 != this.IResult) {
+            freeResultHandle(this.IResult);
+            this.IResult = -1;
+        }
+        if (0 != this.modelAddr) {
+            destroyModel(this.modelAddr);
+            this.modelAddr = 0;
+        }
+    }
+}
diff --git a/inference/engine/api/java/BoltResult.java b/inference/engine/api/java/BoltResult.java new file mode 100644 index 00000000..e0e6a80e --- /dev/null +++ b/inference/engine/api/java/BoltResult.java @@ -0,0 +1,127 @@
+/**
+ * @file
+ * @brief Java BoltResult Class Document
+ *
+ * @copyright
+ * @code
+ * Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+ * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * @endcode
+ */
+
+public class BoltResult {
+    /** 2D float array of output data in the inference result; its first dimension equals the number of outputs */
+    private float[][] value;
+
+    /** 2D int array of output dimension info in the inference result; its first dimension equals the number of outputs */
+    private int[][] dimension;
+
+    /** String array of output names in the inference result; its length equals the number of outputs */
+    private String[] name;
+
+    /** String array of output data formats in the inference result; its length equals the number of outputs */
+    private String[] dataFormat;
+
+    /** calculate the product of the dimensions, stopping at the first 0 */
+    public static int calculateLength(int[] array)
+    {
+        int num = array.length;
+        int length = 0;
+        for (int j = 0; j < num; j++) {
+            if (array[j] == 0) {
+                break;
+            } else {
+                if (length == 0) {
+                    length = array[j];
+                } else {
+                    length *= array[j];
+                }
+            }
+        }
+        return length;
+    }
+
+    public BoltResult(float[][] value, int[][] dimension, String[] name, String[] dataFormat)
+    {
+        this.value = value;
+        this.dimension = dimension;
+        this.name = name;
+        this.dataFormat = dataFormat;
+    }
+
+    /**
+     * @brief get the result data names from the BoltResult object
+     *
+     * @return 1D String array of output names in the inference result
+     */
+    public String[] getResultName()
+    {
+        return this.name;
+    }
+
+    /**
+     * @brief get the result data formats from the BoltResult object
+     *
+     * @return 1D String array of output data formats in the inference result
+     */
+    public String[] getResultDataFormat()
+    {
+        return this.dataFormat;
+    }
+
+    /**
+     * @brief get the result dimension information from the BoltResult object
+     *
+     * @return 2D int array of output dimensions in the inference result
+     */
+    public int[][] getResultDimension()
+    {
+        return this.dimension;
+    }
+
+    /**
+     * @brief get the result data array from the BoltResult object
+     *
+     * @return 2D float array of output data in the inference result
+     */
+    public float[][] getResultData()
+    {
+        return this.value;
+    }
+
+    /**
+     * @brief print BoltResult object info
+     * @param num the maximum number of values to print per output; pass a negative value to print all
+     *
+     * @return
+     */
+    public void print(int num)
+    {
+        for (int i = 0; i < name.length; i++) {
+            System.out.println("[INFO] output name: " + name[i]);
+            System.out.println("       data format: " + dataFormat[i]);
+            int len = calculateLength(this.dimension[i]);
+            System.out.println("       data number: " + len);
+            if (num >= 0) {
+                if (num < len) {
+                    len = num;
+                }
+            }
+
+            for (int j = 0; j < len; j++) {
+                System.out.print(value[i][j] + " ");
+            }
+            System.out.println();
+        }
+    }
+}
diff --git a/inference/engine/include/BoltModel.h b/inference/engine/include/BoltModel.h new file mode 100644 index 00000000..163fc68c --- /dev/null +++ b/inference/engine/include/BoltModel.h @@ -0,0 +1,145 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <jni.h>
+
+#ifndef _Included_BoltModel
+#define _Included_BoltModel
+#ifdef __cplusplus
+extern "C" {
+#endif
+// there is no need to add a '/' before com
+// #define BOLT_JNI_PATH_PREFIX "com/huawei/noah/"
+// #define BOLT_JNI_PREFIX_(X) Java_com_huawei_noah_##X
+#define BOLT_JNI_PATH_PREFIX ""
+#define BOLT_JNI_PREFIX_(X) Java_##X
+#define BOLT_JNI_PREFIX(X) BOLT_JNI_PREFIX_(X)
+/*
+ * Class:     BoltModel
+ * Method:    createModel
+ * Signature: (Ljava/lang/String;Ljava/lang/String;)J
+ */
+JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_createModel)(JNIEnv *, jobject, jstring, jstring);
+
+/*
+ * Class:     BoltModel
+ * Method:    prepareModel
+ * Signature: (JI[Ljava/lang/String;[I[I[I[I[Ljava/lang/String;[Ljava/lang/String;)V
+ */
+JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_prepareModel)(JNIEnv *,
+    jobject,
+    jlong,
+    jint,
+    jobjectArray,
+    jintArray,
+    jintArray,
+    jintArray,
+    jintArray,
+    jobjectArray,
+    jobjectArray);
+
+/*
+ * Class:     BoltModel
+ * Method:    cloneModel
+ * Signature: (J)J
+ */
+JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_cloneModel)(JNIEnv *, jobject, jlong);
+
+/*
+ * Class:     BoltModel
+ * Method:    resizeModelInput
+ * Signature: (JI[Ljava/lang/String;[I[I[I[I[Ljava/lang/String;[Ljava/lang/String;)V
+ */
+JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_resizeModelInput)(JNIEnv *,
+    jobject,
+    jlong,
+    jint,
+    jobjectArray,
+    jintArray,
+    jintArray,
+    jintArray,
+    jintArray,
+    jobjectArray,
+    jobjectArray);
+
+/*
+ * Class:     BoltModel
+ * Method:    allocAllResultHandle
+ * Signature: (J)J
+ */
+JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_allocAllResultHandle)(JNIEnv *, jobject, jlong);
+
+/*
+ * Class:     BoltModel
+ * Method:    allocSpecificResultHandle
+ * Signature: (JI[Ljava/lang/String;)J
+ */
+JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_allocSpecificResultHandle)(
+    JNIEnv *, jobject, jlong, jint, jobjectArray);
+
+/*
+ * Class:     BoltModel
+ * Method:    cloneResult
+ * Signature: (J)J
+ */
+JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_cloneResult)(JNIEnv *, jobject, jlong);
+
+/*
+ * Class:     BoltModel
+ * Method:    setRuntimeDeviceJNI
+ * Signature: (ILjava/lang/String;)V
+ */
+JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_setRuntimeDeviceJNI)(
+    JNIEnv *, jobject, jlong, jint, jstring);
+
+/*
+ * Class:     BoltModel
+ * Method:    setRuntimeDeviceDynamicJNI
+ * Signature: (V)V
+ */
+JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_setRuntimeDeviceDynamicJNI)(
+    JNIEnv *, jobject, jlong);
+
+/*
+ * Class:     BoltModel
+ * Method:    runModel
+ * Signature: (JJI[Ljava/lang/String;[[F)V
+ */
+JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_runModel)(
+    JNIEnv *, jobject, jlong, jlong, jint, jobjectArray, jobjectArray);
+
+/*
+ * Class:     BoltModel
+ * Method:    getOutput
+ * Signature: (J)LBoltResult;
+ */
+JNIEXPORT jobject JNICALL BOLT_JNI_PREFIX(BoltModel_getOutput)(JNIEnv *, jobject, jlong);
+
+/*
+ * Class:     BoltModel
+ * Method:    freeResultHandle
+ * Signature: (J)V
+ */
+JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_freeResultHandle)(JNIEnv *, jobject, jlong);
+
+/*
+ * Class:     BoltModel
+ * Method: destroyModel + * Signature: (J)V + */ +JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_destroyModel)(JNIEnv *, jobject, jlong); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/inference/include/activation.hpp b/inference/engine/include/activation.hpp similarity index 77% rename from inference/include/activation.hpp rename to inference/engine/include/activation.hpp index f1a5404a..38e3054a 100644 --- a/inference/include/activation.hpp +++ b/inference/engine/include/activation.hpp @@ -1,34 +1,27 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-
 #ifndef _ACTIVATION_H
 #define _ACTIVATION_H
 
 #include "operator.hpp"
-#include "tensor_computing.h"
-#include "tensor_desc.h"
-class Activation: public Operator
-{
+class Activation : public Operator {
 public:
-    /**
-    @param mode
-    */
-    Activation(ActivationDesc activationDesc)
+    Activation(ActivationParamSpec activationDesc)
     {
         this->activationDesc = activationDesc;
-        switch(activationDesc.mode) {
+        switch (activationDesc.mode) {
             case ACTIVATION_RELU: {
                 this->opt = OT_Relu;
                 break;
             }
@@ -57,6 +50,14 @@ class Activation: public Operator
                 this->opt = OT_TanH;
                 break;
             }
+            case ACTIVATION_MISH: {
+                this->opt = OT_Mish;
+                break;
+            }
+            case ACTIVATION_GREATER: {
+                this->opt = OT_Greater;
+                break;
+            }
             default: {
                 CHECK_STATUS(NOT_SUPPORTED);
             }
@@ -64,20 +65,19 @@ class Activation: public Operator
         this->lenOfTemp = 0;
     }
 
-    OperatorType get_op_type() override
+    OperatorType get_type() override
    {
         return this->opt;
     }
-
     bool can_input_output_the_same() override
     {
         return true;
     }
 
 protected:
-    ActivationDesc activationDesc;
+    ActivationParamSpec activationDesc;
     OperatorType opt;
 };
 
-#endif //_ACTIVATION_H
+#endif  // _ACTIVATION_H
diff --git a/inference/engine/include/argmax.hpp b/inference/engine/include/argmax.hpp
new file mode 100644
index 00000000..50d98fdb
--- /dev/null
+++ b/inference/engine/include/argmax.hpp
@@ -0,0 +1,36 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _ARGMAX_H
+#define _ARGMAX_H
+
+#include "operator.hpp"
+
+class ArgMax : public Operator {
+public:
+    ArgMax(DataType dt, ArgMaxParamSpec p)
+    {
+        this->dt = dt;
+        this->p = p;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_ArgMax;
+    }
+
+protected:
+    ArgMaxParamSpec p;
+};
+
+#endif  // _ARGMAX_H
diff --git a/inference/engine/include/attention.hpp b/inference/engine/include/attention.hpp
new file mode 100644
index 00000000..a992be7f
--- /dev/null
+++ b/inference/engine/include/attention.hpp
@@ -0,0 +1,65 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _ATTENTION_H
+#define _ATTENTION_H
+
+#include "operator.hpp"
+
+class Attention : public Operator {
+public:
+    Attention(DataType dt, AttentionParamSpec p)
+    {
+        this->dt = dt;
+        this->p = p;
+    }
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Attention> mem =
+            std::shared_ptr<Attention>(new Attention(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_Attention;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        auto inDesc = inputTensor.get_desc();
+        inDesc.dt = this->dt;
+        inputTensor.resize(inDesc);
+        CHECK_STATUS(attention(inputTensor, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        auto inTensor = *inTensors[0];
+        auto inDesc = inTensor.get_desc();
+        inDesc.dt = this->dt;
+        inTensor.resize(inDesc);
+        CHECK_STATUS(attention_infer_output_size(&inTensor, this->p, outTensors[0]));
+        return SUCCESS;
+    }
+
+private:
+    AttentionParamSpec p;
+};
+
+#endif  // _ATTENTION_H
diff --git a/inference/engine/include/attention_mask.hpp b/inference/engine/include/attention_mask.hpp
new file mode 100644
index 00000000..91561b33
--- /dev/null
+++ b/inference/engine/include/attention_mask.hpp
@@ -0,0 +1,57 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
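Attention above is the first of many operator headers that repeat the same clone() idiom: a virtual copy constructor returning std::shared_ptr<Operator>. A minimal standalone sketch of the pattern; Op and MyOp are hypothetical stand-ins, not bolt classes:

    #include <memory>

    class Op {  // stand-in for Operator
    public:
        virtual ~Op() = default;
        virtual std::shared_ptr<Op> clone() = 0;
    };

    class MyOp : public Op {
    public:
        explicit MyOp(int param) : param(param)
        {}

        std::shared_ptr<Op> clone() override
        {
            // Allocate with the derived type so "*mem = *this" copies the whole
            // object (a base-typed local would slice), then upcast on return.
            std::shared_ptr<MyOp> mem(new MyOp(this->param));
            *mem = *this;
            return mem;
        }

    private:
        int param;
    };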
+
+#ifndef _ATTENTION_MASK_H
+#define _ATTENTION_MASK_H
+
+#include "operator.hpp"
+
+class AttentionMask : public Operator {
+public:
+    AttentionMask(DataType dt, AttentionMaskParamSpec p)
+    {
+        this->dt = dt;
+        this->p = p;
+    }
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<AttentionMask> mem =
+            std::shared_ptr<AttentionMask>(new AttentionMask(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_AttentionMask;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        CHECK_STATUS(attention_mask(inputTensor, this->p, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        return attention_mask_infer_output_size(inTensors[0], outTensors[0]);
+    }
+
+private:
+    AttentionMaskParamSpec p;
+};
+
+#endif
diff --git a/inference/engine/include/bilateral_slice_apply.hpp b/inference/engine/include/bilateral_slice_apply.hpp
new file mode 100644
index 00000000..bcfc6f18
--- /dev/null
+++ b/inference/engine/include/bilateral_slice_apply.hpp
@@ -0,0 +1,37 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _BILATERAL_SLICE_APPLY_H
+#define _BILATERAL_SLICE_APPLY_H
+
+#include "operator.hpp"
+
+class BilateralSliceApply : public Operator {
+public:
+    BilateralSliceApply(BilateralSliceApplyParamSpec p)
+    {
+        this->p = p;
+    }
+    virtual ~BilateralSliceApply()
+    {}
+
+    OperatorType get_type() override
+    {
+        return OT_BilateralSliceApply;
+    }
+
+protected:
+    BilateralSliceApplyParamSpec p;
+};
+
+#endif  // _BILATERAL_SLICE_APPLY_H
diff --git a/inference/engine/include/channel_resize.hpp b/inference/engine/include/channel_resize.hpp
new file mode 100644
index 00000000..837f45a0
--- /dev/null
+++ b/inference/engine/include/channel_resize.hpp
@@ -0,0 +1,42 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _CHANNEL_RESIZE_H
+#define _CHANNEL_RESIZE_H
+
+#include "operator.hpp"
+
+class ChannelResize : public Operator {
+public:
+    ChannelResize(DataType dt, ChannelResizeParamSpec p)
+    {
+        this->dt = dt;
+        this->valid = true;
+        this->rearrange = true;
+        this->p = p;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_ChannelResize;
+    }
+
+protected:
+    bool valid;
+    // whether to rearrange the cut data into a specific format (NCHWC8)
+    bool rearrange;
+
+    ChannelResizeParamSpec p;
+};
+
+#endif  // _CHANNEL_RESIZE_H
diff --git a/inference/engine/include/check.hpp b/inference/engine/include/check.hpp
new file mode 100644
index 00000000..2c8ce0b0
--- /dev/null
+++ b/inference/engine/include/check.hpp
@@ -0,0 +1,36 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
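ChannelResize above pads or cuts channel counts so tensors can enter or leave the 8-channel-interleaved NCHWC8 layout; the core arithmetic is rounding the channel count up to a multiple of 8, the same computation channel_resize_cpu.hpp performs further down. A self-contained sketch (the helper name is hypothetical):

    #include <cstdio>

    // Round a channel count up to the next multiple of 8, mirroring
    // (c / 8 + (c % 8 == 0 ? 0 : 1)) * 8 in the CPU implementation below.
    static int round_up_to_c8(int channels)
    {
        return (channels / 8 + ((channels % 8 == 0) ? 0 : 1)) * 8;
    }

    int main()
    {
        printf("3  -> %d\n", round_up_to_c8(3));   // 3  -> 8
        printf("16 -> %d\n", round_up_to_c8(16));  // 16 -> 16
        printf("21 -> %d\n", round_up_to_c8(21));  // 21 -> 24
        return 0;
    }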
+
+#ifndef _CHECK_H
+#define _CHECK_H
+
+#include "operator.hpp"
+
+class Check : public Operator {
+public:
+    Check(DataType dt, CheckParamSpec p)
+    {
+        this->dt = dt;
+        this->p = p;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_Check;
+    }
+
+protected:
+    CheckParamSpec p;
+};
+
+#endif  // _CHECK_H
diff --git a/inference/include/clip.hpp b/inference/engine/include/clip.hpp
similarity index 69%
rename from inference/include/clip.hpp
rename to inference/engine/include/clip.hpp
index b35baa61..bdbda939 100644
--- a/inference/include/clip.hpp
+++ b/inference/engine/include/clip.hpp
@@ -1,37 +1,30 @@
 // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
 // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
 #ifndef _CLIP_H
 #define _CLIP_H
 
 #include "operator.hpp"
-#include "tensor_computing.h"
-class Clip: public Operator
-{
+class Clip : public Operator {
 public:
-    /**
-    @param mode
-    */
-    Clip(DataType dt, F32 clipMinScalar, F32 clipMaxScalar)
+    Clip(DataType dt, ClipParamSpec p)
     {
         this->dt = dt;
-        this->clipMinScalar = clipMinScalar;
-        this->clipMaxScalar = clipMaxScalar;
+        this->p = p;
     }
 
-    OperatorType get_op_type() override
+    OperatorType get_type() override
     {
         return OT_Clip;
     }
@@ -40,9 +33,9 @@ class Clip: public Operator
     {
         return true;
     }
+
 protected:
-    F32 clipMinScalar;
-    F32 clipMaxScalar;
+    ClipParamSpec p;
 };
 
-#endif //_CLIP_H
+#endif  // _CLIP_H
diff --git a/inference/engine/include/cnn.h b/inference/engine/include/cnn.h
new file mode 100644
index 00000000..54b197c9
--- /dev/null
+++ b/inference/engine/include/cnn.h
@@ -0,0 +1,114 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _CNN_H
+#define _CNN_H
+
+#include <string>
+#include <map>
+#include "model.hpp"
+#include "memory_tracker.hpp"
+#ifdef _USE_MALI
+#include "gcl_common.h"
+#endif
+
+class CNN : public Model {
+public:
+    CNN()
+    {}
+
+    explicit CNN(AffinityPolicy affinityPolicy, DataType dt, std::string name)
+        : Model(affinityPolicy, dt, name)
+    {}
+
+    virtual ~CNN() = default;
+
+    CNN clone();
+
+    void sort_operators_sequential(const ModelSpec *ms);
+
+    void initialize_ops(const ModelSpec *ms);
+
+    void ready(std::map<std::string, TensorDesc> inputDescMap) override;
+
+    void reready(std::map<std::string, TensorDesc> inputDescMap);
+
+    EE mark_input_output();
+
+    void copy_to_named_input(std::string inputName, const U8 *data);
+
+    void set_input_tensors_value(std::map<std::string, std::shared_ptr<U8>> modelTensorsInput);
+
+    std::map<std::string, std::shared_ptr<Tensor>> get_inputs();
+
+    std::map<std::string, std::shared_ptr<Tensor>> get_outputs();
+
+    Tensor get_tensor_by_name(std::string tensorName);
+
+    TensorDesc get_tensor_desc_by_name(std::string tensorName);
+
+    std::vector<std::string> get_model_input_tensor_names();
+
+    std::vector<TensorDesc> get_model_input_tensor_descs();
+
+    std::vector<std::string> get_model_output_tensor_names();
+
+    EE infer_output_tensors_size(std::map<std::string, TensorDesc> inputDescMap) override;
+
+    void assign_output_tensor() override;
+
+    void addOutputTensorNames(std::vector<std::string> outputTensorNames);
+
+    void run() override;
+
+#ifdef _USE_MALI
+    void mali_prepare(bool reset);
+#endif
+private:
+    std::shared_ptr<Tensor> allocate_tensor(U32 size = 0);
+
+    void add(std::shared_ptr<Operator> op,
+        std::vector<std::string> inputTensorsName,
+        std::vector<std::string> outputTensorsName);
+
+    void infer_layout_desc();
+
+    void update_op_tensors();
+
+    void set_input_tensors_desc(std::map<std::string, TensorDesc> inputDescMap);
+
+    void infer_tmp_memory_size() override;
+
+    void assign_tmp_tensor() override;
+
+    void check_memory_reuse_ratio();
+
+private:
+    std::map<std::string, std::shared_ptr<Tensor>> tensorMap;
+    std::map<std::string, std::shared_ptr<Operator>> operatorMap;
+    std::map<std::string, std::vector<std::vector<std::string>>> operatorTensorMap;
+
+    std::set<std::string> weightOpOutputNames;
+    std::map<std::string, std::shared_ptr<Tensor>> inputTensors;
+    std::map<std::string, std::shared_ptr<Tensor>> outputTensors;
+    std::vector<std::shared_ptr<Tensor>> storageMemory;
+    Tensor tmpTensor;
+
+    std::vector<std::string> sortedOps;
+
+    std::vector<std::string> modelInputTensorNames;
+    std::vector<TensorDesc> modelInputTensorDescs;
+    std::vector<std::string> modelOutputTensorNames;
+    MemoryTracker memoryTracker;
+};
+#endif
diff --git a/inference/include/concat.hpp b/inference/engine/include/concat.hpp
similarity index 73%
rename from inference/include/concat.hpp
rename to inference/engine/include/concat.hpp
index 7a11c800..a055619e 100644
--- a/inference/include/concat.hpp
+++ b/inference/engine/include/concat.hpp
@@ -1,38 +1,35 @@
 // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
 // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
 #ifndef _CONCAT_H
 #define _CONCAT_H
 
 #include "operator.hpp"
-#include "tensor_computing.h"
-class Concat: public Operator {
+class Concat : public Operator {
 public:
-    Concat(int axis)
+    Concat(ConcatParamSpec p)
     {
-        this->axis = axis;
+        this->p = p;
     }
 
-    OperatorType get_op_type() override
+    OperatorType get_type() override
     {
         return OT_Concat;
     }
-
 protected:
-    I32 axis;
+    ConcatParamSpec p;
 };
 
-#endif //_CONCAT_H
+#endif  // _CONCAT_H
diff --git a/inference/include/constant.hpp b/inference/engine/include/constant.hpp
similarity index 66%
rename from inference/include/constant.hpp
rename to inference/engine/include/constant.hpp
index 5abb427c..93c3e344 100644
--- a/inference/include/constant.hpp
+++ b/inference/engine/include/constant.hpp
@@ -1,45 +1,49 @@
 // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
 // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
 #ifndef _CONSTANT_H
 #define _CONSTANT_H
 
 #include "operator.hpp"
 
-class Constant: public Operator {
+class Constant : public Operator {
 public:
-    Constant(TensorDesc constDesc, void* data)
+    Constant(TensorDesc constDesc, void *data)
     {
         this->constDesc = constDesc;
         this->data = data;
     }
 
-    OperatorType get_op_type() override
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Constant> mem =
+            std::shared_ptr<Constant>(new Constant(this->constDesc, this->data));
+        *mem = *this;
+        return mem;
+    }
+
+    OperatorType get_type() override
     {
         return OT_Constant;
     }
 
     void run() override
     {
-        UTIL_TIME_TIC(__CLASS_FUNCTION__)
         Tensor outputTensor = this->outputTensors[0];
-
-        U8* outputPtr = outputTensor.get_val().get();
+        auto outputPtr = ((CpuMemory *)outputTensor.get_memory())->get_ptr();
         memcpy(outputPtr, data, tensorNumBytes(constDesc));
-        UTIL_TIME_TOC(__CLASS_FUNCTION__)
     }
 
-    EE infer_output_tensors_size(Vec<TensorDesc>* outDims) override
+    EE infer_output_tensors_size(std::vector<TensorDesc> *outDims) override
     {
         (*outDims)[0] = constDesc;
         return SUCCESS;
@@ -47,7 +51,7 @@ class Constant: public Operator {
 
 private:
     TensorDesc constDesc;
-    void* data;
+    void *data;
 };
 
-#endif //_CONSTANT__H
+#endif  // _CONSTANT_H
diff --git a/inference/engine/include/convolution.hpp b/inference/engine/include/convolution.hpp
new file mode 100644
index 00000000..a4f7975f
--- /dev/null
+++ b/inference/engine/include/convolution.hpp
@@ -0,0 +1,69 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
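With cnn.h in view, this is roughly how a caller drives the engine: descriptors in, ready(), run(), tensors out. A hedged sketch of that sequence; the affinity enum value, the input name, and the 1x3x224x224 shape are illustrative assumptions, and ms/inputData are presumed to come from the model loader:

    #include <map>
    #include <string>
    #include "cnn.h"

    // Hedged sketch: one inference pass through the CNN API declared above.
    std::map<std::string, std::shared_ptr<Tensor>> run_once(
        const ModelSpec *ms, const U8 *inputData)
    {
        CNN pipeline(AFFINITY_CPU_HIGH_PERFORMANCE, DT_F16, "demo");  // enum value assumed
        pipeline.sort_operators_sequential(ms);
        pipeline.initialize_ops(ms);

        std::map<std::string, TensorDesc> inputDescMap;
        inputDescMap["input"] = tensor4df(DT_F16, DF_NCHW, 1, 3, 224, 224);  // illustrative
        pipeline.ready(inputDescMap);
        pipeline.mark_input_output();

        pipeline.copy_to_named_input("input", inputData);
        pipeline.run();
        return pipeline.get_outputs();
    }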
+
+#ifndef _CONVOLUTION_H
+#define _CONVOLUTION_H
+
+#include "weight_operator.hpp"
+
+class Convolution : public WeightOperator {
+public:
+    Convolution(DataType dt,
+        ConvolutionParamSpec p,
+        ActivationParamSpec dwActivationParamSpec,
+        ActivationParamSpec pwActivationParamSpec)
+    {
+        this->dt = dt;
+        this->p = p;
+        this->dwActivationParamSpec = dwActivationParamSpec;
+        this->pwActivationParamSpec = pwActivationParamSpec;
+        this->hasBias = false;
+        this->pwAlg = CONVOLUTION_ALGORITHM_NULL;
+        this->dwAlg = DEPTHWISE_CONVOLUTION_ALGORITHM_NULL;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_Conv;
+    }
+
+    TensorDesc desc_process(TensorDesc inputDesc)
+    {
+        TensorDesc resultDesc;
+        if (tensorIs3d(inputDesc)) {
+            DataType idt;
+            DataFormat idf;
+            U32 in, ic, ih;
+            CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &in, &ic, &ih));
+            resultDesc = tensor4df(idt, idf, in, ic, ih, 1);
+        } else {
+            resultDesc = inputDesc;
+        }
+        return resultDesc;
+    }
+
+public:
+    U32 numChannels;
+
+    ConvolutionParamSpec p;
+    ActivationParamSpec dwActivationParamSpec;
+    ActivationParamSpec pwActivationParamSpec;
+
+    ConvolutionForwardAlgorithm pwAlg;
+    DepthwiseConvolutionForwardAlgorithm dwAlg;
+#ifdef _USE_FP16
+    std::shared_ptr<F16> scales;
+#endif
+};
+
+#endif  // _CONVOLUTION_H
diff --git a/inference/engine/include/copy.hpp b/inference/engine/include/copy.hpp
new file mode 100644
index 00000000..60649b0e
--- /dev/null
+++ b/inference/engine/include/copy.hpp
@@ -0,0 +1,36 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _COPY_H
+#define _COPY_H
+
+#include "operator.hpp"
+
+class Copy : public Operator {
+public:
+    Copy(DataType dt, CopyParamSpec p)
+    {
+        this->dt = dt;
+        this->p = p;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_Copy;
+    }
+
+protected:
+    CopyParamSpec p;
+};
+
+#endif  // _COPY_H
diff --git a/inference/include/cpu/activation_cpu.hpp b/inference/engine/include/cpu/activation_cpu.hpp
similarity index 54%
rename from inference/include/cpu/activation_cpu.hpp
rename to inference/engine/include/cpu/activation_cpu.hpp
index e9970041..c598c142 100644
--- a/inference/include/cpu/activation_cpu.hpp
+++ b/inference/engine/include/cpu/activation_cpu.hpp
@@ -1,54 +1,48 @@
 // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
 // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
 #ifndef _ACTIVATION_CPU_H
 #define _ACTIVATION_CPU_H
 
-#include "operator.hpp"
-#include "tensor_computing.h"
-#include "tensor_desc.h"
 #include "activation.hpp"
 
-class ActivationCPU: public Activation
-{
+class ActivationCPU : public Activation {
 public:
-    /**
-    @param mode
-    */
-    ActivationCPU(ActivationDesc activationDesc): Activation(activationDesc) {}
+    ActivationCPU(ActivationParamSpec activationDesc) : Activation(activationDesc)
+    {}
 
-    virtual void run() override
+    std::shared_ptr<Operator> clone() override
     {
-        UTIL_TIME_TIC(__CLASS_FUNCTION__)
+        std::shared_ptr<ActivationCPU> mem =
+            std::shared_ptr<ActivationCPU>(new ActivationCPU(this->activationDesc));
+        *mem = *this;
+        return mem;
+    }
 
+    void run() override
+    {
         Tensor inputTensor = this->inputTensors[0];
-        TensorDesc inputDesc = inputTensor.get_desc();
         Tensor outputTensor = this->outputTensors[0];
-        TensorDesc outputDesc = outputTensor.get_desc();
-        U8* inPtr = inputTensor.get_val();
-        U8* outPtr = outputTensor.get_val();
-
-        CHECK_STATUS(activation(inputDesc, inPtr, this->activationDesc, outputDesc, outPtr, this->schedule));
+        CHECK_STATUS(activation(inputTensor, this->activationDesc, outputTensor, &this->archInfo));
         outputTensor.set_scale(inputTensor.get_scale());
-        UTIL_TIME_TOC(__CLASS_FUNCTION__)
     }
 
-    virtual EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
     {
-        CHECK_STATUS(activation_infer_output_size(inDims[0], &((*outDims)[0]), this->schedule));
+        CHECK_STATUS(activation_infer_output_size(inTensors[0], outTensors[0], &this->archInfo));
        return SUCCESS;
     }
 };
 
-#endif //_ACTIVATION_CPU_H
+#endif  // _ACTIVATION_CPU_H
diff --git a/inference/engine/include/cpu/argmax_cpu.hpp b/inference/engine/include/cpu/argmax_cpu.hpp
new file mode 100644
index 00000000..5c3b643b
--- /dev/null
+++ b/inference/engine/include/cpu/argmax_cpu.hpp
@@ -0,0 +1,48 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _ARGMAX_CPU_H
+#define _ARGMAX_CPU_H
+
+#include "argmax.hpp"
+
+class ArgMaxCPU : public ArgMax {
+public:
+    ArgMaxCPU(DataType dt, ArgMaxParamSpec p) : ArgMax(dt, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<ArgMaxCPU> mem =
+            std::shared_ptr<ArgMaxCPU>(new ArgMaxCPU(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        CHECK_STATUS(argmax(inputTensor, this->p, this->temp, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        CHECK_STATUS(
+            argmax_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+};
+
+#endif  // _ARGMAX_CPU_H
diff --git a/inference/engine/include/cpu/channel_resize_cpu.hpp b/inference/engine/include/cpu/channel_resize_cpu.hpp
new file mode 100644
index 00000000..cf5d6c5f
--- /dev/null
+++ b/inference/engine/include/cpu/channel_resize_cpu.hpp
@@ -0,0 +1,121 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
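ArgMaxCPU above shows the contract every operator in this directory follows: infer_output_tensors_size() fills the output descriptor before anything is allocated, and run() only touches tensors that are already bound. A hedged sketch of an engine exercising that contract on one operator; the binding step is left as a placeholder, since in bolt the CNN class does this wiring internally:

    // Hedged sketch of the per-operator protocol: descriptors first,
    // allocation second, execution last.
    EE execute_once(Operator *op, Tensor *input)
    {
        Tensor output;
        std::vector<Tensor *> ins = {input}, outs = {&output};
        CHECK_STATUS(op->infer_output_tensors_size(ins, outs));  // fills output desc
        output.alloc();  // safe only once the descriptor is known

        // ... bind {*input} / {output} to the operator here (CNN does this) ...
        op->run();
        return SUCCESS;
    }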
+
+#ifndef _CHANNEL_RESIZE_CPU_H
+#define _CHANNEL_RESIZE_CPU_H
+
+#include "channel_resize.hpp"
+
+class ChannelResizeCPU : public ChannelResize {
+public:
+    ChannelResizeCPU(DataType dt, ChannelResizeParamSpec p) : ChannelResize(dt, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<ChannelResizeCPU> mem =
+            std::shared_ptr<ChannelResizeCPU>(new ChannelResizeCPU(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        TensorDesc inputDesc = inputTensor.get_desc();
+        U32 inputSize = tensorNumBytes(inputDesc);
+        U8 *inputPtr = (U8 *)((CpuMemory *)(inputTensor.get_memory()))->get_ptr();
+        Tensor outputTensor = this->outputTensors[0];
+        TensorDesc outputDesc = outputTensor.get_desc();
+        U32 outputSize = tensorNumBytes(outputDesc);
+        U8 *outputPtr = (U8 *)((CpuMemory *)(outputTensor.get_memory()))->get_ptr();
+        // don't need to span or cut
+        if (!this->valid) {
+            if (inputPtr != outputPtr) {
+                CHECK_REQUIREMENT(inputSize == outputSize);
+                memcpy(outputPtr, inputPtr, inputSize);
+            }
+        } else if (this->rearrange && DF_NCHWC8 == inputDesc.df && DF_NCHWC8 == outputDesc.df) {
+            transformNCHWC8ToNCHWC8ByGroup(
+                inputDesc, inputPtr, this->p.group, outputDesc, outputPtr);
+        } else {
+            U32 batch = inputDesc.dims[inputDesc.nDims - 1];
+            U32 inputChannelGroupSize = this->p.channel_before / this->p.group;
+            U32 inputTileSize = inputSize / (batch * this->p.group);
+            U32 outputChannelGroupSize = this->p.channel_after / this->p.group;
+            U32 outputTileSize = outputSize / (batch * this->p.group);
+            int channelAxis = inputDesc.nDims - 2;
+            TensorDesc tmpInputDesc = inputDesc;
+            tmpInputDesc.dims[channelAxis] = inputChannelGroupSize;
+            TensorDesc tmpOutputDesc = outputDesc;
+            tmpOutputDesc.dims[channelAxis] = outputChannelGroupSize;
+            for (int g = 0; g < this->p.group; g++) {
+                if (this->p.channel_after > this->p.channel_before) {
+                    transformNCHWToNCHWC8(tmpInputDesc, inputPtr, tmpOutputDesc, outputPtr);
+                } else {
+                    transformToNCHW(tmpInputDesc, inputPtr, tmpOutputDesc, outputPtr);
+                }
+                inputPtr += inputTileSize;
+                outputPtr += outputTileSize;
+            }
+        }
+#ifdef _USE_INT8
+        outputTensor.set_scale(inputTensor.get_scale());
+#endif
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        CHECK_REQUIREMENT(inTensors.size() > 0);
+        auto inDesc = inTensors[0]->get_desc();
+        CHECK_REQUIREMENT(inDesc.nDims > 0);
+        // don't need to span
+        if (this->p.channel_after > this->p.channel_before && inDesc.df == DF_NCHWC8) {
+            this->valid = false;
+        }
+        // don't need to cut
+        if (this->p.channel_after < this->p.channel_before && inDesc.df == DF_NCHW) {
+            this->valid = false;
+        }
+        if (!this->valid) {
+            outTensors[0]->resize(inDesc);
+            return SUCCESS;
+        }
+
+        int channelAxis = inDesc.nDims - 2;
+        // channel span or cut for OT_Resize
+        if (this->p.group == 0) {
+            this->p.group = 1;
+            this->p.channel_before = (int)inDesc.dims[channelAxis];
+            this->p.channel_after =
+                (this->p.channel_before / 8 + ((this->p.channel_before % 8 == 0) ? 0 : 1)) * 8;
+        } else {
+            CHECK_REQUIREMENT((int)inDesc.dims[channelAxis] == this->p.channel_before);
+        }
+
+        inDesc.dims[channelAxis] = this->p.channel_after;
+        DataFormat dataFormat;
+        if (this->p.channel_after > this->p.channel_before ||
+            (this->rearrange && this->p.channel_after % 8 == 0)) {
+            dataFormat = DF_NCHWC8;
+        } else {
+            dataFormat = DF_NCHW;
+        }
+        inDesc.df = dataFormat;
+        outTensors[0]->resize(inDesc);
+        return SUCCESS;
+    }
+};
+
+#endif  // _CHANNEL_RESIZE_CPU_H
diff --git a/inference/engine/include/cpu/check_cpu.hpp b/inference/engine/include/cpu/check_cpu.hpp
new file mode 100644
index 00000000..464721f9
--- /dev/null
+++ b/inference/engine/include/cpu/check_cpu.hpp
@@ -0,0 +1,46 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _CHECK_CPU_H
+#define _CHECK_CPU_H
+
+#include "check.hpp"
+
+class CheckCPU : public Check {
+public:
+    CheckCPU(DataType dt, CheckParamSpec p) : Check(dt, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<CheckCPU> mem =
+            std::shared_ptr<CheckCPU>(new CheckCPU(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        Tensor inputATensor = this->inputTensors[0];
+        Tensor inputBTensor = this->inputTensors[1];
+        Tensor outputTensor = this->outputTensors[0];
+        CHECK_STATUS(check(inputATensor, inputBTensor, this->p, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        return check_infer_output_size(inTensors, outTensors[0], &this->archInfo);
+    }
+};
+
+#endif  // _CHECK_CPU_H
diff --git a/inference/engine/include/cpu/clip_cpu.hpp b/inference/engine/include/cpu/clip_cpu.hpp
new file mode 100644
index 00000000..4d656ee7
--- /dev/null
+++ b/inference/engine/include/cpu/clip_cpu.hpp
@@ -0,0 +1,46 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _CLIP_CPU_H
+#define _CLIP_CPU_H
+
+#include "clip.hpp"
+
+class ClipCPU : public Clip {
+public:
+    ClipCPU(DataType dt, ClipParamSpec p) : Clip(dt, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<ClipCPU> mem = std::shared_ptr<ClipCPU>(new ClipCPU(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        CHECK_STATUS(clip(inputTensor, this->p, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        CHECK_STATUS(clip_infer_output_size(inTensors[0], outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+};
+
+#endif  // _CLIP_CPU_H
diff --git a/inference/engine/include/cpu/concat_cpu.hpp b/inference/engine/include/cpu/concat_cpu.hpp
new file mode 100644
index 00000000..b758c63d
--- /dev/null
+++ b/inference/engine/include/cpu/concat_cpu.hpp
@@ -0,0 +1,52 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
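ConcatCPU below adds the third leg of that protocol, infer_tmp_memory_size(), which lets the engine size one shared scratch buffer to the worst case over all operators instead of allocating per call. A hedged sketch of that sizing pass (the surrounding loop is illustrative, not bolt's exact code):

    #include <algorithm>
    #include <memory>
    #include <vector>

    // Take the maximum temp-byte demand across all ready operators, so a single
    // scratch tensor can be allocated once and shared by every op.
    U32 max_tmp_bytes(std::vector<std::shared_ptr<Operator>> &ops)
    {
        U32 maxBytes = 0;
        for (auto &op : ops) {
            maxBytes = std::max(maxBytes, op->infer_tmp_memory_size());
        }
        return maxBytes;
    }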
+
+#ifndef _CONCAT_CPU_H
+#define _CONCAT_CPU_H
+
+#include "concat.hpp"
+
+class ConcatCPU : public Concat {
+public:
+    ConcatCPU(ConcatParamSpec p) : Concat(p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<ConcatCPU> mem = std::shared_ptr<ConcatCPU>(new ConcatCPU(this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        CHECK_STATUS(
+            concat(this->inputTensors, this->p, this->temp, outputTensors[0], &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        CHECK_STATUS(concat_infer_output_size(inTensors, this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        U32 bytes = 0;
+        CHECK_STATUS(concat_infer_forward_tmp_bytes(this->inputTensors, &bytes, &this->archInfo));
+        return bytes;
+    }
+};
+
+#endif  // _CONCAT_CPU_H
diff --git a/inference/engine/include/cpu/convolution_cpu.hpp b/inference/engine/include/cpu/convolution_cpu.hpp
new file mode 100644
index 00000000..f2a405c6
--- /dev/null
+++ b/inference/engine/include/cpu/convolution_cpu.hpp
@@ -0,0 +1,539 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _CONVELTWISEPOOLING_CPU_H
+#define _CONVELTWISEPOOLING_CPU_H
+
+#include "convolution.hpp"
+
+class ConvolutionCPU : public Convolution {
+public:
+    ConvolutionCPU(DataType dt,
+        ConvolutionParamSpec p,
+        ActivationParamSpec dwActivationParamSpec,
+        ActivationParamSpec pwActivationParamSpec)
+        : Convolution(dt, p, dwActivationParamSpec, pwActivationParamSpec)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<ConvolutionCPU> mem = std::shared_ptr<ConvolutionCPU>(new ConvolutionCPU(
+            this->dt, this->p, this->dwActivationParamSpec, this->pwActivationParamSpec));
+        *mem = *this;
+        return mem;
+    }
+
+    EE init_weight_bias_from_model(std::shared_ptr<U8> *modelPtrShared) override
+    {
+        U8 *modelPtr = nullptr;
+        if (modelPtrShared != nullptr) {
+            modelPtr = (*modelPtrShared).get();
+        }
+        auto curOpWs = this->get_weightspec();
+        DataType filterDt = curOpWs.mdt;  // weight data type may not be the same as input and output
+        if (modelPtr != nullptr) {
+            filterDt = this->dt;
+        }
+        DataType dtNoQ = (this->dt == DT_F16_8Q) ? DT_F16 : this->dt;
+        U32 isBNN = 0;
+        if (filterDt == DT_BIN01 || filterDt == DT_BIN11) {
+            isBNN = 1;
+        }
+
+        int weight_num = 1;
+        std::vector<TensorDesc> weight_desc(2), bias_desc(2);
+        switch (this->p.convolution_type) {
+            case Convolution_Pointwise: {
+                U32 vectorLen = this->p.num_outputs;  // bias length
+                if (isBNN == 1) {
+                    this->dt = dtNoQ;  // BNN convolution should not be quantized further
+                    vectorLen *= 2;  // Scale has the same vector length as bias, so double the length
+                }
+                weight_desc[0] = tensor4df(filterDt, DF_NCHW, this->p.num_outputs,
+                    this->numChannels, this->p.kernel_h, this->p.kernel_w);
+                bias_desc[0] = tensor1d(dtNoQ, vectorLen);
+                break;
+            }
+            case Convolution_Depthwise: {
+                weight_desc[0] = tensor4df(
+                    filterDt, DF_NCHW, 1, this->p.num_outputs, this->p.kernel_h, this->p.kernel_w);
+                bias_desc[0] = tensor1d(dtNoQ, this->p.num_outputs);
+                break;
+            }
+            case Convolution_Depthwise_Pointwise: {
+                weight_desc[0] = tensor4df(
+                    filterDt, DF_NCHW, 1, this->numChannels, this->p.kernel_h, this->p.kernel_w);
+                bias_desc[0] = tensor1d(dtNoQ, this->numChannels);
+                weight_desc[1] =
+                    tensor4df(filterDt, DF_NCHW, this->p.num_outputs, this->numChannels, 1, 1);
+                bias_desc[1] = tensor1d(dtNoQ, this->p.num_outputs);
+                weight_num = 2;
+                break;
+            }
+            case Convolution_Dilation: {
+                weight_desc[0] = tensor4df(filterDt, DF_NCHW, this->p.num_outputs,
+                    this->numChannels, this->p.kernel_h, this->p.kernel_w);
+                bias_desc[0] = tensor1d(dtNoQ, this->p.num_outputs);
+                break;
+            }
+            default:
+                return NOT_SUPPORTED;
+        }
+
+        std::shared_ptr<U8> weight_ptr(curOpWs.weight);
+        std::shared_ptr<U8> bias_ptr(curOpWs.vec);
+        U32 weight_offset = 0;
+        U32 bias_offset = 0;
+        for (int j = 0; j < weight_num; j++) {
+            Tensor weight_tensor, bias_tensor;
+            weight_tensor.resize(weight_desc[j]);
+            bias_tensor.resize(bias_desc[j]);
+            U32 weight_bytes = weight_tensor.bytes();
+            U32 bias_bytes = bias_tensor.bytes();
+            U32 offset_bytes = 0;
+            if (modelPtr != nullptr) {
+                weight_tensor.alloc();
+                memcpy(
+                    ((CpuMemory *)(weight_tensor.get_memory()))->get_ptr(), modelPtr, weight_bytes);
+                offset_bytes += weight_bytes;
+                if (this->hasBias) {
+                    bias_tensor.alloc();
+                    memcpy(((CpuMemory *)(bias_tensor.get_memory()))->get_ptr(),
+                        modelPtr + offset_bytes, bias_bytes);
+                    offset_bytes += bias_bytes;
+                }
+                *modelPtrShared = std::shared_ptr<U8>(*modelPtrShared, modelPtr + offset_bytes);
+            } else {
+                ((CpuMemory *)(weight_tensor.get_memory()))
+                    ->set_shared_ptr(
+                        std::shared_ptr<U8>(weight_ptr, weight_ptr.get() + weight_offset));
+                weight_offset += weight_bytes;
+                if (this->hasBias) {
+                    ((CpuMemory *)(bias_tensor.get_memory()))
+                        ->set_shared_ptr(
+                            std::shared_ptr<U8>(bias_ptr, bias_ptr.get() + bias_offset));
+                    bias_offset += bias_bytes;
+                }
+            }
+            if (!this->hasBias) {
+                bias_tensor.alloc();
+                if (isBNN == 1) {
+#ifdef _USE_FP16
+                    U8 *ptr = (U8 *)((CpuMemory *)(bias_tensor.get_memory()))->get_ptr();
+                    UNI_init(p.num_outputs, DT_F16, 1.0, ptr);
+                    ptr += bias_bytes / 2;
+                    memset(ptr, 0, bias_bytes / 2);  // second half is bias
+#endif
+                } else {
+                    memset(((CpuMemory *)(bias_tensor.get_memory()))->get_ptr(), 0, bias_bytes);
+                }
+            }
+            this->weightTensors.push_back(weight_tensor);
+            this->biasTensors.push_back(bias_tensor);
+        }
+        return SUCCESS;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor filterTensor = this->weightTensors[0];
+        U8 *scalePtr = nullptr;
+        Tensor biasTensor = this->biasTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+
+        switch (this->p.convolution_type) {
+            case Convolution_Pointwise: {
+                if (DT_F16_8Q == this->dt) {
+#ifdef _USE_INT8
+                    F16 *ptr = this->scales.get();
+                    scalePtr = (U8 *)ptr;
+                    auto inputDesc = inputTensor.get_desc();
+
+                    ptr[0] = inputTensor.get_scale();
+                    if (featureScale.size() > 0 && featureScale[0][0] > 0) {
+                        ptr[0] = featureScale[0][0];
+                    } else if (DT_F16 == inputDesc.dt) {
+                        ptr[0] = -1;
+                    }
+
+                    if (featureScale.size() > 0 && (featureScale.back())[0] != -2) {
+                        ptr[1] = (featureScale.back())[0];
+                    } else {
+                        ptr[1] = -1;
+                    }
+#endif
+                }
+                CHECK_STATUS(
+                    convolution(inputTensor, filterTensor, p, this->pwAlg, scalePtr, biasTensor,
+                        this->temp, outputTensor, this->pwActivationParamSpec, &this->archInfo));
+#ifdef _USE_INT8
+                auto outputDesc = outputTensor.get_desc();
+                if (DT_I8 == outputDesc.dt) {
+                    F16 *ptr = (F16 *)scalePtr;
+                    outputTensor.set_scale(ptr[1]);
+                }
+#endif
+                break;
+            }
+            case Convolution_Depthwise: {
+                CHECK_STATUS(
+                    depthwise_convolution(inputTensor, filterTensor, p, this->dwAlg, biasTensor,
+                        this->temp, outputTensor, this->dwActivationParamSpec, &this->archInfo));
+                break;
+            }
+            case Convolution_Depthwise_Pointwise: {
+                CHECK_STATUS(
+                    depthwise_pointwise_convolution(inputTensor, filterTensor, weightTensors[1], p,
+                        this->dwAlg, biasTensor, biasTensors[1], this->temp, outputTensor,
+                        this->dwActivationParamSpec, this->pwActivationParamSpec, &this->archInfo));
+                break;
+            }
+            case Convolution_Dilation: {
+                CHECK_STATUS(
+                    convolution(inputTensor, filterTensor, p, this->pwAlg, scalePtr, biasTensor,
+                        this->temp, outputTensor, this->pwActivationParamSpec, &this->archInfo));
+                break;
+            }
+            default: {
+                UNI_ERROR_LOG("unsupported convolution type %d\n", this->p.convolution_type);
+            }
+        }
+    }
+
+    EE infer_forward_algorithm(std::shared_ptr<AlgorithmMap> algorithmMap) override
+    {
+        auto inputTensor = this->inputTensors[0];
+        auto filterTensor = this->weightTensors[0];
+        auto outputTensor = this->outputTensors[0];
+        TensorDesc inputDesc = this->desc_process(inputTensor.get_desc());
+        inputTensor.resize(inputDesc);
+        TensorDesc filterDesc = filterTensor.get_desc();
+
+        ConvolutionPolicy policy = CONVOLUTION_FASTEST;
+        DataType targetType = filterDesc.dt;
+        I32 algo;
+        switch (this->p.convolution_type) {
+            case Convolution_Pointwise: {
+                if (this->dt == DT_F16_8Q) {
+                    targetType = DT_I8;
+                }
+                if (algorithmMap->getAlgorithmInfoFromMap(this->name, &algo, 1)) {
+                    this->pwAlg = (ConvolutionForwardAlgorithm)algo;
+                } else if (algorithmMap->getCommonAlgoInfoFromMap(OT_Conv, this->dt,
+                               inputDesc.dims[2], inputDesc.dims[1], inputDesc.dims[0],
+                               filterDesc.dims[3], filterDesc.dims[1], filterDesc.dims[0],
+                               this->p.stride_h, this->p.stride_w, &algo, 1)) {
+                    this->pwAlg = (ConvolutionForwardAlgorithm)algo;
+                } else {
+                    CHECK_STATUS(convolution_infer_forward_algorithm(inputTensor, filterTensor,
+                        outputTensor, p, policy, &(this->pwAlg), targetType,
+                        this->pwActivationParamSpec, &this->archInfo));
+                    algo = this->pwAlg;
+                    algorithmMap->setAlgorithmInfoToMap(this->name, &algo, 1);
+                }
+                break;
+            }
+            case Convolution_Depthwise: {
+                if (algorithmMap->getAlgorithmInfoFromMap(this->name, &algo, 1)) {
+                    this->dwAlg = (DepthwiseConvolutionForwardAlgorithm)algo;
+                } else {
+                    CHECK_STATUS(depthwise_convolution_infer_forward_algorithm(inputTensor,
+                        filterTensor, outputTensor, p, policy, &(this->dwAlg), targetType,
+                        this->dwActivationParamSpec, &this->archInfo));
+                    algo = this->dwAlg;
+                    algorithmMap->setAlgorithmInfoToMap(this->name, &algo, 1);
+                }
+                break;
+            }
+            case Convolution_Depthwise_Pointwise: {
+                if (algorithmMap->getAlgorithmInfoFromMap(this->name, &algo, 1)) {
+                    this->dwAlg = (DepthwiseConvolutionForwardAlgorithm)algo;
+                } else {
+                    CHECK_STATUS(depthwise_pointwise_convolution_infer_forward_algorithm(
+                        inputTensor, filterTensor, this->weightTensors[1], outputTensor, p, policy,
+                        &(this->dwAlg), targetType, this->dwActivationParamSpec,
+                        this->pwActivationParamSpec, &this->archInfo));
+                    algo = this->dwAlg;
+                    algorithmMap->setAlgorithmInfoToMap(this->name, &algo, 1);
+                }
+                break;
+            }
+            case Convolution_Dilation: {
+                if (algorithmMap->getAlgorithmInfoFromMap(this->name, &algo, 1)) {
+                    this->pwAlg = (ConvolutionForwardAlgorithm)algo;
+                } else {
+                    CHECK_STATUS(convolution_infer_forward_algorithm(inputTensor, filterTensor,
+                        outputTensor, p, policy, &(this->pwAlg), targetType,
+                        this->pwActivationParamSpec, &this->archInfo));
+                    algo = this->pwAlg;
+                    algorithmMap->setAlgorithmInfoToMap(this->name, &algo, 1);
+                }
+                break;
+            }
+            default:
+                CHECK_STATUS(NOT_SUPPORTED);
+        }
+        return SUCCESS;
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        TensorDesc inDim = this->desc_process(inTensors[0]->get_desc());
+        Tensor tmpTensor;
+        tmpTensor.resize(inDim);
+        auto inputTensor = &tmpTensor;
+        auto outputTensor = outTensors[0];
+
+        DataType idt;
+        DataFormat idf;
+        U32 in, ic, ih, iw;
+        CHECK_STATUS(tensor4dGet(inDim, &idt, &idf, &in, &ic, &ih, &iw));
+        if (DF_NCHW == idf && DT_F16_8Q == this->dt && DT_F16 == idt) {
+            this->dt = DT_F16;
+        }
+        this->numChannels = ic;
+        if (this->p.convolution_type == Convolution_Dilation ||
+            this->p.convolution_type == Convolution_Pointwise) {
+            this->numChannels /= this->p.group;
+        }
+
+        Tensor filterTensor;
+        TensorDesc filterDim = tensor4df(this->dt, DF_NCHW, this->p.num_outputs, this->numChannels,
+            this->p.kernel_h, this->p.kernel_w);
+        filterTensor.resize(filterDim);
+
+        DataType targetType = this->dt;
+        if (DT_F16_8Q == this->dt && Convolution_Pointwise == this->p.convolution_type) {
+            targetType = DT_I8;
+        }
+
+        switch (this->p.convolution_type) {
+            case Convolution_Pointwise: {
+                CHECK_STATUS(convolution_infer_output_size(
+                    inputTensor, filterTensor, p, outputTensor, targetType, &this->archInfo));
+                break;
+            }
+            case Convolution_Depthwise: {
+                filterDim.dims[3] = 1;
+                CHECK_STATUS(depthwise_convolution_infer_output_size(
+                    inputTensor, filterTensor, p, outputTensor, targetType, &this->archInfo));
+                break;
+            }
+            case Convolution_Depthwise_Pointwise: {
+                TensorDesc dwFilterDesc = tensor4df(
+                    this->dt, DF_NCHW, 1, this->numChannels, this->p.kernel_h, this->p.kernel_w);
+                TensorDesc pwFilterDesc =
+                    tensor4df(this->dt, DF_NCHW, this->p.num_outputs, this->numChannels, 1, 1);
+                Tensor dwFilterTensor;
+                Tensor pwFilterTensor;
+                dwFilterTensor.resize(dwFilterDesc);
+                pwFilterTensor.resize(pwFilterDesc);
+                CHECK_STATUS(depthwise_pointwise_convolution_infer_output_size(inputTensor,
+                    dwFilterTensor, pwFilterTensor, p, outputTensor, targetType, &this->archInfo));
+                break;
+            }
+            case Convolution_Dilation: {
+                CHECK_STATUS(convolution_infer_output_size(
+                    inputTensor, filterTensor, p, outputTensor, targetType, &this->archInfo));
+                break;
+            }
+            default:
+                CHECK_STATUS(NOT_SUPPORTED);
+        }
+        if (DT_F16_8Q == this->dt && featureScale.size() > 0 && -2 == (featureScale.back())[0]) {
+            TensorDesc outputDesc = outputTensor->get_desc();
+            outputDesc.dt = DT_F16;
+            outputTensor->resize(outputDesc);
+        }
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        auto inputTensor = this->inputTensors[0];
+        TensorDesc inDim = this->desc_process(inputTensor.get_desc());
inputTensor.resize(inDim); + auto filterTensor = this->weightTensors[0]; + TensorDesc filterDesc = filterTensor.get_desc(); + if (DT_F16_8Q == filterDesc.dt) { + filterDesc.dt = DT_I8; + filterTensor.resize(filterDesc); + } + auto outputTensor = this->outputTensors[0]; + + U32 bytes = 0; + switch (this->p.convolution_type) { + case Convolution_Pointwise: { + CHECK_STATUS(convolution_infer_forward_tmp_bytes(inputTensor, filterTensor, + outputTensor, p, this->pwAlg, &bytes, &this->archInfo)); + break; + } + case Convolution_Depthwise: { + CHECK_STATUS(depthwise_convolution_infer_forward_tmp_bytes(inputTensor, + filterTensor, outputTensor, p, this->dwAlg, &bytes, &this->archInfo)); + break; + } + case Convolution_Depthwise_Pointwise: { + CHECK_STATUS(depthwise_pointwise_convolution_infer_forward_tmp_bytes(inputTensor, + filterTensor, this->weightTensors[1], outputTensor, p, this->dwAlg, &bytes, + &this->archInfo)); + break; + } + case Convolution_Dilation: { + CHECK_STATUS(convolution_infer_forward_tmp_bytes(inputTensor, filterTensor, + outputTensor, p, this->pwAlg, &bytes, &this->archInfo)); + break; + } + default: + CHECK_STATUS(NOT_SUPPORTED); + } + return bytes; + } + + U32 infer_filter_transform_bytes(U32 *bytesExtra) + { + auto filterTensor = this->weightTensors[0]; + U32 bytes = 0; + switch (this->p.convolution_type) { + case Convolution_Pointwise: { + CHECK_STATUS(convolution_transform_filter_bytes( + filterTensor, this->p, this->pwAlg, &bytes, &this->archInfo)); + break; + } + case Convolution_Depthwise: { + CHECK_STATUS(depthwise_convolution_transform_filter_bytes( + filterTensor, this->p, this->dwAlg, &bytes, &this->archInfo)); + break; + } + case Convolution_Depthwise_Pointwise: { + CHECK_STATUS(depthwise_pointwise_convolution_transform_filter_bytes(filterTensor, + weightTensors[1], this->p, this->dwAlg, &bytes, bytesExtra, &this->archInfo)); + break; + } + case Convolution_Dilation: { + CHECK_STATUS(convolution_transform_filter_bytes( + filterTensor, this->p, this->pwAlg, &bytes, &this->archInfo)); + break; + } + default: + CHECK_STATUS(NOT_SUPPORTED); + } + return bytes; + } + + EE transform_filter() override + { + Tensor filterTensor = this->weightTensors[0]; + this->wtm = std::shared_ptr(new Tensor()); + + TensorDesc wtmDesc; + if (DT_F16_8Q == this->dt && Convolution_Pointwise == this->p.convolution_type && + CONVOLUTION_ALGORITHM_WINOGRAD == this->pwAlg) { // int8 winograd +#ifdef _USE_INT8 + U32 ftBytes; + CHECK_STATUS(convolution_transform_filter_bytes( + filterTensor, this->p, this->pwAlg, &ftBytes, &this->archInfo)); + + Tensor tFilter; + tFilter.resize(tensor1d(DT_U8, ftBytes)); + tFilter.alloc(); + + // To label as int8 + TensorDesc filterDesc = filterTensor.get_desc(); + filterDesc.dt = DT_F16_8Q; + + filterTensor.resize(filterDesc); + CHECK_STATUS(convolution_transform_filter( + filterTensor, this->p, this->pwAlg, this->temp, &tFilter, &this->archInfo)); + + U32 ftmBytes = ftBytes / bytesOf(DT_F16); + wtm->resize(tensor1d(DT_U8, ftmBytes)); + wtm->alloc(); + + std::shared_ptr fsp((F16 *)operator new(38 * bytesOf(DT_F16))); + this->scales = fsp; + TensorDesc wtmDesc; + CHECK_STATUS(quantize_tensor(tFilter.get_desc(), + ((CpuMemory *)(tFilter.get_memory()))->get_ptr(), &wtmDesc, + ((CpuMemory *)(wtm->get_memory()))->get_ptr(), this->scales.get() + 2)); + wtm->resize(wtmDesc); + } else if (DT_F16_8Q == this->dt && + Convolution_Pointwise == this->p.convolution_type) { // int8 tilegemm + Tensor qFilterTensor; + TensorDesc qDesc = filterTensor.get_desc(); + qDesc.dt = 
DT_I8; + qFilterTensor.resize(qDesc); + qFilterTensor.alloc(); + std::shared_ptr fsp((F16 *)operator new(3 * bytesOf(DT_F16))); + this->scales = fsp; + this->scales.get()[2] = -1; + CHECK_STATUS(quantize_tensor(filterTensor.get_desc(), + ((CpuMemory *)(filterTensor.get_memory()))->get_ptr(), &qDesc, + ((CpuMemory *)(qFilterTensor.get_memory()))->get_ptr(), this->scales.get() + 2)); + + U32 ftmBytes; + qFilterTensor.resize(qDesc); + CHECK_STATUS(convolution_transform_filter_bytes( + qFilterTensor, this->p, this->pwAlg, &ftmBytes, &this->archInfo)); + + wtm->resize(tensor1d(DT_U8, ftmBytes)); + wtm->alloc(); + + // trans filter + CHECK_STATUS(convolution_transform_filter( + qFilterTensor, this->p, this->pwAlg, this->temp, this->wtm.get(), &this->archInfo)); +#endif + } else { // All other cases + U32 bytesExtra; + auto wtmBytes = this->infer_filter_transform_bytes(&bytesExtra); + wtm->resize(tensor1d(DT_U8, wtmBytes)); + wtm->alloc(); + + switch (this->p.convolution_type) { + case Convolution_Pointwise: { + CHECK_STATUS(convolution_transform_filter(filterTensor, this->p, this->pwAlg, + this->temp, this->wtm.get(), &this->archInfo)); + break; + } + case Convolution_Depthwise: { + CHECK_STATUS(depthwise_convolution_transform_filter( + filterTensor, this->p, this->dwAlg, this->wtm.get(), &this->archInfo)); + break; + } + case Convolution_Depthwise_Pointwise: { + Tensor pwTensor; + pwTensor.resize(tensor1d(DT_U8, bytesExtra)); + pwTensor.alloc(); + CHECK_STATUS(depthwise_pointwise_convolution_transform_filter(filterTensor, + weightTensors[1], this->p, this->dwAlg, this->wtm.get(), &pwTensor, + &this->archInfo)); + weightTensors[1] = pwTensor; + break; + } + case Convolution_Dilation: { + CHECK_STATUS(convolution_transform_filter(filterTensor, this->p, this->pwAlg, + this->temp, this->wtm.get(), &this->archInfo)); + break; + } + default: + CHECK_STATUS(NOT_SUPPORTED); + } + } + this->weightTensors[0] = *this->get_wtm(); + return SUCCESS; + } +}; + +#endif // _CONVELTWISEPOOLING_H diff --git a/inference/engine/include/cpu/copy_cpu.hpp b/inference/engine/include/cpu/copy_cpu.hpp new file mode 100644 index 00000000..83f482c3 --- /dev/null +++ b/inference/engine/include/cpu/copy_cpu.hpp @@ -0,0 +1,83 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
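A note for readers tracing the INT8 path through ConvolutionCPU::run above: the two-element scale array it fills encodes a convention that is easy to miss in the diff. Below is a minimal standalone sketch of that convention, with plain types instead of bolt's; the function name and signature are illustrative assumptions read off the code, not engine API.

#include <vector>

// Sketch of the scale convention inferred from ConvolutionCPU::run (an
// assumption from the diff, not a documented API): scales[0] is the input
// quantization scale, scales[1] the requested output scale; -1 asks the kernel
// to derive a scale at runtime, and -2 in the last featureScale entry means
// the output stays FP16.
static void selectConvScales(const std::vector<std::vector<float>> &featureScale,
    bool inputIsFp16, float inputScale, float scales[2])
{
    scales[0] = inputScale;
    if (!featureScale.empty() && featureScale[0][0] > 0) {
        scales[0] = featureScale[0][0];  // calibrated input scale wins
    } else if (inputIsFp16) {
        scales[0] = -1;  // quantize the input on the fly
    }
    if (!featureScale.empty() && featureScale.back()[0] != -2) {
        scales[1] = featureScale.back()[0];  // pin the output to a fixed scale
    } else {
        scales[1] = -1;  // no fixed output scale (or the output stays FP16)
    }
}

The same -2 sentinel reappears in infer_output_tensors_size above, where it switches the output descriptor back to DT_F16.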
+ +#ifndef _COPY_CPU_H +#define _COPY_CPU_H + +#include "copy.hpp" + +class CopyCPU : public Copy { +public: + CopyCPU(DataType dt, CopyParamSpec p) : Copy(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = std::shared_ptr(new CopyCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + Tensor srcTensor = this->inputTensors[0]; + TensorDesc srcDesc = srcTensor.get_desc(); + Tensor dstTensor = this->inputTensors[1]; + TensorDesc dstDesc = dstTensor.get_desc(); + + std::vector input; + input.push_back(((CpuMemory *)(srcTensor.get_memory()))->get_ptr()); + input.push_back(((CpuMemory *)(dstTensor.get_memory()))->get_ptr()); + + U32 batch = srcDesc.dims[srcDesc.nDims - 1]; + U32 copyLength = (this->p.length >= 0) ? this->p.length : tensorNumElements(srcDesc) / batch; + U32 srcBatchStride = (this->p.src_dims[0] >= 0) ? this->p.src_dims[0] + : tensorNumElements(srcDesc) / batch; + U32 srcStride = (this->p.src_dims[0] >= 0) ? this->p.src_dims[1] + : tensorNumElements(srcDesc) / batch; + U32 dstBatchStride = (this->p.dst_dims[0] >= 0) ? this->p.dst_dims[0] + : tensorNumElements(dstDesc) / batch; + U32 dstStride = (this->p.dst_dims[0] >= 0) ? this->p.dst_dims[1] + : tensorNumElements(dstDesc) / batch; + for (U32 i = 0; i < batch; i++) { + U32 srcBlockIndex = 0; + if (this->inputTensors.size() > 2) { + U32 *ptr = (U32 *)((CpuMemory *)(this->inputTensors[2].get_memory()))->get_ptr(); + srcBlockIndex = ptr[i]; + } + U32 dstBlockIndex = 0; + if (this->inputTensors.size() > 3) { + U32 *ptr = (U32 *)((CpuMemory *)(this->inputTensors[3].get_memory()))->get_ptr(); + dstBlockIndex = ptr[i]; + } + U32 srcIndex = i * srcBatchStride + srcBlockIndex * srcStride + this->p.src_dims[2]; + U32 dstIndex = i * dstBatchStride + dstBlockIndex * dstStride + this->p.dst_dims[2]; + CHECK_STATUS( + copy(this->inputTensors, srcIndex, dstIndex, 0, 0, copyLength, &this->archInfo)); + } + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + UNUSED(inTensors); + auto desc = outTensors[0]->get_desc(); + desc.dt = this->dt; + desc.df = getTensorDefaultDataFormat(0); + desc.nDims = 0; + outTensors[0]->resize(desc); + return SUCCESS; + } +}; + +#endif // _COPY_CPU_H diff --git a/inference/engine/include/cpu/deconvolution_cpu.hpp b/inference/engine/include/cpu/deconvolution_cpu.hpp new file mode 100644 index 00000000..1fc71357 --- /dev/null +++ b/inference/engine/include/cpu/deconvolution_cpu.hpp @@ -0,0 +1,150 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _DECONVOLUTION_CPU_H +#define _DECONVOLUTION_CPU_H + +#include "deconvolution.hpp" + +class DeconvolutionCPU : public Deconvolution { +public: + DeconvolutionCPU(DataType dt, ConvolutionParamSpec p, ActivationParamSpec activationDesc) + : Deconvolution(dt, p, activationDesc) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = std::shared_ptr( + new DeconvolutionCPU(this->dt, this->p, this->activationDesc)); + *mem = *this; + return mem; + } + + EE infer_weight_desc() override + { + auto curOpWs = this->get_weightspec(); + DataType filterDt = curOpWs.mdt; // weight data type may not be the same as input and output + if (curOpWs.weight == nullptr) { + filterDt = this->dt; + } + DataType dtNoQ = (this->dt == DT_F16_8Q) ? DT_F16 : this->dt; + CHECK_REQUIREMENT(filterDt != DT_BIN01 && filterDt != DT_BIN11); + DataFormat filterDf = DF_NCHW; + TensorDesc filterTensorDesc = tensor4df(filterDt, filterDf, this->numInputs, + this->p.num_outputs, this->p.kernel_h, this->p.kernel_w); + // bias length + U32 vectorLen = this->numInputs * this->p.group; + // bias data type should be the same as input and output + TensorDesc vectorTensorDesc = tensor1d(dtNoQ, vectorLen); + + this->weightTensors = std::vector(1); + this->weightTensors[0].resize(filterTensorDesc); + this->biasTensors = std::vector(1); + this->biasTensors[0].resize(vectorTensorDesc); + return SUCCESS; + } + + void run() override + { + Tensor inputTensor = this->inputTensors[0]; + Tensor filterTensor = this->weightTensors[0]; + U8 *scalePtr = nullptr; + Tensor biasTensor = this->biasTensors[0]; + Tensor outputTensor = this->outputTensors[0]; + auto filterDesc = filterTensor.get_desc(); + if (filterDesc.dt == DT_BIN01 || filterDesc.dt == DT_BIN11) { + CHECK_STATUS(NOT_SUPPORTED); + } + CHECK_STATUS(deconvolution(inputTensor, filterTensor, p, this->alg, scalePtr, biasTensor, + this->temp, outputTensor, this->activationDesc, &this->archInfo)); + } + + EE infer_forward_algorithm(std::shared_ptr algorithmMap) override + { + ConvolutionPolicy policy = CONVOLUTION_FASTEST; + auto filterDesc = this->weightTensors[0].get_desc(); + DataType targetType = filterDesc.dt; + I32 algo; + if (algorithmMap->getAlgorithmInfoFromMap(this->name, &algo, 1)) { + this->alg = (ConvolutionForwardAlgorithm)algo; + } else { + CHECK_STATUS(deconvolution_infer_forward_algorithm(this->inputTensors[0], + this->weightTensors[0], this->outputTensors[0], p, policy, &(this->alg), targetType, + this->activationDesc, &this->archInfo)); + algo = this->alg; + algorithmMap->setAlgorithmInfoToMap(this->name, &algo, 1); + } + return SUCCESS; + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + auto inputTensor = inTensors[0]; + TensorDesc inDim = inputTensor->get_desc(); + DataType idt; + DataFormat idf; + U32 in, ic, ih, iw; + CHECK_STATUS(tensor4dGet(inDim, &idt, &idf, &in, &ic, &ih, &iw)); + this->numInputs = ic / this->p.group; + + Tensor filterTensor; + TensorDesc filterDim = tensor4df(this->dt, DF_NCHW, this->numInputs, this->p.num_outputs, + this->p.kernel_h, this->p.kernel_w); + filterTensor.resize(filterDim); + + this->p = createConvolutionParamSpec(this->p.group, this->p.kernel_h, this->p.kernel_w, + this->p.stride_h, this->p.stride_w, 
this->p.padding_top, this->p.padding_bottom, + this->p.padding_left, this->p.padding_right, this->p.dilatedRate_h, + this->p.dilatedRate_w, this->p.num_outputs, this->p.convolution_type); + + DataType targetType = this->dt; + if (DT_F16_8Q == this->dt) { + targetType = DT_I8; + } + + CHECK_STATUS(deconvolution_infer_output_size( + inputTensor, filterTensor, p, outTensors[0], targetType, &this->archInfo)); + return SUCCESS; + } + + U32 infer_tmp_memory_size() override + { + U32 bytes = 0; + CHECK_STATUS(deconvolution_infer_forward_tmp_bytes(this->inputTensors[0], + this->weightTensors[0], this->outputTensors[0], p, this->alg, &bytes, &this->archInfo)); + return bytes; + } + + U32 infer_wtm_memory_size() override + { + U32 bytes = 0; + CHECK_STATUS(deconvolution_transform_filter_bytes( + this->weightTensors[0], this->p, this->alg, &bytes, &this->archInfo)); + return bytes; + } + + EE transform_filter() override + { + this->wtm = std::shared_ptr(new Tensor()); + Tensor filterTensor = this->weightTensors[0]; + auto wtmBytes = this->infer_wtm_memory_size(); + Tensor wtm = Tensor::alloc_sized(tensor1d(DT_U8, wtmBytes)); + CHECK_STATUS(deconvolution_transform_filter( + filterTensor, this->p, this->alg, this->temp, &wtm, &this->archInfo)); + this->weightTensors[0] = wtm; + return SUCCESS; + } +}; + +#endif // _DECONVOLUTION_CPU_H diff --git a/inference/engine/include/cpu/eltwise_cpu.hpp b/inference/engine/include/cpu/eltwise_cpu.hpp new file mode 100644 index 00000000..e538eea1 --- /dev/null +++ b/inference/engine/include/cpu/eltwise_cpu.hpp @@ -0,0 +1,81 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
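EltwiseCPU below special-cases an element-wise product whose second operand is a per-channel vector, lowering it onto the faster scale kernel instead of the generic eltwise path. Here is a standalone sketch of that broadcast test, assuming dims[0] is the innermost axis as the diff's index checks suggest; ShapeStub and the function names are stand-ins, not bolt types.

#include <cstddef>
#include <vector>

struct ShapeStub {  // stand-in for TensorDesc; dims[0] is the innermost axis
    std::vector<unsigned> dims;
};

static unsigned numElements(const ShapeStub &t)
{
    unsigned n = 1;
    for (unsigned d : t.dims) {
        n *= d;
    }
    return n;
}

// True when b is a channel vector broadcast over a, so a * b is just a
// per-channel scale; mirrors the condition in EltwiseCPU::run below.
static bool prodLowersToScale(const ShapeStub &a, const ShapeStub &b)
{
    std::size_t nd = b.dims.size();
    bool channelVector = (nd == 2) || (nd == 3 && b.dims[0] == 1) ||
        (nd == 4 && b.dims[0] == 1 && b.dims[1] == 1);
    return channelVector && numElements(a) != numElements(b);
}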
+ +#ifndef _ELTWISE_CPU_H +#define _ELTWISE_CPU_H + +#include "eltwise.hpp" + +class EltwiseCPU : public Eltwise { +public: + EltwiseCPU(EltwiseParamSpec eltwiseDesc) : Eltwise(eltwiseDesc) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new EltwiseCPU(this->eltwiseDesc)); + *mem = *this; + return mem; + } + + void run() override + { + std::vector inputDesc; + for (auto p : this->inputTensors) { + inputDesc.push_back(p.get_desc()); + } + if (this->eltwiseDesc.elt_mode == ELTWISE_PROD && inputDesc.size() == 2 && + (inputDesc[1].nDims == 2 || (inputDesc[1].nDims == 3 && inputDesc[1].dims[0] == 1) || + (inputDesc[1].nDims == 4 && inputDesc[1].dims[0] == 1 && inputDesc[1].dims[1] == 1)) && + tensorNumElements(inputDesc[0]) != tensorNumElements(inputDesc[1])) { + Tensor inTensor = this->inputTensors[1]; + U8 *alpha = (U8 *)((CpuMemory *)(inTensor.get_memory()))->get_ptr(); + ScaleParamSpec scaleParam; + scaleParam.axis = 1; + CHECK_STATUS(scale(this->inputTensors[0], alpha, nullptr, scaleParam, + this->outputTensors[0], &this->archInfo)); + } else { + CHECK_STATUS(eltwise(this->inputTensors, this->eltwiseDesc, this->temp, + this->outputTensors[0], &this->archInfo)); + } + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + std::vector inDims; + for (auto p : inTensors) { + inDims.push_back(p->get_desc()); + } + if (this->eltwiseDesc.elt_mode == ELTWISE_PROD && inDims.size() == 2 && + (inDims[1].nDims == 2 || (inDims[1].nDims == 3 && inDims[1].dims[0] == 1) || + (inDims[1].nDims == 4 && inDims[1].dims[0] == 1 && inDims[1].dims[1] == 1)) && + tensorNumElements(inDims[0]) != tensorNumElements(inDims[1])) { + CHECK_STATUS(scale_infer_output_size(inTensors[0], outTensors[0], &this->archInfo)); + } else { + CHECK_STATUS(eltwise_infer_output_size(inTensors, outTensors[0], &this->archInfo)); + } + return SUCCESS; + } + + U32 infer_tmp_memory_size() override + { + U32 bytes = 0; + CHECK_STATUS(eltwise_infer_forward_tmp_bytes( + this->inputTensors, this->outputTensors[0], &bytes, &this->archInfo)); + return UNI_MAX(bytes, this->lenOfTemp); + } +}; + +#endif // _ELTWISE_CPU_H diff --git a/inference/engine/include/cpu/embedding_cpu.hpp b/inference/engine/include/cpu/embedding_cpu.hpp new file mode 100644 index 00000000..9caa733f --- /dev/null +++ b/inference/engine/include/cpu/embedding_cpu.hpp @@ -0,0 +1,87 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _EMBEDDING_CPU_H +#define _EMBEDDING_CPU_H + +#include "embedding.hpp" + +class EmbeddingCPU : public Embedding { +public: + EmbeddingCPU(DataType dt, EmbedParamSpec p) : Embedding(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new EmbeddingCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + Tensor weightTensor = (this->weightTensors.size()) ? this->weightTensors[0] + : this->inputTensors[1]; + CHECK_STATUS(embedding( + this->inputTensors[0], weightTensor, this->p, this->outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + CHECK_STATUS(embedding_infer_output_size( + inTensors[0], this->p, this->dt, outTensors[0], &this->archInfo)); + return SUCCESS; + } + + EE init_weight_bias_from_model(std::shared_ptr *modelPtrShared) override + { + U8 *modelPtr = nullptr; + if (modelPtrShared != nullptr) { + modelPtr = (*modelPtrShared).get(); + } + TensorDesc weightDesc; + if (this->p.transpose) { + weightDesc = tensor2df(this->dt, DF_TRANSPOSE, this->p.num_output, this->p.input_dim); + } else { + weightDesc = tensor2df(this->dt, DF_NORMAL, this->p.input_dim, this->p.num_output); + } + U32 weightBytes = tensorNumBytes(weightDesc); + + std::shared_ptr modelWeightTensor(new Tensor()); + modelWeightTensor->resize(weightDesc); + + bool set_ptr = false; + if (modelPtr != nullptr) { + modelWeightTensor->alloc(); + memcpy( + ((CpuMemory *)(modelWeightTensor->get_memory()))->get_ptr(), modelPtr, weightBytes); + *modelPtrShared = std::shared_ptr(*modelPtrShared, modelPtr + weightBytes); + set_ptr = true; + } else { + auto curOpWs = this->get_weightspec(); + if (curOpWs.weight != nullptr) { + ((CpuMemory *)(modelWeightTensor->get_memory())) + ->set_shared_ptr(std::shared_ptr(curOpWs.weight)); + set_ptr = true; + } + } + if (set_ptr) { + this->weightTensors.push_back(*modelWeightTensor.get()); + } + return SUCCESS; + } +}; + +#endif // _EMBEDDING_CPU_H diff --git a/inference/engine/include/cpu/factory_cpu.hpp b/inference/engine/include/cpu/factory_cpu.hpp new file mode 100644 index 00000000..44a30ae7 --- /dev/null +++ b/inference/engine/include/cpu/factory_cpu.hpp @@ -0,0 +1,366 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _FACTORY_CPU_H +#define _FACTORY_CPU_H + +#include "factory.hpp" +#include "attention.hpp" +#include "reduction.hpp" +#include "jump.hpp" +#include "cpu/resize_cpu.hpp" +#include "cpu/pooling_cpu.hpp" +#include "cpu/convolution_cpu.hpp" +#include "cpu/deconvolution_cpu.hpp" +#include "cpu/eltwise_cpu.hpp" +#include "cpu/softmax_cpu.hpp" +#include "cpu/activation_cpu.hpp" +#include "cpu/fully_connected_cpu.hpp" +#include "cpu/scale_cpu.hpp" +#include "cpu/concat_cpu.hpp" +#include "cpu/clip_cpu.hpp" +#include "cpu/squeeze_cpu.hpp" +#include "cpu/reshape_cpu.hpp" +#include "cpu/embedding_cpu.hpp" +#include "cpu/layer_norm_cpu.hpp" +#include "cpu/matmul_cpu.hpp" +#include "cpu/power_cpu.hpp" +#include "cpu/transpose_cpu.hpp" +#include "cpu/slice_cpu.hpp" +#include "cpu/shared_weight_cpu.hpp" +#include "cpu/repeat_cpu.hpp" +#include "cpu/copy_cpu.hpp" +#include "cpu/check_cpu.hpp" +#include "cpu/preallocated_memory_cpu.hpp" +#include "cpu/argmax_cpu.hpp" +#include "cpu/unsqueeze_cpu.hpp" +#include "cpu/rnncell_cpu.hpp" +#include "cpu/rnn_cpu.hpp" +#include "cpu/padding_cpu.hpp" +#include "attention_mask.hpp" +#include "relative_position_embedding.hpp" +#include "relative_shift.hpp" +#include "detection_output.hpp" +#include "prior_box.hpp" +#include "yolov3_detection_output.hpp" +#include "cpu/channel_resize_cpu.hpp" +#include "cpu/l2normalization_cpu.hpp" +#include "cpu/tile_cpu.hpp" +#include "cpu/prelu_cpu.hpp" +#include "cpu/tfslice_cpu.hpp" +#include "cpu/splice_cpu.hpp" +#include "cpu/shape_cpu.hpp" + +class FactoryCPU : public Factory { +public: + std::shared_ptr createConvolution(DataType dt, + ConvolutionParamSpec p, + ActivationParamSpec dwActivationParamSpec, + ActivationParamSpec pwActivationParamSpec) override + { + auto cep = + (Convolution *)(new ConvolutionCPU(dt, p, dwActivationParamSpec, pwActivationParamSpec)); + return std::shared_ptr(cep); + } + + std::shared_ptr createDeconvolution( + DataType dt, ConvolutionParamSpec p, ActivationParamSpec activationDesc) override + { + auto cep = new DeconvolutionCPU(dt, p, activationDesc); + return std::shared_ptr(cep); + } + + std::shared_ptr createPooling(PoolingParamSpec p) override + { + auto cep = (Pooling *)(new PoolingCPU(p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createFullyConnected( + DataType dt, FullyConnectedParamSpec p, U32 numInput) override + { + auto cep = (FullyConnected *)(new FullyConnectedCPU(dt, p, numInput)); + return std::shared_ptr(cep); + } + + std::shared_ptr createSoftmax(DataType dt, SoftmaxParamSpec p) override + { + auto cep = new SoftmaxCPU(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createConcat(ConcatParamSpec p) override + { + auto cep = (Concat *)(new ConcatCPU(p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createActivation(ActivationParamSpec activeDesc) override + { + auto cep = (Activation *)new ActivationCPU(activeDesc); + return std::shared_ptr(cep); + } + + std::shared_ptr createEltwise(EltwiseParamSpec eltwiseDesc) override + { + auto cep = (Eltwise *)new EltwiseCPU(eltwiseDesc); + return std::shared_ptr(cep); + } + + std::shared_ptr createScale(DataType dt, ScaleParamSpec p, int numChannels) override + { + auto cep = (Scale *)(new ScaleCPU(dt, p, numChannels)); + return 
std::shared_ptr(cep); + } + + std::shared_ptr createRNN(DataType dt, RNNParamSpec p) override + { + auto cep = (RNNCell *)new RNNCPU(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createRNNCell(DataType dt, RNNParamSpec p) override + { + auto cep = (RNNCell *)new RNNCellCPU(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createEmbedding(DataType dt, EmbedParamSpec p) override + { + auto cep = (Embedding *)(new EmbeddingCPU(dt, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createPower(DataType dt, PowerParamSpec p) override + { + auto cep = (Power *)(new PowerCPU(dt, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createMatMul(DataType dt, MatMulParamSpec p) override + { + auto cep = (MatMul *)(new MatMulCPU(dt, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createLayerNorm(DataType dt, U32 weightNum) override + { + auto cep = (LayerNorm *)(new LayerNormCPU(dt, weightNum)); + return std::shared_ptr(cep); + } + + std::shared_ptr createReshape(DataType dt, ReshapeParamSpec p) override + { + auto cep = (Reshape *)(new ReshapeCPU(dt, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createResize(DataType paramDT, ResizeParamSpec p) override + { + auto cep = (Resize *)(new ResizeCPU(paramDT, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createSlice(DataType dt, SliceParamSpec p) override + { + auto cep = (Slice *)(new SliceCPU(dt, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createTranspose(DataType dt, TransposeParamSpec p) override + { + auto cep = (Transpose *)new TransposeCPU(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createAttention(DataType dt, AttentionParamSpec p) override + { + auto cep = new Attention(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createClip(DataType dt, ClipParamSpec p) override + { + auto cep = (Clip *)(new ClipCPU(dt, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createSqueeze(DataType dt, SqueezeParamSpec p) override + { + auto cep = (Squeeze *)(new SqueezeCPU(dt, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createUnsqueeze(DataType dt, UnsqueezeParamSpec p) override + { + auto cep = (Unsqueeze *)new UnsqueezeCPU(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createReduction(DataType dt, ReductionParamSpec p) override + { + auto cep = new Reduction(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createArgMax(DataType dt, ArgMaxParamSpec p) override + { + auto cep = (ArgMax *)new ArgMaxCPU(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createCopy(DataType dt, CopyParamSpec p) override + { + auto cep = (Copy *)new CopyCPU(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createCheck(DataType dt, CheckParamSpec p) override + { + auto cep = (Check *)new CheckCPU(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createRepeat( + DataType dt, RepeatParamSpec p, I32 jumpOperatorIndex, I32 currentOperatorIndex) override + { + auto cep = (Repeat *)new RepeatCPU(dt, p, jumpOperatorIndex, currentOperatorIndex); + return std::shared_ptr(cep); + } + + std::shared_ptr createBilateralSliceApply(BilateralSliceApplyParamSpec p) override + { + OP_UNSUP(1, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createPreAllocatedMemory(DataType dt, TensorDesc desc) override + { + auto cep = (PreAllocatedMemory *)new PreAllocatedMemoryCPU(dt, desc); + return std::shared_ptr(cep); + } + + std::shared_ptr createSharedWeight(DataType dt, + 
TensorDesc desc, + std::string outputTensorName, + std::map> *tensorMapPtr) override + { + auto cep = (SharedWeight *)new SharedWeightCPU(dt, desc, outputTensorName, tensorMapPtr); + return std::shared_ptr(cep); + } + + std::shared_ptr createJump( + DataType dt, I32 jumpOperatorIndex, I32 currentOperatorIndex) override + { + auto cep = new Jump(dt, jumpOperatorIndex, currentOperatorIndex); + return std::shared_ptr(cep); + } + + std::shared_ptr createSpace2Depth(DataType dt) override + { + OP_UNSUP(1, dt); + return std::shared_ptr(cep); + } + + std::shared_ptr createDepth2Space(DataType dt, Depth2SpaceParamSpec p) override + { + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createPReLU(DataType dt) override + { + auto cep = new PReLUCPU(dt); + return std::shared_ptr(cep); + } + + std::shared_ptr createAttentionMask(DataType dt, AttentionMaskParamSpec p) override + { + auto cep = new AttentionMask(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createRelativePositionEmbedding(DataType dt, EmbedParamSpec p) override + { + auto cep = new RelativePositionEmbedding(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createRelativeShift(DataType dt, RelativeShiftParamSpec p) override + { + auto cep = new RelativeShift(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createPadding(DataType dt, PadParamSpec p) override + { + auto cep = (Padding *)(new PaddingCPU(dt, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createPriorBox(DataType dt, PriorBoxParamSpec p) override + { + auto cep = new PriorBox(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createDetectionOutput(DataType dt, DetectionOutputParamSpec p) override + { + auto cep = new DetectionOutput(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createYolov3DetectionOutput( + DataType dt, Yolov3DetectionOutputParamSpec p) override + { + auto cep = new Yolov3DetectionOutput(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createChannelResize(DataType dt, ChannelResizeParamSpec p) override + { + auto cep = new ChannelResizeCPU(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createL2Normalization(DataType dt) override + { + auto cep = new L2NormalizationCPU(dt); + return std::shared_ptr(cep); + } + + std::shared_ptr createTile(DataType dt, TileParamSpec p) override + { + auto cep = new TileCPU(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createTfSlice(DataType dt, TfSliceParamSpec p) override + { + auto cep = new TfSliceCPU(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createSplice(DataType dt, SpliceParamSpec p) override + { + auto cep = new SpliceCPU(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createShape() override + { + auto cep = new ShapeCPU(); + return std::shared_ptr(cep); + } +}; +#endif // _FACTORY_CPU_H diff --git a/inference/engine/include/cpu/fully_connected_cpu.hpp b/inference/engine/include/cpu/fully_connected_cpu.hpp new file mode 100644 index 00000000..9d76e972 --- /dev/null +++ b/inference/engine/include/cpu/fully_connected_cpu.hpp @@ -0,0 +1,266 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _FULLY_CONNECTED_CPU_H +#define _FULLY_CONNECTED_CPU_H + +#include "fully_connected.hpp" +#include "blas_enhance.h" + +class FullyConnectedCPU : public FullyConnected { +public: + FullyConnectedCPU(DataType dt, FullyConnectedParamSpec p, U32 numInput) + : FullyConnected(dt, p, numInput) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = std::shared_ptr( + new FullyConnectedCPU(this->dt, this->p, this->numInput)); + *mem = *this; + return mem; + } + + EE infer_weight_desc() override + { + DataType dtNoQ = (DT_F16_8Q == this->dt) ? DT_F16 : this->dt; + this->weightTensors = std::vector(1); + this->weightTensors[0].resize( + tensor2df(dtNoQ, DF_TRANSPOSE, this->p.num_outputs, this->numInput)); + this->biasTensors = std::vector(1); + this->biasTensors[0].resize(tensor1d(dtNoQ, this->p.num_outputs)); + return SUCCESS; + } + + TensorDesc desc_process(TensorDesc inDim) + { + TensorDesc inputDesc; + DataType dt; + DataFormat df; + U32 in, ic, ih, iw; + switch (inDim.nDims) { + case 2: { + CHECK_STATUS(tensor2dGet(inDim, &dt, &df, &in, &(this->numInput))); + inputDesc = inDim; + break; + } + case 3: { + CHECK_STATUS(tensor3dGet(inDim, &dt, &df, &in, &ih, &iw)); + this->numInput = iw; + inputDesc = tensor2df(dt, DF_NORMAL, in * ih, iw); + break; + } + case 4: { + CHECK_STATUS(tensor4dGet(inDim, &dt, &df, &in, &ic, &ih, &iw)); + this->numInput = ic * ih * iw; + inputDesc = inDim; + break; + } + default: + break; + } + return inputDesc; + } + + TensorDesc desc_process_reverse(TensorDesc inDim, TensorDesc outDim) + { + TensorDesc outDesc; + DataType dt; + DataFormat df; + U32 in, ih, iw; + switch (inDim.nDims) { + case 2: { + outDesc = outDim; + break; + } + case 3: { + CHECK_STATUS(tensor3dGet(inDim, &dt, &df, &in, &ih, &iw)); + outDesc = tensor3df(dt, df, in, ih, this->p.num_outputs); + break; + } + case 4: { + outDesc = outDim; + break; + } + default: + break; + } + return outDesc; + } + + void run() override + { + Tensor inputTensor = this->inputTensors[0]; + TensorDesc inputDesc = desc_process(inputTensor.get_desc()); + inputTensor.resize(inputDesc); + + Tensor outputTensor = this->outputTensors[0]; + TensorDesc outputDesc = outputTensor.get_desc(); + outputDesc.dims[0] = this->p.num_outputs; + outputDesc = desc_process(outputDesc); + outputTensor.resize(outputDesc); + + if (featureScale.size() > 1 && featureScale[0][0] > 0 && DT_I8 != inputDesc.dt) { + inputTensor.set_scale(featureScale[0][0]); + } + + 
CHECK_STATUS(fully_connected(inputTensor, weightTensors[0], biasTensors[0], this->temp, + outputTensor, &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->mvm = false; + TensorDesc inputDesc = desc_process(inTensors[0]->get_desc()); + TensorDesc weightDesc = + tensor2df(inputDesc.dt, DF_TRANSPOSE, this->p.num_outputs, this->numInput); + TensorDesc outputDesc; + + DataType idt; + DataFormat idf; + U32 in = 0, ic, ih, iw; + if (tensorIs2d(inputDesc)) { + CHECK_STATUS(tensor2dGet(inputDesc, &idt, &idf, &in, &iw)); + } else if (tensorIs4d(inputDesc)) { + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + } else { + CHECK_STATUS(NOT_MATCH); + } + if (1 == in) { + this->mvm = true; + } + + Tensor tmpInput = *inTensors[0]; + tmpInput.resize(inputDesc); + Tensor tmpFilter; + tmpFilter.resize(weightDesc); + CHECK_STATUS( + fully_connected_infer_output_size(&tmpInput, tmpFilter, outTensors[0], &this->archInfo)); + if (1 == this->p.num_slices) { + outputDesc = outTensors[0]->get_desc(); + outputDesc = desc_process_reverse(inTensors[0]->get_desc(), outputDesc); + if (DT_F16_8Q == this->dt) { + if (featureScale.size() > 0 && -2 == (featureScale.back())[0]) { + outputDesc.dt = DT_F16; + } else { + outputDesc.dt = DT_I8; + } + } + outTensors[0]->resize(outputDesc); + } else { + UNI_ERROR_LOG("FC merge is deprecated\n"); + outputDesc = desc_process_reverse(inTensors[0]->get_desc(), outputDesc); + for (U32 i = 0; i < this->p.num_slices; i++) { + outputDesc.dims[0] = this->p.slice_point[i]; + if (DT_F16_8Q == this->dt) { + if (featureScale.size() > 0 && -2 == (featureScale.back())[0]) { + outputDesc.dt = DT_F16; + } else { + outputDesc.dt = DT_I8; + } + } + } + } + return SUCCESS; + } + + U32 infer_tmp_memory_size() override + { + TensorDesc inputDesc = desc_process((this->inputTensors[0]).get_desc()); + U32 bytes = 0; + + Tensor tmpInput, tmpFilter; + tmpInput.resize(inputDesc); + + CHECK_STATUS(fully_connected_infer_forward_tmp_bytes( + tmpInput, weightTensors[0], &bytes, &this->archInfo)); + return bytes; + } + + U32 infer_wtm_memory_size() override + { + U32 bytes = 0; + CHECK_STATUS( + fully_connected_transform_filter_bytes(weightTensors[0], &bytes, &this->archInfo)); + return bytes; + } + + EE transform_filter() override + { + TensorDesc inputDesc = desc_process(this->inputTensors[0].get_desc()); + Tensor weightTensor = this->weightTensors[0]; + TensorDesc weightDesc = weightTensor.get_desc(); + TensorDesc wtmDesc; + auto wtm_bytes = this->infer_wtm_memory_size(); + + TensorDesc tmpDesc; + Tensor tmpFilter; + + Tensor tmpInput; + tmpInput.resize(inputDesc); + if (inputDesc.df == DF_NCHWC8) { + tmpFilter.resize(tensor1d(DT_U8, wtm_bytes)); + tmpFilter.alloc(); + CHECK_STATUS(fully_connected_transform_filter( + tmpInput, weightTensors[0], &tmpFilter, &this->archInfo)); + } else { + tmpDesc = weightDesc; + if (this->mvm) { + tmpDesc.df = DF_NORMAL; + } + tmpFilter = weightTensor; + tmpFilter.resize(tmpDesc); + } + +#ifdef _USE_INT8 + if (DT_F16_8Q == this->dt) { + std::shared_ptr qFilter = std::shared_ptr( + (U8 *)operator new(bytesOf(DT_I8) * tensorNumElements(tmpDesc))); + + F16 scale = -1; + F16 *inD = (F16 *)((CpuMemory *)(tmpFilter.get_memory()))->get_ptr(); + CHECK_STATUS( + quantize_tensor(tmpFilter.get_desc(), inD, &tmpDesc, qFilter.get(), &scale)); + tmpFilter.resize(tmpDesc); + ((CpuMemory *)(tmpFilter.get_memory()))->set_shared_ptr(qFilter); + tmpFilter.set_scale(scale); + } +#endif + this->wtm = 
std::shared_ptr(new Tensor()); + wtm->resize(tensor1d(DT_U8, wtm_bytes)); + wtm->alloc(); + wtm->set_scale(tmpFilter.get_scale()); + if (this->mvm) { + if (X86_AVX2 != this->archInfo.arch) { + CHECK_STATUS(matrix_vector_multiply_transform_weight(tmpFilter.get_desc(), + ((CpuMemory *)(tmpFilter.get_memory()))->get_ptr(), &wtmDesc, + ((CpuMemory *)(wtm->get_memory()))->get_ptr(), this->archInfo.arch)); + wtm->resize(wtmDesc); + } else { + *wtm.get() = tmpFilter; + } + } else { + CHECK_STATUS(matrix_matrix_multiply_transform_rhs(tmpFilter.get_desc(), + ((CpuMemory *)(tmpFilter.get_memory()))->get_ptr(), &wtmDesc, + ((CpuMemory *)(wtm->get_memory()))->get_ptr(), this->archInfo.arch)); + wtm->resize(wtmDesc); + } + this->weightTensors[0] = *this->get_wtm(); + return SUCCESS; + } + + bool mvm; +}; + +#endif // _FULLY_CONNECTED_CPU_H diff --git a/inference/engine/include/cpu/l2normalization_cpu.hpp b/inference/engine/include/cpu/l2normalization_cpu.hpp new file mode 100644 index 00000000..9cdb463f --- /dev/null +++ b/inference/engine/include/cpu/l2normalization_cpu.hpp @@ -0,0 +1,47 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _L2NORMALIZATION_CPU_H +#define _L2NORMALIZATION_CPU_H + +#include "l2normalization.hpp" + +class L2NormalizationCPU : public L2Normalization { +public: + L2NormalizationCPU(DataType dt) : L2Normalization(dt) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new L2NormalizationCPU(this->dt)); + *mem = *this; + return mem; + } + + void run() override + { + CHECK_STATUS( + l2normalization(this->inputTensors[0], this->outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + CHECK_STATUS( + l2normalization_infer_output_size(inTensors[0], outTensors[0], &this->archInfo)); + return SUCCESS; + } +}; + +#endif // _L2NORMALIZATION_CPU_H diff --git a/inference/engine/include/cpu/layer_norm_cpu.hpp b/inference/engine/include/cpu/layer_norm_cpu.hpp new file mode 100644 index 00000000..3f34e1b3 --- /dev/null +++ b/inference/engine/include/cpu/layer_norm_cpu.hpp @@ -0,0 +1,67 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _LAYER_NORM_CPU_H +#define _LAYER_NORM_CPU_H + +#include "layer_norm.hpp" + +class LayerNormCPU : public LayerNorm { +public: + LayerNormCPU(DataType dt, U32 weightNum) : LayerNorm(dt, weightNum) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new LayerNormCPU(this->dt, this->weightNum)); + *mem = *this; + return mem; + } + + EE infer_weight_desc() override + { + auto curOpWs = this->get_weightspec(); + DataType dtNoQ = (DT_F16_8Q == this->dt) ? DT_F16 : this->dt; + if (0 != curOpWs.bytes_of_weight) { + this->weightNum = curOpWs.bytes_of_weight / bytesOf(curOpWs.mdt); + } + Tensor weightTensor; + weightTensor.resize(tensor1d(dtNoQ, this->weightNum)); + this->weightTensors.push_back(weightTensor); + Tensor biasTensor; + biasTensor.resize(tensor1d(dtNoQ, this->weightNum)); + this->biasTensors.push_back(biasTensor); + return SUCCESS; + } + + void run() override + { + Tensor inputTensor = this->inputTensors[0]; + Tensor weightTensor = this->weightTensors[0]; + Tensor biasTensor = this->biasTensors[0]; + Tensor outputTensor = this->outputTensors[0]; + + CHECK_STATUS(layer_normalization( + inputTensor, weightTensor, biasTensor, outputTensor, &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + CHECK_STATUS(normalization_infer_output_size(inTensors[0], outTensors[0], &this->archInfo)); + return SUCCESS; + } +}; + +#endif // _LAYER_NORM_CPU_H diff --git a/inference/engine/include/cpu/matmul_cpu.hpp b/inference/engine/include/cpu/matmul_cpu.hpp new file mode 100644 index 00000000..4ea44af0 --- /dev/null +++ b/inference/engine/include/cpu/matmul_cpu.hpp @@ -0,0 +1,72 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _MATMUL_CPU_H +#define _MATMUL_CPU_H + +#include "matmul.hpp" + +class MatMulCPU : public MatMul { +public: + MatMulCPU(DataType dt, MatMulParamSpec p) : MatMul(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new MatMulCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + Tensor inputTensorA = this->inputTensors[0]; + TensorDesc inputDescA = inputTensorA.get_desc(); + Tensor inputTensorB = this->inputTensors[1]; + TensorDesc inputDescB = inputTensorB.get_desc(); + Tensor outputTensor = this->outputTensors[0]; + if (3 == featureScale.size() && featureScale[0][0] > 0 && DT_I8 != inputDescA.dt) { + inputTensorA.set_scale(featureScale[0][0]); + } + if (3 == featureScale.size() && featureScale[1][0] > 0 && DT_I8 != inputDescB.dt) { + inputTensorB.set_scale(featureScale[1][0]); + } + + CHECK_STATUS(matmul(inputTensors[0], this->p.transpose_a, inputTensors[1], + this->p.transpose_b, this->temp, outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + CHECK_STATUS(matmul_infer_output_size(inTensors[0], this->p.transpose_a, inTensors[1], + this->p.transpose_b, outTensors[0], &this->archInfo)); + if (DT_F16_8Q == this->dt && featureScale.size() > 0 && -2 == (featureScale.back())[0]) { + auto outDesc = outTensors[0]->get_desc(); + outDesc.dt = DT_F16; + outTensors[0]->resize(outDesc); + } + return SUCCESS; + } + + U32 infer_tmp_memory_size() override + { + U32 bytes = 0; + CHECK_STATUS(matmul_infer_forward_tmp_bytes(inputTensors[0], this->p.transpose_a, + inputTensors[1], this->p.transpose_b, &bytes, &this->archInfo)); + return bytes; + } +}; + +#endif // _MATMUL_CPU_H diff --git a/inference/engine/include/cpu/padding_cpu.hpp b/inference/engine/include/cpu/padding_cpu.hpp new file mode 100644 index 00000000..ac4979db --- /dev/null +++ b/inference/engine/include/cpu/padding_cpu.hpp @@ -0,0 +1,48 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _PADDING_CPU_H +#define _PADDING_CPU_H + +#include "padding.hpp" + +class PaddingCPU : public Padding { +public: + PaddingCPU(DataType dt, PadParamSpec p) : Padding(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new PaddingCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + Tensor inputTensor = this->inputTensors[0]; + Tensor outputTensor = this->outputTensors[0]; + CHECK_STATUS(padding(inputTensor, this->p, outputTensor, &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + CHECK_STATUS( + padding_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); + return SUCCESS; + } +}; + +#endif // _PADDING_CPU_H diff --git a/inference/include/cpu/memory_cpu.hpp b/inference/engine/include/cpu/pooling_cpu.hpp similarity index 50% rename from inference/include/cpu/memory_cpu.hpp rename to inference/engine/include/cpu/pooling_cpu.hpp index 365aca01..c2a058f5 100644 --- a/inference/include/cpu/memory_cpu.hpp +++ b/inference/engine/include/cpu/pooling_cpu.hpp @@ -11,63 +11,47 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +#ifndef _POOLING_CPU_H +#define _POOLING_CPU_H -#ifndef _MEMORY_CPU_H -#define _MEMORY_CPU_H -#include -#include -#include -#include "memory.hpp" +#include "pooling.hpp" -class CpuMemory : public Memory_ -{ +class PoolingCPU : public Pooling { public: - CpuMemory(){ - len = 0; - type = CPUMem; - } - virtual ~CpuMemory() = default; + PoolingCPU(PoolingParamSpec p) : Pooling(p) + {} - virtual void alloc(TensorDesc desc) override + std::shared_ptr clone() override { - U32 size = tensorNumBytes(desc); - if (len < size) { - this->val = std::shared_ptr((U8*)operator new(size)); - len = size; - } + std::shared_ptr mem = std::shared_ptr(new PoolingCPU(this->p)); + *mem = *this; + return mem; } - virtual void alloc(U32 size) override + void run() override { - if (len < size) { - this->val = std::shared_ptr((U8*)operator new(size)); - len = size; - } - } - - virtual void set_val_by_copy(TensorDesc desc, U8* ptr) override { - memcpy(val.get(), ptr, tensorNumBytes(desc)); - } - - virtual void* get_val() override{ - return this->val.get(); - }; - - virtual MemoryType get_mem_type() override{ - return type; + CHECK_STATUS(pooling( + this->inputTensors[0], this->p, this->temp, this->outputTensors[0], &this->archInfo)); } - virtual void set_shared_ptr(PtrCasterShared val) override{ - this->val = val; + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + if (this->p.kernel_h == 0 && this->p.kernel_w == 0) { + Pooling::set_stride(1, 1); + } + CHECK_STATUS( + pooling_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); + return SUCCESS; + } - virtual std::shared_ptr get_shared_ptr() override{ - return val; + U32 infer_tmp_memory_size() override + { + U32 size = 0; + CHECK_STATUS(pooling_infer_forward_tmp_bytes( + this->inputTensors[0], this->outputTensors[0], &size, &this->archInfo)); + return size; } - -private: -
std::shared_ptr val; - U32 len; - MemoryType type; }; -#endif + +#endif // _POOLING_CPU_H diff --git a/inference/include/cpu/multiply_cpu.hpp b/inference/engine/include/cpu/power_cpu.hpp similarity index 50% rename from inference/include/cpu/multiply_cpu.hpp rename to inference/engine/include/cpu/power_cpu.hpp index 2f7e27fc..aace8126 100644 --- a/inference/include/cpu/multiply_cpu.hpp +++ b/inference/engine/include/cpu/power_cpu.hpp @@ -1,63 +1,60 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
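PowerCPU::run below takes a shortcut on quantized inputs: a pure scaling y = x * scale never has to touch the INT8 payload, because the multiplier can be folded into the tensor's dequantization scale (hence the requirement that shift be zero). A hedged standalone sketch of that folding, with illustrative names rather than bolt API:

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>

// Returns the output dequantization scale; the int8 payload is copied verbatim.
// Mirrors the INT8 branch of PowerCPU::run below; not a bolt function.
static float foldPowerScaleInt8(float inputScale, float powerScale, float powerShift,
    const int8_t *in, int8_t *out, std::size_t bytes)
{
    assert(powerShift == 0);  // the shortcut only holds without an additive shift
    if (in != out) {
        std::memcpy(out, in, bytes);  // same quantized values, new interpretation
    }
    // q = x * scaleIn and y = x * powerScale give scaleOut = scaleIn / powerScale
    return inputScale / powerScale;
}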
+#ifndef _POWER_CPU_H +#define _POWER_CPU_H -/** - * Project deploy - */ -#ifndef _MULTIPLY_CPU_H -#define _MULTIPLY_CPU_H -#include "operator.hpp" -#include "tensor_computing.h" -#include "multiply.hpp" +#include "power.hpp" -class MultiplyCPU: public Multiply { +class PowerCPU : public Power { public: - MultiplyCPU(DataType dt, F32 scale, F32 bias) : Multiply(dt, scale, bias) {} + PowerCPU(DataType dt, PowerParamSpec p) : Power(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = std::shared_ptr(new PowerCPU(this->dt, this->p)); + *mem = *this; + return mem; + } void run() override { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - Tensor inputTensor = this->inputTensors[0]; + Tensor inputTensor = this->inputTensors[0]; TensorDesc inputDesc = inputTensor.get_desc(); Tensor outputTensor = this->outputTensors[0]; - TensorDesc output_desc = outputTensor.get_desc(); if (DT_I8 == inputDesc.dt) { #ifdef _USE_INT8 - CHECK_REQUIREMENT(0 == this->beta); - F32 scaleO = inputTensor.get_scale() / this->alpha; + CHECK_REQUIREMENT(0 == this->p.shift); + F32 scaleO = inputTensor.get_scale() / this->p.scale; outputTensor.set_scale(scaleO); - U8 *inPtr = inputTensor.get_val(); - U8 *outPtr = outputTensor.get_val(); + auto inPtr = ((CpuMemory *)(inputTensor.get_memory()))->get_ptr(); + auto outPtr = ((CpuMemory *)(outputTensor.get_memory()))->get_ptr(); if (inPtr != outPtr) { memcpy(outPtr, inPtr, tensorNumBytes(inputDesc)); - } + } #endif } else { - CHECK_STATUS(multiply(&(this->alpha), &(this->beta), - inputDesc, inputTensor.get_val(), - output_desc, outputTensor.get_val(), this->schedule)); + CHECK_STATUS(power(inputTensor, this->p, outputTensor, &this->archInfo)); } - UTIL_TIME_TOC(__CLASS_FUNCTION__) } - - EE infer_output_tensors_size(Vec inDims, Vec* outDims) override + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override { - CHECK_STATUS(multiply_infer_output_size(inDims[0], &((*outDims)[0]), this->schedule)); - return SUCCESS; + return power_infer_output_size(inTensors[0], outTensors[0], &this->archInfo); } }; -#endif //_MULTIPLY_CPU_H +#endif // _POWER_CPU_H diff --git a/inference/engine/include/cpu/preallocated_memory_cpu.hpp b/inference/engine/include/cpu/preallocated_memory_cpu.hpp new file mode 100644 index 00000000..92a0fab5 --- /dev/null +++ b/inference/engine/include/cpu/preallocated_memory_cpu.hpp @@ -0,0 +1,48 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
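Every *_cpu.hpp added in this patch, including PreAllocatedMemoryCPU below, implements the same small operator surface: a deep clone() so one compiled graph can be instantiated per thread, shape inference from input to output descriptors, optional scratch-size queries, and run() for the forward pass. A distilled skeleton with placeholder types follows; the real base classes live elsewhere in the engine, so this sketches the shape of the interface rather than reproducing it.

#include <memory>
#include <vector>

struct TensorStub {};  // placeholder for the engine's Tensor

class OperatorSketch {
public:
    virtual ~OperatorSketch() = default;
    // deep copy of the operator, used when cloning a pipeline
    virtual std::shared_ptr<OperatorSketch> clone() = 0;
    // propagate shapes from input descriptors to output descriptors
    virtual int infer_output_tensors_size(
        std::vector<TensorStub *> in, std::vector<TensorStub *> out) = 0;
    // scratch memory needed by run(); 0 when the op works in place
    virtual unsigned infer_tmp_memory_size() { return 0; }
    // the forward computation over the bound input/output tensors
    virtual void run() = 0;

protected:
    std::vector<TensorStub> inputTensors, outputTensors;
};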
+ +#ifndef _PREALLOCATED_MEMORY_CPU_H +#define _PREALLOCATED_MEMORY_CPU_H + +#include "preallocated_memory.hpp" + +class PreAllocatedMemoryCPU : public PreAllocatedMemory { +public: + PreAllocatedMemoryCPU(DataType dt, TensorDesc desc) : PreAllocatedMemory(dt, desc) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new PreAllocatedMemoryCPU(this->dt, this->desc)); + *mem = *this; + return mem; + } + + void run() override + { + CHECK_STATUS(preallocated_memory(this->outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + if (inTensors.size() > 0) { + CHECK_STATUS(NOT_MATCH); + } + outTensors[0]->resize(this->desc); + return SUCCESS; + } +}; + +#endif // _PREALLOCATED_MEMORY_CPU_H diff --git a/inference/engine/include/cpu/prelu_cpu.hpp b/inference/engine/include/cpu/prelu_cpu.hpp new file mode 100644 index 00000000..5f81a573 --- /dev/null +++ b/inference/engine/include/cpu/prelu_cpu.hpp @@ -0,0 +1,65 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
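PreAllocatedMemoryCPU uses the same cloning idiom as every operator in this patch: construct a fresh object from the configuration fields, then copy-assign so run-time members (bound tensors, indices) carry over too. A self-contained illustration with hypothetical stand-in types, not the engine's real Operator hierarchy:

#include <memory>
#include <vector>

struct Op {
    virtual ~Op() {}
    virtual std::shared_ptr<Op> clone() = 0;
    std::vector<int> boundTensorIds;  // run-time state picked up by copy-assign
};

struct MyOp : Op {
    explicit MyOp(int param) : param(param) {}
    std::shared_ptr<Op> clone() override
    {
        // Two-step idiom: the constructor sets configuration, the
        // copy-assignment copies whatever state accumulated since.
        std::shared_ptr<MyOp> mem(new MyOp(this->param));
        *mem = *this;
        return mem;
    }
    int param;
};

The copy-assignment is what makes cloning after tensor binding safe; constructing alone would lose the bound state.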
+ +#ifndef _PRELU_CPU_H +#define _PRELU_CPU_H + +#include "prelu.hpp" + +class PReLUCPU : public PReLU { +public: + PReLUCPU(DataType dt) : PReLU(dt) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = std::shared_ptr(new PReLUCPU(this->dt)); + *mem = *this; + return mem; + } + + EE infer_weight_desc() override + { + auto curOpWs = this->get_weightspec(); + U32 weightNum = 0; + if (curOpWs.weight != nullptr) { + weightNum = curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt)); + } + if (weightNum == 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + if (weightNum == 1) { + this->preluDesc.propagate_down = true; + } else { + this->preluDesc.propagate_down = false; + } + Tensor weightTensor; + weightTensor.resize(tensor1d(this->dt, weightNum)); + this->weightTensors.push_back(weightTensor); + return SUCCESS; + } + + void run() override + { + CHECK_STATUS(prelu(this->inputTensors[0], this->weightTensors[0], this->preluDesc, + this->outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + CHECK_STATUS(prelu_infer_output_size(inTensors[0], outTensors[0], &this->archInfo)); + return SUCCESS; + } +}; +#endif // _PRELU_CPU_H diff --git a/inference/include/repeat.hpp b/inference/engine/include/cpu/repeat_cpu.hpp similarity index 54% rename from inference/include/repeat.hpp rename to inference/engine/include/cpu/repeat_cpu.hpp index 8ad2f80d..46f8789c 100644 --- a/inference/include/repeat.hpp +++ b/inference/engine/include/cpu/repeat_cpu.hpp @@ -1,45 +1,37 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
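PReLUCPU above distinguishes a single shared slope (weightNum == 1, propagate_down = true) from one slope per channel. The reference semantics, sketched in plain C++ over channel-major data (batch dimension omitted for brevity; this is not the engine's kernel):

#include <cstdio>

static void prelu_ref(const float *x, float *y, const float *slope,
    bool shared, int c, int hw)
{
    for (int ic = 0; ic < c; ic++) {
        float a = shared ? slope[0] : slope[ic];  // propagate_down selects slope[0]
        for (int i = 0; i < hw; i++) {
            float v = x[ic * hw + i];
            y[ic * hw + i] = v > 0 ? v : a * v;
        }
    }
}

int main()
{
    float x[4] = {1, -1, 2, -2}, y[4], slope[2] = {0.1f, 0.5f};
    prelu_ref(x, y, slope, false, 2, 2);
    for (int i = 0; i < 4; i++) printf("%g ", y[i]);  // 1 -0.1 2 -1
    printf("\n");
    return 0;
}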
+#ifndef _REPEAT_CPU_H
+#define _REPEAT_CPU_H
 
-#ifndef _REPEAT_H
-#define _REPEAT_H
+#include "repeat.hpp"
 
-#include "operator.hpp"
-
-class Repeat: public Operator
-{
+class RepeatCPU : public Repeat {
 public:
-    /**
-    @param mode
-    */
-    Repeat(DataType dt, I32 loops, I32 axis, I32 jumpOperatorIndex, I32 currentOperatorIndex)
-    {
-        this->dt = dt;
-        this->loops = loops;
-        this->axis = axis;
-        this->iter = 0;
-        this->jumpOperatorIndex = jumpOperatorIndex;
-        this->nextOperatorIndex = currentOperatorIndex + 1;
-    }
+    RepeatCPU(DataType dt, RepeatParamSpec p, I32 jumpOperatorIndex, I32 currentOperatorIndex)
+        : Repeat(dt, p, jumpOperatorIndex, currentOperatorIndex)
+    {}
 
-    OperatorType get_op_type() override
+    std::shared_ptr<Operator> clone() override
     {
-        return OT_Repeat;
+        std::shared_ptr<RepeatCPU> mem = std::shared_ptr<RepeatCPU>(
+            new RepeatCPU(this->dt, this->p, this->jumpOperatorIndex, this->nextOperatorIndex - 1));
+        *mem = *this;
+        return mem;
     }
 
     void run() override
-    { }
+    {}
 
     int get_next_operator_index() override
     {
@@ -47,7 +39,7 @@ class Repeat: public Operator
         if (this->inputTensors.size() > 1) {
             Tensor inputTensor = this->inputTensors[1];
             TensorDesc inputDesc = inputTensor.get_desc();
-            I32 *ptr = (I32 *)(inputTensor.get_val());
+            I32 *ptr = (I32 *)(((CpuMemory *)(inputTensor.get_memory()))->get_ptr());
             U32 length = tensorNumElements(inputDesc);
             for (U32 i = 0; i < length; i++) {
                 // end loop
@@ -59,41 +51,36 @@ class Repeat: public Operator
         }
 
         // check loop
-        if (this->iter < this->loops) {
-            this->iter ++;
+        if (this->iter < this->p.loops) {
+            this->iter++;
             return this->jumpOperatorIndex;
-        }
-        else {
+        } else {
             this->iter = 0;
             return this->nextOperatorIndex;
         }
     }
 
-    EE infer_output_tensors_size(Vec<TensorDesc> inDims, Vec<TensorDesc>* outDims) override
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
     {
         this->iter = 0;
-        if (this->axis >= 0) {
+        if (this->p.axis >= 0) {
             int axisIndex = 0;
-            if (inDims.size() > 2)
+            if (inTensors.size() > 2) {
                 axisIndex = 2;
-            else {
-                std::cerr << "[ERROR] set to use axis feature of Repeat must meet input tensors >= 3 requirement" << std::endl;
-                exit(1);
+            } else {
+                UNI_ERROR_LOG("the axis feature of Repeat requires at least 3 input tensors\n");
             }
-            TensorDesc desc = inDims[axisIndex];
-            this->loops = desc.dims[desc.nDims-1-axis];
+            TensorDesc desc = inTensors[axisIndex]->get_desc();
+            this->p.loops = desc.dims[desc.nDims - 1 - this->p.axis];
         }
-
-        (*outDims)[0].dt = this->dt;
-        (*outDims)[0].nDims = 0;
+        TensorDesc outDesc = outTensors[0]->get_desc();
+        outDesc.dt = this->dt;
+        outDesc.nDims = 0;
+        outTensors[0]->resize(outDesc);
         return SUCCESS;
     }
-
-private:
-    int loops;
-    int axis;
-    int iter;
-    int jumpOperatorIndex;
-    int nextOperatorIndex;
 };
 
-#endif //_REPEAT_H
+#endif  // _REPEAT_CPU_H
diff --git a/inference/engine/include/cpu/reshape_cpu.hpp b/inference/engine/include/cpu/reshape_cpu.hpp
new file mode 100644
index 00000000..eec8571b
--- /dev/null
+++ b/inference/engine/include/cpu/reshape_cpu.hpp
@@ -0,0 +1,89 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _RESHAPE_CPU_H
+#define _RESHAPE_CPU_H
+
+#include "reshape.hpp"
+
+class ReshapeCPU : public Reshape {
+public:
+    ReshapeCPU(DataType dt, ReshapeParamSpec p) : Reshape(dt, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<ReshapeCPU> mem =
+            std::shared_ptr<ReshapeCPU>(new ReshapeCPU(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        Tensor tmpInputTensor = inputTensor;
+        Tensor tmpOutputTensor = outputTensor;
+        auto inputDesc = inputTensor.get_desc();
+        auto outputDesc = outputTensor.get_desc();
+        // p.axis == 8 marks a reshape imported from a TFLite model; TFLite
+        // reshape semantics are defined on NHWC layout.
+        if (this->p.axis == 8 && outputDesc.nDims == 4) {
+            auto tmpOutputDesc = outputTensor.get_desc();
+            tmpOutputDesc.df = DF_NHWC;
+            tmpOutputTensor = this->temp;
+            tmpOutputTensor.resize(tmpOutputDesc);
+        }
+
+        // TFLite case again: stage the input through the temp buffer.
+        // NCHW/NCHWC8 -> NHWC
+        if (this->p.axis == 8 && inputDesc.nDims == 4) {
+            auto inputDesc = inputTensor.get_desc();
+            auto tmpInputDesc = inputDesc;
+            tmpInputDesc.df = DF_NHWC;
+            transformToNHWC(inputDesc, ((CpuMemory *)(inputTensor.get_memory()))->get_ptr(),
+                tmpInputDesc, ((CpuMemory *)(this->temp.get_memory()))->get_ptr());
+            tmpInputTensor = this->temp;
+            tmpInputTensor.resize(tmpInputDesc);
+        }
+
+        CHECK_STATUS(reshape(tmpInputTensor, this->temp, tmpOutputTensor, &this->archInfo));
+        // NHWC -> NCHW
+        if (this->p.axis == 8 && outputDesc.nDims == 4) {
+            auto outputDesc = outputTensor.get_desc();
+            auto tmpOutputDesc = tmpOutputTensor.get_desc();
+            void *tmpOutputPtr = ((CpuMemory *)(tmpOutputTensor.get_memory()))->get_ptr();
+            transformToNCHW(tmpOutputDesc, tmpOutputPtr, outputDesc,
+                ((CpuMemory *)(outputTensor.get_memory()))->get_ptr());
+        }
+        outputTensor.set_scale(inputTensor.get_scale());
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        CHECK_STATUS(
+            reshape_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        U32 bytes = 0;
+        CHECK_STATUS(reshape_infer_forward_tmp_bytes(
+            this->inputTensors[0], this->outputTensors[0], &bytes, &this->archInfo));
+        return bytes;
+    }
+};
+
+#endif  // _RESHAPE_CPU_H
diff --git a/inference/engine/include/cpu/resize_cpu.hpp b/inference/engine/include/cpu/resize_cpu.hpp
new file mode 100644
index 00000000..b8ee1bf6
--- /dev/null
+++ b/inference/engine/include/cpu/resize_cpu.hpp
@@ -0,0 +1,78 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ +#ifndef _RESIZE_CPU_H +#define _RESIZE_CPU_H + +#include "image.h" +#include "resize.hpp" + +class ResizeCPU : public Resize { +public: + ResizeCPU(DataType paramDT, ResizeParamSpec p) : Resize(paramDT, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new ResizeCPU(this->paramDT, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + CHECK_STATUS(resize(inputTensors[0], temp, outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + ResizeDesc resizeDesc; + resizeDesc.paramDT = this->paramDT; + U32 bytes; + switch (paramDT) { + case DT_F32: { + CHECK_REQUIREMENT(1 == this->p.scales[0] && 1 == this->p.scales[1]); + CHECK_STATUS(resize_infer_output_size(inTensors[0], resizeDesc, this->p.scales + 2, + outTensors[0], &bytes, &this->archInfo)); + break; + } + case DT_U32: { + CHECK_STATUS(resize_infer_output_size(inTensors[0], resizeDesc, this->p.sizes, + outTensors[0], &bytes, &this->archInfo)); + break; + } + default: { + CHECK_STATUS(NOT_SUPPORTED); + } + } + return SUCCESS; + } + + U32 infer_tmp_memory_size() override + { + U32 size = 0; + TensorDesc inputDesc = inputTensors[0].get_desc(); + if (DF_NCHW == inputDesc.df && IS_ARM(archInfo.arch)) { + U32 paddedC = (inputDesc.dims[2] + 7) / 8 * 8; + TensorDesc outputDesc = outputTensors[0].get_desc(); + inputDesc.dims[2] = paddedC; + outputDesc.dims[2] = paddedC; + size = tensorNumBytes(inputDesc) + tensorNumBytes(outputDesc); + } + return size; + } +}; + +#endif // _RESIZECPU_H diff --git a/inference/engine/include/cpu/rnn_cpu.hpp b/inference/engine/include/cpu/rnn_cpu.hpp new file mode 100644 index 00000000..de5691db --- /dev/null +++ b/inference/engine/include/cpu/rnn_cpu.hpp @@ -0,0 +1,62 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
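ResizeCPU::infer_tmp_memory_size() above reserves staging space with the channel count rounded up to a multiple of 8, since the ARM kernels consume NCHWC8 data. The arithmetic in isolation, with hypothetical shapes and fp32 elements assumed:

#include <cstdio>

int main()
{
    unsigned c = 3, inHW = 224 * 224, outHW = 112 * 112;
    unsigned paddedC = (c + 7) / 8 * 8;  // 3 -> 8, same rounding as above
    size_t bytes = sizeof(float) * (size_t)paddedC * (inHW + outHW);
    printf("paddedC = %u, tmp bytes (fp32) = %zu\n", paddedC, bytes);
    return 0;
}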
+ +#ifndef _RNN_CPU_H +#define _RNN_CPU_H + +#include "cpu/rnncell_cpu.hpp" + +class RNNCPU : public RNNCellCPU { +public: + RNNCPU(DataType dt, RNNParamSpec p) : RNNCellCPU(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = std::shared_ptr(new RNNCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + Tensor inputTensor = this->inputTensors[0]; + Tensor outputTensor = this->outputTensors[0]; + // NOTE: no clean tmp and output + CHECK_STATUS(rnn(inputTensor, this->weightTensors, this->biasTensors, this->p, this->temp, + outputTensor, &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + TensorDesc inDim = inTensors[0]->get_desc(); + DataType dt; + DataFormat df; + U32 iB, inT, iX; + CHECK_STATUS(tensor3dGet(inDim, &dt, &df, &iB, &inT, &iX)); + this->xDim = iX; + CHECK_STATUS(rnn_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); + return SUCCESS; + } + + U32 infer_tmp_memory_size() override + { + U32 bytes = 0; + CHECK_STATUS(rnn_infer_forward_tmp_bytes(this->inputTensors[0], this->weightTensors[0], + this->outputTensors[0], this->p, &bytes, &this->archInfo)); + return bytes; + } +}; + +#endif // _RNN_CPU_H diff --git a/inference/engine/include/cpu/rnncell_cpu.hpp b/inference/engine/include/cpu/rnncell_cpu.hpp new file mode 100644 index 00000000..f5fd5b58 --- /dev/null +++ b/inference/engine/include/cpu/rnncell_cpu.hpp @@ -0,0 +1,140 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
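RNNCPU derives from RNNCellCPU and reuses its weight and state handling; conceptually, the whole-sequence rnn() call above is the cell applied once per time step over a (batch, T, xDim) input. A toy sketch of that control flow only, with a bare tanh standing in for the real gated update:

#include <cmath>
#include <cstdio>
#include <vector>

using Vec = std::vector<float>;

// Toy cell: h' = tanh(x + h), a placeholder for the actual recurrence.
static void cell(const Vec &x_t, Vec &h)
{
    for (size_t i = 0; i < h.size(); i++) {
        h[i] = std::tanh(x_t[i] + h[i]);
    }
}

int main()
{
    std::vector<Vec> x = {{0.5f}, {0.5f}, {0.5f}};  // T = 3 steps, xDim = 1
    Vec h = {0.0f};                                 // initial state
    for (const Vec &x_t : x) {                      // the per-step loop
        cell(x_t, h);                               // state carried across steps
        printf("%f\n", h[0]);
    }
    return 0;
}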
+ +#ifndef _RNNCELL_CPU_H +#define _RNNCELL_CPU_H + +#include "rnncell.hpp" + +class RNNCellCPU : public RNNCell { +public: + RNNCellCPU(DataType dt, RNNParamSpec p) : RNNCell(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new RNNCellCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + Tensor xTensor = this->inputTensors[0]; + Tensor stateTensor = this->inputTensors[1]; + Tensor hTensor = this->outputTensors[0]; + Tensor tmpTensor = this->temp; + U32 tmpOffset = 0; + if (this->featureScale.size() > 1) { + tmpTensor.resize(xTensor.get_desc()); + CHECK_STATUS(clip(xTensor, this->clipParam, tmpTensor, &this->archInfo)); + xTensor = tmpTensor; + tmpOffset = xTensor.bytes(); + } + CHECK_STATUS(rnncell(xTensor, this->weightTensors, this->biasTensors, stateTensor, this->p, + this->xDim, this->p.numOutput, tmpOffset, tmpTensor, hTensor, &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + TensorDesc inDim = inTensors[0]->get_desc(); + DataType dt; + DataFormat df; + U32 iB, iX; + CHECK_STATUS(tensor2dGet(inDim, &dt, &df, &iB, &iX)); + this->xDim = iX; + CHECK_STATUS(rnncell_infer_output_size(inTensors, this->p, outTensors[0], &this->archInfo)); + return SUCCESS; + } + + U32 infer_tmp_memory_size() override + { + U32 bytes = 0; + CHECK_STATUS(rnncell_infer_forward_tmp_bytes(this->inputTensors[0], this->weightTensors[0], + this->outputTensors[0], this->p, &bytes, &this->archInfo)); + + if (featureScale.size() > 1) { + CHECK_REQUIREMENT(featureScale[0][0] > 0); + CHECK_REQUIREMENT(featureScale[0][0] == featureScale[1][0]); + this->clipParam.max = 127.0 / featureScale[0][0]; + this->clipParam.min = -1 * this->clipParam.max; + bytes += this->inputTensors[0].bytes(); + } + return bytes; + } + + EE transform_filter() override + { + I32 filter_num = this->weightTensors.size(); + std::vector bytes(filter_num); + CHECK_STATUS( + rnn_transform_filter_bytes(this->weightTensors, this->p, bytes.data(), &this->archInfo)); + std::vector ftmTensors(filter_num); + std::vector tmp(filter_num); + for (I32 i = 0; i < filter_num; i++) { + ftmTensors[i].resize(tensor1d(DT_U8, bytes[i])); + ftmTensors[i].alloc(); + tmp[i] = &ftmTensors[i]; + } + CHECK_STATUS(rnn_transform_filter(this->weightTensors, this->p, tmp, &this->archInfo)); + this->weightTensors = ftmTensors; + return SUCCESS; + } + + EE infer_weight_desc() override + { + int num1 = (this->p.biDirection) ? 
2 : 1; + int num2, column; + if (this->p.numProjection > 0) { + num2 = 2; + column = this->p.numProjection; + } else { + num2 = 1; + column = this->p.numOutput; + } + int factor = 0; + switch (this->p.mode) { + case RNN_LSTM: + factor = 4; + break; + case RNN_GRU: + factor = 3; + break; + case RNN_GRU_LBR: + factor = 3; + break; + default: + return NOT_SUPPORTED; + } + U32 filterRow = factor * column; + U32 filterCol = this->xDim + this->p.numOutput; + std::vector weight_desc(2), bias_desc(2); + weight_desc[0] = tensor2df(this->dt, DF_NK, filterRow, filterCol); + weight_desc[1] = tensor2df(this->dt, DF_NK, this->p.numOutput, this->p.numProjection); + bias_desc[0] = tensor1d(this->dt, filterRow); + bias_desc[1] = tensor1d(this->dt, this->p.numOutput); + this->weightTensors = std::vector(num1 * num2); + this->biasTensors = std::vector(num1 * num2); + for (int i = 0, id = 0; i < num1; i++) { + for (int j = 0; j < num2; j++, id++) { + this->weightTensors[id].resize(weight_desc[j]); + this->biasTensors[id].resize(bias_desc[j]); + } + } + return SUCCESS; + } +}; + +#endif // _RNNCELL_CPU_H diff --git a/inference/engine/include/cpu/scale_cpu.hpp b/inference/engine/include/cpu/scale_cpu.hpp new file mode 100644 index 00000000..3d4a7b0a --- /dev/null +++ b/inference/engine/include/cpu/scale_cpu.hpp @@ -0,0 +1,109 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
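The shape arithmetic in RNNCellCPU::infer_weight_desc() above: a gate factor of 4 for LSTM (3 for the GRU variants), column = numProjection when projection is enabled (which also doubles the weight count), and a gate matrix over the concatenated [x, h] vector. Worked through with hypothetical dimensions:

#include <cstdio>

int main()
{
    unsigned xDim = 64, numOutput = 256, numProjection = 128;
    unsigned factor = 4;  // RNN_LSTM; 3 for GRU / GRU_LBR
    unsigned column = numProjection ? numProjection : numOutput;
    unsigned filterRow = factor * column;   // 4 * 128 = 512
    unsigned filterCol = xDim + numOutput;  // 64 + 256 = 320
    printf("gate weight: %u x %u\n", filterRow, filterCol);       // 512 x 320
    printf("projection weight: %u x %u\n", numOutput, numProjection);  // 256 x 128
    return 0;
}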
+ +#ifndef _SCALE_CPU_H +#define _SCALE_CPU_H + +#include "scale.hpp" + +class ScaleCPU : public Scale { +public: + ScaleCPU(DataType dt, ScaleParamSpec p, int numChannels) : Scale(dt, p, numChannels) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new ScaleCPU(this->dt, this->p, this->numChannels)); + *mem = *this; + return mem; + } + + void run() override + { + int inputTensorNumber = this->inputTensors.size(); + Tensor inputTensor = this->inputTensors[this->dataID]; + Tensor outputTensor = this->outputTensors[0]; + outputTensor.resize(inputTensor.get_desc()); + + void *alpha, *beta; + if (inputTensorNumber == 1) { + alpha = ((CpuMemory *)(this->weightTensors[0].get_memory()))->get_ptr(); + beta = ((CpuMemory *)(this->biasTensors[0].get_memory()))->get_ptr(); + } else { + alpha = ((CpuMemory *)(this->inputTensors[1 - this->dataID].get_memory()))->get_ptr(); + beta = nullptr; + } + CHECK_STATUS(scale(inputTensor, alpha, beta, this->p, outputTensor, &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + auto inDim = inTensors[0]->get_desc(); + auto curOpWs = this->get_weightspec(); + if (curOpWs.bytes_of_weight == bytesOf(curOpWs.mdt) || + curOpWs.bytes_of_vec == bytesOf(curOpWs.mdt)) { + this->p.axis = 0; + } + I32 tmpAxis = (this->p.axis + inDim.nDims) % inDim.nDims; + tmpAxis = inDim.nDims - 1 - tmpAxis; + CHECK_REQUIREMENT(tmpAxis < (I32)inDim.nDims); + U32 ic = inDim.dims[tmpAxis]; + + if (0 != curOpWs.bytes_of_weight) { + this->numChannels = curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt)); + } else if (0 != curOpWs.bytes_of_vec) { + this->numChannels = curOpWs.bytes_of_vec / UNI_MAX(1, bytesOf(curOpWs.mdt)); + } else { + this->numChannels = 0; + } + + if (ic != numChannels && 0 != numChannels) { + UNI_ERROR_LOG("ScaleCPU input channels (IC) do not match. Perhaps some channel padding " + "has been done earlier\n" + " IC is now %u but should be %u\n", + ic, numChannels); + CHECK_STATUS(NOT_SUPPORTED); + return NOT_SUPPORTED; + } else { + if (inTensors.size() > 1 && + tensorNumElements(inTensors[1]->get_desc()) > tensorNumElements(inDim)) { + this->dataID = 1; + } + } + + CHECK_STATUS( + scale_infer_output_size(inTensors[this->dataID], outTensors[0], &this->archInfo)); + return SUCCESS; + } + + EE infer_weight_desc() override + { + auto curOpWs = this->get_weightspec(); + if (0 != curOpWs.bytes_of_weight) { + this->numChannels = curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt)); + } else if (0 != curOpWs.bytes_of_vec) { + this->numChannels = curOpWs.bytes_of_vec / UNI_MAX(1, bytesOf(curOpWs.mdt)); + } else { + this->numChannels = 0; + } + this->weightTensors = std::vector(1); + this->weightTensors[0].resize(tensor1d(this->dt, numChannels)); + this->biasTensors = std::vector(1); + this->biasTensors[0].resize(tensor1d(this->dt, numChannels)); + return SUCCESS; + } +}; + +#endif // _SCALE_CPU_H diff --git a/inference/include/padding.hpp b/inference/engine/include/cpu/shape_cpu.hpp similarity index 53% rename from inference/include/padding.hpp rename to inference/engine/include/cpu/shape_cpu.hpp index 48540b49..be56eebb 100644 --- a/inference/include/padding.hpp +++ b/inference/engine/include/cpu/shape_cpu.hpp @@ -1,62 +1,50 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +#ifndef _SHAPE_CPU_H +#define _SHAPE_CPU_H -#ifndef _PADDING_H -#define _PADDING_H +#include "shape.hpp" -#include "operator.hpp" - -class Padding: public Operator -{ +class ShapeCPU : public Shape { public: - /** - @param mode - */ - Padding(DataType dt, PadDesc padDesc) - { - this->dt = dt; - this->padDesc = padDesc; - } + ShapeCPU() : Shape() + {} - OperatorType get_op_type() override + std::shared_ptr clone() override { - return OT_Pad; + std::shared_ptr mem = std::shared_ptr(new ShapeCPU()); + *mem = *this; + return mem; } void run() override { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - Tensor inputTensor = this->inputTensors[0]; TensorDesc inputDesc = inputTensor.get_desc(); Tensor outputTensor = this->outputTensors[0]; - TensorDesc outputDesc = outputTensor.get_desc(); - U8* inPtr = inputTensor.get_val(); - U8* outPtr = outputTensor.get_val(); - CHECK_STATUS(padding(inputDesc, inPtr, this->padDesc, outputDesc, outPtr, this->schedule)); - - UTIL_TIME_TOC(__CLASS_FUNCTION__) + UNI_memcpy(((CpuMemory *)(outputTensor.get_memory()))->get_ptr(), inputDesc.dims, + inputDesc.nDims * sizeof(U32)); } - EE infer_output_tensors_size(VecinDims, Vec* outDims) override + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override { - CHECK_STATUS(padding_infer_output_size(inDims[0], this->padDesc, &((*outDims)[0]))); + TensorDesc inputDesc = inTensors[0]->get_desc(); + TensorDesc outputDesc = tensor1d(DT_U32, inputDesc.nDims); + outTensors[0]->resize(outputDesc); return SUCCESS; } - -private: - PadDesc padDesc; }; -#endif //_PADDING_H +#endif // _SHAPE_CPU_H diff --git a/inference/engine/include/cpu/shared_weight_cpu.hpp b/inference/engine/include/cpu/shared_weight_cpu.hpp new file mode 100644 index 00000000..26185ade --- /dev/null +++ 
b/inference/engine/include/cpu/shared_weight_cpu.hpp
@@ -0,0 +1,73 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _SHARED_WEIGHT_CPU_H
+#define _SHARED_WEIGHT_CPU_H
+
+#include "shared_weight.hpp"
+
+class SharedWeightCPU : public SharedWeight {
+public:
+    SharedWeightCPU(DataType dt,
+        TensorDesc desc,
+        std::string outputTensorName,
+        std::map<std::string, std::shared_ptr<Tensor>> *tensorMapPtr)
+        : SharedWeight(dt, desc, outputTensorName, tensorMapPtr)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<SharedWeightCPU> mem = std::shared_ptr<SharedWeightCPU>(
+            new SharedWeightCPU(this->dt, this->desc, this->outputTensorName, this->tensorMapPtr));
+        *mem = *this;
+        return mem;
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        UNUSED(inTensors);
+        outTensors[0]->resize(this->desc);
+        return SUCCESS;
+    }
+
+    void run() override
+    {}
+
+    EE init_weight_bias_from_model(std::shared_ptr<U8> *modelPtrShared) override
+    {
+        U8 *modelPtr = nullptr;
+        if (modelPtrShared != nullptr) {
+            modelPtr = (*modelPtrShared).get();
+        }
+        TensorDesc weightDesc = this->desc;
+        Tensor modelWeightTensor;
+        modelWeightTensor.resize(weightDesc);
+        U32 weightBytes = modelWeightTensor.bytes();
+        if (modelPtr != nullptr) {
+            modelWeightTensor.alloc();
+            memcpy(
+                ((CpuMemory *)(modelWeightTensor.get_memory()))->get_ptr(), modelPtr, weightBytes);
+            *modelPtrShared = std::shared_ptr<U8>(*modelPtrShared, modelPtr + weightBytes);
+        } else {
+            auto curOpWs = this->get_weightspec();
+            ((CpuMemory *)(modelWeightTensor.get_memory()))
+                ->set_shared_ptr(std::shared_ptr<U8>(curOpWs.weight));
+        }
+        this->weightTensors.push_back(modelWeightTensor);
+        (*this->tensorMapPtr)[this->outputTensorName]->reuse(&(this->weightTensors[0]));
+        return SUCCESS;
+    }
+};
+
+#endif  // _SHARED_WEIGHT_CPU_H
diff --git a/inference/engine/include/cpu/slice_cpu.hpp b/inference/engine/include/cpu/slice_cpu.hpp
new file mode 100644
index 00000000..e57da50f
--- /dev/null
+++ b/inference/engine/include/cpu/slice_cpu.hpp
@@ -0,0 +1,50 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _SLICE_CPU_H +#define _SLICE_CPU_H + +#include "slice.hpp" + +class SliceCPU : public Slice { +public: + SliceCPU(DataType dt, SliceParamSpec p) : Slice(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = std::shared_ptr(new SliceCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + CHECK_STATUS(slice(inputTensors[0], this->p, outputTensors, &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + CHECK_STATUS(slice_infer_output_size(inTensors[0], this->p, outTensors, &this->archInfo)); + auto outDesc = outTensors[0]->get_desc(); + if (outDesc.nDims == 3 && outDesc.dims[1] == 1 && outDesc.dims[2] == 1) { + outDesc.nDims = 2; + outDesc.df = DF_NORMAL; + outTensors[0]->resize(outDesc); + } + return SUCCESS; + } +}; + +#endif // _SLICE_CPU_H diff --git a/inference/engine/include/cpu/softmax_cpu.hpp b/inference/engine/include/cpu/softmax_cpu.hpp new file mode 100644 index 00000000..03bdf6fa --- /dev/null +++ b/inference/engine/include/cpu/softmax_cpu.hpp @@ -0,0 +1,46 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
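SliceCPU splits one input along an axis into several outputs. The usual convention, which the sketch below assumes (split values hypothetical, in the spirit of SliceParamSpec), is that the split points partition the axis into half-open ranges, with the last piece running to the end of the axis:

#include <cstdio>

int main()
{
    int axisLen = 12;
    int splits[] = {4, 10};  // -> pieces of length 4, 6, 2
    int n = sizeof(splits) / sizeof(splits[0]);
    int prev = 0;
    for (int i = 0; i <= n; i++) {
        int end = (i < n) ? splits[i] : axisLen;
        printf("output %d: [%d, %d), length %d\n", i, prev, end, end - prev);
        prev = end;
    }
    return 0;
}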
+ +#ifndef _SOFTMAX_CPU_H +#define _SOFTMAX_CPU_H + +#include "softmax.hpp" + +class SoftmaxCPU : public Softmax { +public: + SoftmaxCPU(DataType dt, SoftmaxParamSpec p) : Softmax(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new SoftmaxCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + CHECK_STATUS( + softmax(inputTensors[0], this->p, this->temp, outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + CHECK_STATUS(softmax_infer_output_size(inTensors[0], outTensors[0], &this->archInfo)); + return SUCCESS; + } +}; + +#endif // SOFTMAX_CPU_H diff --git a/inference/engine/include/cpu/splice_cpu.hpp b/inference/engine/include/cpu/splice_cpu.hpp new file mode 100644 index 00000000..2ef79256 --- /dev/null +++ b/inference/engine/include/cpu/splice_cpu.hpp @@ -0,0 +1,69 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
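For reference, the softmax invoked by SoftmaxCPU above follows the standard max-subtracted formulation for numerical stability; the engine's kernel additionally handles axis selection and layout. A single-vector sketch:

#include <cmath>
#include <cstdio>

int main()
{
    float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float m = x[0], sum = 0.0f, y[4];
    for (int i = 1; i < 4; i++) m = x[i] > m ? x[i] : m;  // max keeps exp() in range
    for (int i = 0; i < 4; i++) { y[i] = std::exp(x[i] - m); sum += y[i]; }
    for (int i = 0; i < 4; i++) { y[i] /= sum; printf("%f\n", y[i]); }
    return 0;
}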
+ +#ifndef _SPLICE_CPU_H +#define _SPLICE_CPU_H + +#include "splice.hpp" + +class SpliceCPU : public Splice { +public: + SpliceCPU(DataType dt, SpliceParamSpec p) : Splice(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new SpliceCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + Tensor inputTensor = this->inputTensors[0]; + TensorDesc inputDesc = inputTensor.get_desc(); + + EmbedParamSpec embedParamSpec; + embedParamSpec.input_dim = inputDesc.dims[1]; + embedParamSpec.num_output = inputDesc.dims[0]; + embedParamSpec.transpose = false; + CHECK_STATUS(embedding( + weightTensors[0], inputTensor, embedParamSpec, outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + auto inDim = inTensors[0]->get_desc(); + CHECK_REQUIREMENT(this->p.outputDim % inDim.dims[0] == 0); + auto outDim = inDim; + outDim.dims[1] = this->p.numIndices / (this->p.outputDim / inDim.dims[0]); + outDim.dims[0] = this->p.outputDim; + outTensors[0]->resize(outDim); + return SUCCESS; + } + + EE infer_weight_desc() override + { + auto curOpWs = this->get_weightspec(); + if (curOpWs.weight != nullptr) { + Tensor weightTensor; + weightTensor.resize(tensor1d(DT_U32, this->p.numIndices)); + this->weightTensors.push_back(weightTensor); + } + return SUCCESS; + } +}; + +#endif // _SPLICE_CPU_H diff --git a/inference/engine/include/cpu/squeeze_cpu.hpp b/inference/engine/include/cpu/squeeze_cpu.hpp new file mode 100644 index 00000000..28f14d72 --- /dev/null +++ b/inference/engine/include/cpu/squeeze_cpu.hpp @@ -0,0 +1,48 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
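SpliceCPU above reduces splicing to an embedding lookup: weightTensors[0] holds the p.numIndices precomputed frame indices, and each output row is gathered from the input. The gather itself in miniature (indices hypothetical):

#include <cstdio>

int main()
{
    const int dim = 2;
    float in[4][dim] = {{0, 1}, {10, 11}, {20, 21}, {30, 31}};  // 4 input frames
    int idx[] = {0, 0, 1, 2, 3, 3};  // hypothetical context-window indices
    int n = sizeof(idx) / sizeof(idx[0]);
    for (int r = 0; r < n; r++) {    // each output row copies the indexed input row
        printf("out[%d] = {%g, %g}\n", r, in[idx[r]][0], in[idx[r]][1]);
    }
    return 0;
}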
+ +#ifndef _SQUEEZE_CPU_H +#define _SQUEEZE_CPU_H + +#include "squeeze.hpp" + +class SqueezeCPU : public Squeeze { +public: + SqueezeCPU(DataType dt, SqueezeParamSpec p) : Squeeze(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new SqueezeCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + Tensor inputTensor = this->inputTensors[0]; + Tensor outputTensor = this->outputTensors[0]; + squeeze(inputTensor, outputTensor, &this->archInfo); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + CHECK_STATUS( + squeeze_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); + return SUCCESS; + } +}; + +#endif // _SQUEEZE_CPU_H diff --git a/inference/engine/include/cpu/tfslice_cpu.hpp b/inference/engine/include/cpu/tfslice_cpu.hpp new file mode 100644 index 00000000..61828926 --- /dev/null +++ b/inference/engine/include/cpu/tfslice_cpu.hpp @@ -0,0 +1,48 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _TFSLICE_CPU_H +#define _TFSLICE_CPU_H + +#include "tfslice.hpp" + +class TfSliceCPU : public TfSlice { +public: + TfSliceCPU(DataType dt, TfSliceParamSpec p) : TfSlice(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new TfSliceCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + Tensor inputTensor = this->inputTensors[0]; + Tensor outputTensor = this->outputTensors[0]; + CHECK_STATUS(tfslice(inputTensor, this->p, outputTensor, &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + CHECK_STATUS( + tfslice_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); + return SUCCESS; + } +}; + +#endif // _TFSLICE_CPU_H diff --git a/inference/engine/include/cpu/tile_cpu.hpp b/inference/engine/include/cpu/tile_cpu.hpp new file mode 100644 index 00000000..63a887aa --- /dev/null +++ b/inference/engine/include/cpu/tile_cpu.hpp @@ -0,0 +1,44 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _TILE_CPU_H +#define _TILE_CPU_H + +#include "tile.hpp" + +class TileCPU : public Tile { +public: + TileCPU(DataType dt, TileParamSpec p) : Tile(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = std::shared_ptr(new TileCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + CHECK_STATUS(tile(this->inputTensors[0], this->p, this->outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + CHECK_STATUS(tile_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); + return SUCCESS; + } +}; + +#endif // _TILECPU_H diff --git a/inference/engine/include/cpu/transpose_cpu.hpp b/inference/engine/include/cpu/transpose_cpu.hpp new file mode 100644 index 00000000..77dfde43 --- /dev/null +++ b/inference/engine/include/cpu/transpose_cpu.hpp @@ -0,0 +1,49 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
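TileCPU repeats the input along the axes given by its parameters; a one-dimensional sketch of the semantics with repeat factor 3 (the kernel generalizes this per axis):

#include <cstdio>

int main()
{
    const int n = 2, rep = 3;
    float in[n] = {1.0f, 2.0f}, out[n * rep];
    for (int r = 0; r < rep; r++)       // whole-block repetition along the axis
        for (int i = 0; i < n; i++)
            out[r * n + i] = in[i];
    for (int i = 0; i < n * rep; i++) printf("%g ", out[i]);  // 1 2 1 2 1 2
    printf("\n");
    return 0;
}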
+ +#ifndef _TRANSPOSE_CPU_H +#define _TRANSPOSE_CPU_H + +#include "transpose.hpp" + +class TransposeCPU : public Transpose { +public: + TransposeCPU(DataType dt, TransposeParamSpec p) : Transpose(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new TransposeCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + Tensor inputTensor = this->inputTensors[0]; + Tensor outputTensor = this->outputTensors[0]; + CHECK_STATUS(transpose(inputTensor, this->p, this->temp, outputTensor, &this->archInfo)); + outputTensor.set_scale(inputTensor.get_scale()); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + CHECK_STATUS( + transpose_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); + return SUCCESS; + } +}; + +#endif // _TRANSPOSE_CPU_H diff --git a/inference/engine/include/cpu/unsqueeze_cpu.hpp b/inference/engine/include/cpu/unsqueeze_cpu.hpp new file mode 100644 index 00000000..39eeab2f --- /dev/null +++ b/inference/engine/include/cpu/unsqueeze_cpu.hpp @@ -0,0 +1,49 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _UNSQUEEZE_CPU_H +#define _UNSQUEEZE_CPU_H + +#include "unsqueeze.hpp" + +class UnsqueezeCPU : public Unsqueeze { +public: + UnsqueezeCPU(DataType dt, UnsqueezeParamSpec p) : Unsqueeze(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new UnsqueezeCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + Tensor inputTensor = this->inputTensors[0]; + Tensor outputTensor = this->outputTensors[0]; + CHECK_STATUS(unsqueeze(inputTensor, outputTensor, &this->archInfo)); + outputTensor.set_scale(inputTensor.get_scale()); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + CHECK_STATUS( + unsqueeze_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); + return SUCCESS; + } +}; + +#endif // _UNSQUEEZE_CPU_H diff --git a/inference/include/data_loader.hpp b/inference/engine/include/data_loader.hpp similarity index 54% rename from inference/include/data_loader.hpp rename to inference/engine/include/data_loader.hpp index 9ba61194..ad1417d0 100644 --- a/inference/include/data_loader.hpp +++ b/inference/engine/include/data_loader.hpp @@ -1,38 +1,43 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
- -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#ifndef _H_DATA_LOADER - -#include -#include "tensor_desc.h" -#include "tensor.hpp" - -#ifdef _BUILD_TEST -Vec load_txt(std::string dataPath, Vec dataDesc); - -Vec load_data(std::string directoryPath, - Vec dataDesc, - Vec>* datas); - -Vec load_image_with_scale(std::string directoryPath, - Vec dataDesc, - Vec>* datas, - ImageFormat ImageFormat, - F32 scaleValue); - -Vec load_bin_with_type(std::string directoryPath, - Vec dataDesc, - Vec>* datas, - Vec sourceDataType); -#endif -#endif +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+
+#ifndef _H_DATA_LOADER
+#define _H_DATA_LOADER
+
+#include <string>
+#include "tensor_desc.h"
+#include "tensor.hpp"
+
+#ifdef _BUILD_TEST
+int string_end_with(std::string s, std::string sub);
+
+std::vector<Tensor> load_txt(std::string dataPath, std::vector<TensorDesc> dataDesc);
+
+std::vector<Tensor> load_bin(
+    std::string dataPath, std::vector<DataType> sourceDataType, std::vector<TensorDesc> dataDesc);
+
+std::vector<std::string> load_data(std::string directoryPath,
+    std::vector<TensorDesc> dataDesc,
+    std::vector<std::vector<Tensor>> *datas);
+
+std::vector<std::string> load_image_with_scale(std::string directoryPath,
+    std::vector<TensorDesc> dataDesc,
+    std::vector<std::vector<Tensor>> *datas,
+    ImageFormat imageFormat,
+    F32 scaleValue);
+
+std::vector<std::string> load_bin_with_type(std::string directoryPath,
+    std::vector<TensorDesc> dataDesc,
+    std::vector<std::vector<Tensor>> *datas,
+    std::vector<DataType> sourceDataType);
+#endif
+#endif
diff --git a/inference/engine/include/deconvolution.hpp b/inference/engine/include/deconvolution.hpp
new file mode 100644
index 00000000..e2778310
--- /dev/null
+++ b/inference/engine/include/deconvolution.hpp
@@ -0,0 +1,45 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _DECONVOLUTION_H
+#define _DECONVOLUTION_H
+
+#include "weight_operator.hpp"
+
+class Deconvolution : public WeightOperator {
+public:
+    Deconvolution(DataType dt, ConvolutionParamSpec p, ActivationParamSpec activationDesc)
+    {
+        this->dt = dt;
+        this->p = p;
+        this->activationDesc = activationDesc;
+        this->hasBias = false;
+        this->alg = CONVOLUTION_ALGORITHM_NULL;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_Deconvolution;
+    }
+
+public:
+    U32 numInputs;
+
+    ConvolutionParamSpec p;
+
+    ActivationParamSpec activationDesc;
+
+    ConvolutionForwardAlgorithm alg;
+};
+
+#endif  // _DECONVOLUTION_H
diff --git a/inference/include/depth2space.hpp b/inference/engine/include/depth2space.hpp
similarity index 74%
rename from inference/include/depth2space.hpp
rename to inference/engine/include/depth2space.hpp
index 8dca41ce..4ef72334 100644
--- a/inference/include/depth2space.hpp
+++ b/inference/engine/include/depth2space.hpp
@@ -1,38 +1,36 @@
 // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
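The Deconvolution base declared above only carries its parameters; for orientation, the usual transposed-convolution shape rule (ignoring output padding, not quoted from this codebase) is sketched below with hypothetical sizes:

#include <cstdio>

int main()
{
    // out = (in - 1) * stride - 2 * pad + kernel
    unsigned in = 14, stride = 2, pad = 1, kernel = 4;
    unsigned out = (in - 1) * stride - 2 * pad + kernel;
    printf("deconv output side: %u\n", out);  // 28, i.e. 2x upsampling here
    return 0;
}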
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _DEPTH2SPACE_H #define _DEPTH2SPACE_H #include "operator.hpp" -class Depth2Space: public Operator -{ +class Depth2Space : public Operator { public: - /** - @param mode - */ - Depth2Space(DataType dt) + Depth2Space(DataType dt, Depth2SpaceParamSpec p) { this->dt = dt; + this->p = p; } - OperatorType get_op_type() override + OperatorType get_type() override { return OT_Depth2Space; } +protected: + Depth2SpaceParamSpec p; }; -#endif //_DEPTH2SPACE_H +#endif // _DEPTH2SPACE_H diff --git a/inference/engine/include/detection_output.hpp b/inference/engine/include/detection_output.hpp new file mode 100644 index 00000000..cdc92799 --- /dev/null +++ b/inference/engine/include/detection_output.hpp @@ -0,0 +1,57 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
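The Depth2Space rewrite above follows the pattern this patch applies across the whole engine: an operator's loose constructor arguments are folded into a single ParamSpec struct, and get_op_type() is renamed get_type(). A hypothetical construction through the factory interface (the blockSize field name is an assumption for illustration):

    Depth2SpaceParamSpec p;
    p.blockSize = 2;  // assumed field name, shown only to illustrate the ParamSpec pattern
    std::shared_ptr<Operator> op = factory->createDepth2Space(DT_F16, p);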
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _DETECTION_OUTPUT_H +#define _DETECTION_OUTPUT_H + +#include "operator.hpp" + +class DetectionOutput : public Operator { +public: + DetectionOutput(DataType dt, DetectionOutputParamSpec p) + { + this->dt = dt; + this->p = p; + } + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new DetectionOutput(this->dt, this->p)); + *mem = *this; + return mem; + } + + OperatorType get_type() override + { + return OT_DetectionOutput; + } + + void run() override + { + CHECK_STATUS( + detectionoutput(this->inputTensors, this->p, this->outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + CHECK_STATUS( + detectionoutput_infer_output_size(inTensors, this->p, outTensors[0], &this->archInfo)); + return SUCCESS; + } + +protected: + DetectionOutputParamSpec p; +}; +#endif // _DETECTION_OUTPUT_H diff --git a/inference/include/eltwise.hpp b/inference/engine/include/eltwise.hpp similarity index 65% rename from inference/include/eltwise.hpp rename to inference/engine/include/eltwise.hpp index b307ffd6..3c3d5e34 100644 --- a/inference/include/eltwise.hpp +++ b/inference/engine/include/eltwise.hpp @@ -1,46 +1,34 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
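DetectionOutput above shows the minimal lifecycle every engine operator implements: infer_output_tensors_size() is called once while the graph plans shapes, and run() executes on every forward pass. A schematic driver, not the engine's real scheduling code (the CNN class in cnn.h drives this internally):

    // Sketch only: names and the allocation step are simplified assumptions.
    void schedule_op(Operator &op, std::vector<Tensor *> ins, std::vector<Tensor *> outs)
    {
        CHECK_STATUS(op.infer_output_tensors_size(ins, outs));  // plan output shapes once
        // ... allocate each output based on its now-resized descriptor ...
        op.run();                                               // execute per inference
    }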
-
-/**
- * Project deploy
- */
-
-
 #ifndef _ELTWISE_H
 #define _ELTWISE_H
 
 #include "operator.hpp"
 
-class Eltwise: public Operator {
+class Eltwise : public Operator {
 public:
-    Eltwise(EltwiseMode eltMode, I32 coeffSize, F32* coeffValues)
+    Eltwise(EltwiseParamSpec eltwiseDesc)
     {
-        this->eltMode = eltMode;
-        this->coeffSize = coeffSize;
-        this->coeffValues = coeffValues;
-        this->lenOfTemp = 0;
+        this->eltwiseDesc = eltwiseDesc;
     }
 
-    OperatorType get_op_type() override
+    OperatorType get_type() override
     {
         return OT_Eltwise;
     }
 
 protected:
-    EltwiseMode eltMode;
-    I32 coeffSize;
-    F32* coeffValues;
+    EltwiseParamSpec eltwiseDesc;
 };
-
-#endif //_ELTWISE_H
+#endif // _ELTWISE_H
diff --git a/inference/engine/include/embedding.hpp b/inference/engine/include/embedding.hpp
new file mode 100644
index 00000000..d2c657ae
--- /dev/null
+++ b/inference/engine/include/embedding.hpp
@@ -0,0 +1,36 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _EMBEDDING_H
+#define _EMBEDDING_H
+
+#include "weight_operator.hpp"
+
+class Embedding : public WeightOperator {
+public:
+    Embedding(DataType dt, EmbedParamSpec p)
+    {
+        this->dt = dt;
+        this->p = p;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_Embedding;
+    }
+
+protected:
+    EmbedParamSpec p;
+};
+
+#endif // _EMBEDDING_H
diff --git a/inference/engine/include/factory.hpp b/inference/engine/include/factory.hpp
new file mode 100644
index 00000000..504d24e5
--- /dev/null
+++ b/inference/engine/include/factory.hpp
@@ -0,0 +1,464 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _FACTORY_H +#define _FACTORY_H + +#include "operator.hpp" + +#define NOT_SUPPORT \ + Operator *cep = NULL; \ + CHECK_STATUS(NOT_SUPPORTED); +#define NOT_USE0() +#define NOT_USE1(a1) \ + { \ + UNUSED(a1); \ + } +#define NOT_USE2(a1, a2) \ + { \ + NOT_USE1(a1) NOT_USE1(a2) \ + } +#define NOT_USE3(a1, a2, a3) \ + { \ + NOT_USE2(a1, a2) NOT_USE1(a3) \ + } +#define NOT_USE4(a1, a2, a3, a4) \ + { \ + NOT_USE2(a1, a2) NOT_USE2(a3, a4) \ + } +#define NOT_USE5(a1, a2, a3, a4, a5) \ + { \ + NOT_USE4(a1, a2, a3, a4) NOT_USE1(a5) \ + } +#define NOT_USE6(a1, a2, a3, a4, a5, a6) \ + { \ + NOT_USE4(a1, a2, a3, a4) NOT_USE2(a5, a6) \ + } +#define NOT_USE8(a1, a2, a3, a4, a5, a6, a7, a8) \ + { \ + NOT_USE4(a1, a2, a3, a4) NOT_USE4(a5, a6, a7, a8) \ + } +#define NOT_USE10(a1, a2, a3, a4, a5, a6, a7, a8, a9, aa) \ + { \ + NOT_USE8(a1, a2, a3, a4, a5, a6, a7, a8) NOT_USE2(a9, aa) \ + } +#define NOT_USE16(a1, a2, a3, a4, a5, a6, a7, a8, a9, aa, ab, ac, ad, ae, af, ag) \ + { \ + NOT_USE8(a1, a2, a3, a4, a5, a6, a7, a8) NOT_USE8(a9, aa, ab, ac, ad, ae, af, ag) \ + } +#define OP_UNSUP(num, ...) NOT_USE##num(__VA_ARGS__) NOT_SUPPORT + +class Factory { +public: + virtual ~Factory() + {} + + virtual std::shared_ptr createConvolution(DataType dt, + ConvolutionParamSpec p, + ActivationParamSpec dwActivationParamSpec, + ActivationParamSpec pwActivationParamSpec) = 0; + + virtual std::shared_ptr createDeconvolution( + DataType dt, ConvolutionParamSpec p, ActivationParamSpec activationDesc) = 0; + + virtual std::shared_ptr createPooling(PoolingParamSpec p) = 0; + + virtual std::shared_ptr createFullyConnected( + DataType dt, FullyConnectedParamSpec p, U32 numInput) = 0; + + virtual std::shared_ptr createSoftmax(DataType dt, SoftmaxParamSpec p) = 0; + + virtual std::shared_ptr createConcat(ConcatParamSpec p) = 0; + + virtual std::shared_ptr createActivation(ActivationParamSpec activationDesc) = 0; + + virtual std::shared_ptr createEltwise(EltwiseParamSpec eltwiseDesc) = 0; + + virtual std::shared_ptr createScale( + DataType dt, ScaleParamSpec p, int numChannels) = 0; + + virtual std::shared_ptr createRNN(DataType dt, RNNParamSpec p) = 0; + + virtual std::shared_ptr createRNNCell(DataType dt, RNNParamSpec p) = 0; + + virtual std::shared_ptr createEmbedding(DataType dt, EmbedParamSpec p) = 0; + + virtual std::shared_ptr createPower(DataType dt, PowerParamSpec p) = 0; + + virtual std::shared_ptr createMatMul(DataType dt, MatMulParamSpec p) = 0; + + virtual std::shared_ptr createLayerNorm(DataType dt, U32 weightNum) = 0; + + virtual std::shared_ptr createReshape(DataType dt, ReshapeParamSpec p) = 0; + + virtual std::shared_ptr createResize(DataType paramDT, ResizeParamSpec p) = 0; + + virtual std::shared_ptr createSlice(DataType dt, SliceParamSpec p) = 0; + + virtual std::shared_ptr createTranspose(DataType dt, TransposeParamSpec p) = 0; + + virtual std::shared_ptr createAttention(DataType dt, AttentionParamSpec p) = 0; + + virtual std::shared_ptr createClip(DataType dt, ClipParamSpec p) = 0; + + virtual std::shared_ptr createSqueeze(DataType dt, SqueezeParamSpec p) = 0; + + virtual std::shared_ptr createUnsqueeze(DataType dt, UnsqueezeParamSpec p) = 0; + + virtual std::shared_ptr createReduction(DataType dt, ReductionParamSpec p) = 0; + + virtual 
std::shared_ptr createArgMax(DataType dt, ArgMaxParamSpec p) = 0; + + virtual std::shared_ptr createCopy(DataType dt, CopyParamSpec p) = 0; + + virtual std::shared_ptr createCheck(DataType dt, CheckParamSpec p) = 0; + + virtual std::shared_ptr createRepeat( + DataType dt, RepeatParamSpec p, I32 jumpOperatorIndex, I32 currentOperatorIndex) = 0; + + virtual std::shared_ptr createBilateralSliceApply(BilateralSliceApplyParamSpec p) = 0; + + virtual std::shared_ptr createPreAllocatedMemory(DataType dt, TensorDesc desc) = 0; + + virtual std::shared_ptr createSharedWeight(DataType dt, + TensorDesc desc, + std::string outputTensorName, + std::map> *tensorMapPtr) = 0; + + virtual std::shared_ptr createJump( + DataType dt, I32 jumpOperatorIndex, I32 currentOperatorIndex) = 0; + + virtual std::shared_ptr createSpace2Depth(DataType dt) = 0; + + virtual std::shared_ptr createDepth2Space(DataType dt, Depth2SpaceParamSpec p) = 0; + + virtual std::shared_ptr createAttentionMask(DataType dt, AttentionMaskParamSpec p) = 0; + + virtual std::shared_ptr createRelativePositionEmbedding( + DataType dt, EmbedParamSpec p) = 0; + + virtual std::shared_ptr createRelativeShift(DataType dt, RelativeShiftParamSpec p) = 0; + + virtual std::shared_ptr createPadding(DataType dt, PadParamSpec p) = 0; + + virtual std::shared_ptr createPReLU(DataType dt) = 0; + + virtual std::shared_ptr createPriorBox(DataType dt, PriorBoxParamSpec p) = 0; + + virtual std::shared_ptr createDetectionOutput( + DataType dt, DetectionOutputParamSpec p) = 0; + + virtual std::shared_ptr createYolov3DetectionOutput( + DataType dt, Yolov3DetectionOutputParamSpec p) = 0; + + virtual std::shared_ptr createChannelResize(DataType dt, ChannelResizeParamSpec p) = 0; + + virtual std::shared_ptr createL2Normalization(DataType dt) = 0; + + virtual std::shared_ptr createTile(DataType dt, TileParamSpec p) = 0; + + virtual std::shared_ptr createTfSlice(DataType dt, TfSliceParamSpec p) = 0; + + virtual std::shared_ptr createSplice(DataType dt, SpliceParamSpec p) = 0; + + virtual std::shared_ptr createShape() = 0; + + std::shared_ptr createOperators(OperatorSpec curOps, + DataType dt, + std::map operatorIndexMap, + std::map> *tensorMapPtr, + std::vector inputTensorsName, + std::vector outputTensorsName, + std::set *weightOpOutputNames) + { + OperatorType opType = curOps.type; + DataType dtNoQ = (dt == DT_F16_8Q) ? 
DT_F16 : dt; + std::string opName = curOps.name; + std::shared_ptr op; + auto curPs = curOps.ps; + switch (opType) { + case OT_Conv: { + ActivationParamSpec dwActiveDesc; + ActivationParamSpec pwActiveDesc; + dwActiveDesc.mode = curPs.conv_spec.dw_activation_type; + pwActiveDesc.mode = curPs.conv_spec.pw_activation_type; + dwActiveDesc.value[0] = 0; + pwActiveDesc.value[0] = 0; + op = createConvolution(dt, curPs.conv_spec, dwActiveDesc, pwActiveDesc); + break; + } + case OT_Deconvolution: { + ActivationParamSpec activeDesc; + activeDesc.mode = curPs.conv_spec.pw_activation_type; + activeDesc.value[0] = 0; + op = createDeconvolution(dtNoQ, curPs.conv_spec, activeDesc); + break; + } + case OT_FC: { + op = createFullyConnected(dt, curPs.fc_spec, 0); + break; + } + case OT_Pooling: { + op = createPooling(curPs.pooling_spec); + break; + } + case OT_Softmax: { + op = createSoftmax(dtNoQ, curPs.softmax_spec); + break; + } + case OT_Relu: { + ActivationParamSpec activationDesc; + activationDesc.mode = ACTIVATION_RELU; + activationDesc.value[0] = curOps.ps.relu_spec.neg_slope; + op = createActivation(activationDesc); + break; + } + case OT_Relu6: { + ActivationParamSpec activationDesc; + activationDesc.mode = ACTIVATION_RELU6; + op = createActivation(activationDesc); + break; + } + case OT_HSwish: { + ActivationParamSpec activationDesc; + activationDesc.mode = ACTIVATION_H_SWISH; + op = createActivation(activationDesc); + break; + } + case OT_Sigmoid: { + ActivationParamSpec activationDesc; + activationDesc.mode = ACTIVATION_SIGMOID; + op = createActivation(activationDesc); + break; + } + case OT_HSigmoid: { + ActivationParamSpec activationDesc; + activationDesc.mode = ACTIVATION_H_SIGMOID; + op = createActivation(activationDesc); + break; + } + case OT_Gelu: { + ActivationParamSpec activationDesc; + activationDesc.mode = ACTIVATION_GELU; + op = createActivation(activationDesc); + break; + } + case OT_TanH: { + ActivationParamSpec activationDesc; + activationDesc.mode = ACTIVATION_TANH; + op = createActivation(activationDesc); + break; + } + case OT_Mish: { + ActivationParamSpec activationDesc; + activationDesc.mode = ACTIVATION_MISH; + op = createActivation(activationDesc); + break; + } + case OT_Greater: { + ActivationParamSpec activationDesc; + activationDesc.mode = ACTIVATION_GREATER; + op = createActivation(activationDesc); + break; + } + case OT_Concat: { + op = createConcat(curPs.concat_spec); + break; + } + case OT_Eltwise: { + op = createEltwise(curOps.ps.eltwise_spec); + break; + } + case OT_Embedding: { + op = createEmbedding(dtNoQ, curPs.embed_spec); + break; + } + case OT_MatMul: { + op = createMatMul(dt, curPs.matmul_spec); + break; + } + case OT_Power: { + op = createPower(dt, curPs.power_spec); + break; + } + case OT_Scale: { + op = createScale(dtNoQ, curPs.scale_spec, 0); + break; + } + case OT_LayerNorm: { + op = createLayerNorm(dt, 0); + break; + } + case OT_Reshape: { + op = createReshape(dt, curPs.reshape_spec); + break; + } + case OT_Resize: { + if (curPs.resize_spec.num_sizes > 0) { + op = createResize(DT_U32, curPs.resize_spec); + } else { + CHECK_REQUIREMENT(curPs.resize_spec.num_scales == 4); + op = createResize(DT_F32, curPs.resize_spec); + } + break; + } + case OT_Slice: { + op = createSlice(dt, curPs.slice_spec); + break; + } + case OT_Transpose: { + op = createTranspose(dt, curPs.transpose_spec); + break; + } + case OT_Attention: { + op = createAttention(dtNoQ, curPs.attention_spec); + break; + } + case OT_Clip: { + op = createClip(dtNoQ, curPs.clip_spec); + break; + } 
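+            // Note (editorial, inferred from the dtNoQ definition above): when the model
+            // datatype is DT_F16_8Q, quantization-sensitive layers (Softmax, Scale,
+            // Embedding, Clip, Deconvolution, ...) are created with dtNoQ, i.e. plain
+            // DT_F16, while compute-heavy layers such as Conv/FC/MatMul keep dt and may
+            // be quantized. The next case picks the whole-sequence RNN when steps >= 0
+            // and the single-step RNNCell otherwise.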
+ case OT_RNN: { + if (curPs.rnn_spec.steps >= 0) { + op = createRNN(dt, curPs.rnn_spec); + } else { + op = createRNNCell(dt, curPs.rnn_spec); + } + break; + } + case OT_Squeeze: { + op = createSqueeze(dtNoQ, curPs.squeeze_spec); + break; + } + case OT_Unsqueeze: { + op = createUnsqueeze(dtNoQ, curPs.unsqueeze_spec); + break; + } + case OT_Reduction: { + op = createReduction(dtNoQ, curPs.reduction_spec); + break; + } + case OT_ArgMax: { + op = createArgMax(dtNoQ, curPs.argmax_spec); + break; + } + case OT_PreAllocatedMemory: { + PreAllocatedMemoryParamSpec curPreAllocatedMemoryParamSpec = + curOps.ps.preallocated_memory_spec; + TensorDesc desc = curPreAllocatedMemoryParamSpec.desc; + op = createPreAllocatedMemory(dtNoQ, desc); + break; + } + case OT_SharedWeight: { + SharedWeightParamSpec curSharedWeightParamSpec = curOps.ps.shared_weight_spec; + TensorDesc desc = curSharedWeightParamSpec.desc; + op = createSharedWeight(dtNoQ, desc, outputTensorsName[0], tensorMapPtr); + weightOpOutputNames->insert(outputTensorsName[0]); + break; + } + case OT_Repeat: { + op = createRepeat(dtNoQ, curPs.repeat_spec, operatorIndexMap[inputTensorsName[0]], + operatorIndexMap[opName]); + break; + } + case OT_Check: { + op = createCheck(dtNoQ, curPs.check_spec); + break; + } + case OT_Copy: { + op = createCopy(dtNoQ, curPs.copy_spec); + break; + } + case OT_BilateralSliceApply: { + op = createBilateralSliceApply(curPs.bilateral_slice_apply_spec); + break; + } + case OT_Jump: { + op = createJump( + dtNoQ, operatorIndexMap[inputTensorsName[0]], operatorIndexMap[opName]); + break; + } + case OT_Space2Depth: { + op = createSpace2Depth(dt); + break; + } + case OT_Depth2Space: { + op = createDepth2Space(dt, curPs.depth2space_spec); + break; + } + case OT_AttentionMask: { + op = createAttentionMask(dt, curPs.attention_mask_spec); + break; + } + case OT_RelativePositionEmbedding: { + op = createRelativePositionEmbedding(dtNoQ, curPs.embed_spec); + break; + } + case OT_RelativeShift: { + op = createRelativeShift(dt, curPs.relative_shift_spec); + break; + } + case OT_Pad: { + op = createPadding(dt, curPs.pad_spec); + break; + } + case OT_PriorBox: { + op = createPriorBox(dt, curPs.prior_box_spec); + break; + } + case OT_DetectionOutput: { + op = createDetectionOutput(dt, curPs.detection_output_spec); + break; + } + case OT_Yolov3DetectionOutput: { + op = createYolov3DetectionOutput(dt, curPs.yolov3_detection_output_spec); + break; + } + case OT_ChannelResize: { + op = createChannelResize(dt, curPs.channel_resize_spec); + break; + } + case OT_L2Normalization: { + op = createL2Normalization(dt); + break; + } + case OT_PRelu: { + op = createPReLU(dt); + break; + } + case OT_Tile: { + op = createTile(dt, curPs.tile_spec); + break; + } + case OT_TfSlice: { + op = createTfSlice(dt, curPs.tfslice_spec); + break; + } + case OT_Splice: { + op = createSplice(dt, curPs.splice_spec); + break; + } + case OT_Shape: { + op = createShape(); + break; + } + default: { + UNI_ERROR_LOG("unsupported layer %s\n", OperatorTypeName()[opType]); + break; + } + } + return op; + } +}; + +#endif // _FACTORY_H diff --git a/inference/engine/include/fully_connected.hpp b/inference/engine/include/fully_connected.hpp new file mode 100644 index 00000000..1f563e69 --- /dev/null +++ b/inference/engine/include/fully_connected.hpp @@ -0,0 +1,40 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
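The NOT_USEn/OP_UNSUP macros defined at the top of factory.hpp exist so a concrete backend factory can stub out operators it does not implement. A hypothetical stub in a derived factory (a sketch of the intended usage, not code from this patch):

    std::shared_ptr<Operator> createBilateralSliceApply(BilateralSliceApplyParamSpec p) override
    {
        // Marks the single argument as used, then raises NOT_SUPPORTED via CHECK_STATUS.
        OP_UNSUP(1, p);
        // cep is the NULL Operator* declared by the NOT_SUPPORT macro.
        return std::shared_ptr<Operator>(cep);
    }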
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _FULLY_CONNECTED_H +#define _FULLY_CONNECTED_H + +#include "weight_operator.hpp" + +class FullyConnected : public WeightOperator { +public: + FullyConnected(DataType dt, FullyConnectedParamSpec p, U32 numInput) + { + this->dt = dt; + this->p = p; + this->numInput = numInput; + this->hasBias = false; + } + + OperatorType get_type() override + { + return OT_FC; + } + +public: + U32 numInput; + + FullyConnectedParamSpec p; +}; + +#endif // _FULLY_CONNECTED_H diff --git a/inference/engine/include/inference.hpp b/inference/engine/include/inference.hpp new file mode 100644 index 00000000..a5944c49 --- /dev/null +++ b/inference/engine/include/inference.hpp @@ -0,0 +1,84 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#ifndef _HPP_INFERENCE +#define _HPP_INFERENCE + +#include "cnn.h" +#ifdef _USE_MALI +#include "gcl.h" +#endif +#ifdef _BUILD_TEST +#include "sequential.hpp" +#endif +#include "thread_affinity.h" +#include "op_type.h" +#include "model_tools.h" +#include "model_serialize_deserialize.hpp" + +inline std::map extractInputDims(const ModelSpec *ms) +{ + std::map inputDescMap; + int inputNum = ms->num_inputs; + for (int i = 0; i < inputNum; i++) { + inputDescMap[ms->input_names[i]] = ms->input_dims[i]; + } + return inputDescMap; +} + +inline std::shared_ptr createPipelinefromMs( + const char *affinityPolicyName, ModelSpec *ms, const char *algorithmMapPath) +{ + AffinityPolicy affinityPolicy = thread_affinity_get_policy_by_name(affinityPolicyName); + CNN *cnn = new CNN(affinityPolicy, ms->dt, ms->model_name); + + cnn->sort_operators_sequential(ms); + + // create ops + cnn->initialize_ops(ms); + + std::map inputDescMap = extractInputDims(ms); + + cnn->loadAlgorithmMapFromText(algorithmMapPath); + + // assign space for output, tmp, bias, and trans_weight + cnn->ready(inputDescMap); + + CHECK_STATUS(cnn->mark_input_output()); + + return std::shared_ptr(cnn); +} + +inline std::shared_ptr createPipeline( + const char *affinityPolicyName, const char *modelPath, const char *algorithmMapPath = "") +{ + // deserialize model from file + ModelSpec ms; + CHECK_STATUS(deserialize_model_from_file(modelPath, &ms)); + + std::shared_ptr pipeline = createPipelinefromMs(affinityPolicyName, &ms, algorithmMapPath); + + CHECK_STATUS(mt_destroy_model(&ms)); + return pipeline; +} + +#ifdef _BUILD_TEST +inline Sequential createSequentialPipeline( + const char *affinityPolicyName, DataType dt, const char *modelName) +{ + AffinityPolicy affinityPolicy = thread_affinity_get_policy_by_name(affinityPolicyName); + auto sequential = Sequential(affinityPolicy, dt, modelName); + return sequential; +} +#endif +#endif diff --git a/inference/include/jump.hpp b/inference/engine/include/jump.hpp similarity index 65% rename from inference/include/jump.hpp rename to inference/engine/include/jump.hpp index 07ccd39e..2932217b 100644 --- a/inference/include/jump.hpp +++ b/inference/engine/include/jump.hpp @@ -1,28 +1,23 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
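The two helpers above are the public entry points for running a converted .bolt model. A minimal caller might look like the following; the model path is a placeholder, and the affinity string follows the policy-by-name lookup used by thread_affinity. Input binding goes through the CNN interface, which is not part of this hunk, so it is only noted in a comment.

    #include "inference.hpp"

    int main()
    {
        std::shared_ptr<CNN> pipeline =
            createPipeline("CPU_AFFINITY_HIGH_PERFORMANCE", "./model_f16.bolt");
        // ... bind input tensors via the CNN interface, then one forward pass is:
        pipeline->run();
        return 0;
    }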
IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _JUMP_H #define _JUMP_H #include "operator.hpp" -class Jump: public Operator -{ +class Jump : public Operator { public: - /** - @param mode - */ Jump(DataType dt, I32 jumpOperatorIndex, I32 currentOperatorIndex) { this->dt = dt; @@ -30,22 +25,29 @@ class Jump: public Operator this->nextOperatorIndex = currentOperatorIndex + 1; } - OperatorType get_op_type() override + std::shared_ptr clone() override + { + std::shared_ptr mem = std::shared_ptr( + new Jump(this->dt, this->jumpOperatorIndex, this->nextOperatorIndex)); + *mem = *this; + return mem; + } + + OperatorType get_type() override { return OT_Jump; } void run() override - { } + {} int get_next_operator_index() override { // check status if (this->inputTensors.size() > 1) { Tensor inputTensor = this->inputTensors[1]; - TensorDesc inputDesc = inputTensor.get_desc(); - I32 *ptr = (I32 *)(inputTensor.get_val()); - U32 length = tensorNumElements(inputDesc); + I32 *ptr = (I32 *)((CpuMemory *)(inputTensor.get_memory()))->get_ptr(); + U32 length = inputTensor.length(); for (U32 i = 0; i < length; i++) { if (ptr[i]) { return this->jumpOperatorIndex; @@ -54,12 +56,15 @@ class Jump: public Operator } return this->nextOperatorIndex; } - EE infer_output_tensors_size(VecinDims, Vec* outDims) override - { - UNUSED(inDims); - (*outDims)[0].dt = this->dt; - (*outDims)[0].nDims = 0; + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + UNUSED(inTensors); + auto outDim = outTensors[0]->get_desc(); + outDim.dt = this->dt; + outDim.nDims = 0; + outTensors[0]->resize(outDim); return SUCCESS; } @@ -68,4 +73,4 @@ class Jump: public Operator int nextOperatorIndex; }; -#endif //_JUMP_H +#endif // _JUMP_H diff --git a/inference/engine/include/l2normalization.hpp b/inference/engine/include/l2normalization.hpp new file mode 100644 index 00000000..9916cd87 --- /dev/null +++ b/inference/engine/include/l2normalization.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
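For readers new to the engine's control flow: Jump (and Repeat) override get_next_operator_index(), and the scheduler advances by that index instead of i + 1. Restated as a sketch of the dispatch loop (it mirrors run_till_breakpoint in model.hpp further down; the real loop lives in Model/CNN):

    U32 i = 0;
    while (i < ops.size()) {
        auto op = ops[i];
        if (op->get_type() == OT_Jump || op->get_type() == OT_Repeat) {
            // Jump branches if any element of its second input tensor is non-zero.
            i = op->get_next_operator_index();
        } else {
            op->run();
            i++;
        }
    }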
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _L2NORMALIZATION_H +#define _L2NORMALIZATION_H + +#include "operator.hpp" + +class L2Normalization : public Operator { +public: + L2Normalization(DataType dt) + { + this->dt = dt; + } + + OperatorType get_type() override + { + return OT_L2Normalization; + } +}; + +#endif // _L2NORMALIZATION_H diff --git a/inference/include/layer_norm.hpp b/inference/engine/include/layer_norm.hpp similarity index 72% rename from inference/include/layer_norm.hpp rename to inference/engine/include/layer_norm.hpp index 59590fc8..f9e27ac0 100644 --- a/inference/include/layer_norm.hpp +++ b/inference/engine/include/layer_norm.hpp @@ -1,41 +1,37 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- #ifndef _LAYER_NORM_H #define _LAYER_NORM_H #include "operator.hpp" -#include "tensor_computing.h" -#include "tensor_desc.h" -#include "op_type.h" -class LayerNorm: public WeightOperator { +class LayerNorm : public WeightOperator { public: - LayerNorm(DataType dt, U32 weightNum){ + LayerNorm(DataType dt, U32 weightNum) + { this->dt = dt; this->weightNum = weightNum; this->hasBias = false; } - OperatorType get_op_type() override + OperatorType get_type() override { return OT_LayerNorm; } - virtual EE init_weight_bias_from_model(U8** modelPtr) = 0; protected: U32 weightNum; }; -#endif //_LAYER_NORM_H +#endif // _LAYER_NORM_H diff --git a/inference/engine/include/matmul.hpp b/inference/engine/include/matmul.hpp new file mode 100644 index 00000000..ee193d04 --- /dev/null +++ b/inference/engine/include/matmul.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _MATMUL_H +#define _MATMUL_H + +#include "operator.hpp" + +class MatMul : public Operator { +public: + MatMul(DataType dt, MatMulParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_MatMul; + } + +protected: + MatMulParamSpec p; +}; + +#endif // _MATMUL_H diff --git a/inference/engine/include/memory_tracker.hpp b/inference/engine/include/memory_tracker.hpp new file mode 100644 index 00000000..19f05a65 --- /dev/null +++ b/inference/engine/include/memory_tracker.hpp @@ -0,0 +1,115 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _MEMORY_TRACKER_H +#define _MEMORY_TRACKER_H + +#include "tensor_desc.h" + +class MemoryTracker { +public: + MemoryTracker() + { + this->storageSize.clear(); + this->tensorStoragePosition.clear(); + this->memoryNeedAssign = true; + } + + void trackOpTensorSizes(std::shared_ptr op, std::vector tensorNames) + { + I32 *pos = op->get_tensor_positions().data(); + auto inputTensors = op->get_input_tensors(); + auto outputTensors = op->get_output_tensors(); + size_t numInput = inputTensors.size(); + size_t numOutput = outputTensors.size(); + for (size_t i = 0; i < numInput; i++) { + U32 size = inputTensors[i].bytes(); + I32 slot = pos[i]; + this->tensorStoragePosition[tensorNames[i]] = slot; + if (-1 == slot) { + if (!memoryNeedAssign) { + if (size > inputTensors[i].capacity()) { + this->memoryNeedAssign = true; + } + } + continue; + } + this->trackSlotSize(slot, size); + } + for (size_t i = 0; i < numOutput; i++) { + U32 size = outputTensors[i].bytes(); + I32 slot = pos[numInput + i]; + this->tensorStoragePosition[tensorNames[numInput + i]] = slot; + if (-1 == slot) { + if (!memoryNeedAssign) { + if (size > outputTensors[i].capacity()) { + this->memoryNeedAssign = true; + } + } + continue; + } + this->trackSlotSize(slot, size); + } + } + + I32 getSlotByTensorName(std::string name) + { + return tensorStoragePosition[name]; + } + + U32 getNumSlots() + { + return this->storageSize.size(); + } + + U32 getSizeSum() + { + U32 sum = 0; + for (U32 size : this->storageSize) { + sum += size; + } + return sum; + } + + std::vector getStorageSize() + { + return this->storageSize; + } + + void setMemoryAssigned() + { + this->memoryNeedAssign = false; + } + + bool getMemoryNeedAssign() + { + return this->memoryNeedAssign; + } + +protected: + void trackSlotSize(I32 slot, U32 size) + { + if (slot >= (I32)this->storageSize.size()) { + this->storageSize.resize(slot + 1, 0); + } + if (size > this->storageSize[slot]) { + this->storageSize[slot] = size; + this->memoryNeedAssign = true; + } + } + + std::vector storageSize; + std::map tensorStoragePosition; + bool memoryNeedAssign; +}; +#endif diff --git a/inference/engine/include/model.hpp b/inference/engine/include/model.hpp new file mode 100644 index 00000000..6e00c69c --- /dev/null +++ b/inference/engine/include/model.hpp @@ -0,0 +1,196 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
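MemoryTracker above implements the engine's slot-based buffer reuse: each tensor is mapped to a storage slot (-1 means a private buffer), and a slot's size is the maximum over every tensor ever assigned to it, so tensors with non-overlapping lifetimes share one allocation. A worked example with invented numbers:

    // Three tensors assigned to slot 0 at different layers:
    //   conv1 out : 1 MB -> storageSize[0] = 1 MB
    //   conv2 out : 4 MB -> storageSize[0] = 4 MB (grown, memoryNeedAssign = true)
    //   pool1 out : 2 MB -> storageSize[0] stays 4 MB
    // Final allocation: a single 4 MB buffer backs all three tensors in turn.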
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _MODEL_H +#define _MODEL_H + +#include "thread_affinity.h" +#include "operator.hpp" +#include "algorithm_map.h" +#include "tensor_desc.h" +#ifdef _USE_MALI +#include "gcl.h" +#endif +#include "profiling.h" + +class Model { +public: + Model() + {} + Model(AffinityPolicy affinityPolicy, DataType dt, std::string name) + { + this->set_device_info(affinityPolicy); + this->dt = dt; + this->name = name; + std::string deviceName = ""; +#ifdef _USE_MALI + if (this->deviceInfo.schedule == MALI) { + this->handle = OCLContext::getInstance().handle; + deviceName = this->handle->deviceName; + } +#endif + algorithmMap = std::shared_ptr( + new AlgorithmMap(this->deviceInfo.schedule, name, deviceName, dt)); + } + + void set_device_info(AffinityPolicy affinityPolicy) + { +#ifndef _USE_IOS + this->deviceInfo = get_cpu_info(affinityPolicy); + this->set_runtime_device_dynamic(); +#else + this->deviceInfo.affinityPolicy = affinityPolicy; + this->deviceInfo.schedule = ARM_A76; +#endif + } + + void set_runtime_device(int cpuId, int threadId = 0) + { + this->set_runtime_device(cpuId, this->deviceInfo.archs[cpuId], threadId); + } + + void set_runtime_device(int cpuId, Arch arch, int threadId = 0) + { + this->deviceInfo.schedule = arch; + if (cpuId >= 0 && cpuId < this->deviceInfo.cpuNum) { + set_thread_affinity(threadId, &cpuId, 1); + for (auto op : ops) { + op->set_schedule(this->deviceInfo.schedule); + } + } + } + + void set_runtime_device_dynamic(int threadId = 0) + { + set_cpu_dynamic(&this->deviceInfo, threadId); + } + + Arch get_runtime_device() + { + return this->deviceInfo.schedule; + } + + virtual EE infer_output_tensors_size(std::map) = 0; + virtual void assign_output_tensor() = 0; + virtual void infer_tmp_memory_size() = 0; + virtual void assign_tmp_tensor() = 0; + + virtual void ready(std::map inputDescMap) + { + infer_output_tensors_size(inputDescMap); + assign_output_tensor(); + + infer_tmp_memory_size(); + assign_tmp_tensor(); + } + + virtual void run() = 0; + +#ifdef _USE_INT8 + virtual U32 find_next_dynamic_scale_op(std::vector calibratedOpIdx, U32 startIdx) + { + CHECK_REQUIREMENT(startIdx < this->ops.size()) + for (U32 i = startIdx; i < this->ops.size();) { + auto op = this->ops[i]; + if (op->is_dynamic_scale()) { + bool calibrated = false; + for (auto idx : calibratedOpIdx) { + if (i == idx) { + calibrated = true; + break; + } + } + if (!calibrated) { + return i; + } + } + + if (op->get_type() == OT_Repeat || op->get_type() == OT_Jump) { + i = op->get_next_operator_index(); + } else { + i++; + } + } + + return 0; // The first layer should never be quantized + } + + virtual std::shared_ptr get_operator_by_index(U32 index) + { + return this->ops[index]; + } + + virtual void run_till_breakpoint(U32 opIdx) + { + CHECK_REQUIREMENT(MALI != this->deviceInfo.schedule); + for (U32 i = 0; i < this->ops.size();) { + auto op = this->ops[i]; + if (op->get_type() == OT_Repeat || op->get_type() == OT_Jump) { + if (opIdx == i) { + break; + } + i = op->get_next_operator_index(); + } else { + op->run(); + if (opIdx == i) { + break; + } + i++; + } + } + } +#endif + + virtual bool checkOperator() + { + for (auto op : this->ops) { + if (!op->checkOperator()) { + return false; + } + } + return true; + } + + std::string get_name() + { + return 
this->name; + } + + void loadAlgorithmMapFromFileStream(const char *algoFileStream) + { + this->algorithmMap->loadAlgorithmMapFromFileStream(algoFileStream); + } + + void loadAlgorithmMapFromText(std::string algorithmMapPath) + { + this->algorithmMap->loadAlgorithmMapFromText(algorithmMapPath); + } + + void saveAlgorithmMapToText(std::string algorithmMapPath) + { + this->algorithmMap->saveAlgorithmMapToText(algorithmMapPath); + } + +protected: + std::vector> ops; + DeviceInfo deviceInfo; + DataType dt; +#ifdef _USE_MALI + std::shared_ptr handle; +#endif + std::shared_ptr algorithmMap; + +private: + std::string name; +}; +#endif diff --git a/inference/engine/include/ocl/activation_ocl.hpp b/inference/engine/include/ocl/activation_ocl.hpp new file mode 100644 index 00000000..d7aaa5af --- /dev/null +++ b/inference/engine/include/ocl/activation_ocl.hpp @@ -0,0 +1,55 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _ACTIVATION_OCL_H +#define _ACTIVATION_OCL_H + +#include "activation.hpp" + +class ActivationOCL : public Activation { +public: + ActivationOCL(ActivationParamSpec activationDesc) : Activation(activationDesc) + { + setMALIArchInfo( + &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS); + } + + ~ActivationOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new ActivationOCL(this->activationDesc)); + *mem = *this; + return mem; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + Tensor inputTensor = this->inputTensors[0]; + Tensor outputTensor = this->outputTensors[0]; + CHECK_STATUS(activation(inputTensor, this->activationDesc, outputTensor, &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + CHECK_STATUS(activation_infer_output_size(inTensors[0], outTensors[0], &this->archInfo)); + return SUCCESS; + } + REGISTER_OCL_OPERATOR_RUN +}; + +#endif // _ACTIVATION_OCL_H diff --git a/inference/engine/include/ocl/argmax_ocl.hpp b/inference/engine/include/ocl/argmax_ocl.hpp new file mode 100644 index 00000000..329fb93b --- /dev/null +++ b/inference/engine/include/ocl/argmax_ocl.hpp @@ -0,0 +1,66 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _ARGMAX_OCL_H +#define _ARGMAX_OCL_H + +#include "argmax.hpp" + +class ArgMaxOCL : public ArgMax { +public: + ArgMaxOCL(DataType dt, ArgMaxParamSpec p) : ArgMax(dt, p) + { + setMALIArchInfo( + &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS); + } + + ~ArgMaxOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new ArgMaxOCL(this->dt, this->p)); + *mem = *this; + return mem; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + Tensor inputTensor = this->inputTensors[0]; + Tensor outputTensor = this->outputTensors[0]; + CHECK_STATUS(argmax(inputTensor, this->p, this->temp, outputTensor, &this->archInfo)); + } + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + CHECK_STATUS( + argmax_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); + return SUCCESS; + } + + U32 infer_tmp_memory_size() override + { + Tensor inputTensor = this->inputTensors[0]; + Tensor outputTensor = this->outputTensors[0]; + U32 bytes = 0; + CHECK_STATUS(argmax_infer_forward_tmp_bytes( + inputTensor, this->p, outputTensor, &bytes, &this->archInfo)); + return bytes; + } + + REGISTER_OCL_OPERATOR_RUN +}; + +#endif // _ARGMAX_OCL_H diff --git a/inference/engine/include/ocl/bilateral_slice_apply_ocl.hpp b/inference/engine/include/ocl/bilateral_slice_apply_ocl.hpp new file mode 100644 index 00000000..b2d3150f --- /dev/null +++ b/inference/engine/include/ocl/bilateral_slice_apply_ocl.hpp @@ -0,0 +1,87 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _BILATERAL_SLICE_APPLY_OCL_H +#define _BILATERAL_SLICE_APPLY_OCL_H + +#include "bilateral_slice_apply.hpp" + +class BilateralSliceApplyOCL : public BilateralSliceApply { +public: + BilateralSliceApplyOCL(BilateralSliceApplyParamSpec p) : BilateralSliceApply(p) + { + this->guideTensor = Tensor(OCLMem); + setMALIArchInfo( + &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS); + } + + ~BilateralSliceApplyOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new BilateralSliceApplyOCL(this->p)); + *mem = *this; + return mem; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + Tensor inputTensor = this->inputTensors[0]; + Tensor gridTensor = this->inputTensors[1]; + Tensor outputTensor = this->outputTensors[0]; + + if (this->p.mode == BSliceApply_NULL) { + this->guideTensor = this->inputTensors[2]; + } + CHECK_STATUS(bilateral_slice_apply( + inputTensor, guideTensor, gridTensor, p, this->temp, outputTensor, &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + auto inTensor = inTensors[0]; + auto gridTensor = inTensors[1]; + auto inDim = inTensor->get_desc(); + DataType dt; + DataFormat df; + U32 width; + U32 height; + U32 numChannels; + U32 num; + CHECK_STATUS(tensor4dGet(inDim, &dt, &df, &num, &numChannels, &height, &width)); + TensorDesc guideDesc = tensor4df(DT_F16, df, num, 1, height, width); + this->guideTensor.resize(guideDesc); + + CHECK_STATUS(bilateral_slice_apply_infer_output_size( + inTensor, &guideTensor, gridTensor, p, outTensors[0], &this->archInfo)); + return SUCCESS; + } + + U32 infer_tmp_memory_size() override + { + U32 bytes = 0; + CHECK_STATUS(bilateral_slice_apply_infer_forward_tmp_bytes(this->inputTensors[0], + this->guideTensor, this->inputTensors[1], p, &bytes, &this->archInfo)); + return bytes; + } + + REGISTER_OCL_OPERATOR_RUN +private: + Tensor guideTensor; +}; + +#endif // _BILATERAL_SLICE_APPLY_OCL_H diff --git a/inference/engine/include/ocl/channel_resize_ocl.hpp b/inference/engine/include/ocl/channel_resize_ocl.hpp new file mode 100644 index 00000000..8a76e921 --- /dev/null +++ b/inference/engine/include/ocl/channel_resize_ocl.hpp @@ -0,0 +1,72 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _CHANNEL_RESIZE_OCL_H +#define _CHANNEL_RESIZE_OCL_H + +#include "channel_resize.hpp" + +class ChannelResizeOCL : public ChannelResize { +public: + ChannelResizeOCL(DataType dt, ChannelResizeParamSpec p) : ChannelResize(dt, p) + { + setMALIArchInfo( + &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS); + } + + ~ChannelResizeOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new ChannelResizeOCL(this->dt, this->p)); + *mem = *this; + return mem; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + + Tensor inputTensor = this->inputTensors[0]; + Tensor outputTensor = this->outputTensors[0]; + + CHECK_STATUS(channel_resize(inputTensor, this->p, outputTensor, &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + TensorDesc inDesc = inTensors[0]->get_desc(); + int channelAxis = inDesc.nDims - 2; + if ((int)inDesc.dims[channelAxis] != this->p.channel_before) { + this->p.channel_before = inDesc.dims[channelAxis]; + } + if (this->p.group == 0) { + this->p.group = 1; + this->p.channel_before = (int)inDesc.dims[channelAxis]; + this->p.channel_after = this->p.channel_before; + } + if (this->p.group != 1) { + return NOT_SUPPORTED; + } + CHECK_STATUS( + channel_resize_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); + return SUCCESS; + } + + REGISTER_OCL_OPERATOR_RUN +}; + +#endif // _CHANNEL_RESIZE_OCL_H diff --git a/inference/engine/include/ocl/check_ocl.hpp b/inference/engine/include/ocl/check_ocl.hpp new file mode 100644 index 00000000..3e4c322c --- /dev/null +++ b/inference/engine/include/ocl/check_ocl.hpp @@ -0,0 +1,54 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+
+#ifndef _CHECK_OCL_H
+#define _CHECK_OCL_H
+
+#include "check.hpp"
+
+class CheckOCL : public Check {
+public:
+    CheckOCL(DataType dt, CheckParamSpec p) : Check(dt, p)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~CheckOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<CheckOCL> mem =
+            std::shared_ptr<CheckOCL>(new CheckOCL(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        Tensor inputATensor = this->inputTensors[0];
+        Tensor inputBTensor = this->inputTensors[1];
+        Tensor outputTensor = this->outputTensors[0];
+        CHECK_STATUS(check(inputATensor, inputBTensor, this->p, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        return check_infer_output_size(inTensors, outTensors[0], &this->archInfo);
+    }
+    REGISTER_OCL_OPERATOR_RUN
+};
+
+#endif  // _CHECK_OCL_H
diff --git a/inference/engine/include/ocl/clip_ocl.hpp b/inference/engine/include/ocl/clip_ocl.hpp
new file mode 100644
index 00000000..8733027e
--- /dev/null
+++ b/inference/engine/include/ocl/clip_ocl.hpp
@@ -0,0 +1,54 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
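+
+// ClipOCL bounds each element on the GPU: y = min(max(x, p.min), p.max).
+// For example, p.min = 0 and p.max = 6 realizes ReLU6. (A sketch assuming
+// ClipParamSpec exposes the two bounds, as the call to clip() below implies.)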
+
+#ifndef _CLIP_OCL_H
+#define _CLIP_OCL_H
+
+#include "clip.hpp"
+
+class ClipOCL : public Clip {
+public:
+    ClipOCL(DataType dt, ClipParamSpec p) : Clip(dt, p)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~ClipOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<ClipOCL> mem =
+            std::shared_ptr<ClipOCL>(new ClipOCL(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        CHECK_STATUS(clip(inputTensor, this->p, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        CHECK_STATUS(clip_infer_output_size(inTensors[0], outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+    REGISTER_OCL_OPERATOR_RUN
+};
+
+#endif  // _CLIP_OCL_H
diff --git a/inference/engine/include/ocl/concat_ocl.hpp b/inference/engine/include/ocl/concat_ocl.hpp
new file mode 100644
index 00000000..54ba4027
--- /dev/null
+++ b/inference/engine/include/ocl/concat_ocl.hpp
@@ -0,0 +1,60 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
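+
+// ConcatOCL joins all of its inputs along p.axis. A shape sketch (axis = 1,
+// the channel axis):
+//
+//   {1, 16, 32, 32} ++ {1, 8, 32, 32} -> {1, 24, 32, 32}
+//
+// The scratch bytes reported by infer_tmp_memory_size() below are assumed to
+// cover any repacking of inputs before the concat kernel runs.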
+ +#ifndef _CONCAT_OCL_H +#define _CONCAT_OCL_H + +#include "concat.hpp" + +class ConcatOCL : public Concat { +public: + ConcatOCL(ConcatParamSpec p) : Concat(p) + { + setMALIArchInfo( + &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS); + } + + ~ConcatOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = std::shared_ptr(new ConcatOCL(this->p)); + *mem = *this; + return mem; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + auto outputTensor = this->outputTensors[0]; + CHECK_STATUS(concat(this->inputTensors, this->p, this->temp, outputTensor, &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + CHECK_STATUS(concat_infer_output_size(inTensors, this->p, outTensors[0], &this->archInfo)); + return SUCCESS; + } + + U32 infer_tmp_memory_size() override + { + U32 bytes = 0; + CHECK_STATUS(concat_infer_forward_tmp_bytes(this->inputTensors, &bytes, &this->archInfo)); + return bytes; + } + REGISTER_OCL_OPERATOR_RUN +}; + +#endif // _CONCAT_OCL_H diff --git a/inference/engine/include/ocl/convolution_ocl.hpp b/inference/engine/include/ocl/convolution_ocl.hpp new file mode 100644 index 00000000..be6e3400 --- /dev/null +++ b/inference/engine/include/ocl/convolution_ocl.hpp @@ -0,0 +1,534 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _CONVELTWISEPOOLING_OCL_H +#define _CONVELTWISEPOOLING_OCL_H + +#include "convolution.hpp" + +#include "ocl_desc_trans.h" + +class ConvolutionOCL : public Convolution { +public: + ConvolutionOCL(DataType dt, + ConvolutionParamSpec p, + ActivationParamSpec dwActivationParamSpec, + ActivationParamSpec pwActivationParamSpec) + : Convolution(dt, p, dwActivationParamSpec, pwActivationParamSpec) + { + setMALIArchInfo(&(this->archInfo), &(this->runInfo), &this->needSetKernelVec, + &this->needSelectKernelLS); + } + + ~ConvolutionOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = std::shared_ptr(new ConvolutionOCL( + this->dt, this->p, this->dwActivationParamSpec, this->pwActivationParamSpec)); + *mem = *this; + return mem; + } + + EE infer_weight_desc() override + { + TensorDesc wDesc[2]; + TensorDesc vDesc[2]; + wDesc[0] = this->filterDesc; + U32 filterNum = 1; + DataType dtNoQ = (this->dt == DT_F16_8Q) ? 
DT_F16 : this->dt; + switch (this->p.convolution_type) { + case Convolution_Pointwise: { + vDesc[0] = tensor1d(dtNoQ, + this->p.num_outputs); // bias data type should be the same as input and output + ((MaliPara_t)(this->archInfo.archPara))->forwardRunInfo->algorithm = + CONVOLUTION_ALGORITHM_NULL; + break; + } + case Convolution_Depthwise: { + vDesc[0] = tensor1d(dtNoQ, this->p.num_outputs); + ((MaliPara_t)(this->archInfo.archPara))->forwardRunInfo->algorithm = + DEPTHWISE_CONVOLUTION_ALGORITHM_NULL; + break; + } + case Convolution_Depthwise_Pointwise: { + wDesc[1] = this->filterDescExt; + vDesc[0] = tensor1d(dtNoQ, this->numChannels); + vDesc[1] = tensor1d(dtNoQ, this->p.num_outputs); + filterNum = 2; + ((MaliPara_t)(this->archInfo.archPara))->forwardRunInfo->algorithm = + DEPTHWISE_CONVOLUTION_ALGORITHM_NULL; + break; + } + case Convolution_Dilation: { + CHECK_STATUS(NOT_SUPPORTED); + return NOT_SUPPORTED; + break; + } + default: + CHECK_STATUS(NOT_SUPPORTED); + return NOT_SUPPORTED; + } + + for (U32 i = 0; i < filterNum; i++) { + Tensor modelWeightTensor = Tensor(OCLMem); + Tensor modelVectorTensor = Tensor(OCLMem); + auto weightMem = (OclMemory *)modelWeightTensor.get_memory(); + auto vectorMem = (OclMemory *)modelVectorTensor.get_memory(); + modelWeightTensor.resize(wDesc[i]); + modelVectorTensor.resize(vDesc[i]); + + U32 ww, wh, wc, wn; + DataFormat df; + DataType dt; + tensorSelectGet(wDesc[i], &dt, &df, &wn, &wc, &wh, &ww); + U32 stride[3] = {ww * wh, wc, wn}; + U32 offset[3] = {0, 0, 0}; + GCLMemType mt = GCL_MEM_BUF; + MemFlags flags = CL_MEM_READ_WRITE; + GCLMemDesc desc = gclmem_build_desc(); + CHECK_STATUS(gclmem_set_desc_padding(&desc, stride, offset, dt, df, mt, flags)); + weightMem->padding(desc); + + mt = GCL_MEM_IMG_1D; + U32 vecLen = vDesc[i].dims[0]; + U32 vecAlign = 4; + stride[0] = (vecLen + vecAlign - 1) / vecAlign; + if (i == 0) { + U32 iw, ih; + TensorDesc inputDesc = this->inputTensors[0].get_desc(); + tensorSelectGet(inputDesc, NULL, NULL, NULL, NULL, &ih, &iw); + if ((wn == 1 && this->p.convolution_type == Convolution_Pointwise) || + (ww == 1 && wh == 1 && iw == 1 && ih == 1)) { + mt = GCL_MEM_BUF; + vecAlign = 8; + stride[0] = (vecLen + vecAlign - 1) / vecAlign * vecAlign; + } + } + stride[1] = 1; + stride[2] = 1; + gclmem_set_desc_padding(&desc, stride, offset, dt, DF_NHWC, mt, flags); + vectorMem->padding(desc); + this->weightTensors.push_back(modelWeightTensor); + this->biasTensors.push_back(modelVectorTensor); + } + return SUCCESS; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + Tensor inputTensor = this->inputTensors[0]; + if (this->needTransInput) { + auto inputMem = (OclMemory *)inputTensor.get_memory(); + GCLMemDesc inputDesc = inputMem->get_desc(); + void *inputPtr = inputMem->get_ptr(); + TensorDesc inputDescCpu = inputTensor.get_desc(); + DataType dt; + DataFormat df; + U32 iw, ih, ic; + U32 iw_str, ih_str, ic_str, iw_off, ih_off; + tensorSelectGet(inputDescCpu, &dt, &df, NULL, &ic, &ih, &iw); + get_gclmem_dim(inputDesc, &iw_str, &ih_str, &ic_str, &iw_off, &ih_off); + if (inputDesc.memFormat == df && iw_str == iw && ih_str == ih && ic_str == ic && + iw_off == 0 && ih_off == 0) { + this->needTransInput = false; + } else { + auto tmpMem = (OclMemory *)this->temp.get_memory(); + void *tmpPtr = tmpMem->get_ptr(); + U32 stride[3] = {iw, ih, ic}; + U32 offset[3] = {0, 0, 0}; + GCLMemType mt = GCL_MEM_BUF; + MemFlags flags = CL_MEM_READ_WRITE; + GCLMemDesc initDesc = gclmem_build_desc(); + 
CHECK_STATUS(gclmem_set_desc_padding(&initDesc, stride, offset, dt, df, mt, flags)); + CHECK_STATUS(ocl_trans_mem(OCLContext::getInstance().handle.get(), + (GCLMem_t)inputPtr, initDesc, (GCLMem_t)tmpPtr, initDesc)); + CHECK_STATUS(ocl_trans_mem(OCLContext::getInstance().handle.get(), (GCLMem_t)tmpPtr, + initDesc, (GCLMem_t)inputPtr, inputDesc)); + } + } + Tensor filterTensor = this->weightTensors[0]; + filterTensor.resize(this->filterDesc); + U8 *scalePtr = nullptr; + Tensor biasTensor = this->biasTensors[0]; + Tensor outputTensor = this->outputTensors[0]; + switch (this->p.convolution_type) { + case Convolution_Pointwise: { + CHECK_STATUS( + convolution(inputTensor, filterTensor, p, this->pwAlg, scalePtr, biasTensor, + this->temp, outputTensor, this->pwActivationParamSpec, &this->archInfo)); + break; + } + case Convolution_Depthwise: { + CHECK_STATUS( + depthwise_convolution(inputTensor, filterTensor, p, this->dwAlg, biasTensor, + this->temp, outputTensor, this->dwActivationParamSpec, &this->archInfo)); + break; + } + case Convolution_Depthwise_Pointwise: { + auto dwFilterTensor = filterTensor; + auto pwFilterTensor = this->weightTensors[1]; + auto dwBiasTensor = biasTensor; + auto pwBiasTensor = this->biasTensors[1]; + CHECK_STATUS( + depthwise_pointwise_convolution(inputTensor, dwFilterTensor, pwFilterTensor, p, + this->dwAlg, dwBiasTensor, pwBiasTensor, this->temp, outputTensor, + this->dwActivationParamSpec, this->pwActivationParamSpec, &this->archInfo)); + break; + } + case Convolution_Dilation: { + CHECK_STATUS(NOT_SUPPORTED); + break; + } + default: { + UNI_ERROR_LOG("[ERROR] unsupported convolution type %d\n", this->p.convolution_type); + } + } + } + + EE infer_forward_algorithm(std::shared_ptr algorithmMap) override + { + OCLContext::getInstance().handle.get()->kernelVec = &this->opKernelVec; + auto inputTensor = this->inputTensors[0]; + auto filterTensor = this->weightTensors[0]; + auto outputTensor = this->outputTensors[0]; + filterTensor.resize(this->filterDesc); + ConvolutionPolicy policy = CONVOLUTION_TUNNING; + DataType targetType = DT_F16; + I32 algo[7]; + switch (this->p.convolution_type) { + case Convolution_Pointwise: { + if (this->dt == DT_F16_8Q) { + targetType = DT_I8; + } + if (algorithmMap->getAlgorithmInfoFromMap(this->name, algo, 4)) { + this->runInfo.algorithm = (ConvolutionForwardAlgorithm)algo[0]; + this->runInfo.best_w[0] = algo[1]; + this->runInfo.best_c[0] = algo[2]; + this->runInfo.best_k[0] = algo[3]; + this->pwAlg = (ConvolutionForwardAlgorithm)algo[0]; + } else { + CHECK_STATUS(convolution_infer_forward_algorithm(inputTensor, filterTensor, + outputTensor, p, policy, &(this->pwAlg), targetType, + this->pwActivationParamSpec, &this->archInfo)); + algo[0] = this->runInfo.algorithm; + algo[1] = this->runInfo.best_w[0]; + algo[2] = this->runInfo.best_c[0]; + algo[3] = this->runInfo.best_k[0]; + this->pwAlg = (ConvolutionForwardAlgorithm)algo[0]; + algorithmMap->setAlgorithmInfoToMap(this->name, algo, 4); + } + break; + } + case Convolution_Depthwise: { + if (algorithmMap->getAlgorithmInfoFromMap(this->name, algo, 4)) { + this->runInfo.algorithm = (ConvolutionForwardAlgorithm)algo[0]; + this->runInfo.best_w[0] = algo[1]; + this->runInfo.best_c[0] = algo[2]; + this->runInfo.best_k[0] = algo[3]; + this->dwAlg = (DepthwiseConvolutionForwardAlgorithm)algo[0]; + } else { + CHECK_STATUS(depthwise_convolution_infer_forward_algorithm(inputTensor, + filterTensor, outputTensor, p, policy, &(this->dwAlg), targetType, + this->dwActivationParamSpec, &this->archInfo)); + 
algo[0] = this->runInfo.algorithm; + algo[1] = this->runInfo.best_w[0]; + algo[2] = this->runInfo.best_c[0]; + algo[3] = this->runInfo.best_k[0]; + this->dwAlg = (DepthwiseConvolutionForwardAlgorithm)algo[0]; + algorithmMap->setAlgorithmInfoToMap(this->name, algo, 4); + } + break; + } + case Convolution_Depthwise_Pointwise: { + if (algorithmMap->getAlgorithmInfoFromMap(this->name, algo, 7)) { + this->runInfo.algorithm = (ConvolutionForwardAlgorithm)algo[0]; + this->runInfo.best_w[0] = algo[1]; + this->runInfo.best_c[0] = algo[2]; + this->runInfo.best_k[0] = algo[3]; + this->runInfo.best_w[1] = algo[4]; + this->runInfo.best_c[1] = algo[5]; + this->runInfo.best_k[1] = algo[6]; + this->dwAlg = (DepthwiseConvolutionForwardAlgorithm)algo[0]; + } else { + auto dwFilterTensor = filterTensor; + auto pwFilterTensor = this->weightTensors[1]; + CHECK_STATUS(depthwise_pointwise_convolution_infer_forward_algorithm( + inputTensor, dwFilterTensor, pwFilterTensor, outputTensor, p, policy, + &(this->dwAlg), targetType, this->dwActivationParamSpec, + this->pwActivationParamSpec, &this->archInfo)); + algo[0] = this->runInfo.algorithm; + algo[1] = this->runInfo.best_w[0]; + algo[2] = this->runInfo.best_c[0]; + algo[3] = this->runInfo.best_k[0]; + algo[4] = this->runInfo.best_w[1]; + algo[5] = this->runInfo.best_c[1]; + algo[6] = this->runInfo.best_k[1]; + this->dwAlg = (DepthwiseConvolutionForwardAlgorithm)algo[0]; + algorithmMap->setAlgorithmInfoToMap(this->name, algo, 7); + } + break; + } + case Convolution_Dilation: { + CHECK_STATUS(NOT_SUPPORTED); + break; + } + default: + CHECK_STATUS(NOT_SUPPORTED); + } + return SUCCESS; + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + auto inputTensor = inTensors[0]; + Tensor filterTensor = Tensor(OCLMem); + TensorDesc inDim = inputTensor->get_desc(); + DataType idt; + DataFormat idf; + U32 in, ic, ih, iw; + CHECK_STATUS(tensor4dGet(inDim, &idt, &idf, &in, &ic, &ih, &iw)); + this->numChannels = ic; + U32 numFiltersOcl = this->p.num_outputs; + GCLMemDesc inputGclDesc = ocl_get_desc(*inputTensor); + if (this->p.num_outputs_origin == 1 && inputGclDesc.byteSize == 0) { + numFiltersOcl = this->p.num_outputs_origin; + } + DataType targetType = DT_F16; // Default DT_F16 + + auto inputMem = (OclMemory *)inputTensor->get_memory(); + GCLMemDesc gclDesc = inputMem->get_desc(); + this->needTransInput = (gclDesc.byteSize == 0) ? 
true : false; + switch (this->p.convolution_type) { + case Convolution_Pointwise: { + this->filterDesc = tensor4df(this->dt, DF_NCHW, numFiltersOcl, this->numChannels, + this->p.kernel_h, this->p.kernel_w); + filterTensor.resize(this->filterDesc); + CHECK_STATUS(convolution_infer_output_size( + inputTensor, filterTensor, p, outTensors[0], targetType, &this->archInfo)); + break; + } + case Convolution_Depthwise: { + this->filterDesc = tensor4df( + this->dt, DF_NCHW, 1, this->numChannels, this->p.kernel_h, this->p.kernel_w); + filterTensor.resize(this->filterDesc); + CHECK_STATUS(depthwise_convolution_infer_output_size( + inputTensor, filterTensor, p, outTensors[0], targetType, &this->archInfo)); + break; + } + case Convolution_Depthwise_Pointwise: { + this->filterDesc = tensor4df( + this->dt, DF_NCHW, 1, this->numChannels, this->p.kernel_h, this->p.kernel_w); + this->filterDescExt = + tensor4df(this->dt, DF_NCHW, this->p.num_outputs, this->numChannels, 1, 1); + filterTensor.resize(this->filterDesc); + Tensor filterTensorExt = Tensor(OCLMem); + filterTensorExt.resize(this->filterDescExt); + CHECK_STATUS(depthwise_pointwise_convolution_infer_output_size(inputTensor, + filterTensor, filterTensorExt, p, outTensors[0], targetType, &this->archInfo)); + break; + } + case Convolution_Dilation: { + return NOT_SUPPORTED; + break; + } + default: + CHECK_STATUS(NOT_SUPPORTED); + } + return SUCCESS; + } + + U32 infer_tmp_memory_size() override + { + auto inputTensor = this->inputTensors[0]; + auto filterTensor = this->weightTensors[0]; + auto outputTensor = this->outputTensors[0]; + + U32 bytes = 0; + switch (this->p.convolution_type) { + case Convolution_Pointwise: { + CHECK_STATUS(convolution_infer_forward_tmp_bytes(inputTensor, filterTensor, + outputTensor, p, this->pwAlg, &bytes, &this->archInfo)); + break; + } + case Convolution_Depthwise: { + CHECK_STATUS(depthwise_convolution_infer_forward_tmp_bytes(inputTensor, + filterTensor, outputTensor, p, this->dwAlg, &bytes, &this->archInfo)); + break; + } + case Convolution_Depthwise_Pointwise: { + CHECK_STATUS(depthwise_pointwise_convolution_infer_forward_tmp_bytes(inputTensor, + filterTensor, this->weightTensors[1], outputTensor, p, this->dwAlg, &bytes, + &this->archInfo)); + break; + } + case Convolution_Dilation: { + CHECK_STATUS(NOT_SUPPORTED); + break; + } + default: + CHECK_STATUS(NOT_SUPPORTED); + } + if (this->needTransInput) { + TensorDesc desc = inputTensor.get_desc(); + U32 size = tensorNumBytes(desc); + if (bytes < size) { + bytes = size; + } + } + return bytes; + } + + GCLMemDesc infer_wtm_memory_size_mali() override + { + auto filterTensor = this->weightTensors[0]; + filterTensor.resize(this->filterDesc); + U32 stride[3] = {0, 0, 0}; + U32 offset[3] = {0, 0, 0}; + GCLMemDesc tmpDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + GCLMemDesc gclmemWtmDesc[2]; + gclmemWtmDesc[0] = tmpDesc; + gclmemWtmDesc[1] = tmpDesc; + U32 bytes = 0; + ((MaliPara_t)(this->archInfo.archPara))->gclmemFilterDesc = gclmemWtmDesc; + bool needTransBiasImgToBuf = false; + U32 biasNum = 0; + switch (this->p.convolution_type) { + case Convolution_Pointwise: { + CHECK_STATUS(convolution_transform_filter_bytes( + filterTensor, this->p, this->pwAlg, &bytes, &this->archInfo)); + U32 best_c = this->runInfo.best_c[0]; + U32 best_k = this->runInfo.best_k[0]; + if (best_c == 4 && best_k == 1) { + needTransBiasImgToBuf = true; + } + break; + } + case Convolution_Depthwise: { + CHECK_STATUS(depthwise_convolution_transform_filter_bytes( + filterTensor, this->p, 
this->dwAlg, &bytes, &this->archInfo)); + break; + } + case Convolution_Depthwise_Pointwise: { + U32 bytesExt = 0; + CHECK_STATUS(depthwise_pointwise_convolution_transform_filter_bytes(filterTensor, + this->weightTensors[1], this->p, this->dwAlg, &bytes, &bytesExt, + &this->archInfo)); + wtm_dp = Tensor(OCLMem); + OclMemory *wtmMem = (OclMemory *)wtm_dp.get_memory(); + wtmMem->padding(gclmemWtmDesc[1]); + wtmMem->alloc(); + if (this->dwAlg == DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_GEMM) { + needTransBiasImgToBuf = true; + biasNum = 1; + } + break; + } + case Convolution_Dilation: { + CHECK_STATUS(NOT_SUPPORTED); + break; + } + default: + CHECK_STATUS(NOT_SUPPORTED); + } + if (needTransBiasImgToBuf) { + Tensor biasTensorBuf = Tensor(OCLMem); + auto biasMemImg = (OclMemory *)(this->biasTensors[biasNum].get_memory()); + auto biasMemBuf = (OclMemory *)(biasTensorBuf.get_memory()); + GCLMemDesc descImg = biasMemImg->get_desc(); + TensorDesc desc = tensor4df(descImg.dt, descImg.df, descImg.dims[3], descImg.dims[2], + descImg.dims[1], descImg.dims[0]); + biasTensorBuf.resize(desc); + GCLMemDesc descBuf = gclmem_build_desc(); + U32 stride[3] = { + (descImg.stride[0] * 4 + 7) / 8 * 8, descImg.stride[1], descImg.stride[2]}; + U32 offset[3] = {0, 0, 0}; + GCLMemType mt = GCL_MEM_BUF; + MemFlags flags = CL_MEM_READ_WRITE; + CHECK_STATUS( + gclmem_set_desc_padding(&descBuf, stride, offset, desc.dt, DF_NCHW, mt, flags)); + biasMemBuf->padding(descBuf); + biasMemBuf->alloc(); + void *bufPtr = biasMemBuf->get_ptr(); + CHECK_STATUS( + gcl_fill_memory_zero(OCLContext::getInstance().handle.get(), (GCLMem_t)bufPtr)); + biasMemBuf->copy_from((Memory *)biasMemImg); + this->biasTensors[biasNum] = biasTensorBuf; + } + return gclmemWtmDesc[0]; + } + + EE transform_filter() override + { + auto filterTensor = this->weightTensors[0]; + filterTensor.resize(this->filterDesc); + + if (DT_F16_8Q == this->dt && Convolution_Pointwise == this->p.convolution_type && + CONVOLUTION_ALGORITHM_WINOGRAD == this->pwAlg) { // int8 winograd + return NOT_SUPPORTED; + } else if (DT_F16_8Q == this->dt && + Convolution_Pointwise == this->p.convolution_type) { // int8 tilegemm + return NOT_SUPPORTED; + } else { // All other cases + auto wtmDesc = this->infer_wtm_memory_size_mali(); + this->wtm = std::shared_ptr(new Tensor(OCLMem)); + OclMemory *wtmMem = (OclMemory *)this->wtm->get_memory(); + wtmMem->padding(wtmDesc); + wtmMem->alloc(); + + switch (this->p.convolution_type) { + case Convolution_Pointwise: { + CHECK_STATUS(convolution_transform_filter(filterTensor, this->p, this->pwAlg, + this->temp, this->wtm.get(), &this->archInfo)); + break; + } + case Convolution_Depthwise: { + CHECK_STATUS(depthwise_convolution_transform_filter( + filterTensor, this->p, this->dwAlg, this->wtm.get(), &this->archInfo)); + break; + } + case Convolution_Depthwise_Pointwise: { + CHECK_STATUS(depthwise_pointwise_convolution_transform_filter(filterTensor, + this->weightTensors[1], this->p, this->dwAlg, this->wtm.get(), + &this->wtm_dp, &this->archInfo)); + this->weightTensors[1] = wtm_dp; + break; + } + case Convolution_Dilation: { + CHECK_STATUS(NOT_SUPPORTED); + break; + } + default: + CHECK_STATUS(NOT_SUPPORTED); + } + } + this->weightTensors[0] = *this->get_wtm(); + return SUCCESS; + } + + REGISTER_OCL_OPERATOR_RUN + +private: + Tensor wtm_dp; + TensorDesc filterDesc; + TensorDesc filterDescExt; + bool needTransInput; + +protected: + ForwardRunInfoMali runInfo; +}; + +#endif // _CONVELTWISEPOOLING_H diff --git 
a/inference/engine/include/ocl/copy_ocl.hpp b/inference/engine/include/ocl/copy_ocl.hpp new file mode 100644 index 00000000..64b4b54d --- /dev/null +++ b/inference/engine/include/ocl/copy_ocl.hpp @@ -0,0 +1,72 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _COPY_OCL_H +#define _COPY_OCL_H + +#include "copy.hpp" + +class CopyOCL : public Copy { +public: + CopyOCL(DataType dt, CopyParamSpec p) : Copy(dt, p) + { + setMALIArchInfo( + &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS); + } + + ~CopyOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = std::shared_ptr(new CopyOCL(this->dt, this->p)); + *mem = *this; + return mem; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + TensorDesc srcDesc = this->inputTensors[0].get_desc(); + TensorDesc dstDesc = this->inputTensors[1].get_desc(); + U32 batch = srcDesc.dims[srcDesc.nDims - 1]; + if (batch > 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + U32 copyLength = (this->p.length >= 0) ? this->p.length : tensorNumElements(srcDesc) / batch; + U32 srcStride = (this->p.src_dims[0] >= 0) ? this->p.src_dims[1] + : tensorNumElements(srcDesc) / batch; + U32 dstStride = (this->p.dst_dims[0] >= 0) ? this->p.dst_dims[1] + : tensorNumElements(dstDesc) / batch; + U32 srcIndex = this->p.src_dims[2]; + U32 dstIndex = this->p.dst_dims[2]; + CHECK_STATUS(copy(this->inputTensors, srcIndex, dstIndex, srcStride, dstStride, copyLength, + &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + CHECK_STATUS(copy_infer_output_size(inTensors, &this->archInfo)); + auto desc = outTensors[0]->get_desc(); + desc.dt = this->dt; + desc.df = getTensorDefaultDataFormat(0); + desc.nDims = 0; + outTensors[0]->resize(desc); + return SUCCESS; + } + + REGISTER_OCL_OPERATOR_RUN +}; + +#endif // _COPY_OCL_H diff --git a/inference/engine/include/ocl/deconvolution_ocl.hpp b/inference/engine/include/ocl/deconvolution_ocl.hpp new file mode 100644 index 00000000..03bcde25 --- /dev/null +++ b/inference/engine/include/ocl/deconvolution_ocl.hpp @@ -0,0 +1,195 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _DECONVOLUTION_OCL_H +#define _DECONVOLUTION_OCL_H + +#include "deconvolution.hpp" + +class DeconvolutionOCL : public Deconvolution { +public: + DeconvolutionOCL(DataType dt, ConvolutionParamSpec p, ActivationParamSpec activationDesc) + : Deconvolution(dt, p, activationDesc) + { + setMALIArchInfo(&(this->archInfo), &(this->runInfo), &this->needSetKernelVec, + &this->needSelectKernelLS); + } + + ~DeconvolutionOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = std::shared_ptr( + new DeconvolutionOCL(this->dt, this->p, this->activationDesc)); + *mem = *this; + return mem; + } + + EE infer_weight_desc() override + { + auto curOpWs = this->get_weightspec(); + DataType dt = curOpWs.mdt; // weight data type may not be the same as input and output + if (curOpWs.weight == nullptr) { + dt = this->dt; + } + DataType dtNoQ = (this->dt == DT_F16_8Q) ? 
DT_F16 : this->dt; + DataFormat df = DF_NCHW; + U32 fh, fw, fc, fn; + fn = this->numInputs; + fc = this->p.num_outputs; + fh = this->p.kernel_h; + fw = this->p.kernel_w; + U32 vectorLen = fn; + ((MaliPara_t)(this->archInfo.archPara))->forwardRunInfo->algorithm = + CONVOLUTION_ALGORITHM_NULL; + TensorDesc filterTensorDesc = tensor4df(dtNoQ, df, fn, fc, fh, fw); + TensorDesc vectorTensorDesc = tensor1d(dtNoQ, vectorLen); + + Tensor modelWeightTensor = Tensor(OCLMem); + Tensor modelVectorTensor = Tensor(OCLMem); + auto weightMem = (OclMemory *)modelWeightTensor.get_memory(); + auto vectorMem = (OclMemory *)modelVectorTensor.get_memory(); + modelWeightTensor.resize(filterTensorDesc); + modelVectorTensor.resize(vectorTensorDesc); + U32 stride[3] = {fw * fh, fc, fn}; + U32 offset[3] = {0, 0, 0}; + GCLMemType mt = GCL_MEM_BUF; + MemFlags flags = CL_MEM_READ_WRITE; + GCLMemDesc desc = gclmem_build_desc(); + CHECK_STATUS(gclmem_set_desc_padding(&desc, stride, offset, dtNoQ, df, mt, flags)); + weightMem->padding(desc); + + mt = GCL_MEM_IMG_1D; + stride[0] = (vectorLen + 3) / 4; + stride[1] = 1; + stride[2] = 1; + gclmem_set_desc_padding(&desc, stride, offset, dtNoQ, DF_NHWC, mt, flags); + vectorMem->padding(desc); + this->weightTensors.push_back(modelWeightTensor); + this->biasTensors.push_back(modelVectorTensor); + return SUCCESS; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + Tensor inputTensor = this->inputTensors[0]; + Tensor filterTensor = this->weightTensors[0]; + U8 *scalePtr = nullptr; + Tensor biasTensor = this->biasTensors[0]; + Tensor outputTensor = this->outputTensors[0]; + CHECK_STATUS(deconvolution(inputTensor, filterTensor, p, this->alg, scalePtr, biasTensor, + this->temp, outputTensor, this->activationDesc, &this->archInfo)); + } + + EE infer_forward_algorithm(std::shared_ptr algorithmMap) override + { + OCLContext::getInstance().handle.get()->kernelVec = &this->opKernelVec; + ConvolutionPolicy policy = CONVOLUTION_TUNNING; + ((MaliPara_t)(this->archInfo.archPara))->forwardRunInfo->algorithm = + CONVOLUTION_ALGORITHM_NULL; + DataType targetType = DT_F16; + + I32 algo[4]; + if (algorithmMap->getAlgorithmInfoFromMap(this->name, algo, 4)) { + this->runInfo.algorithm = (ConvolutionForwardAlgorithm)algo[0]; + this->runInfo.best_w[0] = algo[1]; + this->runInfo.best_c[0] = algo[2]; + this->runInfo.best_k[0] = algo[3]; + this->alg = (ConvolutionForwardAlgorithm)algo[0]; + } else { + CHECK_STATUS(deconvolution_infer_forward_algorithm(this->inputTensors[0], + this->weightTensors[0], this->outputTensors[0], p, policy, &(this->alg), targetType, + this->activationDesc, &this->archInfo)); + algo[0] = this->runInfo.algorithm; + algo[1] = this->runInfo.best_w[0]; + algo[2] = this->runInfo.best_c[0]; + algo[3] = this->runInfo.best_k[0]; + this->alg = (ConvolutionForwardAlgorithm)algo[0]; + algorithmMap->setAlgorithmInfoToMap(this->name, algo, 4); + } + return SUCCESS; + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + auto inputTensor = inTensors[0]; + TensorDesc inDim = inputTensor->get_desc(); + DataType idt; + DataFormat idf; + U32 in, ic, ih, iw; + CHECK_STATUS(tensor4dGet(inDim, &idt, &idf, &in, &ic, &ih, &iw)); + this->numInputs = ic; + + TensorDesc filterDim = tensor4df(this->dt, DF_NCHW, this->numInputs, this->p.num_outputs, + this->p.kernel_h, this->p.kernel_w); + Tensor filterTensor = Tensor(OCLMem); + filterTensor.resize(filterDim); + this->p 
= createConvolutionParamSpec(this->p.group, this->p.kernel_h, this->p.kernel_w, + this->p.stride_h, this->p.stride_w, this->p.padding_top, this->p.padding_bottom, + this->p.padding_left, this->p.padding_right, this->p.dilatedRate_h, + this->p.dilatedRate_w, this->p.num_outputs, this->p.convolution_type); + + DataType targetType = this->dt; + CHECK_STATUS(deconvolution_infer_output_size( + inputTensor, filterTensor, p, outTensors[0], targetType, &this->archInfo)); + return SUCCESS; + } + + U32 infer_tmp_memory_size() override + { + Tensor inputTensor = this->inputTensors[0]; + Tensor filterTensor = this->weightTensors[0]; + Tensor outputTensor = this->outputTensors[0]; + + U32 bytes = 0; + CHECK_STATUS(deconvolution_infer_forward_tmp_bytes( + inputTensor, filterTensor, outputTensor, p, this->alg, &bytes, &this->archInfo)); + return bytes; + } + + GCLMemDesc infer_wtm_memory_size_mali() override + { + U32 stride[3] = {0, 0, 0}; + U32 offset[3] = {0, 0, 0}; + GCLMemDesc gclmemWtmDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + U32 bytes = 0; + ((MaliPara_t)(this->archInfo.archPara))->gclmemFilterDesc = &gclmemWtmDesc; + CHECK_STATUS(deconvolution_transform_filter_bytes( + this->weightTensors[0], this->p, this->alg, &bytes, &this->archInfo)); + return gclmemWtmDesc; + } + + EE transform_filter() override + { + Tensor filterTensor = this->weightTensors[0]; + auto wtmDesc = this->infer_wtm_memory_size_mali(); + Tensor wtm(OCLMem); + OclMemory *wtmMem = (OclMemory *)wtm.get_memory(); + wtmMem->padding(wtmDesc); + wtmMem->alloc(); + CHECK_STATUS(deconvolution_transform_filter( + filterTensor, this->p, this->alg, this->temp, &wtm, &this->archInfo)); + this->weightTensors[0] = wtm; + return SUCCESS; + } + + REGISTER_OCL_OPERATOR_RUN + +protected: + ForwardRunInfoMali runInfo; +}; + +#endif // _DECONVOLUTION_OCL_H diff --git a/inference/engine/include/ocl/depth2space_ocl.hpp b/inference/engine/include/ocl/depth2space_ocl.hpp new file mode 100644 index 00000000..d8f09678 --- /dev/null +++ b/inference/engine/include/ocl/depth2space_ocl.hpp @@ -0,0 +1,67 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
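+
+// Depth2SpaceOCL moves depth into spatial blocks: with block size r, an NCHW
+// tensor (n, c*r*r, h, w) becomes (n, c, h*r, w*r); e.g. r = 2 maps
+// {1, 16, 4, 4} -> {1, 4, 8, 8}. (Shape example is illustrative.)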
+
+#ifndef _DEPTH2SPACE_OCL_H
+#define _DEPTH2SPACE_OCL_H
+
+#include "depth2space.hpp"
+
+class Depth2SpaceOCL : public Depth2Space {
+public:
+    Depth2SpaceOCL(DataType dt, Depth2SpaceParamSpec p) : Depth2Space(dt, p)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~Depth2SpaceOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Depth2SpaceOCL> mem =
+            std::shared_ptr<Depth2SpaceOCL>(new Depth2SpaceOCL(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        CHECK_STATUS(depth2space(inputTensor, this->p, this->temp, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        CHECK_STATUS(
+            depth2space_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        U32 bytes = 0;
+        CHECK_STATUS(depth2space_infer_forward_tmp_bytes(
+            inputTensor, this->p, outputTensor, &bytes, &this->archInfo));
+        return bytes;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+};
+
+#endif  // _DEPTH2SPACE_OCL_H
diff --git a/inference/engine/include/ocl/eltwise_ocl.hpp b/inference/engine/include/ocl/eltwise_ocl.hpp
new file mode 100644
index 00000000..6a1df477
--- /dev/null
+++ b/inference/engine/include/ocl/eltwise_ocl.hpp
@@ -0,0 +1,54 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
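+
+// EltwiseOCL applies one n-ary element-wise op (e.g. sum, prod, max) across
+// all of its inputTensors, broadcasting where shapes allow; the mode travels
+// in the EltwiseParamSpec passed to the constructor below.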
+
+#ifndef _ELTWISE_OCL_H
+#define _ELTWISE_OCL_H
+
+#include "eltwise.hpp"
+
+class EltwiseOCL : public Eltwise {
+public:
+    EltwiseOCL(EltwiseParamSpec eltwiseDesc) : Eltwise(eltwiseDesc)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~EltwiseOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<EltwiseOCL> mem =
+            std::shared_ptr<EltwiseOCL>(new EltwiseOCL(this->eltwiseDesc));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        CHECK_STATUS(eltwise(this->inputTensors, this->eltwiseDesc, this->temp,
+            this->outputTensors[0], &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        CHECK_STATUS(eltwise_infer_output_size(inTensors, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+};
+#endif  // _ELTWISE_OCL_H
diff --git a/inference/engine/include/ocl/embedding_ocl.hpp b/inference/engine/include/ocl/embedding_ocl.hpp
new file mode 100644
index 00000000..ec52ecb6
--- /dev/null
+++ b/inference/engine/include/ocl/embedding_ocl.hpp
@@ -0,0 +1,104 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
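+
+// EmbeddingOCL is a lookup table: integer indices select rows of a
+// (p.input_dim x p.num_output) weight matrix, so an index tensor of shape
+// {batch, steps} yields {batch, steps, p.num_output}. When p.transpose is
+// set, the weight is stored transposed, as handled by
+// init_weight_bias_from_model() below. (Shape sketch is illustrative.)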
+ +#ifndef _EMBEDDING_OCL_H +#define _EMBEDDING_OCL_H + +#include "embedding.hpp" + +class EmbeddingOCL : public Embedding { +public: + EmbeddingOCL(DataType dt, EmbedParamSpec p) : Embedding(dt, p) + { + setMALIArchInfo( + &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS); + } + + ~EmbeddingOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new EmbeddingOCL(this->dt, this->p)); + *mem = *this; + return mem; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + Tensor inputTensor = this->inputTensors[0]; + Tensor weightTensor; + if (this->weightTensors.size() > 0) { + weightTensor = this->weightTensors[0]; + } else { + weightTensor = this->inputTensors[1]; + } + Tensor outputTensor = this->outputTensors[0]; + CHECK_STATUS(embedding(inputTensor, weightTensor, this->p, outputTensor, &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + CHECK_STATUS(embedding_infer_output_size( + inTensors[0], this->p, this->dt, outTensors[0], &this->archInfo)); + return SUCCESS; + } + + EE init_weight_bias_from_model(std::shared_ptr *modelPtr) override + { + auto curOpWs = this->get_weightspec(); + if (modelPtr == nullptr && curOpWs.weight == nullptr) { + return SUCCESS; + } + TensorDesc weightDesc; + if (this->p.transpose) { + weightDesc = tensor2df(this->dt, DF_TRANSPOSE, this->p.num_output, this->p.input_dim); + } else { + weightDesc = tensor2df(this->dt, DF_NORMAL, this->p.input_dim, this->p.num_output); + } + Tensor modelWeightTensor = Tensor(OCLMem); + auto weightMem = (OclMemory *)modelWeightTensor.get_memory(); + modelWeightTensor.resize(weightDesc); + U32 stride[3] = {weightDesc.dims[0], weightDesc.dims[1], 1}; + U32 offset[3] = {0, 0, 0}; + GCLMemType mt = GCL_MEM_BUF; + MemFlags flags = CL_MEM_READ_WRITE; + GCLMemDesc desc = gclmem_build_desc(); + CHECK_STATUS(gclmem_set_desc_padding(&desc, stride, offset, this->dt, DF_NCHW, mt, flags)); + weightMem->padding(desc); + + CpuMemory weight_mem_src; + std::shared_ptr weight_ptr; + if (modelPtr) { + weight_ptr = *modelPtr; + } else { + weight_ptr = std::shared_ptr(curOpWs.weight); + } + weight_mem_src.resize(weightDesc); + weight_mem_src.set_shared_ptr(std::shared_ptr(weight_ptr)); + weightMem->copy_from((Memory *)&weight_mem_src); + this->weightTensors.push_back(modelWeightTensor); + if (modelPtr) { + *modelPtr = + std::shared_ptr(*modelPtr, (*modelPtr).get() + tensorNumBytes(weightDesc)); + } + return SUCCESS; + } + + REGISTER_OCL_OPERATOR_RUN +}; + +#endif // _EMBEDDING_OCL_H diff --git a/inference/engine/include/ocl/factory_ocl.hpp b/inference/engine/include/ocl/factory_ocl.hpp new file mode 100644 index 00000000..8ebb4ad5 --- /dev/null +++ b/inference/engine/include/ocl/factory_ocl.hpp @@ -0,0 +1,357 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _FACTORY_OCL_H +#define _FACTORY_OCL_H +#include "factory.hpp" +#include "ocl/resize_ocl.hpp" +#include "ocl/channel_resize_ocl.hpp" +#include "ocl/deconvolution_ocl.hpp" +#include "ocl/bilateral_slice_apply_ocl.hpp" +#include "ocl/pooling_ocl.hpp" +#include "ocl/convolution_ocl.hpp" +#include "ocl/eltwise_ocl.hpp" +#include "ocl/softmax_ocl.hpp" +#include "ocl/activation_ocl.hpp" +#include "ocl/fully_connected_ocl.hpp" +#include "ocl/scale_ocl.hpp" +#include "ocl/concat_ocl.hpp" +#include "ocl/clip_ocl.hpp" +#include "ocl/squeeze_ocl.hpp" +#include "ocl/reshape_ocl.hpp" +#include "ocl/space2depth_ocl.hpp" +#include "ocl/depth2space_ocl.hpp" +#include "ocl/embedding_ocl.hpp" +#include "ocl/layer_norm_ocl.hpp" +#include "ocl/matmul_ocl.hpp" +#include "ocl/power_ocl.hpp" +#include "ocl/transpose_ocl.hpp" +#include "ocl/slice_ocl.hpp" +#include "ocl/shared_weight_ocl.hpp" +#include "ocl/repeat_ocl.hpp" +#include "ocl/copy_ocl.hpp" +#include "ocl/check_ocl.hpp" +#include "ocl/preallocated_memory_ocl.hpp" +#include "ocl/argmax_ocl.hpp" +#include "ocl/unsqueeze_ocl.hpp" +#include "ocl/rnn_ocl.hpp" +#include "ocl/rnncell_ocl.hpp" +#include "ocl/padding_ocl.hpp" +#include "ocl/prelu_ocl.hpp" + +class FactoryOCL : public Factory { +public: + std::shared_ptr createConvolution(DataType dt, + ConvolutionParamSpec p, + ActivationParamSpec dwActivationParamSpec, + ActivationParamSpec pwActivationParamSpec) override + { + auto cep = + (Convolution *)(new ConvolutionOCL(dt, p, dwActivationParamSpec, pwActivationParamSpec)); + return std::shared_ptr(cep); + } + + std::shared_ptr createDeconvolution( + DataType dt, ConvolutionParamSpec p, ActivationParamSpec activationDesc) override + { + auto cep = new DeconvolutionOCL(dt, p, activationDesc); + return std::shared_ptr(cep); + } + + std::shared_ptr createPooling(PoolingParamSpec p) override + { + auto cep = (Pooling *)(new PoolingOCL(p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createFullyConnected( + DataType dt, FullyConnectedParamSpec p, U32 numInput) override + { + auto cep = (FullyConnectedOCL *)(new FullyConnectedOCL(dt, p, numInput)); + return std::shared_ptr(cep); + } + + std::shared_ptr createSoftmax(DataType dt, SoftmaxParamSpec p) override + { + auto cep = new SoftmaxOCL(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createConcat(ConcatParamSpec p) override + { + auto cep = (Concat *)(new ConcatOCL(p)); + return std::shared_ptr(cep); + } + + std::shared_ptr 
createActivation(ActivationParamSpec activeDesc) override + { + auto cep = (Activation *)new ActivationOCL(activeDesc); + return std::shared_ptr(cep); + } + + std::shared_ptr createEltwise(EltwiseParamSpec eltwiseDesc) override + { + auto cep = (Eltwise *)new EltwiseOCL(eltwiseDesc); + return std::shared_ptr(cep); + } + + std::shared_ptr createScale(DataType dt, ScaleParamSpec p, int numChannels) override + { + auto cep = (Scale *)(new ScaleOCL(dt, p, numChannels)); + return std::shared_ptr(cep); + } + + std::shared_ptr createPReLU(DataType dt) override + { + auto cep = (PReLU *)(new PReLUOCL(dt)); + return std::shared_ptr(cep); + } + + std::shared_ptr createRNN(DataType dt, RNNParamSpec p) override + { + auto cep = (RNNCell *)(new RNNOCL(dt, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createRNNCell(DataType dt, RNNParamSpec p) override + { + auto cep = (RNNCell *)(new RNNCellOCL(dt, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createEmbedding(DataType dt, EmbedParamSpec p) override + { + auto cep = (Embedding *)new EmbeddingOCL(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createPower(DataType dt, PowerParamSpec p) override + { + auto cep = (Power *)new PowerOCL(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createMatMul(DataType dt, MatMulParamSpec p) override + { + auto cep = (MatMul *)(new MatMulOCL(dt, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createLayerNorm(DataType dt, U32 weightNum) override + { + auto cep = (LayerNorm *)new LayerNormOCL(dt, weightNum); + return std::shared_ptr(cep); + } + + std::shared_ptr createReshape(DataType dt, ReshapeParamSpec p) override + { + auto cep = (Reshape *)(new ReshapeOCL(dt, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createResize(DataType paramDT, ResizeParamSpec p) override + { + // auto cep = new Resize(paramDT, paramPtr); + // OP_UNSUP(2, paramDT, paramPtr); + auto cep = (Resize *)(new ResizeOCL(paramDT, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createSlice(DataType dt, SliceParamSpec p) override + { + auto cep = (Slice *)(new SliceOCL(dt, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createTranspose(DataType dt, TransposeParamSpec p) override + { + auto cep = (Transpose *)(new TransposeOCL(dt, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createAttention(DataType dt, AttentionParamSpec p) override + { + // auto cep = new AttentionOCL(dt, numHeads, fromSequenceLength, toSequenceLength); + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createClip(DataType dt, ClipParamSpec p) override + { + auto cep = (Clip *)(new ClipOCL(dt, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createSqueeze(DataType dt, SqueezeParamSpec p) override + { + auto cep = (Squeeze *)(new SqueezeOCL(dt, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createUnsqueeze(DataType dt, UnsqueezeParamSpec p) override + { + auto cep = (Unsqueeze *)new UnsqueezeOCL(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createReduction(DataType dt, ReductionParamSpec p) override + { + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createArgMax(DataType dt, ArgMaxParamSpec p) override + { + auto cep = (ArgMax *)new ArgMaxOCL(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createCopy(DataType dt, CopyParamSpec p) override + { + auto cep = (Copy *)new CopyOCL(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createCheck(DataType 
dt, CheckParamSpec p) override + { + auto cep = (Check *)new CheckOCL(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createRepeat( + DataType dt, RepeatParamSpec p, I32 jumpOperatorIndex, I32 currentOperatorIndex) override + { + auto cep = (Repeat *)new RepeatOCL(dt, p, jumpOperatorIndex, currentOperatorIndex); + return std::shared_ptr(cep); + } + + std::shared_ptr createBilateralSliceApply(BilateralSliceApplyParamSpec p) override + { + auto cep = (BilateralSliceApply *)(new BilateralSliceApplyOCL(p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createPreAllocatedMemory(DataType dt, TensorDesc desc) override + { + auto cep = (PreAllocatedMemory *)new PreAllocatedMemoryOCL(dt, desc); + return std::shared_ptr(cep); + } + + std::shared_ptr createSharedWeight(DataType dt, + TensorDesc desc, + std::string outputTensorName, + std::map> *tensorMapPtr) override + { + auto cep = (SharedWeight *)new SharedWeightOCL(dt, desc, outputTensorName, tensorMapPtr); + return std::shared_ptr(cep); + } + + std::shared_ptr createJump( + DataType dt, I32 jumpOperatorIndex, I32 currentOperatorIndex) override + { + OP_UNSUP(3, dt, jumpOperatorIndex, currentOperatorIndex); + return std::shared_ptr(cep); + } + + std::shared_ptr createSpace2Depth(DataType dt) override + { + auto cep = (Space2Depth *)(new Space2DepthOCL(dt)); + return std::shared_ptr(cep); + } + + std::shared_ptr createDepth2Space(DataType dt, Depth2SpaceParamSpec p) override + { + auto cep = (Depth2Space *)(new Depth2SpaceOCL(dt, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createAttentionMask(DataType dt, AttentionMaskParamSpec p) override + { + OP_UNSUP(2, dt, p) + return std::shared_ptr(cep); + } + + std::shared_ptr createRelativePositionEmbedding(DataType dt, EmbedParamSpec p) override + { + OP_UNSUP(2, dt, p) + return std::shared_ptr(cep); + } + + std::shared_ptr createRelativeShift(DataType dt, RelativeShiftParamSpec p) override + { + OP_UNSUP(2, dt, p) + return std::shared_ptr(cep); + } + + std::shared_ptr createPadding(DataType dt, PadParamSpec p) override + { + auto cep = (Padding *)(new PaddingOCL(dt, p)); + return std::shared_ptr(cep); + } + + std::shared_ptr createPriorBox(DataType dt, PriorBoxParamSpec p) override + { + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createDetectionOutput(DataType dt, DetectionOutputParamSpec p) override + { + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createYolov3DetectionOutput( + DataType dt, Yolov3DetectionOutputParamSpec p) override + { + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createChannelResize(DataType dt, ChannelResizeParamSpec p) override + { + auto cep = new ChannelResizeOCL(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createL2Normalization(DataType dt) override + { + OP_UNSUP(1, dt); + return std::shared_ptr(cep); + } + + std::shared_ptr createTile(DataType dt, TileParamSpec p) override + { + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createTfSlice(DataType dt, TfSliceParamSpec p) override + { + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createSplice(DataType dt, SpliceParamSpec p) override + { + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createShape() override + { + OP_UNSUP(0); + return std::shared_ptr(cep); + } +}; +#endif // _FACTORY_OCL_H diff --git a/inference/engine/include/ocl/fully_connected_ocl.hpp 
b/inference/engine/include/ocl/fully_connected_ocl.hpp new file mode 100644 index 00000000..ffcfb29b --- /dev/null +++ b/inference/engine/include/ocl/fully_connected_ocl.hpp @@ -0,0 +1,207 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _FULLY_CONNECTED_OCL_H +#define _FULLY_CONNECTED_OCL_H + +#include "fully_connected.hpp" + +class FullyConnectedOCL : public FullyConnected { +public: + FullyConnectedOCL(DataType dt, FullyConnectedParamSpec p, U32 numInput) + : FullyConnected(dt, p, numInput) + { + setMALIArchInfo(&(this->archInfo), &(this->runInfo), &this->needSetKernelVec, + &this->needSelectKernelLS); + } + + ~FullyConnectedOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = std::shared_ptr( + new FullyConnectedOCL(this->dt, this->p, this->numInput)); + *mem = *this; + return mem; + } + + EE infer_weight_desc() override + { + TensorDesc weightDesc = tensor2df(this->dt, DF_NORMAL, this->p.num_outputs, this->numInput); + TensorDesc biasDesc = tensor1d(this->dt, this->p.num_outputs); + + Tensor modelWeightTensor = Tensor(OCLMem); + Tensor modelVectorTensor = Tensor(OCLMem); + auto weightMem = (OclMemory *)modelWeightTensor.get_memory(); + auto vectorMem = (OclMemory *)modelVectorTensor.get_memory(); + modelWeightTensor.resize(weightDesc); + modelVectorTensor.resize(biasDesc); + + U32 stride[3] = {this->p.num_outputs, this->numInput, 1}; + U32 offset[3] = {0, 0, 0}; + GCLMemType mt = GCL_MEM_BUF; + MemFlags flags = CL_MEM_READ_WRITE; + GCLMemDesc desc = gclmem_build_desc(); + CHECK_STATUS(gclmem_set_desc_padding(&desc, stride, offset, this->dt, DF_NCHW, mt, flags)); + weightMem->padding(desc); + + stride[0] = (this->p.num_outputs + 3) / 4 * 4; + stride[1] = 1; + stride[2] = 1; + gclmem_set_desc_padding(&desc, stride, offset, this->dt, DF_NHWC, mt, flags); + vectorMem->padding(desc); + this->weightTensors.push_back(modelWeightTensor); + this->biasTensors.push_back(modelVectorTensor); + return SUCCESS; + } + + EE infer_forward_algorithm(std::shared_ptr algorithmMap) override + { + OCLContext::getInstance().handle.get()->kernelVec = &this->opKernelVec; + Tensor inputTensor = this->inputTensors[0]; + Tensor filterTensor = Tensor(OCLMem); + Tensor outputTensor = this->outputTensors[0]; + filterTensor.resize(filterDesc4D); + ((MaliPara_t)(this->archInfo.archPara))->forwardRunInfo->algorithm = + CONVOLUTION_ALGORITHM_NULL; + I32 algo[4]; + if 
(algorithmMap->getAlgorithmInfoFromMap(this->name, algo, 4)) { + this->runInfo.algorithm = (ConvolutionForwardAlgorithm)algo[0]; + this->runInfo.best_w[0] = algo[1]; + this->runInfo.best_c[0] = algo[2]; + this->runInfo.best_k[0] = algo[3]; + } else { + CHECK_STATUS(fully_connected_infer_forward_algorithm( + inputTensor, filterTensor, outputTensor, &this->archInfo)); + algo[0] = this->runInfo.algorithm; + algo[1] = this->runInfo.best_w[0]; + algo[2] = this->runInfo.best_c[0]; + algo[3] = this->runInfo.best_k[0]; + algorithmMap->setAlgorithmInfoToMap(this->name, algo, 4); + } + return SUCCESS; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + Tensor inputTensor = this->inputTensors[0]; + Tensor weightTensor = this->weightTensors[0]; + Tensor outputTensor = this->outputTensors[0]; + Tensor biasTensor = this->biasTensors[0]; + + CHECK_STATUS(fully_connected( + inputTensor, weightTensor, biasTensor, this->temp, outputTensor, &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + TensorDesc inputDesc = inTensors[0]->get_desc(); + U32 ic, ih, iw; + if (inputDesc.df == DF_NCHW) { + tensorSelectGet(inputDesc, NULL, NULL, NULL, &ic, &ih, &iw); + } + if (inputDesc.df == DF_MKT) { + iw = 1; + ih = 1; + ic = inputDesc.dims[1]; + } + filterDesc4D = tensor4df(this->dt, DF_NCHW, this->p.num_outputs, ic, ih, iw); + this->numInput = ic * ih * iw; + Tensor filterTensor = Tensor(OCLMem); + filterTensor.resize(filterDesc4D); + CHECK_STATUS(fully_connected_infer_output_size( + inTensors[0], filterTensor, outTensors[0], &this->archInfo)); + if (this->p.num_slices > 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + return SUCCESS; + } + + U32 infer_tmp_memory_size() override + { + Tensor inputTensor = this->inputTensors[0]; + Tensor filterTensor = Tensor(OCLMem); + filterTensor.resize(filterDesc4D); + U32 bytes = 0; + CHECK_STATUS(fully_connected_infer_forward_tmp_bytes( + inputTensor, filterTensor, &bytes, &this->archInfo)); + return bytes; + } + + GCLMemDesc infer_wtm_memory_size_mali() override + { + U32 stride[3] = {0, 0, 0}; + U32 offset[3] = {0, 0, 0}; + GCLMemDesc gclmemWtmDesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + U32 bytes = 0; + ((MaliPara_t)(this->archInfo.archPara))->gclmemFilterDesc = &gclmemWtmDesc; + Tensor filterTensor = Tensor(OCLMem); + filterTensor.resize(filterDesc4D); + CHECK_STATUS(fully_connected_transform_filter_bytes(filterTensor, &bytes, &this->archInfo)); + return gclmemWtmDesc; + } + + EE transform_filter() override + { + Tensor inputTensor = this->inputTensors[0]; + Tensor filterTensor = this->weightTensors[0]; + filterTensor.resize(this->filterDesc4D); + auto wtmDesc = this->infer_wtm_memory_size_mali(); + if (this->p.num_slices == 1) { + this->wtm = std::shared_ptr(new Tensor(OCLMem)); + OclMemory *wtmMem = (OclMemory *)this->wtm->get_memory(); + wtmMem->padding(wtmDesc); + wtmMem->alloc(); + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + CHECK_STATUS(fully_connected_transform_filter( + inputTensor, filterTensor, this->wtm.get(), &this->archInfo)); + this->weightTensors[0] = *this->get_wtm(); + auto inputDesc = this->inputTensors[0].get_desc(); + if (inputDesc.df == DF_MKT) { + Tensor biasTensorImg = Tensor(OCLMem); + auto biasMemBuf = (OclMemory *)(biasTensors[0].get_memory()); + auto biasMemImg = (OclMemory *)(biasTensorImg.get_memory()); + GCLMemDesc descBuf = biasMemBuf->get_desc(); + TensorDesc desc = 
tensor4df(descBuf.dt, descBuf.df, descBuf.dims[3], descBuf.dims[2],
+                descBuf.dims[1], descBuf.dims[0]);
+            biasTensorImg.resize(desc);
+            GCLMemDesc descImg = gclmem_build_desc();
+            U32 stride[3] = {(descBuf.stride[0] + 3) / 4, descBuf.stride[1], descBuf.stride[2]};
+            U32 offset[3] = {0, 0, 0};
+            GCLMemType mt = GCL_MEM_IMG_1D;
+            MemFlags flags = CL_MEM_READ_WRITE;
+            CHECK_STATUS(
+                gclmem_set_desc_padding(&descImg, stride, offset, desc.dt, DF_NCHW, mt, flags));
+            biasMemImg->padding(descImg);
+            biasMemImg->alloc();
+            biasMemImg->copy_from((Memory *)biasMemBuf);
+            biasTensors[0] = biasTensorImg;
+        }
+        return SUCCESS;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+
+private:
+    TensorDesc filterDesc4D;
+
+protected:
+    ForwardRunInfoMali runInfo;
+};
+
+#endif // _FULLY_CONNECTED_OCL_H
diff --git a/inference/engine/include/ocl/layer_norm_ocl.hpp b/inference/engine/include/ocl/layer_norm_ocl.hpp
new file mode 100644
index 00000000..b6097018
--- /dev/null
+++ b/inference/engine/include/ocl/layer_norm_ocl.hpp
@@ -0,0 +1,87 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _LAYER_NORM_OCL_H
+#define _LAYER_NORM_OCL_H
+
+#include "layer_norm.hpp"
+
+class LayerNormOCL : public LayerNorm {
+public:
+    LayerNormOCL(DataType dt, U32 weightNum) : LayerNorm(dt, weightNum)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~LayerNormOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<LayerNormOCL> mem =
+            std::shared_ptr<LayerNormOCL>(new LayerNormOCL(this->dt, this->weightNum));
+        *mem = *this;
+        return mem;
+    }
+
+    EE infer_weight_desc() override
+    {
+        auto curOpWs = this->get_weightspec();
+        if (0 != curOpWs.bytes_of_weight) {
+            this->weightNum = curOpWs.bytes_of_weight / bytesOf(curOpWs.mdt);
+        }
+        DataType dtNoQ = (DT_F16_8Q == this->dt) ?
DT_F16 : this->dt; + TensorDesc weightDesc = tensor1d(dtNoQ, this->weightNum); + TensorDesc biasDesc = tensor1d(dtNoQ, this->weightNum); + Tensor modelWeightTensor = Tensor(OCLMem); + Tensor modelBiasTensor = Tensor(OCLMem); + auto weightMem = (OclMemory *)modelWeightTensor.get_memory(); + auto vectorMem = (OclMemory *)modelBiasTensor.get_memory(); + modelWeightTensor.resize(weightDesc); + modelBiasTensor.resize(biasDesc); + U32 stride[3] = {(this->weightNum + 3) / 4 * 4, 1, 1}; + U32 offset[3] = {0, 0, 0}; + GCLMemType mt = GCL_MEM_BUF; + MemFlags flags = CL_MEM_READ_WRITE; + GCLMemDesc desc = gclmem_build_desc(); + CHECK_STATUS(gclmem_set_desc_padding(&desc, stride, offset, dtNoQ, DF_NCHW, mt, flags)); + weightMem->padding(desc); + vectorMem->padding(desc); + this->weightTensors.push_back(modelWeightTensor); + this->biasTensors.push_back(modelBiasTensor); + return SUCCESS; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + Tensor inputTensor = this->inputTensors[0]; + Tensor weightTensor = this->weightTensors[0]; + Tensor biasTensor = this->biasTensors[0]; + Tensor outputTensor = this->outputTensors[0]; + CHECK_STATUS(layer_normalization( + inputTensor, weightTensor, biasTensor, outputTensor, &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + CHECK_STATUS(normalization_infer_output_size(inTensors[0], outTensors[0], &this->archInfo)); + return SUCCESS; + } + + REGISTER_OCL_OPERATOR_RUN +}; + +#endif // _LAYER_NORM_OCL_H diff --git a/inference/engine/include/ocl/matmul_ocl.hpp b/inference/engine/include/ocl/matmul_ocl.hpp new file mode 100644 index 00000000..f92d499a --- /dev/null +++ b/inference/engine/include/ocl/matmul_ocl.hpp @@ -0,0 +1,97 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
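+// MatMulOCL below reuses the tuning-cache pattern seen in FullyConnectedOCL: the first
+// run searches for the best OpenCL work-item configuration, then persists it in the
+// shared AlgorithmMap under the operator name so later runs skip the search. A minimal
+// sketch of that lookup-or-tune contract (illustrative only; the four cached integers
+// mirror the algo[] layout used in this file):
+//
+//   I32 algo[4];
+//   if (algorithmMap->getAlgorithmInfoFromMap(name, algo, 4)) {
+//       runInfo.algorithm = (ConvolutionForwardAlgorithm)algo[0];  // cache hit
+//   } else {
+//       // tune once, then persist {algorithm, best_w, best_c, best_k}
+//       algorithmMap->setAlgorithmInfoToMap(name, algo, 4);
+//   }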
+ +#ifndef _MATMUL_OCL_H +#define _MATMUL_OCL_H + +#include "matmul.hpp" + +class MatMulOCL : public MatMul { +public: + MatMulOCL(DataType dt, MatMulParamSpec p) : MatMul(dt, p) + { + setMALIArchInfo(&(this->archInfo), &(this->runInfo), &this->needSetKernelVec, + &this->needSelectKernelLS); + } + + ~MatMulOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new MatMulOCL(this->dt, this->p)); + *mem = *this; + return mem; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + Tensor inputTensorA = this->inputTensors[0]; + Tensor inputTensorB = this->inputTensors[1]; + Tensor outputTensor = this->outputTensors[0]; + + CHECK_STATUS(matmul(inputTensorA, this->p.transpose_a, inputTensorB, this->p.transpose_b, + this->temp, outputTensor, &this->archInfo)); + } + + EE infer_forward_algorithm(std::shared_ptr algorithmMap) override + { + OCLContext::getInstance().handle.get()->kernelVec = &this->opKernelVec; + Tensor matrixATensor = this->inputTensors[0]; + Tensor matrixBTensor = this->inputTensors[1]; + Tensor matrixCTensor = this->outputTensors[0]; + ((MaliPara_t)(this->archInfo.archPara))->forwardRunInfo->algorithm = + CONVOLUTION_ALGORITHM_NULL; + I32 algo[4]; + if (algorithmMap->getAlgorithmInfoFromMap(this->name, algo, 4)) { + this->runInfo.algorithm = (ConvolutionForwardAlgorithm)algo[0]; + this->runInfo.best_w[0] = algo[1]; + this->runInfo.best_c[0] = algo[2]; + this->runInfo.best_k[0] = algo[3]; + } else { + CHECK_STATUS(matmul_infer_forward_algorithm(matrixATensor, this->p.transpose_a, + matrixBTensor, this->p.transpose_b, matrixCTensor, &this->archInfo)); + algo[0] = this->runInfo.algorithm; + algo[1] = this->runInfo.best_w[0]; + algo[2] = this->runInfo.best_c[0]; + algo[3] = this->runInfo.best_k[0]; + algorithmMap->setAlgorithmInfoToMap(this->name, algo, 4); + } + return SUCCESS; + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + CHECK_STATUS(matmul_infer_output_size(inTensors[0], this->p.transpose_a, inTensors[1], + this->p.transpose_b, outTensors[0], &this->archInfo)); + return SUCCESS; + } + + U32 infer_tmp_memory_size() override + { + U32 bytes = 0; + CHECK_STATUS(matmul_infer_forward_tmp_bytes(this->inputTensors[0], this->p.transpose_a, + this->inputTensors[1], this->p.transpose_b, &bytes, &this->archInfo)); + return bytes; + } + + REGISTER_OCL_OPERATOR_RUN + +protected: + ForwardRunInfoMali runInfo; +}; + +#endif // _MATMUL_OCL_H diff --git a/inference/engine/include/ocl/padding_ocl.hpp b/inference/engine/include/ocl/padding_ocl.hpp new file mode 100644 index 00000000..13827052 --- /dev/null +++ b/inference/engine/include/ocl/padding_ocl.hpp @@ -0,0 +1,57 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _PADDING_OCL_H +#define _PADDING_OCL_H + +#include "padding.hpp" + +class PaddingOCL : public Padding { +public: + PaddingOCL(DataType dt, PadParamSpec p) : Padding(dt, p) + { + setMALIArchInfo( + &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS); + } + + ~PaddingOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new PaddingOCL(this->dt, this->p)); + *mem = *this; + return mem; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + Tensor inputTensor = this->inputTensors[0]; + Tensor outputTensor = this->outputTensors[0]; + CHECK_STATUS(padding(inputTensor, this->p, outputTensor, &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + CHECK_STATUS( + padding_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); + return SUCCESS; + } + + REGISTER_OCL_OPERATOR_RUN +}; + +#endif // _PADDING_OCL_H diff --git a/inference/engine/include/ocl/pooling_ocl.hpp b/inference/engine/include/ocl/pooling_ocl.hpp new file mode 100644 index 00000000..53e590db --- /dev/null +++ b/inference/engine/include/ocl/pooling_ocl.hpp @@ -0,0 +1,66 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
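+// Note on the shape-inference branch below: a kernel size of 0 is taken as this
+// engine's "global pooling" convention (pool over the whole spatial plane), and the
+// stride is normalized to 1x1 before calling pooling_infer_output_size. Hypothetical
+// usage sketch, assuming PoolingParamSpec carries the fields referenced in this file:
+//
+//   PoolingParamSpec p;          // zero-initialize in real code
+//   p.kernel_h = 0;              // 0 x 0 kernel -> pool the entire H x W plane
+//   p.kernel_w = 0;
+//   PoolingOCL op(p);            // stride becomes 1 x 1 during shape inference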
+ +#ifndef _POOLING_OCL_H +#define _POOLING_OCL_H + +#include "pooling.hpp" + +class PoolingOCL : public Pooling { +public: + PoolingOCL(PoolingParamSpec p) : Pooling(p) + { + setMALIArchInfo( + &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS); + } + + ~PoolingOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = std::shared_ptr(new PoolingOCL(this->p)); + *mem = *this; + return mem; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + CHECK_STATUS(pooling( + this->inputTensors[0], this->p, this->temp, this->outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + if (this->p.kernel_h == 0 && this->p.kernel_w == 0) { + Pooling::set_stride(1, 1); + } + CHECK_STATUS( + pooling_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); + return SUCCESS; + } + + U32 infer_tmp_memory_size() override + { + U32 bytes = 0; + CHECK_STATUS(pooling_infer_forward_tmp_bytes( + this->inputTensors[0], this->outputTensors[0], &bytes, &this->archInfo)); + return bytes; + } + + REGISTER_OCL_OPERATOR_RUN +}; + +#endif // _POOLING_OCL_H diff --git a/inference/engine/include/ocl/power_ocl.hpp b/inference/engine/include/ocl/power_ocl.hpp new file mode 100644 index 00000000..dea8229e --- /dev/null +++ b/inference/engine/include/ocl/power_ocl.hpp @@ -0,0 +1,56 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#ifndef _POWER_OCL_H +#define _POWER_OCL_H + +#include "power.hpp" + +class PowerOCL : public Power { +public: + PowerOCL(DataType dt, PowerParamSpec p) : Power(dt, p) + { + setMALIArchInfo( + &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS); + } + + ~PowerOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = std::shared_ptr(new PowerOCL(this->dt, this->p)); + *mem = *this; + return mem; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + Tensor inputTensor = this->inputTensors[0]; + Tensor outputTensor = this->outputTensors[0]; + + CHECK_STATUS(power(inputTensor, this->p, outputTensor, &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + CHECK_STATUS(power_infer_output_size(inTensors[0], outTensors[0], &this->archInfo)); + return SUCCESS; + } + + REGISTER_OCL_OPERATOR_RUN +}; + +#endif // _POWER_OCL_H diff --git a/inference/engine/include/ocl/preallocated_memory_ocl.hpp b/inference/engine/include/ocl/preallocated_memory_ocl.hpp new file mode 100644 index 00000000..9b189417 --- /dev/null +++ b/inference/engine/include/ocl/preallocated_memory_ocl.hpp @@ -0,0 +1,58 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#ifndef _PREALLOCATED_MEMORY_OCL_H +#define _PREALLOCATED_MEMORY_OCL_H + +#include "preallocated_memory.hpp" + +class PreAllocatedMemoryOCL : public PreAllocatedMemory { +public: + PreAllocatedMemoryOCL(DataType dt, TensorDesc desc) : PreAllocatedMemory(dt, desc) + { + setMALIArchInfo( + &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS); + } + + ~PreAllocatedMemoryOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new PreAllocatedMemoryOCL(this->dt, this->desc)); + *mem = *this; + return mem; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + CHECK_STATUS(preallocated_memory(this->outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + if (inTensors.size() > 0) { + CHECK_STATUS(NOT_MATCH); + } + outTensors[0]->resize(this->desc); + CHECK_STATUS(preallocated_memory_infer_output_size(outTensors[0], &this->archInfo)); + return SUCCESS; + } + + REGISTER_OCL_OPERATOR_RUN +}; + +#endif // _PREALLOCATED_MEMORY_OCL_H diff --git a/inference/engine/include/ocl/prelu_ocl.hpp b/inference/engine/include/ocl/prelu_ocl.hpp new file mode 100644 index 00000000..3d784f5a --- /dev/null +++ b/inference/engine/include/ocl/prelu_ocl.hpp @@ -0,0 +1,86 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
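+// PReLU keeps one learned negative slope per channel, or a single slope shared by all
+// channels; infer_weight_desc below records the shared case in
+// preluDesc.propagate_down. The elementwise rule the kernel applies is simply
+// (scalar sketch for reference, not code from this kernel):
+//
+//   float prelu_scalar(float x, float slope) {
+//       return (x > 0) ? x : slope * x;
+//   }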
+ +#ifndef _PRELU_OCL_H +#define _PRELU_OCL_H + +#include "prelu.hpp" + +class PReLUOCL : public PReLU { +public: + PReLUOCL(DataType dt) : PReLU(dt) + { + setMALIArchInfo( + &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS); + } + + ~PReLUOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = std::shared_ptr(new PReLUOCL(this->dt)); + *mem = *this; + return mem; + } + + EE infer_weight_desc() override + { + auto curOpWs = this->get_weightspec(); + U32 weightNum = 0; + if (curOpWs.weight != nullptr) { + weightNum = curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt)); + } + if (weightNum == 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + if (weightNum == 1) { + this->preluDesc.propagate_down = true; + } else { + this->preluDesc.propagate_down = false; + } + Tensor modelWeightTensor = Tensor(OCLMem); + auto weightMem = (OclMemory *)modelWeightTensor.get_memory(); + TensorDesc weightDesc = tensor1d(this->dt, weightNum); + modelWeightTensor.resize(weightDesc); + + U32 stride[3] = {1, 1, 1}; + U32 offset[3] = {0, 0, 0}; + stride[0] = (weightNum > 1) ? (weightNum + 3) / 4 * 4 : 1; + GCLMemType mt = GCL_MEM_BUF; + MemFlags flags = CL_MEM_READ_WRITE; + GCLMemDesc desc = gclmem_build_desc(); + CHECK_STATUS(gclmem_set_desc_padding(&desc, stride, offset, this->dt, DF_NCHW, mt, flags)); + weightMem->padding(desc); + this->weightTensors.push_back(modelWeightTensor); + return SUCCESS; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + CHECK_STATUS(prelu(this->inputTensors[0], this->weightTensors[0], this->preluDesc, + this->outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + CHECK_STATUS(prelu_infer_output_size(inTensors[0], outTensors[0], &this->archInfo)); + return SUCCESS; + } + + REGISTER_OCL_OPERATOR_RUN +}; + +#endif // _PRELU_OCL_H diff --git a/inference/engine/include/ocl/repeat_ocl.hpp b/inference/engine/include/ocl/repeat_ocl.hpp new file mode 100644 index 00000000..e5d2892b --- /dev/null +++ b/inference/engine/include/ocl/repeat_ocl.hpp @@ -0,0 +1,116 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
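+// Repeat drives graph-level loops: after each pass the scheduler calls
+// get_next_operator_index(), which either jumps back to jumpOperatorIndex (run the
+// loop body again) or falls through to nextOperatorIndex (loop finished). An optional
+// extra input is read back from the device as an early-termination flag. Sketch of the
+// scheduler-side contract (illustrative pseudocode, not an API defined here):
+//
+//   int next = op->get_next_operator_index();
+//   // next == jumpOperatorIndex -> re-enter the loop body
+//   // next == nextOperatorIndex -> exit; the iteration counter has been reset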
+
+#ifndef _REPEAT_OCL_H
+#define _REPEAT_OCL_H
+
+#include "repeat.hpp"
+
+class RepeatOCL : public Repeat {
+public:
+    RepeatOCL(DataType dt, RepeatParamSpec p, I32 jumpOperatorIndex, I32 currentOperatorIndex)
+        : Repeat(dt, p, jumpOperatorIndex, currentOperatorIndex)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~RepeatOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<RepeatOCL> mem = std::shared_ptr<RepeatOCL>(
+            new RepeatOCL(this->dt, this->p, this->jumpOperatorIndex, this->nextOperatorIndex - 1));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+    }
+
+    int get_next_operator_index() override
+    {
+        // check status
+        if (this->inputTensors.size() > 1) {
+            Tensor inputTensor = this->inputTensors[1];
+            TensorDesc inputDesc = inputTensor.get_desc();
+            GCLMem_t ptr = (GCLMem_t)(((OclMemory *)(inputTensor.get_memory()))->get_ptr());
+            U32 length = tensorNumElements(inputDesc);
+            DataFormat df = ptr->desc.memFormat;
+            if (df != DF_NCHW) {
+                CHECK_STATUS(NOT_SUPPORTED);
+            }
+            U32 w_off, h_off;
+            w_off = ptr->desc.offset[0];
+            h_off = ptr->desc.offset[1];
+            if (w_off != 0 || h_off != 0) {
+                CHECK_STATUS(NOT_SUPPORTED);
+            }
+            I32 *val = hostVal.get();
+            CHECK_STATUS(gcl_trans_memory(OCLContext::getInstance().handle.get(), ptr, val, &length,
+                DEVICE_BUF_TO_HOST, CL_TRUE));
+            for (U32 i = 0; i < length; i++) {
+                // end loop
+                if (val[i]) {
+                    this->iter = 0;
+                    return this->nextOperatorIndex;
+                }
+            }
+        }
+
+        // check loop
+        if (this->iter < this->p.loops) {
+            this->iter++;
+            return this->jumpOperatorIndex;
+        } else {
+            this->iter = 0;
+            return this->nextOperatorIndex;
+        }
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        this->iter = 0;
+        if (this->p.axis >= 0) {
+            int axisIndex = 0;
+            if (inTensors.size() > 2) {
+                axisIndex = 2;
+            } else {
+                UNI_ERROR_LOG("[ERROR] using the axis feature of Repeat requires at "
+                    "least 3 input tensors\n");
+            }
+            TensorDesc desc = inTensors[axisIndex]->get_desc();
+            this->p.loops = desc.dims[desc.nDims - 1 - this->p.axis];
+        }
+        TensorDesc outDesc = outTensors[0]->get_desc();
+        outDesc.dt = this->dt;
+        outDesc.nDims = 0;
+        outTensors[0]->resize(outDesc);
+        auto inTensor = inTensors[1];
+        TensorDesc inDesc = inTensor->get_desc();
+        U32 length = tensorNumElements(inDesc);
+        // hostVal is read back as an array of `length` elements above, so it must
+        // be allocated as an array, not as a single I32 initialized to `length`
+        hostVal = std::shared_ptr<I32>(new I32[length], std::default_delete<I32[]>());
+        return SUCCESS;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+
+private:
+    std::shared_ptr<I32> hostVal;
+};
+
+#endif // _REPEAT_OCL_H
diff --git a/inference/engine/include/ocl/reshape_ocl.hpp b/inference/engine/include/ocl/reshape_ocl.hpp
new file mode 100644
index 00000000..5c038bab
--- /dev/null
+++ b/inference/engine/include/ocl/reshape_ocl.hpp
@@ -0,0 +1,67 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _RESHAPE_OCL_H +#define _RESHAPE_OCL_H + +#include "reshape.hpp" + +class ReshapeOCL : public Reshape { +public: + ReshapeOCL(DataType dt, ReshapeParamSpec p) : Reshape(dt, p) + { + setMALIArchInfo( + &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS); + } + + ~ReshapeOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new ReshapeOCL(this->dt, this->p)); + *mem = *this; + return mem; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + Tensor inputTensor = this->inputTensors[0]; + Tensor outputTensor = this->outputTensors[0]; + CHECK_STATUS(reshape(inputTensor, this->temp, outputTensor, &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + + CHECK_STATUS( + reshape_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); + + return SUCCESS; + } + + U32 infer_tmp_memory_size() override + { + U32 bytes = 0; + CHECK_STATUS(reshape_infer_forward_tmp_bytes( + this->inputTensors[0], this->outputTensors[0], &bytes, &this->archInfo)); + return bytes; + } + + REGISTER_OCL_OPERATOR_RUN +}; + +#endif // _RESHAPE_OCL_H diff --git a/inference/engine/include/ocl/resize_ocl.hpp b/inference/engine/include/ocl/resize_ocl.hpp new file mode 100644 index 00000000..fa4ab2bb --- /dev/null +++ b/inference/engine/include/ocl/resize_ocl.hpp @@ -0,0 +1,75 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _RESIZE_OCL_H
+#define _RESIZE_OCL_H
+
+#include "resize.hpp"
+#include "image.h"
+
+class ResizeOCL : public Resize {
+public:
+    ResizeOCL(DataType paramDT, ResizeParamSpec p) : Resize(paramDT, p)
+    {
+        setMALIArchInfo(
+            &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS);
+    }
+
+    ~ResizeOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<ResizeOCL> mem =
+            std::shared_ptr<ResizeOCL>(new ResizeOCL(this->paramDT, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        Tensor inputTensor = this->inputTensors[0];
+        Tensor outputTensor = this->outputTensors[0];
+        CHECK_STATUS(resize(inputTensor, this->temp, outputTensor, &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        ResizeDesc resizeDesc;
+        resizeDesc.paramDT = this->paramDT;
+        U32 bytes = 0;
+        switch (paramDT) {
+            case DT_F32: {
+                CHECK_REQUIREMENT(1 == this->p.scales[0] && 1 == this->p.scales[1]);
+                CHECK_STATUS(resize_infer_output_size(inTensors[0], resizeDesc, this->p.scales + 2,
+                    outTensors[0], &bytes, &this->archInfo));
+                break;
+            }
+            case DT_U32: {
+                CHECK_STATUS(resize_infer_output_size(inTensors[0], resizeDesc, this->p.sizes,
+                    outTensors[0], &bytes, &this->archInfo));
+                break;
+            }
+            default: {
+                CHECK_STATUS(NOT_SUPPORTED);
+            }
+        }
+        return SUCCESS;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+};
+
+#endif // _RESIZE_OCL_H
diff --git a/inference/engine/include/ocl/rnn_ocl.hpp b/inference/engine/include/ocl/rnn_ocl.hpp
new file mode 100644
index 00000000..4ab6424d
--- /dev/null
+++ b/inference/engine/include/ocl/rnn_ocl.hpp
@@ -0,0 +1,71 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ +#ifndef _RNN_OCL_H +#define _RNN_OCL_H + +#include "ocl/rnncell_ocl.hpp" + +class RNNOCL : public RNNCellOCL { +public: + RNNOCL(DataType dt, RNNParamSpec p) : RNNCellOCL(dt, p) + {} + + ~RNNOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = std::shared_ptr(new RNNOCL(this->dt, this->p)); + *mem = *this; + return mem; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + Tensor inputTensor = this->inputTensors[0]; + Tensor outputTensor = this->outputTensors[0]; + + // NOTE: no clean tmp and output + CHECK_STATUS(rnn(inputTensor, this->weightTensors, this->biasTensors, this->p, this->temp, + outputTensor, &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + return NOT_SUPPORTED; + TensorDesc inDim = inTensors[0]->get_desc(); + + DataType dt; + DataFormat df; + U32 iB, inT, iX; + CHECK_STATUS(tensor3dGet(inDim, &dt, &df, &iB, &inT, &iX)); + this->xDim = iX; + CHECK_STATUS(rnn_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); + return SUCCESS; + } + + U32 infer_tmp_memory_size() override + { + U32 bytes = 0; + CHECK_STATUS(rnn_infer_forward_tmp_bytes(this->inputTensors[0], this->weightTensors[0], + this->outputTensors[0], this->p, &bytes, &this->archInfo)); + return bytes; + } + + REGISTER_OCL_OPERATOR_RUN +}; + +#endif // _RNN_OCL_H diff --git a/inference/engine/include/ocl/rnncell_ocl.hpp b/inference/engine/include/ocl/rnncell_ocl.hpp new file mode 100644 index 00000000..8caf784f --- /dev/null +++ b/inference/engine/include/ocl/rnncell_ocl.hpp @@ -0,0 +1,228 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
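+// RNNCellOCL tunes one GEMM when there is no projection and two when
+// p.numProjection > 0, so its AlgorithmMap entry holds 4 or 7 integers. Layout sketch
+// (indices as consumed by infer_forward_algorithm below):
+//
+//   algo[0]    -> algorithm id, shared by both GEMMs
+//   algo[1..3] -> best_w/best_c/best_k for the cell GEMM       (runInfo slot 0)
+//   algo[4..6] -> best_w/best_c/best_k for the projection GEMM (runInfo slot 1)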
+
+#ifndef _RNNCELL_OCL_H
+#define _RNNCELL_OCL_H
+
+#include "rnncell.hpp"
+
+class RNNCellOCL : public RNNCell {
+public:
+    RNNCellOCL(DataType dt, RNNParamSpec p) : RNNCell(dt, p)
+    {
+        setMALIArchInfo(&(this->archInfo), &(this->runInfo), &this->needSetKernelVec,
+            &this->needSelectKernelLS);
+    }
+
+    ~RNNCellOCL(){DESTROY_OCL_KERNEL}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<RNNCellOCL> mem =
+            std::shared_ptr<RNNCellOCL>(new RNNCellOCL(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+        Tensor xTensor = this->inputTensors[0];
+        Tensor stateTensor = this->inputTensors[1];
+        Tensor hTensor = this->outputTensors[0];
+
+        CHECK_STATUS(rnncell(xTensor, this->weightTensors, this->biasTensors, stateTensor, this->p,
+            this->xDim, this->p.numOutput, 0, this->temp, hTensor, &this->archInfo));
+    }
+
+    EE infer_forward_algorithm(std::shared_ptr<AlgorithmMap> algorithmMap) override
+    {
+        OCLContext::getInstance().handle.get()->kernelVec = &this->opKernelVec;
+        Tensor xTensor = this->inputTensors[0];
+        Tensor filterTensor = this->weightTensors[0];
+        Tensor biasTensor = this->biasTensors[0];
+        Tensor hTensor = this->outputTensors[0];
+        ((MaliPara_t)(this->archInfo.archPara))->forwardRunInfo->algorithm =
+            CONVOLUTION_ALGORITHM_NULL;
+        I32 algo[7];
+        U32 algoNum = (this->p.numProjection > 0) ? 7 : 4;
+        if (algorithmMap->getAlgorithmInfoFromMap(this->name, algo, algoNum)) {
+            this->runInfo.algorithm = (ConvolutionForwardAlgorithm)algo[0];
+            this->runInfo.best_w[0] = algo[1];
+            this->runInfo.best_c[0] = algo[2];
+            this->runInfo.best_k[0] = algo[3];
+            if (algoNum == 7) {
+                // the second triple belongs to the projection GEMM (slot 1)
+                this->runInfo.best_w[1] = algo[4];
+                this->runInfo.best_c[1] = algo[5];
+                this->runInfo.best_k[1] = algo[6];
+            }
+        } else {
+            CHECK_STATUS(rnncell_infer_forward_algorithm(xTensor, filterTensor, biasTensor, this->p,
+                this->xDim, this->p.numOutput, hTensor, &this->archInfo));
+            algo[0] = this->runInfo.algorithm;
+            algo[1] = this->runInfo.best_w[0];
+            algo[2] = this->runInfo.best_c[0];
+            algo[3] = this->runInfo.best_k[0];
+            if (algoNum == 7) {
+                algo[4] = this->runInfo.best_w[1];
+                algo[5] = this->runInfo.best_c[1];
+                algo[6] = this->runInfo.best_k[1];
+            }
+            algorithmMap->setAlgorithmInfoToMap(this->name, algo, algoNum);
+        }
+        return SUCCESS;
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        this->needSetKernelVec = true;
+        TensorDesc inDim = inTensors[0]->get_desc();
+        DataType dt;
+        DataFormat df;
+        U32 iB, iX;
+        if (inDim.nDims == 2) {
+            CHECK_STATUS(tensor2dGet(inDim, &dt, &df, &iB, &iX));
+        } else if (inDim.nDims == 3) {
+            dt = inDim.dt;
+            U32 m, k, t;
+            if (inDim.df == DF_MTK) {
+                m = inDim.dims[2];
+                t = inDim.dims[1];
+                k = inDim.dims[0];
+            } else if (inDim.df == DF_MKT) {
+                m = inDim.dims[2];
+                t = inDim.dims[0];
+                k = inDim.dims[1];
+            } else {
+                return NOT_SUPPORTED;
+            }
+            if (t != 1) {
+                CHECK_STATUS(NOT_SUPPORTED);
+            }
+            iB = m;
+            iX = k;
+        } else {
+            return NOT_SUPPORTED;
+        }
+        this->xDim = iX;
+        CHECK_STATUS(rnncell_infer_output_size(inTensors, this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        U32 bytes = 0;
+        CHECK_STATUS(rnncell_infer_forward_tmp_bytes(this->inputTensors[0], this->weightTensors[0],
+            this->outputTensors[0], this->p, &bytes, &this->archInfo));
+        return bytes;
+    }
+
+    GCLMemDesc infer_wtm_memory_size_mali() override
+    {
+        U32 stride[3] = {0, 0, 0};
+        U32 offset[3] = {0, 0, 0};
+        GCLMemDesc tmpDesc =
gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4); + GCLMemDesc gclmemWtmDesc[2]; + gclmemWtmDesc[0] = tmpDesc; + gclmemWtmDesc[1] = tmpDesc; + U32 bytes = 0; + ((MaliPara_t)(this->archInfo.archPara))->gclmemFilterDesc = gclmemWtmDesc; + CHECK_STATUS( + rnn_transform_filter_bytes(this->weightTensors, this->p, &bytes, &this->archInfo)); + wtm_pro = std::shared_ptr(new Tensor(OCLMem)); + OclMemory *wtmMem = (OclMemory *)wtm_pro->get_memory(); + wtmMem->padding(gclmemWtmDesc[1]); + if (this->p.numProjection > 0) { + wtm_pro->alloc(); + } + return gclmemWtmDesc[0]; + } + + EE transform_filter() override + { + auto wtmDesc = this->infer_wtm_memory_size_mali(); + this->wtm = std::shared_ptr(new Tensor(OCLMem)); + OclMemory *wtmMem = (OclMemory *)this->wtm->get_memory(); + wtmMem->padding(wtmDesc); + this->wtm->alloc(); + std::vector filterTensors; + std::vector ftmTensors; + filterTensors.push_back(this->weightTensors[0]); + ftmTensors.push_back(this->wtm.get()); + if (this->p.numProjection > 0) { + filterTensors.push_back(this->weightTensors[1]); + ftmTensors.push_back(this->wtm_pro.get()); + } + CHECK_STATUS(rnn_transform_filter(filterTensors, this->p, ftmTensors, &this->archInfo)); + this->weightTensors[0] = *this->get_wtm(); + if (this->p.numProjection > 0) { + this->weightTensors[1] = *wtm_pro.get(); + } + return SUCCESS; + } + + EE infer_weight_desc() override + { + U32 row = this->xDim + this->p.numOutput; + U32 column = (this->p.numProjection > 0) ? this->p.numProjection : this->p.numOutput; + U32 filterRow = 4 * column; + U32 filterCol = this->p.numOutput + this->xDim; + TensorDesc weightDesc[2]; + weightDesc[0] = tensor2df(this->dt, DF_NK, filterRow, filterCol); + TensorDesc biasDesc = tensor1d(this->dt, column * 4); + U32 weightNum = 1; + if (this->p.numProjection > 0) { + weightDesc[1] = tensor2df(this->dt, DF_NK, this->p.numOutput, this->p.numProjection); + weightNum = 2; + } + + for (U32 i = 0; i < weightNum; i++) { + Tensor modelWeightTensor = Tensor(OCLMem); + modelWeightTensor.resize(weightDesc[i]); + auto weightMem = (OclMemory *)modelWeightTensor.get_memory(); + U32 s0 = (i == 0) ? row : this->p.numProjection; + U32 s1 = (i == 0) ? column * 4 : this->p.numOutput; + U32 stride[3] = {s0, s1, 1}; + U32 offset[3] = {0, 0, 0}; + GCLMemType mt = GCL_MEM_BUF; + MemFlags flags = CL_MEM_READ_WRITE; + GCLMemDesc desc = gclmem_build_desc(); + CHECK_STATUS(gclmem_set_desc_padding(&desc, stride, offset, dt, DF_NCHW, mt, flags)); + weightMem->padding(desc); + this->weightTensors.push_back(modelWeightTensor); + + if (i == 0) { + Tensor modelBiasTensor = Tensor(OCLMem); + auto vectorMem = (OclMemory *)modelBiasTensor.get_memory(); + modelBiasTensor.resize(biasDesc); + stride[0] = column * 4; + stride[1] = 1; + CHECK_STATUS(gclmem_set_desc_padding(&desc, stride, offset, dt, DF_NCHW, mt, flags)); + vectorMem->padding(desc); + this->biasTensors.push_back(modelBiasTensor); + } + } + return SUCCESS; + } + + REGISTER_OCL_OPERATOR_RUN + +private: + std::shared_ptr wtm_pro; + +protected: + ForwardRunInfoMali runInfo; +}; + +#endif // _RNNCELL_OCL_H diff --git a/inference/engine/include/ocl/scale_ocl.hpp b/inference/engine/include/ocl/scale_ocl.hpp new file mode 100644 index 00000000..6a37c588 --- /dev/null +++ b/inference/engine/include/ocl/scale_ocl.hpp @@ -0,0 +1,118 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _SCALE_GPU_H +#define _SCALE_GPU_H + +#include "scale.hpp" + +class ScaleOCL : public Scale { +public: + ScaleOCL(DataType dt, ScaleParamSpec p, int numChannels) : Scale(dt, p, numChannels) + { + setMALIArchInfo( + &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS); + } + + ~ScaleOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new ScaleOCL(this->dt, this->p, this->numChannels)); + *mem = *this; + return mem; + } + + EE infer_weight_desc() override + { + auto curOpWs = this->get_weightspec(); + if (0 != curOpWs.bytes_of_weight) { + this->numChannels = curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt)); + } else if (0 != curOpWs.bytes_of_vec) { + this->numChannels = curOpWs.bytes_of_vec / UNI_MAX(1, bytesOf(curOpWs.mdt)); + } else { + this->numChannels = 0; + } + Tensor modelWeightTensor = Tensor(OCLMem); + Tensor modelBiasTensor = Tensor(OCLMem); + TensorDesc weightDesc = tensor1d(this->dt, this->numChannels); + TensorDesc biasDesc = weightDesc; + modelWeightTensor.resize(weightDesc); + modelBiasTensor.resize(biasDesc); + auto weightMem = (OclMemory *)modelWeightTensor.get_memory(); + auto vectorMem = (OclMemory *)modelBiasTensor.get_memory(); + + U32 stride[3] = {(this->numChannels + 3) / 4 * 4, 1, 1}; + U32 offset[3] = {0, 0, 0}; + GCLMemType mt = GCL_MEM_BUF; + MemFlags flags = CL_MEM_READ_WRITE; + GCLMemDesc desc = gclmem_build_desc(); + CHECK_STATUS(gclmem_set_desc_padding(&desc, stride, offset, this->dt, DF_NCHW, mt, flags)); + weightMem->padding(desc); + vectorMem->padding(desc); + this->weightTensors.push_back(modelWeightTensor); + this->biasTensors.push_back(modelBiasTensor); + return SUCCESS; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + int inputNum = this->inputTensors.size(); + Tensor inputTensor = this->inputTensors[this->dataID]; + Tensor outputTensor = this->outputTensors[0]; + if (inputNum == 1 && weightTensors.size() == 0) { + CHECK_STATUS(NOT_MATCH); + } + + if (inputNum > 1) { + U32 cNum = this->inputTensors[0].get_desc().dims[2]; + for (int i = 1; i < inputNum; i++) { + if (cNum != this->inputTensors[i].get_desc().dims[2]) { + CHECK_STATUS(NOT_MATCH); + } + } + } + + void *alpha, *beta; + if (inputNum == 1) { + alpha = ((OclMemory *)(this->weightTensors[0].get_memory()))->get_ptr(); + beta = ((OclMemory *)(this->biasTensors[0].get_memory()))->get_ptr(); + } 
else { + alpha = ((OclMemory *)(this->inputTensors[1 - this->dataID].get_memory()))->get_ptr(); + beta = nullptr; + } + CHECK_STATUS(scale(inputTensor, alpha, beta, this->p, outputTensor, &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + if (inTensors.size() > 1) { + U32 len0 = inTensors[0]->length(); + U32 len1 = inTensors[1]->length(); + if (len1 > len0) { + this->dataID = 1; + } + } + CHECK_STATUS( + scale_infer_output_size(inTensors[this->dataID], outTensors[0], &this->archInfo)); + return SUCCESS; + } + + REGISTER_OCL_OPERATOR_RUN +}; + +#endif // _SCALE_GPU_H diff --git a/inference/engine/include/ocl/shared_weight_ocl.hpp b/inference/engine/include/ocl/shared_weight_ocl.hpp new file mode 100644 index 00000000..1a0ce04c --- /dev/null +++ b/inference/engine/include/ocl/shared_weight_ocl.hpp @@ -0,0 +1,134 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _SHARED_WEIGHT_OCL_H +#define _SHARED_WEIGHT_OCL_H + +#include "shared_weight.hpp" + +#include "ocl_desc_trans.h" +#include "ocl_data_trans.h" + +class SharedWeightOCL : public SharedWeight { +public: + SharedWeightOCL(DataType dt, + TensorDesc desc, + std::string outputTensorName, + std::map> *tensorMapPtr) + : SharedWeight(dt, desc, outputTensorName, tensorMapPtr) + { + setMALIArchInfo( + &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS); + } + + ~SharedWeightOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = std::shared_ptr( + new SharedWeightOCL(this->dt, this->desc, this->outputTensorName, tensorMapPtr)); + *mem = *this; + return mem; + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + UNUSED(inTensors); + outTensors[0]->resize(this->desc); + U32 s0, s1, s2; + s0 = this->desc.dims[0]; + s1 = (this->desc.nDims > 1) ? this->desc.dims[1] : 1; + s2 = (this->desc.nDims > 2) ? 
this->desc.dims[2] : 1;
+        U32 stride[3] = {s0, s1, s2};
+        U32 offset[3] = {0, 0, 0};
+        GCLMemType mt = GCL_MEM_BUF;
+        MemFlags flags = CL_MEM_READ_WRITE;
+        GCLMemDesc gclMemDesc = gclmem_build_desc();
+        CHECK_STATUS(gclmem_set_desc_padding(&gclMemDesc, stride, offset, dt, DF_NCHW, mt, flags));
+        ocl_set_desc(outTensors[0], gclMemDesc);
+        return SUCCESS;
+    }
+
+    inline void run_prepare()
+    {
+        OCLContext::getInstance().handle.get()->curOpName = this->get_name();
+    }
+
+    EE init_weight_bias_from_model(std::shared_ptr<U8> *modelPtr) override
+    {
+        auto dstTensor = (*this->tensorMapPtr)[this->outputTensorName];
+        auto dstMem = (OclMemory *)(dstTensor->get_memory());
+        GCLMemDesc dstMemDesc = dstMem->get_desc();
+        std::shared_ptr<U8> weight_ptr;
+        auto curOpWs = this->get_weightspec();
+        if (modelPtr) {
+            weight_ptr = *modelPtr;
+        } else {
+            weight_ptr = std::shared_ptr<U8>(curOpWs.weight);
+        }
+        U32 s0, s1, s2;
+        s0 = this->desc.dims[0];
+        s1 = (this->desc.nDims > 1) ? this->desc.dims[1] : 1;
+        s2 = (this->desc.nDims > 2) ? this->desc.dims[2] : 1;
+        this->needTrans = false;
+        if (dstMemDesc.stride[0] == s0 && dstMemDesc.stride[1] == s1 && dstMemDesc.stride[2] == s2) {
+            CpuMemory weight_mem_src;
+            weight_mem_src.resize(this->desc);
+            weight_mem_src.set_shared_ptr(std::shared_ptr<U8>(weight_ptr));
+            dstMem->copy_from((Memory *)&weight_mem_src);
+        } else {
+            this->needTrans = true;
+            this->host_ptr = weight_ptr;
+        }
+        this->weightTensors.push_back(*dstTensor.get());
+        if (modelPtr) {
+            *modelPtr =
+                std::shared_ptr<U8>(*modelPtr, (*modelPtr).get() + tensorNumBytes(this->desc));
+        }
+        return SUCCESS;
+    }
+
+    EE transform_filter() override
+    {
+        if (needTrans) {
+            auto dstTensor = (*this->tensorMapPtr)[this->outputTensorName];
+            auto dstMem = (OclMemory *)(dstTensor->get_memory());
+            GCLMem_t dst = (GCLMem_t)dstMem->get_ptr();
+            auto tempMem = (OclMemory *)(this->temp.get_memory());
+            GCLMem_t temp = (GCLMem_t)tempMem->get_ptr();
+            CHECK_STATUS(ocl_set_input(OCLContext::getInstance().handle.get(), dst, this->desc,
+                host_ptr.get(), temp, true));
+            this->weightTensors[0] = *dstTensor.get();
+        }
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        U32 bytes = 0;
+        if (needTrans) {
+            bytes = tensorNumBytes(this->desc);
+        }
+        return bytes;
+    }
+
+    REGISTER_OCL_OPERATOR_RUN
+
+private:
+    std::shared_ptr<U8> host_ptr;
+    bool needTrans;
+};
+
+#endif // _SHARED_WEIGHT_OCL_H
diff --git a/inference/engine/include/ocl/slice_ocl.hpp b/inference/engine/include/ocl/slice_ocl.hpp
new file mode 100644
index 00000000..b825e6dc
--- /dev/null
+++ b/inference/engine/include/ocl/slice_ocl.hpp
@@ -0,0 +1,54 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _SLICE_OCL_H +#define _SLICE_OCL_H + +#include "slice.hpp" + +class SliceOCL : public Slice { +public: + SliceOCL(DataType dt, SliceParamSpec p) : Slice(dt, p) + { + setMALIArchInfo( + &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS); + } + + ~SliceOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = std::shared_ptr(new SliceOCL(this->dt, this->p)); + *mem = *this; + return mem; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + Tensor inputTensor = this->inputTensors[0]; + CHECK_STATUS(slice(this->inputTensors[0], this->p, this->outputTensors, &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + CHECK_STATUS(slice_infer_output_size(inTensors[0], this->p, outTensors, &this->archInfo)); + return SUCCESS; + } + + REGISTER_OCL_OPERATOR_RUN +}; + +#endif // _SLICE_OCL_H diff --git a/inference/engine/include/ocl/softmax_ocl.hpp b/inference/engine/include/ocl/softmax_ocl.hpp new file mode 100644 index 00000000..7afb03ac --- /dev/null +++ b/inference/engine/include/ocl/softmax_ocl.hpp @@ -0,0 +1,63 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#ifndef _SOFTMAX_OCL_H +#define _SOFTMAX_OCL_H + +#include "softmax.hpp" + +class SoftmaxOCL : public Softmax { public: + SoftmaxOCL(DataType dt, SoftmaxParamSpec p) : Softmax(dt, p) + { + setMALIArchInfo( + &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS); + } + + ~SoftmaxOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr<Operator> clone() override + { + std::shared_ptr<Operator> mem = + std::shared_ptr<SoftmaxOCL>(new SoftmaxOCL(this->dt, this->p)); + *mem = *this; + return mem; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + CHECK_STATUS(softmax( + this->inputTensors[0], this->p, this->temp, this->outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override + { + this->needSetKernelVec = true; + CHECK_STATUS(softmax_infer_output_size(inTensors[0], outTensors[0], &this->archInfo)); + return SUCCESS; + } + + U32 infer_tmp_memory_size() override + { + U32 bytes = 0; + CHECK_STATUS( + softmax_infer_forward_tmp_bytes(this->inputTensors[0], &bytes, &this->archInfo)); + return bytes; + } + + REGISTER_OCL_OPERATOR_RUN +}; + +#endif // _SOFTMAX_OCL_H diff --git a/inference/engine/include/ocl/space2depth_ocl.hpp b/inference/engine/include/ocl/space2depth_ocl.hpp new file mode 100644 index 00000000..ae76f01c --- /dev/null +++ b/inference/engine/include/ocl/space2depth_ocl.hpp @@ -0,0 +1,54 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
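// SoftmaxOCL above is one of several operators that report their scratch
// requirement through infer_tmp_memory_size() and get the buffer back through
// set_tmp_memory(). A simplified engine-side sketch of that contract --
// assumed scheduling logic, not bolt's actual CNN implementation:

#include <memory>
#include <vector>
#include "operator.hpp"

static U32 max_scratch_bytes(std::vector<std::shared_ptr<Operator>> &ops)
{
    U32 maxBytes = 0;
    for (auto &op : ops) {
        U32 bytes = op->infer_tmp_memory_size();
        if (bytes > maxBytes) {
            maxBytes = bytes;
        }
    }
    return maxBytes;
}

static void bind_scratch(std::vector<std::shared_ptr<Operator>> &ops, Tensor scratch)
{
    // one buffer sized to the maximum is enough: operators run one at a time
    for (auto &op : ops) {
        op->set_tmp_memory(scratch);
    }
}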
+ +#ifndef _SPACE2DEPTH_OCL_H +#define _SPACE2DEPTH_OCL_H + +#include "space2depth.hpp" + +class Space2DepthOCL : public Space2Depth { +public: + Space2DepthOCL(DataType dt) : Space2Depth(dt) + { + setMALIArchInfo( + &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS); + } + + ~Space2DepthOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new Space2DepthOCL(this->dt)); + *mem = *this; + return mem; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + CHECK_STATUS(space2depth(this->inputTensors[0], this->outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + CHECK_STATUS(space2depth_infer_output_size(inTensors[0], outTensors[0], &this->archInfo)); + return SUCCESS; + } + + REGISTER_OCL_OPERATOR_RUN +}; + +#endif // _SPACE2DEPTH_OCL_H diff --git a/inference/engine/include/ocl/squeeze_ocl.hpp b/inference/engine/include/ocl/squeeze_ocl.hpp new file mode 100644 index 00000000..7e29a191 --- /dev/null +++ b/inference/engine/include/ocl/squeeze_ocl.hpp @@ -0,0 +1,55 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
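// Each *_ocl.hpp in this patch implements clone() with the same two-step
// recipe: build a fresh instance through the OCL constructor (so
// setMALIArchInfo runs again for the new object), then copy-assign the rest
// of the state. A generic sketch of the pattern, with MyOp/MyOpOCL as
// stand-in names rather than real bolt classes:

class MyOpOCL : public MyOp {
public:
    std::shared_ptr<Operator> clone() override
    {
        std::shared_ptr<MyOpOCL> mem =
            std::shared_ptr<MyOpOCL>(new MyOpOCL(this->dt, this->p));
        *mem = *this;  // member-wise copy: tensors, name, positions, params
        return mem;
    }
};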
+ +#ifndef _SQUEEZE_OCL_H +#define _SQUEEZE_OCL_H + +#include "squeeze.hpp" + +class SqueezeOCL : public Squeeze { +public: + SqueezeOCL(DataType dt, SqueezeParamSpec p) : Squeeze(dt, p) + { + setMALIArchInfo( + &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS); + } + + ~SqueezeOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new SqueezeOCL(this->dt, this->p)); + *mem = *this; + return mem; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + CHECK_STATUS(squeeze(this->inputTensors[0], this->outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + CHECK_STATUS( + squeeze_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); + return SUCCESS; + } + + REGISTER_OCL_OPERATOR_RUN +}; + +#endif // _SQUEEZE_OCL_H diff --git a/inference/engine/include/ocl/transpose_ocl.hpp b/inference/engine/include/ocl/transpose_ocl.hpp new file mode 100644 index 00000000..d404f7bc --- /dev/null +++ b/inference/engine/include/ocl/transpose_ocl.hpp @@ -0,0 +1,64 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
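// Shape reference for the squeeze/unsqueeze pair in this patch (standard
// semantics: squeeze drops the listed size-1 axes, unsqueeze re-inserts
// size-1 axes at the listed positions):
//
//   squeeze   {1, 64, 1, 7} with axes {0, 2} -> {64, 7}
//   unsqueeze {64, 7}       with axes {0, 2} -> {1, 64, 1, 7}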
+ +#ifndef _TRANSPOSE_OCL_H +#define _TRANSPOSE_OCL_H + +#include "transpose.hpp" + +class TransposeOCL : public Transpose { +public: + TransposeOCL(DataType dt, TransposeParamSpec p) : Transpose(dt, p) + { + setMALIArchInfo( + &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS); + } + + ~TransposeOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new TransposeOCL(this->dt, this->p)); + *mem = *this; + return mem; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + CHECK_STATUS(transpose( + this->inputTensors[0], this->p, this->temp, this->outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + CHECK_STATUS( + transpose_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); + return SUCCESS; + } + + U32 infer_tmp_memory_size() override + { + U32 bytes = 0; + CHECK_STATUS(transpose_infer_forward_tmp_bytes( + this->inputTensors[0], this->outputTensors[0], &bytes, &this->archInfo)); + return bytes; + } + + REGISTER_OCL_OPERATOR_RUN +}; + +#endif // _TRANSPOSE_OCL_H diff --git a/inference/engine/include/ocl/unsqueeze_ocl.hpp b/inference/engine/include/ocl/unsqueeze_ocl.hpp new file mode 100644 index 00000000..03aec120 --- /dev/null +++ b/inference/engine/include/ocl/unsqueeze_ocl.hpp @@ -0,0 +1,54 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
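// Worked example for TransposeOCL above: applying the permutation
// {0, 2, 3, 1} to an NCHW tensor of shape {1, 8, 4, 4} yields {1, 4, 4, 8},
// an NCHW -> NHWC layout change; in general out_dims[i] = in_dims[perm[i]].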
+ +#ifndef _UNSQUEEZE_OCL_H +#define _UNSQUEEZE_OCL_H + +#include "unsqueeze.hpp" + +class UnsqueezeOCL : public Unsqueeze { +public: + UnsqueezeOCL(DataType dt, UnsqueezeParamSpec p) : Unsqueeze(dt, p) + { + setMALIArchInfo( + &(this->archInfo), nullptr, &this->needSetKernelVec, &this->needSelectKernelLS); + } + + ~UnsqueezeOCL(){DESTROY_OCL_KERNEL} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new UnsqueezeOCL(this->dt, this->p)); + *mem = *this; + return mem; + } + + inline void run_prepare() + { + OCLContext::getInstance().handle.get()->curOpName = this->get_name(); + CHECK_STATUS(unsqueeze(this->inputTensors[0], this->outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + this->needSetKernelVec = true; + CHECK_STATUS( + unsqueeze_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); + return SUCCESS; + } + REGISTER_OCL_OPERATOR_RUN +}; + +#endif // _UNSQUEEZE_OCL_H diff --git a/inference/include/operator.hpp b/inference/engine/include/operator.hpp similarity index 53% rename from inference/include/operator.hpp rename to inference/engine/include/operator.hpp index 0f025266..33b70642 100644 --- a/inference/include/operator.hpp +++ b/inference/engine/include/operator.hpp @@ -1,275 +1,251 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
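// The operator.hpp hunk below both relocates the file and modernizes the
// Operator base class. The main API changes, old -> new, as visible in this
// diff:
//
//   Vec<Tensor>                                -> std::vector<Tensor>
//   get_op_type()                              -> get_type()
//   set_op_name(...)                           -> set_name(...)
//   set_op_schedule(Arch)                      -> set_schedule(Arch)
//   set_tmp_memory(len, ptr)                   -> set_tmp_memory(Tensor)
//   set_mali_handle()/infer_gclmem_desc()      -> per-operator ArchInfo member
//   setAlgorithmInfoToMap()/getAlgorithmInfoFromMap()
//                                              -> std::shared_ptr<AlgorithmMap>
//   (clone() is newly required of every operator)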
- - -#ifndef _OPERATOR_H -#define _OPERATOR_H - -#include -#include "sys.h" -#include "tensor_computing.h" -#include "tensor.hpp" -#include "op_type.h" -#include -#include "model_tools.h" -#define HashMap std::map - -#ifdef _USE_MALI -#include "gcl.h" -#endif - -class Operator { -public: - virtual bool checkOperator() { - for (U32 i = 0; i < inputTensors.size(); i++) { - if (!tensorDescIsValid(inputTensors[i].get_desc())) - return false; - } - for (U32 i = 0; i < outputTensors.size(); i++) { - if (!tensorDescIsValid(outputTensors[i].get_desc())) - return false; - } - return true; - }; - - virtual void run() = 0; - - /** - * @param inputTensors - * @param outputTensors - */ - virtual void set_input_output_tensors(Vec it, Vec ot) - { - this->inputTensors = it; - this->outputTensors = ot; - } - - virtual Vec get_input_tensors() - { - return this->inputTensors; - } - - virtual Vec get_output_tensors() - { - return this->outputTensors; - } - - virtual void set_input_tensors(Vec it) - { - this->inputTensors = it; - } - - virtual void set_output_tensors(Vec ot) - { - this->outputTensors = ot; - } - - virtual bool can_input_output_the_same() { return false; } - - virtual EE infer_output_tensors_size(Vec, Vec*) = 0; - - std::string get_name() - { - return this->name; - } - /** - * @param name - */ - explicit Operator(std::string name) - { - this->name = name; - } - - Operator():name("") { } - - virtual bool is_weight() - { - return false; - } - - virtual U32 infer_tmp_memory_size() - { - this->lenOfTemp = 0; - this->temp = std::shared_ptr(); - return 0; - } - - virtual void set_tmp_memory(U32 len, std::shared_ptr temp) - { - this->lenOfTemp = len; - this->temp = temp; - } -#ifdef _USE_MALI - virtual EE set_mali_handle(std::shared_ptr handle){ - this->handle = handle; - oclExtInfo.maliInfo.handle = handle.get(); - runInfo.algorithm = 0; - runInfo.best_w[0] = 1; - runInfo.best_w[1] = 1; - runInfo.best_c[0] = 1; - runInfo.best_c[1] = 1; - runInfo.best_k[0] = 1; - runInfo.best_k[1] = 1; - oclExtInfo.maliInfo.forwardRunInfo = &runInfo; - return SUCCESS; - } - virtual EE infer_gclmem_desc(Vec*, Vec*){return NOT_SUPPORTED;} -#endif - - virtual U32 get_len_of_temp() - { - return this->lenOfTemp; - } - - virtual std::shared_ptr get_tmp() - { - return this->temp; - } - - virtual OperatorType get_op_type() = 0; - - virtual void set_op_name(std::string opName) { - this->name = opName; - } - - virtual void set_op_schedule(Arch opSchedule) { - this->schedule = opSchedule; - } - - virtual Vec get_tensor_positions() - { - return this->tensorPos; - } - - virtual void set_tensor_positions(Vec tensorPos) - { - this->tensorPos = tensorPos; - } - - virtual ~Operator(){ } - - virtual int get_next_operator_index() - { - return -1; - } - - virtual void setAlgorithmInfoToMap(HashMap &algorithmMap, std::string name, I32* algorithmArray, U32 ArrayNum) - { - std::string algoInfo = "/"; - for(U32 i = 0; i < ArrayNum; i++) { - algoInfo += std::to_string(algorithmArray[i]); - algoInfo += "/"; - } - algorithmMap[name] = algoInfo; - } - - virtual void getAlgorithmInfoFromMap(HashMap &algorithmMap, std::string name, I32* algorithmArray, U32 ArrayNum) - { - std::string algoInfo = algorithmMap[name]; - U32 be = algoInfo.find_first_of("/"); - U32 end; - for(U32 i = 0; i < ArrayNum; i++) { - end = algoInfo.find("/", be + 1); - algorithmArray[i] = std::stoi(algoInfo.substr(be + 1, end - be - 1)); - be = end; - } - } - - virtual void init_feature_scale(U32 num, QuantSpec* qs) - { - UNUSED(num); - UNUSED(qs); -#ifdef _USE_INT8 - if (1 == 
num && 0 == qs[0].scale[0]) { // OP is labelled as no-quantization - if (DT_F16_8Q == this->dt) { - this->dt = DT_F16; - } - return; - } - featureScale.resize(num); - for (U32 i = 0; i < num; i++) { - featureScale[i].resize(qs[i].num_scale); - memcpy(featureScale[i].data(), qs[i].scale, qs[i].num_scale * bytesOf(DT_F32)); - } -#endif - } - -#ifdef _USE_INT8 - virtual void set_feature_scale(Vec> fs) { - this->featureScale = fs; - } - - virtual bool is_dynamic_scale() - { - OperatorType ot = this->get_op_type(); - if (OT_Conv != ot) { - return false; - } - - U32 numScale = featureScale.size(); - U32 numQuant = (DT_F16_8Q == this->dt) ? inputTensors.size() : 0; - - if (0 != numScale && 0 == featureScale[0][0]) { // OP is labelled as no-quantization - return false; - } - - if (0 != numScale && -2 == (featureScale.back())[0]) { // OP is labelled as fp-output - numScale = 0; - numQuant += 1; - } - - for (auto tensor : outputTensors) { - if (DT_I8 == tensor.get_desc().dt) { - numQuant++; - } - } - if (0 == numQuant) { - return false; - } - - if (0 == numScale) { - return true; - } - - CHECK_REQUIREMENT(numQuant == numScale); - return false; - } - -#endif - std::string get_op_name() - { - return this->name; - } - -public: - Arch schedule; - DataType dt; - - Vec inputTensors; - Vec outputTensors; - Vec tensorPos; - - U32 lenOfTemp; - std::shared_ptr temp; - -#ifdef _USE_MALI - std::shared_ptr handle; - ExtInfo oclExtInfo; - ForwardRunInfoMali runInfo; -#endif - - std::string name; - Vec> featureScale; -}; - -#endif //_OPERATOR_H +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
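// The retired setAlgorithmInfoToMap/getAlgorithmInfoFromMap above stored tuned
// algorithm choices as "/a/b/c/" strings keyed by operator name; the rewrite
// hands that job to the AlgorithmMap class. A self-contained sketch of the old
// encoding, mirroring the deleted logic:

#include <string>
#include <vector>

static std::string encode_algorithm(const std::vector<int> &values)
{
    std::string s = "/";
    for (int v : values) {
        s += std::to_string(v) + "/";  // {1, 4, 8} -> "/1/4/8/"
    }
    return s;
}

static std::vector<int> decode_algorithm(const std::string &s, size_t count)
{
    std::vector<int> out;
    size_t be = s.find_first_of('/');
    for (size_t i = 0; i < count; i++) {
        size_t end = s.find('/', be + 1);
        out.push_back(std::stoi(s.substr(be + 1, end - be - 1)));
        be = end;
    }
    return out;
}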
+ +#ifndef _OPERATOR_H +#define _OPERATOR_H + +#include +#include "sys.h" +#include "tensor.hpp" +#include "algorithm_map.h" +#include "tensor_computing.h" +#ifdef _USE_MALI +#include "gcl.h" +#include "gcl_engine.h" +#endif + +class Operator { +public: + Operator() + { + this->dt = DT_F32; + this->name = ""; + this->lenOfTemp = 0; + this->archInfo.archPara = nullptr; + } + + Operator(std::string name) + { + this->dt = DT_F32; + this->name = name; + this->lenOfTemp = 0; + this->archInfo.archPara = nullptr; + } + + virtual ~Operator() + { + if (this->archInfo.archPara != nullptr) { + free(this->archInfo.archPara); + this->archInfo.archPara = nullptr; + } + } + + virtual std::shared_ptr clone() = 0; + + virtual EE infer_output_tensors_size(std::vector, std::vector) = 0; + + virtual U32 infer_tmp_memory_size() + { + this->lenOfTemp = 0; + return 0; + } + + virtual void set_tmp_memory(Tensor temp) + { + this->lenOfTemp = temp.bytes(); + this->temp = temp; + } + + virtual void run() = 0; + + virtual void set_input_output_tensors(std::vector it, std::vector ot) + { + this->inputTensors = it; + this->outputTensors = ot; + } + + virtual void set_input_tensors(std::vector it) + { + this->inputTensors = it; + } + + virtual std::vector get_input_tensors() + { + return this->inputTensors; + } + + virtual void set_output_tensors(std::vector ot) + { + this->outputTensors = ot; + } + + virtual std::vector get_output_tensors() + { + return this->outputTensors; + } + + virtual bool can_input_output_the_same() + { + return false; + } + + virtual bool is_weight() + { + return false; + } + + virtual U32 get_len_of_temp() + { + return this->lenOfTemp; + } + + virtual Tensor get_tmp() + { + return this->temp; + } + + virtual void set_name(std::string opName) + { + this->name = opName; + } + + std::string get_name() + { + return this->name; + } + + virtual void set_schedule(Arch opSchedule) + { + this->archInfo.arch = opSchedule; + } + + virtual void set_tensor_positions(std::vector tensorPos) + { + this->tensorPos = tensorPos; + } + + virtual std::vector &get_tensor_positions() + { + return this->tensorPos; + } + + virtual int get_next_operator_index() + { + return -1; + } + + virtual void init_feature_scale(U32 num, QuantSpec *qs) + { +#ifdef _USE_INT8 + if (1 == num && 0 == qs[0].scale[0]) { // OP is labelled as no-quantization + if (DT_F16_8Q == this->dt) { + this->dt = DT_F16; + } + return; + } + featureScale.resize(num); + for (U32 i = 0; i < num; i++) { + featureScale[i].resize(qs[i].num_scale); + memcpy(featureScale[i].data(), qs[i].scale, qs[i].num_scale * bytesOf(DT_F32)); + } +#endif + } + +#ifdef _USE_INT8 + virtual void set_feature_scale(std::vector> fs) + { + this->featureScale = fs; + } + + virtual bool is_dynamic_scale() + { + OperatorType ot = this->get_type(); + if (OT_Conv != ot) { + return false; + } + + U32 numScale = featureScale.size(); + U32 numQuant = (DT_F16_8Q == this->dt) ? 
inputTensors.size() : 0; + + if (0 != numScale && 0 == featureScale[0][0]) { // OP is labelled as no-quantization + return false; + } + + if (0 != numScale && -2 == (featureScale.back())[0]) { // OP is labelled as fp-output + numScale = 0; + numQuant += 1; + } + + for (auto tensor : outputTensors) { + if (DT_I8 == tensor.get_desc().dt) { + numQuant++; + } + } + if (0 == numQuant) { + return false; + } + + if (0 == numScale) { + return true; + } + + CHECK_REQUIREMENT(numQuant == numScale); + return false; + } +#endif + + virtual bool checkOperator() + { + for (U32 i = 0; i < inputTensors.size(); i++) { + if (!tensorDescIsValid(inputTensors[i].get_desc())) { + return false; + } + } + for (U32 i = 0; i < outputTensors.size(); i++) { + if (!tensorDescIsValid(outputTensors[i].get_desc())) { + return false; + } + } + return true; + }; + + virtual OperatorType get_type() = 0; + + virtual EE infer_forward_algorithm(std::shared_ptr algorithmMap) + { + UNUSED(algorithmMap); + return SUCCESS; + } + + virtual void set_algorithm_map(std::shared_ptr algorithmMap) + { + this->algorithmMap = algorithmMap; + } + +protected: + ArchInfo archInfo; + DataType dt; + + std::vector inputTensors; + std::vector outputTensors; + std::vector tensorPos; + + U32 lenOfTemp; + Tensor temp; + + std::string name; + std::vector> featureScale; + std::shared_ptr algorithmMap; +}; + +#endif // _OPERATOR_H diff --git a/inference/engine/include/padding.hpp b/inference/engine/include/padding.hpp new file mode 100644 index 00000000..aebfa7b0 --- /dev/null +++ b/inference/engine/include/padding.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _PADDING_H +#define _PADDING_H + +#include "operator.hpp" + +class Padding : public Operator { +public: + Padding(DataType dt, PadParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_Pad; + } + +protected: + PadParamSpec p; +}; + +#endif // _PADDING_H diff --git a/inference/engine/include/pooling.hpp b/inference/engine/include/pooling.hpp new file mode 100644 index 00000000..1c0f8f7c --- /dev/null +++ b/inference/engine/include/pooling.hpp @@ -0,0 +1,47 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _POOLING_H +#define _POOLING_H + +#include "operator.hpp" + +class Pooling : public Operator { +public: + Pooling(PoolingParamSpec p) + { + this->p = p; + } + + OperatorType get_type() override + { + return OT_Pooling; + } + + void set_kernelSize(U32 globalKernelSizeH, U32 globalKernelSizeW) + { + this->p.kernel_h = globalKernelSizeH; + this->p.kernel_w = globalKernelSizeW; + } + + void set_stride(U32 globalStrideH, U32 globalStrideW) + { + this->p.stride_h = globalStrideH; + this->p.stride_w = globalStrideW; + } + +protected: + PoolingParamSpec p; +}; + +#endif // _POOLING_H diff --git a/inference/include/multiply.hpp b/inference/engine/include/power.hpp similarity index 66% rename from inference/include/multiply.hpp rename to inference/engine/include/power.hpp index ed8fd4ac..91a37389 100644 --- a/inference/include/multiply.hpp +++ b/inference/engine/include/power.hpp @@ -1,42 +1,36 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +#ifndef _POWER_H +#define _POWER_H -/** - * Project deploy - */ -#ifndef _MULTIPLY_H -#define _MULTIPLY_H #include "operator.hpp" -#include "tensor_computing.h" -class Multiply: public Operator { +class Power : public Operator { public: - Multiply(DataType dt, F32 scale, F32 bias) + Power(DataType dt, PowerParamSpec p) { this->dt = dt; - this->alpha = scale; - this->beta = bias; + this->p = p; } - OperatorType get_op_type() override + OperatorType get_type() override { - return OT_Multiply; + return OT_Power; } public: - F32 alpha; - F32 beta; + PowerParamSpec p; }; -#endif //_MULTIPLY_H +#endif // _POWER_H diff --git a/inference/engine/include/preallocated_memory.hpp b/inference/engine/include/preallocated_memory.hpp new file mode 100644 index 00000000..6a909c54 --- /dev/null +++ b/inference/engine/include/preallocated_memory.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _PREALLOCATED_MEMORY_H +#define _PREALLOCATED_MEMORY_H + +#include "operator.hpp" + +class PreAllocatedMemory : public Operator { +public: + PreAllocatedMemory(DataType dt, TensorDesc desc) + { + this->dt = dt; + this->desc = desc; + } + + OperatorType get_type() override + { + return OT_PreAllocatedMemory; + } + +protected: + TensorDesc desc; +}; + +#endif // _PREALLOCATED_MEMORY_H diff --git a/inference/engine/include/prelu.hpp b/inference/engine/include/prelu.hpp new file mode 100644 index 00000000..0a0e504c --- /dev/null +++ b/inference/engine/include/prelu.hpp @@ -0,0 +1,35 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _PRELU_H +#define _PRELU_H + +#include "weight_operator.hpp" + +class PReLU : public WeightOperator { public: + PReLU(DataType dt) + { + this->dt = dt; + } + + OperatorType get_type() override + { + return OT_PRelu; + } + +protected: + PReLUParamSpec preluDesc; +}; + +#endif // _PRELU_H diff --git a/inference/engine/include/prior_box.hpp b/inference/engine/include/prior_box.hpp new file mode 100644 index 00000000..4ee39b74 --- /dev/null +++ b/inference/engine/include/prior_box.hpp @@ -0,0 +1,54 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _PRIOR_BOX_H +#define _PRIOR_BOX_H + +#include "operator.hpp" + +class PriorBox : public Operator { public: + PriorBox(DataType dt, PriorBoxParamSpec p) + { + this->dt = dt; + this->p = p; + } + + std::shared_ptr<Operator> clone() override + { + std::shared_ptr<Operator> mem = std::shared_ptr<PriorBox>(new PriorBox(this->dt, this->p)); + *mem = *this; + return mem; + } + + OperatorType get_type() override + { + return OT_PriorBox; + } + + void run() override + { + CHECK_STATUS(priorbox(this->inputTensors, this->p, this->outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override + { + CHECK_STATUS(priorbox_infer_output_size(inTensors, this->p, outTensors[0], &this->archInfo)); + return SUCCESS; + } + +protected: + PriorBoxParamSpec p; +}; +#endif // _PRIOR_BOX_H diff --git a/inference/engine/include/reduction.hpp b/inference/engine/include/reduction.hpp new file mode 100644 index 00000000..4f3e776d --- /dev/null +++ b/inference/engine/include/reduction.hpp @@ -0,0 +1,83 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _REDUCTION_H +#define _REDUCTION_H + +#include "operator.hpp" + +class Reduction : public Operator { +public: + Reduction(DataType dt, ReductionParamSpec p) + { + this->dt = dt; + this->p = p; + } + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new Reduction(this->dt, this->p)); + *mem = *this; + return mem; + } + + OperatorType get_type() override + { + return OT_Reduction; + } + + void run() override + { + Tensor inputTensor = this->inputTensors[0]; + Tensor outputTensor = this->outputTensors[0]; + Tensor maskTensor; + if (this->inputTensors.size() > 1) { + maskTensor = this->inputTensors[1]; + } else { + TensorDesc maskDesc; + maskDesc.nDims = 0; + maskTensor.resize(maskDesc); + } + + CHECK_STATUS( + reduction(inputTensor, maskTensor, this->p, this->temp, outputTensor, &this->archInfo)); + } + + U32 infer_tmp_memory_size() override + { + U32 bytes = 0; + CHECK_STATUS(reduction_infer_forward_tmp_bytes( + this->inputTensors[0], this->p, this->outputTensors[0], &bytes, &this->archInfo)); + return bytes; + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + Tensor maskTensor; + if (inTensors.size() > 1) { + maskTensor = *(inTensors[1]); + } else { + TensorDesc maskDesc; + maskDesc.nDims = 0; + maskTensor.resize(maskDesc); + } + return reduction_infer_output_size(inTensors[0], maskTensor, this->p, outTensors[0]); + } + +private: + ReductionParamSpec p; +}; + +#endif diff --git a/inference/engine/include/relative_position_embedding.hpp b/inference/engine/include/relative_position_embedding.hpp new file mode 100644 index 00000000..3af4d378 --- /dev/null +++ b/inference/engine/include/relative_position_embedding.hpp @@ -0,0 +1,96 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _RELATIVE_POSITION_EMBEDDING_H +#define _RELATIVE_POSITION_EMBEDDING_H + +#include "cpu/embedding_cpu.hpp" + +class RelativePositionEmbedding : public EmbeddingCPU { +public: + RelativePositionEmbedding(DataType dt, EmbedParamSpec p) : EmbeddingCPU(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = std::shared_ptr( + new RelativePositionEmbedding(this->dt, this->p)); + *mem = *this; + return mem; + } + + OperatorType get_type() override + { + return OT_RelativePositionEmbedding; + } + + void run() override + { + Tensor inputTensor = this->inputTensors[0]; + Tensor weightTensor; + if (this->weightTensors.size() > 0) { + weightTensor = this->weightTensors[0]; + } else { + weightTensor = this->inputTensors[1]; + } + Tensor outputTensor = this->outputTensors[0]; + + TensorDesc inputDesc = inputTensor.get_desc(); + U8 *weightPtr = (U8 *)((CpuMemory *)weightTensor.get_memory())->get_ptr(); + U8 *outputPtr = (U8 *)((CpuMemory *)outputTensor.get_memory())->get_ptr(); + + I32 tmpAxis = (this->p.axis + inputDesc.nDims) % inputDesc.nDims; + U32 batch = inputDesc.dims[inputDesc.nDims - 1]; + U32 length = inputDesc.dims[inputDesc.nDims - 1 - tmpAxis]; + for (U32 in = 0; in < batch; in++) { + U8 *ptr = outputPtr + in * length * this->p.num_output * bytesOf(this->dt); + if (length > this->p.input_dim) { + U32 size = (length - this->p.input_dim) * this->p.num_output * bytesOf(this->dt); + memset(ptr, 0, size); + ptr += size; + } + U32 start = 0; + U32 copyLength = this->p.input_dim; + if (length < this->p.input_dim) { + start = this->p.input_dim - length; + copyLength = length; + } + if (this->p.transpose) { + for (U32 i = 0; i < copyLength; i++) { + for (U32 j = 0; j < this->p.num_output; j++) { + memcpy(ptr, + weightPtr + (j * this->p.input_dim + start + i) * bytesOf(this->dt), + bytesOf(this->dt)); + } + } + } else { + memcpy(ptr, weightPtr + start * this->p.num_output * bytesOf(this->dt), + copyLength * this->p.num_output * bytesOf(this->dt)); + } + } + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + TensorDesc inDim = inTensors[0]->get_desc(); + I32 tmpAxis = (this->p.axis + inDim.nDims) % inDim.nDims; + U32 batch = inDim.dims[inDim.nDims - 1]; + U32 length = inDim.dims[inDim.nDims - 1 - tmpAxis]; + TensorDesc outDim = tensor3df(this->dt, DF_MTK, batch, length, this->p.num_output); + outTensors[0]->resize(outDim); + return SUCCESS; + } +}; + +#endif // _RELATIVE_POSITION_EMBEDDING_H diff --git a/inference/include/relative_shift.hpp b/inference/engine/include/relative_shift.hpp similarity index 52% rename from inference/include/relative_shift.hpp rename to inference/engine/include/relative_shift.hpp index 422227bd..7fd0c11f 100644 --- a/inference/include/relative_shift.hpp +++ b/inference/engine/include/relative_shift.hpp @@ -1,90 +1,98 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _RELATIVE_SHIFT_H #define _RELATIVE_SHIFT_H + #include "operator.hpp" -class RelativeShift: public Operator { +class RelativeShift : public Operator { public: - RelativeShift(DataType dt, I32 axis, I32 shiftLength) + RelativeShift(DataType dt, RelativeShiftParamSpec p) + { + this->dt = dt; + this->p = p; + } + + std::shared_ptr clone() override { - this->dt = dt;; - this->axis = axis; - this->shiftLength = shiftLength; + std::shared_ptr mem = + std::shared_ptr(new RelativeShift(this->dt, this->p)); + *mem = *this; + return mem; } - OperatorType get_op_type() override + OperatorType get_type() override { return OT_RelativeShift; } void run() override { - UTIL_TIME_TIC(__CLASS_FUNCTION__) - Tensor inputTensor = this->inputTensors[0]; + Tensor inputTensor = this->inputTensors[0]; Tensor outputTensor = this->outputTensors[0]; - U8* inputPtr = inputTensor.get_val(); - U8* outputPtr = outputTensor.get_val(); + U8 *inputPtr = (U8 *)((CpuMemory *)(inputTensor.get_memory()))->get_ptr(); + U8 *outputPtr = (U8 *)((CpuMemory *)(outputTensor.get_memory()))->get_ptr(); TensorDesc inputDesc = inputTensor.get_desc(); - I32 tmpAxis = (this->axis + inputDesc.nDims) % inputDesc.nDims; + I32 tmpAxis = (this->p.axis + inputDesc.nDims) % inputDesc.nDims; tmpAxis = (I32)inputDesc.nDims - 1 - tmpAxis; U32 length = inputDesc.dims[tmpAxis]; if (tmpAxis + 1 >= (I32)inputDesc.nDims) { - memcpy(outputPtr, inputPtr, tensorNumBytes(inputDesc)); + U32 bytes = inputTensor.bytes(); + memcpy(outputPtr, inputPtr, bytes); return; } - U32 loops = inputDesc.dims[tmpAxis+1]; + U32 loops = inputDesc.dims[tmpAxis + 1]; U32 innerLength = 1; U32 outerLength = 1; for (I32 i = 0; i < tmpAxis; i++) { innerLength *= inputDesc.dims[i]; } - for (U32 i = tmpAxis+2; i < inputDesc.nDims; i++) { + for (U32 i = tmpAxis + 2; i < 
inputDesc.nDims; i++) { outerLength *= inputDesc.dims[i]; } U32 tileSize = innerLength * bytesOf(inputDesc.dt); U32 chunkSize = length * tileSize; - U8* dstPtr = outputPtr; + U8 *dstPtr = outputPtr; for (U32 i = 0; i < outerLength; i++) { - U8* srcPtr = inputPtr + i * loops * chunkSize; - U32 num = loops * length - (loops - shiftLength) * (shiftLength + length); - U32 start = shiftLength * length - num; + U8 *srcPtr = inputPtr + i * loops * chunkSize; + U32 num = + loops * length - (loops - this->p.shift_length) * (this->p.shift_length + length); + U32 start = this->p.shift_length * length - num; U32 srcIndex = start * tileSize; - memcpy(dstPtr, srcPtr+srcIndex, num*tileSize); + memcpy(dstPtr, srcPtr + srcIndex, num * tileSize); dstPtr += num * tileSize; srcIndex += num * tileSize; - for (U32 j = shiftLength; j < loops; j++) { - memset(dstPtr, 0, shiftLength*tileSize); - dstPtr += shiftLength * tileSize; - memcpy(dstPtr, srcPtr+srcIndex, chunkSize); + for (U32 j = this->p.shift_length; j < loops; j++) { + memset(dstPtr, 0, this->p.shift_length * tileSize); + dstPtr += this->p.shift_length * tileSize; + memcpy(dstPtr, srcPtr + srcIndex, chunkSize); dstPtr += chunkSize; srcIndex += chunkSize; } } - UTIL_TIME_TOC(__CLASS_FUNCTION__) } - EE infer_output_tensors_size(Vec inDims, Vec* outDims) override + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override { - (*outDims)[0] = inDims[0]; + outTensors[0]->resize(inTensors[0]->get_desc()); return SUCCESS; } + private: - int axis; - int shiftLength; + RelativeShiftParamSpec p; }; -#endif //_RELATIVE_SHIFT_H +#endif // _RELATIVE_SHIFT_H diff --git a/inference/engine/include/repeat.hpp b/inference/engine/include/repeat.hpp new file mode 100644 index 00000000..7d5e1471 --- /dev/null +++ b/inference/engine/include/repeat.hpp @@ -0,0 +1,42 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
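// Repeat below carries jump/next operator indices, and the base class exposes
// get_next_operator_index() (default -1). A simplified scheduler sketch of how
// an engine can honor it -- assumed control flow, not bolt's actual CNN loop:

#include <memory>
#include <vector>
#include "operator.hpp"

static void run_all(std::vector<std::shared_ptr<Operator>> &ops)
{
    int i = 0;
    while (i < (int)ops.size()) {
        ops[i]->run();
        int next = ops[i]->get_next_operator_index();
        i = (next >= 0) ? next : i + 1;  // -1 means fall through to the next op
    }
}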
+ +#ifndef _REPEAT_H +#define _REPEAT_H + +#include "operator.hpp" + +class Repeat : public Operator { +public: + Repeat(DataType dt, RepeatParamSpec p, I32 jumpOperatorIndex, I32 currentOperatorIndex) + { + this->dt = dt; + this->p = p; + this->iter = 0; + this->jumpOperatorIndex = jumpOperatorIndex; + this->nextOperatorIndex = currentOperatorIndex + 1; + } + + OperatorType get_type() override + { + return OT_Repeat; + } + +protected: + RepeatParamSpec p; + int iter; + int jumpOperatorIndex; + int nextOperatorIndex; +}; + +#endif // _REPEAT_H diff --git a/inference/engine/include/reshape.hpp b/inference/engine/include/reshape.hpp new file mode 100644 index 00000000..0a8f9f7b --- /dev/null +++ b/inference/engine/include/reshape.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _RESHAPE_H +#define _RESHAPE_H + +#include "operator.hpp" + +class Reshape : public Operator { +public: + Reshape(DataType dt, ReshapeParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_Reshape; + } + +protected: + ReshapeParamSpec p; +}; + +#endif // _RESHAPE_H diff --git a/inference/engine/include/resize.hpp b/inference/engine/include/resize.hpp new file mode 100644 index 00000000..981855b0 --- /dev/null +++ b/inference/engine/include/resize.hpp @@ -0,0 +1,41 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _RESIZE_H +#define _RESIZE_H + +#include "operator.hpp" + +class Resize : public Operator { +public: + Resize(DataType paramDT, ResizeParamSpec p) + { + if (paramDT == DT_F32 || paramDT == DT_U32) { + this->paramDT = paramDT; + this->p = p; + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + } + + OperatorType get_type() override + { + return OT_Resize; + } + +protected: + DataType paramDT; + ResizeParamSpec p; +}; + +#endif // _RESIZE_H diff --git a/inference/include/result_format.hpp b/inference/engine/include/result_format.hpp similarity index 79% rename from inference/include/result_format.hpp rename to inference/engine/include/result_format.hpp index 41344e7d..411579cc 100644 --- a/inference/include/result_format.hpp +++ b/inference/engine/include/result_format.hpp @@ -1,23 +1,21 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _RESULT_FORMAT_H #define _RESULT_FORMAT_H #include "tensor.hpp" -Vec topK_index(Tensor data, U32 topK); - +std::vector topK_index(U8 *res, TensorDesc desc, U32 topK); -#endif //_RESULT_FORMAT_H +#endif // _RESULT_FORMAT_H diff --git a/inference/engine/include/rnncell.hpp b/inference/engine/include/rnncell.hpp new file mode 100644 index 00000000..c898e9a4 --- /dev/null +++ b/inference/engine/include/rnncell.hpp @@ -0,0 +1,39 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _RNNCELL_H +#define _RNNCELL_H + +#include "weight_operator.hpp" + +class RNNCell : public WeightOperator { +public: + RNNCell(DataType dt, RNNParamSpec p) + { + this->dt = dt; + this->p = p; + this->hasBias = false; + } + + OperatorType get_type() override + { + return OT_RNN; + } + +public: + RNNParamSpec p; + U32 xDim; + ClipParamSpec clipParam; +}; + +#endif // _RNNCELL_H diff --git a/inference/include/scale.hpp b/inference/engine/include/scale.hpp similarity index 66% rename from inference/include/scale.hpp rename to inference/engine/include/scale.hpp index acedd315..e438dfdc 100644 --- a/inference/include/scale.hpp +++ b/inference/engine/include/scale.hpp @@ -1,55 +1,45 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _SCALE_H #define _SCALE_H -#include #include "weight_operator.hpp" -#include "tensor_computing.h" -#include "tensor_desc.h" -#include "op_type.h" -class Scale: public WeightOperator -{ +class Scale : public WeightOperator { public: - Scale(DataType dt, int axis, int numChannels, int numSource) + Scale(DataType dt, ScaleParamSpec p, int numChannels) { this->dt = dt; - this->axis = axis; - this->numSource = numSource; + this->p = p; this->numChannels = numChannels; this->dataID = 0; } - OperatorType get_op_type() override + OperatorType get_type() override { return OT_Scale; } - + bool can_input_output_the_same() override { return true; } - virtual EE init_weight_bias_from_model(U8** modelPtr) = 0; - protected: - int axis; + ScaleParamSpec p; U32 numChannels; - int numSource; // How many source tensors compose this input int dataID; }; -#endif //_SCALE_H +#endif // _SCALE_H diff --git a/inference/engine/include/sequential.hpp b/inference/engine/include/sequential.hpp new file mode 100644 index 00000000..28c7c67e --- /dev/null +++ b/inference/engine/include/sequential.hpp @@ -0,0 +1,228 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
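+// Usage sketch (illustrative, not part of this patch). The operator headers in
+// this patch follow one refactor pattern: loose constructor arguments (axis,
+// numSource, raw dims pointers) are folded into per-operator ParamSpec structs
+// from types.h, and get_op_type() becomes get_type(). A minimal construction
+// sketch for the two classes above; the ParamSpec field name is an assumption,
+// since types.h is not shown in this hunk:
+//
+//     ScaleParamSpec sp;
+//     sp.axis = 1;                // hypothetical field name
+//     Scale scale(DT_F32, sp, /*numChannels=*/64);
+//
+//     RNNParamSpec rp = {};       // filled from the parsed model
+//     RNNCell rnn(DT_F32, rp);    // hasBias starts out false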
+
+#ifndef _SEQUENTIAL_HPP
+#define _SEQUENTIAL_HPP
+
+#include "sys.h"
+#include "error.h"
+#include "types.h"
+#include "tensor.hpp"
+#include "operator.hpp"
+#include "convolution.hpp"
+#include "fully_connected.hpp"
+#include "cnn.h"
+#include "op_type.h"
+#include "tensor_desc.h"
+#include "sequential.hpp"
+#include "cpu/rnn_cpu.hpp"
+
+class Sequential : public CNN {
+public:
+    Sequential(AffinityPolicy affinityPolicy, DataType dt, std::string name)
+        : CNN(affinityPolicy, dt, name)
+    {}
+
+    void initialize_weight(std::shared_ptr<U8> _modelPtr)
+    {
+        this->modelPtr = _modelPtr;
+    }
+
+    EE infer_output_tensors_size(std::map<std::string, TensorDesc> inputDescMap) override
+    {
+        if (inputDescMap.size() != 1) {
+            return NOT_SUPPORTED;
+        }
+        std::vector<Tensor> inputTensors;
+        std::vector<Tensor *> inputTensorsPtr;
+        std::vector<TensorDesc> inDims;
+        std::vector<Tensor> outputTensors;
+        std::vector<Tensor *> outputTensorsPtr(1);
+        U32 count = 0;
+        for (auto iter : inputDescMap) {
+            Tensor tensor;
+            tensor.resize(iter.second);
+            inputTensors.push_back(tensor);
+            inDims.push_back(iter.second);
+            inputTensorsPtr.push_back(&inputTensors[count]);
+            count++;
+        }
+        this->dimsOp = {inDims};
+        auto num = [](std::vector<TensorDesc> inDims) -> U32 {
+            U32 ret = 0;
+            for (auto d : inDims) {
+                ret += tensorNumElements(d);
+            }
+            return ret;
+        };
+        maxOutputElements = num(inDims);
+
+        count = 0;
+        for (auto op : this->ops) {
+            Tensor tensor;
+            outputTensors.push_back(tensor);
+            outputTensorsPtr[0] = &outputTensors[count];
+            CHECK_STATUS(op->infer_output_tensors_size(inputTensorsPtr, outputTensorsPtr));
+            auto outDesc = outputTensorsPtr[0]->get_desc();
+            std::vector<TensorDesc> outDescVec;
+            outDescVec.push_back(outDesc);
+            dimsOp.push_back(outDescVec);
+            U32 numElements = tensorNumElements(outDesc);
+            if (maxOutputElements < numElements) {
+                maxOutputElements = numElements;
+            }
+            inputTensorsPtr[0] = &outputTensors[count];
+            count++;
+        }
+        return SUCCESS;
+    }
+
+    void assign_output_tensor() override
+    {
+        auto firstPtr = (U8 *)operator new(bytesOf(this->dt) * maxOutputElements);
+        std::shared_ptr<U8> firstSharedPtr(firstPtr);
+        auto secondPtr = (U8 *)operator new(bytesOf(this->dt) * maxOutputElements);
+        std::shared_ptr<U8> secondSharedPtr(secondPtr);
+        for (U32 i = 0; i < this->ops.size(); i++) {
+            auto op = this->ops[i];
+            auto inDims = dimsOp[i];
+            auto outDims = dimsOp[i + 1];
+
+            std::vector<Tensor> inTensors;
+            U32 index = 0;
+            for (auto d : inDims) {
+                auto val =
+                    std::shared_ptr<U8>(firstSharedPtr, (U8 *)firstPtr + index * bytesOf(this->dt));
+                Tensor tensor;
+                tensor.resize(d);
+                ((CpuMemory *)tensor.get_memory())->set_shared_ptr(val);
+                inTensors.push_back(tensor);
+                index += tensorNumElements(d);
+            }
+
+            std::vector<Tensor> outTensors;
+            index = 0;
+            for (auto d : outDims) {
+                auto val = std::shared_ptr<U8>(
+                    secondSharedPtr, (U8 *)secondPtr + index * bytesOf(this->dt));
+                Tensor tensor;
+                tensor.resize(d);
+                ((CpuMemory *)tensor.get_memory())->set_shared_ptr(val);
+                outTensors.push_back(tensor);
+                index += tensorNumElements(d);
+            }
+
+            op->set_input_output_tensors(inTensors, outTensors);
+
+            std::swap(firstPtr, secondPtr);
+            std::swap(firstSharedPtr, secondSharedPtr);
+        }
+    }
+
+    EE ConvBiasAssignmentAndWeightTransform()
+    {
+        return SUCCESS;
+    }
+
+    EE FCBiasAssignmentAndWeight()
+    {
+        return SUCCESS;
+    }
+
+    void ready(std::map<std::string, TensorDesc> inputDescMap) override
+    {
+        for (auto op : this->ops) {
+            op->set_schedule(this->deviceInfo.schedule);
+        }
+        this->infer_output_tensors_size(inputDescMap);
+        this->assign_output_tensor();
+
+        for (auto op : this->ops) {
+            if (op->is_weight()) {
+                if (op->get_type() == OT_Conv) {
+                    auto convOpPtr = dynamic_cast<Convolution *>(op.get());
+                    CHECK_STATUS(convOpPtr->init_weight_bias_from_model(&modelPtr));
+                    CHECK_STATUS(convOpPtr->infer_forward_algorithm(this->algorithmMap));
+                    CHECK_STATUS(convOpPtr->transform_filter());
+                } else if (op->get_type() == OT_FC) {
+                    auto fcOpPtr = dynamic_cast<FullyConnected *>(op.get());
+                    CHECK_STATUS(fcOpPtr->init_weight_bias_from_model(&modelPtr));
+                    CHECK_STATUS(fcOpPtr->transform_filter());
+                } else if (op->get_type() == OT_RNN) {
+                    auto rnnOpPtr = dynamic_cast<RNNCell *>(op.get());
+                    CHECK_STATUS(rnnOpPtr->init_weight_bias_from_model(&modelPtr));
+                    CHECK_STATUS(rnnOpPtr->transform_filter());
+                }
+            }
+        }
+
+        this->infer_tmp_memory_size();
+        this->assign_tmp_tensor();
+    }
+
+    void infer_tmp_memory_size() override
+    {
+        tmpElements.clear();
+        maxTmpElements = 0;
+
+        for (auto op : this->ops) {
+            auto len = op->infer_tmp_memory_size();
+            tmpElements.push_back(len);
+            if (len > maxTmpElements) {
+                maxTmpElements = len;
+            }
+        }
+    }
+
+    void assign_tmp_tensor() override
+    {
+        temp.resize(tensor1d(DT_U8, maxTmpElements));
+        temp.alloc();
+        for (auto op : this->ops) {
+            op->set_tmp_memory(temp);
+        }
+    }
+
+    void add(std::shared_ptr<Operator> op)
+    {
+        this->ops.push_back(op);
+    }
+
+    std::vector<Tensor> get_inputTensors()
+    {
+        auto op = this->ops[0].get();
+        return op->get_input_tensors();
+    }
+
+    std::vector<Tensor> get_output_tensors()
+    {
+        auto len = this->ops.size();
+        auto op = this->ops[len - 1].get();
+        return op->get_output_tensors();
+    }
+
+    void set_input_tensors(std::vector<Tensor> inputTensors)
+    {
+        auto op = this->ops[0].get();
+        op->set_input_tensors(inputTensors);
+    }
+
+private:
+    std::shared_ptr<U8> modelPtr;
+    U32 maxOutputElements;
+    std::vector<std::vector<TensorDesc>> dimsOp;
+    U32 maxTmpElements;
+    std::vector<U32> tmpElements;
+    Tensor temp;
+};
+#endif
diff --git a/inference/engine/include/sequential_ocl.hpp b/inference/engine/include/sequential_ocl.hpp
new file mode 100644
index 00000000..d4843e06
--- /dev/null
+++ b/inference/engine/include/sequential_ocl.hpp
@@ -0,0 +1,230 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
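+// Usage sketch (illustrative, not part of this patch). Typical use of the
+// Sequential pipeline above: operators are appended in execution order, shapes
+// are inferred from a single named input, and activations ping-pong between
+// the two shared buffers created in assign_output_tensor(). The affinity
+// value, input name and shape below are placeholders:
+//
+//     Sequential seq(affinityPolicy, DT_F32, "demo");
+//     seq.initialize_weight(weightBlob);   // std::shared_ptr<U8> into the weight data
+//     seq.add(convOp);                     // std::shared_ptr<Operator>, e.g. a Convolution
+//     seq.add(fcOp);                       // then a FullyConnected
+//     std::map<std::string, TensorDesc> in =
+//         {{"input", tensor4df(DT_F32, DF_NCHW, 1, 3, 224, 224)}};
+//     seq.ready(in);                       // infer sizes, load weights, assign buffers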
+
+#ifdef _USE_MALI
+#ifndef _SEQUENTIAL_OCL_HPP
+#define _SEQUENTIAL_OCL_HPP
+
+#include
+#include "sys.h"
+#include "error.h"
+#include "types.h"
+#include
+#include "tensor.hpp"
+#include "operator.hpp"
+#include "cnn.h"
+#include "op_type.h"
+#include "tensor_desc.h"
+#include "memory.hpp"
+#include "weight_operator.hpp"
+#include "pooling.hpp"
+#include "convolution.hpp"
+#include "bilateral_slice_apply.hpp"
+#include "ocl/pooling_ocl.hpp"
+#include "memory_ocl.hpp"
+#include "ocl/convolution_ocl.hpp"
+#include "ocl/bilateral_slice_apply_ocl.hpp"
+#include "ocl/fully_connected_ocl.hpp"
+#include "ocl/scale_ocl.hpp"
+
+class SequentialOcl : public CNN {
+public:
+    SequentialOcl(AffinityPolicy affinityPolicy, DataType dt, std::string name)
+        : CNN(affinityPolicy, dt, name)
+    {
+        input_output_same = false;
+    }
+    virtual ~SequentialOcl()
+    {}
+
+    EE ready(std::vector<TensorDesc> dims, std::shared_ptr<U8> modelPtr, U32 numOutput)
+    {
+        this->ops[0]->set_schedule(this->deviceInfo.schedule);
+        input_output_same = this->ops[0]->can_input_output_the_same();
+        CHECK_STATUS(this->infer_output_tensors_size(dims, numOutput));
+        std::vector<Tensor> inTensors;
+        std::vector<Tensor> outTensors;
+        for (U32 i = 0; i < inputTensors.size(); i++) {
+            inTensors.push_back(*inputTensors[i].get());
+        }
+        for (U32 i = 0; i < outputTensors.size(); i++) {
+            outTensors.push_back(*outputTensors[i].get());
+        }
+        this->ops[0]->set_input_output_tensors(inTensors, outTensors);
+        this->ops[0]->set_algorithm_map(this->algorithmMap);
+
+        if (this->ops[0]->is_weight()) {
+            if (this->ops[0]->get_type() == OT_Conv) {
+                auto convOpPtr = dynamic_cast<Convolution *>(this->ops[0].get());
+                auto weightOp = (WeightOperator *)convOpPtr;
+                weightOp->set_hasBias(true);
+                CHECK_STATUS(convOpPtr->init_weight_bias_from_model(&modelPtr));
+                CHECK_STATUS(convOpPtr->infer_forward_algorithm(this->algorithmMap));
+                CHECK_STATUS(convOpPtr->transform_filter());
+            }
+            if (this->ops[0]->get_type() == OT_FC) {
+                auto fcOpPtr = dynamic_cast<FullyConnected *>(this->ops[0].get());
+                auto weightOp = (WeightOperator *)fcOpPtr;
+                weightOp->set_hasBias(true);
+                CHECK_STATUS(fcOpPtr->init_weight_bias_from_model(&modelPtr));
+                CHECK_STATUS(fcOpPtr->transform_filter());
+            }
+            if (this->ops[0]->get_type() == OT_Scale) {
+                auto scaleOpPtr = dynamic_cast<Scale *>(this->ops[0].get());
+                auto weightOp = (WeightOperator *)scaleOpPtr;
+                weightOp->set_hasBias(true);
+                CHECK_STATUS(scaleOpPtr->init_weight_bias_from_model(&modelPtr));
+            }
+        }
+        this->infer_tmp_memory_size();
+        this->assign_tmp_tensor();
+        this->alloc_output_host_tensors(numOutput);
+        return SUCCESS;
+    }
+
+    EE infer_output_tensors_size(std::map<std::string, TensorDesc>) override
+    {
+        return NOT_SUPPORTED;
+    }
+
+    void assign_output_tensor() override
+    {}
+
+    EE infer_output_tensors_size(std::vector<TensorDesc> dims, U32 outputTensorNum)
+    {
+        std::vector<Tensor *> inTensors;
+        std::vector<Tensor *> outTensors;
+        for (U32 i = 0; i < dims.size(); ++i) {
+            std::shared_ptr<Tensor> tmpTensor(new Tensor(OCLMem));
+            tmpTensor->resize(dims[i]);
+            inputTensors.push_back(tmpTensor);
+            inTensors.push_back(inputTensors[i].get());
+        }
+        for (U32 i = 0; i < outputTensorNum; ++i) {
+            std::shared_ptr<Tensor> tmpTensor(new Tensor(OCLMem));
+            outputTensors.push_back(tmpTensor);
+            outTensors.push_back(outputTensors[i].get());
+        }
+
+        CHECK_STATUS(this->ops[0]->infer_output_tensors_size(inTensors, outTensors));
+        for (auto p : inTensors) {
+            p->alloc();
+        }
+        return SUCCESS;
+    }
+
+    EE infer_gclmem_descs(std::map<std::string, TensorDesc>)
+    {
+        return NOT_SUPPORTED;
+    }
+
+    void alloc_output_host_tensors(U32 outputTensorNum)
+    {
+        for (U32 i = 0; i < outputTensorNum; i++) {
+            auto mem = (OclMemory *)outputTensors[i]->get_memory();
+            mem->mapped_alloc();
+        }
+    }
+
+    void infer_tmp_memory_size() override
+    {
+        maxTmpElements = 0;
+        for (auto op : this->ops) {
+            auto len = op->infer_tmp_memory_size();
+            if (len > maxTmpElements) {
+                maxTmpElements = len;
+            }
+        }
+    }
+
+    void assign_tmp_tensor() override
+    {
+        this->temp = Tensor(OCLMem);
+        if (maxTmpElements) {
+            temp.resize(tensor1d(DT_U8, maxTmpElements));
+            temp.alloc();
+        }
+        for (auto op : this->ops) {
+            op->set_tmp_memory(temp);
+        }
+    }
+
+    void add(std::shared_ptr<Operator> op)
+    {
+        this->ops.push_back(op);
+    }
+
+    void mark_input_output()
+    {
+        if (this->deviceInfo.schedule == MALI) {
+            U32 tmpBufSize = 0;
+            for (U32 i = 0; i < inputTensors.size(); i++) {
+                Tensor *inputTensor = inputTensors[i].get();
+                TensorDesc desc = inputTensor->get_desc();
+                U32 size = tensorNumBytes(desc);
+                ArchInfo archInfo;
+                archInfo.arch = MALI;
+                tmpBufSize = (tmpBufSize < size) ? size : tmpBufSize;
+            }
+
+            if (tmpBufSize > maxTmpElements) {
+                maxTmpElements = tmpBufSize;
+            }
+            temp.resize(tensor1d(DT_U8, maxTmpElements));
+            temp.alloc();
+        }
+    }
+
+    void set_input_tensors(std::vector<Tensor> modelInputTensors)
+    {
+        for (U32 i = 0; i < modelInputTensors.size(); i++) {
+            auto hostMem = (CpuMemory *)modelInputTensors[i].get_memory();
+            U8 *hostPtr = (U8 *)hostMem->get_ptr();
+            TensorDesc hostDesc = modelInputTensors[i].get_desc();
+            auto *mem = (OclMemory *)inputTensors[i]->get_memory();
+            GCLMem_t input = (GCLMem_t)mem->get_ptr();
+            auto *tmpmem = (OclMemory *)temp.get_memory();
+            GCLMem_t tmp = (GCLMem_t)tmpmem->get_ptr();
+            CHECK_STATUS(ocl_set_input(this->handle.get(), input, hostDesc, hostPtr, tmp, true));
+        }
+        gcl_finish(this->handle.get());
+    }
+
+    std::vector<std::shared_ptr<Tensor>> get_output_tensors()
+    {
+        return this->outputTensors;
+    }
+
+#ifdef _USE_MALI
+#else
+    EE ConvBiasAssignmentAndWeightTransform()
+    {
+        return SUCCESS;
+    }
+
+    EE FCBiasAssignmentAndWeight()
+    {
+        return SUCCESS;
+    }
+#endif
+
+private:
+    using Model::ready;
+    U32 maxTmpElements;
+    Tensor temp;
+    std::vector<std::shared_ptr<Tensor>> inputTensors;
+    std::vector<std::shared_ptr<Tensor>> outputTensors;
+    bool input_output_same;
+};
+#endif
+#endif
diff --git a/inference/engine/include/shape.hpp b/inference/engine/include/shape.hpp
new file mode 100644
index 00000000..a67d30c0
--- /dev/null
+++ b/inference/engine/include/shape.hpp
@@ -0,0 +1,30 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
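+// Usage sketch (illustrative, not part of this patch). SequentialOcl above is
+// the single-operator GPU path: ready() takes explicit input descriptors plus
+// the weight blob, and alloc_output_host_tensors() uses mapped_alloc() so the
+// host can read results without an extra copy. Variable names are placeholders:
+//
+//     SequentialOcl net(affinityPolicy, DT_F16, "gpu_op");
+//     net.add(oclConvOp);                           // std::shared_ptr<Operator> built for MALI
+//     std::vector<TensorDesc> dims = {tensor4df(DT_F16, DF_NCHW, 1, 4, 64, 64)};
+//     CHECK_STATUS(net.ready(dims, weightBlob, 1)); // one mapped output tensor
+//     net.set_input_tensors(hostInputs);            // copies host data in via ocl_set_input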
+
+#ifndef _SHAPE_H
+#define _SHAPE_H
+
+#include "operator.hpp"
+
+class Shape : public Operator {
+public:
+    Shape()
+    {}
+
+    OperatorType get_type() override
+    {
+        return OT_Shape;
+    }
+};
+
+#endif  // _SHAPE_H
diff --git a/inference/engine/include/shared_weight.hpp b/inference/engine/include/shared_weight.hpp
new file mode 100644
index 00000000..d1d0e4d7
--- /dev/null
+++ b/inference/engine/include/shared_weight.hpp
@@ -0,0 +1,43 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _SHARED_WEIGHT_H
+#define _SHARED_WEIGHT_H
+
+#include "weight_operator.hpp"
+
+class SharedWeight : public WeightOperator {
+public:
+    SharedWeight(DataType dt,
+        TensorDesc desc,
+        std::string outputTensorName,
+        std::map<std::string, std::shared_ptr<Tensor>> *tensorMapPtr)
+    {
+        this->dt = dt;
+        this->desc = desc;
+        this->outputTensorName = outputTensorName;
+        this->tensorMapPtr = tensorMapPtr;
+    }
+
+    OperatorType get_type() override
+    {
+        return OT_SharedWeight;
+    }
+
+protected:
+    TensorDesc desc;
+    std::string outputTensorName;
+    std::map<std::string, std::shared_ptr<Tensor>> *tensorMapPtr;
+};
+
+#endif  // _SHARED_WEIGHT_H
diff --git a/inference/include/slice.hpp b/inference/engine/include/slice.hpp
similarity index 66%
rename from inference/include/slice.hpp
rename to inference/engine/include/slice.hpp
index 5c5b517e..34568a91 100644
--- a/inference/include/slice.hpp
+++ b/inference/engine/include/slice.hpp
@@ -1,41 +1,36 @@
 // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
 // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _SLICE_H #define _SLICE_H #include "operator.hpp" -#include "tensor_computing.h" -class Slice: public Operator { +class Slice : public Operator { public: - Slice(DataType dt, I32 axis, I32* slicePointsPtr, I32 sliceSize) + Slice(DataType dt, SliceParamSpec p) { this->dt = dt; - this->axis = axis; - this->slicePoints = Vec(sliceSize); - memcpy(this->slicePoints.data(), slicePointsPtr, sizeof(I32) * sliceSize); + this->p = p; } - OperatorType get_op_type() override + OperatorType get_type() override { return OT_Slice; } protected: - Vec slicePoints; - I32 axis; + SliceParamSpec p; }; -#endif //_SLICE_H +#endif // _SLICE_H diff --git a/inference/include/softmax.hpp b/inference/engine/include/softmax.hpp similarity index 75% rename from inference/include/softmax.hpp rename to inference/engine/include/softmax.hpp index cb92aa3b..60cda0fb 100644 --- a/inference/include/softmax.hpp +++ b/inference/engine/include/softmax.hpp @@ -1,37 +1,35 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _SOFTMAX_H #define _SOFTMAX_H #include "operator.hpp" -#include "tensor_computing.h" class Softmax : public Operator { public: - explicit Softmax(DataType dt, int axis) + explicit Softmax(DataType dt, SoftmaxParamSpec p) { this->dt = dt; - this->axis = axis; - this->lenOfTemp = 0; + this->p = p; } - OperatorType get_op_type() override + OperatorType get_type() override { return OT_Softmax; } + protected: - int axis; + SoftmaxParamSpec p; }; -#endif //_SOFTMAX_H +#endif // _SOFTMAX_H diff --git a/inference/include/space2depth.hpp b/inference/engine/include/space2depth.hpp similarity index 78% rename from inference/include/space2depth.hpp rename to inference/engine/include/space2depth.hpp index 5c7874a6..592c4903 100644 --- a/inference/include/space2depth.hpp +++ b/inference/engine/include/space2depth.hpp @@ -1,38 +1,32 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _SPACE2DEPTH_H #define _SPACE2DEPTH_H #include "operator.hpp" -class Space2Depth: public Operator -{ +class Space2Depth : public Operator { public: - /** - @param mode - */ Space2Depth(DataType dt) { this->dt = dt; } - OperatorType get_op_type() override + OperatorType get_type() override { return OT_Space2Depth; } - }; -#endif //_SPACE2DEPTH_H +#endif // _SPACE2DEPTH_H diff --git a/inference/engine/include/splice.hpp b/inference/engine/include/splice.hpp new file mode 100644 index 00000000..31b024c3 --- /dev/null +++ b/inference/engine/include/splice.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2019. 
Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _SPLICE_H +#define _SPLICE_H + +#include "weight_operator.hpp" + +class Splice : public WeightOperator { +public: + Splice(DataType dt, SpliceParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_Splice; + } + +protected: + SpliceParamSpec p; +}; + +#endif // _EMBEDDING__H diff --git a/inference/include/squeeze.hpp b/inference/engine/include/squeeze.hpp similarity index 68% rename from inference/include/squeeze.hpp rename to inference/engine/include/squeeze.hpp index 63964494..978b7217 100644 --- a/inference/include/squeeze.hpp +++ b/inference/engine/include/squeeze.hpp @@ -1,44 +1,36 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - #ifndef _SQUEEZE_H #define _SQUEEZE_H #include "operator.hpp" -class Squeeze: public Operator -{ +class Squeeze : public Operator { public: - /** - @param mode - */ - Squeeze(DataType dt, I32 axis, I32 *dims, I32 dimSize) + Squeeze(DataType dt, SqueezeParamSpec p) { this->dt = dt; - this->axis = axis; - this->dims = Vec(dimSize); - memcpy(this->dims.data(), dims, sizeof(I32) * dimSize); + this->p = p; } - OperatorType get_op_type() override + OperatorType get_type() override { return OT_Squeeze; } protected: - I32 axis; - Vec dims; + SqueezeParamSpec p; }; -#endif //_SQUEEZE_H +#endif // _SQUEEZE_H diff --git a/inference/engine/include/tfslice.hpp b/inference/engine/include/tfslice.hpp new file mode 100644 index 00000000..6deb2f1d --- /dev/null +++ b/inference/engine/include/tfslice.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _TFSLICE_H +#define _TFSLICE_H + +#include "operator.hpp" + +class TfSlice : public Operator { +public: + TfSlice(DataType dt, TfSliceParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_TfSlice; + } + +protected: + TfSliceParamSpec p; +}; + +#endif // _TFSLICE_H diff --git a/inference/engine/include/tile.hpp b/inference/engine/include/tile.hpp new file mode 100644 index 00000000..c1cc34ed --- /dev/null +++ b/inference/engine/include/tile.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _TILE_H +#define _TILE_H + +#include "operator.hpp" + +class Tile : public Operator { +public: + Tile(DataType dt, TileParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_Tile; + } + +protected: + TileParamSpec p; +}; + +#endif // _TILE_H diff --git a/inference/include/transpose.hpp b/inference/engine/include/transpose.hpp similarity index 67% rename from inference/include/transpose.hpp rename to inference/engine/include/transpose.hpp index 6dcc0213..0de95a88 100644 --- a/inference/include/transpose.hpp +++ b/inference/engine/include/transpose.hpp @@ -1,38 +1,36 @@ // Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
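+// Usage sketch (illustrative, not part of this patch). TfSlice and Tile above
+// are thin wrappers that only store their ParamSpec and expose get_type();
+// shape inference and execution live in the shared operator machinery. E.g.:
+//
+//     TileParamSpec tp = {};      // repeat factors, parsed from the model file
+//     Tile tile(DT_F32, tp);
+//     // tile.get_type() == OT_Tile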
- #ifndef _TRANSPOSE_H #define _TRANSPOSE_H -#include #include "operator.hpp" -#include "tensor_computing.h" -class Transpose: public Operator { +class Transpose : public Operator { public: - Transpose(DataType dt, U32* transDimsPtr, U32 transDimsSize){ + Transpose(DataType dt, TransposeParamSpec p) + { this->dt = dt; - this->transDims = Vec(transDimsSize); - memcpy(this->transDims.data(), transDimsPtr, sizeof(U32) * transDimsSize); + this->p = p; } - OperatorType get_op_type() override + OperatorType get_type() override { return OT_Transpose; } + protected: - Vec transDims; + TransposeParamSpec p; }; -#endif //_TRANSPOSE_H +#endif // _TRANSPOSE_H diff --git a/inference/engine/include/unsqueeze.hpp b/inference/engine/include/unsqueeze.hpp new file mode 100644 index 00000000..8f0e82c8 --- /dev/null +++ b/inference/engine/include/unsqueeze.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _UNSQUEEZE_H +#define _UNSQUEEZE_H + +#include "operator.hpp" + +class Unsqueeze : public Operator { +public: + Unsqueeze(DataType dt, UnsqueezeParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_Unsqueeze; + } + +protected: + UnsqueezeParamSpec p; +}; + +#endif // _UNSQUEEZE_H diff --git a/inference/engine/include/weight_operator.hpp b/inference/engine/include/weight_operator.hpp new file mode 100644 index 00000000..83e94f29 --- /dev/null +++ b/inference/engine/include/weight_operator.hpp @@ -0,0 +1,195 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _WEIGHTOPERATOR_H
+#define _WEIGHTOPERATOR_H
+
+#include "operator.hpp"
+#include "model_tools.h"
+
+class WeightOperator : public Operator {
+public:
+    WeightOperator()
+    {
+        this->hasBias = false;
+        this->lenOfWtm = 0;
+
+        this->ws.mdt = DT_U8;
+        this->ws.bytes_of_weight = 0;
+        this->ws.weight = nullptr;
+        this->ws.bytes_of_vec = 0;
+        this->ws.vec = nullptr;
+    }
+
+    bool is_weight() override
+    {
+        return true;
+    }
+
+    U32 get_weight_size()
+    {
+        U32 ret = 0;
+        for (auto tensor : this->weightTensors) {
+            TensorDesc desc = tensor.get_desc();
+            ret += tensorNumBytes(desc);
+        }
+        return ret;
+    }
+
+    virtual void set_weight_tensors(std::vector<Tensor> weightTensors)
+    {
+        this->weightTensors = weightTensors;
+    }
+
+    virtual std::vector<Tensor> get_weight_tensors()
+    {
+        return this->weightTensors;
+    }
+
+    virtual void set_bias_tensors(std::vector<Tensor> biasTensors)
+    {
+        this->biasTensors = biasTensors;
+    }
+
+    virtual std::vector<Tensor> get_bias_tensors()
+    {
+        return this->biasTensors;
+    }
+
+    virtual U32 infer_wtm_memory_size()
+    {
+        this->lenOfWtm = 0;
+        this->wtm = std::shared_ptr<Tensor>();
+        return 0;
+    }
+
+#ifdef _USE_MALI
+    virtual GCLMemDesc infer_wtm_memory_size_mali()
+    {
+        this->lenOfWtm = 0;
+        this->wtm = std::shared_ptr<Tensor>();
+        U32 stride[3] = {0, 0, 0};
+        U32 offset[3] = {0, 0, 0};
+        GCLMemDesc tmpdesc = gcl_mem_desc(stride, offset, DT_U8, DF_NCWHC4);
+        return tmpdesc;
+    }
+#endif
+
+    virtual void set_wtm_memory(U32 len, Tensor wtm)
+    {
+        this->lenOfWtm = len;
+        this->temp = wtm;
+    }
+
+    virtual U32 get_lenOfWtm()
+    {
+        return this->lenOfWtm;
+    }
+
+    virtual Tensor *get_wtm()
+    {
+        return this->wtm.get();
+    }
+
+    virtual void set_weightspec_ptr(WeightSpec ws)
+    {
+        this->ws = ws;
+    }
+
+    virtual WeightSpec get_weightspec()
+    {
+        return this->ws;
+    }
+
+    virtual void set_hasBias(bool hasBiasOrNot)
+    {
+        this->hasBias = hasBiasOrNot;
+    }
+
+    virtual EE infer_weight_desc()
+    {
+        return SUCCESS;
+    }
+
+    virtual EE init_weight_bias_from_model(std::shared_ptr<U8> *modelPtr)
+    {
+        EE ret = this->infer_weight_desc();
+        if (ret != SUCCESS) {
+            return ret;
+        }
+        auto curOpWs = this->get_weightspec();
+        CpuMemory weight_mem_src, bias_mem_src;
+        std::shared_ptr<U8> weight_ptr, bias_ptr;
+        if (modelPtr != nullptr) {
+            weight_ptr = *modelPtr;
+            bias_ptr = *modelPtr;
+        } else {
+            weight_ptr = std::shared_ptr<U8>(curOpWs.weight);
+            bias_ptr = std::shared_ptr<U8>(curOpWs.vec);
+        }
+
+        U32 weight_offset = 0;
+        for (auto weight_tensor : this->weightTensors) {
+            TensorDesc desc = weight_tensor.get_desc();
+            auto weight_mem_dst = weight_tensor.get_memory();
+            weight_mem_src.resize(desc);
+            weight_mem_src.set_shared_ptr(
+                std::shared_ptr<U8>(weight_ptr, weight_ptr.get() + weight_offset));
+            weight_mem_dst->reuse(&weight_mem_src);
+            weight_offset += tensorNumBytes(desc);
+        }
+
+        U32 bias_offset = (modelPtr != nullptr) ? weight_offset : 0;
+        if (this->hasBias) {
+            for (auto bias_tensor : this->biasTensors) {
+                TensorDesc desc = bias_tensor.get_desc();
+                auto bias_mem_dst = bias_tensor.get_memory();
+                bias_mem_src.resize(desc);
+                bias_mem_src.set_shared_ptr(
+                    std::shared_ptr<U8>(bias_ptr, bias_ptr.get() + bias_offset));
+                bias_mem_dst->reuse(&bias_mem_src);
+                bias_offset += tensorNumBytes(desc);
+            }
+        } else {
+            for (auto bias_tensor : this->biasTensors) {
+                TensorDesc desc = bias_tensor.get_desc();
+                auto bias_mem_dst = bias_tensor.get_memory();
+                bias_mem_src.resize(desc);
+                bias_mem_src.alloc();
+                U8 *tmp = (U8 *)bias_mem_src.get_ptr();
+                memset(tmp, 0, bias_mem_src.bytes());
+                bias_mem_dst->reuse(&bias_mem_src);
+            }
+        }
+        if (modelPtr != nullptr) {
+            *modelPtr = std::shared_ptr<U8>(bias_ptr, bias_ptr.get() + bias_offset);
+        }
+        return SUCCESS;
+    }
+
+    virtual EE transform_filter()
+    {
+        return SUCCESS;
+    }
+
+protected:
+    std::vector<Tensor> weightTensors;
+    std::vector<Tensor> biasTensors;
+    bool hasBias;
+
+    U32 lenOfWtm;
+    std::shared_ptr<Tensor> wtm;
+    WeightSpec ws;
+};
+
+#endif  // _WEIGHTOPERATOR_H
diff --git a/inference/engine/include/yolov3_detection_output.hpp b/inference/engine/include/yolov3_detection_output.hpp
new file mode 100644
index 00000000..1c4f6188
--- /dev/null
+++ b/inference/engine/include/yolov3_detection_output.hpp
@@ -0,0 +1,57 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
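+// Editorial note (not part of this patch) on the contract of
+// init_weight_bias_from_model() above: when a model pointer is passed in,
+// weights (and biases, when present) are consumed from one contiguous blob and
+// *modelPtr is advanced past everything read, so a caller can walk all weight
+// operators over a single allocation. Names below are hypothetical:
+//
+//     std::shared_ptr<U8> cursor = modelBlob;
+//     for (auto &op : weightOps) {
+//         CHECK_STATUS(op->init_weight_bias_from_model(&cursor));  // cursor moves forward
+//     }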
+ +#ifndef _YOLOV3_DETECTION_OUTPUT_H +#define _YOLOV3_DETECTION_OUTPUT_H + +#include "operator.hpp" + +class Yolov3DetectionOutput : public Operator { +public: + Yolov3DetectionOutput(DataType dt, Yolov3DetectionOutputParamSpec p) + { + this->dt = dt; + this->p = p; + } + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new Yolov3DetectionOutput(this->dt, this->p)); + *mem = *this; + return mem; + } + + OperatorType get_type() override + { + return OT_Yolov3DetectionOutput; + } + + void run() override + { + CHECK_STATUS(yolov3detectionoutput( + this->inputTensors, this->p, this->outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + CHECK_STATUS(yolov3detectionoutput_infer_output_size( + inTensors, this->p, outTensors[0], &this->archInfo)); + return SUCCESS; + } + +protected: + Yolov3DetectionOutputParamSpec p; +}; +#endif // _YOLOV3_DETECTION_OUTPUT_H diff --git a/inference/engine/src/BoltModel_Jni.cpp b/inference/engine/src/BoltModel_Jni.cpp new file mode 100644 index 00000000..143136a6 --- /dev/null +++ b/inference/engine/src/BoltModel_Jni.cpp @@ -0,0 +1,558 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
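+// Usage sketch (illustrative, not part of this patch). The JNI layer below is
+// a thin marshalling shim over the C API declared in api/c/bolt.h; each Java
+// method maps onto one handle call. The native lifecycle, using only functions
+// that appear in this file (argument values are placeholders):
+//
+//     ModelHandle ih = CreateModel("model.bolt", CPU_HIGH_PERFORMANCE, NULL);
+//     PrepareModel(ih, 1, names, ns, cs, hs, ws, dts, dfs);
+//     ResultHandle ir = AllocAllResultHandle(ih);
+//     RunModel(ih, ir, 1, names, dataPtrs);
+//     FreeResultHandle(ir);
+//     DestroyModel(ih);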
+ +#ifdef _USE_JNI +#include +#include "cnn.h" +#include "BoltModel.h" +#include "../api/c/bolt.h" + +struct ModelHandleInfo { + void *cnn; + DEVICE_TYPE deviceType; + void *algoPath; + bool useFileStream; +}; + +typedef struct { + U32 dims[4] = {0}; + char name[NAME_LEN] = {0}; + DataType dt; + DataFormat df; + void *dataPtr; +} DataDesc; + +typedef struct { + U32 num_outputs; + DataDesc *outputArr; + DEVICE_TYPE deviceType; +} ResultHandleInner; + +AFFINITY_TYPE str2AFFINITY_TYPE(std::string affinity_str) +{ + AFFINITY_TYPE ret = CPU_HIGH_PERFORMANCE; + if (affinity_str == "CPU_AFFINITY_HIGH_PERFORMANCE") { + ret = CPU_HIGH_PERFORMANCE; + } else if (affinity_str == "CPU_AFFINITY_LOW_POWER") { + ret = CPU_LOW_POWER; + } else if (affinity_str == "GPU") { + ret = GPU; + } else { + UNI_ERROR_LOG("unsupported JNI CPU affinity setting %s\n", affinity_str.c_str()); + } + return ret; +} + +DEVICE_TYPE str2DEVICE_TYPE(std::string device_str) +{ + DEVICE_TYPE ret = CPU_ARM_V8; + if (device_str == "CPU_ARM_V7") { + ret = CPU_ARM_V7; + } else if (device_str == "CPU_ARM_V8") { + ret = CPU_ARM_V8; + } else if (device_str == "CPU_ARM_A55") { + ret = CPU_ARM_A55; + } else if (device_str == "CPU_ARM_A76") { + ret = CPU_ARM_A76; + } else if (device_str == "GPU_MALI") { + ret = GPU_MALI; + } else if (device_str == "CPU_X86_AVX2") { + ret = CPU_X86_AVX2; + } else if (device_str == "CPU_SERIAL") { + ret = CPU_SERIAL; + } else { + UNI_ERROR_LOG("unsupported JNI device setting %s\n", device_str.c_str()); + } + return ret; +} + +DATA_TYPE str2DATA_TYPE(std::string data_type) +{ + DATA_TYPE ret = FP_32; + if (data_type == "FP32") { + ret = FP_32; +#ifdef __aarch64__ + } else if (data_type == "FP16") { + ret = FP_16; +#endif + } else if (data_type == "INT32") { + ret = INT_32; + } else if (data_type == "UINT32") { + ret = UINT_32; + } else { + UNI_ERROR_LOG("unsupported JNI data type setting %s\n", data_type.c_str()); + } + return ret; +} + +DATA_FORMAT str2DATA_FORMAT(std::string data_format) +{ + DATA_FORMAT ret = NCHW; + if (data_format == "NCHW") { + ret = NCHW; + } else if (data_format == "NHWC") { + ret = NHWC; + } else if (data_format == "MTK") { + ret = MTK; + } else if (data_format == "NORMAL") { + ret = NORMAL; + } else { + UNI_ERROR_LOG("unsupported JNI data format setting %s\n", data_format.c_str()); + } + return ret; +} + +std::string DataFormat2str(DataFormat data_format) +{ + std::string ret = "NCHW"; + switch (data_format) { + case DF_NCHW: + ret = "NCHW"; + break; + case DF_NCHWC8: + ret = "NCHWC8"; + break; + case DF_NHWC: + ret = "NHWC"; + break; + case DF_MTK: + ret = "MTK"; + break; + case DF_NORMAL: + ret = "NORMAL"; + break; + default: + UNI_ERROR_LOG("unsupported JNI data format setting %d\n", data_format); + } + return ret; +} + +extern "C" JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_createModel)( + JNIEnv *env, jobject, jstring modelPath, jstring affinity) +{ + const char *modelPathPtr = env->GetStringUTFChars(modelPath, JNI_FALSE); + const char *affinityPtr = env->GetStringUTFChars(affinity, JNI_FALSE); + std::string affinity_str = (std::string)affinityPtr; + AFFINITY_TYPE affinity_cur = str2AFFINITY_TYPE(affinity_str); + long modelAddr = (long)CreateModel(modelPathPtr, affinity_cur, NULL); + ModelHandleInfo *ihInfo = (ModelHandleInfo *)modelAddr; + if (nullptr == ihInfo->cnn) { + UNI_ERROR_LOG("Bolt instance not created\n"); + modelAddr = 0; + } + env->ReleaseStringUTFChars(modelPath, modelPathPtr); + env->ReleaseStringUTFChars(affinity, affinityPtr); + return modelAddr; +} + 
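+// Editorial note (not part of this patch): every entry point below repeats the
+// same GetStringUTFChars / copy / ReleaseStringUTFChars sequence for jstring
+// arguments. The pattern each call site expands to, as a hypothetical helper:
+//
+//     static std::string jstring2string(JNIEnv *env, jstring js)
+//     {
+//         const char *p = env->GetStringUTFChars(js, JNI_FALSE);
+//         std::string s(p);
+//         env->ReleaseStringUTFChars(js, p);
+//         return s;
+//     }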
+extern "C" JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_cloneModel)( + JNIEnv *env, jobject, jlong modelAddr) +{ + ModelHandle handle = (ModelHandle)modelAddr; + ModelHandle cloneHandle = CloneModel(handle); + long ret = (long)cloneHandle; + return ret; +} + +void getInputParameters(JNIEnv *env, + jint num, + jobjectArray input_names, + char ***data_name_ptr, + jintArray n, + int **data_n_ptr, + jintArray c, + int **data_c_ptr, + jintArray h, + int **data_h_ptr, + jintArray w, + int **data_w_ptr, + jobjectArray dt_input, + DATA_TYPE **data_dt_ptr, + jobjectArray df_input, + DATA_FORMAT **data_df_ptr) +{ + if (env->GetArrayLength(input_names) != num) { + UNI_ERROR_LOG("input name array length %d is not equal to input num %d\n", + env->GetArrayLength(input_names), num); + } + if (env->GetArrayLength(n) != num) { + UNI_ERROR_LOG( + "input N array length %d is not equal to input num %d\n", env->GetArrayLength(n), num); + } + if (env->GetArrayLength(c) != num) { + UNI_ERROR_LOG( + "input C array length %d is not equal to input num %d\n", env->GetArrayLength(c), num); + } + if (env->GetArrayLength(h) != num) { + UNI_ERROR_LOG( + "input H array length %d is not equal to input num %d\n", env->GetArrayLength(h), num); + } + if (env->GetArrayLength(w) != num) { + UNI_ERROR_LOG( + "input W array length %d is not equal to input num %d\n", env->GetArrayLength(w), num); + } + if (env->GetArrayLength(dt_input) != num) { + UNI_ERROR_LOG("input DataType array length %d is not equal to input num %d\n", + env->GetArrayLength(dt_input), num); + } + if (env->GetArrayLength(df_input) != num) { + UNI_ERROR_LOG("input DataFormat array length %d is not equal to input num %d\n", + env->GetArrayLength(df_input), num); + } + int *data_n = (int *)malloc(num * sizeof(int)); + int *data_c = (int *)malloc(num * sizeof(int)); + int *data_h = (int *)malloc(num * sizeof(int)); + int *data_w = (int *)malloc(num * sizeof(int)); + char **data_name = (char **)malloc(num * sizeof(char *)); + DATA_TYPE *data_dt = (DATA_TYPE *)malloc(num * sizeof(DATA_TYPE)); + DATA_FORMAT *data_df = (DATA_FORMAT *)malloc(num * sizeof(DATA_FORMAT)); + jint *curArray_n = env->GetIntArrayElements(n, 0); + jint *curArray_c = env->GetIntArrayElements(c, 0); + jint *curArray_h = env->GetIntArrayElements(h, 0); + jint *curArray_w = env->GetIntArrayElements(w, 0); + for (int i = 0; i < num; i++) { + data_n[i] = curArray_n[i]; + data_c[i] = curArray_c[i]; + data_h[i] = curArray_h[i]; + data_w[i] = curArray_w[i]; + + jstring cur_str = (jstring)(env->GetObjectArrayElement(input_names, i)); + const char *cur_str_ptr = env->GetStringUTFChars(cur_str, 0); + int length = strlen(cur_str_ptr); + data_name[i] = (char *)malloc(sizeof(char) * (length + 1)); + UNI_memcpy(data_name[i], cur_str_ptr, length); + data_name[i][length] = '\0'; + + jstring tmp_str_dt = (jstring)(env->GetObjectArrayElement(dt_input, i)); + const char *tmp_str_dt_ptr = env->GetStringUTFChars(tmp_str_dt, 0); + std::string cur_tmp_str_dt = tmp_str_dt_ptr; + data_dt[i] = str2DATA_TYPE(cur_tmp_str_dt); + + jstring tmp_str_df = (jstring)(env->GetObjectArrayElement(df_input, i)); + const char *tmp_str_df_ptr = env->GetStringUTFChars(tmp_str_df, 0); + std::string cur_tmp_str_df = tmp_str_df_ptr; + data_df[i] = str2DATA_FORMAT(cur_tmp_str_df); + + env->ReleaseStringUTFChars(cur_str, cur_str_ptr); + env->DeleteLocalRef(cur_str); + env->ReleaseStringUTFChars(tmp_str_dt, tmp_str_dt_ptr); + env->ReleaseStringUTFChars(tmp_str_df, tmp_str_df_ptr); + env->DeleteLocalRef(tmp_str_dt); + 
env->DeleteLocalRef(tmp_str_df); + } + env->ReleaseIntArrayElements(n, curArray_n, 0); + env->ReleaseIntArrayElements(c, curArray_c, 0); + env->ReleaseIntArrayElements(h, curArray_h, 0); + env->ReleaseIntArrayElements(w, curArray_w, 0); + *data_name_ptr = data_name; + *data_n_ptr = data_n; + *data_c_ptr = data_c; + *data_h_ptr = data_h; + *data_w_ptr = data_w; + *data_dt_ptr = data_dt; + *data_df_ptr = data_df; +} + +extern "C" JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_prepareModel)(JNIEnv *env, + jobject, + jlong modelAddr, + jint num_input, + jobjectArray input_names, + jintArray n, + jintArray c, + jintArray h, + jintArray w, + jobjectArray dt_input, + jobjectArray df_input) +{ + ModelHandle ih = (ModelHandle)modelAddr; + char **data_name = nullptr; + int *data_n = nullptr; + int *data_c = nullptr; + int *data_h = nullptr; + int *data_w = nullptr; + DATA_TYPE *data_dt = nullptr; + DATA_FORMAT *data_df = nullptr; + getInputParameters(env, num_input, input_names, &data_name, n, &data_n, c, &data_c, h, &data_h, + w, &data_w, dt_input, &data_dt, df_input, &data_df); + + PrepareModel(ih, num_input, data_name, data_n, data_c, data_h, data_w, data_dt, data_df); + + free(data_n); + free(data_c); + free(data_h); + free(data_w); + for (int i = 0; i < num_input; i++) { + free(data_name[i]); + } + free(data_name); + free(data_dt); + free(data_df); +} + +extern "C" JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_resizeModelInput)(JNIEnv *env, + jobject, + jlong modelAddr, + jint num_input, + jobjectArray input_names, + jintArray n, + jintArray c, + jintArray h, + jintArray w, + jobjectArray dt_input, + jobjectArray df_input) +{ + ModelHandle ih = (ModelHandle)modelAddr; + char **data_name = nullptr; + int *data_n = nullptr; + int *data_c = nullptr; + int *data_h = nullptr; + int *data_w = nullptr; + DATA_TYPE *data_dt = nullptr; + DATA_FORMAT *data_df = nullptr; + getInputParameters(env, num_input, input_names, &data_name, n, &data_n, c, &data_c, h, &data_h, + w, &data_w, dt_input, &data_dt, df_input, &data_df); + + ResizeModelInput(ih, num_input, data_name, data_n, data_c, data_h, data_w, data_dt, data_df); + + free(data_n); + free(data_c); + free(data_h); + free(data_w); + for (int i = 0; i < num_input; i++) { + free(data_name[i]); + } + free(data_name); + free(data_dt); + free(data_df); +} + +extern "C" JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_allocAllResultHandle)( + JNIEnv *, jobject, jlong modelAddr) +{ + ModelHandle ih = (ModelHandle)modelAddr; + ResultHandle ir = AllocAllResultHandle(ih); + return (long)ir; +} + +extern "C" JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_allocSpecificResultHandle)( + JNIEnv *env, jobject, jlong modelAddr, jint num_outputs, jobjectArray outputNames) +{ + if (env->GetArrayLength(outputNames) != num_outputs) { + UNI_ERROR_LOG("output name array length %d is not equal to output num %d\n", + env->GetArrayLength(outputNames), num_outputs); + } + ModelHandle ih = (ModelHandle)modelAddr; + char **output_names_ptr = (char **)malloc(sizeof(char *) * num_outputs); + for (int i = 0; i < num_outputs; i++) { + jstring cur_str = (jstring)(env->GetObjectArrayElement(outputNames, i)); + const char *cur_str_ptr = env->GetStringUTFChars(cur_str, 0); + int length = strlen(cur_str_ptr); + output_names_ptr[i] = (char *)malloc(sizeof(char) * (length + 1)); + UNI_memcpy(output_names_ptr[i], cur_str_ptr, length); + output_names_ptr[i][length] = '\0'; + + env->ReleaseStringUTFChars(cur_str, cur_str_ptr); + env->DeleteLocalRef(cur_str); + } + ResultHandle ir = 
        AllocSpecificResultHandle(ih, num_outputs, output_names_ptr);
+
+    for (int i = 0; i < num_outputs; i++) {
+        free(output_names_ptr[i]);
+    }
+    free(output_names_ptr);
+    return (long)ir;
+}
+
+extern "C" JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_setRuntimeDeviceJNI)(
+    JNIEnv *env, jobject, jlong modelAddr, jint cpu_id, jstring device)
+{
+    ModelHandle ih = (ModelHandle)modelAddr;
+    const char *devicePtr = env->GetStringUTFChars(device, JNI_FALSE);
+    std::string device_str = (std::string)devicePtr;
+    DEVICE_TYPE device_cur = str2DEVICE_TYPE(device_str);
+    SetRuntimeDevice(ih, cpu_id, device_cur);
+    env->ReleaseStringUTFChars(device, devicePtr);
+}
+
+extern "C" JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_setRuntimeDeviceDynamicJNI)(
+    JNIEnv *env, jobject, jlong modelAddr)
+{
+    ModelHandle ih = (ModelHandle)modelAddr;
+    SetRuntimeDeviceDynamic(ih);
+}
+
+extern "C" JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_runModel)(JNIEnv *env,
+    jobject,
+    jlong modelAddr,
+    jlong ResultHandleAddr,
+    jint num_input,
+    jobjectArray input_names,
+    jobjectArray inputData)
+{
+    if (env->GetArrayLength(input_names) != num_input) {
+        UNI_ERROR_LOG("input name array length %d is not equal to input num %d\n",
+            env->GetArrayLength(input_names), num_input);
+    }
+    if (env->GetArrayLength(inputData) != num_input) {
+        UNI_ERROR_LOG("input data array length %d is not equal to input num %d\n",
+            env->GetArrayLength(inputData), num_input);
+    }
+    ModelHandle ih = (ModelHandle)modelAddr;
+    ResultHandle ir = (ResultHandle)ResultHandleAddr;
+
+    ModelHandleInfo *ihInfo = (ModelHandleInfo *)ih;
+    CNN *cnn = (CNN *)ihInfo->cnn;
+    std::map<std::string, std::shared_ptr<Tensor>> inMap = cnn->get_inputs();
+
+    char **input_names_ptr = (char **)malloc(sizeof(char *) * num_input);
+    void **mem_ptr = (void **)malloc(sizeof(void *) * num_input);
+    for (int i = 0; i < num_input; i++) {
+        jstring cur_str = (jstring)(env->GetObjectArrayElement(input_names, i));
+        const char *cur_str_ptr = env->GetStringUTFChars(cur_str, 0);
+        int length = strlen(cur_str_ptr);
+        input_names_ptr[i] = (char *)malloc(sizeof(char) * (length + 1));
+        UNI_memcpy(input_names_ptr[i], cur_str_ptr, length);
+        input_names_ptr[i][length] = '\0';
+        env->ReleaseStringUTFChars(cur_str, cur_str_ptr);
+        env->DeleteLocalRef(cur_str);
+
+        jfloatArray curArray = static_cast<jfloatArray>(env->GetObjectArrayElement(inputData, i));
+        jfloat *datas = env->GetFloatArrayElements(curArray, JNI_FALSE);
+        std::string curTensorName = input_names_ptr[i];
+        std::shared_ptr<Tensor> cur_input_tensor = inMap[curTensorName];
+        jint dataNum = env->GetArrayLength(curArray);
+        TensorDesc tensorDesc = cur_input_tensor->get_desc();
+        mem_ptr[i] = ((CpuMemory *)(cur_input_tensor->get_memory()))->get_ptr();
+        transformFromFloat(tensorDesc.dt, datas, mem_ptr[i], dataNum);
+        env->ReleaseFloatArrayElements(curArray, datas, 0);
+        env->DeleteLocalRef(curArray);
+    }
+
+    RunModel(ih, ir, num_input, input_names_ptr, mem_ptr);
+    for (int i = 0; i < num_input; i++) {
+        free(input_names_ptr[i]);
+    }
+    free(input_names_ptr);
+    free(mem_ptr);
+}
+
+int calculateLength(int *array, int num)
+{
+    int length = 0;
+    for (int j = 0; j < num; j++) {
+        if (array[j] == 0) {
+            break;
+        } else {
+            if (length == 0) {
+                length = array[j];
+            } else {
+                length *= array[j];
+            }
+        }
+    }
+    return length;
+}
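calculateLength above treats a zero entry in dims[] as the end of the valid dimensions and returns the product of the leading nonzero entries. A minimal illustration (values are hypothetical, not from the patch):

    int dims[4] = {1, 3, 224, 224};
    calculateLength(dims, 4);    // 1 * 3 * 224 * 224 = 150528
    int padded[4] = {8, 0, 0, 0};
    calculateLength(padded, 4);  // 8: the first zero stops the scan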
+extern "C" JNIEXPORT jobject JNICALL BOLT_JNI_PREFIX(BoltModel_getOutput)(
+    JNIEnv *env, jobject, jlong ResultHandleAddr)
+{
+    std::string boltResultClassPath = std::string(BOLT_JNI_PATH_PREFIX) + "BoltResult";
+    jclass stucls = env->FindClass(boltResultClassPath.c_str());
+
+    jmethodID constrocMID =
+        env->GetMethodID(stucls, "<init>", "([[F[[I[Ljava/lang/String;[Ljava/lang/String;)V");
+
+    ResultHandleInner *ir_inner = (ResultHandleInner *)ResultHandleAddr;
+    DataDesc *outputArrPtr = (*ir_inner).outputArr;
+    int num_outputs = (*ir_inner).num_outputs;
+
+    jobjectArray output_values;
+    jclass floatArrCls = env->FindClass("[F");
+    output_values = env->NewObjectArray(num_outputs, floatArrCls, nullptr);
+    jobjectArray output_dimension;
+    jclass intArrCls = env->FindClass("[I");
+    output_dimension = env->NewObjectArray(num_outputs, intArrCls, nullptr);
+
+    jobjectArray output_names_arr;
+    output_names_arr = (jobjectArray)env->NewObjectArray(
+        num_outputs, env->FindClass("java/lang/String"), env->NewStringUTF(""));
+
+    jobjectArray df_arr;
+    df_arr = (jobjectArray)env->NewObjectArray(
+        num_outputs, env->FindClass("java/lang/String"), env->NewStringUTF(""));
+
+    for (int i = 0; i < num_outputs; i++) {
+        std::string cur_output_name = outputArrPtr[i].name;
+        env->SetObjectArrayElement(output_names_arr, i, env->NewStringUTF(cur_output_name.c_str()));
+        DataType cur_data_type = outputArrPtr[i].dt;
+        DataFormat cur_data_format = outputArrPtr[i].df;
+        std::string cur_data_format_str = DataFormat2str(cur_data_format);
+        env->SetObjectArrayElement(df_arr, i, env->NewStringUTF(cur_data_format_str.c_str()));
+
+        void *cur_dataPtr = outputArrPtr[i].dataPtr;
+        int tensorNumber = calculateLength((int *)outputArrPtr[i].dims, 4);
+        jfloatArray floatArr = env->NewFloatArray(tensorNumber);
+        float *tmp_output_values = env->GetFloatArrayElements(floatArr, NULL);
+
+        jint tmp_output_dimensions[4];
+        jintArray intArr = env->NewIntArray(4);
+
+        for (int j = 0; j < 4; j++) {
+            tmp_output_dimensions[j] = (int)(outputArrPtr[i].dims[j]);
+        }
+
+        transformToFloat(cur_data_type, cur_dataPtr, tmp_output_values, tensorNumber);
+        env->SetFloatArrayRegion(floatArr, 0, tensorNumber, tmp_output_values);
+        env->SetObjectArrayElement(output_values, i, floatArr);
+        env->ReleaseFloatArrayElements(floatArr, tmp_output_values, 0);
+
+        env->DeleteLocalRef(floatArr);
+
+        env->SetIntArrayRegion(intArr, 0, 4, tmp_output_dimensions);
+        env->SetObjectArrayElement(output_dimension, i, intArr);
+        env->DeleteLocalRef(intArr);
+    }
+
+    jobject bolt_result_obj = env->NewObject(
+        stucls, constrocMID, output_values, output_dimension, output_names_arr, df_arr);
+    env->DeleteLocalRef(stucls);
+    env->DeleteLocalRef(intArrCls);
+    env->DeleteLocalRef(output_values);
+    env->DeleteLocalRef(output_dimension);
+    env->DeleteLocalRef(output_names_arr);
+    env->DeleteLocalRef(df_arr);
+    return bolt_result_obj;
+}
+
+extern "C" JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_cloneResultHandle)(
+    JNIEnv *, jobject, jlong ResultHandleAddr)
+{
+    ResultHandle ir = (ResultHandle)ResultHandleAddr;
+    return (long)CloneResultHandle(ir);
+}
+
+extern "C" JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_freeResultHandle)(
+    JNIEnv *, jobject, jlong ResultHandleAddr)
+{
+    ResultHandle ir = (ResultHandle)ResultHandleAddr;
+    FreeResultHandle(ir);
+}
+
+extern "C" JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_destroyModel)(
+    JNIEnv *, jobject, jlong modelAddr)
+{
+    ModelHandle ih = (ModelHandle)modelAddr;
+    DestroyModel(ih);
+}
+#endif
diff --git a/inference/engine/src/CMakeLists.txt b/inference/engine/src/CMakeLists.txt
new file mode 100644
index 00000000..f3e3f5b7
--- /dev/null
+++ b/inference/engine/src/CMakeLists.txt
@@ -0,0 +1,20 @@
+file(GLOB_RECURSE srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
+ +# shared library +add_library(${PROJECT_NAME} SHARED ${srcs}) + +# static library +add_library(${PROJECT_NAME}_static STATIC ${srcs}) +if (USE_IOS_CLANG) + target_link_libraries(${PROJECT_NAME} LINK_PUBLIC tensor image model_tools) + if (BUILD_TEST) + target_link_libraries(${PROJECT_NAME} LINK_PUBLIC ${JPEG_LIBRARY}) + endif (BUILD_TEST) +endif (USE_IOS_CLANG) + +set_target_properties(${PROJECT_NAME}_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}") +set_target_properties(${PROJECT_NAME} PROPERTIES CLEAN_DIRECT_OUTPUT 1) +set_target_properties(${PROJECT_NAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) +install(TARGETS ${PROJECT_NAME} ${PROJECT_NAME}_static + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) diff --git a/inference/engine/src/bolt.cpp b/inference/engine/src/bolt.cpp new file mode 100644 index 00000000..7b23c8b5 --- /dev/null +++ b/inference/engine/src/bolt.cpp @@ -0,0 +1,689 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
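The file below implements the C inference API. For orientation, a typical caller is expected to drive it roughly as follows; this is a hedged sketch of the call order only, with a hypothetical model path and a single 1x3x224x224 FP32 NCHW input, not code from the patch:

    const char *name = "input";
    int n = 1, c = 3, h = 224, w = 224;
    DATA_TYPE dt = FP_32;
    DATA_FORMAT df = NCHW;
    static float input[1 * 3 * 224 * 224];  // caller-owned input buffer
    ModelHandle model = CreateModel("/data/local/tmp/model.bolt", CPU_HIGH_PERFORMANCE, NULL);
    PrepareModel(model, 1, (char **)&name, &n, &c, &h, &w, &dt, &df);
    ResultHandle result = AllocAllResultHandle(model);
    void *data = input;
    RunModel(model, result, 1, (char **)&name, &data);
    FreeResultHandle(result);
    DestroyModel(model);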
+ +#include "inference.hpp" +#include "../api/c/bolt.h" + +struct ModelHandleInfo { + void *cnn; + DEVICE_TYPE deviceType; + void *algoPath; + bool useFileStream; +}; + +typedef struct { + U32 dims[4] = {0}; + char name[NAME_LEN] = {0}; + DataType dt; + DataFormat df; + void *dataPtr; +} DataDesc; + +typedef struct { + U32 num_outputs; + DataDesc *outputArr; + DEVICE_TYPE deviceType; +} ResultHandleInner; + +DataType dt_mapping_user2bolt(DATA_TYPE dt_user) +{ + DataType ret = DT_F32; + switch (dt_user) { + case FP_32: + ret = DT_F32; + break; +#ifdef __aarch64__ + case FP_16: + ret = DT_F16; + break; +#endif + case INT_32: + ret = DT_I32; + break; + case UINT_32: + ret = DT_U32; + break; + default: + UNI_ERROR_LOG("unsupported user data type in API\n"); + } + return ret; +} + +DATA_TYPE dt_mapping_bolt2user(DataType dt_bolt) +{ + DATA_TYPE ret = FP_32; + switch (dt_bolt) { + case DT_F32: + ret = FP_32; + break; +#ifdef __aarch64__ + case DT_F16: + ret = FP_16; + break; +#endif + case DT_I32: + ret = INT_32; + break; + case DT_U32: + ret = UINT_32; + break; + default: + UNI_ERROR_LOG("unsupported bolt data type in API\n"); + } + return ret; +} + +DataFormat df_mapping_user2bolt(DATA_FORMAT df_user) +{ + DataFormat ret = DF_NCHW; + switch (df_user) { + case NCHW: + ret = DF_NCHW; + break; + case NHWC: + ret = DF_NHWC; + break; + case NCHWC8: + ret = DF_NCHWC8; + break; + case MTK: + ret = DF_MTK; + break; + case NORMAL: + ret = DF_NORMAL; + break; + default: { + UNI_ERROR_LOG("unsupported user data format in API\n"); + } + } + return ret; +} + +DATA_FORMAT df_mapping_bolt2user(DataFormat df_bolt) +{ + DATA_FORMAT ret = NCHW; + switch (df_bolt) { + case DF_NCHW: + ret = NCHW; + break; + case DF_NHWC: + ret = NHWC; + break; + case DF_NCHWC8: + ret = NCHWC8; + break; + case DF_MTK: + ret = MTK; + break; + case DF_NORMAL: + ret = NORMAL; + break; + default: { + UNI_ERROR_LOG("unsupported bolt data format in API\n"); + } + } + return ret; +} + +inline AffinityPolicy affinity_mapping_user2bolt(AFFINITY_TYPE affinity) +{ + AffinityPolicy ret = AFFINITY_CPU_HIGH_PERFORMANCE; + switch (affinity) { + case CPU_HIGH_PERFORMANCE: + ret = AFFINITY_CPU_HIGH_PERFORMANCE; + break; + case CPU_LOW_POWER: + ret = AFFINITY_CPU_LOW_POWER; + break; + case GPU: + ret = AFFINITY_GPU; + break; + default: { + UNI_ERROR_LOG("unsupported user affinity type in API\n"); + } + } + return ret; +} + +inline Arch device_mapping_user2bolt(DEVICE_TYPE device) +{ + Arch ret = ARM_V8; + switch (device) { + case CPU_ARM_V7: + ret = ARM_V7; + break; + case CPU_ARM_V8: + ret = ARM_V8; + break; + case CPU_ARM_A55: + ret = ARM_A55; + break; + case CPU_ARM_A76: + ret = ARM_A76; + break; + case GPU_MALI: + ret = MALI; + break; + case CPU_X86_AVX2: + ret = X86_AVX2; + break; + case CPU_SERIAL: + ret = CPU_GENERAL; + break; + default: { + UNI_ERROR_LOG("unsupported user device type %d in API\n", device); + break; + } + } + return ret; +} + +inline DEVICE_TYPE device_mapping_bolt2user(Arch arch) +{ + DEVICE_TYPE ret = CPU_ARM_V8; + switch (arch) { + case ARM_V7: + ret = CPU_ARM_V7; + break; + case ARM_V8: + ret = CPU_ARM_V8; + break; + case ARM_A55: + ret = CPU_ARM_A55; + break; + case ARM_A76: + ret = CPU_ARM_A76; + break; + case MALI: + ret = GPU_MALI; + break; + case X86_AVX2: + ret = CPU_X86_AVX2; + break; + case CPU_GENERAL: + ret = CPU_SERIAL; + break; + default: { + UNI_ERROR_LOG("unsupported bolt device type %d in API\n", arch); + break; + } + } + return ret; +} + +void copyTensorDescToDataDesc(TensorDesc srcDesc, DataDesc 
*dstDesc)
+{
+    dstDesc->dt = srcDesc.dt;
+    dstDesc->df = srcDesc.df;
+    if (srcDesc.nDims > 4) {
+        UNI_ERROR_LOG("user interface only supports 4 dimensions, not %d\n", srcDesc.nDims);
+    }
+    for (U32 i = 0; i < srcDesc.nDims; i++) {
+        dstDesc->dims[i] = srcDesc.dims[srcDesc.nDims - 1 - i];
+    }
+    for (int i = srcDesc.nDims; i < 4; i++) {
+        dstDesc->dims[i] = 1;
+    }
+}
+
+ModelHandle CreateModel(const char *modelPath, AFFINITY_TYPE affinity, const char *algoPath)
+{
+    ModelHandleInfo *handle = new ModelHandleInfo();
+    ModelSpec *ms = new ModelSpec();
+    if (SUCCESS != deserialize_model_from_file(modelPath, ms)) {
+        UNI_ERROR_LOG("CreateModel failed\n");
+        delete ms;
+        handle->cnn = nullptr;
+        return (ModelHandle)handle;
+    }
+    CNN *cnn = new CNN(affinity_mapping_user2bolt(affinity), ms->dt, ms->model_name);
+    cnn->sort_operators_sequential(ms);
+    cnn->initialize_ops(ms);
+
+    handle->cnn = (void *)cnn;
+    handle->deviceType = device_mapping_bolt2user(cnn->get_runtime_device());
+    handle->algoPath = (void *)algoPath;
+    handle->useFileStream = false;
+    CHECK_STATUS(mt_destroy_model(ms));
+    delete ms;
+    return (ModelHandle)handle;
+}
+
+ModelHandle CloneModel(ModelHandle ih)
+{
+    ModelHandleInfo *handle = (ModelHandleInfo *)ih;
+    ModelHandleInfo *cloneHandle = new ModelHandleInfo();
+    *cloneHandle = *handle;
+    CNN *cloneCnn = new CNN();
+    *cloneCnn = ((CNN *)handle->cnn)->clone();
+    cloneHandle->cnn = cloneCnn;
+    return (ModelHandle)cloneHandle;
+}
+
+ModelHandle CreateModelWithFileStream(
+    const char *modelFileStream, AFFINITY_TYPE affinity, const char *algoFileStream)
+{
+    ModelHandleInfo *handle = new ModelHandleInfo();
+    ModelSpec *ms = new ModelSpec();
+    if (SUCCESS != deserialize_model_from_file(modelFileStream, ms, true)) {
+        UNI_ERROR_LOG("CreateModelWithFileStream failed\n");
+        delete ms;
+        handle->cnn = nullptr;
+        return (ModelHandle)handle;
+    }
+    CNN *cnn = new CNN(affinity_mapping_user2bolt(affinity), ms->dt, ms->model_name);
+    cnn->sort_operators_sequential(ms);
+    cnn->initialize_ops(ms);
+
+    handle->cnn = (void *)cnn;
+    handle->deviceType = device_mapping_bolt2user(cnn->get_runtime_device());
+    handle->algoPath = (void *)algoFileStream;
+    handle->useFileStream = true;
+    CHECK_STATUS(mt_destroy_model(ms));
+    delete ms;
+    return (ModelHandle)handle;
+}
+
+int GetNumInputsFromModel(ModelHandle ih)
+{
+    ModelHandleInfo *ihInfo = (ModelHandleInfo *)ih;
+    if (ihInfo == NULL) {
+        UNI_ERROR_LOG("GetNumInputsFromModel: inference handle is nullptr\n");
+    }
+    CNN *cnn = (CNN *)ihInfo->cnn;
+    return (cnn->get_model_input_tensor_names()).size();
+}
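GetNumInputsFromModel and GetInputDataInfoFromModel (below) follow a query-then-fill pattern: the caller sizes and allocates every output array, including the name strings. A hedged usage sketch (the 128-byte name bound is an assumption, not an API guarantee):

    int num = GetNumInputsFromModel(model);
    char **names = (char **)malloc(num * sizeof(char *));
    for (int i = 0; i < num; i++) {
        names[i] = (char *)malloc(128);  // assumed upper bound for a tensor name
    }
    int *n = (int *)malloc(num * sizeof(int)), *c = (int *)malloc(num * sizeof(int));
    int *h = (int *)malloc(num * sizeof(int)), *w = (int *)malloc(num * sizeof(int));
    DATA_TYPE *dts = (DATA_TYPE *)malloc(num * sizeof(DATA_TYPE));
    DATA_FORMAT *dfs = (DATA_FORMAT *)malloc(num * sizeof(DATA_FORMAT));
    GetInputDataInfoFromModel(model, num, names, n, c, h, w, dts, dfs);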
+void GetInputDataInfoFromModel(ModelHandle handle,
+    const int number_inputs,
+    char **inputNames,
+    int *n,
+    int *c,
+    int *h,
+    int *w,
+    DATA_TYPE *dt_input,
+    DATA_FORMAT *df_input)
+{
+    ModelHandleInfo *ihInfo = (ModelHandleInfo *)handle;
+    if (ihInfo == NULL) {
+        UNI_ERROR_LOG("GetInputDataInfoFromModel: inference handle is nullptr\n");
+    }
+    CNN *cnn = (CNN *)ihInfo->cnn;
+    std::vector<TensorDesc> inputTensorDescs = cnn->get_model_input_tensor_descs();
+    std::vector<std::string> inputTensorNames = cnn->get_model_input_tensor_names();
+    if (number_inputs != (int)inputTensorDescs.size() ||
+        number_inputs != (int)inputTensorNames.size()) {
+        UNI_ERROR_LOG("GetInputDataInfoFromModel: number of inputs does not match, "
+                      "please use GetNumInputsFromModel to get the right value\n");
+    }
+    DataType dt;
+    DataFormat df;
+    U32 in, ic, ih, iw;
+    for (int i = 0; i < number_inputs; i++) {
+        strcpy(inputNames[i], inputTensorNames[i].c_str());
+        in = ic = ih = iw = 0;
+        if (tensorIs1d(inputTensorDescs[i])) {
+            CHECK_STATUS(tensor1dGet(inputTensorDescs[i], &dt, &df, &in));
+        } else if (tensorIs2d(inputTensorDescs[i])) {
+            CHECK_STATUS(tensor2dGet(inputTensorDescs[i], &dt, &df, &in, &ic));
+        } else if (tensorIs3d(inputTensorDescs[i])) {
+            CHECK_STATUS(tensor3dGet(inputTensorDescs[i], &dt, &df, &in, &ic, &ih));
+        } else if (tensorIs4d(inputTensorDescs[i])) {
+            CHECK_STATUS(tensor4dGet(inputTensorDescs[i], &dt, &df, &in, &ic, &ih, &iw));
+        } else {
+            UNI_ERROR_LOG("C API currently only supports 1d,2d,3d,4d query\n");
+        }
+        n[i] = in;
+        c[i] = ic;
+        h[i] = ih;
+        w[i] = iw;
+        dt_input[i] = dt_mapping_bolt2user(dt);
+        df_input[i] = df_mapping_bolt2user(df);
+    }
+}
+
+std::map<std::string, TensorDesc> getInputDataFormatFromUser(ModelHandle ih,
+    const int num_input,
+    char **name,
+    const int *n,
+    const int *c,
+    const int *h,
+    const int *w,
+    const DATA_TYPE *dt_input,
+    const DATA_FORMAT *df_input)
+{
+    ModelHandleInfo *ihInfo = (ModelHandleInfo *)ih;
+    if (ihInfo == NULL) {
+        UNI_ERROR_LOG("getInputDataFormatFromUser: ih is nullptr\n");
+    }
+    CNN *cnn = (CNN *)ihInfo->cnn;
+    std::vector<std::string> inputTensorNames = cnn->get_model_input_tensor_names();
+    U32 num = inputTensorNames.size();
+    if (num != (U32)num_input) {
+        UNI_ERROR_LOG("getInputDataFormatFromUser: model has %d inputs, not %d\n", num, num_input);
+    }
+    if (n == NULL) {
+        UNI_ERROR_LOG("getInputDataFormatFromUser: n is nullptr\n");
+    }
+    if (c == NULL) {
+        UNI_ERROR_LOG("getInputDataFormatFromUser: c is nullptr\n");
+    }
+    if (h == NULL) {
+        UNI_ERROR_LOG("getInputDataFormatFromUser: h is nullptr\n");
+    }
+    if (w == NULL) {
+        UNI_ERROR_LOG("getInputDataFormatFromUser: w is nullptr\n");
+    }
+    if (name == NULL) {
+        UNI_ERROR_LOG("getInputDataFormatFromUser: name is nullptr\n");
+    }
+    for (U32 i = 0; i < num; ++i) {
+        if (name[i] == NULL) {
+            UNI_ERROR_LOG("getInputDataFormatFromUser: input name %d is nullptr\n", i);
+        }
+    }
+
+    std::map<std::string, TensorDesc> modelInputDims;
+    for (U32 i = 0; i < num; ++i) {
+        std::string inputName = name[i];
+        bool findTensorName = false;
+        for (U32 j = 0; j < num; ++j) {
+            std::string modelName = inputTensorNames[j];
+            if (modelName == inputName) {
+                DataType dt = (dt_input == NULL) ? DT_F32 : dt_mapping_user2bolt(dt_input[i]);
+                DataFormat df = (df_input == NULL) ? DF_NCHW : df_mapping_user2bolt(df_input[i]);
+                switch (df) {
+                    case DF_NORMAL:
+                        modelInputDims[inputName] = tensor2df(dt, df, n[i], c[i]);
+                        break;
+                    case DF_MTK:
+                        modelInputDims[inputName] = tensor3df(dt, df, n[i], c[i], h[i]);
+                        break;
+                    case DF_NCHW:
+                        modelInputDims[inputName] = tensor4df(dt, df, n[i], c[i], h[i], w[i]);
+                        break;
+                    default:
+                        UNI_ERROR_LOG("unsupported data format in %s\n", __func__);
+                }
+                findTensorName = true;
+                break;
+            }
+        }
+
+        if (!findTensorName) {
+            std::string errorLog = "(";
+            for (U32 j = 0; j < num; ++j) {
+                errorLog.append(inputTensorNames[j]);
+                if (j != num - 1) {
+                    errorLog.append(", ");
+                }
+            }
+            errorLog.append(")");
+            UNI_ERROR_LOG("input data %s is not a valid model input %s\n", inputName.c_str(),
+                errorLog.c_str());
+        }
+    }
+    return modelInputDims;
+}
+void PrepareModel(ModelHandle ih,
+    const int num_input,
+    char **name,
+    const int *n,
+    const int *c,
+    const int *h,
+    const int *w,
+    const DATA_TYPE *dt_input = NULL,
+    const DATA_FORMAT *df_input = NULL)
+{
+    ModelHandleInfo *ihInfo = (ModelHandleInfo *)ih;
+    CNN *cnn = (CNN *)ihInfo->cnn;
+
+    std::map<std::string, TensorDesc> modelInputDims =
+        getInputDataFormatFromUser(ih, num_input, name, n, c, h, w, dt_input, df_input);
+    if (ihInfo->algoPath) {
+        const char *algoPath = (const char *)ihInfo->algoPath;
+        if (ihInfo->useFileStream) {
+            cnn->loadAlgorithmMapFromFileStream(algoPath);
+        } else {
+            cnn->loadAlgorithmMapFromText(algoPath);
+        }
+    }
+    cnn->ready(modelInputDims);
+    cnn->mark_input_output();
+    return;
+}
+
+void ResizeModelInput(ModelHandle ih,
+    const int num_input,
+    char **name,
+    const int *n,
+    const int *c,
+    const int *h,
+    const int *w,
+    const DATA_TYPE *dt_input = NULL,
+    const DATA_FORMAT *df_input = NULL)
+{
+    ModelHandleInfo *ihInfo = (ModelHandleInfo *)ih;
+    CNN *cnn = (CNN *)ihInfo->cnn;
+
+    std::map<std::string, TensorDesc> modelInputDims =
+        getInputDataFormatFromUser(ih, num_input, name, n, c, h, w, dt_input, df_input);
+    cnn->reready(modelInputDims);
+}
+
+ResultHandle AllocAllResultHandle(ModelHandle ih)
+{
+    ModelHandleInfo *ihInfo = (ModelHandleInfo *)ih;
+    CNN *cnn = (CNN *)ihInfo->cnn;
+    DEVICE_TYPE device = ihInfo->deviceType;
+
+    ResultHandleInner *model_result_ptr = (ResultHandleInner *)malloc(sizeof(ResultHandleInner));
+    std::vector<std::string> modelOutputTensorNames = cnn->get_model_output_tensor_names();
+    int model_num_outputs = modelOutputTensorNames.size();
+    DataDesc *outputArrPtr = (DataDesc *)malloc(sizeof(DataDesc) * model_num_outputs);
+    for (int i = 0; i < model_num_outputs; ++i) {
+        std::string name = modelOutputTensorNames[i];
+        U32 length = name.size();
+        length = (length > NAME_LEN) ?
NAME_LEN : length; + memcpy(outputArrPtr[i].name, name.c_str(), length); + if (length < NAME_LEN) { + outputArrPtr[i].name[length] = '\0'; + } + TensorDesc srcDesc = cnn->get_tensor_desc_by_name(name); + copyTensorDescToDataDesc(srcDesc, &outputArrPtr[i]); + } + model_result_ptr->num_outputs = model_num_outputs; + model_result_ptr->outputArr = outputArrPtr; + model_result_ptr->deviceType = device; + return (void *)model_result_ptr; +} + +ResultHandle AllocSpecificResultHandle(ModelHandle ih, const int num_outputs, char **outputNames) +{ + ModelHandleInfo *ihInfo = (ModelHandleInfo *)ih; + CNN *cnn = (CNN *)ihInfo->cnn; + DEVICE_TYPE device = ihInfo->deviceType; + + ResultHandleInner *model_result_ptr = (ResultHandleInner *)malloc(sizeof(ResultHandleInner)); + int model_num_outputs = num_outputs; + DataDesc *outputArrPtr = (DataDesc *)malloc(sizeof(DataDesc) * model_num_outputs); + for (int i = 0; i < num_outputs; i++) { + U32 length = UNI_MIN(strlen(outputNames[i]), NAME_LEN - 1); + memcpy(outputArrPtr[i].name, outputNames[i], length); + if (length < NAME_LEN) { + outputArrPtr[i].name[length] = '\0'; + } + std::string name = outputNames[i]; + TensorDesc srcDesc = cnn->get_tensor_desc_by_name(name); + copyTensorDescToDataDesc(srcDesc, &outputArrPtr[i]); + } + model_result_ptr->num_outputs = model_num_outputs; + model_result_ptr->outputArr = outputArrPtr; + model_result_ptr->deviceType = device; + return (void *)model_result_ptr; +} + +void SetRuntimeDevice(ModelHandle ih, int cpu_id, DEVICE_TYPE device) +{ + ModelHandleInfo *ihInfo = (ModelHandleInfo *)ih; + CNN *cnn = (CNN *)ihInfo->cnn; + cnn->set_runtime_device(cpu_id, device_mapping_user2bolt(device)); + ihInfo->deviceType = device; +} + +void SetRuntimeDeviceDynamic(ModelHandle ih) +{ + ModelHandleInfo *ihInfo = (ModelHandleInfo *)ih; + CNN *cnn = (CNN *)ihInfo->cnn; + cnn->set_runtime_device_dynamic(); + ihInfo->deviceType = device_mapping_bolt2user(cnn->get_runtime_device()); +} + +void RunModel(ModelHandle ih, ResultHandle ir, const int num_input, char **inputNames, void **mem) +{ + ModelHandleInfo *ihInfo = (ModelHandleInfo *)ih; + CNN *cnn = (CNN *)ihInfo->cnn; + DEVICE_TYPE device = ihInfo->deviceType; + ResultHandleInner *ir_inner = (ResultHandleInner *)ir; + + for (int index = 0; index < num_input; index++) { + std::string input_name(inputNames[index]); + cnn->copy_to_named_input(input_name, (U8 *)(mem[index])); + } + cnn->run(); + + DataDesc *outputArrPtr = ir_inner->outputArr; + for (U32 curIndex = 0; curIndex < ir_inner->num_outputs; curIndex++) { + Tensor output_tensor = cnn->get_tensor_by_name(outputArrPtr[curIndex].name); + copyTensorDescToDataDesc(output_tensor.get_desc(), &(outputArrPtr[curIndex])); + if (device == GPU_MALI) { +#ifdef _USE_MALI + auto mem = (OclMemory *)output_tensor.get_memory(); + outputArrPtr[curIndex].dataPtr = mem->get_mapped_ptr(); +#else + UNI_WARNING_LOG("this binary not support GPU, please recompile project with GPU " + "compile options\n"); +#endif + } else { + outputArrPtr[curIndex].dataPtr = ((CpuMemory *)(output_tensor.get_memory()))->get_ptr(); + } + } +} + +int GetNumOutputsFromResultHandle(ResultHandle ir) +{ + ResultHandleInner *ir_inner = (ResultHandleInner *)ir; + return (*ir_inner).num_outputs; +} + +void GetOutputDataInfoFromResultHandle(ResultHandle ir, + int num_outputs, + char **outputNames, + int *n, + int *c, + int *h, + int *w, + DATA_TYPE *dt_output, + DATA_FORMAT *df_output) +{ + ResultHandleInner *ir_inner = (ResultHandleInner *)ir; + DataDesc *outputArrPtr = 
(*ir_inner).outputArr;
+    for (int i = 0; i < num_outputs; i++) {
+        n[i] = outputArrPtr[i].dims[0];
+        c[i] = outputArrPtr[i].dims[1];
+        h[i] = outputArrPtr[i].dims[2];
+        w[i] = outputArrPtr[i].dims[3];
+        strcpy(outputNames[i], outputArrPtr[i].name);
+        DataType dt = outputArrPtr[i].dt;
+        dt_output[i] = dt_mapping_bolt2user(dt);
+        df_output[i] = df_mapping_bolt2user(outputArrPtr[i].df);
+    }
+}
+
+void GetPtrFromResultHandle(ResultHandle ir,
+    int num_outputs,
+    char **outputNames,
+    void **data,
+    int *n,
+    int *c,
+    int *h,
+    int *w,
+    DATA_TYPE *dt_output,
+    DATA_FORMAT *df_output)
+{
+    ResultHandleInner *ir_inner = (ResultHandleInner *)ir;
+    DataDesc *outputArrPtr = (*ir_inner).outputArr;
+    for (int i = 0; i < num_outputs; i++) {
+        n[i] = outputArrPtr[i].dims[0];
+        c[i] = outputArrPtr[i].dims[1];
+        h[i] = outputArrPtr[i].dims[2];
+        w[i] = outputArrPtr[i].dims[3];
+        strcpy(outputNames[i], outputArrPtr[i].name);
+        DataType dt = outputArrPtr[i].dt;
+        dt_output[i] = dt_mapping_bolt2user(dt);
+        df_output[i] = df_mapping_bolt2user(outputArrPtr[i].df);
+        data[i] = outputArrPtr[i].dataPtr;
+    }
+}
+
+void CopyOutputsFromResultHandle(ResultHandle ir, int num_outputs, const int *size, void **data)
+{
+    ResultHandleInner *ir_inner = (ResultHandleInner *)ir;
+    DataDesc *outputArrPtr = (*ir_inner).outputArr;
+    for (int i = 0; i < num_outputs; i++) {
+        U32 dataSize = size[i];
+        memcpy((void *)data[i], (void *)outputArrPtr[i].dataPtr, dataSize);
+    }
+}
+
+ResultHandle CloneResultHandle(ResultHandle ir)
+{
+    ResultHandleInner *irInner = (ResultHandleInner *)ir;
+    // allocate with malloc so that FreeResultHandle's free() matches
+    ResultHandleInner *cloneIrInner = (ResultHandleInner *)malloc(sizeof(ResultHandleInner));
+    *cloneIrInner = *irInner;
+    U32 size = sizeof(DataDesc) * cloneIrInner->num_outputs;
+    cloneIrInner->outputArr = (DataDesc *)malloc(size);
+    memcpy(cloneIrInner->outputArr, irInner->outputArr, size);
+    return (ResultHandle)cloneIrInner;
+}
+
+void FreeResultHandle(ResultHandle ir)
+{
+    ResultHandleInner *ir_inner = (ResultHandleInner *)ir;
+    DataDesc *outputArrPtr = (*ir_inner).outputArr;
+    free(outputArrPtr);
+    (*ir_inner).outputArr = nullptr;
+    free(ir_inner);
+}
+
+void DestroyModel(ModelHandle ih)
+{
+    ModelHandleInfo *ihInfo = (ModelHandleInfo *)ih;
+    if (nullptr == ihInfo) {
+        UNI_WARNING_LOG("DestroyModel received null handle.\n");
+        return;
+    }
+    CNN *cnn = (CNN *)ihInfo->cnn;
+    if (nullptr != cnn && nullptr != ihInfo->algoPath) {
+        const char *algoPath = (const char *)ihInfo->algoPath;
+        UNI_THREAD_SAFE(cnn->saveAlgorithmMapToText(algoPath));
+    }
+    if (nullptr == cnn) {
+        UNI_WARNING_LOG("nullptr in DestroyModel. Resource cleared.\n");
+    } else {
+        delete cnn;
+        ihInfo->cnn = nullptr;
+    }
+    delete ihInfo;
+}
diff --git a/inference/engine/src/bolt_dllite.cpp b/inference/engine/src/bolt_dllite.cpp
new file mode 100644
index 00000000..450d9080
--- /dev/null
+++ b/inference/engine/src/bolt_dllite.cpp
@@ -0,0 +1,489 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "../api/c/bolt.h"
+#include "../api/dllite/Bolt.h"
+#include "inference.hpp"
+#include "tensor.hpp"
+
+struct ModelHandleInfo {
+    void *cnn;
+    DEVICE_TYPE deviceType;
+    void *algoPath;
+    bool useFileStream;
+};
+
+struct DLLiteInfo {
+    ModelHandle modelHandle;
+    bool isReady;
+};
+
+typedef struct {
+    U32 dims[4] = {0};
+    char name[NAME_LEN] = {0};
+    DataType dt;
+    DataFormat df;
+    void *dataPtr;
+} DataDesc;
+
+typedef struct {
+    U32 num_outputs;
+    DataDesc *outputArr;
+    DEVICE_TYPE deviceType;
+} ResultHandleInner;
+
+inline AFFINITY_TYPE AffinityMapDLLite2c(bolt::AffinityType affinity)
+{
+    AFFINITY_TYPE ret = CPU_HIGH_PERFORMANCE;
+    switch (affinity) {
+        case bolt::AffinityType::CPU_HIGH_PERFORMANCE:
+            ret = CPU_HIGH_PERFORMANCE;
+            break;
+        case bolt::AffinityType::CPU_LOW_POWER:
+            ret = CPU_LOW_POWER;
+            break;
+        case bolt::AffinityType::GPU:
+            ret = GPU;
+            break;
+        default: {
+            UNI_ERROR_LOG("Unsupported affinity type in dllite API\n");
+        }
+    }
+    return ret;
+}
+
+bolt::TensorType TypeMapBolt2DLLite(DataType dt)
+{
+    bolt::TensorType ret = bolt::TensorType::FP32;
+    switch (dt) {
+        case DT_F32:
+            ret = bolt::TensorType::FP32;
+            break;
+#ifdef _USE_FP16
+        case DT_F16:
+            ret = bolt::TensorType::FP16;
+            break;
+#endif
+        case DT_I32:
+            ret = bolt::TensorType::INT32;
+            break;
+        case DT_U32:
+            ret = bolt::TensorType::UINT32;
+            break;
+        default:
+            UNI_ERROR_LOG("unsupported bolt data type in DLLite API\n");
+    }
+    return ret;
+}
+
+DataType TypeMapDLLite2bolt(bolt::TensorType dt)
+{
+    DataType ret = DT_F32;
+    switch (dt) {
+        case bolt::TensorType::FP32:
+            ret = DT_F32;
+            break;
+#ifdef _USE_FP16
+        case bolt::TensorType::FP16:
+            ret = DT_F16;
+            break;
+#endif
+        case bolt::TensorType::INT32:
+            ret = DT_I32;
+            break;
+        case bolt::TensorType::UINT32:
+            ret = DT_U32;
+            break;
+        default:
+            UNI_ERROR_LOG("unsupported data type in DLLite API\n");
+    }
+    return ret;
+}
+
+bolt::TensorLayout LayoutMapBolt2DLLite(DataFormat df)
+{
+    bolt::TensorLayout ret = bolt::TensorLayout::NCHW;
+    switch (df) {
+        case DF_NCHW:
+            ret = bolt::TensorLayout::NCHW;
+            break;
+        case DF_NHWC:
+            ret = bolt::TensorLayout::NHWC;
+            break;
+        case DF_NCHWC8:
+            ret = bolt::TensorLayout::NCHWC8;
+            break;
+        case DF_MTK:
+            ret = bolt::TensorLayout::RNN_MTK;
+            break;
+        case DF_NORMAL:
+            ret = bolt::TensorLayout::ROW_MAJOR;
+            break;
+        default: {
+            UNI_ERROR_LOG("unsupported bolt data layout in DLLite API\n");
+        }
+    }
+    return ret;
+}
+
+DataFormat LayoutMapDLLite2bolt(bolt::TensorLayout df)
+{
+    DataFormat ret = DF_NCHW;
+    switch (df) {
+        case bolt::TensorLayout::NCHW:
+            ret = DF_NCHW;
+            break;
+        case bolt::TensorLayout::NHWC:
+            ret = DF_NHWC;
+            break;
+        case bolt::TensorLayout::NCHWC8:
+            ret = DF_NCHWC8;
+            break;
+        case bolt::TensorLayout::RNN_MTK:
+            ret = DF_MTK;
+            break;
+        case bolt::TensorLayout::ROW_MAJOR:
+            ret = DF_NORMAL;
+            break;
+        default: {
+            UNI_ERROR_LOG("unsupported data layout in DLLite API\n");
+        }
+    }
+    return ret;
+}
+std::map<std::string, TensorDesc> GetInputInfoFromDLLite(
+    bolt::ModelHandle ih, const std::vector<bolt::IOTensor> &inputs)
+{
+    DLLiteInfo *handle = (DLLiteInfo *)ih;
+    ModelHandleInfo *ihInfo = (ModelHandleInfo *)handle->modelHandle;
+    CNN *cnn = (CNN *)ihInfo->cnn;
+    std::vector<std::string> inputTensorNames = cnn->get_model_input_tensor_names();
+    int num = inputTensorNames.size();
+    if (num != (int)inputs.size()) {
+        UNI_ERROR_LOG(
+            "GetInputInfoFromDLLite: model has %d inputs, not %d\n", num, (int)inputs.size());
+    }
+
+    std::map<std::string, TensorDesc> modelInputDims;
+    for (int i = 0; i < num; ++i) {
+        std::string inputName = inputs[i].name;
+        bool findTensorName = false;
+        for (int j = 0; j < num; ++j) {
+            std::string modelName = inputTensorNames[j];
+            if (modelName == inputName) {
+                DataType dt = TypeMapDLLite2bolt(inputs[i].type);
+                DataFormat df = LayoutMapDLLite2bolt(inputs[i].layout);
+                switch (df) {
+                    case DF_NORMAL:
+                        modelInputDims[inputName] =
+                            tensor2df(dt, df, inputs[i].shape[0], inputs[i].shape[1]);
+                        break;
+                    case DF_NCHW:
+                        modelInputDims[inputName] = tensor4df(dt, df, inputs[i].shape[0],
+                            inputs[i].shape[1], inputs[i].shape[2], inputs[i].shape[3]);
+                        break;
+                    case DF_MTK:
+                        modelInputDims[inputName] = tensor3df(
+                            dt, df, inputs[i].shape[0], inputs[i].shape[1], inputs[i].shape[2]);
+                        break;
+                    default:
+                        UNI_ERROR_LOG("unsupported data format in %s\n", __func__);
+                }
+                findTensorName = true;
+                break;
+            }
+        }
+
+        if (!findTensorName) {
+            std::string errorLog = "(";
+            for (int j = 0; j < num; ++j) {
+                errorLog.append(inputTensorNames[j]);
+                if (j != num - 1) {
+                    errorLog.append(", ");
+                }
+            }
+            errorLog.append(")");
+            UNI_ERROR_LOG("input data %s is not a valid model input %s\n",
+                inputName.c_str(), errorLog.c_str());
+        }
+    }
+    return modelInputDims;
+}
+
+void UpdateDataDesc(TensorDesc srcDesc, DataDesc *dstDesc)
+{
+    dstDesc->dt = srcDesc.dt;
+    dstDesc->df = srcDesc.df;
+    if (srcDesc.nDims > 4) {
+        UNI_ERROR_LOG("user interface only supports 4 dimensions, not %d\n", srcDesc.nDims);
+    }
+    for (U32 i = 0; i < srcDesc.nDims; i++) {
+        dstDesc->dims[i] = srcDesc.dims[srcDesc.nDims - 1 - i];
+    }
+    for (int i = srcDesc.nDims; i < 4; i++) {
+        dstDesc->dims[i] = 1;
+    }
+}
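bolt::CreateModel below selects between an in-memory model stream and a file path based on which ModelConfig fields are populated. A hedged construction sketch for the file-path case (the path is hypothetical, and the pair initialization assumes the stream members are std::pair-like, as their .first/.second usage below suggests):

    bolt::ModelConfig config;
    config.modelStream = std::make_pair(nullptr, 0);  // empty stream: fall through to the path
    config.algoStream = std::make_pair(nullptr, 0);
    config.modelPath = "/data/local/tmp/model.bolt";
    config.algoPath = "";
    config.affinity = bolt::AffinityType::CPU_HIGH_PERFORMANCE;
    bolt::ModelHandle handle = bolt::CreateModel(config);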
+bolt::ModelHandle bolt::CreateModel(const bolt::ModelConfig &modelConfig)
+{
+    DLLiteInfo *handle = new DLLiteInfo();
+    if (nullptr != modelConfig.modelStream.first && modelConfig.modelStream.second > 0) {
+        handle->modelHandle = CreateModelWithFileStream((char *)modelConfig.modelStream.first,
+            AffinityMapDLLite2c(modelConfig.affinity),
+            modelConfig.algoStream.second > 0 ? (char *)modelConfig.algoStream.first : nullptr);
+    } else if ("" != modelConfig.modelPath) {
+        handle->modelHandle = CreateModel(modelConfig.modelPath.c_str(),
+            AffinityMapDLLite2c(modelConfig.affinity), modelConfig.algoPath.c_str());
+    } else {
+        handle->modelHandle = nullptr;
+    }
+    handle->isReady = false;
+    return (bolt::ModelHandle)handle;
+}
+
+bolt::ReturnStatus bolt::GetIOFormats(bolt::ModelHandle modelHandle,
+    std::vector<bolt::IOTensor> &inputs,
+    std::vector<bolt::IOTensor> &outputs)
+{
+    DLLiteInfo *handle = (DLLiteInfo *)modelHandle;
+    ModelHandleInfo *ihInfo = (ModelHandleInfo *)handle->modelHandle;
+    if (nullptr == ihInfo) {
+        return bolt::ReturnStatus::NULLPTR;
+    }
+    CNN *cnn = (CNN *)ihInfo->cnn;
+    std::vector<std::string> inputTensorNames = cnn->get_model_input_tensor_names();
+    std::vector<TensorDesc> inputTensorDescs = cnn->get_model_input_tensor_descs();
+
+    std::map<std::string, TensorDesc> inputDescMap;
+    for (size_t i = 0; i < inputTensorNames.size(); i++) {
+        inputDescMap[inputTensorNames[i]] = inputTensorDescs[i];
+    }
+    if (ihInfo->algoPath) {
+        const char *algoPath = (const char *)ihInfo->algoPath;
+        cnn->loadAlgorithmMapFromText(algoPath);
+    }
+    cnn->ready(inputDescMap);
+    cnn->mark_input_output();
+    if (ihInfo->algoPath) {
+        const char *algoPath = (const char *)ihInfo->algoPath;
+        cnn->saveAlgorithmMapToText(algoPath);
+    }
+    handle->isReady = true;
+
+    std::map<std::string, std::shared_ptr<Tensor>> inMap = cnn->get_inputs();
+    inputs.clear();
+
+    for (auto iter : inMap) {
+        bolt::IOTensor in;
+        in.name = iter.first;
+        TensorDesc inDesc = iter.second->get_desc();
+        in.type = TypeMapBolt2DLLite(inDesc.dt);
+        in.shape.clear();
+        for (U32 j = 0; j < inDesc.nDims; j++) {
+            in.shape.push_back(inDesc.dims[inDesc.nDims - 1 - j]);
+        }
+        in.layout = LayoutMapBolt2DLLite(inDesc.df);
+        inputs.push_back(in);
+    }
+
+    std::map<std::string, std::shared_ptr<Tensor>> outMap = cnn->get_outputs();
+    outputs.clear();
+    for (auto iter : outMap) {
+        IOTensor out;
+        out.name = iter.first;
+        TensorDesc outDesc = iter.second->get_desc();
+        out.type = TypeMapBolt2DLLite(outDesc.dt);
+        out.shape.clear();
+        for (U32 j = 0; j < outDesc.nDims; j++) {
+            out.shape.push_back(outDesc.dims[outDesc.nDims - 1 - j]);
+        }
+        out.layout = LayoutMapBolt2DLLite(outDesc.df);
+        outputs.push_back(out);
+    }
+
+    return bolt::ReturnStatus::SUCCESS;
+}
+
+bolt::ReturnStatus bolt::PrepareModel(
+    bolt::ModelHandle modelHandle, const std::vector<bolt::IOTensor> &inputs)
+{
+    DLLiteInfo *handle = (DLLiteInfo *)modelHandle;
+    if (handle->isReady) {
+        return bolt::ReturnStatus::SUCCESS;
+    }
+    ModelHandleInfo *ihInfo = (ModelHandleInfo *)handle->modelHandle;
+    if (nullptr == ihInfo) {
+        return bolt::ReturnStatus::NULLPTR;
+    }
+    CNN *cnn = (CNN *)ihInfo->cnn;
+
+    std::map<std::string, TensorDesc> modelInputDims = GetInputInfoFromDLLite(modelHandle, inputs);
+    if (ihInfo->algoPath) {
+        const char *algoPath = (const char *)ihInfo->algoPath;
+        cnn->loadAlgorithmMapFromText(algoPath);
+    }
+    cnn->ready(modelInputDims);
+    cnn->mark_input_output();
+    if (ihInfo->algoPath) {
+        const char *algoPath = (const char *)ihInfo->algoPath;
+        cnn->saveAlgorithmMapToText(algoPath);
+    }
+    return bolt::ReturnStatus::SUCCESS;
+}
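Continuing that sketch, the DLLite calls above and below chain together as follows; this is only an assumed happy-path order (error handling elided), not code from the patch:

    std::vector<bolt::IOTensor> inputs, outputs;
    bolt::GetIOFormats(handle, inputs, outputs);  // also readies the model and sets isReady
    bolt::PrepareModel(handle, inputs);           // returns early because isReady is set
    bolt::GetInputTensors(handle, inputs);        // fills each inputs[i].buffer pointer/size
    // ... write input data through inputs[i].buffer.first ...
    bolt::ResultHandle result = bolt::AllocResult(handle, outputs);
    bolt::RunModel(handle, result, inputs);
    bolt::GetOutputTensors(result, outputs);      // outputs[i].buffer now points at results
    bolt::FreeResult(result);
    bolt::DestroyModel(handle);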
+bolt::ReturnStatus bolt::GetInputTensors(
+    bolt::ModelHandle modelHandle, std::vector<bolt::IOTensor> &inputs)
+{
+    DLLiteInfo *handle = (DLLiteInfo *)modelHandle;
+    ModelHandleInfo *ihInfo = (ModelHandleInfo *)handle->modelHandle;
+    if (nullptr == ihInfo) {
+        return bolt::ReturnStatus::NULLPTR;
+    }
+    CNN *cnn = (CNN *)ihInfo->cnn;
+
+    std::map<std::string, std::shared_ptr<Tensor>> inMap = cnn->get_inputs();
+
+    for (U32 i = 0; i < inputs.size(); i++) {
+        auto tensorPtr = inMap[inputs[i].name];
+        if (nullptr == tensorPtr) {
+            return bolt::ReturnStatus::FAIL;
+        }
+        inputs[i].buffer.first = ((CpuMemory *)(tensorPtr->get_memory()))->get_ptr();
+        inputs[i].buffer.second = tensorPtr->bytes();
+    }
+    return bolt::ReturnStatus::SUCCESS;
+}
+
+bolt::ReturnStatus bolt::ResizeInput(
+    bolt::ModelHandle modelHandle, const std::vector<bolt::IOTensor> &inputs)
+{
+    DLLiteInfo *handle = (DLLiteInfo *)modelHandle;
+    ModelHandleInfo *ihInfo = (ModelHandleInfo *)handle->modelHandle;
+    if (nullptr == ihInfo) {
+        return bolt::ReturnStatus::NULLPTR;
+    }
+    CNN *cnn = (CNN *)ihInfo->cnn;
+
+    std::map<std::string, TensorDesc> modelInputDims = GetInputInfoFromDLLite(modelHandle, inputs);
+    cnn->reready(modelInputDims);
+    return bolt::ReturnStatus::SUCCESS;
+}
+
+bolt::ResultHandle bolt::AllocResult(
+    bolt::ModelHandle modelHandle, const std::vector<bolt::IOTensor> &outputs)
+{
+    DLLiteInfo *handle = (DLLiteInfo *)modelHandle;
+    char **outputNames = (char **)malloc(outputs.size() * sizeof(char *));
+    for (size_t i = 0; i < outputs.size(); i++) {
+        U32 length = outputs[i].name.length();
+        outputNames[i] = (char *)malloc(length + 1);
+        memcpy(outputNames[i], outputs[i].name.c_str(), length);
+        outputNames[i][length] = '\0';
+    }
+    bolt::ResultHandle rh = (bolt::ResultHandle)AllocSpecificResultHandle(
+        handle->modelHandle, outputs.size(), outputNames);
+    for (size_t i = 0; i < outputs.size(); i++) {
+        free(outputNames[i]);
+    }
+    free(outputNames);
+    return rh;
+}
+
+bolt::ReturnStatus bolt::RunModel(bolt::ModelHandle modelHandle,
+    bolt::ResultHandle resultHandle,
+    const std::vector<bolt::IOTensor> &inputs)
+{
+    DLLiteInfo *handle = (DLLiteInfo *)modelHandle;
+    ModelHandleInfo *ihInfo = (ModelHandleInfo *)handle->modelHandle;
+    if (nullptr == ihInfo) {
+        return bolt::ReturnStatus::NULLPTR;
+    }
+    CNN *cnn = (CNN *)ihInfo->cnn;
+    DEVICE_TYPE device = ihInfo->deviceType;
+    ResultHandleInner *ir_inner = (ResultHandleInner *)resultHandle;
+
+    for (size_t index = 0; index < inputs.size(); index++) {
+        cnn->copy_to_named_input(inputs[index].name, (U8 *)(inputs[index].buffer.first));
+    }
+    cnn->run();
+
+    DataDesc *outputArrPtr = ir_inner->outputArr;
+    for (U32 curIndex = 0; curIndex < ir_inner->num_outputs; curIndex++) {
+        Tensor output_tensor = cnn->get_tensor_by_name(outputArrPtr[curIndex].name);
+        UpdateDataDesc(output_tensor.get_desc(), &(outputArrPtr[curIndex]));
+        if (device == GPU_MALI) {
+#ifdef _USE_MALI
+            auto mem = (OclMemory *)output_tensor.get_memory();
+            outputArrPtr[curIndex].dataPtr = mem->get_mapped_ptr();
+#else
+            UNI_WARNING_LOG("this binary does not support GPU, please recompile the project "
+                            "with GPU compile options\n");
+#endif
+        } else {
+            outputArrPtr[curIndex].dataPtr = ((CpuMemory *)(output_tensor.get_memory()))->get_ptr();
+        }
+    }
+    return bolt::ReturnStatus::SUCCESS;
+}
+
+bolt::ReturnStatus bolt::GetOutputTensors(
+    bolt::ResultHandle resultHandle, std::vector<bolt::IOTensor> &outputs)
+{
+    ResultHandleInner *ir_inner = (ResultHandleInner *)resultHandle;
+    if (nullptr == ir_inner) {
+        return bolt::ReturnStatus::NULLPTR;
+    }
+    DataDesc *outputArrPtr = (*ir_inner).outputArr;
+
+    for (size_t i = 0; i < outputs.size(); i++) {
+        U32 n = outputArrPtr[i].dims[0];
+        U32 c = outputArrPtr[i].dims[1];
+        U32 h = outputArrPtr[i].dims[2];
+        U32 w = outputArrPtr[i].dims[3];
+        DataType dt = outputArrPtr[i].dt;
+        U32 size = n * c * h * w * bytesOf(dt);
+        outputs[i].buffer = std::make_pair((void *)outputArrPtr[i].dataPtr, size);
+    }
+    return bolt::ReturnStatus::SUCCESS;
+}
+
+bolt::ReturnStatus bolt::FreeResult(bolt::ResultHandle resultHandle)
+{
+    if (nullptr == resultHandle) {
+        return bolt::ReturnStatus::NULLPTR;
+    }
+    
FreeResultHandle((ResultHandle)resultHandle); + return bolt::ReturnStatus::SUCCESS; +} + +bolt::ReturnStatus bolt::DestroyModel(bolt::ModelHandle modelHandle) +{ + DLLiteInfo *handle = (DLLiteInfo *)modelHandle; + ModelHandleInfo *ihInfo = (ModelHandleInfo *)handle->modelHandle; + + if (nullptr == ihInfo) { + UNI_ERROR_LOG("DestroyModel received null handle.\n"); + return bolt::ReturnStatus::NULLPTR; + } + CNN *cnn = (CNN *)ihInfo->cnn; + if (nullptr == cnn) { + UNI_WARNING_LOG("nullptr in DestroyModel. Resource cleared.\n"); + delete ihInfo; + return bolt::ReturnStatus::SUCCESS; + } + delete cnn; + delete ihInfo; + return bolt::ReturnStatus::SUCCESS; +} diff --git a/inference/engine/src/cnn.cpp b/inference/engine/src/cnn.cpp new file mode 100644 index 00000000..d96568f7 --- /dev/null +++ b/inference/engine/src/cnn.cpp @@ -0,0 +1,610 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+
+#include "cnn.h"
+#if defined(_USE_CPU)
+#include "cpu/factory_cpu.hpp"
+#endif
+#ifdef _USE_MALI
+#include "ocl/factory_ocl.hpp"
+#endif
+
+bool is_same_tensor(Tensor a, Tensor b)
+{
+    auto ptr_a = ((CpuMemory *)a.get_memory())->get_ptr();
+    auto ptr_b = ((CpuMemory *)b.get_memory())->get_ptr();
+    bool ret;
+    if (ptr_a != nullptr && ptr_a == ptr_b) {
+        ret = true;
+    } else {
+        ret = false;
+    }
+    return ret;
+}
+
+CNN CNN::clone()
+{
+    CNN cnn = *this;
+    for (U32 i = 0; i < cnn.ops.size(); i++) {
+        cnn.ops[i] = cnn.ops[i]->clone();
+        cnn.operatorMap[cnn.ops[i]->get_name()] = cnn.ops[i];
+    }
+    for (auto &tensor : cnn.tensorMap) {
+        std::shared_ptr<Tensor> cloneTensor = std::shared_ptr<Tensor>(new Tensor());
+        *cloneTensor = tensor.second->clone(false);
+        tensor.second = cloneTensor;
+    }
+    cnn.assign_output_tensor();
+    cnn.tmpTensor = this->tmpTensor.clone();
+    for (auto &operatorTensor : cnn.operatorTensorMap) {
+        std::string operatorName = operatorTensor.first;
+        std::vector<std::vector<Tensor>> tensors(operatorTensor.second.size());
+        for (U32 i = 0; i < operatorTensor.second.size(); i++) {
+            for (U32 j = 0; j < operatorTensor.second[i].size(); j++) {
+                std::string tensorName = operatorTensor.second[i][j];
+                if (cnn.weightOpOutputNames.find(tensorName) != cnn.weightOpOutputNames.end()) {
+                    cnn.tensorMap[tensorName] = this->tensorMap[tensorName];
+                }
+                tensors[i].push_back(*(cnn.tensorMap[tensorName].get()));
+            }
+        }
+        cnn.operatorMap[operatorName]->set_input_output_tensors(tensors[0], tensors[1]);
+        cnn.operatorMap[operatorName]->set_tmp_memory(cnn.tmpTensor);
+    }
+    for (auto &tensor : cnn.inputTensors) {
+        tensor.second = cnn.tensorMap[tensor.first];
+    }
+    for (auto &tensor : cnn.outputTensors) {
+        tensor.second = cnn.tensorMap[tensor.first];
+    }
+
+    // check
+    CHECK_REQUIREMENT(!is_same_tensor(this->tmpTensor, cnn.tmpTensor));
+    for (U32 i = 0; i < this->storageMemory.size(); i++) {
+        CHECK_REQUIREMENT(
+            !is_same_tensor(*(this->storageMemory[i].get()), *(cnn.storageMemory[i].get())));
+    }
+    for (auto iter : this->tensorMap) {
+        if (cnn.weightOpOutputNames.find(iter.first) == cnn.weightOpOutputNames.end()) {
+            CHECK_REQUIREMENT(
+                !is_same_tensor(*(iter.second.get()), *(cnn.tensorMap[iter.first].get())));
+        }
+    }
+    for (auto iter : this->inputTensors) {
+        CHECK_REQUIREMENT(
+            !is_same_tensor(*(iter.second.get()), *(cnn.inputTensors[iter.first].get())));
+    }
+    for (auto iter : this->outputTensors) {
+        CHECK_REQUIREMENT(
+            !is_same_tensor(*(iter.second.get()), *(cnn.outputTensors[iter.first].get())));
+    }
+    for (auto iter : this->operatorMap) {
+        std::shared_ptr<Operator> op1 = iter.second;
+        std::shared_ptr<Operator> op2 = cnn.operatorMap[iter.first];
+        for (int i = 0; i < 2; i++) {
+            std::vector<std::string> names = this->operatorTensorMap[iter.first][i];
+            std::vector<Tensor> tensor1, tensor2;
+            if (i == 0) {
+                tensor1 = op1->get_input_tensors();
+                tensor2 = op2->get_input_tensors();
+            } else {
+                tensor1 = op1->get_output_tensors();
+                tensor2 = op2->get_output_tensors();
+            }
+            CHECK_REQUIREMENT(tensor1.size() == tensor2.size());
+            for (U32 j = 0; j < tensor1.size(); j++) {
+                if (tensor1[j].bytes() != 0) {
+                    CHECK_REQUIREMENT(
+                        is_same_tensor(tensor1[j], *(this->tensorMap[names[j]].get())));
+                    CHECK_REQUIREMENT(is_same_tensor(tensor2[j], *(cnn.tensorMap[names[j]].get())));
+                    if (this->weightOpOutputNames.find(names[j]) == this->weightOpOutputNames.end()) {
+                        CHECK_REQUIREMENT(!is_same_tensor(tensor1[j], tensor2[j]));
+                    }
+                }
+            }
+        }
+    }
+    return cnn;
+}
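CNN::clone above deep-copies operators and activation tensors but deliberately shares weight tensors (names in weightOpOutputNames keep pointing at the original's storage), which keeps per-clone memory low; the CHECK_REQUIREMENT block then asserts exactly that aliasing contract. A hedged sketch of the intended use, running a clone on its own thread (the threading wrapper is an assumption, not in the patch):

    #include <thread>

    CNN worker = engine.clone();  // 'engine' is an already-prepared CNN
    std::thread t([&worker]() {
        worker.run();             // private activations, shared weights
    });
    t.join();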
+void CNN::sort_operators_sequential(const ModelSpec *ms)
+{
+    int opNum = ms->num_operator_specs;
+    this->sortedOps.clear();
+    for (int i = 0; i < opNum; i++) {
+        std::string opName = ms->ops[i].name;
+        if (opName.compare("data") == 0) {
+            continue;
+        }
+        this->sortedOps.push_back(opName);
+    }
+}
+
+void CNN::initialize_ops(const ModelSpec *ms)
+{
+    int opNum = ms->num_operator_specs;
+
+    for (int i = 0; i < ms->num_inputs; i++) {
+        this->modelInputTensorNames.push_back(ms->input_names[i]);
+        this->modelInputTensorDescs.push_back(ms->input_dims[i]);
+    }
+    for (int i = 0; i < ms->num_outputs; i++) {
+        this->modelOutputTensorNames.push_back(ms->output_names[i]);
+    }
+
+    U32 operatorIndex = 0;
+    std::map<std::string, U32> operatorIndexMap;
+    for (int i = 0; i < opNum; i++) {
+        OperatorSpec curOps = ms->ops[i];
+        std::string opName = curOps.name;
+        if (opName.compare("data") == 0) {
+            continue;
+        }
+        operatorIndexMap[opName] = operatorIndex++;
+    }
+
+    for (int i = 0; i < opNum; i++) {
+        OperatorSpec curOps = ms->ops[i];
+        std::string opName = curOps.name;
+        if (opName.compare("data") == 0) {
+            continue;
+        }
+        std::vector<std::string> inputTensorsName;
+        std::vector<std::string> outputTensorsName;
+        int inputTensorsNum = curOps.num_inputs;
+        for (int j = 0; j < inputTensorsNum; j++) {
+            inputTensorsName.push_back(curOps.input_tensors_name[j]);
+        }
+
+        int outputTensorsNum = curOps.num_outputs;
+        for (int j = 0; j < outputTensorsNum; j++) {
+            outputTensorsName.push_back(curOps.output_tensors_name[j]);
+        }
+
+        int numTensors = inputTensorsNum + outputTensorsNum;
+        std::vector<I32> tensorPositions(numTensors);
+        memcpy(tensorPositions.data(), curOps.tensor_positions, numTensors * bytesOf(DT_I32));
+        // create op object
+        std::shared_ptr<Factory> factory;
+        if (this->deviceInfo.schedule == MALI) {
+#ifdef _USE_MALI
+            auto factory_ocl = (Factory *)(new FactoryOCL());
+            factory = std::shared_ptr<Factory>(factory_ocl);
+            for (int j = 0; j < outputTensorsNum; j++) {
+                auto curOutputTensorName = outputTensorsName[j];
+                for (auto modelOutputTensorName : modelOutputTensorNames) {
+                    if (modelOutputTensorName == curOutputTensorName) {
+                        tensorPositions[j + inputTensorsNum] = -1;
+                    }
+                }
+            }
+#endif
+        } else {
+            auto factory_cpu = (Factory *)(new FactoryCPU());
+            factory = std::shared_ptr<Factory>(factory_cpu);
+        }
+        std::shared_ptr<Operator> op = factory->createOperators(curOps, this->dt, operatorIndexMap,
+            &this->tensorMap, inputTensorsName, outputTensorsName, &weightOpOutputNames);
+        op->set_name(opName);
+        op->set_schedule(this->deviceInfo.schedule);
+        op->set_tensor_positions(tensorPositions);
+        op->init_feature_scale(curOps.num_quant_feature, curOps.feature_scale);
+        op->set_algorithm_map(this->algorithmMap);
+        this->ops.push_back(op);
+
+        // setup operatorMap, tensorMap, operatorTensorMap
+        this->add(op, inputTensorsName, outputTensorsName);
+    }
+
+    // setup WeightSpec ptr in WeightOperator
+    for (int i = 0; i < ms->num_weight_specs; i++) {
+        WeightSpec curOpWs = ms->ws[i];
+        std::string opName = curOpWs.op_name;
+        auto op = this->operatorMap[opName];
+        auto weightOp = dynamic_cast<WeightOperator *>(op.get());
+        weightOp->set_weightspec_ptr(curOpWs);
+        if (curOpWs.bytes_of_vec != 0) {
+            CHECK_REQUIREMENT(curOpWs.vec != nullptr);
+            weightOp->set_hasBias(true);
+        }
+        // These two pointers will be managed by engine via shared_ptr, so mt_destroy_model should not free them
+        ms->ws[i].weight = nullptr;
+        ms->ws[i].vec = nullptr;
+    }
+}
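In initialize_ops above, each operator's tensor_positions are copied verbatim from the model file: a non-negative value selects a shared storage slot, while -1 requests a standalone allocation (assign_output_tensor further below implements this, and the MALI branch forces model outputs to -1 so they can receive mapped GPU memory). An illustrative assignment (values hypothetical):

    // positions for an op with one input and one output:
    //   input reuses storage slot 0, output is allocated standalone
    std::vector<I32> tensorPositions = {0, -1};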
+void CNN::ready(std::map<std::string, TensorDesc> inputDescMap)
+{
+    UNI_DEBUG_LOG("ready() schedule: %d\n", (int)(this->deviceInfo.schedule));
+    UNI_PROFILE(
+        {
+            this->infer_output_tensors_size(inputDescMap);
+            // handle the weight ops
+            for (auto op : this->ops) {
+                UNI_DEBUG_LOG("ready() op: %s init weight and infer forward algorithm\n",
+                    op->get_name().c_str());
+                if (op->is_weight()) {
+                    auto weightOpPtr = dynamic_cast<WeightOperator *>(op.get());
+                    CHECK_STATUS(weightOpPtr->init_weight_bias_from_model(nullptr));
+                }
+                CHECK_STATUS(op->infer_forward_algorithm(this->algorithmMap));
+            }
+
+            this->tmpTensor = *(this->allocate_tensor().get());
+            this->infer_tmp_memory_size();
+            this->assign_tmp_tensor();
+            // transform filter
+            for (auto op : this->ops) {
+                UNI_DEBUG_LOG("ready() op: %s transform filter\n", op->get_name().c_str());
+                if (op->is_weight()) {
+                    auto weightOpPtr = dynamic_cast<WeightOperator *>(op.get());
+                    CHECK_STATUS(weightOpPtr->transform_filter());
+                }
+            }
+            this->infer_tmp_memory_size();
+            this->tmpTensor.alloc();
+            this->assign_output_tensor();
+        },
+        std::string("ready"), std::string("prepare"));
+}
+
+void CNN::reready(std::map<std::string, TensorDesc> inputDescMap)
+{
+    this->infer_output_tensors_size(inputDescMap);
+    if (this->memoryTracker.getMemoryNeedAssign()) {
+        this->assign_output_tensor();
+    }
+    this->infer_tmp_memory_size();
+    this->tmpTensor.alloc();
+}
+
+EE CNN::mark_input_output()
+{
+    this->inputTensors.clear();
+    for (U32 i = 0; i < this->modelInputTensorNames.size(); i++) {
+        std::string str = this->modelInputTensorNames[i];
+        if (tensorMap.find(str) != tensorMap.end()) {
+            inputTensors[str] = tensorMap[str];
+        } else {
+            return NOT_MATCH;
+        }
+    }
+    this->outputTensors.clear();
+    for (U32 i = 0; i < this->modelOutputTensorNames.size(); i++) {
+        std::string str = this->modelOutputTensorNames[i];
+        if (tensorMap.find(str) != tensorMap.end()) {
+            outputTensors[str] = tensorMap[str];
+        } else {
+            return NOT_MATCH;
+        }
+    }
+    return SUCCESS;
+}
+
+void CNN::copy_to_named_input(std::string inputName, const U8 *data)
+{
+    if (inputTensors.find(inputName) == inputTensors.end()) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+    auto tensorPtr = this->inputTensors[inputName];
+    Tensor input;
+    input.resize(tensorPtr->get_desc());
+    std::shared_ptr<U8> shared_data((U8 *)data, [](U8 *ptr) {});
+    ((CpuMemory *)(input.get_memory()))->set_shared_ptr(shared_data);
+    tensorPtr->copy_from(&input);
+}
+
+void CNN::set_input_tensors_value(std::map<std::string, std::shared_ptr<U8>> modelTensorsInput)
+{
+    for (auto &modelTensorInput : modelTensorsInput) {
+        std::string inputName = modelTensorInput.first;
+        std::shared_ptr<U8> data = modelTensorInput.second;
+        if (inputTensors.find(inputName) == inputTensors.end()) {
+            CHECK_STATUS(NOT_MATCH);
+        }
+        auto tensorPtr = this->inputTensors[inputName];
+        Tensor input;
+        input.resize(tensorPtr->get_desc());
+        ((CpuMemory *)(input.get_memory()))->set_shared_ptr(data);
+        tensorPtr->reuse(&input);
+    }
+}
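copy_to_named_input above copies the caller's bytes into the engine-owned input tensor, while set_input_tensors_value adopts the caller's buffer without a copy. A hedged sketch of the two feeding paths (tensor name and shape are hypothetical):

    TensorDesc desc = tensor4df(DT_F32, DF_NCHW, 1, 3, 224, 224);

    // copying path: the engine keeps its own storage
    std::vector<U8> pixels(tensorNumBytes(desc));
    engine.copy_to_named_input("input", pixels.data());

    // zero-copy path: the engine reuses the caller's allocation
    std::shared_ptr<U8> shared(new U8[tensorNumBytes(desc)], [](U8 *p) { delete[] p; });
    engine.set_input_tensors_value({{"input", shared}});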
+std::map<std::string, std::shared_ptr<Tensor>> CNN::get_inputs()
+{
+    std::map<std::string, std::shared_ptr<Tensor>> ret;
+    if (this->deviceInfo.schedule == MALI) {
+#ifdef _USE_MALI
+        for (U32 i = 0; i < modelInputTensorNames.size(); i++) {
+            std::shared_ptr<Tensor> tmpTensorCPU(new Tensor());
+            tmpTensorCPU->resize(modelInputTensorDescs[i]);
+            tmpTensorCPU->alloc();
+            auto p = std::pair<std::string, std::shared_ptr<Tensor>>(
+                modelInputTensorNames[i], tmpTensorCPU);
+            ret.insert(p);
+        }
+#endif
+    } else {
+        ret = this->inputTensors;
+    }
+    return ret;
+}
+
+std::map<std::string, std::shared_ptr<Tensor>> CNN::get_outputs()
+{
+    return this->outputTensors;
+}
+
+Tensor CNN::get_tensor_by_name(std::string tensorName)
+{
+    if (this->tensorMap.find(tensorName) == this->tensorMap.end()) {
+        CHECK_STATUS(NOT_MATCH);
+    }
+    return *(this->tensorMap[tensorName].get());
+}
+
+TensorDesc CNN::get_tensor_desc_by_name(std::string tensorName)
+{
+    TensorDesc desc = tensor4d(DT_U8, 0, 0, 0, 0);
+    if (this->tensorMap.find(tensorName) != this->tensorMap.end()) {
+        desc = this->tensorMap[tensorName]->get_desc();
+    }
+    return desc;
+}
+
+std::vector<std::string> CNN::get_model_input_tensor_names()
+{
+    return this->modelInputTensorNames;
+}
+
+std::vector<TensorDesc> CNN::get_model_input_tensor_descs()
+{
+    return this->modelInputTensorDescs;
+}
+
+std::vector<std::string> CNN::get_model_output_tensor_names()
+{
+    return this->modelOutputTensorNames;
+}
+
+EE CNN::infer_output_tensors_size(std::map<std::string, TensorDesc> inputDescMap)
+{
+    this->set_input_tensors_desc(inputDescMap);
+    for (auto iter : inputDescMap) {
+        UNI_DEBUG_LOG("infer_output_tensors_size() model input: %s desc %s\n", iter.first.c_str(),
+            tensorDesc2Str(iter.second).c_str());
+    }
+    this->infer_layout_desc();
+    this->update_op_tensors();
+    return SUCCESS;
+}
+
+void CNN::assign_output_tensor()
+{
+    this->storageMemory.clear();
+    auto storageSize = this->memoryTracker.getStorageSize();
+    for (U32 size : storageSize) {
+        auto tensor = this->allocate_tensor(size);
+        this->storageMemory.push_back(tensor);
+    }
+
+    std::set<std::string> input_set(modelInputTensorNames.begin(), modelInputTensorNames.end());
+    std::set<std::string> output_set(modelOutputTensorNames.begin(), modelOutputTensorNames.end());
+    for (std::string opName : this->sortedOps) {
+        std::shared_ptr<Operator> op = this->operatorMap[opName];
+        std::vector<I32> tensorPositions = op->get_tensor_positions();
+        std::vector<std::vector<Tensor>> tensors(this->operatorTensorMap[opName].size());
+        for (U32 i = 0, tensorIter = 0; i < this->operatorTensorMap[opName].size(); i++) {
+            std::vector<std::string> &tensorNames = this->operatorTensorMap[opName][i];
+            for (std::string &tensorName : tensorNames) {
+                UNI_DEBUG_LOG("assign_output_tensor() tensor %s slot %d\n", tensorName.c_str(),
+                    tensorPositions[tensorIter]);
+                auto tensor = this->tensorMap[tensorName];
+                if (i == 1 || input_set.find(tensorName) != input_set.end()) {
+                    if (tensorPositions[tensorIter] != -1) {
+                        auto mem = this->storageMemory[tensorPositions[tensorIter]].get();
+                        tensor->reuse(mem);
+                    } else if (this->weightOpOutputNames.find(tensorName) ==
+                        this->weightOpOutputNames.end()) {
+                        if (this->deviceInfo.schedule == MALI &&
+                            output_set.find(tensorName) != output_set.end()) {
+#ifdef _USE_MALI
+                            auto mem = (OclMemory *)tensor->get_memory();
+                            mem->mapped_alloc();
+#endif
+                        } else {
+                            tensor->alloc();
+                        }
+                    }
+                }
+                tensorIter++;
+                tensors[i].push_back(*(tensor.get()));
+            }
+        }
+        op->set_input_output_tensors(tensors[0], tensors[1]);
+    }
+    this->memoryTracker.setMemoryAssigned();
+}
+
+void CNN::run()
+{
+    for (U32 opIndex = 0; opIndex < ops.size();) {
+        std::shared_ptr<Operator> op = this->ops[opIndex];
+        UNI_DEBUG_LOG(
+            "run() op: %s type: %s\n", op->get_name().c_str(), OperatorTypeName()[op->get_type()]);
+        if (op->get_type() == OT_Repeat || op->get_type() == OT_Jump) {
+            opIndex = op->get_next_operator_index();
+        } else {
+            UNI_PROFILE(op->run(), op->get_name(),
+                std::string(OperatorTypeName()[op->get_type()]) + std::string("::run"));
+            opIndex++;
+        }
+#ifdef _DEBUG
+        std::vector<Tensor> outputTensors = op->get_output_tensors();
+        for (U32 i = 0; i < outputTensors.size(); i++) {
+            Tensor outputTensor = outputTensors[i];
+            std::string line = outputTensor.string(32);
+            UNI_DEBUG_LOG("    output: %s\n", line.c_str());
+        }
+#endif
+    }
+}
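CNN::run above advances linearly through the sorted operator list, except that Repeat and Jump operators return the index of the next operator to execute, which is how loop topologies run on a flat list. After run() completes, any tensor can be inspected by name via the getters further above; a hedged debugging sketch (the tensor name is hypothetical, and string(32) mirrors the debug path in run()):

    engine.run();
    Tensor t = engine.get_tensor_by_name("conv1_out");
    UNI_DEBUG_LOG("conv1_out desc %s values %s\n",
        tensorDesc2Str(t.get_desc()).c_str(), t.string(32).c_str());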
std::string operatorName = op->get_name(); + this->operatorMap[operatorName] = op; + + if (this->operatorTensorMap.find(operatorName) == this->operatorTensorMap.end()) { + this->operatorTensorMap[operatorName] = {inputTensorsName, outputTensorsName}; + } else { + UNI_ERROR_LOG("duplicate operator name: %s\n", operatorName.c_str()); + } + + for (std::string &inputName : inputTensorsName) { + if (this->tensorMap.find(inputName) == this->tensorMap.end()) { + this->tensorMap[inputName] = this->allocate_tensor(); + } + } + + for (std::string &outputName : outputTensorsName) { + if (this->tensorMap.find(outputName) == this->tensorMap.end()) { + this->tensorMap[outputName] = this->allocate_tensor(); + } + } +} + +void CNN::infer_layout_desc() +{ + for (std::string opName : this->sortedOps) { + auto op = this->operatorMap[opName]; + UNI_DEBUG_LOG("op: %s type: %s\n", opName.c_str(), OperatorTypeName()[op->get_type()]); + std::vector<std::string> curOpInputTensorName = this->operatorTensorMap[opName][0]; + std::vector<std::string> curOpOutputTensorName = this->operatorTensorMap[opName][1]; + std::vector<Tensor *> inputTensors; + std::vector<Tensor *> outputTensors; + for (std::string inputTensorName : curOpInputTensorName) { + auto tensor = this->tensorMap[inputTensorName].get(); + inputTensors.push_back(tensor); + UNI_DEBUG_LOG(" input: %s desc %s\n", inputTensorName.c_str(), + tensorDesc2Str(tensor->get_desc()).c_str()); + } + for (std::string outputTensorName : curOpOutputTensorName) { + auto tensor = this->tensorMap[outputTensorName].get(); + outputTensors.push_back(tensor); + } + CHECK_STATUS(op->infer_output_tensors_size(inputTensors, outputTensors)); + for (std::string outputTensorName : curOpOutputTensorName) { + UNI_DEBUG_LOG(" output: %s desc %s\n", outputTensorName.c_str(), + tensorDesc2Str(this->tensorMap[outputTensorName]->get_desc()).c_str()); + } + } +} + +void CNN::update_op_tensors() +{ + for (auto opName : this->sortedOps) { + auto op = this->operatorMap[opName]; + UNI_DEBUG_LOG("update_op_tensors() op: %s type: %s\n", opName.c_str(), + OperatorTypeName()[op->get_type()]); + std::vector<std::string> curOpInputTensorName = this->operatorTensorMap[opName][0]; + std::vector<std::string> curOpOutputTensorName = this->operatorTensorMap[opName][1]; + std::vector<Tensor> inTensors, outTensors; + for (std::string inputTensorName : curOpInputTensorName) { + auto tensorTmp = this->tensorMap[inputTensorName]; + inTensors.push_back(*tensorTmp.get()); + } + + for (std::string outputTensorName : curOpOutputTensorName) { + auto tensorTmp = this->tensorMap[outputTensorName]; + outTensors.push_back(*tensorTmp.get()); + } + op->set_input_output_tensors(inTensors, outTensors); + + curOpInputTensorName.insert( + curOpInputTensorName.end(), curOpOutputTensorName.begin(), curOpOutputTensorName.end()); + memoryTracker.trackOpTensorSizes(op, curOpInputTensorName); + } + check_memory_reuse_ratio(); +} + +void CNN::set_input_tensors_desc(std::map<std::string, TensorDesc> inputDescMap) +{ + for (auto iter : inputDescMap) { + if (tensorMap.find(iter.first) == tensorMap.end()) { + UNI_WARNING_LOG("Unused model input node: %s\n", iter.first.c_str()); + continue; + } + TensorDesc desc = iter.second; + (this->tensorMap[iter.first].get())->resize(desc); + } +} + +void CNN::infer_tmp_memory_size() +{ + U32 tmpSize = this->tmpTensor.bytes(); + // input data format transform tmp buffer + if (this->deviceInfo.schedule == MALI) { + for (auto desc : modelInputTensorDescs) { + tmpSize = UNI_MAX(tmpSize, tensorNumBytes(desc)); + } + } + + // operator tmp buffer + for (auto &op : this->ops) { + auto len = 
op->infer_tmp_memory_size(); + tmpSize = UNI_MAX(tmpSize, len); + } + this->tmpTensor.resize(tensor1d(DT_U8, tmpSize)); +} + +void CNN::assign_tmp_tensor() +{ + this->tmpTensor.alloc(); + for (auto &op : this->ops) { + op->set_tmp_memory(this->tmpTensor); + } +} + +void CNN::check_memory_reuse_ratio() +{ + U32 originalSize = 0; + U32 standaloneSize = 0; + for (auto tensor : this->tensorMap) { + U32 tensorSize = tensor.second->bytes(); + originalSize += tensorSize; + if (weightOpOutputNames.find(tensor.first) != weightOpOutputNames.end()) { + standaloneSize += tensorSize; + } + } + UNI_DEBUG_LOG("tensor memory: originally %d tensors take %u bytes.\n", + (int)this->tensorMap.size(), originalSize); + UNI_DEBUG_LOG("tensor memory: now %u tensors take %u bytes, and %u bytes are reserved " + "for standalone tensors (e.g. loop topology). reuse rate: %f\n", + this->memoryTracker.getNumSlots(), this->memoryTracker.getSizeSum(), standaloneSize, + (F32)originalSize / (this->memoryTracker.getSizeSum() + standaloneSize)); +} diff --git a/inference/engine/src/data_loader.cpp b/inference/engine/src/data_loader.cpp new file mode 100644 index 00000000..8750162f --- /dev/null +++ b/inference/engine/src/data_loader.cpp @@ -0,0 +1,377 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
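The loader below dispatches on file extension (.txt, .seq, .bin, .jpg) and falls back to all-ones fake data when no data directory is given. As a minimal usage sketch of that entry point (the directory path and the 1x3x224x224 descriptor are illustrative values, not part of this patch):

    #include "data_loader.hpp"

    void example_load()
    {
        // one descriptor per model input; load_data() fills one Tensor per descriptor per file
        std::vector<TensorDesc> descs = {tensor4df(DT_F32, DF_NCHW, 1, 3, 224, 224)};
        std::vector<std::vector<Tensor>> samples;
        std::vector<std::string> paths = load_data("./testdata", descs, &samples);
    }
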
+ +#ifdef _BUILD_TEST + +#include <cstring> +#include <cstdlib> +#include <string> +#include <vector> +#include <algorithm> +#include <dirent.h> +#include <jpeglib.h> + +#include "image_processing.hpp" +#include "data_loader.hpp" + +template <typename T> +void init_one(U8 *memory, U32 len) +{ + T *data = (T *)memory; + for (U32 i = 0; i < len; i++) { + data[i] = 1; + } +} + +template <typename T> +void init_rand(U8 *memory, U32 len) +{ + T *data = (T *)memory; + for (U32 i = 0; i < len; i++) { + data[i] = (rand() % 1024) / (T)1024.0 - (T)0.5; + } +} + +void get_files(std::string directoryName, std::vector<std::string> &files) +{ + if (directoryName.empty()) { + UNI_ERROR_LOG("data directory name is empty\n"); + } + DIR *directory = opendir(directoryName.c_str()); + if (NULL == directory) { + UNI_ERROR_LOG("failed to open directory %s\n", directoryName.c_str()); + } + struct dirent *file; + while ((file = readdir(directory)) != NULL) { + if (strcmp(file->d_name, ".") == 0 || strcmp(file->d_name, "..") == 0) { + continue; + } + if (file->d_type == DT_DIR) { + continue; + } else { + files.push_back(directoryName + "/" + file->d_name); + } + } + closedir(directory); +} + +std::vector<Tensor> load_jpeg( + std::string dataPath, std::vector<TensorDesc> imageDesc, ImageFormat ImageFormat, F32 scaleValue) +{ + FILE *file = fopen(dataPath.c_str(), "rb"); + CHECK_REQUIREMENT(NULL != file); + + struct jpeg_decompress_struct info; + struct jpeg_error_mgr err; + + info.err = jpeg_std_error(&err); + jpeg_create_decompress(&info); + + jpeg_stdio_src(&info, file); + jpeg_read_header(&info, TRUE); + + jpeg_start_decompress(&info); + + U32 width = info.output_width; + U32 height = info.output_height; + U32 numChannels = info.output_components; + U32 dataSize = numChannels * height * width; + + UNI_DEBUG_LOG("%s: channels %u , out color space %d\n", dataPath.c_str(), numChannels, + info.out_color_space); + CHECK_REQUIREMENT(2 == info.out_color_space); // only RGB is supported for now + + U8 *data = (U8 *)malloc(dataSize); + JSAMPROW row_pointer[1]; + while (info.output_scanline < info.output_height) { + row_pointer[0] = data + info.output_scanline * width * numChannels; + int ret = jpeg_read_scanlines(&info, row_pointer, 1); + CHECK_REQUIREMENT(ret == 1); + } + + jpeg_finish_decompress(&info); + jpeg_destroy_decompress(&info); + fclose(file); + + TensorDesc rgbDesc = tensor4df(DT_U8, DF_RGB, 1, 3, height, width); + Tensor rgbTensor = Tensor::alloc_sized(rgbDesc); + U8 *rgb = (U8 *)((CpuMemory *)(rgbTensor.get_memory()))->get_ptr(); + U8 *r = rgb; + U8 *g = r + height * width; + U8 *b = g + height * width; + + U8 *dataMov = data; + for (U32 i = 0; i < height * width; i++) { + r[i] = dataMov[0]; + g[i] = dataMov[1]; + b[i] = dataMov[2]; + dataMov += numChannels; + } + free(data); + + std::shared_ptr<Tensor> imageTensor = + load_resize_image(rgbTensor, imageDesc[0], ImageFormat, scaleValue); + std::vector<Tensor> result; + imageTensor->resize(imageDesc[0]); + result.push_back(*imageTensor.get()); + return result; +} + +std::vector<Tensor> load_fake_data(std::vector<TensorDesc> dataDesc) +{ + std::vector<Tensor> result; + for (U32 index = 0; index < dataDesc.size(); index++) { + Tensor tensor = Tensor::alloc_sized(dataDesc[index]); + U8 *ptr = (U8 *)((CpuMemory *)(tensor.get_memory()))->get_ptr(); + switch (dataDesc[index].dt) { + case DT_F32: { + init_one<F32>(ptr, tensorNumElements(dataDesc[index])); + break; + } +#ifdef __aarch64__ + case DT_F16: { + init_one<F16>(ptr, tensorNumElements(dataDesc[index])); + break; + } +#endif + case DT_U32: { + init_one<U32>(ptr, tensorNumElements(dataDesc[index])); + break; + } + case DT_I32: { + init_one<I32>(ptr, tensorNumElements(dataDesc[index])); + break; + } + default: + 
CHECK_STATUS(NOT_SUPPORTED); + break; + } + result.push_back(tensor); + } + return result; +} + +Tensor fscanfReadData(FILE *f, TensorDesc desc) +{ + Tensor tensor = Tensor::alloc_sized(desc); + U32 size = tensor.length(); + DataType dataType = desc.dt; + auto ptr = ((CpuMemory *)(tensor.get_memory()))->get_ptr(); + switch (dataType) { + case DT_F32: { + F32 *dataPtr = (F32 *)ptr; + for (U32 i = 0; i < size; i++) { + fscanf(f, "%f", dataPtr + i); + } + break; + } +#ifdef __aarch64__ + case DT_F16: { + F16 *dataPtr = (F16 *)ptr; + F32 value; + for (U32 i = 0; i < size; i++) { + fscanf(f, "%f", &value); + dataPtr[i] = (F16)value; + } + break; + } +#endif + case DT_U32: { + U32 *dataPtr = (U32 *)ptr; + for (U32 i = 0; i < size; i++) { + fscanf(f, "%u", dataPtr + i); + } + break; + } + case DT_I32: { + I32 *dataPtr = (I32 *)ptr; + for (U32 i = 0; i < size; i++) { + fscanf(f, "%d", dataPtr + i); + } + break; + } + default: + CHECK_STATUS(NOT_SUPPORTED); + break; + } + return tensor; +} + +std::vector<Tensor> load_txt(std::string dataPath, std::vector<TensorDesc> dataDesc) +{ + std::vector<Tensor> result; + FILE *f = fopen(dataPath.c_str(), "r"); + CHECK_REQUIREMENT(f != nullptr); + for (U32 index = 0; index < dataDesc.size(); index++) { + result.push_back(fscanfReadData(f, dataDesc[index])); + } + fclose(f); + return result; +} + +std::vector<Tensor> load_seq(std::string dataPath, std::vector<TensorDesc> dataDesc) +{ + std::vector<Tensor> result; + FILE *f = fopen(dataPath.c_str(), "r"); + CHECK_REQUIREMENT(f != nullptr); + for (U32 index = 0; index < dataDesc.size(); index++) { + U32 sequenceLen = 0; + fscanf(f, "%u", &sequenceLen); + TensorDesc sequenceDesc = dataDesc[index]; + sequenceDesc.dims[0] = sequenceLen; + for (U32 j = 1; j < sequenceDesc.nDims; j++) { + sequenceDesc.dims[j] = 1; + } + + result.push_back(fscanfReadData(f, sequenceDesc)); + } + fclose(f); + return result; +} + +std::vector<Tensor> load_bin( + std::string dataPath, std::vector<DataType> sourceDataType, std::vector<TensorDesc> dataDesc) +{ + std::vector<Tensor> result; + FILE *f = fopen(dataPath.c_str(), "rb"); + if (nullptr == f) { + result = load_fake_data(dataDesc); + } else { + for (U32 index = 0; index < dataDesc.size(); index++) { + TensorDesc sourceDesc = dataDesc[index]; + sourceDesc.dt = sourceDataType[index]; + Tensor tensor = Tensor::alloc_sized(sourceDesc); + U32 len = tensor.length(); + auto ptr = ((CpuMemory *)(tensor.get_memory()))->get_ptr(); + U32 readLength = fread(ptr, bytesOf(sourceDataType[index]), len, f); + CHECK_REQUIREMENT(len == readLength); + if (sourceDataType[index] != dataDesc[index].dt) { + Tensor transform_tensor = Tensor::alloc_sized(dataDesc[index]); + if (0) { +#ifdef __aarch64__ + } else if (sourceDataType[index] == DT_F32 && dataDesc[index].dt == DT_F16) { + F32 *ptr1 = (F32 *)ptr; + F16 *ptr2 = (F16 *)((CpuMemory *)(transform_tensor.get_memory()))->get_ptr(); + for (U32 i = 0; i < len; i++) { + ptr2[i] = (F16)ptr1[i]; + } +#endif + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + result.push_back(transform_tensor); + } else { + result.push_back(tensor); + } + } + fclose(f); + } + return result; +} + +int string_end_with(std::string s, std::string sub) +{ + std::transform(s.begin(), s.end(), s.begin(), ::tolower); + std::transform(sub.begin(), sub.end(), sub.begin(), ::tolower); + return s.rfind(sub) == (s.length() - sub.length()) ? 
1 : 0; +} + +std::vector<std::string> load_data(std::string directoryPath, + std::vector<TensorDesc> dataDesc, + std::vector<std::vector<Tensor>> *datas) +{ + std::vector<std::string> dataPaths; + if (directoryPath == "") { + std::vector<Tensor> data = load_fake_data(dataDesc); + (*datas).push_back(data); + dataPaths.push_back("fake data"); + return dataPaths; + } + + std::vector<std::string> paths; + get_files(directoryPath, paths); + std::vector<Tensor> data; + for (U32 i = 0; i < paths.size(); i++) { + std::string dataPath = paths[i]; + if (string_end_with(dataPath, ".txt")) { + data = load_txt(dataPath, dataDesc); + } else if (string_end_with(dataPath, ".seq")) { + data = load_seq(dataPath, dataDesc); + } else { + UNI_ERROR_LOG("cannot load data %s\n", dataPath.c_str()); + } + (*datas).push_back(data); + dataPaths.push_back(dataPath); + } + return dataPaths; +} + +std::vector<std::string> load_image_with_scale(std::string directoryPath, + std::vector<TensorDesc> dataDesc, + std::vector<std::vector<Tensor>> *datas, + ImageFormat ImageFormat, + F32 scaleValue) +{ + std::vector<std::string> dataPaths; + if (directoryPath == "") { + std::vector<Tensor> data = load_fake_data(dataDesc); + (*datas).push_back(data); + dataPaths.push_back("fake data"); + return dataPaths; + } + + std::vector<std::string> paths; + get_files(directoryPath, paths); + std::vector<Tensor> data; + for (U32 i = 0; i < paths.size(); i++) { + std::string dataPath = paths[i]; + if (string_end_with(dataPath, ".jpg") || string_end_with(dataPath, ".jpeg")) { + data = load_jpeg(dataPath, dataDesc, ImageFormat, scaleValue); + } else if (string_end_with(dataPath, ".txt")) { + data = load_txt(dataPath, dataDesc); + } else { + UNI_ERROR_LOG("cannot load image data %s\n", dataPath.c_str()); + } + (*datas).push_back(data); + dataPaths.push_back(dataPath); + } + return dataPaths; +} + +std::vector<std::string> load_bin_with_type(std::string directoryPath, + std::vector<TensorDesc> dataDesc, + std::vector<std::vector<Tensor>> *datas, + std::vector<DataType> sourceDataType) +{ + std::vector<std::string> dataPaths; + if (directoryPath == "") { + std::vector<Tensor> data = load_fake_data(dataDesc); + (*datas).push_back(data); + dataPaths.push_back("fake data"); + return dataPaths; + } + + std::vector<std::string> paths; + get_files(directoryPath, paths); + std::vector<Tensor> data; + for (U32 i = 0; i < paths.size(); i++) { + std::string dataPath = paths[i]; + if (string_end_with(dataPath, ".bin")) { + data = load_bin(dataPath, sourceDataType, dataDesc); + (*datas).push_back(data); + dataPaths.push_back(dataPath); + } + } + return dataPaths; +} +#endif diff --git a/inference/engine/src/result_format.cpp b/inference/engine/src/result_format.cpp new file mode 100644 index 00000000..f214a6ad --- /dev/null +++ b/inference/engine/src/result_format.cpp @@ -0,0 +1,49 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include <algorithm> +#include "result_format.hpp" + +std::vector<U32> topK_index(U8 *res, TensorDesc desc, U32 topK) +{ + U32 len = tensorNumElements(desc); + std::vector<U32> index(len); + for (U32 i = 0; i < index.size(); i++) { + index[i] = i; + } + + switch (desc.dt) { +#ifdef __aarch64__ + case DT_F16: { + F16 *dataPtr = (F16 *)res; + sort(index.begin(), index.end(), + [&](const int &a, const int &b) { return (dataPtr[a] > dataPtr[b]); }); + break; + } +#endif + case DT_F32: { + F32 *dataPtr = (F32 *)res; + sort(index.begin(), index.end(), + [&](const int &a, const int &b) { return (dataPtr[a] > dataPtr[b]); }); + break; + } + default: + break; + } + + std::vector<U32>::const_iterator first = index.begin() + 0; + std::vector<U32>::const_iterator last = index.begin() + topK; + std::vector<U32> indexesTopK(first, last); + + return indexesTopK; +} diff --git a/inference/engine/tools/CMakeLists.txt b/inference/engine/tools/CMakeLists.txt new file mode 100644 index 00000000..c392e964 --- /dev/null +++ b/inference/engine/tools/CMakeLists.txt @@ -0,0 +1,25 @@ +cmake_minimum_required(VERSION 3.2) + +set_test_c_cxx_flags() + +if (BUILD_TEST) + engine_test(common_algo_search ./common_algo_search/common_algo_search.cpp) + install(TARGETS common_algo_search + RUNTIME DESTINATION tools) +endif (BUILD_TEST) +if (BUILD_TEST AND USE_INT8) + engine_test(ptq_calibration ./ptq_calibration/ptq_calibration.cpp) + install(TARGETS ptq_calibration + RUNTIME DESTINATION tools) +endif (BUILD_TEST AND USE_INT8) +if (USE_MALI) + engine_test(preprocess_ocl ./preprocess_ocl/preprocess_ocl.cpp) + install(TARGETS preprocess_ocl + RUNTIME DESTINATION tools) +endif (USE_MALI) +if (USE_TRAINING) + train_test(model_finetuner ./model_finetuner/model_finetuner.cpp) + TARGET_LINK_LIBRARIES(model_finetuner RaulLib) + install(TARGETS model_finetuner + RUNTIME DESTINATION tools) +endif (USE_TRAINING) diff --git a/inference/engine/tools/common_algo_search/common_algo_search.cpp b/inference/engine/tools/common_algo_search/common_algo_search.cpp new file mode 100644 index 00000000..50e1b01a --- /dev/null +++ b/inference/engine/tools/common_algo_search/common_algo_search.cpp @@ -0,0 +1,128 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "ut_util.h" +#include "tensor_computing.h" +#include "algorithm_map.h" +#include "parse_command.h" + +int convolutionCPUFloatAlgorithmSearch(Arch arch, DataType dt, std::string path) +{ + TensorDesc inputDesc, filterDesc, outputDesc; + ConvolutionPolicy policy = CONVOLUTION_TUNNING; + ActivationParamSpec activationDesc; + activationDesc.mode = ACTIVATION_RELU; + activationDesc.value[0] = 0; + ConvolutionParamSpec convParamSpec; + convParamSpec.dilatedRate_h = 1; + convParamSpec.dilatedRate_w = 1; + U32 in = 1; + ArchInfo archInfo; + archInfo.arch = UT_ARCH; + U32 ic_step, ihw_step, fn_step, ic_max, ihw_max, fn_max; + std::set<U32> fwh; + std::set<U32> stride; + std::string modelName = ""; + std::string deviceName = ""; + AlgorithmMap *algoMap = new AlgorithmMap(arch, modelName, deviceName, dt); + algoMap->getCommonAlgoMapPara( + &ic_step, &ihw_step, &fn_step, &ic_max, &ihw_max, &fn_max, &fwh, &stride); + for (auto sv : stride) { + for (auto fv : fwh) { + U32 pl = fv / 2; + U32 pr = (fv - 1) / 2; + U32 pt = fv / 2; + U32 pb = (fv - 1) / 2; + for (U32 fn = fn_step; fn <= fn_max; fn += fn_step) { + for (U32 ic = ic_step; ic <= ic_max; ic += ic_step) { + for (U32 ih = ihw_step; ih <= ihw_max; ih += ihw_step) { + for (U32 iw = ihw_step; iw <= ihw_max; iw += ihw_step) { + if (ic % 8 != 0) { + inputDesc = tensor4df(dt, DF_NCHW, in, ic, ih, iw); + } else { + inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); + } + convParamSpec.stride_h = sv; + convParamSpec.stride_w = sv; + convParamSpec.padding_left = pl; + convParamSpec.padding_right = pr; + convParamSpec.padding_top = pt; + convParamSpec.padding_bottom = pb; + filterDesc = tensor4df(dt, DF_NCHW, fn, ic, fv, fv); + Tensor inputTensor; + Tensor outputTensor; + Tensor filterTensor; + inputTensor.resize(inputDesc); + outputTensor.resize(outputDesc); + filterTensor.resize(filterDesc); + CHECK_STATUS(convolution_infer_output_size(&inputTensor, filterTensor, + convParamSpec, &outputTensor, dt, &archInfo)); + ConvolutionForwardAlgorithm algorithm = CONVOLUTION_ALGORITHM_NULL; + CHECK_STATUS(convolution_infer_forward_algorithm(inputTensor, + filterTensor, outputTensor, convParamSpec, policy, &algorithm, dt, + activationDesc, &archInfo)); + algoMap->setCommonAlgoInfoToMap(OT_Conv, dt, ic, ih, iw, fn, fv, fv, sv, + sv, (I32 *)(&algorithm), 1); + } + } + } + } + } + } + algoMap->saveAlgorithmMapToText(path); + delete algoMap; + return 0; +} + +int main(int argc, char *argv[]) +{ + std::string affinityPolicyName = "CPU_AFFINITY_HIGH_PERFORMANCE"; + std::string algorithmMapPath = "./"; + ParseRes parse_res; + parseCommandLine(argc, argv, &parse_res, "examples"); + if (parse_res.archInfo.second) { + affinityPolicyName = parse_res.archInfo.first; + } + if (parse_res.algoPath.second) { + algorithmMapPath = parse_res.algoPath.first; + } + AffinityPolicy affinityPolicy = thread_affinity_get_policy_by_name(affinityPolicyName.c_str()); + + if (affinityPolicyName == "CPU_AFFINITY_HIGH_PERFORMANCE" || + affinityPolicyName == "CPU_AFFINITY_LOW_POWER") { + Arch arch; +#ifndef _USE_IOS + DeviceInfo deviceInfo = get_cpu_info(affinityPolicy); + set_cpu_dynamic(&deviceInfo, 0); + arch = deviceInfo.schedule; +#else + arch = ARM_A76; +#endif +#ifdef _USE_FP16 + convolutionCPUFloatAlgorithmSearch(arch, 
DT_F16, algorithmMapPath); + +#endif +#ifdef _USE_FP32 + convolutionCPUFloatAlgorithmSearch(arch, DT_F32, algorithmMapPath); +#endif + } else if (affinityPolicyName == "GPU") { + UNI_ERROR_LOG("GPU is not supported yet\n"); + exit(-1); + } else { + UNI_ERROR_LOG("Unknown archInfo %s, please use " + "CPU_AFFINITY_HIGH_PERFORMANCE/CPU_AFFINITY_LOW_POWER/GPU\n", + affinityPolicyName.c_str()); + exit(-1); + } + return 0; +} diff --git a/inference/engine/tools/preprocess_ocl/CMakeLists.txt b/inference/engine/tools/preprocess_ocl/CMakeLists.txt new file mode 100644 index 00000000..329d60b5 --- /dev/null +++ b/inference/engine/tools/preprocess_ocl/CMakeLists.txt @@ -0,0 +1,10 @@ +cmake_minimum_required(VERSION 3.2) + +project(kernelbin) + +include_directories(${PROJECT_SOURCE_DIR}/include) +include_directories(${PROJECT_SOURCE_DIR}/extern) + +file(GLOB_RECURSE kernel_bin_src_list cpp/*.cpp) + +add_library(${PROJECT_NAME} SHARED ${kernel_bin_src_list}) diff --git a/inference/engine/tools/preprocess_ocl/build_preprocess_ocl.sh b/inference/engine/tools/preprocess_ocl/build_preprocess_ocl.sh new file mode 100644 index 00000000..290cb9d1 --- /dev/null +++ b/inference/engine/tools/preprocess_ocl/build_preprocess_ocl.sh @@ -0,0 +1,114 @@ +#Ensure your target device is connected to the host via adb +#Set your target device serial number here +device=GCL5T19822000030 + +#Set the location of the preprocess_ocl program on the host +preprocess_ocl=${BOLT_ROOT}/install_arm_llvm/tools/preprocess_ocl + +#Set the bolt models location on the device; put all bolt models that need preprocessing here +device_bolt_models=/data/local/tmp/preprocess_bolt_models + +#Set your work location on the device; make sure it is readable and writable. This script creates the folders automatically +device_work_local=/data/local/tmp/preprocess +device_algo_files=${device_work_local}/algoFiles +device_include=${device_work_local}/include +device_cpp=${device_work_local}/cpp + +host_work_local=$(pwd) +host_algo_files=${host_work_local}/algoFiles +host_include=${host_work_local}/include +host_cpp=${host_work_local}/cpp +host_extern=${host_work_local}/extern +host_lib=${host_work_local}/lib +host_build=${host_work_local}/build +rm -rf ${host_algo_files} ${host_include} ${host_cpp} +mkdir ${host_algo_files} ${host_include} ${host_cpp} + + +adb -s ${device} shell "rm -rf ${device_work_local}" +adb -s ${device} shell "mkdir ${device_work_local}" +adb -s ${device} shell "mkdir ${device_work_local}/lib" +adb -s ${device} shell "mkdir ${device_algo_files}" +adb -s ${device} shell "mkdir ${device_include}" +adb -s ${device} shell "mkdir ${device_cpp}" + +adb -s ${device} push ${preprocess_ocl} ${device_work_local} > /dev/null || exit 1 +for file in `ls ${BOLT_ROOT}/install_arm_llvm/lib/*.so` + do + adb -s ${device} push ${file} ${device_work_local}/lib > /dev/null || exit 1 + done + +echo "Running GPU preprocess on device ${device}" +adb -s ${device} shell "cd ${device_work_local} && chmod +x preprocess_ocl && export LD_LIBRARY_PATH=./lib && ./preprocess_ocl ${device_bolt_models} ${device_algo_files} ${device_include} ${device_cpp}" +echo "Finished GPU preprocess on device ${device}" + +echo "Acquire algoFiles and kernelBins from device ${device}" +adb -s ${device} pull ${device_algo_files} ${host_algo_files} > /dev/null +adb -s ${device} pull ${device_include} ${host_include} > /dev/null +adb -s ${device} pull ${device_cpp} ${host_cpp} > /dev/null +echo "Build kernel bin .so on host" +if [ -d ${host_algo_files}/algoFiles ]; then + mv ${host_algo_files}/algoFiles/* 
${host_algo_files} + rm -rf ${host_algo_files}/algoFiles +fi + +if [ -d ${host_include}/include ]; then + mv ${host_include}/include/* ${host_include} + rm -rf ${host_include}/include +fi + +if [ -d ${host_cpp}/cpp ]; then + mv ${host_cpp}/cpp/* ${host_cpp} + rm -rf ${host_cpp}/cpp +fi + +rm -rf ${host_extern} +mkdir ${host_extern} +cp ${BOLT_ROOT}/common/gcl/include/gcl_kernel_type.h ${host_extern} +cp ${BOLT_ROOT}/common/gcl/include/gcl_kernel_binmap.h ${host_extern} + +cpp_files_name=$(ls ${host_cpp}) +lib_name=libkernelbin +for p in ${cpp_files_name[@]} +do + postfix=${p##*.} + if [ ${postfix} = "h" ]; then + lib_name=${p%.*} + lib_name=${lib_name#inline_} + fi +done + +lib_name=${lib_name%.*} + +rm -rf ${host_build} +mkdir ${host_build} +cd ${host_build} +cmake .. -DCMAKE_C_COMPILER=`which aarch64-linux-android21-clang` \ + -DCMAKE_CXX_COMPILER=`which aarch64-linux-android21-clang++` \ + -DCMAKE_STRIP=`which aarch64-linux-android-strip` +make -j33 + +cd ${host_work_local} +rm -rf ${host_lib} +mkdir ${host_lib} +#mv ${host_build}/libkernelbin.so ${host_lib}/lib${lib_name}_map.so + +allSrcs=`find ${host_build} -name "*.o" -printf "%P\n"` +for file in ${allSrcs} +do + sharedSrcs="${sharedSrcs} ${host_build}/${file}" +done +CXX=aarch64-linux-android21-clang++ +${CXX} -shared -o ${host_lib}/lib${lib_name}_map.so ${sharedSrcs} \ + -L${BOLT_ROOT}/third_party/arm_llvm/opencl/lib64 -lOpenCL -Wl,-soname,lib${lib_name}_map.so + +cd ${host_lib} +STRIP=aarch64-linux-android-strip +${STRIP} lib${lib_name}_map.so + +cd ${host_work_local} +rm -rf ${host_cpp} ${host_extern} ${host_build} ${host_include} +echo "Preprocess finished" +echo "Check algoFiles in ${host_algo_files}" +echo "Check lib${lib_name}_map.so in ${host_lib}" diff --git a/inference/engine/tools/preprocess_ocl/preprocess_ocl.cpp b/inference/engine/tools/preprocess_ocl/preprocess_ocl.cpp new file mode 100644 index 00000000..c7a4b080 --- /dev/null +++ b/inference/engine/tools/preprocess_ocl/preprocess_ocl.cpp @@ -0,0 +1,252 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
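For context, the tool below emits cpp/header files that the shell script above compiles into a shared library exposing an extern "C" factory, create_<device>_kernelbin_map(). A hypothetical consumer could resolve that factory at runtime roughly as follows (the library path, device name, and the void* return type are illustrative; bolt's own loading path is not shown in this patch):

    #include <dlfcn.h>
    #include <cstdio>

    // Returns the gcl_kernel_binmap* produced by the generated factory,
    // typed as void* to keep this sketch self-contained.
    void *load_kernelbin_map(const char *libPath, const char *deviceName)
    {
        void *so = dlopen(libPath, RTLD_LAZY);
        if (so == NULL) {
            return NULL;
        }
        char symbol[128];
        snprintf(symbol, sizeof(symbol), "create_%s_kernelbin_map", deviceName);
        typedef void *(*FactoryFunc)();
        FactoryFunc factory = (FactoryFunc)dlsym(so, symbol);
        return (factory != NULL) ? factory() : NULL;
    }
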
+ +#include <iostream> +#include "inference.hpp" +#include "tensor.hpp" +#include "result_format.hpp" +#include <algorithm> +#include <cstdio> +#include <cstring> +#include <dirent.h> +#include <fstream> +#include <map> +#include <string> +#include <vector> +#include "types.h" +#include "error.h" + +#ifdef _USE_FP16 +inline std::vector<std::string> buildModelsNameArray(std::string path, std::string postfix) +{ + struct dirent *dirTp; + DIR *handle = opendir(path.c_str()); + std::vector<std::string> names; + if (handle != NULL) { + while ((dirTp = readdir(handle)) != NULL) { + std::string modelName = dirTp->d_name; + U32 len = modelName.size(); + U32 postfix_len = postfix.size(); + if (len > postfix_len) { + if (modelName.substr(len - postfix_len) == postfix) { + modelName = path + modelName; + names.push_back(modelName); + } + } + } + } else { + UNI_ERROR_LOG("opendir %s failed\n", path.c_str()); + } + closedir(handle); + return names; +} + +inline void write_to_file(std::string str, std::string path, std::string name) +{ + std::string fileName = path + name; + std::ofstream file(fileName.c_str()); + if (file.is_open()) { + file << str.c_str(); + file.close(); + } else { + UNI_ERROR_LOG("fail to write file %s\n", fileName.c_str()); + } +} + +inline void runBoltModel(CI8 *modelPath, CI8 *algoPath, std::vector<std::string> *kernelNames) +{ + if (!strstr(modelPath, "f16.bolt")) { + UNI_ERROR_LOG("Bolt GPU currently supports only F16 models (_f16.bolt)\n"); + UNI_ERROR_LOG("Ensure your model is named xxxx_f16.bolt\n"); + exit(1); + } + + UNI_INFO_LOG("Building algo file and collecting kernel names for %s\n", modelPath); + auto cnn = createPipeline("GPU", modelPath, algoPath); + std::vector<TensorDesc> inputDescs = cnn->get_model_input_tensor_descs(); + U8 **input_ptrs = new U8 *[inputDescs.size()]; + for (U32 i = 0; i < inputDescs.size(); i++) { + U32 size = tensorNumBytes(inputDescs[i]); + input_ptrs[i] = new U8[size]; + } + + std::vector<std::string> inputNames = cnn->get_model_input_tensor_names(); + for (U32 i = 0; i < inputNames.size(); i++) { + cnn->copy_to_named_input(inputNames[i], input_ptrs[i]); + } + + std::map<std::string, std::shared_ptr<Tensor>> outMap; + cnn->run(); + outMap = cnn->get_outputs(); + cnn->saveAlgorithmMapToText(algoPath); + GCLHandle_t handle = OCLContext::getInstance().handle.get(); + for (auto p : handle->kernelMap) { + std::string device_name = handle->deviceName; + std::string kernelName = p.first; + kernelName.erase(0, device_name.size() + 1); + if (find((*kernelNames).begin(), (*kernelNames).end(), kernelName) == (*kernelNames).end()) { + (*kernelNames).push_back(kernelName); + } + } + for (auto p : handle->programMap) { + std::string kernelName = p.first; + if (find((*kernelNames).begin(), (*kernelNames).end(), kernelName) == (*kernelNames).end()) { + (*kernelNames).push_back(kernelName); + } + } + + for (U32 i = 0; i < inputDescs.size(); i++) { + delete[] input_ptrs[i]; + } + delete[] input_ptrs; + CHECK_STATUS(gcl_finish(handle)); +} + +inline void buildKernelBinFiles( + std::vector<std::string> kernelNames, std::string includePath, std::string cppPath) +{ + GCLHandle_t handle; + CHECK_STATUS(gcl_create_handle(&handle)); + std::string device_name = handle->deviceName; + std::string device_name_up = device_name; + std::transform(device_name_up.begin(), device_name_up.end(), device_name_up.begin(), ::toupper); + + std::string inline_kernel_bin_head; + std::string inline_kernel_bin_head_name; + inline_kernel_bin_head_name = "inline_" + device_name + ".h"; + inline_kernel_bin_head = "#ifndef _INLINE_" + device_name_up + "_H\n"; + inline_kernel_bin_head += "#define _INLINE_" + device_name_up + "_H\n"; + + std::string device_map_head; + std::string device_map_head_name; + 
device_map_head_name = device_name + "_map.h"; + device_map_head = "#ifndef " + device_name_up + "_MAP_H\n"; + device_map_head += "#define " + device_name_up + "_MAP_H\n"; + device_map_head += "extern \"C\" {\n"; + device_map_head += " gcl_kernel_binmap* create_" + device_name + "_kernelbin_map();\n"; + device_map_head += "}\n"; + device_map_head += "#endif"; + write_to_file(device_map_head, includePath, device_map_head_name); + + std::string device_map; + std::string device_map_name; + device_map_name = device_name + "_map.cpp"; + device_map = "#include \"gcl_kernel_binmap.h\"\n"; + device_map += "#include\"" + device_map_head_name + "\"\n"; + device_map += "#include\"" + inline_kernel_bin_head_name + "\"\n"; + device_map += "class " + device_name + " : public gcl_kernel_binmap {\n"; + device_map += "public:\n"; + device_map += " " + device_name + "() {\n"; + device_map += " loadKernelBin();\n"; + device_map += " }\n"; + device_map += " void loadKernelBin();\n"; + device_map += "};\n"; + device_map += "void " + device_name + "::loadKernelBin() {\n"; + + std::string device_kernel_bin; + std::string device_kernel_bin_name; + device_kernel_bin_name = device_name + "_kernel_bin.cpp"; + device_kernel_bin = "#include\"" + inline_kernel_bin_head_name + "\"\n"; + + for (auto p : kernelNames) { + Kernel kernel; + U8 *binary; + U32 len; + CHECK_STATUS(gcl_create_kernel(handle, p.c_str(), &kernel)); + Program program = handle->programMap[p]; + CHECK_STATUS(gcl_get_program_info(program, &binary, &len)); + std::string func = device_name + "_" + p; + inline_kernel_bin_head += "extern const unsigned int " + func + "_len;\n"; + inline_kernel_bin_head += "extern const unsigned char " + func + "[];\n"; + device_map += " put(\"" + func + "\", " + "{" + func + ", " + func + "_len});\n"; + device_kernel_bin += "const unsigned int " + func + "_len = " + std::to_string(len) + ";\n"; + device_kernel_bin += "const unsigned char " + func + "[] = " + "{"; + for (U32 i = 0; i < len; i++) { + char tempstr[8]; + if (i % 20 == 0) { + device_kernel_bin += "\n"; + } + snprintf(tempstr, sizeof(tempstr), "0x%02x", binary[i]); + device_kernel_bin += std::string(tempstr); + if (i != len - 1) { + device_kernel_bin += ", "; + } else { + device_kernel_bin += "};\n"; + } + } + CHECK_STATUS(release_kernel(kernel)); + } + inline_kernel_bin_head += "#endif"; + device_map += "}\n"; + device_map += "gcl_kernel_binmap* create_" + device_name + "_kernelbin_map(){\n"; + device_map += " " + device_name + "* kernelbin = new " + device_name + "();\n"; + device_map += " return (gcl_kernel_binmap*) kernelbin;\n"; + device_map += "}"; + write_to_file(inline_kernel_bin_head, cppPath, inline_kernel_bin_head_name); + write_to_file(device_map, cppPath, device_map_name); + write_to_file(device_kernel_bin, cppPath, device_kernel_bin_name); + gcl_destroy_handle(handle); +} +#endif + +int main(int argc, char *argv[]) +{ +#ifdef _USE_FP16 + if (argc != 5) { + UNI_INFO_LOG("Please set your models path, and put your bolt models into it\n"); + UNI_INFO_LOG("Please set your algoPath for saving the produced algo files, and ensure it is " + "clean\n"); + UNI_INFO_LOG("Please set your include path for saving the OCL kernel bin header files, and ensure it " + "is clean\n"); + UNI_INFO_LOG("Please set your cpp path for saving the OCL kernel bin cpp files, and ensure it is " + "clean\n"); + UNI_INFO_LOG("For example: ./preprocess_ocl ./boltModels/ ./algoFiles/ ./include/ " + "./cpp/\n"); + exit(1); + } + std::string modelsPath = (CI8 *)argv[1]; + if (modelsPath[modelsPath.length() - 1] != '/') { + modelsPath += "/"; + } + + std::string algoPath = (CI8 *)argv[2]; + if (algoPath[algoPath.length() - 1] != '/') { + algoPath += "/"; + } + + std::string includePath = (CI8 *)argv[3]; + if (includePath[includePath.length() - 1] != '/') { + includePath += "/"; + } + + std::string cppPath = (CI8 *)argv[4]; + if (cppPath[cppPath.length() - 1] != '/') { + cppPath += "/"; + } + + std::vector<std::string> modelsNameArray; + modelsNameArray = buildModelsNameArray(modelsPath, ".bolt"); + std::vector<std::string> kernelNames; + for (auto name : modelsNameArray) { + runBoltModel(name.c_str(), algoPath.c_str(), &kernelNames); + } + + buildKernelBinFiles(kernelNames, includePath, cppPath); +#endif + return 0; +} diff --git a/inference/engine/tools/ptq_calibration/ptq_calibration.cpp b/inference/engine/tools/ptq_calibration/ptq_calibration.cpp new file mode 100644 index 00000000..da0fc546 --- /dev/null +++ b/inference/engine/tools/ptq_calibration/ptq_calibration.cpp @@ -0,0 +1,443 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include <iostream> +#include <cstring> +#include <cstdlib> +#include "inference.hpp" +#include "tensor.hpp" +#include "data_loader.hpp" +#include "result_format.hpp" +#include "profiling.h" +#include "tensor_computing.h" +#include "model_print.h" +#ifdef _USE_FP16 +#include "../../../../compute/tensor/src/cpu/arm/fp16/arm_functions_fp16.h" +#endif +#ifdef _USE_FP32 +#include "../../../../compute/tensor/src/cpu/arm/fp32/arm_functions_fp32.h" +#endif + +#define BINS 2048 +#define NUM_IMAGES_INPUT 100 + +void print_help(char *argv[]) +{ + std::cout << "usage: " << argv[0] + << " modelPath dataDirectory dataFormat scaleValue affinityPolicyName " + "algorithmMapPath" + << std::endl; +} + +int main(int argc, char *argv[]) +{ +#ifdef _USE_FP16 + UNI_TIME_INIT + + char *modelPath = (char *)""; + char *dataDir = (char *)""; + char *affinityPolicyName = (char *)""; + char *algorithmMapPath = (char *)""; + ImageFormat imageFormat = RGB; + F32 scaleValue = 1; + if (argc < 5) { + print_help(argv); + return 1; + } + modelPath = argv[1]; + dataDir = argv[2]; + + imageFormat = (std::string(argv[3]) == std::string("BGR") ? 
BGR : RGB); + if (std::string(argv[3]) == std::string("RGB_SC")) { + imageFormat = RGB_SC; + } else if (std::string(argv[3]) == std::string("BGR_SC_RAW")) { + imageFormat = BGR_SC_RAW; + } else if (std::string(argv[3]) == std::string("RGB_SC_RAW")) { + imageFormat = RGB_SC_RAW; + } + + scaleValue = atof(argv[4]); + + if (argc > 5) { + affinityPolicyName = argv[5]; + } + + if (argc > 6) { + algorithmMapPath = argv[6]; + } + + ModelSpec int8Ms; + CHECK_STATUS(deserialize_model_from_file(modelPath, &int8Ms)); + CHECK_REQUIREMENT(DT_F16_8Q == int8Ms.dt || DT_F16 == int8Ms.dt); + int8Ms.dt = DT_F16_8Q; + + ModelSpec f16Ms; + CHECK_STATUS(deserialize_model_from_file(modelPath, &f16Ms)); + f16Ms.dt = DT_F16; + + ModelSpec resultMs; + CHECK_STATUS(deserialize_model_from_file(modelPath, &resultMs)); + resultMs.dt = DT_F16_8Q; + + auto relationNum = resultMs.num_op_tensor_entries; + auto relationPtr = resultMs.op_relationship_entries; + resultMs.num_op_tensor_entries = 0; + resultMs.op_relationship_entries = nullptr; + + auto int8CNN = createPipelinefromMs(affinityPolicyName, &int8Ms, algorithmMapPath); + auto f16CNN = createPipelinefromMs(affinityPolicyName, &f16Ms, algorithmMapPath); + + // load images + std::map<std::string, std::shared_ptr<Tensor>> inMap = int8CNN->get_inputs(); + TensorDesc imageDesc = (*(inMap.begin()->second)).get_desc(); + std::vector<TensorDesc> imageDescs; + imageDescs.push_back(imageDesc); + std::vector<std::vector<Tensor>> images; + std::vector<std::string> imagePaths = + load_image_with_scale(dataDir, imageDescs, &images, imageFormat, scaleValue); + + std::cout << "[Calibration]:" << std::endl; + + std::vector<U8> dBuf; + //std::vector<U8> qBuf; + std::vector<U32> calibratedOpIdx; + + auto curModelInputTensorNames = int8CNN->get_model_input_tensor_names(); + for (int index = 0; index < (int)curModelInputTensorNames.size(); index++) { + int8CNN->copy_to_named_input(curModelInputTensorNames[index], + (U8 *)((CpuMemory *)images[0][index].get_memory())->get_ptr()); + } + + U32 opIdx = int8CNN->find_next_dynamic_scale_op(calibratedOpIdx, 0); + std::map<std::string, std::vector<F32>> tensorScale; + + while (0 != opIdx) { + auto op = int8CNN->get_operator_by_index(opIdx); + std::string opName = op->get_name(); + std::cout << "Calibrating OP " << opIdx << ": " << opName << std::endl; + std::string opsName = int8Ms.ops[opIdx].name; + CHECK_REQUIREMENT(opName == opsName); + + std::vector<std::vector<F32>> scales; + auto inputTensors = op->get_input_tensors(); + auto outputTensors = op->get_output_tensors(); + std::cout << " Inputs:\n"; + + for (U32 i = 0; i < int8Ms.ops[opIdx].num_inputs; i++) { + std::string tensorName = int8Ms.ops[opIdx].input_tensors_name[i]; + TensorDesc inDesc = inputTensors[i].get_desc(); + + auto it = tensorScale.find(tensorName); + if (it != tensorScale.end()) { + scales.push_back(tensorScale[tensorName]); + std::cout << " InputTensor " << i << " " << tensorName << " inherits scale " + << tensorScale[tensorName][0] << std::endl; + continue; + } + + if (DT_I8 == inDesc.dt) { // Gets scale from int8 pooling or concat. 
Label with -1 + std::vector<F32> scale; + scale.push_back(-1); + scales.push_back(scale); + tensorScale[tensorName] = scale; + std::cout << " InputTensor " << i << " " << tensorName + << " inherits transformed scale " << std::endl; + continue; + } + + U32 dBytes = tensorNumBytes(inDesc); + dBuf.resize(dBytes * NUM_IMAGES_INPUT); + U8 *d = dBuf.data(); + std::vector<F32> histogram; + F32 last_max = 0; + F32 interval = 0; + + for (U32 j = 0; j < images.size(); j++) { + for (int index = 0; index < (int)curModelInputTensorNames.size(); index++) { + int8CNN->copy_to_named_input(curModelInputTensorNames[index], + (U8 *)((CpuMemory *)images[j][index].get_memory())->get_ptr()); + } + + int8CNN->run_till_breakpoint(opIdx); + memcpy(d, ((CpuMemory *)(inputTensors[i].get_memory()))->get_ptr(), dBytes); + d += dBytes; + + if ((j != images.size() - 1) && ((j + 1) % NUM_IMAGES_INPUT != 0)) { + continue; + } + + if (j == NUM_IMAGES_INPUT - 1 || + ((j == images.size() - 1) && (j < NUM_IMAGES_INPUT - 1))) { + UNI_DEBUG_LOG("---------- start getting 1 - %u images input tensors " + "----------\n", + j + 1); + F16 *ptr_d = (F16 *)dBuf.data(); + F32 max = array_maxabs_f16(ptr_d, (I32)(tensorNumElements(inDesc) * (j + 1))); + UNI_DEBUG_LOG(" %f is the maximum value\n", max); + interval = max / BINS; + histogram.resize(BINS, 0.00001f); + //update histogram first time + update_histogram(tensorNumElements(inDesc) * (j + 1), ptr_d, BINS, interval, + histogram.data()); + last_max = max; + d = dBuf.data(); + dBuf.clear(); + continue; + } + + if ((j + 1) % NUM_IMAGES_INPUT == 0 && j != (NUM_IMAGES_INPUT - 1)) { + UNI_DEBUG_LOG("---------- start getting %d - %u images input tensors " + "----------\n", + j + 1 - NUM_IMAGES_INPUT, j + 1); + F16 *ptr_d = (F16 *)dBuf.data(); + F32 max = array_maxabs_f16( + ptr_d, (I32)(tensorNumElements(inDesc) * NUM_IMAGES_INPUT)); + if (max <= last_max) { + UNI_DEBUG_LOG(" %f is the maximum value\n", last_max); + interval = last_max / BINS; + //update histogram if no new max + update_histogram(tensorNumElements(inDesc) * NUM_IMAGES_INPUT, ptr_d, BINS, + interval, histogram.data()); + } else { + UNI_DEBUG_LOG(" %f is the maximum value\n", max); + interval = max / BINS; + F32 numPerBin = (F32)max / last_max; + //last_max = max; -> may optimize accuracy. 
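+ // A new global max widens the bins (interval = max / BINS), so the
+ // histogram accumulated so far with bin width last_max / BINS must first
+ // be redistributed into the wider bins before this batch is added.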
+ histogram = compress_histogram(histogram, numPerBin, last_max); + last_max = max; + update_histogram((tensorNumElements(inDesc) * NUM_IMAGES_INPUT), ptr_d, + BINS, interval, histogram.data()); + } + d = dBuf.data(); + dBuf.clear(); + continue; + } + + if ((j == images.size() - 1) && ((j + 1) % NUM_IMAGES_INPUT != 0)) { + UNI_DEBUG_LOG("---------- start getting %d - %u images input tensors " + "----------\n", + j + 1 - ((j + 1) % NUM_IMAGES_INPUT), j + 1); + dBuf.resize(dBytes * ((j + 1) % NUM_IMAGES_INPUT)); + F16 *ptr_d = (F16 *)dBuf.data(); + F32 max = array_maxabs_f16( + ptr_d, (I32)(tensorNumElements(inDesc) * ((j + 1) % NUM_IMAGES_INPUT))); + if (max <= last_max) { + UNI_DEBUG_LOG(" %f is the maximum value\n", last_max); + interval = last_max / BINS; + //update histogram if no new max + update_histogram(tensorNumElements(inDesc) * ((j + 1) % NUM_IMAGES_INPUT), + ptr_d, BINS, interval, histogram.data()); + } else { + UNI_DEBUG_LOG(" %f is the maximum value\n", max); + interval = max / BINS; + F32 numPerBin = (F32)max / last_max; + //last_max = max; -> may optimize accuracy + histogram = compress_histogram(histogram, numPerBin, last_max); + last_max = max; + update_histogram((tensorNumElements(inDesc) * NUM_IMAGES_INPUT), ptr_d, + BINS, interval, histogram.data()); + } + d = dBuf.data(); + dBuf.clear(); + continue; + } + } + + UNI_DEBUG_LOG("---------- compute KL ----------\n"); + std::vector<F32> scale = compute_scale_with_KL(histogram, interval); + UNI_DEBUG_LOG("---------- finish compute KL ----------\n"); + scales.push_back(scale); + tensorScale[tensorName] = scale; + UNI_DEBUG_LOG(" InputTensor %u %s gets scale %f\n", i, tensorName.c_str(), + tensorScale[tensorName][0]); + } + + op->set_feature_scale(scales); + UNI_DEBUG_LOG(" Outputs:\n"); + + for (U32 i = 0; i < int8Ms.ops[opIdx].num_outputs; i++) { + std::string tensorName = int8Ms.ops[opIdx].output_tensors_name[i]; + TensorDesc desc = outputTensors[i].get_desc(); + + auto it = tensorScale.find(tensorName); + CHECK_REQUIREMENT(it == tensorScale.end()); + + if (DT_F16 == desc.dt) { + continue; + } + + CHECK_REQUIREMENT(DT_I8 == desc.dt); + + auto opF16 = f16CNN->get_operator_by_index(opIdx); + auto outputs = opF16->get_output_tensors(); + + TensorDesc outDesc = outputs[i].get_desc(); + U32 dBytes = tensorNumBytes(outDesc); + dBuf.resize(dBytes * NUM_IMAGES_INPUT); + std::vector<F32> histogram; + F32 last_max = 0; + F32 interval = 0; + + U8 *d = dBuf.data(); + + for (U32 j = 0; j < images.size(); j++) { + for (int index = 0; index < (int)curModelInputTensorNames.size(); index++) { + f16CNN->copy_to_named_input(curModelInputTensorNames[index], + (U8 *)((CpuMemory *)images[j][index].get_memory())->get_ptr()); + } + + f16CNN->run_till_breakpoint(opIdx); + memcpy(d, ((CpuMemory *)outputs[i].get_memory())->get_ptr(), dBytes); + d += dBytes; + + if ((j != images.size() - 1) && ((j + 1) % NUM_IMAGES_INPUT != 0)) { + continue; + } + + if (j == NUM_IMAGES_INPUT - 1 || + ((j == images.size() - 1) && (j < NUM_IMAGES_INPUT - 1))) { + UNI_DEBUG_LOG("---------- start getting 1 - %u images output tensors " + "----------\n", + j + 1); + + F16 *ptr_d = (F16 *)dBuf.data(); + F32 max = array_maxabs_f16(ptr_d, (I32)(tensorNumElements(outDesc) * (j + 1))); + UNI_DEBUG_LOG(" %f is the maximum value\n", max); + interval = max / BINS; + histogram.resize(BINS, 0.00001f); + //update histogram first time + update_histogram(tensorNumElements(outDesc) * (j + 1), ptr_d, BINS, interval, + histogram.data()); + last_max = max; + d = dBuf.data(); + dBuf.clear(); + continue; 
+ } + + if ((j + 1) % NUM_IMAGES_INPUT == 0 && j != (NUM_IMAGES_INPUT - 1)) { + F16 *ptr_d = (F16 *)dBuf.data(); + F32 max = + array_maxabs_f16(ptr_d, (I32)tensorNumElements(outDesc) * NUM_IMAGES_INPUT); + + UNI_DEBUG_LOG("---------- start getting %d - %u images output tensors " + "----------\n", + j + 1 - NUM_IMAGES_INPUT, j + 1); + + if (max <= last_max) { + UNI_DEBUG_LOG(" %f is the maximum value\n", last_max); + interval = last_max / BINS; + //update histogram if no new max + update_histogram(tensorNumElements(outDesc) * NUM_IMAGES_INPUT, ptr_d, BINS, + interval, histogram.data()); + } else { + UNI_DEBUG_LOG(" %f is the maximum value\n", max); + interval = max / BINS; + F32 numPerBin = (F32)max / last_max; + //last_max = max; -> may optimize accuracy + histogram = compress_histogram(histogram, numPerBin, last_max); + last_max = max; + update_histogram(tensorNumElements(outDesc) * NUM_IMAGES_INPUT, ptr_d, BINS, + interval, histogram.data()); + } + d = dBuf.data(); + dBuf.clear(); + continue; + } + + if ((j == images.size() - 1) && ((j + 1) % NUM_IMAGES_INPUT != 0)) { + UNI_DEBUG_LOG("---------- start getting %d - %u images output tensors " + "----------\n", + j + 1 - ((j + 1) % NUM_IMAGES_INPUT), j + 1); + dBuf.resize(dBytes * ((j + 1) % NUM_IMAGES_INPUT)); + F16 *ptr_d = (F16 *)dBuf.data(); + F32 max = array_maxabs_f16( + ptr_d, (I32)(tensorNumElements(outDesc) * ((j + 1) % NUM_IMAGES_INPUT))); + if (max <= last_max) { + UNI_DEBUG_LOG(" %f is the maximum value\n", last_max); + interval = last_max / BINS; + //update histogram if no new max + update_histogram(tensorNumElements(outDesc) * ((j + 1) % NUM_IMAGES_INPUT), + ptr_d, BINS, interval, histogram.data()); + } else { + UNI_DEBUG_LOG(" %f is the maximum value\n", max); + interval = max / BINS; + F32 numPerBin = (F32)max / last_max; + //last_max = max; -> may optimize accuracy + histogram = compress_histogram(histogram, numPerBin, last_max); + last_max = max; + update_histogram(tensorNumElements(outDesc) * ((j + 1) % NUM_IMAGES_INPUT), + ptr_d, BINS, interval, histogram.data()); + } + d = dBuf.data(); + dBuf.clear(); + continue; + } + } + UNI_DEBUG_LOG("---------- compute KL ----------\n"); + std::vector<F32> scale = compute_scale_with_KL(histogram, interval); + UNI_DEBUG_LOG("---------- finish compute KL ----------\n"); + scales.push_back(scale); + tensorScale[tensorName] = scale; + UNI_DEBUG_LOG(" OutputTensor %u %s gets scale %f\n", i, tensorName.c_str(), + tensorScale[tensorName][0]); + } + if (int8Ms.ops[opIdx].num_quant_feature == 1 && + -2 == int8Ms.ops[opIdx].feature_scale[0].scale[0]) { + std::vector<F32> outputScale; + outputScale.push_back(-2); + scales.push_back(outputScale); + } + + op->set_feature_scale(scales); + + // Store scales into result model + if (nullptr != resultMs.ops[opIdx].feature_scale) { // Could be labelled with -2 + for (U32 i = 0; i < resultMs.ops[opIdx].num_quant_feature; i++) { + if (nullptr != resultMs.ops[opIdx].feature_scale[i].scale) { + delete[] resultMs.ops[opIdx].feature_scale[i].scale; + } + } + delete[] resultMs.ops[opIdx].feature_scale; + } + + resultMs.ops[opIdx].num_quant_feature = scales.size(); + resultMs.ops[opIdx].feature_scale = + (QuantSpec *)mt_new_storage(scales.size() * sizeof(QuantSpec)); + + for (U32 i = 0; i < scales.size(); i++) { + resultMs.ops[opIdx].feature_scale[i].num_scale = scales[i].size(); + U32 scaleBytes = scales[i].size() * sizeof(F32); + resultMs.ops[opIdx].feature_scale[i].scale = (F32 *)mt_new_storage(scaleBytes); + memcpy(resultMs.ops[opIdx].feature_scale[i].scale, scales[i].data(), 
scaleBytes); + } + + calibratedOpIdx.push_back(opIdx); + opIdx = int8CNN->find_next_dynamic_scale_op(calibratedOpIdx, opIdx); + } + + print_ms(resultMs); + + std::string modelStorePath = std::string(argv[1]); + auto suffixPos = modelStorePath.find(".bolt"); + modelStorePath.erase(suffixPos, 5); + modelStorePath += "_KL.bolt"; + CHECK_STATUS(serialize_model_to_file(&resultMs, modelStorePath.c_str())); + + CHECK_STATUS(mt_destroy_model(&int8Ms)); + CHECK_STATUS(mt_destroy_model(&f16Ms)); + resultMs.num_op_tensor_entries = relationNum; + resultMs.op_relationship_entries = relationPtr; + CHECK_STATUS(mt_destroy_model(&resultMs)); +#endif + return 0; +} diff --git a/inference/examples/CMakeLists.txt b/inference/examples/CMakeLists.txt new file mode 100644 index 00000000..13cad12c --- /dev/null +++ b/inference/examples/CMakeLists.txt @@ -0,0 +1,63 @@ +cmake_minimum_required(VERSION 3.2) + +file(GLOB BOLT_CONFIGURE_FILE $ENV{BOLT_ROOT}/common/cmakes/bolt.cmake ${BOLT_ROOT}/common/cmakes/bolt.cmake) +if (BOLT_CONFIGURE_FILE) + include(${BOLT_CONFIGURE_FILE}) +else (BOLT_CONFIGURE_FILE) + message(FATAL_ERROR " +FATAL: can not find bolt.cmake in directory, + please set shell or cmake environment variable BOLT_ROOT. + ") +endif (BOLT_CONFIGURE_FILE) + +project(examples) + +set_c_cxx_flags() +set_test_c_cxx_flags() + +include_flow() + +if (BUILD_TEST) + engine_test(bert bert/bert.cpp) + engine_test(tinybert bert/tinybert.cpp) + engine_test(classification image_classification/classification.cpp) + engine_test(nmt machine_translation/nmt.cpp) + engine_test(nmt_tsc machine_translation/nmt_tsc.cpp) + engine_test(asr_rnnt automatic_speech_recognition/asr_rnnt.cpp) + engine_test(asr_convolution_transformer automatic_speech_recognition/asr_convolution_transformer.cpp) + engine_test(tts text_to_speech/tts.cpp) + engine_test(vad automatic_speech_recognition/vad.cpp) + engine_test(detection object_detection/detection.cpp) + engine_test(tinybert_onnx bert/tinybert_onnx.cpp) + engine_test(benchmark benchmark/benchmark.cpp) + engine_test(test_api_c c_api/test_api_c.c) + install(TARGETS classification + benchmark + tinybert + tinybert_onnx + nmt + asr_rnnt + asr_convolution_transformer + tts + vad + test_api_c + RUNTIME DESTINATION examples) + if (USE_MALI AND USE_FP16) + engine_test(test_pipeline_ocl sequential/test_pipeline_ocl.cpp) + engine_test(hdr high_dynamic_range/hdr.cpp) + install(TARGETS hdr + RUNTIME DESTINATION examples) + endif (USE_MALI AND USE_FP16) + + if (USE_FLOW) + flow_test(graph_tinybert bert/graph_tinybert.cpp) + flow_test(flow_tinybert bert/flow_tinybert.cpp) + flow_test(flow_asr "automatic_speech_recognition/flow_asr.cpp;automatic_speech_recognition/audio_feature.cpp") + flow_test(flow_dlaWOdcn dlaWOdcn/flow_dlaWOdcn.cpp) + flow_test(flow_facesr facesr/flow_facesr.cpp) + install(TARGETS flow_asr + flow_dlaWOdcn + flow_facesr + RUNTIME DESTINATION examples) + endif (USE_FLOW) +endif (BUILD_TEST) diff --git a/kits/automatic_speech_recognition/asr_convolution_transformer.cpp b/inference/examples/automatic_speech_recognition/asr_convolution_transformer.cpp similarity index 62% rename from kits/automatic_speech_recognition/asr_convolution_transformer.cpp rename to inference/examples/automatic_speech_recognition/asr_convolution_transformer.cpp index 141220f6..f5c53193 100644 --- a/kits/automatic_speech_recognition/asr_convolution_transformer.cpp +++ b/inference/examples/automatic_speech_recognition/asr_convolution_transformer.cpp @@ -1,40 +1,37 @@ // Copyright (C) 2019. 
Huawei Technologies Co., Ltd. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- 
 #include <iostream>
 #include "inference.hpp"
 #include "tensor.hpp"
 #include "data_loader.hpp"
-#include "utils.hpp"
-
-void print_help(char* argv[]) {
-    std::cout << "usage: " << argv[0] << " modelPath sequencesDirectory subNetworkName[encoder|prediction_net] cpuAffinityPolicyName" << std::endl;
-}
+#include "profiling.h"
+#include "parse_command.h"
 
-HashMap<std::string, Tensor> prepareStates(DataType dt, std::string sequenceDirectory,
-    std::string shapeMapFileName)
+std::map<std::string, Tensor> prepareStates(
+    DataType dt, std::string sequenceDirectory, std::string shapeMapFileName)
 {
-    HashMap<std::string, TensorDesc> shapeMap;
+    std::map<std::string, TensorDesc> shapeMap;
     std::string filePath = sequenceDirectory + "/" + shapeMapFileName;
     FILE *shapeMapFile = fopen(filePath.c_str(), "r");
     char buffer[NAME_LEN];
     while (fscanf(shapeMapFile, "%s", buffer) != EOF) {
         TensorDesc desc;
         fscanf(shapeMapFile, "%u", &(desc.nDims));
-        for (U32 i = 0; i < desc.nDims; i++)
+        for (U32 i = 0; i < desc.nDims; i++) {
             fscanf(shapeMapFile, "%u", &(desc.dims[desc.nDims - 1 - i]));
+        }
         if (std::string(buffer) == std::string("label")) {
             desc.dt = DT_U32;
         } else {
@@ -50,17 +47,19 @@ HashMap<std::string, Tensor> prepareStates(DataType dt, std::string sequenceDire
     }
     fclose(shapeMapFile);
 
-    HashMap<std::string, Tensor> tensorMap;
-    for (auto iter: shapeMap) {
+    std::map<std::string, Tensor> tensorMap;
+    for (auto iter : shapeMap) {
         std::string filePath = sequenceDirectory + "/" + iter.first + ".txt";
         TensorDesc desc = iter.second;
-        tensorMap[iter.first] = load_txt(filePath, Vec<TensorDesc>{desc})[0];
+        tensorMap[iter.first] = load_txt(filePath, std::vector<TensorDesc>{desc})[0];
     }
     return tensorMap;
 }
 
-void saveStates(std::shared_ptr<CNN> pipeline, std::string sequenceDirectory,
-    std::string outputFileName, std::string outputStatesFileName)
+void saveStates(std::shared_ptr<CNN> pipeline,
+    std::string sequenceDirectory,
+    std::string outputFileName,
+    std::string outputStatesFileName)
 {
     char buffer[NAME_LEN];
     std::string outputFilePath = sequenceDirectory + "/" + outputFileName;
@@ -75,17 +74,19 @@ void saveStates(std::shared_ptr<CNN> pipeline, std::string sequenceDirectory,
         // write states
         fprintf(outputStatesFile, "%s\n", buffer);
         fprintf(outputStatesFile, "%u\n", desc.nDims);
-        for (U32 i = 0; i < desc.nDims; i++)
-            fprintf(outputStatesFile, "%u ", desc.dims[desc.nDims-1-i]);
+        for (U32 i = 0; i < desc.nDims; i++) {
+            fprintf(outputStatesFile, "%u ", desc.dims[desc.nDims - 1 - i]);
+        }
 
         // write data
         U32 num = tensorNumElements(desc);
         std::string outputDataPath = sequenceDirectory + "/" + std::string(buffer) + ".txt";
         FILE *outputDataFile = fopen(outputDataPath.c_str(), "w");
         for (U32 i = 0; i < num; i++) {
-            fprintf(outputDataFile, "%f ", tensor.getElement(i));
-            if (i % 10 == 9)
+            fprintf(outputDataFile, "%f ", tensor.element(i));
+            if (i % 10 == 9) {
                 fprintf(outputDataFile, "\n");
+            }
         }
         fclose(outputDataFile);
     }
@@ -93,14 +94,13 @@ void saveStates(std::shared_ptr<CNN> pipeline, std::string sequenceDirectory,
     fclose(outputStatesFile);
 }
 
-int verify(Tensor tensor, std::string subNetworkName, HashMap<std::string, TensorDesc> inputDescMap)
+int verify(Tensor tensor, std::string subNetworkName, std::map<std::string, TensorDesc> inputDescMap)
 {
-    U32 num = tensorNumElements(tensor.get_desc());
+    U32 num = tensor.length();
     F32 sum = 0;
     for (U32 i = 0; i < num; i++) {
-        sum += tensor.getElement(i);
+        sum += tensor.element(i);
     }
-    std::cout << "Output sum is " << sum << "\n";
     I32 result = 0;
     if (subNetworkName == std::string("encoder")) {
         if (inputDescMap["sounds"].dims[1] == 15) {
@@ -126,18 +126,32 @@ int verify(Tensor tensor, std::string subNetworkName, HashMap<std::string, TensorDesc> inputDescMap)
-    HashMap<std::string, Tensor> input = prepareStates(dt, sequenceDirectory, "input_shape.txt");
-    HashMap<std::string, TensorDesc> inputDescMap;
-    for (auto
iter: input) + std::map input = + prepareStates(dt, sequenceDirectory, "input_shape.txt"); + std::map inputDescMap; + for (auto iter : input) { inputDescMap[iter.first] = iter.second.get_desc(); + } pipeline->reready(inputDescMap); - for (auto iter: input) { - U8* tensorPointer = iter.second.get_val(); + for (auto iter : input) { + U8 *tensorPointer = (U8 *)((CpuMemory *)(iter.second.get_memory()))->get_ptr(); pipeline->copy_to_named_input(iter.first, tensorPointer); } @@ -186,17 +201,17 @@ int main(int argc, char* argv[]) { totalTime += (timeEnd - timeBegin); Tensor output = pipeline->get_tensor_by_name(outputTensorName); falseResult += verify(output, subNetworkName, inputDescMap); - saveStates(pipeline, sequenceDirectory, "output_name.txt", "output_shape.txt"); + //saveStates(pipeline, sequenceDirectory, "output_name.txt", "output_shape.txt"); } - UTIL_TIME_STATISTICS + UNI_TIME_STATISTICS std::cout << "[SUMMARY]:" << std::endl; U32 validSequence = loops; - CI_info("speech recognition rate: " << 100.0 * (validSequence - falseResult) / validSequence << " %"); - CI_info("avg_time:" << 1.0 * totalTime / validSequence << "ms/sequence"); + UNI_CI_LOG( + "speech recognition rate: %f %%\n", 100.0 * (validSequence - falseResult) / validSequence); + UNI_CI_LOG("avg_time:%fms/sequence\n", 1.0 * totalTime / validSequence); if (falseResult > 0) { - std::cerr << "[ERROR] verify failed" << std::endl; - exit(1); + UNI_ERROR_LOG("verify failed\n"); } return 0; diff --git a/inference/examples/automatic_speech_recognition/asr_labels.txt b/inference/examples/automatic_speech_recognition/asr_labels.txt new file mode 100644 index 00000000..9cfb1ac5 --- /dev/null +++ b/inference/examples/automatic_speech_recognition/asr_labels.txt @@ -0,0 +1,2 @@ +[NULL] [START] [SEP] [MASK] 的 我 你 了 是 不 一 么 有 好 在 个 人 这 要 天 什 到 大 来 上 吗 去 没 下 啊 就 电 点 怎 说 看 会 开 小 多 还 给 十 子 能 那 中 可 家 为 打 想 时 都 二 车 们 发 话 三 以 过 吧 吃 生 年 出 里 最 也 他 机 回 用 手 心 老 和 样 道 明 地 行 现 后 得 女 哪 国 今 五 公 妈 新 爱 事 自 学 呢 很 成 对 动 真 知 市 起 日 四 快 做 把 意 信 高 儿 面 方 美 爸 关 安 情 路 调 分 少 八 六 作 之 前 呀 哈 长 度 如 场 几 零 经 请 着 百 姐 歌 全 号 太 晚 九 业 放 第 问 月 乐 气 网 七 西 帮 工 加 法 觉 喜 东 别 定 睡 无 些 通 近 被 头 名 间 文 两 听 买 于 主 欢 空 然 友 又 再 玩 办 导 才 等 水 感 只 果 风 找 谁 理 查 同 让 宝 区 己 本 比 力 走 谢 海 叫 男 重 次 当 实 早 山 片 钱 外 州 南 她 啦 航 身 干 影 视 已 体 王 接 将 正 进 金 候 思 花 音 饭 语 记 提 部 哦 系 星 收 何 万 游 位 图 所 首 哥 见 张 站 从 班 应 者 息 该 门 合 式 城 马 北 服 火 节 相 因 英 完 字 周 世 白 活 口 带 民 刚 期 神 色 跟 种 照 孩 线 光 嗯 平 与 司 红 表 单 千 联 而 但 化 阳 交 物 保 特 题 结 婆 院 报 常 广 量 解 变 更 入 直 界 死 内 边 京 目 热 店 难 师 原 房 超 告 费 数 温 员 模 球 其 像 台 嘛 消 午 资 品 产 设 总 件 演 流 朋 错 利 江 价 先 码 代 呵 级 笑 福 需 战 短 亮 亲 速 建 眼 华 聊 强 清 务 转 教 向 声 离 商 写 元 龙 书 播 戏 改 此 传 考 卡 制 换 受 每 酒 包 微 远 认 充 拉 越 运 示 处 管 置 奶 醒 座 条 格 赛 县 展 林 停 试 黄 送 讲 园 喝 拿 医 排 警 衣 连 计 雨 算 望 性 始 婚 拍 求 春 票 半 校 准 幺 曲 黑 夜 剧 住 飞 灯 深 啥 词 整 附 装 穿 器 油 李 注 达 队 斯 省 选 诉 假 冷 功 德 政 复 米 备 科 客 程 画 双 证 持 局 洗 岁 便 未 底 梦 取 银 集 终 精 基 香 反 布 尔 妹 青 久 版 容 习 指 技 河 唱 士 任 案 您 云 推 使 句 钟 失 股 伤 蛋 非 义 至 低 读 款 检 步 巴 丽 坐 续 满 奇 病 况 频 食 驾 克 预 并 脑 休 掉 否 专 弟 祝 启 忙 刘 识 村 载 楼 鱼 它 助 汽 支 立 治 帅 菜 昨 言 标 药 故 牌 易 跑 切 湖 烦 阿 窗 确 显 创 组 型 闭 爷 景 古 决 雪 狗 必 鸡 养 户 却 街 石 各 铁 落 及 据 课 由 录 造 卖 武 肉 领 怕 牛 社 值 亚 投 术 兴 命 共 陈 除 差 待 屏 脸 拨 幸 统 答 贝 猪 宁 板 或 餐 随 观 约 修 季 床 麻 搜 货 毛 防 密 冰 足 累 害 份 希 企 盘 象 港 紧 较 懂 具 规 环 罗 营 旅 搞 团 忘 简 蓝 留 奥 绝 军 许 历 段 称 杨 获 够 皮 质 配 智 汉 镇 态 志 举 限 护 往 幕 牙 形 项 居 存 背 痛 源 论 升 庆 破 礼 按 陪 贵 康 味 轻 卫 翻 索 维 险 议 愿 农 评 料 增 健 类 参 永 慢 沙 招 彩 购 冬 漂 藏 博 粉 坏 初 秋 室 念 荣 爆 赶 众 苦 夫 响 育 艺 遇 极 血 顺 套 呼 急 嘿 询 职 驶 介 兰 暖 效 委 椅 官 竟 奖 魔 波 适 倒 属 怀 验 富 恩 角 恋 挺 势 销 济 减 雄 块 馆 软 乡 突 财 免 围 摩 讯 苏 降 即 左 控 静 继 童 范 断 救 登 优 退 练 根 宣 诗 秀 尽 母 疼 肥 舞 划 末 权 脚 谈 杀 苹 土 纪 党 羊 敢 韩 测 拜 豆 圈 跳 曾 右 普 烧 某 饿 肯 猫 际 胡 亿 似 击 依 娘 松 群 丝 络 颜 细 
咋 木 岛 止 史 妇 绿 怪 哎 赵 闻 川 郑 灵 肚 冒 争 烟 址 哭 监 察 付 状 吴 雷 熊 闹 副 致 笔 章 草 胖 售 压 哇 临 补 仙 桥 吉 树 坚 迷 困 追 移 独 迎 列 吹 归 茶 惊 傻 秘 享 境 杰 朝 胜 骗 猜 耀 晓 率 娃 积 绍 典 夏 抱 引 则 田 府 靠 鞋 负 供 兄 择 露 研 额 圣 输 般 须 杯 令 严 孕 漫 厂 余 浪 姑 泰 笨 邮 佳 乱 弄 抓 素 欧 讨 激 采 涨 训 斗 箱 暴 冲 刻 尼 镜 甲 互 玉 腿 责 泡 施 究 叔 借 疑 透 曼 承 狂 庭 洋 勇 轮 厅 且 爽 瓜 鬼 酷 律 庄 担 冠 舒 洲 迪 宜 咯 页 豪 挂 野 堂 咱 攻 抢 峰 孙 毒 纸 译 父 唐 皇 甜 偷 赞 族 骑 叶 娱 脱 盟 略 呗 顾 圳 派 池 批 丰 舍 圆 恐 巨 益 距 刷 递 丹 净 威 另 锅 签 乖 糖 操 材 聚 晨 良 乎 暗 吸 凉 访 恶 融 晒 守 妻 荐 旧 概 耳 湾 兵 纳 抽 虎 仅 层 败 托 伦 睛 寻 汤 疗 嘴 媒 宋 酸 鲜 裤 印 齐 策 厉 善 曝 执 央 楚 宿 惠 萌 棒 腾 汇 昌 嘉 席 寒 届 途 屁 束 顶 租 鸟 辛 珠 编 召 骨 俩 折 毕 构 协 莫 熟 劳 骂 架 胎 税 例 巧 封 辣 诞 梅 疯 散 姓 哟 遭 娜 伙 键 泪 侠 贴 审 申 雅 虽 炒 阴 姨 菲 判 泉 币 异 颖 弃 革 震 努 凯 阅 宫 伟 仍 陆 帝 霸 辆 拼 误 拥 册 赢 征 赚 潮 锁 津 端 核 乌 档 缺 删 释 厌 悲 伴 织 尚 堵 订 摄 域 紫 谷 逃 犯 私 培 估 眠 默 违 桃 杭 仔 桌 账 屋 剑 挑 隐 刀 尾 缘 盛 稳 戴 滴 弹 萨 枪 鲁 蜜 糕 扣 盖 君 罪 佛 瘦 刺 剩 淘 澡 探 蒙 惜 绩 忍 淡 搭 逼 禁 库 宾 雾 徐 若 聪 析 贷 唉 瓶 序 厦 凤 趣 嘻 危 忆 滚 瑞 烤 洛 醉 狼 亦 旁 遍 偶 丁 晕 乘 钢 竞 泽 纯 兽 森 榜 闲 丑 恭 幼 伊 避 阵 铃 麦 盗 秒 篇 嫁 射 硬 欲 肤 丢 吐 慧 饮 塞 均 猎 奔 迟 鼻 滑 闪 锋 遗 船 甘 尊 唯 诺 饼 惯 敬 纷 宇 噢 虚 郭 昆 卷 润 炸 亡 塔 染 朵 混 逛 罚 俊 贫 针 沉 幻 旗 症 拒 纹 袋 袭 剪 替 炎 锦 胸 珍 梁 篮 植 厕 尸 鼠 扎 吓 跌 顿 滨 扬 诚 玲 孤 虑 烈 含 凡 延 占 燕 媳 映 臭 敏 姥 姆 乔 摇 吵 废 鹿 朱 杂 灰 扫 恒 障 返 祖 煮 弱 艳 暂 撞 摸 敌 刑 斤 劲 沈 辈 喊 兔 朗 怒 残 咖 扰 迹 妆 嫌 洞 摆 迅 邓 胃 偏 闷 井 龄 仪 督 莱 既 凌 貌 溪 妞 坑 湿 掌 览 触 综 惨 幅 腰 鼓 励 玛 呆 链 隆 患 柳 琴 摔 虫 沟 隔 鸭 灭 魂 宽 逆 饱 妖 柔 扶 宵 萝 郎 插 钓 寄 杜 洁 抗 舅 裙 辑 懒 烂 喂 徽 涛 戒 饰 浙 旦 桂 赏 怡 恨 仁 伯 寂 欣 壁 述 详 擦 莲 盒 炉 液 彻 慕 涉 损 氛 碰 芳 宗 董 秦 宠 冻 墙 怜 僵 泳 悔 坛 搬 羽 缩 裁 拖 侧 奋 夺 妙 揭 筑 郁 抄 浮 狐 栏 喔 甚 尘 横 灾 献 添 寿 贸 腐 宅 岗 垃 忽 固 耶 夹 纠 幂 圾 措 哼 宏 渐 靓 尿 躺 撒 阶 艾 焦 悉 辉 忧 倍 啡 脏 穷 祥 炼 援 竹 著 碗 莉 奏 羡 澳 厚 铺 截 姿 徒 拳 污 赫 促 箭 欠 猴 胆 尝 怖 旋 洪 盆 飘 帽 赔 谓 爹 桩 括 坦 耐 描 勒 扩 撑 芝 吻 谱 惹 肖 逐 恢 膜 窝 跨 拆 氏 拔 颗 虹 蛮 衡 晴 盐 詹 寞 挥 柏 晶 痘 卓 碎 催 爬 佩 聘 岸 苍 炮 债 咬 磨 燃 遥 棋 饺 潜 挡 泥 毁 径 慰 券 畅 汗 岳 捷 辞 捕 霆 欺 孔 俗 籍 眉 扮 挖 莞 符 悠 肠 歉 咪 埋 亏 拾 赖 耍 喽 俄 握 霜 汪 贺 浦 璃 呜 瓦 荷 岩 缓 蛇 梯 绵 瑶 拟 夕 玻 黎 驰 陵 傅 棉 凭 绕 储 弯 愁 塘 翔 厨 夸 陷 悬 蔡 偿 仓 胶 幽 浩 猛 奈 携 予 岭 振 番 荒 逗 薄 繁 贤 迁 巡 兆 疆 堡 坊 卧 匙 丫 狠 鹏 虾 毫 壮 薇 忠 坡 乳 晗 娇 喷 倾 稀 驻 卜 仲 陌 晋 巾 霍 肃 悟 牵 荡 帐 熬 轨 勤 尖 允 旺 循 柜 矿 稍 潘 辽 丈 疾 寺 尬 滩 魅 涂 堪 钥 阻 苗 槽 诊 邀 湘 姻 炫 枝 抵 堆 凶 芭 咳 羞 钻 肿 傲 沿 侣 婴 铜 谅 瞬 遮 祸 伞 卢 呐 吊 填 尤 仿 浏 霞 浓 碧 鉴 悦 裂 尴 铠 咸 漏 腹 酱 粗 痴 串 煤 肌 挣 翼 狱 糊 贾 芦 葡 闺 绪 嗨 挤 锡 曹 摘 姜 彤 棍 骚 婷 寨 筹 琳 俺 陕 帘 惑 墨 蟹 狮 冯 授 粥 耗 蜂 亭 贯 桑 趋 馨 蓉 萄 玫 踢 谋 椒 淮 妃 兼 辅 诸 肩 舟 础 犹 桶 慈 捡 庙 唔 瞎 谜 诱 姚 诈 龟 壳 柯 踏 粮 凰 淋 叉 逸 贱 赌 蝶 寓 袜 齿 愉 烫 盈 屎 涯 刹 跃 瑰 浴 卸 鸣 丸 裸 盾 孝 拘 迫 洒 肝 橙 殊 萧 鹅 趟 娶 汁 沃 氧 尺 筋 踪 廉 伍 暑 捐 扇 歇 涵 滋 扔 浅 赴 崩 邻 谦 斑 蒸 披 粤 袁 贼 胞 唤 迈 躲 痒 锻 蕾 砸 妮 轩 赤 邪 犬 嫂 溜 峡 寸 抛 悄 琪 柴 薪 纱 渡 侵 哒 哲 虐 陶 霉 糟 牢 莎 蚊 梨 脾 樱 奚 翠 屈 恼 伏 叹 誉 扑 蒋 翅 昏 覆 刮 侦 逢 劫 醋 宴 仇 笼 绑 鹰 袖 蹈 丘 遵 衫 塑 俱 潭 芒 鹤 薛 捧 菊 驱 罩 哀 稿 碑 磊 潇 署 牧 邦 鸿 薯 泄 啤 彭 踩 闯 舌 勿 膀 罐 沧 漠 翰 谣 雕 膏 寝 柱 怨 兹 仰 孟 墓 杏 蒂 芬 熙 愤 囊 乃 伸 菇 矛 割 葫 魏 坤 埃 咨 蜡 纽 辩 莓 彼 癌 旬 韵 嫩 琦 撕 押 贪 韦 吾 旭 枣 贡 慌 崇 呦 吕 捉 葛 械 茫 敲 杆 肺 慎 辰 磅 嗽 蕉 茂 抬 躁 垫 跪 劝 渣 溃 役 缝 谎 苑 窃 巢 憾 愈 腊 纵 谐 剂 喉 宙 罢 渠 匆 侯 缴 肾 御 粒 伪 佑 抚 昂 痕 砍 扭 皆 坪 葱 渴 谊 淇 腻 雀 茄 瘾 抹 碍 瑟 颈 逊 敦 茅 疲 唇 阁 纲 衰 炖 脉 谭 穆 蔬 疏 厘 扯 愧 螺 鼎 撤 赠 蝴 怼 轿 斌 艰 哄 矮 嘟 浆 框 嘎 绒 渔 瓷 摊 盼 灿 荆 舰 鑫 煎 阜 酬 泛 呈 玄 弗 甩 盲 砖 莹 雯 佬 媚 匹 脂 奉 凑 葩 脖 骄 惧 奕 拐 筒 叠 朴 凝 践 萍 廊 奢 巷 戚 兜 胀 谨 丧 逝 枕 恰 帆 阔 乏 歪 芯 翁 脆 辨 硕 吨 豫 兑 郊 呃 飙 霾 抑 掘 铭 垂 塌 蓄 瞧 锤 吼 蠢 桐 拓 栋 珊 绘 诀 菌 绣 腔 斜 蛙 捞 碳 昕 抖 噶 涌 雁 勾 靖 毅 肇 誓 芙 扒 殿 磁 绳 拯 锐 疫 兮 妥 甄 嗓 掩 咒 蜀 臣 彦 昔 喻 崔 殖 吞 趁 帖 穴 逻 赋 削 棚 轰 驴 郸 枚 诶 璐 颁 玺 柿 募 邯 鸽 沪 捏 凳 忌 攀 僧 坠 挽 沫 挨 屯 嫣 乙 绮 沂 彬 缠 邵 暮 奸 裹 坝 馒 豹 颠 爪 渭 厢 帕 臂 粘 愚 爵 饶 汕 灌 慨 弥 乒 浑 廷 扁 芜 壶 喵 淑 楠 馅 钙 钮 岂 俪 嘞 黛 辱 拦 咙 妒 耻 杉 伐 佐 撩 芽 祭 晃 筝 仗 橘 茜 娟 淀 枫 漆 榴 蝎 蚁 遂 衬 澄 邢 瑜 冤 咧 蹭 罕 鞭 蛛 瓣 煌 惩 憋 倩 泊 烊 株 菠 冈 戈 掀 刊 腺 鲍 隧 碌 襄 赣 睁 妍 裕 吟 鞍 奴 屌 绎 碟 昭 骤 蘑 讼 咏 祈 蒲 卦 崛 荔 拌 墅 泸 玮 哑 叛 噜 绯 卑 棵 咩 巫 沦 敷 氓 嘲 趴 卵 螃 杠 恺 铅 坎 狸 缸 辖 麟 陀 稻 燥 藤 钉 揍 蜘 丛 耽 契 函 蚂 喇 湛 贩 婶 筷 盯 巅 饥 梳 蒜 媛 庞 屠 菱 橡 辟 撼 砂 溢 葬 衔 邱 嫉 毯 泼 汰 甸 惕 蹄 犀 牡 勉 蹲 蓬 琼 履 榆 搏 弊 氢 娄 堰 噩 禅 葵 咽 姬 颇 沁 酿 柠 弦 鄙 烛 戳 淄 檬 鲨 粽 逾 跤 坞 畀 沾 椎 枯 晰 斥 煲 涕 揉 狙 骏 渤 彰 宪 膝 渝 潍 韶 卿 衷 浇 勃 侄 贞 焰 巩 闸 婉 撸 掏 翘 馋 
剥 辜 暨 栈 屉 硫 姗 斩 琐 稚 悍 勋 庐 溶 矫 鸦 瞌 牲 疤 叮 昧 屡 炭 伽 涩 漳 庸 啰 乾 讽 侈 诵 剁 昊 迦 澜 轴 袍 驼 乓 禾 挪 蹦 仑 妨 纤 靴 钩 瘤 叙 逮 宰 笛 蔚 匪 渊 刃 寡 赐 辐 俏 柚 憨 畏 烨 泣 嗦 耿 袄 坟 秩 疮 屿 淫 弘 猩 邑 邹 挫 胁 倪 樊 鄂 谍 讶 颂 佟 扛 蝉 凄 剖 烁 埔 劣 鳄 侨 耕 菏 栗 瘫 栽 歧 郡 睇 皱 猥 荧 倡 祷 夷 傍 匠 睿 孽 贿 咕 滥 梧 膊 胳 栖 亨 儒 梗 蔽 牺 钠 浒 腕 疹 窦 茨 闽 侃 驳 茹 竖 阱 娅 咔 蜗 崖 喘 弓 埠 擎 劈 粑 乞 萱 馈 啧 凸 俞 钰 吁 珂 滞 揽 魄 啃 舔 彪 顽 沸 艇 驹 骆 捣 窄 菩 啪 亩 絮 绽 颤 韭 淹 剃 炅 逍 朔 擅 霖 熄 垄 哗 粹 惫 瀑 俾 馍 铝 抠 皂 禽 氯 闰 蝠 篷 肪 舆 搅 仕 攒 拱 雇 蚕 汾 斋 浸 楔 殴 镖 枉 辫 茵 肆 畔 囧 棠 蚌 哩 倚 熏 漱 嵩 淳 卤 甫 倦 嚣 荟 硅 哨 榨 睫 黔 堕 勺 冥 蝙 沐 矩 窍 嘘 拽 俯 椰 旱 焊 懵 汝 氨 豚 呕 翡 纺 濮 尹 睹 莆 瞒 曦 锈 缅 祁 酥 磕 钞 陇 纬 沥 肢 叭 晖 搁 诛 捂 芹 娥 颐 噪 隋 岚 赁 魁 衍 鹦 搓 瞻 脊 钦 鲤 诠 聋 茉 笋 榄 臀 禹 郝 杞 祛 庚 廖 诡 鹉 仆 裔 畜 阖 艘 唠 斧 啸 挠 芋 狄 槛 冀 绥 嫖 洱 饲 泌 碱 哮 鲸 瘩 悴 疙 崎 摧 梭 觅 憔 汀 惬 痣 茎 苔 溺 梓 堤 镐 碾 冉 翩 腥 灶 暧 宛 潼 禺 芸 滤 哺 遛 遣 侍 镑 闫 秤 旷 殷 龚 糯 醇 晏 奎 陋 喧 舱 矣 冕 惟 峻 芷 匀 毙 咦 霄 纶 噻 蝇 嬛 竿 缉 眷 萎 禄 峨 佣 咚 恍 拂 掐 蔓 瞳 泻 丞 籽 穹 曰 枸 阎 怅 竭 桔 贬 樟 铲 藕 穗 缆 痰 稣 壤 妓 嗷 捶 啼 杖 娲 瞪 猿 擒 沛 赎 驿 瞅 芥 墩 焚 癫 鸳 琅 揪 稽 坨 珞 峪 挞 囚 敞 哉 霹 雳 挚 眨 眸 簿 屑 狭 旨 尧 嚏 麽 绅 蜕 丙 懿 肋 悚 婿 匿 崽 渺 锣 涡 澈 绸 膨 焉 缕 璧 皓 焕 瞄 璇 酵 滕 惭 垮 泗 窟 叨 垒 裴 凹 虞 矶 昼 疡 隶 硝 鹊 橄 鱿 朕 邂 叼 怂 柄 萤 逅 曙 铸 汶 棕 楂 颓 壹 髓 貂 呛 蕴 窑 脐 侮 沽 煞 窖 洽 猝 俐 秃 朦 眯 盔 炳 鸯 麒 湄 阑 鹭 抒 窒 伺 漯 滁 鸥 睐 芈 粪 窥 哆 拇 螂 豁 韬 捅 钧 妄 捆 拢 苟 羯 胧 丐 腌 岔 聂 喀 懈 鳞 嘱 悼 卉 籁 恳 楷 巍 牟 喱 娼 嗝 蒽 笙 惆 寥 驭 嗲 搂 掖 涟 榕 莺 疚 泵 咆 甥 忑 娴 阙 镯 掰 箫 焖 禧 忐 锯 祠 缇 郴 砺 辕 鞠 缔 痔 烽 灸 歹 嫦 玥 趾 筛 妾 墟 槐 孜 羁 栓 厄 冶 乍 锄 忻 徊 俭 唧 棱 黍 荫 赃 唿 诅 勘 拎 徘 垣 汹 灼 衢 圭 锂 裳 阀 隙 跆 佰 聆 撇 琢 澎 闵 炬 啵 呱 绊 廿 凋 滔 藻 暇 鲫 雌 阮 槟 涧 蕊 孵 篱 昵 捎 坂 皋 坷 奠 顷 雍 剔 躯 砥 靡 腩 狡 擂 掠 竣 讪 窜 枢 钝 磷 惦 嘀 鬓 灏 忒 婊 涮 秧 癖 钾 殡 馄 沭 烹 獒 叽 喆 鳌 嚼 亳 刁 漓 岐 盏 跷 肛 婕 绚 帷 瞩 梵 棺 泾 糙 薰 缤 饵 橱 渗 蔷 辙 髦 蛤 雹 噗 僻 韧 烘 琉 酝 俑 蜓 粱 翟 阚 鹃 璨 褪 馀 锌 豌 撬 扳 肘 饨 蜻 吱 掷 黏 喃 亢 嗒 咐 痪 祺 缀 懦 羹 檐 扉 妩 躬 婧 蟒 雏 漾 锥 朽 珀 柬 敛 眶 炜 拧 愣 桦 揣 惚 嗑 抉 蟑 吏 猕 浊 蛀 迢 汛 熔 绰 琶 俘 谚 蚀 寇 煽 酪 陛 潢 帼 拙 犁 噬 黯 倔 谴 娆 猬 淤 涅 拭 衅 茸 桓 侏 嘶 晾 淌 孰 绞 粟 苇 痫 芮 缚 璀 弈 踹 酌 唬 遏 铛 釜 峥 莽 恕 瑾 廓 拣 弧 靳 尉 沌 嗜 耸 垦 箍 殃 褒 檀 掂 镶 勐 诫 帜 驯 伶 彝 姊 绷 瑚 嵌 迭 倘 湃 龈 襟 茧 烙 矢 溅 惶 坍 钗 溯 糜 讳 纫 醛 珑 仨 蔗 邨 荤 筐 崭 棘 腮 赘 秉 惰 咀 袱 衩 戎 榔 蚓 胭 蛾 侬 囤 辗 蚯 媲 飓 洼 瀚 睦 寐 溧 酶 霏 瓢 痞 胚 骁 恬 蹊 嬉 淅 凿 癣 羚 烯 垢 酋 狩 吒 琵 惋 亥 唾 炊 裆 敖 吩 缭 撰 壕 蚝 苯 圃 簧 皖 忏 秆 蛟 岷 拷 霓 缪 斐 藉 琊 犊 痿 玖 啄 夭 妲 崂 瑕 嗅 逞 呸 幢 膳 亵 氮 虏 猖 卒 秸 庇 捍 蝌 嚎 笠 蚪 泷 殇 麓 旌 涪 遁 沮 茗 昀 酣 舶 舜 榻 唏 嘤 怯 砌 颍 萦 酮 谬 陨 怠 樽 梢 阐 宥 饪 痹 馥 濒 鳅 栾 遐 绢 镁 迄 笈 苛 蔻 恤 晟 嗡 稼 侥 吭 骷 婺 滇 矜 嬷 渍 斓 颊 崴 茬 铐 糗 铂 碉 刨 慵 瑄 阆 睬 炙 妊 枭 颅 屹 畸 匡 犒 畴 髅 猾 裘 栀 摁 瓮 膺 浔 袅 毋 蟆 佘 鸠 羔 渎 礁 菁 泓 偕 筵 飒 栅 凛 陡 轲 癜 诏 陂 寅 铮 慷 涤 鳝 隍 莅 碘 樵 胰 跻 憬 烩 岑 剿 哽 婪 憧 蟾 庵 羿 腋 沼 蹬 苓 孚 淼 瞥 涿 洙 孑 轶 稠 锰 蛰 骇 昙 彷 锵 羌 琍 恙 匈 鸾 璋 簇 撮 喋 掺 吝 狈 杷 祎 枇 嚷 徨 浣 荨 睾 羲 娠 琛 腑 菅 蒿 厮 憎 蕙 蜈 磋 鸵 漩 寮 晤 镀 赂 祀 孢 莘 濠 轼 婵 宸 踝 酉 萃 筱 钛 蛳 娩 沱 皎 苞 隽 眺 嫡 迥 莒 琥 藩 挟 寰 氟 僚 戟 鳖 夯 薏 绛 炽 沅 蔑 窘 庶 咻 脓 吆 蛊 曳 簪 眩 褐 磐 嗟 阪 呻 庾 覃 蓓 瘟 匕 轧 峭 肮 蟀 疣 邋 挎 痊 肴 辘 砰 芊 滦 嗖 町 匣 攘 戮 蓟 峙 濑 笃 鹫 谤 骸 蜥 戛 蟋 伎 镍 竺 胺 桨 俨 偈 搀 悸 遢 饽 酚 蚣 扼 幌 谧 骅 辄 剌 辍 惘 啬 蓑 岱 鲈 鲶 杳 喳 珉 瑙 赡 痧 螳 囡 壑 簸 蹋 悯 瞰 歆 煳 谛 踊 膛 蔼 傀 咤 隅 渲 殉 衙 燎 涝 砚 荃 拄 飕 儡 骝 拴 晦 唷 臻 漪 擀 烷 惺 瓯 慑 蘸 蚤 漉 挝 耘 瑛 窿 嘈 昱 嵊 茱 粕 阡 钵 渚 疵 甭 拗 嘚 盎 撅 瘸 匝 蹿 瘪 淆 卯 馁 盹 臊 泞 婀 讧 拈 徂 讷 痤 叟 湍 抡 啷 孬 仄 餮 亘 吮 碜 尻 齁 冗 纂 尕 叵 怄 酩 囔 旯 擘 哏 蓿 谆 噌 扃 忾 骈 逡 鳔 逋 忖 瓤 疃 嘬 捯 颙 耨 蒯 虿 蓊 剋 曩 抔 谝 哕 镲 夼 谮 撙 哞 耪 裉 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z ABOUT ACCEPT ACCESS ACER AOSS ACT ACTION ACTIVITY ADD ADDRESS ADOBE AFRAID AFTER AFTERNOON AGAIN AGE AGO AGREEMENT AIR AJAX ALEX ALL ALMOST ALONE ALPHA ALREADY ALSO ALTHOUGH ALWAYS AM AMAZING AMAZON AMERICAN AMONG AN AND ANDROID ANGEL ANGELA ANGLE ANGRY ANIMAL ANIMALS ANOTHER ANSWER ANY ANYONE ANYTHING ANYWAY APEC APP APPLE APPLICATION APPOINTMENT APRIL ARE ARM ARMANI AROUND ART AS ASIA ASK ASS ASSISTANT AT ATTACK AUDI AUGUST AUTO AUTUMN AWAY AWESOME BACK BAD BAG BALANCE BALL BANANA BAND BANG BANK BAR BASE BASS BATTLE BE BEACH BEAN BEAR BEAST BEAT BEATS BEAUTIFUL BEAUTY BECAUSE BECOME BED BEE BEEN BEFORE BEGIN BEHIND BEING BELIEVE BELL BEN BEST BETA BETTER BEYOND 
BIG BILL BIN BINGO BIOS BIRD BIRTHDAY BIT BITCH BLACK BLESS BLIZZARD BLOOD BLUE BOARD BODY BOOK BOOT BORN BOSS BOT BOTH BOX BOY BOYS BRA BRAND BRAVE BREAK BREAKING BREATHE BRIDGE BRIGHT BRING BROKEN BROTHER BROWN BUFF BUG BUILD BULL BULLSHIT BUS BUSINESS BUSY BUT BUTTON BUY BY BYE CAKE CALIFORNIA CALL CAMERA CAN CANADA CANDY CANNOT CANON CANT CAPTAIN CAR CARD CAT CATCH CAUSE CELL CENTER CHAIR CHAIRMAN CHAMPION CHANCE CHANEL CHANGE CHARLOTTE CHECK CHEER CHEERS CHEESE CHERISH CHERRY CHICKEN CHILD CHILDREN CHINA CHOCOLATE CHOICE CHOOSE CHRISTMAS CHROME CINDY CISCO CITY CLASSIC CLEAN CLEAR CLOSE CLOUD CLUB COACH COCK COCO CODE COFFEE COLD COLLECTION COLLEGE COLOR COLUMBIA COM COME COMES COMING COMMON COMMUNICATION COMPANY COMPLETE COMPUTER CONCERT CONSTANCE CONTINUE CONTROL COOK COOKIE COOL COPY COSPLAY COST COSTA COULD COUNTRY COURAGE COURSE COVER AZY EAM EATE EW OSS OWN YSTAL CULTURE CUP CURRENT CUT CUTE DAILY DANCE DANCER DANCING DARK DATA DATE DAUGHTER DAVID DAY DEAD DEAL DEAR DEATH DEBUG DECEMBER DECISION DEEP DEER DEFAULT DELL DELPHI DESTINY DEVICE DIAMOND DICK DID DIE DIFFERENCE DIFFERENT DIFFICULT DINNER DIOR DISABLE DISCUSS DISCUSSION DISK DISNEY DISPLAY DISS DO DOES DOG DOING DONALD DONE DOOR DOS DOUBLE DOWN DRAGON DREAM DREAMER DREAMS DRESS DRIVE DRIVER DRY EACH EARTH EAST EAT EBAY ECHO EDGE EIGHT EINSTEIN EITHER ELEVEN ELSE EMAIL ENABLE END ENDING ENERGY ENGLISH ENJOY ENOUGH ENTER EVEN EVENING EVENT EVER EVERY EVERYBODY EVERYDAY EVERYONE EVERYTHING EVIL EXCEL EXCITING EXIST EXPLAIN EXPRESS EYE EYES FACE FACT FADED FAIL FAILED FAIRY FAITH FALL FAMILY FAN FANCY FANS FANTASTIC FAR FARM FASHION FAST FAT FATHER FAVORITE FEAR FEBRUARY FEEL FEELING FEET FESTIVAL FEW FIELD FIFA FIGHT FIGHTING FILE FILES FILL FILM FINAL FINALLY FIND FINE FINGER FINISH FIRE FIREFOX FIRST FISH FIT FIVE FLASH FLOOR FLOWER FLY FLYING FOOD FOOL FOOT FORCE FORHEIGN FOREVER FORGET FORM FORWARD FOUND FOUR FOX FRANK FREE FREEDOM FRESH FRIDAY FRIEND FRIENDS FROM FUCK FUCKING FULL FUN FUNCTION FUNNY FUTURE GAIN GALAXY GAME GAP GARDEN GATE GAY GEAR GEEK GEFORCE GENERAL GEORGE GET GETTING GHOST GIFT GIRL GIT GIVE GIVING GLASS GLOBAL GLORIA GO GOAL GOD GOES GOING GOLD GOLDEN GOLF GONE GONNA GOOD GOODBYE GOODNIGHT GOOGLE GOT GRACE GRADE GRAND GREAT GREATEST GREEN GROUND GROUP GROW GUCCI GUESS GUN GUY HAD HAIR HALF HAND HANDS HANG HAPPEN HAPPINESS HARD HARRY HARVARD HAS HATE HAVE HAVING HEAD HEALTH HEALTHY HEAR HEAT HEAVEN HEAVY HELEN HELL HELLO HELP HER HERE HERO HEY HI HIGH HILL HIM HIS HISTORY HIT HOLD HOLIDAY HOLMES HOME HONDA HONEY HONOR HOPE HOSPITAL HOST HOT HOUR HOUSE HOW HOWEVER HUG HUMAN HUNDRED HUNTER HURT HUSBAND ICE ICY IDEA IDOL IF IMAGE IMAGINE IMAX IMPOSSIBLE IN INCLUDE INDEED INDEX INFINITE INFORMATION INPUT INSIDE INSIGHT INSTALL INT INTEL INTEREST INTERESTING INTERNATIONAL INTERVIEW INTO IPHONE IPOD IRON IS ISLAND ITEM ITS IVY JACK JACKY JAM JAMES JANUARY JAPAN JAPANESE JASON JAVA JAY JAZZ JEAN JEEP JERRY JIM JOB JOE JOEY JOHN JOKER JORDAN JOURNAL JOURNEY JOY JUICY JULY JUMP JUNE JUNIOR JUNK JUST JUSTIN KATE KEEP KEY KEYBOARD KICK KID KIDS KILL KIM KIND KINDLE KING KISS KIT KITTY KNOCKING KNOW KOBE KOP KOREA KOREAN LAB LADY LAN LAND LANGUAGE LARGE LARRY LAST LATE LATER LAUGH LAUGHING LAW LAY LAZY LEAD LEADER LEARN LEARNING LEAVE LEFT LEG LEGAL LEMON LET LETTER LEVEL LICENSE LIGHT LIKE LILY LINE LINK LINUX LIST LISTEN LITTLE LIVE LIVING LOCAL LOCATION LOFT LOG LOGO LONDON LONELY LONG LOOK LOOKING LOSE LOSER LOST LOT LOVE LOVED LOVELY LOVER LOVING LOW LUCK LUCKY LUCY LUNCH MAC MACHINE MADE MAGIC MAIL MAIN 
MAJOR MAKE MAKES MALL MAMA MAN MANAGER MANGO MANY MAP MARCH MARCO MARK MARKET MARKETING MARRY MARTIN MARY MASTER MATCH MATE MATLAB MATTER MAX MAY MAYA ME MEANS MEDIA MEDICAL MEET MEETING MEMORY MEN MERRY MESSAGE METHOD METRO MEXICO MICHAEL MIO MIOPHONE MIOSOFT MID MIDDLE MIGHT MIKE MILES MILLION MIND MINE MINI MINUTE MIRROR MISS MISSING MIX MOBILE MOD MODE MODERN MOM MOMENT MONDAY MONEY MONICA MONKEY MONSTER MONTH MOON MORE MORNING MOST MOTHER MOUNTAIN MOUTH MOVE MOVIE MR MRS MUCH MUSE MUSIC MUST MY MYSELF NAME NASA NATION NATURAL NATURE NEAR NEARLY NEED NET NETWORK NEVER NEW NEWS NEWSPAPER NEWTON NEXT NEXUS NICE NICK NIGHT NIKE NINE NO NOBODY NOKIA NONE NORTH NOT NOTE NOTHING NOVEMBER NOW NULL NUMBER OBJECT OCCUR OCEAN OCTOBER OF OFF OFFER OFFICE OFFICIAL OFTEN OH OK OKAY OLD OLIVIA ON ONCE ONE ONES ONLINE ONLY OPEN OPPO OPTION OR ORACLE ORDER OSCAR OTHER OTHERS OUR OUT OUTLOOK OVER OWEN OXFORD PAD PAGE PAIN PANDA PANDORA PAPA PAPER PARENT PARIS PARK PART PARTNER PARTY PASS PASSION PASSWORD PAST PAY PAYPAL PEACE PENNY PEOPLE PERFECT PERSONAL PET PHONE PHOTO PHOTOS PHOTOSHOP PIANO PICK PICTURE PIECE PIG PIN PINK PIZZA PLACE PLAN PLAY PLEASE POCKET POINT POLICE POLO POOR POP POPULAR POSE POSITIVE POSSIBLE POST POWER PRACTICE PRE PREFER PRESENT PRESIDENT PRESS PRETTY PRICE PRINCE PRINCESS PRIVATE PRO PROBLEM PROCESS PROFESSIONAL PROFESSOR PROGRAM PROJECT PROMISE PROTECT PUBLIC PULL PUNK PURE PURPLE PUSH PUT PYTHON QUEEN QUEENIE QUESTION QUITE RABBIT RADIO RAIN RAINBOW RAISE RAM RAN RANGE RAP RATE RATHER RAY READ READY REAL REALLY REASON RECENT RECORD RED REGRET RELEASE REMAIN REMEMBER REPORT REPOST REQUEST RESEARCH REST RESULT RETURN REVIEW RICE RIGHT RING RISE RISK RIVER ROCK ROCKY ROLLING ROM ROOM ROOT ROYAL RULE RUN RUNNING RUSH SAD SAFE SAID SALES SAM SAME SAMSUNG SANDY SATURDAY SAVE SAY SCHOOL SCIENCE SCIENTIST SCIENTISTS SEEN SIPT SEASON SECOND SEET SEETARY SECTION SECURITY SEE SEED SELECT SELF SELL SEND SENIOR SENSE SEOUL SEPTEMBER SERIOUS SERVE SERVER SERVICE SET SETUP SEVERAL SEX SEXUAL SEXY SHADOW SHARE SHARP SHE SHELDON SHELL SHERLOCK SHIFT SHINE SHINING SHIRT SHIT SHOCK SHOES SHOOT SHOP SHORT SHOT SHOULD SHOW SHUFFLE SHY SIDE SILENCE SILENT SILVER SIM SIMPLE SINA SINCE SING SIR SIRI SISTER SIT SITE SIX SIZE SKIN SKY SKYPE SLEEP SMALL SMART SMILE SNEAKER SO SOCIAL SOCKET SOFTWARE SOLDIER SOLO SOME SOMEBODY SOMEONE SOMETHING SOMETIME SOMETIMES SON SONG SONY SOON SORRY SOUND SOURCE SOUTH SPA SPACE SPEAK SPEAKER SPECIAL SPEED SPEND SPIDER SPORT SPRING STAFF STAGE STAND STANFORD STAR STARS START STATE STATEMENT STATIC STATION STAY STEAM STEVE STEVEN STILL STONE STOP STORE STORY STREET STRING STRONG STRONGER STUDENT STUDIO STUDY STUFF STYLE SUB SUBJECT SUCCESS SUCH SUCK SUE SUGAR SUMMER SUN SUNDAY SUNFLOWER SUNNY SUNSHINE SUPER SUPERMAN SUPPER SUPPORT SURFACE SURPRISE SWEET SWIFT SYSTEM TAB TABLE TAG TAKE TALE TALK TANK TASK TATTOO TAX TAXI TAYLOR TEA TEACH TEACHER TEAM TEAMWORK TEARS TECHNOLOGY TED TELEVISION TELL TEN TENCENT TERESA TEST TEXT THAN THANK THANKS THAT THE THEIR THEM THEN THEORY THERE THESE THEY THING THINGS THINK THINKING THIRD THOSE THOUGH THOUGHT THOUSAND THREE THROUGH THROW THURSDAY TIFFANY TIGER TIME TIMELINE TIMES TIMING TIPS TIRED TITLE TO TODAY TOGETHER TOKYO TOM TOMATO TOMORROW TONIGHT TOO TOP TOTAL TOUCH TOUR TOWARD TOWN TRADE TRAVEL TREAT TREE TRICK TRIP TROUBLE TRUE TRUMP TRUST TRUTH TRY TUESDAY TURBO TURN TWITTER TWO TYPE UBER UBUNTU UNCLE UNDER UNDERSTAND UNIQUE UNIT UNITED UNITY UNIVERSITY UNIX UNTIL UP UPDATE US USE USER USUALLY VALUE VAMPIRE VAN VERSION 
VERY VIA VICTOR VICTORIA VIDEO VIEW VISA VISION VISTA VISUAL VIVO VOGUE VOICE VOID WAIT WAITING WAKE WALK WALKER WALL WANNA WAR WARM WASHINGTON WATCH WATER WATSON WAY WE WEAR WEB WEDDING WEDNESDAY WEEK WEEKEND WEIGHT WELCOME WELL WERE WEST WESTERN WHAT WHATEVER WHATS WHEN WHERE WHICH WHILE WHITE WHO WHOLE WHOM WHOSE WHY WIFE WIFI WIN WIND WINDOW WINDOWS WING WINNER WINNIE WINTER WISH WITH WITHIN WOLF WOMAN WOMEN WON WONDER WONT WORD WORK WORLD WORRY WORTH WOULD WOW WRITE WRONG XBOX YAHOO YEAH YEAR YELLOW YES YESTERDAY YET YOGA YORK YOU YOUNG YOUR YOURS YOURSELF YOUTH YOUTUBE ZARA ZERO ZIP ZONE ACCOUNT ADIDAS AGREE AIRPORT AMERICA AREA ASUS BABY BETWEEN BLOG CAFE CARE CLASS DADDY DAYS DESIGN DOCTOR EASY ECLIPSE EGG ERROR EXCHANGE EXPLORER FACTOR FANTASY FATE FOCUS FOR GOSSIP GRAY HAPPY HE HEART HOLY HOTEL IMPORTANT INTERNET IT JOBS JOIN KINGDOM LESS LIE LION LOAD MACBOOK MAGAZINE MAKING MAYBE MEAN MILK MINOR MIRACLE MODEL MUMMY NOVA NVIDIA ORANGE PERSON PIRATE PLAYER PLUS POSITION PRINT RACE RAINY REQUIRE ROAD ROSE SAT SCHEDULE SCORE SEVEN SHOPPING SIGN SINGLE SNOW SOFT SOUL STEP STUPID SURE TEDDY THIS TIM TWINS USED VISIT WANT WAS WILL WINE WITHOUT WONDERFUL WORDS WORKER EXISTS ACFUN AKB ANGELABABY BEAUTYLEG BIEBER BIGBANG CHINAJOY CHRISTY COULDNT CSGO DEBIAN DIDNT DOESNT DONT DOTA MOJI FACEBOOK GANK GATES GIRLS GUYS HIPHOP HTML ICLOUD INS INSTAGRAM IPAD JQUERY LOLY LOVES MEMORIES OCLOCK OLAY PEPPA PRADA ROLEX SDCARD SQL STARBUCKS TFBOYS THINKPAD VISIO WASNT WEARE WECHAT WEIBO YEARS ESPACE LIFE CHINESE EXIT SEA EXPERIENCE LETUS CASE HDMI EMUI FOLLOW CARRY MIKEY WCDMA BILIBILI FOREIGN CAMARA AMBER KEIL OBAMA HERMES BURBERRY ASCII ARIES LETS SWATCH QQ YY AV ID WLAN KTV CF CP CS DNF MM MV PK PP PPT PS TXT VS AIRPLANE BLUETOOTH AI OFO IOS API AARON ABBY ABEL ABRAHAM ADA ADAM ADRIAN ADRIENNE AGNES AID AKON ALAN ALBERT ALBERTO ALEJANDRO ALEXANDER ALEXANDRA ALEXIS ALFONSO ALFRED ALFREDO ALICE ALICIA ALIN ALISON ALLAH ALLAN ALLEN ALLIN ALMA ALONZO ALTON ALVIN ALYSSA AMANDA AMANI AMELIA AMOS AMY ANDRE ANDREA ANDRES ANDREW ANDY ANGELINA ANGELO ANGIE ANITA ANN ANNA ANNE ANNETTE ANNIE ANNY ANTHONY ANTOINETTE ANTONIA ANTONIO ARA ARCHIE ARD ARENA ARLENE ARMANDO ARNOLD ARON ARTHUR ARTURO ASHLEY ATLAS AUBREY AUDREY AUSTIN AVRIL BAN BANDA BANDER BARBARA BARK BARRY BAT BEATRICE BECKY BELINDA BENJAMIN BENNIE BENNY BERNADETTE BERNARD BERNICE BERT BESSIE BETH BETHANY BETSY BETTY BEVERLY BEY BEYONCE BILLIE BILLY BING BLAKE BLANCA BOB BOBBIE BOBBY BOBO BODDY BONNIE BOOS BOYD BRAD BRADFORD BRADLEY BRANDI BRANDON BRENDA BRENDAN BRETT BREW BRIAN BRIDGET BRIGHTMAN BRITTANY BROOKE BRUCE BRYAN BRYANT BY2 BYRON CABLE CALEB CALVIN CAMERON CAMILLE CANDACE CANDICE CARL CARLA CARLOS CARLTON CARLY CARMEN CAROL CAROLE CAROLINE CAROLYN CARRIE CARROLL CARY CASEY CASSANDRA CATHERINE CATHY CECELIA CECIL CECILIA CEDRIC CELIA CELINE CESAR CHARLENE CHARLES CHARLIE CHELSEA CHERYL CHESTER CHRIS CHRISTIAN CHRISTIE CHRISTINA CHRISTINE CHRISTOPHER CLAIRE CLAN CLARA CLARE CLARENCE CLARK CLARKSON CLAUDE CLAUDIA CLAY CLAYTON CLEVER CLIFFORD CLIFTON CLINT CLINTON CLYDE COCOLEE CODY COLIN COLLAR COLLEEN CONNIE CONNOR CONRAD CONVERSE CORA COREY CORNELIUS CORTANA CORY COS COURTNEY AIG ISTINA CYNTHIA DADY DAISY DALE DAMON DANIEL DANIELLE DANNY DARIN DARLA DARLENE DARLING DARNELL DARREL DARRELL DARREN DARRIN DARRYL DARYL DASH DAVE DAWN DEANNA DEBBIE DEBORAH DEBRA DELBERT DELIA DELLA DELORES DENISE DENNIS DEREK DERRICK DESIREE DEVIN DIANA DIANE DIANNA DIANNE DINA DINE DION DIXIE DOLORES DOMINIC DOMINICK DONNA DONNIE DORA DOREEN DORIS DOROTHY DOT 
DOTT DOUG DOUGLAS DOYLE DRING DUANE DUSTIN DWAYNE DWIGHT EAGLES EARL EARNEST EASON EBONY ECONOMY EDDIE EDGAR EDITH EDMOND EDMUND EDNA EDUARDO EDWARD EDWIN EILEEN ELAINE ELBERT ELEANOR ELENA ELIJAH ELISA ELIZABETH ELLA ELLEN ELLIS ELMER ELOISE ELSA ELSIE ELVIRA EMANUEL EMILIO EMILY EMINEM EMMA EMMETT ENRIQUE ERICA ERICK ERIK ERIKA ERIN ERMA ERNESTINE ERNESTO ERVIN ESSIE ESTELLE ESTHER ETHEL EUGENE EUNICE EVA EVAN EVELYN EVERETT EXO FANNIE FANNY FAYE FELICIA FELIPE FELIX FERNANDO FIR FLORA FLORENCE FLOYD FORD FORREST FRANCES FRANCIS FRANCISCO FRANKIE FRANKLIN FRED FREDDIE FREDERICK FREDRICK FRIENDLY FRY GABRIEL GAGA GAIL GAILE GALA GARRETT GARRY GARY GAYLE GEE GEM GENEVIEVE GEOFFREY GEORGIA GERALD GERARD GERTRUDE GILBERT GILBERTO GINA GINGER GLAD GLADYS GLEN GLENDA GLENN GODEN GOMEZ GORDON GRADY GRANT GREG GREGG GREGORY GRETCHEN GREY GRIMES GUILLERMO GUL GUSTAVO GWEN GWENDOLYN HANNAH HAROLD HARRELL HARRIET HARVEY HATTIE HAYES HEAL HEATHER HEBE HECTOR HEIDI HELLE HENRIETTA HENRY HERBERT HERMAN HILDA HOLLE HOLLY HOMER HORACE HOWARD HUBERT HUGH HUGO IAN IGNACIO IKE INEZ IRA IRENE IRMA IRVIN IRVING ISAAC ISABEL ISIS ISMAEL IVAN JACKI JACKIE JACKSON JACOB JACQUELINE JACQUELYN JAIME JAKE JAMIE JANE JANET JANICE JANIE JANIS JARED JASMINE JAYDEN JEANETTE JEANNE JEANNETTE JEANNIE JEANS JEFF JEFFERY JEFFREY JENNA JENNIE JENNIFER JENNY JEPSEN JEREMY JERMAINE JEROME JESSE JESSICA JESSIE JESUS JILL JIMMIE JIMMY JOAN JOANN JOANNA JOANNE JODY JOEL JOHANNA JOHNATHAN JOHNNIE JOHNNY JOJO JOKE JOLIN JON JONATHAN JONATHON JOSEPH JOSEPHINE JOSH JOSHUA JOYCE JUANA JUDE JUDITH JUDY JULIA JULIAN JULIE JULIUS KAELA KARA KAREN KARL KARLA KATHERINE KATHLEEN KATHRYN KATHY KATIE KATRINA KATY KAY KAYLA KEITH KELLER KELLEY KELLI KENDRA KERRY KEVIN KIMBERLY KIMI KIRK KONE KRIS KRISTEN KRISTI KRISTINE KRISTOPHER KRISTY LANA LAURA LAUREN LAWRENCE LEAH LEE LELA LELAND LENKA LEON LEONA LESLIE LESTER LETICIA LEWIS LILLIAN LILLIE LINDA LINDSEY LIONEL LISA LOIS LOLA LOREN LORENA LORENZO LORETTA LORI LORRAINE LOUIS LOUISE LOVATO LUCAS LUCIA LUCILLE LUIS LUKE LULA LUTHER LYLE LYNDA LYNETTE LYNN LYNNE MABEL MAKIYO MANDY MANUEL MARCELLA MARCOS MARGARET MARGARITA MARGIE MARIANNE MARIE MARILYN MARIO MARION MARJORIE MARLON MARSHA MARTA MARTHA MARTY MARVIN MARX MARYANN MATTHEW MATTIE MAUREEN MAURICE MAXINE MEGAN MELANIE MELBA MELINDA MELISSA MELODY MELVIN MEREDITH MICHAELJACKSON MICHALE MICHEAL MICHELE MICHELLE MIDI MIGUEL MILDRED MILTON MIMI MINDY MIRANDA MIRIAM MISTY MITCHELL MOMO MONA MOR MORRIS MOSES MUMA MURIEL MYRA NADINE NAIM NANCY NAOMI NATALIE NATASHA NATHAN NATHANIEL NELLIE NETTIE NICHOLAS NICHOLE NICOLAS NICOLE NIMMO NINA NORA NORMAN NORWOOD OLIVER OLLIE OMAR ORA ORLANDO ORVILLE OTIS PAM PAMELA PATRICIA PATRICK PATTI PATY PAUL PAULA PEGGY PERCY PETER PHILIP PHILLIP PRESTON PRISCILLA QUEENA RACHAEL RACHEL RAFAEL RAMONA RANDAL RANDALL RANDOLPH RANDY RAYMOND REBECCA REGINALD RENE RENEE RHONDA RICARDO RICHARD RIHANNA RITA RITE ROBBIE ROBBIEWILLIAMS ROBERT ROBERTA ROBERTO ROBYN ROCHELLE RODNEY ROGELIO ROGER ROLAND ROLANDO RONALD RONNIE ROOSEVELT ROSA ROSALIE ROSIE ROSS ROXANNE RUBEN RUDOLPH RUDY RUFUS RUSSELL RUTH RYAN SABRINA SADIE SAMANTHA SAMMY SAMUEL SANDRA SARA SARAH SAUL SCOTT SEAN SELENA SELINA SERGIO SETH SHAKIRA SHANE SHANIA SHANICE SHANNON SHARI SHARON SHAUN SHAWN SHAWNA SHAYNE SHEILA SHELIA SHELLEY SHERI SHERMAN SHERRI SHERYL SHIRLEY SIDNEY SILVIA SIMON SMITH SODA SOFIA SOM SONIA SONYA SOPHIA SOPHIE STACEY STACY STADION STANLEY STEPHEN STEVIE SUSAN SUSIE SUZANNE TABITHA TAMARA TANNER TANYA TEMPO TERENCE TERRANCE TERRENCE 
TERRI TERRY THELMA THEODORE THERESA THOMAS TIMBALAND TIMMY TINA TOBY TOMAS TOMMIE TOMMY TONI TONYA TRACEY TRACI TRACY TRAVIS TREVOR TRICIA VALERIE VANESSA VELMA VERNA VERNON VICKI VICKIE VIOLET VIRGIL VIRGINIA VITAS VIVIAN WALLACE WANDA WARD WAYNE WENDELL WENDY WESLEY WILLIAM WELLS WILBERT WILBUR WILFRED WILLARD WILLIAMS WILLIE WILLIS WILMA WILSON WINIFRED WINSTON YOLANDA YVONNE ZICO STELLA KELLY BYRNE STEPHANIE SILIENT TODD ALBERTA KENNETH NELSON ERNEST MINNIE GDRAGON JOSE SYLVIA JEREMIAH FRIENDSHIP JUAN DEAN PHYLLIS KATYPERRY CARLYRAE RICKY JIMKELLER PATSY BEA JAVIER CURTIS ALANCHEN MARCUS DREW JORGE FREDA CHE WALTER JULIO JUANITA RAQUEL LLOYD WADE MARIA STEWART HAZEL NORMA DRAW DAVEY SALLY DON LELAN EARLY VINCENT ROY TAMMY JOSEFINA ERIC MARCIA CLAYDERMAN TONY BYTWO FATHERS HENE RUBY LEONARD DEWEY RAUL TIMOTHY GAMES CARPENTERS ALLISON AMN LADYGAGA WINDS PERCENT MAKER TIZZY SNH48 BACKSTREET AIMER JONY AKB48 VICTORY CONTRACTIONSHOULDNT CONTRACTIONWHERELL CONTRACTIONWOULDNT CONTRACTIONCOULDNT CONTRACTIONMIGHTNT CONTRACTIONTHEYRE CONTRACTIONTHEYLL CONTRACTIONTHEYVE CONTRACTIONTHATLL CONTRACTIONWHATRE CONTRACTIONWHATLL CONTRACTIONWHERES CONTRACTIONWHERED CONTRACTIONWHENLL CONTRACTIONWERENT CONTRACTIONHAVENT CONTRACTIONDOESNT CONTRACTIONMUSTNT CONTRACTIONOCLOCK CONTRACTIONYOURE CONTRACTIONYOULL CONTRACTIONYOUVE CONTRACTIONSHELL CONTRACTIONTHEYD CONTRACTIONTHATS CONTRACTIONTHATD CONTRACTIONWHOLL CONTRACTIONWHATS CONTRACTIONWHATD CONTRACTIONWHENS CONTRACTIONWHEND CONTRACTIONWHYLL CONTRACTIONHOWLL CONTRACTIONARENT CONTRACTIONWASNT CONTRACTIONHASNT CONTRACTIONHADNT CONTRACTIONDIDNT CONTRACTIONYOUD CONTRACTIONHELL CONTRACTIONSHES CONTRACTIONSHED CONTRACTIONITLL CONTRACTIONWERE CONTRACTIONWELL CONTRACTIONWEVE CONTRACTIONWHOS CONTRACTIONWHOD CONTRACTIONWHYS CONTRACTIONWHYD CONTRACTIONHOWS CONTRACTIONHOWD CONTRACTIONISNT CONTRACTIONWONT CONTRACTIONDONT CONTRACTIONCANT CONTRACTIONILL CONTRACTIONIVE CONTRACTIONHES CONTRACTIONHED CONTRACTIONITS CONTRACTIONITD CONTRACTIONWED CONTRACTIONIM CONTRACTIONID SHOUD MISTER FENDIMAN BABYFACE FOURTY LINKA BEATLESS STORIES KISKIS ANYMORE ODYSSEY EVERYWHERE DEVILSLINE IMAGES USAHANA ACCIDENTALLY INGRESS MAINTENANT DREAMERS OVERLORD RELOAD SAFARI ROBERTS SUNSET MAGGIE ROLL FUSE ABS UME ANIMA SHANGHAI ENCORE MANHAND LIN LOCKING SWITCH TRUCK WORKING BABYLOONZ DEMO YELL RUNFOX RERIDED FALLER WARRIORS IMAGINATION AUMPAIPORN BRAHMS JOYJOY REGENESIS LESLI ISAIAH MOFY YHBOYS ENDLESSWHITE MOUVEMENT SLOW AKI CUTIE HELLOKITTY REALIZE MUFFIN AEW VIVID KRAWITZ BREATH MACH DIY HEYKIDS GEORGEN EXTRA DETROIT HELENA SUNNEE COMIC RELIFE PIPI NEO BLOSSOM KAIJUDO LULLABY GENNEO IDOLISH YOUTHFUL RANGERS BABYJOYJOY BUTLERS BLAST AICO ABCSONG CCUP ZEROG GAI AINY HELLOKONGZI HUTOS SPARK HO KITTISAK HAPPYKISS REMIX MEGA HIP HOP JAE HRS FOURTH FIFTH AH FORK LIP MRRIGHT SAIL HOLLYWOOD PANAMA PIS DOC RESPECT SALALA VENUS MARS CAGE CHEMICAL CONGRATULATIONS KALA SOSO INSTRUMENT SICKNESS DISCO CYBER ACOUSTIC EDEN INFINITY BRAIN HIPOP DIMENSION ROSY CHACHA PASTY TA SHINNING VITAMIN FREESTYLE SIGNAL CEO VAE GENTLEMAN XMAS DROP ZOMBIE ADVENTURE SCAPE HOHO SPELL IQ NANA PRECIOUS ICHIBAN SUPERSTAR TRULY INTRO OS GUIDANCE JINGLE SHAKE SHAPE FRAGMENT ONER SENS ITTY BLOCK SHELLY PANTA HOLLOW NOISE PAT REVERSE OMEN RUSSIAN YOO PUPPY PRISM NONONO DEPARTURE CBA NBA CHAOS HORIZON INTRODUCTION GUARD GUARDIAN PHANTOM JEWEL INNOVATION SWING WALTZ WEIWEI PRIDE PLANET COMPLEX DOUBT CHANNEL ACHE ORZ KIKI LEO CC YOYO LINA HEDI GROWTH MOOSE CLASSICAL WEED SONIC TASTE WISE JOHNSON HA PAC 5IVE MEI TIC TRACK JUSTINE 
LIQUID DIARY MISSION SMOKE SMOKING NEIGHBOR LESSON FILTHY SMASH DRAMA CLIMAX FAKE LAKE DOLLAR SAN COMEDY CHAT CENTRAL LATELY LAWYER SKETCH FOSTER CHARACTER GAMER AWARD CHEF COMMERCIAL MACDONALD JAIL PAIR TITAN MAXIMUM STABLE UNSTABLE NORTON ULTRA REVOLUTION TICKET ANCIENT CAVE DEMAND NOBODIES UNTITLED MARVEL CINEMATIC CINEMA BEGAN DANGEROUS ASSASSIN KNIGHT DISCOVERY YO MA JUSTICE CARSON JAMIESON MORRISON PINSON DE SE VIP YANG MARRON GENERATION PUTH TROYE SIVAN ADELE LINKIN BRUNO DIRECTION ED SHEERAN OWL CNBLUE AMIGO WASH CHICK ABOVE ACID IDIOT ASH BACH SUITE BAMBOO BANDARI BARI IMPROV BEATRICH BLAH BLAME BODAK BON BONEY BOOM CLAP WILD BREACH CHINATWO CAUGHT CONSOUL TRAININ OOKED YANKEE DARKNESS BOWIE DECK HALL DESPICABLE DEVOTE DINERO DISCONNECT DIVE DJETHAN ORLEANS TONITE DRUNK ELECTRICITY ELSTEN TORRES EMPIRE EMPTY DRINK FADE FAMOUS RAINCOAT FIX UPPER FLOW FREAK FURTHER CHOP GOTTA ADDICT GROOVE COVERAGE MANE HALO AINT HOOD REMIND HYDE POLAND PRAYER ILL IMAGINARY IMMIGRANT IMMORTAL ORIGINALLY PERFORM INNOCENCE JONES SUIT PAID KANE KESHA KNOCK VALENTINE APART TEAR STRANGER STRIKE MACK MAD MULLET MARIAH CAREY PETRIE MATTEO MAXIMILIAN HECKER AGAINST MEGHAN TRAINOR MERCY RONSON INVINCIBLE PERRY MILLIONAIRE TEMPLE THEME LIBRARY MYSTERY NEVADE ALASKA OLLY MURS IM OPERA PAINT PARADISE CHAMPAGNE POLLY MALONE PRAY SWAN MARRAKESH ROCKET ROUND SNHFOURTY BOAT SCARDO SCARE SCARED SEVE SHALL SHAME SHEEP EFFECT SNITCH SOCAN SOFI TUKER LOVIN SPIRIT SPOON STEPPENWOLF STRAWBERRY CIGARETTE SUEDE GRILL SHOULDER SHOUT USA SURVIVE PROPHET SUTTER SWIFTY BEATLE CHAIN SMOKER MAGICAL SPECTRE TRAIN RAT GOIN CAME THOMSTON TOCH TREASURE PILOT TWENTY UNBELIEVABLE UNLESS UNPREDICTABLE UPTOWN FUNK UTWO VARIOUS ARTIST WICK WONDERLAND WINKY WRECKING DUMB BROKE YOURE AUDIO UNRAVEL ASHES WASTE WEATHER PARADE BURN BURNING @ [OOV] [BLANK] +[NULL] [START] [SEP] [MASK] a0 a1 ai1 ai2 ai3 ai4 an1 an3 an4 ang1 ang2 ang4 ao1 ao2 ao3 ao4 ba1 ba2 ba3 ba4 bai1 bai2 bai3 bai4 ban1 ban3 ban4 bang1 bang3 bang4 bao1 bao2 bao3 bao4 bei0 bei1 bei3 bei4 ben1 ben3 ben4 beng1 beng2 beng4 bi1 bi2 bi3 bi4 bian1 bian3 bian4 biao1 biao3 biao4 bie1 bie2 bie3 bie4 bin1 bin4 bing1 bing3 bing4 bo0 bo1 bo2 bo3 bo4 bu1 bu2 bu3 bu4 ca1 cai1 cai2 cai3 cai4 can1 can2 can3 can4 cang1 cang2 cao1 cao2 cao3 ce4 cen1 cen2 ceng1 ceng2 ceng4 cha1 cha2 cha3 cha4 chai1 chai2 chai4 chan1 chan2 chan3 chan4 chang1 chang2 chang3 chang4 chao1 chao2 chao3 che1 che3 che4 chen1 chen2 chen3 chen4 cheng1 cheng2 cheng3 cheng4 chi1 chi2 chi3 chi4 chong1 chong2 chong3 chong4 chou1 chou2 chou3 chou4 chu1 chu2 chu3 chu4 chuai1 chuai3 chuai4 chuan1 chuan2 chuan3 chuan4 chuang1 chuang2 chuang3 chuang4 chui1 chui2 chun1 chun2 chun3 chuo1 chuo4 ci1 ci2 ci3 ci4 cong1 cong2 cou4 cu1 cu2 cu4 cuan1 cuan2 cuan4 cui1 cui3 cui4 cun1 cun2 cun3 cun4 cuo1 cuo2 cuo4 da0 da1 da2 da3 da4 dai1 dai3 dai4 dan1 dan3 dan4 dang1 dang3 dang4 dao1 dao2 dao3 dao4 de0 de1 de2 de4 dei3 deng1 deng3 deng4 di1 di2 di3 di4 dia3 dian1 dian3 dian4 diao1 diao3 diao4 die1 die2 ding1 ding3 ding4 diu1 dong1 dong3 dong4 dou1 dou3 dou4 du1 du2 du3 du4 duan1 duan3 duan4 dui1 dui3 dui4 dun1 dun3 dun4 duo1 duo2 duo3 duo4 e1 e2 e3 e4 ei1 en1 en4 er0 er2 er3 er4 fa1 fa2 fa3 fa4 fan1 fan2 fan3 fan4 fang1 fang2 fang3 fang4 fei1 fei2 fei3 fei4 fen1 fen2 fen3 fen4 feng1 feng2 feng3 feng4 fo2 fou3 fu1 fu2 fu3 fu4 ga1 ga2 ga3 ga4 gai1 gai3 gai4 gan1 gan3 gan4 gang1 gang3 gang4 gao1 gao3 gao4 ge1 ge2 ge3 ge4 gei3 gen1 gen2 gen4 geng1 geng3 geng4 gong1 gong3 gong4 gou1 gou3 gou4 gu1 gu3 gu4 gua1 gua3 gua4 guai1 guai3 guai4 guan1 
guan3 guan4 guang1 guang3 guang4 gui1 gui3 gui4 gun3 gun4 guo1 guo2 guo3 guo4 ha1 ha2 hai1 hai2 hai3 hai4 han1 han2 han3 han4 hang1 hang2 hang4 hao1 hao2 hao3 hao4 he1 he2 he4 hei1 hen2 hen3 hen4 heng1 heng2 heng4 hong1 hong2 hong3 hong4 hou1 hou2 hou3 hou4 hu1 hu2 hu3 hu4 hua1 hua2 hua4 huai2 huai4 huan1 huan2 huan3 huan4 huang1 huang2 huang3 huang4 hui1 hui2 hui3 hui4 hun1 hun2 hun4 huo1 huo2 huo3 huo4 ji1 ji2 ji3 ji4 jia1 jia2 jia3 jia4 jian1 jian3 jian4 jiang1 jiang3 jiang4 jiao1 jiao2 jiao3 jiao4 jie1 jie2 jie3 jie4 jin1 jin3 jin4 jing1 jing3 jing4 jiong1 jiong3 jiu1 jiu3 jiu4 ju1 ju2 ju3 ju4 juan1 juan3 juan4 jue1 jue2 jue4 jun1 jun4 ka1 ka3 kai1 kai3 kai4 kan1 kan3 kan4 kang1 kang2 kang4 kao1 kao3 kao4 ke1 ke2 ke3 ke4 kei1 ken3 ken4 keng1 kong1 kong3 kong4 kou1 kou3 kou4 ku1 ku3 ku4 kua1 kua3 kua4 kuai3 kuai4 kuan1 kuan3 kuang1 kuang2 kuang3 kuang4 kui1 kui2 kui3 kui4 kun1 kun3 kun4 kuo4 la1 la2 la3 la4 lai2 lai4 lan2 lan3 lan4 lang1 lang2 lang3 lang4 lao1 lao2 lao3 lao4 le0 le4 lei0 lei1 lei2 lei3 lei4 leng2 leng3 leng4 li0 li1 li2 li3 li4 lia3 lian2 lian3 lian4 liang2 liang3 liang4 liao2 liao3 liao4 lie1 lie3 lie4 lin1 lin2 lin3 lin4 ling2 ling3 ling4 liu1 liu2 liu3 liu4 lo0 long2 long3 lou1 lou2 lou3 lou4 lu1 lu2 lu3 lu4 luan2 luan3 luan4 lun1 lun2 lun4 luo1 luo2 luo3 luo4 lv2 lv3 lv4 lve4 ma0 ma1 ma2 ma3 ma4 mai2 mai3 mai4 man1 man2 man3 man4 mang2 mang3 mao1 mao2 mao3 mao4 me0 mei2 mei3 mei4 men0 men1 men2 men4 meng1 meng2 meng3 meng4 mi1 mi2 mi3 mi4 mian2 mian3 mian4 miao1 miao2 miao3 miao4 mie1 mie4 min2 min3 ming2 ming3 ming4 miu4 mo1 mo2 mo3 mo4 mou1 mou2 mou3 mu2 mu3 mu4 na2 na3 na4 nai3 nai4 nan1 nan2 nan3 nan4 nang1 nang2 nang3 nao1 nao2 nao3 nao4 ne0 ne2 ne4 nei3 nei4 nen4 neng2 ng0 ni1 ni2 ni3 ni4 nian1 nian2 nian3 nian4 niang2 niang4 niao3 niao4 nie1 nie4 nin2 ning2 ning3 ning4 niu1 niu2 niu3 niu4 nong2 nong4 nou4 nu2 nu3 nu4 nuan3 nuo2 nuo4 nv3 nve4 o1 ou1 ou3 ou4 pa1 pa2 pa4 pai1 pai2 pai3 pai4 pan1 pan2 pan4 pang1 pang2 pang3 pang4 pao1 pao2 pao3 pao4 pei1 pei2 pei4 pen1 pen2 peng1 peng2 peng3 peng4 pi1 pi2 pi3 pi4 pian1 pian2 pian3 pian4 piao1 piao2 piao3 piao4 pie1 pie3 pin1 pin2 pin3 pin4 ping1 ping2 po1 po2 po3 po4 pou1 pou2 pu1 pu2 pu3 pu4 qi1 qi2 qi3 qi4 qia1 qia3 qia4 qian1 qian2 qian3 qian4 qiang1 qiang2 qiang3 qiang4 qiao1 qiao2 qiao3 qiao4 qie1 qie2 qie3 qie4 qin1 qin2 qin3 qin4 qing1 qing2 qing3 qing4 qiong2 qiu1 qiu2 qiu3 qu1 qu2 qu3 qu4 quan1 quan2 quan3 quan4 que1 que2 que4 qun1 qun2 ran2 ran3 rang2 rang3 rang4 rao2 rao3 rao4 re3 re4 ren2 ren3 ren4 reng1 reng2 ri4 rong2 rong3 rou2 rou4 ru2 ru3 ru4 ruan3 rui2 rui3 rui4 run4 ruo4 sa1 sa3 sa4 sai1 sai4 san1 san3 san4 sang1 sang3 sang4 sao1 sao3 sao4 se4 sen1 seng1 sha1 sha2 sha3 sha4 shai1 shai3 shai4 shan1 shan3 shan4 shang0 shang1 shang3 shang4 shao1 shao2 shao3 shao4 she1 she2 she3 she4 shei2 shen1 shen2 shen3 shen4 sheng1 sheng2 sheng3 sheng4 shi0 shi1 shi2 shi3 shi4 shou1 shou3 shou4 shu1 shu2 shu3 shu4 shua1 shua3 shuai1 shuai3 shuai4 shuan1 shuan4 shuang1 shuang3 shui3 shui4 shun3 shun4 shuo1 shuo4 si1 si3 si4 song1 song3 song4 sou1 sou3 sou4 su1 su2 su4 suan1 suan4 sui1 sui2 sui3 sui4 sun1 sun3 suo1 suo3 ta1 ta3 ta4 tai1 tai2 tai4 tan1 tan2 tan3 tan4 tang1 tang2 tang3 tang4 tao1 tao2 tao3 tao4 te4 teng2 ti1 ti2 ti3 ti4 tian1 tian2 tian3 tiao1 tiao2 tiao3 tiao4 tie1 tie3 tie4 ting1 ting2 ting3 tong1 tong2 tong3 tong4 tou1 tou2 tou4 tu1 tu2 tu3 tu4 tuan1 tuan2 tuan3 tui1 tui2 tui3 tui4 tun1 tun2 tuo1 tuo2 tuo3 tuo4 wa1 wa2 wa3 wa4 wai1 wai3 wai4 wan1 wan2 wan3 wan4 wang1 wang2 wang3 wang4 wei1 wei2 
wei3 wei4 wen1 wen2 wen3 wen4 weng1 weng3 weng4 wo1 wo3 wo4 wu1 wu2 wu3 wu4 xi1 xi2 xi3 xi4 xia1 xia2 xia4 xian1 xian2 xian3 xian4 xiang1 xiang2 xiang3 xiang4 xiao1 xiao2 xiao3 xiao4 xie1 xie2 xie3 xie4 xin1 xin4 xing1 xing2 xing3 xing4 xiong1 xiong2 xiu1 xiu3 xiu4 xu0 xu1 xu2 xu3 xu4 xuan1 xuan2 xuan3 xuan4 xue1 xue2 xue3 xue4 xun1 xun2 xun4 ya0 ya1 ya2 ya3 ya4 yan1 yan2 yan3 yan4 yang1 yang2 yang3 yang4 yao1 yao2 yao3 yao4 ye1 ye2 ye3 ye4 yi1 yi2 yi3 yi4 yin1 yin2 yin3 yin4 ying1 ying2 ying3 ying4 yo1 yong1 yong2 yong3 yong4 you1 you2 you3 you4 yu1 yu2 yu3 yu4 yuan1 yuan2 yuan3 yuan4 yue1 yue3 yue4 yun1 yun2 yun3 yun4 za1 za2 za3 zai1 zai3 zai4 zan1 zan2 zan3 zan4 zang1 zang4 zao1 zao2 zao3 zao4 ze2 ze4 zei2 zen3 zen4 zeng1 zeng4 zha1 zha2 zha3 zha4 zhai1 zhai2 zhai3 zhai4 zhan1 zhan3 zhan4 zhang1 zhang3 zhang4 zhao1 zhao2 zhao3 zhao4 zhe0 zhe1 zhe2 zhe3 zhe4 zhen1 zhen3 zhen4 zheng1 zheng3 zheng4 zhi1 zhi2 zhi3 zhi4 zhong1 zhong3 zhong4 zhou1 zhou2 zhou3 zhou4 zhu1 zhu2 zhu3 zhu4 zhua1 zhua3 zhuai4 zhuan1 zhuan3 zhuan4 zhuang1 zhuang4 zhui1 zhui4 zhun1 zhun3 zhuo1 zhuo2 zi1 zi3 zi4 zong1 zong3 zong4 zou1 zou3 zou4 zu1 zu2 zu3 zuan1 zuan3 zuan4 zui3 zui4 zun1 zun3 zuo1 zuo2 zuo3 zuo4 a ai an ang ao ar b ba bai ban bang bao be bea bei ben beng ber bew bi bien bin bo bou bu cen d da dai dan dang de dea dei den di dia din diu do dong dou dra drai drhi drhing dro drong du e ea ei en er f fa fai fan fang fe fea fei fen feng few fi fing fo fou ft fu g ga gai gan ge gea gei gen gi gin go gong gou gu ha hai han hang hao he hea hei hew hi ho hong hou hu jei jew ji jia jiang jie jien jier jin jio jiong jiu jue juer jun k ka kai kan kang kea kei ken keng ker kew ki kin king kiou ko kon kou kre krhi krhin kro ks ksi kt ku kuai kwea kwi kwin la lai lan lang lao le lea lei len lew li lien lin ling lo long lou lu lun m ma mai man mang mao me mea mei men mi min ming miu mo mou mp ms mu mun na nai nan nao ne nea nei nen new ni ning no nou o ong ou p pa pai pan pang pe pea pei pen pew pi pia pien pin ping po pot pou ps pt pu q qai qew qi qia qiang qie qien qier qio quei rha rhai rhan rhang rhe rhea rhei rhen rhi rhin rhing rho rhong rhou ru ruo rza rzan rzao rzea rzei rzen rzer rzi rzin rzing rzong rzou s sa sai san sang sao se sea see sei sen sew sin sk so song sou sp st su sun t ta tai tan tang te tea tei ten tew ti tin ting to tong tou tra trai trhi trou tru ts tu twi twin v va vai van vea ven ver vew vi ving vo vou vrhi vs wa wai wan wang wao we wea wei wen wer wi win wo wong wu xew xi xia xiai xian xie xien xier xin xing xio xiu ya yang ye yen yer yew yi yier yin yo you zi [OOV] [BLANK] diff --git a/inference/examples/automatic_speech_recognition/asr_rnnt.cpp b/inference/examples/automatic_speech_recognition/asr_rnnt.cpp new file mode 100644 index 00000000..ae005d55 --- /dev/null +++ b/inference/examples/automatic_speech_recognition/asr_rnnt.cpp @@ -0,0 +1,110 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <iostream>
+
+#include "inference.hpp"
+#include "tensor.hpp"
+#include "data_loader.hpp"
+#include "profiling.h"
+#include "parse_command.h"
+
+int main(int argc, char *argv[])
+{
+    UNI_TIME_INIT
+    ParseRes parse_res;
+    parseCommandLine(argc, argv, &parse_res, "examples");
+
+    char *modelPath = (char *)"";
+    char *sequenceDirectory = (char *)"";
+    char *affinityPolicyName = (char *)"";
+
+    if (!parse_res.model.second) {
+        exit(-1);
+    }
+    if (parse_res.model.second) {
+        modelPath = parse_res.model.first;
+    }
+    if (parse_res.inputPath.second) {
+        sequenceDirectory = parse_res.inputPath.first;
+    }
+    if (parse_res.archInfo.second) {
+        affinityPolicyName = parse_res.archInfo.first;
+    }
+
+    auto pipeline = createPipeline(affinityPolicyName, modelPath);
+
+    // load sequences
+    std::map<std::string, std::shared_ptr<Tensor>> inMap = pipeline->get_inputs();
+    std::vector<TensorDesc> sequenceDescs;
+    TensorDesc soundInputDesc = (*(inMap["sounds"])).get_desc();
+    sequenceDescs.push_back(soundInputDesc);
+
+    std::vector<std::vector<Tensor>> sequences, results;
+    std::vector<std::string> sequencePaths =
+        load_data(sequenceDirectory + std::string("/input"), sequenceDescs, &sequences);
+    std::vector<TensorDesc> resultDescs;
+    resultDescs.push_back(soundInputDesc);
+    std::vector<std::string> resultPaths =
+        load_data(sequenceDirectory + std::string("/result"), resultDescs, &results);
+
+    double totalTime = 0;
+    U32 sequenceIndex = 0;
+    U32 falseResult = 0;
+    std::cout << "[RESULT]:" << std::endl;
+    for (auto sequence : sequences) {
+        std::cout << sequencePaths[sequenceIndex] << ": " << std::endl;
+        TensorDesc desc = sequence[0].get_desc();
+        TensorDesc inputDesc = tensor3d(soundInputDesc.dt, 1,
+            tensorNumElements(desc) / soundInputDesc.dims[0], soundInputDesc.dims[0]);
+        std::map<std::string, TensorDesc> inputDescMap;
+        inputDescMap["sounds"] = inputDesc;
+        pipeline->reready(inputDescMap);
+
+        auto modelInputTensorNames = pipeline->get_model_input_tensor_names();
+        std::map<std::string, std::shared_ptr<Tensor>> model_tensors_input;
+        for (int index = 0; index < (int)modelInputTensorNames.size(); index++) {
+            U8 *tensorPointer = (U8 *)((CpuMemory *)(sequence[index].get_memory()))->get_ptr();
+            pipeline->copy_to_named_input(modelInputTensorNames[index], tensorPointer);
+        }
+
+        double timeBegin = ut_time_ms();
+        pipeline->run();
+        double timeEnd = ut_time_ms();
+        totalTime += (timeEnd - timeBegin);
+
+        Tensor output = pipeline->get_tensor_by_name("labels");
+        std::cout << output.string(32) << std::endl;
+        if (resultPaths.size() > sequenceIndex) {
+            F32 *result = (F32 *)((CpuMemory
*)(results[sequenceIndex][0].get_memory()))->get_ptr(); + U32 inferenceSize = output.length(); + for (U32 i = 0; i < results[sequenceIndex][0].length(); i++) { + if (i >= inferenceSize || result[i] != output.element(i)) { + falseResult++; + break; + } + } + } + + sequenceIndex++; + } + + UNI_TIME_STATISTICS + + std::cout << "[SUMMARY]:" << std::endl; + UNI_CI_LOG( + "speech recognition rate: %f %%\n", 100.0 * (sequenceIndex - falseResult) / sequenceIndex); + UNI_CI_LOG("avg_time:%fms/sequence\n", 1.0 * totalTime / sequenceIndex); + + return 0; +} diff --git a/inference/examples/automatic_speech_recognition/audio_feature.cpp b/inference/examples/automatic_speech_recognition/audio_feature.cpp new file mode 100644 index 00000000..becb6410 --- /dev/null +++ b/inference/examples/automatic_speech_recognition/audio_feature.cpp @@ -0,0 +1,2352 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
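The feature extractor added below is a conventional log-mel front end. Spelling out the math it implements (N_FFT, W_LENGTH, FRAME_STEP, _MEL_HIGH_FREQUENCY_Q, and _MEL_BREAK_FREQUENCY_HERTZ are constants from audio_feature.h, which this patch excerpt does not show):

    pre-emphasis:   y[n] = x[n] - 0.97 * x[n-1]
    Hamming window: w[i] = 0.54 - 0.46 * cos(2 * pi * i / normSize),  i = 0 .. W_LENGTH-1
    power spectrum: P[k] = ((Re X[k])^2 + (Im X[k])^2) / N_FFT
    mel scale:      mel(f) = _MEL_HIGH_FREQUENCY_Q * ln(1 + f / _MEL_BREAK_FREQUENCY_HERTZ)

GetLinearToMelMatrix converts the mel-spaced band edges into the usual triangular filters, weight[i][j] = max(0, min(mel[i] - edge[j], edge[j+2] - mel[i])) / (edge[1] - edge[0]); note that GetMelBankForSingleFrame keeps that call commented out and uses a precomputed MEL_WEIGHTS table instead.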
+
+#include <math.h>
+#include "audio_feature.h"
+#include "error.h"
+
+void AudioFeatureExtractor::PreEmphasis(
+    std::vector<short> &signal, short lastPoint, std::vector<float> &output)
+{
+    float PRE_EMPH = 0.97F;
+
+    output.push_back(static_cast<float>(signal[0] - PRE_EMPH * lastPoint));
+
+    for (int i = 1; i < (int)signal.size(); i++) {
+        output.push_back(static_cast<float>(signal[i] - PRE_EMPH * signal[i - 1]));
+    }
+}
+
+void AudioFeatureExtractor::SplitToFrames(
+    std::vector<float> &signal, std::vector<std::vector<float>> &output, int nFrames)
+{
+    auto itr = signal.begin();
+
+    for (int i = 0; i < nFrames; i++) {
+        if ((i * FRAME_STEP + W_LENGTH) < (int)signal.size()) {
+            std::copy(signal.begin() + i * FRAME_STEP, signal.begin() + i * FRAME_STEP + W_LENGTH,
+                output[i].begin());
+        } else {
+            std::copy(signal.begin() + i * FRAME_STEP, signal.end(), output[i].begin());
+        }
+        itr += FRAME_STEP;
+    }
+}
+
+void AudioFeatureExtractor::CentralPadding(std::vector<float> &signal, std::vector<float> &output)
+{
+    int padding_size = (N_FFT - W_LENGTH) / 2;
+    std::vector<float> begin_padding(padding_size, 0);
+    std::vector<float> end_padding(padding_size, 0);
+
+    std::copy(begin_padding.begin(), begin_padding.end(), output.begin());
+    std::copy(signal.begin(), signal.end(), output.begin() + padding_size);
+    std::copy(end_padding.begin(), end_padding.end(), output.begin() + padding_size + W_LENGTH);
+}
+
+std::vector<float> AudioFeatureExtractor::GetHammingWindow(bool periodic)
+{
+    int normSize = W_LENGTH - 1;
+    if (periodic) {
+        normSize = normSize - W_LENGTH % 2 + 1;
+    }
+
+    std::vector<float> factors(W_LENGTH);
+    for (int i = 0; i < W_LENGTH; i++) {
+        float pi = 3.14159f;
+        factors[i] = 0.54F - (0.46F * static_cast<float>(cos((2 * pi * i) / normSize)));
+    }
+
+    return factors;
+}
+
+void AudioFeatureExtractor::AddHammingWindow(std::vector<float> &data)
+{
+    std::vector<float> HAMMING_WINDOW;
+    HAMMING_WINDOW = GetHammingWindow(true);
+
+    for (int i = 0; i < (int)data.size(); i++) {
+        data[i] *= HAMMING_WINDOW[i];
+    }
+}
+
+std::vector<float> AudioFeatureExtractor::ComputePowerSpec(std::vector<float> fft)
+{
+    std::vector<float> powerSpec(N_DIM);
+    for (int i = 0; i < N_DIM; i++) {
+        auto fft_r = static_cast<double>(fft[2 * i]);
+        auto fft_i = static_cast<double>(fft[2 * i + 1]);
+        powerSpec[i] = (fft_r * fft_r + fft_i * fft_i) / N_FFT;
+    }
+    return powerSpec;
+}
+
+double AudioFeatureExtractor::HerzToMel(double herz)
+{
+    return _MEL_HIGH_FREQUENCY_Q * log(1.0 + (herz / _MEL_BREAK_FREQUENCY_HERTZ));
+}
+
+std::vector<double> AudioFeatureExtractor::HerzToMel(std::vector<double> herzVec)
+{
+    std::vector<double> melVec(herzVec.size());
+
+    for (int i = 0; i < (int)herzVec.size(); i++) {
+        melVec[i] = HerzToMel(herzVec[i]);
+    }
+
+    return melVec;
+}
+
+std::vector<double> AudioFeatureExtractor::LineSpace(double lower, double upper, int number)
+{
+    double interval = (upper - lower) / (number - 1);
+    std::vector<double> result(number);
+
+    result[0] = lower;
+    result[number - 1] = upper;
+
+    for (int i = 1; i < number - 1; i++) {
+        result[i] = lower + interval * i;
+    }
+
+    return result;
+}
+
+std::vector<std::vector<float>> AudioFeatureExtractor::GetLinearToMelMatrix()
+{
+    double nyquistHertz = SAMPLE_RATE / 2.0;
+
+    std::vector<double> linearFrequencies = LineSpace(0.0, nyquistHertz, N_DIM);
+    std::vector<double> spectrogramBinsMel = HerzToMel(linearFrequencies);
+    std::vector<double> bandEdgesMel =
+        LineSpace(HerzToMel(LOWER_HERZ_FREQ), HerzToMel(UPPER_HERZ_FREQ), N_FILTERS + 2);
+
+    double bandEdgesMelScale = bandEdgesMel[1] - bandEdgesMel[0];
+
+    std::vector<std::vector<float>> melWeightsMat(
+        spectrogramBinsMel.size(), std::vector<float>(N_FILTERS));
+
+    for (int i = 1; i < (int)spectrogramBinsMel.size(); i++) {
+        for (int j = 0; j < N_FILTERS; j++)
{ + double lowerSlope = spectrogramBinsMel[i] - bandEdgesMel[j]; + double upperSlope = bandEdgesMel[j + 2] - spectrogramBinsMel[i]; + double minSlope = fmin(lowerSlope, upperSlope); + if (minSlope > 0) { + melWeightsMat[i][j] = static_cast(minSlope / bandEdgesMelScale); + } + } + } + + return melWeightsMat; +} + +std::vector AudioFeatureExtractor::GetMelBankForSingleFrame(std::vector frame) +{ + std::vector powerSpec; + std::vector framePadded(N_FFT); + + AddHammingWindow(frame); + CentralPadding(frame, framePadded); + + fftwf_complex *in = static_cast(fftwf_malloc(sizeof(fftwf_complex) * N_FFT)); + fftwf_complex *out = static_cast(fftwf_malloc(sizeof(fftwf_complex) * N_FFT)); + for (int i = 0; i < N_FFT; i++) { + in[i][0] = framePadded[i]; + in[i][1] = 1.0f; + } + + fftwf_plan p = fftwf_plan_dft_1d(N_FFT, in, out, FFTW_FORWARD, FFTW_ESTIMATE); + fftwf_execute(p); + + std::vector specInput(N_FFT * 2); + for (int i = 0; i < N_FFT; i++) { + if (i == 0) { + specInput[2 * i] = out[i][0]; + specInput[2 * i + 1] = 0.0; + } else { + specInput[2 * i] = out[i][0]; + specInput[2 * i + 1] = out[i][1]; + } + } + fftwf_destroy_plan(p); + fftwf_free(in); + fftwf_free(out); + + powerSpec = ComputePowerSpec(specInput); + + // std::vector> MEL_WEIGHTS = GetLinearToMelMatrix(); + std::vector> MEL_WEIGHTS = { + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.028377542974119204, 0.01438900822987297, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.013988534744246243, 0.02877801645974593, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.04236607771836546, 0.00040047348562673776, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.027977069488492455, 0.014789481715499701, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.013588061258619515, 0.029178489945372667, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.04196560423273872, 0.0008009469712534763, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.02757659600286572, 0.015189955201126462, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.013187587772992768, 0.02957896343099936, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.04156513074711206, + 0.0012014204568801378, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.027176122517239023, + 0.015590428686753173, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.012787114287366015, + 0.02997943691662618, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.041164657261485195, 0.0016018939425069506, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.026775649031612274, 0.015990902172379924, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.012386640801739273, 0.030379910402252925, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.040764183775858435, 0.002002367428133761, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.026375175545985473, 0.01639137565800666, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.011986167316112515, 0.030780383887879643, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.04036371029023186, 0.0024028409137602843, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.025974702060358814, 0.016791849143633433, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01158569383048571, 0.03118085737350646, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03996323680460505, 0.0028033143993871023, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.025574228574731996, + 0.017192322629260206, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.011185220344859019, + 0.031581330859133123, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.039562763318978324, + 0.003203787885013913, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.025173755089105205, 0.017592796114886934, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.010784746859232256, 0.031981804344759966, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.03916228983335135, 0.003604261370640695, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.024773281603478636, 0.01799326960051355, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.010384273373605622, 0.03238227783038662, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.03876181634772464, 0.004004734856267508, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.024372808117851818, 0.01832063995035457, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.010202637843519902, 0.031611458930250136, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03855391092566718, 0.0016560068311672965, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.028360663514711087, 0.010670250536719897, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01995565672770752, + 0.017961163884671668, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.013171320975496788, + 0.02369179259268357, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.007852420705033428, + 0.028013154870290456, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.003855252056611007, + 0.031065024343655736, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0010468890270737834, 0.03297666563721646, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.032525795232788056, 0.0006469856915086729, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.03098233138702626, 0.0013817753283573875, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.030265615006260627, 0.001327906355091734, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.030283262261724658, 0.0005751233146929622, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
+ 0.0, 0.0, 0.0008535034173718123, 0.029303698353873656, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0029060864012274584, 0.02658198842031494, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.005464414489148291, 0.02338369067460831, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.008459612705604193, 0.01977563150017821, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.011828568565461231, 0.01581902179659536, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.015513532642574207, 0.01156984666288346, 0.0, 0.0, + 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01946174420293937, 0.007079230618957747, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.02362508042224929, 0.002393779809957561, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0026274615629053957, 0.02289111627252, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.007950754161161056, + 0.01708814892773665, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01335899693463432, + 0.01121670581095207, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.018818283166961453, + 0.005309541497503488, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0006491445198316474, + 0.023045788191944288, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00698437812813675, + 0.01629731656779974, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.013272337673729569, 0.00960843461949587, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.019490308734984828, 0.0030009953187898905, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0037686404167832357, 0.018348084311577542, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.01063669900930533, 0.011118855305415653, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.017361958679361446, 0.0040415268100010915, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.003085905508775607, 0.017977443408692638, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.010326550845234005, 0.010409214612737486, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.017366704187608988, 0.003048616094710451, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.004403341139589429, 0.01570311829619988, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.011844562319580402, 0.007962638905889666, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.01904049783284377, 0.0004729552180659903, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.007269122403884178, 0.011963869707503835, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.014769530008231939, 0.0041880517929891135, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0035523942877343393, 0.015137931707918249, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.011301830634324753, 0.007129837865952539, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0006128389755833674, 0.0175641571067387, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.008560664476431551, 0.009372962620292765, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.016181126760858023, 0.0015116418151702193, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.006474993151394213, 0.010986520260981823, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.014228235878880604, 0.003005647064580796, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0049791027868558045, 0.012034413337757007, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.012825904098576837, + 0.003972035328996462, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.004012254256524645, + 0.012575694033629195, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.011917025434887662, + 0.004466317925452859, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0035183546875407045, + 0.012664884691514762, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01144887940563181, + 0.004539730732766297, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0034456490088684635, + 0.012352278780654393, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
[... interior rows of this constant float matrix elided: every row is zero-filled except
for two adjacent small positive entries (magnitudes roughly 1e-5 to 1.4e-2) whose column
index advances by about one per row, tracing a narrow diagonal band from the upper-left
toward the lower-right of the table; the matrix literal continues below ...]
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.005708801770471052, + 0.0013498006366808207, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0041559597289110885, + 0.002865645171829301, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.002603117687351125, + 0.004381489706977781, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.001050275645791162, + 0.005897334242126261, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.006443682645083771, + 0.0004674935156182875, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.004963954125864155, + 0.0019119665072878423, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.003484225606644538, + 0.0033564394989573972, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0020044970874249215, + 0.004800912490626952, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0005247685682053051, + 0.006245385482296507, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.005847654166832974, + 0.0008883156305972345, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0044375967160492505, + 0.002264777513911948, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0030275392652655262, + 0.0036412393972266614, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0016174818144818024, + 0.005017701280541375, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0002074243636980783, + 0.006394163163856089, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0054506368104109415, 0.0011187042639818625, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.004106970057566763, 0.002430357252403933, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0027633033047225848, 0.0037420102408260033, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.001419636551878406, 0.0050536632292480736, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 7.596979903422734e-05, 0.006365316217670143, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.005231466872755892, 0.0011792274825073057, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.003951064899279318, 0.0024291230174955107, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.002670662925802746, 0.003679018552483716, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0013902609523261731, 0.004928914087471921, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.00010985897884960065, 0.006178809622460126, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.005170621617823595, 0.001088853660626072, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.003950505684901677, 0.0022798995105370408, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0027303897519797603, 0.0034709453604480104, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0015102738190578432, 0.004661991210358979, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0002901578861359261, 0.005853037060269948, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.005250109542583016, 0.0008650585481435862, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.004087441161396524, 0.0020000255728906545, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0029247727802100315, 0.0031349925976377224, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0017621043990235388, 0.004269959622384791, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0005994360178370465, 0.0054049266471318585, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.005453367389203894, 0.0005239257536298926, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.004345441717441285, 
0.001605454350836663, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0032375160456786754, 0.002686982948043433, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.002129590373916066, 0.003768511545250204, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0010216647021534563, 0.004850040142456974, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.005765163510226976, 8.024102741994223e-05, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.004709403059514468, 0.0011108472750655516, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0036536426088019584, 0.002141453522711161, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0025978821580894492, 0.0031720597703567706, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0015421217073769398, 0.00420266601800238, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0004863612566644305, 0.0052332722656479895, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.005165455855213152, 0.0005296622162899674, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0041594044948223075, 0.0015117437261357035, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0031533531344314636, 0.0024938252359814395, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.002147301774040619, 0.0034759067458271756, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0011412504136497745, 0.004457988255672912, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.00013519905325893013, 0.005440069765518648, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.004742198237154558, 0.0008100776535657531, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.003783515480037499, 0.0017459191489808859, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.002824832722920441, 0.002681760644396019, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0018661499658033825, 0.0036176021398111515, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0009074672086863242, 0.004553443635226284, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.005390485695323663, 4.76413405109918e-05}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.004476941253182504, 0.0009394199716741435}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.003563396811041345, 0.0018311986028372953}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0026498523689001867, 0.0027229772340004467}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0017363079267590281, 0.0036147558651635986}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0008227634846178695, 0.004506534496326751}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.005223188431655863}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.004352657026379878}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.003482125621103892}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.002611594215827906}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0017410628105519205},
+        {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0008705314052759349},
+        {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}};
+
+    // Apply the mel filterbank: each column of MEL_WEIGHTS holds the (mostly
+    // zero) weights of one triangular filter; the band energy is log-compressed,
+    // clamped at LOG_EPSILON for near-silent bands.
+    std::vector<float> melSpec(N_FILTERS);
+    for (int j = 0; j < N_FILTERS; j++) {
+        float sum = 0;
+        for (int i = 0; i < N_DIM; i++) {
+            sum += powerSpec[i] * MEL_WEIGHTS[i][j];
+        }
+        melSpec[j] = sum > EPSILON ? static_cast<float>(log(sum)) : LOG_EPSILON;
+    }
+    return melSpec;
+}
+
+std::vector<std::vector<std::vector<float>>> AudioFeatureExtractor::getEncoderInput(
+    std::vector<short> signal, std::vector<short> lastPoints, bool padding)
+{
+    int numCategory = 3;
+    std::vector<std::vector<std::vector<float>>> featureByCategory(numCategory);
+
+    short lastPoint;
+    int nFrames;
+
+    lastPoint = lastPoints.back();
+    if (padding) {
+        nFrames = static_cast<int>((int)signal.size() - 1) / FRAME_STEP + 1;
+    } else {
+        nFrames = static_cast<int>((int)signal.size() - W_LENGTH) / FRAME_STEP + 1;
+    }
+
+    if (nFrames >= 1) {
+        std::vector<float> signalPreEnph;
+
+        // central padding's frames shape
+        std::vector<std::vector<float>> frames(nFrames, std::vector<float>(W_LENGTH));
+
+        PreEmphasis(signal, lastPoint, signalPreEnph);
+        SplitToFrames(signalPreEnph, frames, nFrames);
+
+        // calculate mel_bank_coefficients
+        std::vector<std::vector<float>> melFeatures(nFrames, std::vector<float>(N_DIM));
+        for (int i = 0; i < (int)frames.size(); i++) {
+            melFeatures[i] = GetMelBankForSingleFrame(frames[i]);
+        }
+        featureByCategory[0] = melFeatures;
+    } else {
+        std::vector<std::vector<float>> melFeatures(1, std::vector<float>(N_DIM));
+        featureByCategory[0] = melFeatures;
+    }
+    return featureByCategory;
+}
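+
+// Worked example for getEncoderInput's frame count above (illustrative note,
+// not part of the original sources): with SAMPLE_RATE = 16000, W_LENGTH = 400
+// and FRAME_STEP = 160, one second of audio (16000 samples) yields
+//     nFrames = (16000 - 400) / 160 + 1 = 98    (padding == false)
+//     nFrames = (16000 - 1) / 160 + 1 = 100     (padding == true)
+// i.e. a 25 ms analysis window advancing in 10 ms steps.
+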
+int AudioFeatureExtractor::getWavHead(FILE *file)
+{
+    static char wavHead[100];
+    int headSize = 0;
+    int i = 0;
+
+    fseek(file, 0, SEEK_SET);
+    size_t readSize = fread(wavHead, 1, 100, file);
+    if (readSize != 100) {
+        return -1;
+    }
+
+    // scan the RIFF header for the "data" chunk id; the PCM samples start
+    // 8 bytes (4-byte id + 4-byte chunk size) after its first byte
+    for (i = 0; i < 92; i++) {
+        if (wavHead[i] == 'd' && wavHead[i + 1] == 'a' && wavHead[i + 2] == 't' &&
+            wavHead[i + 3] == 'a') {
+            headSize = i + 8;
+        }
+    }
+    fseek(file, 0, SEEK_SET);
+    return headSize;
+}
+
+std::vector<short> AudioFeatureExtractor::readWav(const std::string &wavName)
+{
+    unsigned int wavSize = 0;
+    size_t readSize = 0;
+
+    FILE *fp = fopen(wavName.c_str(), "rb");
+    if (fp == NULL) {
+        UNI_ERROR_LOG("wav file %s does not exist\n", wavName.c_str());
+        std::vector<short> data;
+        return data;
+    }
+
+    int wavHeadSize = getWavHead(fp);
+    if (wavHeadSize > 4) {
+        int retSek = fseek(fp, wavHeadSize - 4, SEEK_SET);
+        CHECK_REQUIREMENT(retSek == 0);
+
+        readSize = fread(&wavSize, sizeof(int), 1, fp);
+        CHECK_REQUIREMENT(readSize == 1);
+    }
+    std::vector<short> data(wavSize / 2);
+    readSize = fread(data.data(), sizeof(short), wavSize / 2, fp);
+    CHECK_REQUIREMENT(readSize == data.size());
+
+    fclose(fp);
+    return data;
+}
+
+std::vector<std::vector<std::vector<float>>> AudioFeatureExtractor::getEncoderInputFromWav(
+    std::string wavFilePath)
+{
+    std::vector<short> audioRaw = readWav(wavFilePath);
+    std::vector<short> lastPoints(10);
+    std::vector<std::vector<std::vector<float>>> melFea =
+        getEncoderInput(audioRaw, lastPoints, false);
+    return melFea;
+}
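+
+// Usage sketch (hypothetical caller, added for illustration; "utterance.wav"
+// is a placeholder name): category 0 of the result holds one 128-dimensional
+// log-mel vector per 10 ms frame.
+//     std::vector<std::vector<std::vector<float>>> feature =
+//         AudioFeatureExtractor::getEncoderInputFromWav("utterance.wav");
+//     const std::vector<std::vector<float>> &melFrames = feature[0];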
diff --git a/inference/examples/automatic_speech_recognition/audio_feature.h b/inference/examples/automatic_speech_recognition/audio_feature.h
new file mode 100644
index 00000000..04fd48d2
--- /dev/null
+++ b/inference/examples/automatic_speech_recognition/audio_feature.h
@@ -0,0 +1,77 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef AUDIO_FEATURE_H_
+#define AUDIO_FEATURE_H_
+
+#include <stdio.h>
+#include <complex>
+#include <vector>
+
+using cd = std::complex<double>;
+
+class AudioFeatureExtractor {
+public:
+    static std::vector<std::vector<std::vector<float>>> getEncoderInputFromWav(
+        std::string wavFilePath);
+
+    static std::vector<std::vector<std::vector<float>>> getEncoderInput(
+        std::vector<short> signal,
+        std::vector<short> lastPoints,
+        bool padding);  // padding false
+
+private:
+    static constexpr int FRAME_STEP = 160;
+    static constexpr int W_LENGTH = 400;   // window length
+    static constexpr int N_FFT = 512;      // number of FFT points
+    static constexpr int N_DIM = N_FFT / 2 + 1;
+    static constexpr int N_FILTERS = 128;  // N_FILTERS = 41;
+
+    static constexpr int SAMPLE_RATE = 16000;
+    static constexpr double LOWER_HERZ_FREQ = 0;
+    static constexpr double UPPER_HERZ_FREQ = 8000;
+    static constexpr float EPSILON = 2.2204460492503131e-16F;
+    static constexpr float LOG_EPSILON = -36.043653389F;
+
+    // mel(f) = _MEL_HIGH_FREQUENCY_Q * ln(1 + f / _MEL_BREAK_FREQUENCY_HERTZ)
+    static constexpr float _MEL_BREAK_FREQUENCY_HERTZ = 700.0F;
+    static constexpr float _MEL_HIGH_FREQUENCY_Q = 1127.0F;
+
+    static void PreEmphasis(
+        std::vector<short> &signal, short lastPoint, std::vector<float> &output);
+
+    static void SplitToFrames(
+        std::vector<float> &signal, std::vector<std::vector<float>> &output, int nFrames);
+
+    static void CentralPadding(std::vector<float> &signal, std::vector<float> &output);
+
+    static std::vector<float> GetMelBankForSingleFrame(std::vector<float> frame);
+
+    static void AddHammingWindow(std::vector<float> &data);
+
+    static void fft(std::vector<cd> &a, bool invert);
+
+    static std::vector<float> ComputePowerSpec(std::vector<cd> fft);
+
+    static std::vector<float> GetHammingWindow(bool periodic);
+
+    static std::vector<std::vector<float>> GetLinearToMelMatrix();
+
+    static std::vector<double> LineSpace(double lower, double upper, int number);
+
+    static std::vector<double> HerzToMel(std::vector<double> herzVec);
+
+    static double HerzToMel(double herz);
+
+    static int getWavHead(FILE *file);
+
+    static std::vector<short> readWav(const std::string &wavName);
+};
+
+#endif  // AUDIO_FEATURE_H_
diff --git a/inference/examples/automatic_speech_recognition/encoder_flow.prototxt b/inference/examples/automatic_speech_recognition/encoder_flow.prototxt
new file mode 100644
index 00000000..510a5bf4
--- /dev/null
+++ b/inference/examples/automatic_speech_recognition/encoder_flow.prototxt
@@ -0,0 +1,350 @@
+name: "encoder"
+input: "sounds"
+input: "encoder_block0_trunk0_layer0_mem"
+input: "encoder_block0_trunk0_layer1_mem"
+input: "encoder_block1_trunk1_layer0_kmem"
+input: "encoder_block1_trunk1_layer0_vmem"
+input: "encoder_block1_trunk1_layer1_kmem"
+input: "encoder_block1_trunk1_layer1_vmem"
+input: "encoder_block2_trunk0_layer0_mem"
+input: "encoder_block2_trunk0_layer1_mem"
+input: "encoder_block2_trunk1_layer0_kmem"
+input: "encoder_block2_trunk1_layer0_vmem"
+input: "encoder_block2_trunk1_layer1_kmem"
+input: "encoder_block2_trunk1_layer1_vmem"
+input: "encoder_block3_trunk0_layer0_mem"
+input: "encoder_block3_trunk0_layer1_mem"
+input: "encoder_block3_trunk1_layer0_kmem"
+input: "encoder_block3_trunk1_layer0_vmem"
+input: "encoder_block3_trunk1_layer1_kmem"
+input: "encoder_block3_trunk1_layer1_vmem"
+input: "encoder_block3_trunk1_layer2_kmem"
+input: "encoder_block3_trunk1_layer2_vmem"
+input: "encoder_block3_trunk1_layer3_kmem"
+input: "encoder_block3_trunk1_layer3_vmem"
+output: "encoder_block3_transformer_ln"
+output: "encoder_block0_conv0_neg_slice"
+output: "encoder_block0_conv1_neg_slice"
+output: "encoder_block1_transformer_layer0_k_neg_slice"
+output: "encoder_block1_transformer_layer0_v_neg_slice"
+output: "encoder_block1_transformer_layer1_k_neg_slice"
+output: "encoder_block1_transformer_layer1_v_neg_slice"
+output: 
"encoder_block2_conv0_neg_slice" +output: "encoder_block2_conv1_neg_slice" +output: "encoder_block2_transformer_layer0_k_neg_slice" +output: "encoder_block2_transformer_layer0_v_neg_slice" +output: "encoder_block2_transformer_layer1_k_neg_slice" +output: "encoder_block2_transformer_layer1_v_neg_slice" +output: "encoder_block3_conv0_neg_slice" +output: "encoder_block3_conv1_neg_slice" +output: "encoder_block3_transformer_layer0_k_neg_slice" +output: "encoder_block3_transformer_layer0_v_neg_slice" +output: "encoder_block3_transformer_layer1_k_neg_slice" +output: "encoder_block3_transformer_layer1_v_neg_slice" +output: "encoder_block3_transformer_layer2_k_neg_slice" +output: "encoder_block3_transformer_layer2_v_neg_slice" +output: "encoder_block3_transformer_layer3_k_neg_slice" +output: "encoder_block3_transformer_layer3_v_neg_slice" +node { + name: "sounds" + type: "Input" + output: "sounds" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 15 + input_dim: 128 +} +node { + name: "encoder_block0_trunk0_layer0_mem" + type: "Input" + output: "encoder_block0_trunk0_layer0_mem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 2 + input_dim: 128 + input_dim: 1 +} +node { + name: "encoder_block0_trunk0_layer1_mem" + type: "Input" + output: "encoder_block0_trunk0_layer1_mem" + input_type: "FLOAT32" + input_format: "NCHWC8" + input_dim: 1 + input_dim: 32 + input_dim: 1 + input_dim: 64 +} +node { + name: "encoder_block1_trunk1_layer0_kmem" + type: "Input" + output: "encoder_block1_trunk1_layer0_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 5 + input_dim: 6 + input_dim: 64 +} +node { + name: "encoder_block1_trunk1_layer0_vmem" + type: "Input" + output: "encoder_block1_trunk1_layer0_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 5 + input_dim: 6 + input_dim: 64 +} +node { + name: "encoder_block1_trunk1_layer1_kmem" + type: "Input" + output: "encoder_block1_trunk1_layer1_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 7 + input_dim: 6 + input_dim: 64 +} +node { + name: "encoder_block1_trunk1_layer1_vmem" + type: "Input" + output: "encoder_block1_trunk1_layer1_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 7 + input_dim: 6 + input_dim: 64 +} +node { + name: "encoder_block2_trunk0_layer0_mem" + type: "Input" + output: "encoder_block2_trunk0_layer0_mem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 2 + input_dim: 384 +} +node { + name: "encoder_block2_trunk0_layer1_mem" + type: "Input" + output: "encoder_block2_trunk0_layer1_mem" + input_type: "FLOAT32" + input_format: "NCHWC8" + input_dim: 1 + input_dim: 1024 + input_dim: 1 + input_dim: 1 +} +node { + name: "encoder_block2_trunk1_layer0_kmem" + type: "Input" + output: "encoder_block2_trunk1_layer0_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 7 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block2_trunk1_layer0_vmem" + type: "Input" + output: "encoder_block2_trunk1_layer0_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 7 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block2_trunk1_layer1_kmem" + type: "Input" + output: "encoder_block2_trunk1_layer1_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 9 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block2_trunk1_layer1_vmem" + type: "Input" + output: 
"encoder_block2_trunk1_layer1_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 9 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk0_layer0_mem" + type: "Input" + output: "encoder_block3_trunk0_layer0_mem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 2 + input_dim: 512 +} +node { + name: "encoder_block3_trunk0_layer1_mem" + type: "Input" + output: "encoder_block3_trunk0_layer1_mem" + input_type: "FLOAT32" + input_format: "NCHWC8" + input_dim: 1 + input_dim: 1024 + input_dim: 1 + input_dim: 1 +} +node { + name: "encoder_block3_trunk1_layer0_kmem" + type: "Input" + output: "encoder_block3_trunk1_layer0_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 9 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk1_layer0_vmem" + type: "Input" + output: "encoder_block3_trunk1_layer0_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 9 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk1_layer1_kmem" + type: "Input" + output: "encoder_block3_trunk1_layer1_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 15 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk1_layer1_vmem" + type: "Input" + output: "encoder_block3_trunk1_layer1_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 15 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk1_layer2_kmem" + type: "Input" + output: "encoder_block3_trunk1_layer2_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 23 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk1_layer2_vmem" + type: "Input" + output: "encoder_block3_trunk1_layer2_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 23 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk1_layer3_kmem" + type: "Input" + output: "encoder_block3_trunk1_layer3_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 31 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk1_layer3_vmem" + type: "Input" + output: "encoder_block3_trunk1_layer3_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 31 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_inference" + type: "Inference" + input: "sounds" + input: "encoder_block0_trunk0_layer0_mem" + input: "encoder_block0_trunk0_layer1_mem" + input: "encoder_block1_trunk1_layer0_kmem" + input: "encoder_block1_trunk1_layer0_vmem" + input: "encoder_block1_trunk1_layer1_kmem" + input: "encoder_block1_trunk1_layer1_vmem" + input: "encoder_block2_trunk0_layer0_mem" + input: "encoder_block2_trunk0_layer1_mem" + input: "encoder_block2_trunk1_layer0_kmem" + input: "encoder_block2_trunk1_layer0_vmem" + input: "encoder_block2_trunk1_layer1_kmem" + input: "encoder_block2_trunk1_layer1_vmem" + input: "encoder_block3_trunk0_layer0_mem" + input: "encoder_block3_trunk0_layer1_mem" + input: "encoder_block3_trunk1_layer0_kmem" + input: "encoder_block3_trunk1_layer0_vmem" + input: "encoder_block3_trunk1_layer1_kmem" + input: "encoder_block3_trunk1_layer1_vmem" + input: "encoder_block3_trunk1_layer2_kmem" + input: "encoder_block3_trunk1_layer2_vmem" + input: "encoder_block3_trunk1_layer3_kmem" + input: "encoder_block3_trunk1_layer3_vmem" + output: "encoder_block3_transformer_ln" + output: "encoder_block0_conv0_neg_slice" + output: 
"encoder_block0_conv1_neg_slice" + output: "encoder_block1_transformer_layer0_k_neg_slice" + output: "encoder_block1_transformer_layer0_v_neg_slice" + output: "encoder_block1_transformer_layer1_k_neg_slice" + output: "encoder_block1_transformer_layer1_v_neg_slice" + output: "encoder_block2_conv0_neg_slice" + output: "encoder_block2_conv1_neg_slice" + output: "encoder_block2_transformer_layer0_k_neg_slice" + output: "encoder_block2_transformer_layer0_v_neg_slice" + output: "encoder_block2_transformer_layer1_k_neg_slice" + output: "encoder_block2_transformer_layer1_v_neg_slice" + output: "encoder_block3_conv0_neg_slice" + output: "encoder_block3_conv1_neg_slice" + output: "encoder_block3_transformer_layer0_k_neg_slice" + output: "encoder_block3_transformer_layer0_v_neg_slice" + output: "encoder_block3_transformer_layer1_k_neg_slice" + output: "encoder_block3_transformer_layer1_v_neg_slice" + output: "encoder_block3_transformer_layer2_k_neg_slice" + output: "encoder_block3_transformer_layer2_v_neg_slice" + output: "encoder_block3_transformer_layer3_k_neg_slice" + output: "encoder_block3_transformer_layer3_v_neg_slice" + infer_output_size_parameter: "encoderInferOutputSize" + preprocess_parameter: "encoderPreProcess" + inference_parameter: "/data/local/tmp/CI/model_zoo/caffe_models/asr_convolution_transformer_encoder/asr_convolution_transformer_encoder_f32.bolt" +} diff --git a/inference/examples/automatic_speech_recognition/example.wav b/inference/examples/automatic_speech_recognition/example.wav new file mode 100644 index 0000000000000000000000000000000000000000..b6483ae8a05299cd6de0cbbbca0da3ed538f33cc GIT binary patch literal 93646 zcmYJ62b>f|^Y?qgrVAv=f{G}n#~cwAMNw4D5p%|Xm=op)vlvlC5fu?pQB0_hsF*Q} zS;T;fZ+*AC_s@QAZ+CWPy1K%zepTIj9KQd)`_4Q+Fvsk5^uV*uzjT9k z#+X1JH|!xVt~Dk!?aUdcUV7@2a!!+P@rf_{s?X$i>Rs)%msiFVB`-g1 zzo{C7HA`>9jl6tR(mwHR(`J%IeDGRyB6QhrZ(1vOnUzp7XW&0;ky85(Ap$(<^4 z$5funGM*7xOQwzKPwSw(?y6uZGcaUhe8;CSK;iH9Dedk-1rsnSabT-~<^D z+185mctTlzH_NNC%woB`*Ey|`S+vwXWzK<&XiP<<#-2Q*F)!EP6?qv$-{I?wlgXJ} zk436-M|^#m+^wZHCfYTk#IB6$^vhv((hnwmAyS5@wW2CBX&S6~xf7C9D)q-eY9+lj_fIj5bBQ;^Z?MVdOfvM8g+ za<_t9lb3Tqk`A@cI>|lSnoe@WiJbUBy*v{)ZDgdJykbt^0iJ+$+sHfor>-_nW`Ixg zL*$2|sl?aG*~A7mbuvRmTMDO(2qefstNXp!e zjH3AygbHd`ff?jxA_q^$|FL~uu3@J2f@*k1o7zm#6Q15E5@=Q+KFAJ4EEWRSz(SX<$oW*4DhFEU&wFD^P(MD{`t1OLCl}L_YHYBfvc`&twnARK666 z71xLic%Wt$5EL81robFLB`?nnG9Fgbyi4qo=MX1-dkN4I5ptvLgf z!z+*q&VUri#+}s?unU+6aR}$cYYQSZkplPAtf3LWxm*X{;ng4q=%qZdUZNYHEyz*L zjf@Tok}uJK6IsC%B9U3~3Fw4}m*pBf0R-j&^M&DX6>*AI;3I3~JgkdH;ceWT7(tiH z`M3k@09+u8Do-JH)oy46`;k{+qRk@H+JApH$*ZOET5+W;S|E$#=k@Yn){*>fFJrbB zZc<;HEm@3Q#W`RZ(N4Yq3ERsLxYu5O){*!IWm?a}#)>cnkt`D&BFaD+xCr_J>k>iL zP_)k7G|ID@d@Ds#Gy&^_t8zXWOEG~gK~#`0JBU@tHVtB*4)VL5=_$W1TA`#JTdj#tQlXJMc|2R z=yLg4Qo~>TpZl$m&suptSV2BPJ7|e&ijmPL|1@5;I)wXau7hKv88`sGj)#C3NWvAG z;fRN%rXQjK`M@sx1!;M32A?pJ`c#uhhh6bYDz+8!+$1xgLLzTaHE{$KC$6xBW1Q_c_1g6HC~iFhpa0GNWdsAJGAd6X=`EL4;Dm+DWM6PXC)!zb}Yu0m35tsc*L zT9N7uEe=&fh%}0E(}On8x=J8gxzwr+bz%wb+GmQKiY(M~Sfo`pjSTuhpdda&_YFz07#`e;8Ol=?Ylu2H4bh6f z!}W+HP*2$_SWf&B71TcPBQ!=7*NKkRClyIxXWS8VL?dcfyp?%Ty=tCAYxsnAxIh{* zN>$`#K6nk7LCzy4+X-LjY`O@i=^~nHFB*ZvknfS3C?{6IYt7_jCbA`xXe{HA#4p~| z+7~3l;_WIk7_K;u#aGKGWJ|0K8Y`PreyUZa+)74s|8irkeep&{QtdE0zSl;iEJ@W# zj>E#pMty^2+l$r74az~S$c;4gkErKWf6B;^9i&7ztmd`26Jvg9pgqlyE}4BSFDU?Ulj z`GAo8qh80m(Jg%3$q%u^71)bdie+>(22Qo|eZ_W=gDW-nVjcRFaDY`cUEw>*5aBl5 z5xju8Y21Mu8dZ^SD7;TIoZcPIh$Ue9q5s1SX*oVh0ZcH$WEp%uD4NY=zw}lAon=Olx`= z7hPd`{>rl8(9DsJnAYU5FythXVB^FEh|*e5Y2OS_1@nn}^ba1QHy*y`C9r9_QdC8Y zMUF+nwifyMa+FR;lN?(naxL-;NJbRV5!R?d8}uGPEv&<{GA}#_3*ya0JRZzH 
diff --git a/inference/examples/automatic_speech_recognition/flow_asr.cpp b/inference/examples/automatic_speech_recognition/flow_asr.cpp
new file mode 100644
index 00000000..a5b99e18
--- /dev/null
+++ b/inference/examples/automatic_speech_recognition/flow_asr.cpp
@@ -0,0 +1,545 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <atomic>
+#include <map>
+#include <memory>
+#include "task.h"
+#include "flow.h"
+#include "audio_feature.h"
+
+DataType inferencePrecision = DT_F32;
+const int N_FILTERS = 128;
+
+// prediction&joint&pinyin2hanzi
+const int START_TOKEN = 0;
+const int BLANK_TOKEN = 1600;
+
+// pinyin2hanzi
+const int PINYIN_FEATURE_GAP = 2;
+const int PINYIN_BUFFER_SIZE = 32;
+const int PINYIN_BUFFER_VALID_SIZE = 16;
+std::shared_ptr<Tensor> pinyinEmbeddingDict;
+std::atomic<int> pinyinEmbeddingFlag(0);
+
+EE encoderInferOutputSize(std::map<std::string, std::shared_ptr<Tensor>> &inputs,
+    std::shared_ptr<Tensor> &tmp,
+    std::map<std::string, std::shared_ptr<Tensor>> &outputs,
+    std::vector<std::string> parameter = std::vector<std::string>())
+{
+    TensorDesc inputDesc = inputs["sounds"]->get_desc();
+    // the streaming conv memory grows by one step per chunk, never below 2
+    TensorDesc desc = inputs["encoder_block0_trunk0_layer0_mem"]->get_desc();
+    desc.dims[2] = UNI_MAX((int)desc.dims[2] + 1, 2);
+    outputs["encoder_block0_conv0_neg_slice"]->resize(desc);
+    outputs["encoder_block0_conv1_neg_slice"]->resize(
+        inputs["encoder_block0_trunk0_layer1_mem"]->get_desc());
+
+    int block1[2] = {5, 7};
+    for (int i = 0; i < 2; i++) {
+        std::string inputPrefix =
+            std::string("encoder_block1_trunk1_layer") + std::to_string(i) + std::string("_");
+EE encoderInferOutputSize(std::map<std::string, std::shared_ptr<Tensor>> &inputs,
+    std::shared_ptr<Tensor> &tmp,
+    std::map<std::string, std::shared_ptr<Tensor>> &outputs,
+    std::vector<std::string> parameter = std::vector<std::string>())
+{
+    TensorDesc inputDesc = inputs["sounds"]->get_desc();
+    TensorDesc desc = inputs["encoder_block0_trunk0_layer0_mem"]->get_desc();
+    desc.dims[2] = UNI_MAX((int)desc.dims[2] + 1, 2);
+    outputs["encoder_block0_conv0_neg_slice"]->resize(desc);
+    outputs["encoder_block0_conv1_neg_slice"]->resize(
+        inputs["encoder_block0_trunk0_layer1_mem"]->get_desc());
+
+    int block1[2] = {5, 7};
+    for (int i = 0; i < 2; i++) {
+        std::string inputPrefix =
+            std::string("encoder_block1_trunk1_layer") + std::to_string(i) + std::string("_");
+        std::string outputPrefix =
+            std::string("encoder_block1_transformer_layer") + std::to_string(i) + std::string("_");
+        TensorDesc desc = inputs[inputPrefix + "kmem"]->get_desc();
+        desc.dims[2] = UNI_MAX((int)desc.dims[2] + block1[i], block1[i]);
+        outputs[outputPrefix + "k_neg_slice"]->resize(desc);
+        outputs[outputPrefix + "v_neg_slice"]->resize(desc);
+    }
+
+    desc = inputs["encoder_block2_trunk0_layer0_mem"]->get_desc();
+    desc.dims[1] = UNI_MAX((int)desc.dims[1] + 1, 2);
+    outputs["encoder_block2_conv0_neg_slice"]->resize(desc);
+    outputs["encoder_block2_conv1_neg_slice"]->resize(
+        inputs["encoder_block2_trunk0_layer1_mem"]->get_desc());
+    int block2[2] = {7, 9};
+    for (int i = 0; i < 2; i++) {
+        std::string inputPrefix =
+            std::string("encoder_block2_trunk1_layer") + std::to_string(i) + std::string("_");
+        std::string outputPrefix =
+            std::string("encoder_block2_transformer_layer") + std::to_string(i) + std::string("_");
+        TensorDesc desc = inputs[inputPrefix + "kmem"]->get_desc();
+        int adder = 2;
+        if (inputDesc.dims[1] == 15) {
+            adder = 3;
+        } else {
+            if (inputDesc.dims[1] != 8) {
+                UNI_ERROR_LOG("unmatched encoder input\n");
+            }
+        }
+        desc.dims[2] = UNI_MAX((int)desc.dims[2] + adder, block2[i]);
+        outputs[outputPrefix + "k_neg_slice"]->resize(desc);
+        outputs[outputPrefix + "v_neg_slice"]->resize(desc);
+    }
+
+    desc = inputs["encoder_block3_trunk0_layer0_mem"]->get_desc();
+    desc.dims[1] = UNI_MAX((int)desc.dims[1] + 1, 2);
+    outputs["encoder_block3_conv0_neg_slice"]->resize(desc);
+    outputs["encoder_block3_conv1_neg_slice"]->resize(
+        inputs["encoder_block3_trunk0_layer1_mem"]->get_desc());
+    int block3[4] = {9, 15, 23, 31};
+    for (int i = 0; i < 4; i++) {
+        std::string inputPrefix =
+            std::string("encoder_block3_trunk1_layer") + std::to_string(i) + std::string("_");
+        std::string outputPrefix =
+            std::string("encoder_block3_transformer_layer") + std::to_string(i) + std::string("_");
+        TensorDesc desc = inputs[inputPrefix + "kmem"]->get_desc();
+        desc.dims[2] = UNI_MAX((int)desc.dims[2] + 1, block3[i]);
+        outputs[outputPrefix + "k_neg_slice"]->resize(desc);
+        outputs[outputPrefix + "v_neg_slice"]->resize(desc);
+    }
+    outputs["encoder_block3_transformer_ln"]->resize(
+        tensor2df(inferencePrecision, DF_NORMAL, 1, 512));
+    return SUCCESS;
+}
+
+EE predictionInferOutputSize(std::map<std::string, std::shared_ptr<Tensor>> &inputs,
+    std::shared_ptr<Tensor> &tmp,
+    std::map<std::string, std::shared_ptr<Tensor>> &outputs,
+    std::vector<std::string> parameter = std::vector<std::string>())
+{
+    int block3[4] = {3, 5, 7, 9};
+    for (int i = 0; i < 4; i++) {
+        std::string inputPrefix =
+            std::string("prediction_net_layer") + std::to_string(i) + std::string("_");
+        std::string outputPrefix =
+            std::string("prediction_net_layer") + std::to_string(i) + std::string("_");
+        TensorDesc desc = inputs[inputPrefix + "kmem"]->get_desc();
+        desc.dims[2] = UNI_MAX((int)desc.dims[2] + 1, block3[i]);
+        outputs[outputPrefix + "k_neg_slice"]->resize(desc);
+        outputs[outputPrefix + "v_neg_slice"]->resize(desc);
+    }
+    outputs["prediction_net_ln"]->resize(tensor2df(inferencePrecision, DF_NORMAL, 1, 512));
+    return SUCCESS;
+}
+
+EE jointInferOutputSize(std::map<std::string, std::shared_ptr<Tensor>> &inputs,
+    std::shared_ptr<Tensor> &tmp,
+    std::map<std::string, std::shared_ptr<Tensor>> &outputs,
+    std::vector<std::string> parameter = std::vector<std::string>())
+{
+    // outputs["joint_output_fc"]->resize(tensor2df(inferencePrecision, DF_NORMAL, 1, 512));
+    outputs["output_argmax"]->resize(tensor2df(DT_I32, DF_NORMAL, 1, 1));
+    return SUCCESS;
+}
+
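+// The pinyin2hanzi graph scores hanzi candidates for a window of pinyin tokens:
+// for an input window of length dims[0] the output is resized to
+// [1, 1, window, 7160], one row of scores over the 7160-entry hanzi vocabulary
+// per window position.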
+EE pinyin2hanziInferOutputSize(std::map<std::string, std::shared_ptr<Tensor>> &inputs,
+    std::shared_ptr<Tensor> &tmp,
+    std::map<std::string, std::shared_ptr<Tensor>> &outputs,
+    std::vector<std::string> parameter = std::vector<std::string>())
+{
+    TensorDesc desc = inputs["pinyin"]->get_desc();
+    outputs["hanzi_squeeze/Squeeze"]->resize(
+        tensor4df(inferencePrecision, DF_NCHW, 1, 1, desc.dims[0], 7160));
+    return SUCCESS;
+}
+
+EE encoderPreProcess(std::map<std::string, std::shared_ptr<Tensor>> &inputs,
+    std::shared_ptr<Tensor> &tmp,
+    std::map<std::string, std::shared_ptr<Tensor>> &outputs,
+    std::vector<std::string> parameter = std::vector<std::string>())
+{
+    int featureLength = N_FILTERS;
+    // inputs and outputs must not alias the same tensors
+    CHECK_REQUIREMENT(inputs.size() > 0);
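+    // weightA/weightB form one (scale, offset) pair per mel filter bank
+    // (N_FILTERS = 128): the loop below computes out = weightA[j] * in + weightB[j]
+    // for filter j of every frame, i.e. a fixed per-channel normalization of the
+    // acoustic features. The constants are presumably feature statistics baked in
+    // at training time.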
+    std::vector<float> weightA = {0.26793470448235757, 0.2597546401553133, 0.25070439183132637,
+        0.2389518634030468, 0.22591939536296402, 0.21842706422127695, 0.21073101672676822,
+        0.19888634668966934, 0.1934352819534865, 0.19483272371655574, 0.19307169092034548,
+        0.19794880602465662, 0.2041545140444457, 0.20548612384306975, 0.205089112033574,
+        0.202463874511741, 0.1997057297551323, 0.1986376615816107, 0.1953351397506247,
+        0.19526630343057141, 0.19707734328352133, 0.19871668436383344, 0.19880258511761903,
+        0.20143541652121727, 0.2044134862423108, 0.20602641560137125, 0.20564694818486318,
+        0.206515308314549, 0.2092981906166021, 0.2105148453821694, 0.209482433282912,
+        0.21072670095339943, 0.21295487096308688, 0.21402032655941866, 0.21254455731621794,
+        0.21365817460879144, 0.2163171444197802, 0.21766703064503207, 0.21640375119276742,
+        0.2177893882181534, 0.2205046640925341, 0.2218610679573307, 0.22053006469571076,
+        0.22162170408445966, 0.22370872632630542, 0.22537803061334274, 0.22641169891592502,
+        0.2274135200959736, 0.22817822886370503, 0.22850555770692876, 0.22849091616908523,
+        0.22942646398018746, 0.23089530924664364, 0.23176498740499615, 0.23372326568964216,
+        0.23547995759926693, 0.2364584692820128, 0.23713210245263003, 0.2375549912435519,
+        0.23761757113350296, 0.23757638746581106, 0.23820814260735781, 0.2385523824231173,
+        0.23896144410382456, 0.2397607819892432, 0.24065938255474512, 0.2416691468977067,
+        0.24337672078468509, 0.24427940599421233, 0.24517506765424793, 0.24579829824437913,
+        0.24723941129617125, 0.24809058963717726, 0.24874810693293706, 0.248877475370626,
+        0.24951549731479883, 0.24955122418541695, 0.2492060337981675, 0.24902471798206796,
+        0.24888344336656584, 0.24846182447195098, 0.24729274718749017, 0.24639018404388816,
+        0.24659313647419556, 0.24630866444966484, 0.24585278398389177, 0.24605167118751672,
+        0.24594061893719316, 0.24532106768133538, 0.24572437083432735, 0.2459548905112401,
+        0.245982906631063, 0.24652363950502573, 0.24715790835692908, 0.2478608527450776,
+        0.24889337178480928, 0.249329751248172, 0.24960285555075376, 0.24955584458875266,
+        0.2497572027892517, 0.2499798759413889, 0.2500960262323433, 0.2506400682242264,
+        0.2515477086314016, 0.25259227168784903, 0.25364113255322157, 0.25537851424540586,
+        0.2573300627421209, 0.25956427589759357, 0.26117713995761727, 0.2624523374880242,
+        0.2632993514075515, 0.26413640430134505, 0.26511896710476746, 0.2662951418810798,
+        0.26744233631929915, 0.267688136864862, 0.2672668616086788, 0.26649503147446485,
+        0.26594129076005935, 0.2659199727680806, 0.2664476518237045, 0.26695480256723025,
+        0.2678133595844467, 0.2701192220836497, 0.2742489539853769, 0.2798973923783803,
+        0.28540062392560295};
+    std::vector<float> weightB = {4.594726366770656, 4.192752172632116, 3.9776274929119557,
+        3.4349833759246713, 3.0983192175590126, 2.8131751675954018, 2.674216353771496,
+        2.299024401714484, 2.2277405730898843, 2.2079989172157086, 2.2080042633425534,
+        2.239013527979191, 2.41471012643739, 2.405628743225133, 2.45394225056771,
+        2.3372751727216574, 2.3356523900751234, 2.2857494554648192, 2.263597932542921,
+        2.199953784963237, 2.283013730372439, 2.287507759169855, 2.3248724084010197,
+        2.3234718339153364, 2.428010836779634, 2.4391312085381363, 2.4676774757702,
+        2.4445873870383834, 2.5379614937156854, 2.541529720288643, 2.552965909269937,
+        2.528893119611279, 2.609828446143808, 2.611520901760278, 2.6113588465301225,
+        2.5879040353367735, 2.670180890126309, 2.6768002097714785, 2.6745482022603047,
+        2.6589252525406937, 2.7405675184409484, 2.748250039256346, 2.7504889136399346,
+        2.7279897692691324, 2.803509804647416, 2.8033767975633253, 2.81782662029014,
+        2.8398580132615985, 2.8634585052804473, 2.8850252018322435, 2.8939588401492355,
+        2.9149064619044824, 2.938446538597044, 2.9491789310074474, 2.9655894539521057,
+        2.9814448232043804, 2.9946988873469187, 2.9974272291551625, 2.9982878146018908,
+        2.997330908879054, 2.9987101107447867, 2.9833493242668405, 2.9875125168844545,
+        3.0194390288802575, 3.028980829234581, 3.0057895811449447, 3.076450198087296,
+        3.0683058012421935, 3.0938844769593064, 3.11508333263089, 3.121912904965018,
+        3.146879175832384, 3.1768447540457245, 3.1598400327144147, 3.190448649847769,
+        3.1933782870894385, 3.1789337132666655, 3.1801368920926776, 3.1702021059419705,
+        3.1585067337253734, 3.145159095452153, 3.124279154413975, 3.1068527554445096,
+        3.103454244479969, 3.096145034068362, 3.0888735929867055, 3.0728735019732527,
+        3.0772210570154477, 3.0684300226295047, 3.0504857878230385, 3.068488307579292,
+        3.051638660693075, 3.0726374420353735, 3.0707974307243466, 3.088892965875781,
+        3.103242655729246, 3.1090877750810226, 3.112699742574199, 3.111884782449412,
+        3.1145576667173303, 3.1185679471418215, 3.1242895827009405, 3.136642993753398,
+        3.15245492583083, 3.185308230069337, 3.2015540228767803, 3.245292124114324,
+        3.2826235672398743, 3.3220448193935534, 3.3566443133338755, 3.3843542201410166,
+        3.406417064746228, 3.4294187840241075, 3.458963279130731, 3.4864911772857177,
+        3.508984664352243, 3.525467921720016, 3.5317980631290027, 3.5339991083767575,
+        3.5397785467806564, 3.5511168000118016, 3.5702997212991785, 3.6000097146634724,
+        3.6546755683682086, 3.763185000352641, 3.9092252627215855, 4.07891493530088,
+        4.22557473399065};
+    TensorDesc inputDesc = inputs["sounds"]->get_desc();
+    outputs = inputs;
+    outputs["sounds"] = std::shared_ptr<Tensor>(new Tensor());
+    outputs["sounds"]->resize(inputDesc);
+    outputs["sounds"]->alloc();
+    int num = tensorNumElements(inputDesc);
+    int loops = num / featureLength;
+    CHECK_REQUIREMENT(loops * featureLength == num);
+    switch (inferencePrecision) {
+        case DT_F32: {
+            F32 *inPtr = (F32 *)((CpuMemory *)(inputs["sounds"]->get_memory()))->get_ptr();
+            F32 *outPtr = (F32 *)((CpuMemory *)(outputs["sounds"]->get_memory()))->get_ptr();
+            for (int i = 0, index = 0; i < loops; i++) {
+                for (int j = 0; j < featureLength; j++, index++) {
+                    outPtr[index] = weightA[j] * inPtr[index] + weightB[j];
+                }
+            }
+            break;
+        }
+#ifdef _USE_FP16
+        case DT_F16: {
+            F16 *inPtr = (F16 *)((CpuMemory *)(inputs["sounds"]->get_memory()))->get_ptr();
+            F16 *outPtr = (F16 *)((CpuMemory *)(outputs["sounds"]->get_memory()))->get_ptr();
+            for (int i = 0, index = 0; i < loops; i++) {
+                for (int j = 0; j < featureLength; j++, index++) {
+                    outPtr[index] = weightA[j] * inPtr[index] + weightB[j];
+                }
+            }
+            break;
+        }
+#endif
+        default:
+            UNI_ERROR_LOG("unsupported precision type in asr encoder preprocess function\n");
+            break;
+    }
+    return SUCCESS;
+}
+
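+// Read up to `size` bytes from a binary file into `data`; if the file is shorter
+// than requested, the remainder of the buffer is zero-filled.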
+void loadBinary(const std::string fileName, char *data, size_t size)
+{
+    std::ifstream ifs(fileName, std::ifstream::in | std::ifstream::binary | std::ifstream::ate);
+    if (!ifs.good()) {
+        UNI_ERROR_LOG("load binary data from %s failed\n", fileName.c_str());
+    }
+    size_t length = ifs.tellg();
+    ifs.seekg(0, std::ifstream::beg);
+    ifs.read(data, UNI_MIN(length, size));
+    if (length < size) {
+        memset(data + length, 0, size - length);
+    }
+    ifs.close();
+}
+
+EE pinyin2hanziPreProcess(std::map<std::string, std::shared_ptr<Tensor>> &inputs,
+    std::shared_ptr<Tensor> &tmp,
+    std::map<std::string, std::shared_ptr<Tensor>> &outputs,
+    std::vector<std::string> parameter = std::vector<std::string>())
+{
+    int embeddingSize = std::stoi(parameter[3]);
+    if (!atomic_exchange(&pinyinEmbeddingFlag, 1)) {
+        std::string embeddingFile = parameter[1];
+        int classes = std::stoi(parameter[2]);
+        size_t size = sizeof(float) * classes * embeddingSize;
+        pinyinEmbeddingDict =
+            std::shared_ptr<float>(reinterpret_cast<float *>(operator new(size)));
+        loadBinary(embeddingFile, reinterpret_cast<char *>(pinyinEmbeddingDict.get()), size);
+    }
+    TensorDesc inputDesc = inputs["pinyin"]->get_desc();
+    int batch = inputDesc.dims[inputDesc.nDims - 1];
+    int inputSize = tensorNumElements(inputDesc);
+    int inputSizePerBatch = inputSize / batch;
+    unsigned int *inputPtr =
+        (unsigned int *)((CpuMemory *)(inputs["pinyin"]->get_memory()))->get_ptr();
+    std::string name = "lm_in_deploy";
+    outputs[name] = std::shared_ptr<Tensor>(new Tensor());
+    outputs[name]->resize(
+        tensor4df(inferencePrecision, DF_NCHW, 1, embeddingSize, 1, inputDesc.dims[0]));
+    outputs[name]->alloc();
+    float *pinyinEmbeddingDictPtr = pinyinEmbeddingDict.get();
+    switch (inferencePrecision) {
+        case DT_F32: {
+            F32 *outputPtr = (F32 *)((CpuMemory *)(outputs[name]->get_memory()))->get_ptr();
+            for (int i = 0; i < batch; i++) {
+                for (int j = 0; j < inputSizePerBatch; j++) {
+                    int element = inputPtr[i * inputSizePerBatch + j];
+                    for (int k = 0; k < embeddingSize; k++) {
+                        outputPtr[(i * embeddingSize + k) * inputSizePerBatch + j] =
+                            pinyinEmbeddingDictPtr[element * embeddingSize + k];
+                    }
+                }
+            }
+            break;
+        }
+#ifdef _USE_FP16
+        case DT_F16: {
+            F16 *outputPtr = (F16 *)((CpuMemory *)(outputs[name]->get_memory()))->get_ptr();
+            for (int i = 0; i < batch; i++) {
+                for (int j = 0; j < inputSizePerBatch; j++) {
+                    int element = inputPtr[i * inputSizePerBatch + j];
+                    for (int k = 0; k < embeddingSize; k++) {
+                        outputPtr[(i * embeddingSize + k) * inputSizePerBatch + j] =
+                            pinyinEmbeddingDictPtr[element * embeddingSize + k];
+                    }
+                }
+            }
+            break;
+        }
+#endif
+        default:
+            UNI_ERROR_LOG("unsupported precision type in asr pinyin2hanzi preprocess function\n");
+            break;
+    }
+    return SUCCESS;
+}
+
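+// Assemble the encoder's input/output tensor map for one chunk of features. The
+// first chunk consumes 15 frames, each following chunk consumes 8, so chunk t
+// (t > 0) starts at frame 15 + (t - 1) * 8; an empty map is returned once the
+// feature buffer has fewer than frameLength frames left, which ends decoding.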
+std::map<std::string, std::shared_ptr<Tensor>> getEncoderInputOutput(
+    std::vector<std::vector<std::vector<float>>> feature,
+    int frameId,
+    int frameLength,
+    std::map<std::string, std::shared_ptr<Tensor>> cache)
+{
+    std::map<std::string, std::shared_ptr<Tensor>> tensors;
+    int frameOffset = ((frameId > 0) ? 15 : 0) + ((frameId > 0) ? (frameId - 1) : 0) * 8;
+    if (frameOffset + frameLength > static_cast<int>(feature[0].size())) {
+        return tensors;
+    }
+    int featureLength = N_FILTERS;
+    tensors["sounds"] = std::shared_ptr<Tensor>(new Tensor());
+    tensors["sounds"]->resize(
+        tensor3df(inferencePrecision, DF_NCHW, 1, frameLength, featureLength));
+    tensors["sounds"]->alloc();
+    switch (inferencePrecision) {
+        case DT_F32: {
+            F32 *ptr = (F32 *)((CpuMemory *)(tensors["sounds"]->get_memory()))->get_ptr();
+            for (int i = 0; i < frameLength; i++) {
+                memcpy(ptr + i * featureLength, feature[0][i + frameOffset].data(),
+                    featureLength * sizeof(float));
+            }
+            break;
+        }
+#ifdef _USE_FP16
+        case DT_F16: {
+            F16 *ptr = (F16 *)((CpuMemory *)(tensors["sounds"]->get_memory()))->get_ptr();
+            for (int i = 0; i < frameLength; i++) {
+                for (int j = 0; j < featureLength; j++) {
+                    ptr[i * featureLength + j] = feature[0][i + frameOffset][j];
+                }
+            }
+            break;
+        }
+#endif
+        default:
+            UNI_ERROR_LOG("unsupported inference precision for encoder input\n");
+    }
+    std::vector<std::string> outputName = {"encoder_block3_transformer_ln",
+        "encoder_block0_conv0_neg_slice", "encoder_block0_conv1_neg_slice",
+        "encoder_block1_transformer_layer0_k_neg_slice",
+        "encoder_block1_transformer_layer0_v_neg_slice",
+        "encoder_block1_transformer_layer1_k_neg_slice",
+        "encoder_block1_transformer_layer1_v_neg_slice", "encoder_block2_conv0_neg_slice",
+        "encoder_block2_conv1_neg_slice", "encoder_block2_transformer_layer0_k_neg_slice",
+        "encoder_block2_transformer_layer0_v_neg_slice",
+        "encoder_block2_transformer_layer1_k_neg_slice",
+        "encoder_block2_transformer_layer1_v_neg_slice", "encoder_block3_conv0_neg_slice",
+        "encoder_block3_conv1_neg_slice", "encoder_block3_transformer_layer0_k_neg_slice",
+        "encoder_block3_transformer_layer0_v_neg_slice",
+        "encoder_block3_transformer_layer1_k_neg_slice",
+        "encoder_block3_transformer_layer1_v_neg_slice",
+        "encoder_block3_transformer_layer2_k_neg_slice",
+        "encoder_block3_transformer_layer2_v_neg_slice",
+        "encoder_block3_transformer_layer3_k_neg_slice",
+        "encoder_block3_transformer_layer3_v_neg_slice"};
+    for (unsigned int i = 0; i < outputName.size(); i++) {
+        tensors[outputName[i]] = std::shared_ptr<Tensor>(new Tensor());
+    }
+
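+    // Streaming state: for every chunk after the first (frameLength == 8 with a
+    // non-empty cache) the previous step's "*_neg_slice" outputs are wired back
+    // in as this step's "*mem" inputs, carrying the convolution and transformer
+    // key/value caches across chunks; otherwise zero-initialized state tensors
+    // of the expected shapes are created.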
+    if (cache.size() > 0 && frameLength == 8) {
+        tensors["encoder_block0_trunk0_layer0_mem"] = cache["encoder_block0_conv0_neg_slice"];
+        tensors["encoder_block0_trunk0_layer1_mem"] = cache["encoder_block0_conv1_neg_slice"];
+        for (int i = 0; i < 2; i++) {
+            std::string inputPrefix =
+                std::string("encoder_block1_trunk1_layer") + std::to_string(i) + std::string("_");
+            std::string outputPrefix = std::string("encoder_block1_transformer_layer") +
+                std::to_string(i) + std::string("_");
+            tensors[inputPrefix + "kmem"] = cache[outputPrefix + "k_neg_slice"];
+            tensors[inputPrefix + "vmem"] = cache[outputPrefix + "v_neg_slice"];
+        }
+
+        tensors["encoder_block2_trunk0_layer0_mem"] = cache["encoder_block2_conv0_neg_slice"];
+        tensors["encoder_block2_trunk0_layer1_mem"] = cache["encoder_block2_conv1_neg_slice"];
+        for (int i = 0; i < 2; i++) {
+            std::string inputPrefix =
+                std::string("encoder_block2_trunk1_layer") + std::to_string(i) + std::string("_");
+            std::string outputPrefix = std::string("encoder_block2_transformer_layer") +
+                std::to_string(i) + std::string("_");
+            tensors[inputPrefix + "kmem"] = cache[outputPrefix + "k_neg_slice"];
+            tensors[inputPrefix + "vmem"] = cache[outputPrefix + "v_neg_slice"];
+        }
+
+        tensors["encoder_block3_trunk0_layer0_mem"] = cache["encoder_block3_conv0_neg_slice"];
+        tensors["encoder_block3_trunk0_layer1_mem"] = cache["encoder_block3_conv1_neg_slice"];
+        for (int i = 0; i < 4; i++) {
+            std::string inputPrefix =
+                std::string("encoder_block3_trunk1_layer") + std::to_string(i) + std::string("_");
+            std::string outputPrefix = std::string("encoder_block3_transformer_layer") +
+                std::to_string(i) + std::string("_");
+            tensors[inputPrefix + "kmem"] = cache[outputPrefix + "k_neg_slice"];
+            tensors[inputPrefix + "vmem"] = cache[outputPrefix + "v_neg_slice"];
+        }
+    } else {
+        tensors["encoder_block0_trunk0_layer0_mem"] = std::shared_ptr<Tensor>(new Tensor());
+        tensors["encoder_block0_trunk0_layer0_mem"]->resize(
+            tensor4df(inferencePrecision, DF_NCHW, 1, 1, 128, 1));
+        tensors["encoder_block0_trunk0_layer0_mem"]->alloc();
+        tensors["encoder_block0_trunk0_layer1_mem"] = std::shared_ptr<Tensor>(new Tensor());
+        tensors["encoder_block0_trunk0_layer1_mem"]->resize(
+            tensor4df(inferencePrecision, DF_NCHWC8, 1, 32, 1, 64));
+        tensors["encoder_block0_trunk0_layer1_mem"]->alloc();
+
+        for (int i = 0; i < 2; i++) {
+            for (int j = 0; j < 2; j++) {
+                std::string kv = std::string("k");
+                if (j == 0) {
+                    kv = std::string("v");
+                }
+                std::string name = std::string("encoder_block1_trunk1_layer") + std::to_string(i) +
+                    std::string("_") + kv + "mem";
+                tensors[name] = std::shared_ptr<Tensor>(new Tensor());
+                tensors[name]->resize(tensor4df(inferencePrecision, DF_NCHW, 1, 0, 6, 64));
+                tensors[name]->alloc();
+            }
+        }
+
+        tensors["encoder_block2_trunk0_layer0_mem"] = std::shared_ptr<Tensor>(new Tensor());
+        tensors["encoder_block2_trunk0_layer0_mem"]->resize(
+            tensor3df(inferencePrecision, DF_NCHW, 1, 1, 384));
+        tensors["encoder_block2_trunk0_layer0_mem"]->alloc();
+        tensors["encoder_block2_trunk0_layer1_mem"] = std::shared_ptr<Tensor>(new Tensor());
+        tensors["encoder_block2_trunk0_layer1_mem"]->resize(
+            tensor4df(inferencePrecision, DF_NCHWC8, 1, 1024, 1, 1));
+        tensors["encoder_block2_trunk0_layer1_mem"]->alloc();
+        for (int i = 0; i < 2; i++) {
+            for (int j = 0; j < 2; j++) {
+                std::string kv = std::string("k");
+                if (j == 0) {
+                    kv = std::string("v");
+                }
+                std::string name = std::string("encoder_block2_trunk1_layer") + std::to_string(i) +
+                    std::string("_") + kv + "mem";
+                tensors[name] = std::shared_ptr<Tensor>(new Tensor());
+                tensors[name]->resize(tensor4df(inferencePrecision, DF_NCHW, 1, 0, 8, 64));
+                tensors[name]->alloc();
+            }
+        }
+
+        tensors["encoder_block3_trunk0_layer0_mem"] = std::shared_ptr<Tensor>(new Tensor());
+        tensors["encoder_block3_trunk0_layer0_mem"]->resize(
+            tensor3df(inferencePrecision, DF_NCHW, 1, 1, 512));
+        tensors["encoder_block3_trunk0_layer0_mem"]->alloc();
+        tensors["encoder_block3_trunk0_layer1_mem"] = std::shared_ptr<Tensor>(new Tensor());
+        tensors["encoder_block3_trunk0_layer1_mem"]->resize(
+            tensor4df(inferencePrecision, DF_NCHWC8, 1, 1024, 1, 1));
+        tensors["encoder_block3_trunk0_layer1_mem"]->alloc();
+        for (int i = 0; i < 4; i++) {
+            for (int j = 0; j < 2; j++) {
+                std::string kv = std::string("k");
+                if (j == 0) {
+                    kv = std::string("v");
+                }
+                std::string name = std::string("encoder_block3_trunk1_layer") + std::to_string(i) +
+                    std::string("_") + kv + "mem";
+                tensors[name] = std::shared_ptr<Tensor>(new Tensor());
+                tensors[name]->resize(tensor4df(inferencePrecision, DF_NCHW, 1, 0, 8, 64));
+                tensors[name]->alloc();
+            }
+        }
+        for (auto iter : tensors) {
+            if (iter.first != std::string("sounds")) {
+                TensorDesc desc = iter.second->get_desc();
+                U8 *ptr = (U8 *)((CpuMemory *)(iter.second->get_memory()))->get_ptr();
+                memset(ptr, 0, tensorNumBytes(desc));
+            }
+        }
+    }
+    std::shared_ptr<Tensor> tmp;
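+    // Infer the output shapes from the current inputs/state, then allocate them.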
+    encoderInferOutputSize(tensors, tmp, tensors);
+    for (unsigned int i = 0; i < outputName.size(); i++) {
+        tensors[outputName[i]]->alloc();
+    }
+    return tensors;
+}
+
+std::map<std::string, std::shared_ptr<Tensor>> getPredictionInputOutput(
+    std::map<std::string, std::shared_ptr<Tensor>> jointResult,
+    std::map<std::string, std::shared_ptr<Tensor>> cache)
+{
+    std::map<std::string, std::shared_ptr<Tensor>> tensors;
+    if (jointResult.size() == 0) {
+        tensors["label"] = std::shared_ptr<Tensor>(new Tensor());
+        tensors["label"]->resize(tensor2df(DT_U32, DF_NORMAL, 1, 1));
+        tensors["label"]->alloc();
+        U32 *ptr = (U32 *)(((CpuMemory *)(tensors["label"]->get_memory()))->get_ptr());
+        *ptr = START_TOKEN;
+    } else {
+        tensors["label"] = jointResult["output_argmax"];
+    }
+    if (cache.size() > 0) {
+        for (int i = 0; i < 4; i++) {
+            for (int j = 0; j < 2; j++) {
+                std::string kv = std::string("k");
+                if (j == 0) {
+                    kv = std::string("v");
+                }
+                std::string inputName = std::string("prediction_net_layer") + std::to_string(i) +
+                    std::string("_") + kv + "mem";
+                std::string outputName = std::string("prediction_net_layer") + std::to_string(i) +
+                    std::string("_") + kv + "_neg_slice";
+                tensors[inputName] = cache[outputName];
+            }
+        }
+    } else {
+        for (int i = 0; i < 4; i++) {
+            for (int j = 0; j < 2; j++) {
+                std::string kv = std::string("k");
+                if (j == 0) {
+                    kv = std::string("v");
+                }
+                std::string name = std::string("prediction_net_layer") + std::to_string(i) +
+                    std::string("_") + kv + "mem";
+                tensors[name] = std::shared_ptr<Tensor>(new Tensor());
+                tensors[name]->resize(tensor4df(inferencePrecision, DF_NCHW, 1, 0, 8, 64));
+                tensors[name]->alloc();
+            }
+        }
+    }
+    for (int i = 0; i < 4; i++) {
+        for (int j = 0; j < 2; j++) {
+            std::string kv = std::string("k");
+            if (j == 0) {
+                kv = std::string("v");
+            }
+            std::string name = std::string("prediction_net_layer") + std::to_string(i) +
+                std::string("_") + kv + "_neg_slice";
+            tensors[name] = std::shared_ptr<Tensor>(new Tensor());
+        }
+    }
+    tensors["prediction_net_ln"] = std::shared_ptr<Tensor>(new Tensor());
+    std::shared_ptr<Tensor> tmp;
+    predictionInferOutputSize(tensors, tmp, tensors);
+    for (int i = 0; i < 4; i++) {
+        for (int j = 0; j < 2; j++) {
+            std::string kv = std::string("k");
+            if (j == 0) {
+                kv = std::string("v");
+            }
+            std::string name = std::string("prediction_net_layer") + std::to_string(i) +
+                std::string("_") + kv + "_neg_slice";
+            tensors[name]->alloc();
+        }
+    }
+    tensors["prediction_net_ln"]->alloc();
+    return tensors;
+}
+
+std::map<std::string, std::shared_ptr<Tensor>> getJointInputOutput(
+    std::map<std::string, std::shared_ptr<Tensor>> encoder,
+    std::map<std::string, std::shared_ptr<Tensor>> prediction_net)
+{
+    std::map<std::string, std::shared_ptr<Tensor>> tensors;
+    tensors["encoder"] = encoder["encoder_block3_transformer_ln"];
+    tensors["prediction_net"] = prediction_net["prediction_net_ln"];
+    tensors["output_argmax"] = std::shared_ptr<Tensor>(new Tensor());
+    std::shared_ptr<Tensor> tmp;
+    jointInferOutputSize(tensors, tmp, tensors);
+    tensors["output_argmax"]->alloc();
+    return tensors;
+}
+
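+// Keep a sliding window of recent pinyin tokens for the language model: the
+// first PINYIN_BUFFER_VALID_SIZE entries act as a shift register of emitted
+// tokens (shifted left once full), and the tail of the PINYIN_BUFFER_SIZE-wide
+// "pinyin" input stays zero-padded.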
+std::map<std::string, std::shared_ptr<Tensor>> getPinYin2HanZiInputOutput(int frameId,
+    unsigned int *buffer,
+    int bufferLength,
+    int bufferValidSize,
+    std::map<std::string, std::shared_ptr<Tensor>> joint)
+{
+    std::map<std::string, std::shared_ptr<Tensor>> tensors;
+    tensors["pinyin"] = std::shared_ptr<Tensor>(new Tensor());
+    tensors["pinyin"]->resize(tensor2df(DT_U32, DF_NORMAL, 1, bufferLength));
+    tensors["pinyin"]->alloc();
+    if (frameId == 0) {
+        memset(buffer, 0, sizeof(unsigned int) * bufferLength);
+    }
+    int pinyin =
+        *((unsigned int *)((CpuMemory *)(joint["output_argmax"]->get_memory()))->get_ptr()) -
+        PINYIN_FEATURE_GAP;
+    CHECK_REQUIREMENT(pinyin >= 0);
+    if (frameId < bufferValidSize) {
+        buffer[frameId] = pinyin;
+    } else {
+        for (int i = 0; i < bufferValidSize - 1; i++) {
+            buffer[i] = buffer[i + 1];
+        }
+        buffer[bufferValidSize - 1] = pinyin;
+    }
+    unsigned int *ptr =
+        (unsigned int *)((CpuMemory *)(tensors["pinyin"]->get_memory()))->get_ptr();
+    memcpy(ptr, buffer, sizeof(unsigned int) * bufferValidSize);
+    memset(ptr + bufferValidSize, 0, sizeof(unsigned int) * (bufferLength - bufferValidSize));
+
+    tensors["hanzi_squeeze/Squeeze"] = std::shared_ptr<Tensor>(new Tensor());
+    std::shared_ptr<Tensor> tmp;
+    pinyin2hanziInferOutputSize(tensors, tmp, tensors);
+    tensors["hanzi_squeeze/Squeeze"]->alloc();
+    return tensors;
+}
+
+std::vector<std::string> split(const std::string &str, const std::string &sep)
+{
+    std::vector<std::string> vec;
+    if (str.empty()) {
+        return vec;
+    }
+
+    size_t pos1;
+    size_t pos2;
+    pos2 = str.find(sep);
+    pos1 = 0;
+    while (std::string::npos != pos2) {
+        vec.push_back(str.substr(pos1, pos2 - pos1));
+
+        pos1 = pos2 + sep.size();
+        pos2 = str.find(sep, pos1);
+    }
+    if (pos1 != str.length() * sizeof(typename std::string::value_type)) {
+        vec.push_back(str.substr(pos1));
+    }
+
+    return vec;
+}
+
+std::map<std::string, std::vector<std::string>> loadLabels(std::string labelFilePath)
+{
+    std::map<std::string, std::vector<std::string>> labels;
+    std::ifstream infile;
+    infile.open(labelFilePath);
+    if (!infile.is_open()) {
+        return labels;
+    }
+    std::string s;
+    int index = 0;
+    while (getline(infile, s)) {
+        switch (index) {
+            case 0:
+                labels["hanzi"] = split(s, std::string(" "));
+                break;
+            case 1:
+                labels["pinyin"] = split(s, std::string(" "));
+                break;
+            default:
+                UNI_WARNING_LOG("unrecognized label file line %s\n", s.c_str());
+                break;
+        }
+        index++;
+    }
+    infile.close();
+    return labels;
+}
+
+bool jointOutputIsBlank(std::map<std::string, std::shared_ptr<Tensor>> jointResult)
+{
+    if (jointResult.find("output_argmax") == jointResult.end()) {
+        UNI_ERROR_LOG("unrecognized joint result\n");
+    }
+    TensorDesc desc = jointResult["output_argmax"]->get_desc();
+    if (tensorNumElements(desc) != 1) {
+        UNI_ERROR_LOG("unrecognized joint result (output_argmax) tensor\n");
+    }
+    U32 *ptr = (U32 *)((CpuMemory *)(jointResult["output_argmax"]->get_memory()))->get_ptr();
+    bool ret = false;
+    if (*ptr == BLANK_TOKEN) {
+        ret = true;
+    }
+    return ret;
+}
+
+void freshPinYinResult(std::vector<std::pair<int, std::string>> &pinyinResult,
+    std::vector<std::string> pinyinLabels,
+    std::map<std::string, std::shared_ptr<Tensor>> joint,
+    int frameId)
+{
+    int pinyin =
+        *((unsigned int *)(((CpuMemory *)(joint["output_argmax"]->get_memory()))->get_ptr()));
+    pinyinResult.push_back(std::pair<int, std::string>(frameId, pinyinLabels[pinyin]));
+}
+
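+// Refresh the hanzi hypotheses with the newest language-model scores: every
+// emission that is still inside the pinyin window is re-decoded as the argmax
+// over its row of the [window, 7160] score tensor, so earlier characters can be
+// revised as more right context becomes available.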
+void freshHanZiResult(std::vector<std::pair<int, std::string>> &hanziResult,
+    std::vector<std::string> hanziLabels,
+    std::map<std::string, std::shared_ptr<Tensor>> pinyin2hanzi,
+    int frameId)
+{
+    int pinyinBufferIndex = -1;
+    if (frameId < PINYIN_BUFFER_VALID_SIZE) {
+        pinyinBufferIndex = frameId;
+    } else {
+        pinyinBufferIndex = PINYIN_BUFFER_VALID_SIZE - 1;
+    }
+    int pinyin =
+        ((U32 *)(((CpuMemory *)(pinyin2hanzi["pinyin"]->get_memory()))->get_ptr()))[pinyinBufferIndex] +
+        PINYIN_FEATURE_GAP;
+    if (pinyin == BLANK_TOKEN) {
+        return;
+    }
+    hanziResult.push_back(std::pair<int, std::string>(frameId, "init"));
+    std::shared_ptr<Tensor> hanziTensor = pinyin2hanzi["hanzi_squeeze/Squeeze"];
+    TensorDesc hanziTensorDesc = hanziTensor->get_desc();
+    int num = tensorNumElements(hanziTensorDesc);
+    int loops = hanziTensorDesc.dims[1];
+    int slots = hanziTensorDesc.dims[0];
+    int batch = num / loops / slots;
+    CHECK_REQUIREMENT(batch == 1);
+    CHECK_REQUIREMENT(loops == PINYIN_BUFFER_SIZE);
+    for (int i = hanziResult.size() - 1; i >= 0; i--) {
+        std::pair<int, std::string> element = hanziResult[i];
+        int lastFrameId = element.first;
+        if (frameId - lastFrameId < PINYIN_BUFFER_VALID_SIZE) {
+            int lastPinyinBufferIndex = pinyinBufferIndex - (frameId - lastFrameId);
+            int offset = lastPinyinBufferIndex * slots;
+            int maxIndex = offset;
+            for (int j = 0, index = maxIndex; j < slots; j++, index++) {
+                if (hanziTensor->element(maxIndex) < hanziTensor->element(index)) {
+                    maxIndex = index;
+                }
+            }
+            int hanziIndex = maxIndex - offset;
+            hanziResult[i] = std::pair<int, std::string>(lastFrameId, hanziLabels[hanziIndex]);
+        } else {
+            break;
+        }
+    }
+}
+
+int main(int argc, char *argv[])
+{
+    flowRegisterFunction("encoderInferOutputSize", encoderInferOutputSize);
+    flowRegisterFunction("encoderPreProcess", encoderPreProcess);
+    flowRegisterFunction("predictionInferOutputSize", predictionInferOutputSize);
+    flowRegisterFunction("jointInferOutputSize", jointInferOutputSize);
+    flowRegisterFunction("pinyin2hanziInferOutputSize", pinyin2hanziInferOutputSize);
+    flowRegisterFunction("pinyin2hanziPreProcess", pinyin2hanziPreProcess);
+
+    std::string wavFilePath = argv[6];
+    AudioFeatureExtractor audioFeatureExtractor;
+    std::vector<std::vector<std::vector<float>>> feature =
+        audioFeatureExtractor.getEncoderInputFromWav(wavFilePath);
+
+    std::string encoderGraphPath = argv[1];
+    std::string predictionGraphPath = argv[2];
+    std::string jointGraphPath = argv[3];
+    std::string pinyin2hanziGraphPath = argv[4];
+    std::string labelFilePath = argv[5];
+    std::map<std::string, std::vector<std::string>> labels = loadLabels(labelFilePath);
+    std::vector<std::string> graphPath = {
+        encoderGraphPath, predictionGraphPath, jointGraphPath, pinyin2hanziGraphPath};
+    // TODO(some): beam search conflict
+    std::vector<unsigned int> pinyinBuffer(PINYIN_BUFFER_SIZE);
+
+    int threads = 2;
+
+    // TODO(some): beam search conflict
+    int frameId = 0;
+    Flow flowExample;
+    flowExample.init(graphPath, DT_F32, AFFINITY_CPU_HIGH_PERFORMANCE, threads, false);
+    sleep(5);
+
+    std::map<std::string, std::shared_ptr<Tensor>> blankData;
+    std::map<std::string, std::shared_ptr<Tensor>> encoderData =
+        getEncoderInputOutput(feature, frameId, 15, blankData);
+    if (encoderData.size() == 0) {
+        return 0;
+    }
+    Task encoderTask(frameId, encoderGraphPath, encoderData);
+    std::map<std::string, std::shared_ptr<Tensor>> predictionData =
+        getPredictionInputOutput(blankData, blankData);
+    Task predictionTask(frameId, predictionGraphPath, predictionData);
+    double timeStart = ut_time_ms();
+    flowExample.enqueue(encoderTask);
+    flowExample.enqueue(predictionTask);
+    frameId++;
+
+    std::set<int> readyTaskId;
+    std::map<int, Task> encoderResults;
+    std::map<int, Task> predictionResults;
+    std::map<int, Task> jointResults;
+    std::vector<std::pair<int, std::string>> pinyinResult;
+    std::vector<std::pair<int, std::string>> hanziResult;
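+    // Pipelined greedy decoding: encoder chunks and prediction-net steps run as
+    // asynchronous Flow tasks. When the encoder and prediction results with the
+    // same id are both ready they are joined; a non-blank joint argmax emits a
+    // pinyin and advances the prediction net, while a blank re-uses the current
+    // prediction state for the next frame (transducer-style decoding).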
+    while (1) {
+        std::vector<Task> results = flowExample.dequeue();
+        for (unsigned int i = 0; i < results.size(); i++) {
+            std::string graphPath = results[i].graphPath;
+            if (graphPath == encoderGraphPath) {
+                encoderResults[results[i].id] = results[i];
+                readyTaskId.insert(results[i].id);
+            } else if (graphPath == predictionGraphPath) {
+                predictionResults[results[i].id] = results[i];
+                readyTaskId.insert(results[i].id);
+            } else if (graphPath == jointGraphPath) {
+                jointResults[results[i].id] = results[i];
+                // blanks must be skipped here; feeding them back into the
+                // prediction net would hurt the accuracy of the result
+                if (jointOutputIsBlank(results[i].data)) {
+                    Task copyTask(&predictionResults[results[i].id]);
+                    copyTask.id++;
+                    predictionResults[copyTask.id] = copyTask;
+                    readyTaskId.insert(copyTask.id);
+                } else {
+                    std::map<std::string, std::shared_ptr<Tensor>> predictionData =
+                        getPredictionInputOutput(
+                            results[i].data, predictionResults[results[i].id].data);
+                    Task predictionTask(results[i].id + 1, predictionGraphPath, predictionData);
+                    flowExample.enqueue(predictionTask);
+                    freshPinYinResult(
+                        pinyinResult, labels["pinyin"], results[i].data, results[i].id);
+                }
+
+                std::map<std::string, std::shared_ptr<Tensor>> pinyin2hanziData =
+                    getPinYin2HanZiInputOutput(results[i].id, pinyinBuffer.data(),
+                        PINYIN_BUFFER_SIZE, PINYIN_BUFFER_VALID_SIZE, results[i].data);
+                Task pinyin2hanziTask(results[i].id, pinyin2hanziGraphPath, pinyin2hanziData);
+                flowExample.enqueue(pinyin2hanziTask);
+            } else if (graphPath == pinyin2hanziGraphPath) {
+                freshHanZiResult(hanziResult, labels["hanzi"], results[i].data, results[i].id);
+            }
+        }
+        for (std::set<int>::iterator iter = readyTaskId.begin(); iter != readyTaskId.end();) {
+            int item = *iter;
+            if (encoderResults.find(item) != encoderResults.end() &&
+                predictionResults.find(item) != predictionResults.end()) {
+                std::map<std::string, std::shared_ptr<Tensor>> jointData =
+                    getJointInputOutput(encoderResults[item].data, predictionResults[item].data);
+                Task jointTask(item, jointGraphPath, jointData);
+                flowExample.enqueue(jointTask);
+                iter = readyTaskId.erase(iter);
+            } else {
+                iter++;
+            }
+        }
+        if (frameId < 1000 && encoderResults.find(frameId - 1) != encoderResults.end()) {
+            std::map<std::string, std::shared_ptr<Tensor>> encoderData =
+                getEncoderInputOutput(feature, frameId, 8, encoderResults[frameId - 1].data);
+            if (encoderData.size() > 0) {
+                Task encoderTask(frameId, encoderGraphPath, encoderData);
+                frameId++;
+                flowExample.enqueue(encoderTask);
+            }
+        }
+
+        if (flowExample.size() == 0) {
+            break;
+        }
+    }
+    double timeEnd = ut_time_ms();
+    std::string pinyinLine, hanziLine;
+    for (unsigned int i = 0; i < pinyinResult.size(); i++) {
+        pinyinLine += pinyinResult[i].second + " ";
+        hanziLine += hanziResult[i].second;
+    }
+    std::cout << "[PROFILE] flow asr time: " << timeEnd - timeStart << " ms" << std::endl;
+    std::cout << "[RESULT] length: " << pinyinResult.size() << std::endl;
+    std::cout << "[RESULT] pinyin: " << pinyinLine << std::endl;
+    std::cout << "[RESULT] hanzi: " << hanziLine << std::endl;
+    return 0;
+}
diff --git a/inference/examples/automatic_speech_recognition/joint_flow.prototxt b/inference/examples/automatic_speech_recognition/joint_flow.prototxt
new file mode 100644
index 00000000..e13fe196
--- /dev/null
+++ b/inference/examples/automatic_speech_recognition/joint_flow.prototxt
@@ -0,0 +1,33 @@
+name: "joint_flow"
+input: "encoder"
+input: "prediction_net"
+output: "output_argmax"
+node {
+  name: "encoder"
+  type: "Input"
+  output: "encoder"
+  input_type: "FLOAT32"
+  input_format: "MTK"
+  input_dim: 1
+  input_dim: 1
+  input_dim: 512
+}
+node {
+  name: "prediction_net"
+  type: "Input"
+  output: "prediction_net"
+  input_type: "FLOAT32"
+  input_format: "MTK"
+  input_dim: 1
+  input_dim: 1
+  input_dim: 512
+}
+node {
+  name: "joint_inference"
+  type: "Inference"
+  input: "encoder"
+  input: "prediction_net"
+  output: "output_argmax"
+  infer_output_size_parameter: "jointInferOutputSize"
+  inference_parameter: "/data/local/tmp/CI/model_zoo/caffe_models/asr_convolution_transformer_joint_net/asr_convolution_transformer_joint_net_f32.bolt"
+}
diff --git a/inference/examples/automatic_speech_recognition/pinyin2hanzi_flow.prototxt b/inference/examples/automatic_speech_recognition/pinyin2hanzi_flow.prototxt
new file mode 100644
index 00000000..7d8133d7
--- /dev/null
+++ b/inference/examples/automatic_speech_recognition/pinyin2hanzi_flow.prototxt
@@ -0,0 +1,24 @@
+name: "pinyin2hanzi_flow"
+input: "pinyin"
+output: "hanzi_squeeze/Squeeze"
+node {
+  name: "pinyin"
+  type: "Input"
+  output: "pinyin"
+  input_type: "UINT32"
+  input_format: "NORMAL"
+  input_dim: 1
+  input_dim: 32
+}
+node {
+  name: "pinyin2hanzi_inference"
+  type: "Inference"
+  input: "pinyin"
+  output: "hanzi_squeeze/Squeeze"
+  infer_output_size_parameter: "pinyin2hanziInferOutputSize"
+  preprocess_parameter: "pinyin2hanziPreProcess"
+  preprocess_parameter: "/data/local/tmp/CI/test/pinyin_lm_embedding.bin"
+  preprocess_parameter: "1601"
+  preprocess_parameter: "512"
+  inference_parameter: "/data/local/tmp/CI/model_zoo/tflite_models/cnn_pinyin_lm_b7h512e4_cn_en_20200518_cloud_fp32/cnn_pinyin_lm_b7h512e4_cn_en_20200518_cloud_fp32_f32.bolt"
+}
diff --git a/inference/examples/automatic_speech_recognition/pinyin_lm_embedding.bin b/inference/examples/automatic_speech_recognition/pinyin_lm_embedding.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9e1940b1ea22cc6b377ffc7eee86e846b567343a
GIT binary patch
literal 3272704
[base85-encoded binary payload omitted]
zW1cXJ|7TK4wyue$@yi_f!l^af_WDjHGu(uixShg_u7mMx??ocrbhAn%yM-#-9)w+o zzheLPLu6fK8q+Y=q`uEVq}v*dFEnOTC)-xyZvO(czs94CN*qXvOz@Ru0_wzT@CjQ+ zi~rWOz~5m8T;j+j$aK%dL!0He8SLW|1pQcCrUEuUq%<-s6W<^=;cvEi4cMfQF8>W;CzSBMr z{s!m zPd|_6wRflT)K?c-NZVChk(`9yZ7TGWWi2zb`3~ok<$1=kow$4KHn^4&iPbMPXba7z zPuCXXuHanUKe2$m8()lnX3Z2Ss$PKp4So3V=y@V1WJi7G&L-)uK<$ja5zj1T4sTX7 zgMNFyFlH)0@xu(<@*82yk^878-G}{0rt&WXcVL6SQW$w41l7W)a2;_ruUhaz*fAT? zUy=%3ROZYhf4al2;uW~~hz|Oy7{GWBRsM72E!_HVA&-b!OoPLIK)@QoEwDQiukQN? z&Obuw$VHaW5++AiD30I(r3XpIRy!)pF4?)|*Fny2DHW@>1dSImU0RT{W*RNJ6@hz;*W$7>+OVU04wX|l$Yk1d=%VZybn}se zh>?Q@_R=cysBJdHXbP!0`N2Gp+2OmTFL24wUQno?fbUe#@jhc~enP?o`$f~K{5^N* z`EdhR&o{XwL{-|LV z&-PkLilv4?K#UnxGaQ2fgBLQ5ElGTpr5@jKZ2^9YiLv)6>A+`NJFLi8-~M+Z;%VpDp6M49?KDW-4<{uiIAl)aM zdxlz2N0S6`TR;##II@6xLfFa~vZM$$v`Q_1ET%3Qx_Jby_} zGi%ZVbqU)c%R3Qj22G$E&tjQerO?aU`v&_B19^jUnecx13LWujcthelsqd(O2gyIc z{Ny$K=Mjr5YhIA$jUswVUJunGT=??Q9uT-*1*-yN9TM~|;@OR1T<%vCuWvYwFLsK_ z%2)$FKs1*fUljvuZe^1X-tKhPz8!c=G>@kJcHlj=W%%R7QoOQ7NhJNG4362=i{l0> zp>{|e+!&_C_0pG-l&B9NYjc)`jZ@(G;XJ!MQ<=M2ZlY$DRQUTMgs5KXaj6J+zmHp9^?Mo)p*IiKA3SegjPv< za%NsaqAw?tAAj;tR&g;`%#}lVHU$F9$C2rmlKG6FKsamgmpl&`&lZ|(B39i)AoHah z9WXZ-_LTX8vVk*KM_oSGR*vJe3uySXR%9|VSiJv;aDJb$l7`+e<(n*2aijf4o>nKx zbCn;E%Qfm`)C_n2?(u)vIXwa{r)I#wtM>FsTom*?>Bl8^gXvJCeE6oZAG&=4*^P`+ zQJC*~ytluPg$7GvP2o|I^_WN;P|QW6=NiJWrz=GU(mwQ{&weZ(^c^mN1Bjm{NkN${ zd!-~ICr_T_%R1(O-K4p^>C`_~Q5XyoFHeyb|J}h`hv)Iif-`L8>qS^O+@9*)J`2-E zs`1;iM)I9T;dpp+Cf+EfzlV2p_h#ncew7zqm-lILQgm7WqiO{ z_AQ%Jq9LASCm7~4u=HSwcJ0@*?I*BD&n^N*zMR*4Dr%ya>>87vByzKreygPIY zmmjwUn>;M>;Q?Ra-sBD2(lxkO;RmeoX@?zBN9cOJVRRZ5{^nhe@#-ac3=q8b2hUBV z+fIdZoopqvyOe?jTV|3QF|#51XgOWQ_i#sxY%(CKM z$!2-n>U@TTc*XLou0z?|S!r0Z%olcs_(Frb8van#hT&Ep#hpD`)bpky?YcjfclYQ( zMEx9gH#Hqf_AbPAR(r9gCl62eXS2$=apK{!VesAMIBOYs4&zFrP@(@e$cJ5G8!E?; zEhjZlI_wsC_isMBu}G2m6giN%>qcLj_`uyY<6)0V2iThiy5Bb0_AWdkjk(KM2m4iDdbQCb*=hiYw&3 z*~Up@z-m|s1l}W{Hinb<*Kg6{m=jhu|6t?KPGY`BZK7Q-gWW1WQMI0nYTV$aC>V5KCTg$ znrf5JK^{2idp0hbKFvN?AUblFQz8ksM$u#K$+&icANWUX#pc*_M4=bmIVxQ=e6kuH zyW0R$Ckvcd;mj3cG8WFc$FZt=SJ+su2KM9pakkql3;xba5e>0Afo3lY;iLB;{B_YA z0#D9IcbO=V$W4SJ6|$hRNfT#J8w(}@o-o6!5B}U<%Y1HagXL%1S?-5lSTj*y*l(PI zsNcQf(Sl!f_KxN7m=*k|58gt*a>n+n>_zl@@%-X2 zGAT-vjqXt;1vdtucX0-u30;o%$1HJexjY*<{+j4v&I;mU8U_PB2gBrh(e{6Pj*1{z zVCd`}gr%=du-%<@$!Wb1-6h1;?Fr<`5@t#A$J_;o|Q{tYxVwru4Lc>Cj=c zzv!>%vRH*4UeE+zb7g4oN>2<~@_;?wd5`UF5fCFE_ro?xD`>LnBlj%A$YF3Ky9VCH z_R8mC!&X~ZQay&&-7rTR)nH<28V;khM!~AVlVNd(hbU!&jJR;zKz@9-37ob0zk4_f z_YI9Fg?b9mRka!PCryQ*jA+O{m!e1-&3$i*Aw6Bxv~qqoY->4zntqD*yYoL2`PIi^j^#=4?V659EiR+%e+sZ@&=Zt% z9{`e$YOr}}0U4wvOJ=UEM&|)4Tqoa;y*<2(OiiBzpK{A#Ij$C2cDtcr!Ah8T{WL2+ zp-4UdX0vQ-LwsPGC~9A&$@PhSRZLDSm~|P#N~3+aPrDA(j`pDChDlVj;SD;y$%O&8 zPN6aAk`43%#!VeS`tA<}E0Z_Sb$l;qnjb}jpXXtG^;C!qGAELagD_`7j@a+b458cH z!ZNGl#X+S;ytyeBEbFSomU~Nxi}3zrteI=gTdz*P4 z9E1UJA=q0N%4}B*w*M}>78=X)1#fc;S-|Q=CEE_+frgb3a){#oNzcUZmoA2V`>tZ; zr)exq@E9b{t;5a7ju5l8Eo{bYp(n9&0CtBD#7}=C@a))PbgLb~Y6nEHC4;hwxsY=z zUw9027H7k?scqu2+g|9rBnq7t-D63r4MbWp7H&*U!@qCmF+2Y(Tx#-xyc~L2JW!S4 z)ay(fw(}S4GL{ov>k{%fVo6-&nuEJPoW-rn?buyrUzK`%F$3xdl@ZeJe zW`1`cWUAKS;Ta-G3s!@l(}|1zb>=vs6e@P)!_xcl zqPWLq?4Z#vC_A-VbaGcbi`g-YZCj{JRt|JSzp!$0WP&Ax-#ma>{`*1a&vpz8xCIME zYsl1fKd_?15Z4SJ3gU!~ynb^ZzIgA%eJ;)*eNRisodIvqe@qu@o!UfPyc{9w@HH4W ztdhOcomDk2b2k1{`y<{Z%X26zQ0GQ!V)R;a1CmG%lusB-2c{%}-JSW=_>+n{@Ai?OG1|UTe`y z-nuk!&oLUN7sscYcVc7U33Bm9D(04C@y#MnSpO{#j3nz=La#l%zdV|LUF-_wC3&FK z@)Fy>Jcjn^*P(T(56;o|7EMSRNM*|e=H~G?px^u+o-U1KukKzKyC!QuX!vk0xlf9Q z>Yap~)E?MSH{2CKYeeK`NTZ7}t$>{j&Hd(aAix#HZ(!vL^T=QiK87`|y`K&+ipWSfb z?3V|cIm^*k_o3+c_Sx8zZVnr#%tf_q6DqSXm1iB&g7G`I@YiJnxXsRaoOd)r))aHz 
zTw+ZpE%}VA-G}hK@4e^~5(B<7oj`G%G@Tvz8aBI}rdJ#qU>;dcBmF|)gX}IkctafM z2;QAqGb^4~d6n6IRp96Scko*-%XrAU-Dq8K4aQmu?r>dGm=`J~^d3ggT4Bbt<>GL< z!>pH7OMYUuXH595iD%*YHX9!J-yvF=G?8mcOrcLF4CChVqsi?n{iJ$IEtKZy@$Fi6 zI4G|edb4)3B-_zET2>0pb54jVduG#sH|2P=vJ0;N%He@?Z-Z!cN>$~)-DI@RQ*!d~ zbTTh|JHC#cf&J5Lu%mSwZb|p#;`9(0Ug1qg8cfHZB41klP==T1*wDb;&A51TI8~G# z33H}d)AFNgR6OGxBa@CY>BXTKm&FBt*KN4kP=NB8R=oVNI{!8HGWnu*MaXKX@()*H z_~rh?biLysHcB@PJ?ARX{i$bBtJa@}uCibUzs{iYZ)anzjxj8kwW2qE1yh%I(U{PG z6PxmN$QWEl6dr zQ2M+Gm`NSeZ#RPcwkxpaGa$wO$`wP0N5l|@!vJOhe$76EJeAHi4?(hASIqg-T zL;jPe-S5L-$|n=vKS-M2e|ef~Z8XQe7Z!A{;847N?kKLqlXO^P2Pw9jMvv+>KtRuH zfER7JawX->^OGSWr5&Zzrc$Y0FVU~@CmPF}^78MWVVS-=n~9@%O2A>Lz7$0ZyQf0+ zn`BrPpvvbj%*Wr0EJ<{s1Mjf@1;_tV2uWDP%QvggWYZ@Oi4?@$ZjJxYnC!7-Pk}6RLOClM^Zd6S)X@gOX2L% z>$&9-7qmYt%s_g~s8f{&*8qY0FsGe(ln$d-Yo9>skXA_Ov!R=PkD{(W!wA=lY~5X7 z{=VCq_qksX?V90^jp9LceDriaWcLqfH++oS0=;>$);%&n-4`Dm>qonQ$E@hyPWo|U z0DIkWgp9b50ahGHrGBtncRe35O_l#@cE;&W;VidSf~Oh( z5X~qmVi`#i_#t~ZH`ANQqequv#EwX$MsxU+!MBL~k21b4D+P)+Jz_uYrttI4$gYXJ zzE@Wp24Rc8Ek#s z#fAs&QJM<@w==&t|kR=?ms)HM9K3jjUW? znIulTA-KYwX;i-~Z#kjNbzjG_-Ny#;T8|+7b1#%Se$_>g&w}3ndPv647E}rm^vbgD6W~0fqVww8cvj+&c&G zN1g}qz`sYRrDnrN&EExjR$HiFC*rA!V)k8_A1C;x!LtJcXxHsM@YGKOAA8v0-A|{{ zTS|&lxHpQ&%upuXe`}!c$3d!y$s(f>hw!O{1-(1qC)^DB!hRGuq56+jOnjaX4c&{m zoB2*`oVXcW3R_XT^d?D7HWRK9!~>^oc*w|R_{mlA&1@(7LMB4kW&CCzmJi?y`b*%i zj2@5KRYTG(KcZCS88&73aJs&+gXNbCZihpk@bJt`I2L9jU}UUOM|~ofthofgJz{X~ z`e32|&;|k1I*^_pO=Tp`^TJ@tT90_s&uf#(?%a3aVSGr~DJn64muJ}g#*)eBMB;Ay z12n$HUSLFD#&%a{UbndjL$)~Z%8eHMp^hF`)so`+pn_KaEx^_yVJ@?;4Y6=7&t4_V zxBn}^)OkaA^t}?)nr^_it}wudAvQ1-zr!|x1sfNxPp1o6c;|#vTK=OGbLaYSol)=5 zz~32{EXlxycZT!MvM8E(+ye^UZNZQ0^Dy~)09~eV7*-aJL&fkWvDyt&kmp9+X_yrK z=b=Jl`hXu^AI)Wx|HHdENEZCLh8#VPjwo*w^kV+^-Dyc?Ff*<#>MLw;_09L($1rTW2sd~3~2npH9YlpgJ8lb5KY zjd0e_c03A`3dYhUHOBng_Zmn@_vRg?qd9q8iTiUy1-Hdaw3Zyj75klVT9%00JJ%q# zZKdkh7SYEF3#q$JA8T&<0k;$!u9uwAYmzf~rI`|up_jE<+X>RqXGoig7)%AVJ}`UR7Po{H@{VSa2U>__I0 zreo(N;bv1^_$K7k(!6fr`nS?td?K2f>CUIahUwD9WyAQ&b}PQ96kHC8ym2o;W>qf+8|N~ADJk9&e-$@p4&ddU z!K`3GH2rLlLAAc$#T4pA^9R{M(Z)EDrkg%JI^#I~F|init+AomtEzbPYe}x}H-M*0 zFSffD9|hBE)2Z$S7kcP@5tj0W+?&YL_3P8%Zt7sV+x#Q?yWfJVUCod(_%t3Ia2=PZr8CE zN8o-6?Ys^GHu*@;L#nVqe<$1bo*;aeZ>NsUb9qw8S+V_fuH-5CE3|dn29K^^5ps96 z39T=+`KVD4rvKBx1c%);YsnCNrhWqs9=|7^kh$RVmFMXG1O@Odiifgk@8IO)(=;x= zm$)l;g(}4Yu*=d{;6fScHda2c3O18XHod4MeW$`#7|w_6KUd^3R2#<%F`gD&V;QwJ-WchRK_n(Xs7rX_Lbi z-fXiM%5qG(qr($n^ZxaMb;CRAcG6$Cp`ed?**}CYo%~_IL}k2Raaf#l{U4mj`a~mc z^+Cgqub{6|G}Lx)hoA$8DdmDDD#UrflI~3uJ>wk8a;&7ek{7nxWpMo7o?@^1tzbFO znLSgh<%#$2(d2A(9&`PRxHI>Gc&vXXTJ9GnbYEw|QLBdKPOjc<79)rQipN~ZzxdV8;ytD^b9P~GDMp{N6&{TyEL%l#V%C; zayU%e^bZ(4rGK zhFOtMkDY?S$~Cxvb3_>Eg9%ZHv-_pvwbe2x_34Y*ME3U+4A0SdypFxTTI8mstv4sumwy5BY2|jeK_W22w7E! z;LYcB+Aw7|>0FDaSA!bh&du+TH@H8&5BW!3OH}yIeJ8H!vkmN1-@yI3TKL?+fIoZO zmX-avDkP4WDb1d>geeOxQSti?S&4L)n?G=ZH%$E#BHs@?E& z1Mo5X1rm2Jk4-;^!K(%XEU(%lSky;garY?5TC__rN`D0JkHnB>*YWUNrw=~cP=bB! 
zwnKo_w@=+agJtfjIQ>$U#C_<3#Zj8r^~oBz6_5wdk4%OPg@067l*KwbH-JOQQCM+# zBa!P3C|UIsyuOad=CtLoX}XRqdgl-_f4QA9A0fSSxd6B7rFr(d3qq$wwKP)q2kD;b z#v1FK@u!9tJGH(OMkx%2A~R(?{!0;!M`v+))eDHpm;!f45291A*5aurc5tj`C@QNp z$*#)PSZPNUxptF7MK?FxP}!Ylxkb>$HWi$cEOj|=e4s&|h1ffFv*4@ki_5E|Zb$hH z?zZ3}yxRGc^gmA$)U|!x>&Bb`m&Cz1`gIv>`|}%?I;q2g5lgY(_|^2zo2hV)p)4q- zFX=q?5lrhOZ+h($WT{^30J zXAtmBll}xry_TTi{J2?@A1RlK$FowQ;>S0^FCmoG^bdf$ZZvgV+#UL-I^uzZ|LAzn zQn7K-eWCRF2=rAb0Z(b4GP=N=cYfH;sX5_PW4M#fS4tksWkz`6*GRq^uZEUg71?6* zb9j8bl*%gY(IQD1gR~RKGb;!(j~QaphW^qnz7ySiexO`<(Oonz{794U_QE8!J9NF5 z4C%mnzBON)9$Ghv9oiqm_&-;K606(fsGrPRo-Y@_XGz}&Nlikt#P^98ztVoi2KH#P z=9qAt-XgMJBQ$rR(tO2`xq{!+tHyf zC%}8LKHl|-1nub0Q0QTff44ppqf{l|N%;Xu%jtym(wuUO!C5%|B7qOIYqF)a3+|A( zfUY4^I5_GHd@nWPtR9DGYqdAuxafonK( z{p~0nn9`rU%Ub2fa>HoMi*s_r0SR<|X*@UCr;CmOz45f!6!h`-<}p35NZy+`THok{ z!iyYu-)9$Y`=E;Hk>1RU&xzMueBsVS6~1pi2ycJyCG~zxp!~}vGEW=H$E^nP)y9)z zv8FZ#99IAvaXB_MD`WJgdswb@Oy=EH;_LL5=H@f)Azk8RUA55Ru|d0O-laDBi+knj zB{R7E;C)!J`#H`#Gl`G&_(#@e%A}Bu@GtBP`t|X_*vCV7%zyC^Hd^A^9F!OcQS&$i z*V7r*3Nk&D3uBy8XzciC8h`Z+tTgVx|MqCI+VcS3R$|M!ljE>+wFB=3{AC;_g&3A!rd-5+@l;^pi5 zK;z~}oI3L(_0`?V22uBTL0ty99dDH!hrWm*TfmmQX-9}DK5*NI{SqDuIc>JMHl-8i z?Cp!M+CzkkD z=8E?YPZSN`?1JwHl+oi|F-L`8h2z%BoDb*nM{7?bUZHB;x6&Qs4OlzWPOz2o09Vhvh2xQ3A!|e^jrPa_vmwetTIZppTRs>= zltP7+qzanWI)ImX&cS!VA{~pH%2?_HR#NZcnDHSvdncYAt=WYqbXVad>j`-A{2IJH z_aCfUVoVQ48pHKluZ08yN9gkLJmkD`ujJVLOjr>$3L1;daDk5@$0&E2;QT_uPViBc=Yw z`nA+Gb`AB2>5C>-x#GCmyWmWA{PCYGQvfgVLvk-Ul2I_X34YXz{}vyWD6 zN)x3ELx03v9~+F@`8v=55d_W0ejC)2KC?y zTA%z43JxgpMu%U5Ne6RbQ~%C5bWSGi9jeEsrmw`%S4Ci2`wfc4^R%tTlt&Kg0O}tV zxp`+I{(CDq)hUh|o{swoHyqBOz&?hpCYb!Xw6Wl#KYP65@Xn&RPj9muXafESaN zK<=MB+R(BAHjLRRelu3%(C4FhU*aTER$ULvR;u#DOWD|XY9h|~sn22FlgRzvRe8d{ zEqq6P6AVsQVvm?)8sgDPCH=He@3bvNum3<>izSE4E>m9IWwQKP_(d5ly8&^F#?vE1 zP28hrC3th7 z8Jm4hgfYb`Y?xFjbWa~54ysrtUaapxhcnNM%Wu7>g~KvX*YN@Uv>nZJ7M&*Cxp6C`-K#71+kaK4{g$lddJdOl zZG@#!Es*$T8iz(MLeqpV)M>7O{TwIoL`T&MtC7bcy?qW>>q4m#falGj%T7GXkL}e}IjYG@f=Ua13QVu}<-p6U+zJ5IQ*?lmJ z?FMm~={$ddCc@lDbokKQkY_VN&PX=<6_T@Cdl@y%&5qS_@Sz>gaYpgwj$x z@L=FG@i$t<{gM9hr>(Ylc_8HZa*|3kCh_ff1^2X~dCi(0Y4O zSp8?b7(YLk{>5j&|U*mQ--A_&wBtE_OcE|= zn89QJ{eVwC@8R;Y?wD^I%9pLB#J$wMy0Kt1HJN8~_rD&zyWIg&^1EZp<|DFSt5x}* zw4Z!*_d0|=vqr7{^H3?}f*j77u-cF5s5|Nt)NQ&X&ey(6YR>w2-BynuIZ1PG?`=ZL z6ek`yG?6L`h68=m#@+f7yGWXIox8Y!2ZjgHpRv8zt!{uY{$v~_^*_bmF6OYsaOs|Z zVG_icJF|X>0sfbnNouP?u`zD|x{X%lGZDAN<9mEXPtb==mtK?Ksh<1-Ix|V<_Duzq zLe+Z%b_!Ym^Y30GyWTVCVV`}F{zmd;{<6Rsi;W=l+AO+OHw`~+jHX}rl7%B<_lvU{ z-_y06t)zN+g3zmc8+UFFqwx*1IWjkhJvYB2<84bo?m2?flrM^TqunszrzhSq&cqE- z)9`SwC|-G61Hbya&^7y;@O*cuO~&j5Dj`-6yD`^Oa7xE1?U0FVi9e@0Wu0p7og9 zV-=?U97~asBW2J92e{;%FN|}2M|&Mcf_QxbCsY+ubG?zwBJemPz8K3%Zn8c zyTC)+FEHWbZjRNHvCp<>m|d3&r-~MFpJ(oJmtm2(_2X5V@_rrv^7$v|#m8Ym-(`Gr z$aIO_Qo=1wrRDC`g|KnKDPd-=IZsVE3^A|smoi90{ zZ1IXye^6hi2#X~qo$boOSe2(l-cL63;-^&*`eh2M4)4O_+c!yfqCooib}FrXp)0Y3 zwP?x9bo!;e6b~(&M|SyTR4a8mRcGIpjm(qwX>y52{UDfs<;3Cp+(wG7?*N}Sj$wyT z1Ewbmf_%zb_&Pp<#-y3!nq^VA-pU@;CC~lRQeCc3cmscv-MQ^+iKsI6Hf2e3m1~c^ zz~+3RwEs{<%SUGXYk4+>);<=hn~q9;(Ly@Cr4S~k_UCRb3hckzp4a-?N@tW_+|_n0 zXZ*6@yhD9(m9+u8mF2(=X(wLLyp=91495-<^Zas62R^^MMYvVE1G{TAiE)c7!C+XT zp!%en^nY4GJ1b1_#Ysnbl~V|2ep|t{4>e)Y>)z189pLg2pucr_u z*LYE{?L)Eqlvkp6E8?T~Tfjm`nVTjK1fO2{U^+OL_UsO!ozD)48s*#ZHhSa!uiX&Xy7sq+VxNIE4i}d zZ;|>g+g^|?ED%%Ux5#EE)>8eABuKf`6(6@$)85$|an?dZO3Jas86Xo~)czQsbKHWBTQ`IN`G%hdUw0x+#Hi2aWq2g7BOGdW)qFK1t&EswL{gyfSQd*e5)eB7P)?pe&o zG6GR)NEi6;(RO;cHWwoMwu!o3Zwf!Jk7sq|Ntm-wnu#6gPWQ%^!*{&^PTug6uGj72 zZk_9R-m7|O3EzcthD1mm;U4_^U?+B}*W>ZJ+v$hq6Y8|~JX~9NM*cXsCl0KQ#l71G 
zxqrP-A$eVLG0E{e`FxQ$q9(`iM2IuakajVl_j761?zN~`V#Ptn?D(_iTAZfSOiv+& zZ4P$lAy%8Y!?hJsPj3@U+I>=X#$y>wpS52|y6`|qdv#I#wRb0(Owh$IcZ#54e|Pjg zc!+MaErX~Hzo7EVVY#`HG&cxQ!J?;WU@}x48x8cRu~#3ws(VG^DrrbFt)qg!Fh_X$ z)J?SZx<)m^Bgwy|&h^Dlg~K~eL7+`Hbezdhd_9K8n2$!irolqM2QA!~nT*yK577NA z2Z@!`CKd+A2?6`n@$g28gY$Q`5Gua3 zPH{#}Jy;!(LxkQ9`gTZ-kL$bgcDP9En$O7$mD0$r*GBgFuFk6tC*#Yzaa45EmsP?x zi|s*D_U8RURv7FI-6dA9&Z^U-f8LVzp7a;vjw@g%$5MJD{GcyaY#}8CpdmSiqI-{p z;bwO=uJ1IK1@&hQaC#C3VE(FRCJyG&;%ni%*hpce<=A1+aj;&q7Ssk6z{VXW`08y0w_RIC zU7pp#Y83^HtKTOZb3c;4#bb8l4Eg$6G#7B%BieEwT;2J>IOrcsVV+- z%#iNR&!LOeC2*Ir5ARp?g`uU|bRnTiczb^W{Z~+r@y;$ZAc?oH>slMuI?BXkPokeEP2dXL;iHC z9h&ElqyY0M9yVeC_cYi9S0~u8#zKPiD&=tFMF9VNSq7?2y|Koi6b>n!qOl1nd{pXJ zERIx=UTFke$$2ZDju;2cbGl*4B}1NBc@@s}(Gg3h5tM4Lr^Qk}sJ37M>Ca0Ps+0UB zHbnz@839ak?app?;rLtSj(p^dp7_L6pIk~zu%qH(@LHzE+x98ayuth7>hBTg;D1k? zDEa4A7Hp8t`w_0H5B<@9`BVCDo<9b9Z4$?o_F&Jp-8fZS5%YgMsfg}6g{wmn;fh}m zNS-#B+P$V@5%$D8zu%DVh%7kP?!r$b_G4YH7Ek(c5W1$ic3-N5+LPyKK}IXxo-7C(>)Oe@ z%WjHWm4$acY$s#a9=wLtag>w9sdJLJe6NS_(ZY08tr)=g=85Ppu}NlroqUm+9rjCJD7lPuUhjin|7H9R@HJMBqxd1Thsp~(Vdd*}c=r2VGNe;bVccEHL8nn$ zYIm4d`GAI}h(e+FF41zcljLu*s2DK462_402d8FgvvOk!XqIT=*0(BzAJ@U9F`vQK<`l@{E>TNp zCG9GeX5;Ouw7(Pcv0(w2kn2U>TZ_P7Z6?nix&nP#&yuav8Fs!EgjPS_gTL-TI-ge~ zD%^h{+wGfzbJ*-(4=FL}*jI(1|t zIz)<;aI{&}nYfl7ezoFHky5rOViSi?TIc%Wf-407w?*c431LO&`dFO!Byz}rWblXi{=2IbTR$CKEL_njb%L&gxBW#1wjrR&{2jz z4mOKgO>^L`{Yv=VVH+J=mjFBad=P&h6Ct&H1^XG5yM^Z%dYq@g1ruCw zN!>!axveKhW;qDG2UiQ5>tbQ3T@VWvGjNB}PRee)FPINj!FeBaU~SJ%{3G3v1q|ld zd%dxCj30e{)(o#_sKc>W&m}&?4;r4^E;JpGN%`*1d|}Zj3JF%k9^H20jyrJ`R$py7 z=jBj(ek4e6SgOEg2B*LzMS*|jrLm%hD{Q$l9|MQahJNez2yFpRsG(1cX#d`u9`wwh zkE11*<2o(K>4XkfVI%FIdNxSy**R64v zY#r-AP9o2uuP|Y+Ih+@)xVF+2(`(G(%+r(PyJH@2>pKiX?~WGwt0hq8z&<=&oCB?f z{@6ugi~k&N#w!A!$nM_q#tU^X;likJ3>*F#Ubu~-Io(Hd>neBtqd6~@?CQ_J*& zJb364{?LC0)=N37PpHPt`aaSuTM-)rXNk_Kn)J_H%6m4Pq+RQZVODVvrE1$@Y(iJ& zwaMH^;;WUb{E^({YP8~Tq_Bv~glRSp<^M99z%Otd44(E(>~&l6gS_}b|31!S`;E2m z%%&$AWliAA`DcWSsUc#YmI8R77fj~^8e!tdeB2+DD|$Na65aYse4eswnR-YJH!cB= zO4|+(!?G!EP%xQYm1g;tJLI3g#NZC)Ynkj8M7LEUr1;34f2y6gPa)VS_)LWc#gRtUK2Omhad?wH-D?!_4*Y z`Mty^t`{lu(0;Lb-&!%?>NR1_gXxU#Cb4Pb3)lWFtFcvLB;?g9Kr8;H=a-~iT+nBT zJy#<9T-%dIj_EAC+i#8BMN_(SFNSZP2PxMXd2&c5zUbT^b0^Fc(Qh(F4?97H-|mXz zM$2h^XeLaLx=kykbHRnX!?0v@AB>Cd0co$JXw{Hd^3#(+T0}O_UnpRQ!H)zZr_V4z zZwI@68-P7+(^08Jnd^IHQB{W2M{MauYoZD$=tqXwIqZ>`?UF~M`douqO*&ZX@(3QP zroo7mUg)86N<3*TW$-;m;~1}xpd~TuZ+6=+`EDCJ`x;CX4uis$w^fDA` z4dpGTB`4jk%`kcVO!_T2VcNd^(!Hh^-_AKD^#^VEy5t(q?%#p=$2i!S=tVWJtZ>34 ze>^wI2*$3GV?ak;ZvNDp+86$&XFJyuNc?`^fj?=#ds{_iP`ntH<43QZjk&pdgP>Eq zR{Y%23V&Seg*W@gk?+`a{=4}Kyj?X*Xg6C7EBAk+oa#t^KfVh3HJ!l&Yeqr#ps(_- z_tL3+>Bk|3@ zw_?WLjgk}6fgM(8;uMK#-PUylw;cRJm!o^o)1~jA=dUr`Ue*K~AGgtuu}b2sF9)RV zmMKgfq|EOeI-%uOOZ?Fg!NXj>ijQq|u<*GW`s5ymtoD99q-7zUKfeTu#?(`V(KvBH zfeuD*Uy8ebe5KNgPCT%KGxZ;sk8MYHkkfzJRNZt{w0t4?7pJVD{aI3Hx@rn^|rmy zZuU`8r=UjWeeH@kqKD*OuCqrkY2Th_GDe=4y@JYLjul3&KLGCoM{?Y-4Oq2xHCI`= zpnKLYabM(m?xAG}+xoAmkZ&y(XBQMe$}t6TbA=nKqXI5nY{{o{GTaB`Z02bX6XD!E zLkvFL6D_Xx6h|dhu}$B8WIic~S0?2N$91HPhOZ$zmiDEw|J6Z-UJ}Ny>4KUO@lbg9 z66`Xt!1|5Kc&zOR-(S{=7yZ@3HPfWLlGG<%7c>V0ebf0^!Ievcv{piUQq0?fm^2<@qhs;*gk#>s9fx?K``d#~c@DoqJ9*R1$Lb~TXl7shJlh)6}LTKm5B)6MHdKM+nym38QY|Iu$ zg?Z!e8UvpH<04L#_JD6)UrX$gV3}2g8eiYDS1`NRiCw;nR1lR8yGjix;m|OeU~4Aa z9a%^*8?&g=@-L)4Fu?wA%w&DK{e$+>E40ePnGG^3AfRCfy}Ndv#{JIWo7Oq>_r01- z<4_s6PKkv2rDNd2#Z_2ulZeAcOyWunO)$FXj3=UXFl)i5im5y1(dh2SC_2RvI=LN@ zoFps?ZzsdKU`2@(I1+c(|D>!@+r;70p7~JHIp9O9Xlwr~;*?#wTpjO2f453!{j+}j zplB4dJ7(eiZV6)8@oxA?;)|_tw#1l8NNoq)_`2{`?ER~N93H1Zc8nd>%}Iwf1ycp( 
zzTNmNrSn6TUC_V&m@t|A+}{;El}Ekb2PYo5!qdE6&@##e#mlBV?y3eJc=Ut9I}G46 zSHd~X^BD~PvkV#sz7h*e4RGx@JKnmY7bMOv6q=;$x#!ylGuiPJB+Ef zlx0~s@d)Yftb}vd?$ewv;j~`S8ZB4u6qj3egFoLLz*_R$T|3wd`p$n03XSKW!Y+?O z&+I3~W47!dF(=L!ka*VBpF1yGNtQmdaNR2-s+FFhxq|e5)I68<@wh?ugKx{dmsQgI zxn|hA-Ic_XAE|c39SU@AhPxLpK<>>{n0|LJC>+>A?+40gtA08gw$;#|up97aK{dU5 zxC(5IwVBsXz~y-(*kg4&Y&Ez-{~YZRdTbGYUs7SK{4i9O&dXQM%o9$gIx`Vq)**c4} z>*1PV8}tp{M4x*)ka`aXPvk(N!WR5A_BdVJnRd2+Cs*v| z8OUamt17AV0yqvjO{)x}r9S!~zL&X`bvt>}pjJnAUQ_}q0%!VuZ4fK6;9{H%1%BtQcLw0d6Em5>(;hq&2Pun8|=AITle;r3f#~om7M-BL* zSw*8BexivxHqgm~R#Yn8Z{@Qb_X}-5H-RSs~Rxi*MmPWaVb4uf(`<`T)eM0I7 z>t6=jCM9w62}RC`2}N0bDw+Q*r}?@k;C01gp-p2FPgv6cCg*%GKw}U)o0{;2`4*B# z>=9+sY+>&2&Y1ny1+69-@|V$9X`bZ)ieDd%1B~k`nk|oF>CLWOo~(zx=9{Biq9Lw7 z>Hr!mMRE{qIb`rny5txx%|?tN!mJ;rtl3Rk`qS}6OOnvlDg+_<9rQSAfq7+m+|bXT z7JUmwUmGL7ZZwnEJe&qk6J}Co;UN+xkLB4b#_}dRbz$G0wVd7Y3pMrKPm83#Q>D^Q zh)7cgx1~3*Z~l6SdpVqAKS$8yfZ_CR5daW>g1? zPg5?6D?6k?NSF`(C!a5$wZs|&_QheQ5{oOmZi<(0TEW^yOEBt16xgaPi8;{NH_PeThPPmN)Q8s#d!SFiYFI*_g&y4#U`p=}qV6j*)*b4O z|6cS1oxvx-aLgv0etijPd29mn*it$eB}I(7tcDc@9q`Jpy~3@1Nj!CzoGi@N;>rVh zJv%#xH2qSxQd)r`(f(BKpNeYD9t;X=y&;5`Y>`o|&Dw9dwfKMkyh+0p)x_1c`=QgtzExD5_i=14z^ zE(p$lJAr3Q{_UKBfl4TZ=T{$b1;m`9HGsQo&M*VYo4SB&xhgCc{Tv zFu;GWaB2EqA*Ss!RGp5eoXwR&kx?Tk#h-)xBpW`Zy&SI>EfSZV$poRm82e}C;DD~} z)Tg-sbRK8$s8)Ag@xDd&z7pu=gcQ!7mB)vcb--Rxb7=jHTZ$T@sYe;zDqlpAC9x<4`;W#HpOnDUTQ~W1AY6k(WV8Iq^ByF zhJ=tsqznoUN!(u^&)V%hp*FTgER*)e7ZmNldS4aP{L7Ua*dGGf%opOc9utLA2hYIg zAJ6G;O{CCf9?vIoE{KKyT@zdOT^187UxSuI8gGo>4x>Ug!oYST?5o@cb*1Wj?O}I# zThb41m8qdY+6TzYen|<6g{0s0uzNxtJ!w0{` zPEt>D^~eLl6p5*+fAu4o7qr6t0n#Yj{}x5WD~sKe_L6>tluf?>L*}sTk&rNM4{5ch z2M+C&_Rt0m&C3|4&nE@A=r@E5%TxjVQI`7ar%Kba8J>X&(73FkNuf+NKJ*@=$#Nf z=_+Ncw&7#q7*eh9!&Dy=UOh4gpWCj&G%@3>5y!#%j3e-3V+yoRdn@kwbQIF!N3pL} zIfVFi$5+LGUu$j&n`$<3=c(ElHsusKH6Dc#<7aa3(kJq?y6%GVFMVDTmko0cd+=b% z(b<3cI5Mqir{@kSLV5cnVMYH??Ek!ltbZ$lcEcv<@WKcUqEzJh%||FAq^lh0992q>Gm!A1gw2bS?9d*>mv~URO?as9~p*a%s?C}p+nN8+`b6(lJW`LSuP#91wrrw;%&8#)GlOLGoWm%G$HX%+oD zbO{Qi{NapPJ^7kUMZT-%3t6^r#a#Ph@#D{aQ2Or##C{wAF4Y^McaQbrZ9i+#Ft`Ih z)vgqK?~~kb53FI-v6JBGtij0|8^EJmJpWl!0v)st3lmp3;`cw2^K*Br?8Z2WG50Qw zWnumJK-Z1nt(E^@uqx>U8NjoS2jvd`khZ1JW8{goWM ze+JpJU(-FoXw7w)wQMC9H(ijt@2wR4eWtj7i5A;j(H9@@x*^WH5k|M4?}g2%4Xtmk zlfLshxFzP&MvXV(ip5W5|5COH5!tES!EYhjtkmU*!-)cw=Tm{QD%$1?!dWVk{kP$# zSTtxUuh#tyKL;dmw^VDE|JFafjFYC2I`zF5)Z74!mow@1m|9< zd_n1hP%JTH6F<4Y=8KJ#^wk$fw~rG#tUm)9reh&#NC7;&)(YpAF2Z2Rd3s;!l$%zX zKtP9{c&N6BJKqT)lemFGPW3t{S^p9|_KK7&>S0=%)NL_o7XA|^k=uH8I1pJREZ2D> zwlpq<4^xi`6&wFj)u`R#y_M>qZElL)s{Mc+Nr!b8Pv+w;j&BXK=oFU zaDHMR91(YlPRC@(L)0c*i zFx$U_dtS>dVgIPjAk@!gCz~g{Po)U{1w+t1+5i`QGeEoSDzI5IhHN#S z!!I94HvSVWv|03ov`Gczv8gXyad0l~(_pRtb*dV9 z5$baH!1ogloMbv39Ze)}Lj5Y%$e4|MZwom|{dvWuox!AGGqsj<;S)Q$LdStRJjuy` z)p0fy>wxm#X?XnTuMbdwh13iEd zULH4{W_LagFWUCX#z!v3$hyaL+F6~2Gb4G)JOyl?c}Qk`TNxcABhJCghQrY#={6$P=oIPxIQh zBiJ=GQn=bF9L28*q;t{&Hx?eDWH4l-_A)_!z8YpHC_~7>MD}cWO1X*-ta}>y)R0`b z5;hAAA|3GJ$sh7zt*d!!YXfw+wtVf%@qrxrNm|L8ZPE zPf)Rk4im#6f2R#L+qsd>&lV`(pa?P3t+Bz^4SZMcrW3u}NTHiIoBM2q2{G?QUukc; zq|yXqu3v)hg930>ad)m=d&P#Nw4<9Bmx}%jaFE-~*c=V}OYmJ7*a!cI+QPreG185o*dpW3*Nu!|X9w!5RIV5}-%KQ#$Kb1a33ccw~Pc?z8lROuxp6 z*I12)>iQTyni|E_5JY*u6@=hy4=%m52s#;Wf*tD7LUNfMm%TS3mtF5k^|T4^(GFy% zElxan$YL1kmjaVN52168&tQ7TGAce*0)4zTa@yUq)Oo{lOdE1e=yxTCTn^9RO;dEG zOvw`XdwnBs`gRq>(2p?s{9iEH(vzK%j)*4SqxgEeD(Wum&s)wXO1TGP-mPbd7N`G% z*!eHuM5`K3E%OkIwkOkh)g=7d>W--Uhd!9eLG8v)+B@xyU{XS~-{J|(^ZgCBwGl{@ z!;wC%;+ms|c;;pTCI2acns!Gn=yrx=WmXtlHI04e*>SYaWs?6=gU;C-X=czxDD#jU zGnz`|x2p$y8{Y|*YIfy`r((oboowE^`YZ$;QRC+t`-R(U2Cz786UtkTQTogf_qn^$ 
zx!U6(^s+o6&-Hl0r;V<{^JGiDUDcC?@J0Cd#u-?es{jvwJL9yQR&+1$v3#Aw6_Abf z!~z>_etq%-)!W5`Pta)mZ}nF2zOe;-*5!z4CNq&`HRLhR9%G*=W6=VHehXB^>=td@ ze(n$*(K|>JvWByR#d&zOO`mH&s-faJ6T;u-{3T`!4z!tzC$2P#x>{w>JGL`ld+v

cswF)rbW;e$zIh6T zp>?G7dlaW`%ckw!N?=XIA)1=EjC`u4*I8XHUb-_D{Ukp7u%b|OO+F}WPCqMg{64@) z+sU*ncoQgp*2d14Cg8V(&(v};A2J+)R(@M2E{^e|lV2yv-%Nf2MQcyWgaU0`3h$}J zY#zGwIn(_h!ytjH2+1}RVI_GS1UapBc^}_!sI`4p--ZzXl zwf7!WN{Y78c+dTyNm-%2@QthnWhJyV5QQ{_$|$9h=sowNLDxxw_3n}ZI$*Qzi{JhEs6b2GNBgBBIT9^Pe ztCCndW*Tv=vm~on2_o8MLU~`VK*NOqDv~2h*Bw2Hzr*5i{7fpeEc}iiWADL%P*-qP zZ^sR9=CKX)<8bleXPECGOLt^Ehxmq3=4OsObF%Fzx;M2l4}yz;dhp?;(lqK`l#F^? zgW&6@0CcB(Sd`{KzbPNX(|;wXHP0S4Ep_JAzJ9{!Cp7acCJHbySP8p49iZG#7xz`w zq0@z8IJd!;oHE*tA#U7xG^~U-H8~f9`B~gJqYJkN#N+KQkNV+-DyTZ;D~weg!VX0N zw14e`31-TWm(4P3TXUgvqci8B9cSb=#lg#@YS^>=8)I{AI_#_wCVY=GJPB)O#&?Y! z>vVnzmTUcC9*c4rYAY9_S|dz1*wjGmLRnbo)x$5Uo6Kbk)`7V{H|sv##*2wKL_dVi zr{Aw%;Ti4h!5hm@VGZ}craKbFj`-)}zc=e~hh`)u-jU^{{<5LZttPY23VoA$?;sG^HFIm1O2u~!tLA_sAw%s26@|=qEjc~`2JJukBlCk_@qIOH_*aV z7WCTkJfrdV@fvJ@sEul=T!L=dH~#SB)4VdlBAl$xv7k6Uf!lK}dShrNdC~nIBC>U9 z{RJa7+8IFm**(m7w27%ym;o+(jMynN`HV=ZDluI-%xHbN1?>L)?1G0|(5c57HJW;Q1b3wegMa){JZ4)5k#Bq0irY8YM(2DGtmRyagA-xF`Fb{R!9`rPZ4wwr--hL~ z94aN~0M2Uh#)}W?AZdaWouPdK-p0zHD$yZ(&5$Ql?S|bJ@pR7OBh0Jw6R6h9-#G7Y zKCG9Uj7kri(Uo1pyx%&)*d1)g$FGyvH&Jg-MKgt z)B^LjU*Pg)cJQ3rHQ5)Rg>2*;mpQvI`s-9M{w@gKA2(9}mv`C6l`V{RY8dK^DAQka zkK;5yEh72x7X;5f2mwRZq|@98?v>WE`!fWY`TGZ9>+koh>*F1;DOlQWDzzdSTN7cM zlQSL~&P1mqJG#Ok2Pdxz!hnC};J?-j)XNWG&W2XD=+;5_^=vh^+f?H%g-Tf6Cq<8+ zDTUtk@}$-)9>N_L@h*$Z+T^zbJ~0Qt|bNy&tgl%im-5c}09s zxC3({JD6oIy2xt2V?TLDkqbueK}l5u6yCT~+ePK99=A96@gpD3A1(s-|2Q|^QZ0D* zY!lx0ZDM+khJq(v11||{_HsiK44glWynJK2uR{fAzw~DtcBF%C@Fddo=siCDpu|-1 zEWrL>F=G2<+G%nN=iNHO^=-Ks-{;x$ktaq+C->ipjBoQf-UeF5@D0Q57q1Ik_lvzyi^-KEcQ zjk5WMoojH)^(1UeJ_CfyV0LdcB0T&5$fmktbn?21b;sYZL9Q2B^>kVCB_o#T-%Dk0 zC|!V~AEQ9ETm+6i9)RV4^yt#VM_Jj*yX?e|49vP~K&62eA9XwJEe*}seKouxQw0Co;h?{QatRQy#vGX44d6>fzhkWfY>l$s_><(Ugg0| zvW^~RZHAtpb-FHzlhp>Zzy2gJ@DQ{)*WyCXTV?m1vIa*KX&nD6+tGrI?w~Z?lbMC- z2Ax>OdkoFIX69zQIep;s5FI_o`JrLmSbR_hWliMiV~u=nuNp!3I>)l>H@;!+u1-L! zPBED6Y(P`S^4YU<-H5l-RtAp0LY)t?(3@aKRwr7~%yLoMev{ijuI8BHSp*|w3ow0A zIVioD&be=QVWvt6j0=6jAc=A|rOXdQ7B{eJt7Z{BIU~$9l40e_C($&cB=ogkLYwax z)4_KSAp5x@Q+xCb_)AQrl;cHS{Myg{G7RAYdEz8Y>MmBeeqk(x{v%cV{lw`(83#)& zfj@%^bkUp%SnQz$iy}Uw(Ijp7_v8+U^%=1XO`owX*~KKl#vN`d6Li1(g59O-LyGc@ zsoAzS%+;8w#5;HzXq|Y18dlrUK|YK>bNVpnGcITOjsP>#*=635l-nz@X(cth2{;%j#a<6M1wQ}0Vz8RD!VFq{k z9r>C#ntq%n?Bx8j%QnLH?>WrDGF^i0ONhE)6xUUdr*X@@=!T*MjRiOms`_`OCVG8+(2i*|C~-M${wjo={DqW;onkyccvGvc38*_cjt*yaqS1f< zQS!AOOt?^b#rePR`fDK;%rPR*YB{$3l3vz$T$qM$Sk5ZU`~elw4{*JE1H51b}Y-qC;gO*5+mE2JSw=V+c=;ZnY zi>sjj$urPU*Mwt6Ma-JCwYbh@3Z3y=glewTW5+hjkcJKmBAhUh9FgS1zF~3B*O>$T zn@&=L=6Livw3@8o`Y6?07WhiV7nmOC$SfF3BNGx=LDDmCc)HMxgj9OsUhy-$nN?CW zaU_FS>1Hy&?Ej$SXE~DnDjGwMi_&lRW$97XbatYW3k@Gr1JP(hnmcnnGY@+)=)dPp9cUQM9ZvE!I@ z$B^>RYSXlj)leCvO2g+)$1OiNmwf$9)gfK!c-?@h*|W&f$A;aL#||<=C55elTCJ-g5H+TpllbHzmR>#URl2Zf=AUr zaON&pSLH%BtXhW8Ew+<=oa2m|=fTY*b||xVKO9{j!rq9L2AMo_^vh#me98(mk2ONi zuw0^e`aCM|9kB@Jkm6s9N%wvg)^Fet{*o_)`?-pw?Oh>gww9yCTu9n zOe&QKm2?}zCyp0j?CA;gUsJ_wnMdf=|CDI>_Y)v=Z!NRiXbJ5Y?5jowrLt?|3}g_4(4w#3S_D#6mQ1wqw0# z=8(Db6IuS3GThzU3>%mwu)=FAnO5h+7@Ug4y&idt(vTLNxnwgNNCVJ`+X1T>F2tTj z5#r1WQladd^@1ChQx97Q{4w;GaX0COUk;1ur@Agy|Ij0RC7g!^r*vuRofX6;l+TOa zBu2(^=98Sci^!^2S=#M5%LP0 zGFM&6RjzaS$}SZeia92f_#KSQ5+}abGhp@75B1*-CKBPI3V#2LPwXy!C(sAwFzF6a|=FB zTeyOLk{N@vGmFXmTlP5VPA0rKC_@F~onVIV1-^u39mjfE4J$KGv&(K%h++=W#0xfb z*kKDDo3@m$SKY<=7)3DKqKs*tQh@oB9MBI1?;ZROu;uPahoND1DwDczCVA>R09Tl! 
zeB2hxa}~YKh|L#AjidKroydCPzUMWHdSr0?nsb~NU6ic*{tM30UVNRio0n;#OE~B{ zO#fg{Uc^U{?%i=j(sCd9!R4*?#k^*)tqRio5}0pNeeke*6fVu&%XHXX<6o@&35|tU zF~eUR&IUCBCk%u0s+#CMpii@IG{DElEZn?Wg@?>XurJJ!j;@$Y%O2i_*?oazM|L#C z8U|uZBM;NLU6ZnrFkPzOkB*6HERk7Di=H?UmwW$U{h&BmFlIpHKl|Y3-vp#=zq7ka z_?T~gm9;t31>@fTxOY>8hPru7ZMY287>pqQoj=Xq{_OwqR!`ucG39R@At)kGK4@4H3w`d$nraIVNKEUJ~J=%}kYli!B?i=vVa#bm&(h2>slS zeJ2&kaHbQz`ALq*t;@uIVL@hhbsQ-?tV*jNc|z;@HheO9D)nxD1+KFD=zZTMkZ!q^ z7zma!+y2cb-DpjJX+C0eYO~4l*wbM6?hP7U>16^Udhx5W6`902Ay;nFfNu{XKz&a? zt8{7*w03EdM9-r%@Pq<6e3HwfSU2$=+?-67Xvg5xE5>wb@O1qBFBJL1znHbx?BMfkI<5UuN-f_Rf(&SD+ZDZ&7AWuCk=-mQDv#&m6qftB&4h zYEez<9+ZAK0a-8VVV<8ojf*lPDZa8aa?5PmptCK(#_YYeZ9$CE5xw>Qgg2+U-DlZPQ_FD3g__-Ud59`=R8GO7wp5fjQKsMpNzu!teWB zzkY@goZ(6G7;`<67>eweC6^nM=zxv*g!L6lCtp|RqstdN{?R*ANw32Q@5g`uH9uAc z5jG~|cwz@T{|fhvGSgzVikjix*8%kB!(SMbtA@{qM0n0z?mI7aIlVNqffvoazxlJh z?EhU4xm{6o+>*Qddq|Vu+$^Y;+($1vl)}>tWxDop4q7%}hMukBw3qXFS-<%V@1Hc2 zs#<$iZc!D4w}{b>1~2+Kb&y~EaXPe~b|k-T{fYg&ee9`$-OOPwODC)51X31H+1ZPd z@JysI8G2z!(}YIAop}sy`SRyDYHvjxJS;{n@GM3pMpAxGVqt>ve_+yw8!EaGiOy1RjC|@yGsgi!CM~e z7I_cECzH%g@xjF6E&M-*%5Z)A7I?(*MLvTRyq~W|PubUCY_dI(86|YsXFJiZ>_NZ6 zon(=}COxQRMr)@dWjo=F(?lk?%eSxr9gUQ4#L>+wp(W>L+#N?dm!g+^R#2W6u``qSYC=sYZi zJ8N#@QWrh)_}oNNB`V9$-DE`1U)@K&#w@7$d|7gyb6vdQxxl=_WYUsiPSjSf;5GnJcmA0+)Fdo?xep$f#lYW zFd?3b)H93YkB__ZE$o-#oL92s>*NhI*CqugB`J}}Oi|c7{WCOk_xF3%OW=%~5FOm& zfT2dW7?~GET+et9zptwmrYzBf|EiCoxdVXut`56~?u#%u>}QuY{spKt%SHwt$FT-^ z@-ak{i5PEjhVq)NBgFD0KBUj(80_2j#GC;y<$RVL9wl%6^y zM5O6)^tfhCOHccelb;q*yZSTCXtOnyF$y5h+d_!zJUQ~bS_*9T8j}zgG4NZHK%d<_ zK+m2lLo=x-eDQ6Cc!}FL9OZR0y|GX5dXOiY{P7_Z^CsiY-^(C=T|NxgYm>D?cI?*^ zryw@^1oDi>*)I<2G@$e~GqmdlUVXlmuDc!%s>_T>ZM{1zI4($cAL4wl2WKOXijn~( zBk&r2gLzGsw8O)WP27@=$@*4gUtlsUpUmA^)|R8+6p9Bkl<3yaXHoZw7(JzOnYYSA zmh(ATar}{1_E+hAtPHpTxeI!6^@+`J<@7>&LR5|(lQ&>jaL!L39nQZoy%pse_d;3! zBsQ_85hN1@$ycuH?eKgR=<2&+g~e9#_3~aw6pq1*OWv@HG&h64>2A7LT!SzP^~~h& z9_X6Zg+;rLqNwh+V#h^AuoL;melna& z`u~ZNp@gYqu<$3oN}0)qm6+m`)9FIs;hb(edBX*1W@%q_q z;P1+!_&o~pfAny-(ls0zc?O%ApnAUvvCNwPj?tKPqO{ke5Ze1oS{_=>Z1D0TmmXC?*^)YT zjJ2dUSLPA@rIp;h))Y$kzi`1aVch8&fxlYP;IOp@tedusv}85gl_;*o$}SDEUSfbx zZ<^zQ@ty3Jz(U^XLq+(wdMa)XRKNkrNFukKb7$R>pj+w#$ zu@@lkvTx(kJE5>SH5{0X4|t^0gB|cYkI6qjaJ}7flx=atWf%9-u|P4>eP=4v&QgQ+ z`_h{K}l&@ZO{XawBCxyyF(m{yCX9%TXP_sl8-0iz1=(!)y?qoCTeo z96L;WE?$U!1!>_&*=HAQaCKh+1Z?G=nJ>gi#=FOGwNMi8%f><48fBQUP=QX|W3Z>I0Mc%SHKZ&2e;e(8`F4# zI}Z~!qyND&RGrg_&gRW_jN26UBR8`jdy~qnFk8plw?+30RJGIcfVlx$*}IvUw4 zHyp^>#Ho0;W(_kj#}zh+eCLBiI5QIS0yg$rkw-Rba3EA2$F`NA)@81faK9dVGZ(@y z+XPgk_c>wX6F7I=iH!(q=4Kw;(x%!HOm|6t-LR@5}!3^K%)_PGeqrmlA=k937 z84ev+UQR54BVq->D7h20TWX{>?ILD&@5lL9#OT>y7np{RilpB_2oAIhQLMLQhVr6t z93~LWVl@c1?qF^`oJv1k)FM8gC)2GO)96W%Af01bEcGgZj}1PUrsD0AW~TedOjxnEkh!8Ji5iooSUaU1c>CxXMtiFYJ)pgj z_`Pfb$)R@k%gMLLm^v|L<+xU=-mpYE_M>}RNc?a*(HS*5<4L3-5-#elt6z( z%M!(7PoZ^egx$1pCVqF4!s8Kcq;V?3_Ul>j$F-EHbR5Fiv`*$ZqfDkLo@aI}(4rcn z_1GmUNUb+)pzm74Nz18ojIRG@h!II8d8w}<(yj-;X;$IwQ)RIF=_`2p>`GiuE3+vkPCtKv@t}4p!4G+jlZQe8PxHBG)0-FXo+%Rfl3)zz+Xe1MA+s zftuK4$e8#XE(CYsqZhfvrQ-y<*+&f9i5blmT8f|U3Q)1_V=!rFEQwRU&D>B(1>U2_ zIA@jwoLjCyZPwaCsaZTI9`T`_6ONN!;26RNDY&>b5(`6}$t1@>T%sLJ&YE;#zU zo^T5rWgSUl(@kb}=V8z`NMdudO;K`UJcy`PK#-^>nU$nV6gSjE=r{Tyy5Bg#AVrEgGIEhuygDgEo8-NMW55u-!VA{PYMCzagNnQCBtBN17>n=3&_|q%l#-kNbQaPJ` zPKX6ohRd^X-nz`koNsK#Jt|Kl|q`cTQraIh@B&v-q3V;ecRm6qEoQ{RV+ z$y?`BZ01EbEaVHbhKo3Ul`SUa^dZES^tJe_e}b^v#IO&oh`J$+6C#buj_jR^&e? 
z79~Zz>C>Z$sIHTM8+H|{wk$RO`0jYtImBy^*0$&c)O7%A6mC4xquMa5ANG2Z-e~vl6Z@ zMDLq6XU(wO%+mL$d9N6FlP{v7qXmtqb-|U1D~Rh7J$T^7c{H3P z$pn~2TII^%NuUI*b$1|DAp`KNKAJvP5NGPMoJq)cOZwjyZ`SPlDcmmGM12K2SjQcA z*olWy=;L$^A~QW5ofj(*_f>_kI8Kfno+C@jR)`aUx>fYS9L@+{txbwN6lvNbF>Kqx z-6=0NGqNdas5h$xZKhns;e`)Dm%V`228*%gW&)0my~4$#b?|iYVyanm3`Pq+V@|R* zYGr7UH$6L{_l7NUb6O#+T+EQWbmfn;jObk3Q+3q(1F zlTxG!J^50ON?!FP<7omQQg)umTAyGP=DmcFa!FFja^8ugLh^d1CDdGMVp(NP;+dMy z20Djf%3ph8T4F;Fw?r}lE;C8j$~rh~un!WpYSAt$IV^5`2fYC)jM|_#ez@&Pi&f*X zj}az|XB(h@t{qWYSPmC_O^8=Y4Rzr8kw-Hn@v_$lc=`WFBI~bVT4FX>h91PtHFF?a zt%?6~yA&?lEJ(xWd(n|QO8Ci>%Nf7_2zyNYK+WkSTy5P?AIFhf4 z<=tPLW;BdqhwRvYO~z!?PE9%xoB$iu%h2oD8N12NlIRtE0$wfLM$J0*koK%f>{kE6 zgj%g6f&E-Zb?^&)J#9?pKa-VKvh z&n98t6<~7vEaFle|n27sUR9yHRFh=iiudok0M{yN7p}ZDN7ZlLc^_9@eFD05>PjswnDRs4} zXRfvf(}IyG;>Fh{erF}HHN=!uSf7S3`)Gzz#de6MriPGES^#pDFES@3@E1w<|+w^-F~5$+s~)dj)a)utbxaZz%8rLucX;8~t-r5X~R~@L^b`3hYRh#Oyrr@)_bEMK>8O=*m z!L7tS&q8lVNs3NylJ@;t=m62@5iUsjd6_pB#{6p-ko|RxIF$Rx=XHwfa z3=VR4L(#Tk`e1VhT4YD_+cpftnRzS7yZ8C9<&QXyZk$BFcF15C?;9w%u42B7jzeYt zVQO->8uV($?0yvOCy&SC@%K-0G+p}u-(`p53haY9lk#|ivCg<5PmFAMB}pUrJXoBb z3-7xX;r86usC|y1C2Mcv4yB1yOVEwpxn@Um6)Uj9WHMc@_m%n4fM=Fp7sPV5<5$%+`|fMfAx{%`kGGG_6N z%N4!^H#s-5Bv6|%JUo{+vAWP#)Su>h>%$nX8KLKeAmH@qR1gC5og@5~hv)MC6V0a~q zo5zz#ngYSuwOUm1w+t1Nx`^V6Q|QV}pbdH1WFg;zJUW<#ff4)Y+T}4sd+r=~HsVL6 z3`}r1y_eN|vXul@G-2A#W$cL=)i8JT0G5uW1S@kR^2@Uc4Q}hSb^C zk5s6|vLE`T`7gIrF%0VpJ72$nO`WOCiRv{p;m#jsEw`&2^liWom!hFAvJPLd?&SC3 zNZMLZg0m_`Fue3Cm|TB>nFGT3UwQt6%}N9#9tNC)WsxE*-2Q|gIMjp(9&}-Ixj23CF^(>E zQ>0fOZDIb3RuGHAKwNrsA-=2A=Kncsf(NWa*w$|~?2dG<>v0m<&kGFbL`DUSD-4Oo zxpt;XNtl}4AI1_cYy4(~3cB&k=$Z*1nG1t6;bH9&#$CvlbG%Ol`2lIFTTGxc#1|F9 zTkYyz>>|&~2o0Oe@mM2Pk&8dM&+;V^Dt*+PE^5|gs}GpL-&Kjs=?yVBxwH#ns`k^^ zggnOS#^d@Cl|=IG%~86e+MBeR#L?q!W@zwt5?wH2Mu#T(*&V)EjDGUgWRu5Ex+ncP zFMV$v`V=q3MJ2*CdU6UJdtgcm+-8yQXBBC~nSa-!m=?j*y1SFR`+J zIW=FJ!YQ~@v4EQ+O5gz!a`Y>>D7M4OuIKpao+vCj7=-hKH1TfdOWf;PLeeWbLD)JG z;!9f?;~j5N*tr+wu4>TBLg!hN$5Y5X{qg!u49EMncLIl8 z&w*}8?=qs(?~Q?L;ZywZ)0+C6o&<_RZEU5t6tuM(kZl*Tq0&+eS@tda^06p&>}bO4 zlZUwX_P6LdQUo%#4ZJT)H_-RG^&oQJn9Isb5J8?gIsMv!`lZZ6p^K5M^=uzz`%D4? 
zd_McnK!WiJ$i-Tj+0Yt1$b{M$6ZW(K4pp?XU&rJ)=FC}k&-I7=sgv#T=z-7hGt>d) zqO-8JCJE0C3qwQ40XV1J2*-j%Sik$)a33_Nx9mP_K3T&Gd;Wm;XOh?_-ak>SWj}SB ztxV$D!{DfqI8py#i4qM7c<)?3;lffVU~--5ei8xq>g+m}7VGZ&Oe$~9Run&C$S)~{#RL`l=GIbw7&%%F2- zCEy%Wcaq8{u;2Cold>rqhf4*??{qzy{)w_`WAboja27eDqDR*1AIG51!Rk@X&{TaEwYt={ym*T4jd+9X<$8ce~?Y=xV$g&3*0DQX(;b9IFI{dC{}h z;ITYiQhw18PTzTmS29b%P~#b+>i2~yw$TQ6y_Ixv<6#*2=ME~zRX8teEZslz1_UlM zL`QdJY`wSwR9*B*-wB44?MtH0pEZco3H|!5VlRGL~WTn7ggmSF$1dGyZMRg|)C#pMxhm=G@xU;3WIXM1t- z=2QvWcjlFcqzmIs5%fqVZUS>C%24mG}7klICC-i71vv!e-U-FLJJFbiWkWEXrxypJ%ePR06ZxDYvz84t@FnZs zbTJ>cWWu0kHv4vC0Bie13yr)*=(*<#WPkKAx~t#_~kMYq^JbLHs3vZ++^+#<#*#}x&ptzmq{kFW{F5@dWNAIa-u7@uei@AkMs(@bR$ zz2ZTVCoJL|*$)}<%4zsvfho)h_8=1f_o5Zu2IYUftbe>GX}DlOZSSw<{ABV(qhbNs z=Y5S$=(>cx)^QNM^d<<2q|$xGDmbxY9y#eXm9>2khu5A~vVyK$ZohXDjehzHzXVUE zepgOI;#e?(r!a|uGEfPeP0ijaV0^`F`subPo8Rh4pN}7h{;?CVd!rW9t{sQ=mnOi$ zVR8C2brW7H9{|s7*}wsX$gES(K%IHZYh12G_wTkQCExS$L+l~4B*+~3_uNr>lQwg_ z=m*NDe}Zj-ee7Q4BXmY)FgbA|6Te2CMU$aGI^k^%%h%wW}q^C@2Y1To>6)VhIln0;@C2g4uX*zh{YIVwR{oKqo@=c?JC6(=$G z#wgg>cd;i6CeWI(4fM0M1U>&R4yTorqVc^K*kx5kWgDgG&+7BsOmhc4H9Lo>yQV-+ zl?z>HQ-e46IK!0n{*1u0HW0dgfLM-?<3E)V9NH~UZ2a!?!>0$*9Sgo-(pCk=VZ0BU zHjLs%u?!F%PDPVSahl1ygUZ>u#Pz}s@~pptSixZ*XbyAbU2C+s{^fz@0X^xP;r^J=!ZshJ4gWK-_?x(QpHtef1m`)@qZ- za#B=yFpgF^-DS?`%hC?-EXXr9!`*|F5!+x&e#=;rUl5Nj%@c6{y;^AB(1~9Zb@9y< zA2gsdo-u4vF{|}xKFzvz1U?@v zXM$#B(hoHPq=DRmzP@$T_md>G_ID&PYmDhR-;2C;C*?rz(+!T%w2zETt6;a5l=A0W z$q|FhC%9BBk|_V2$vo~3VzB-*w7xovwR{Fo8~+BI9};xvMj^!ONRjJ`j#R)}mAuu; z;2G8r!|9i;$m$57isDQn$nSwGHS@5o`aT}Iz8sghQFi{hW6;CSp=al(@o!}d(9*0! zxKCLU$GIJaeajzP3yXeisJPFR{}Us@g%{B5+BYV8W)t(oVFO7$w+Q|_;mp(FKF6MQ z_c2}h9n&qMgfa>Ptg)I2$Xq`{1kQ1cF)?dcnPo}>A72AyXBF78hO(NkLTJ;eeE1M{ z1s%9OVNv}vR4vG5f*&R@14|6ZpTP5Y(m#hxZct#aD2G!cdoGil{Tpm`xaT>S(PNL5 zqg2@uxSbKs^w)+Ho(qo#e)1(jdaH2uBaXd(4WRtJ4soBaN=Acu%(RXQjBC^2Y3IhE zgrOAsxT%0F;TXR)A=`;$!!BBF&gEvmZHB=}GukmP8Ta|eGCJ25Q4=ve`X?}qEV7Bg zg2ioc`j9i->9Z4V4JVT|x1!;f|4H6u#aNoO9+_mr0ygl)I80f-3-{aG5jY*kmT!K` z+xz)G|H+jC*x+y)60FM@eR(Om?N|U@(woQjib>W_JS|KPR`!7F1sU=+P>l|}F(cG$ z1)1ZlL<6TE0+Ghc@HQ7{bF?BT^usRt;rxB!L3B*=fK1t7#`5t-~E3dwg6>MwZF z*bplym@h;-*W{r98^t@lTaEfUW-z($hCooqkeRT{jJy(ijz=~P!R%#IXzET|HZy*l zwM@SRKJP?G#T-vE?0B2|epitv!Smt!hCA%NojGUeoVTM$q#vZ9(7EOY2!6@D#7z$`-l7R zUds!7$vxBUb=0Wn4^e9HyACb>a=n~S`>BQcT4;Oo3PxAw@_!4f(3yw1JxFT-^H1_3 zIGokMJ*gA;0~M`s+;g0L$B%)A(0*4H7WE#l zk1zfWRWU!nJ*|;_`tTa=eK`;MPQGF8i|WxAfuGqXkGc3Ga~aWTa>dGL{&e@rZBVh^ zo|^mVk;qelsQO+K!mYnzjn+MQoheV>zCXv;?&Nkj3zeDemp@`!y(ir@vXYS;JPDh} zhVT+2Om}f-P3l4?=H}QJTzSC(nAuu{Zx%?^C$_Omn`h#WHtxN(OM%_vaD%OiQ6h4a z8ey824ds=dh4EdZSnCJ0RJ;=Q?tB7&<(v88L&NCt-y~{z*aiy@ga!jnundpMLNaf%gydp;>l6Y@qRFeKz=Y;fY>Bys>+(`jBJ~T7 zSXNU<$t>W7-Ns)7d&!IMDzs2_BMQwEBF^ji5bFE}H)$~x!+faLQwP%5vWOorph-99 zv_VOBJ9su+BEyME@Imk=^S(A8E-Ai&^}i3Xw|#n;mF;}cGAKrEVF41x^}d~M)`M&5 zHTK+nIdV%`kZkb{BI04A5E_|E4*4gO^IKCebmSWH9L34s9FD6wO@eOZ?u&texs1Bo zD|U1*cZQQ5h=@}o6HY#7UV807AFDhl@^7}Y*0Mw$`CII_n29uhn=YA}@DG*NZpI$& z&MrT2o&9gl4XE5m*xt>3tns#PE>rsuQua^d?ds>)bn%ZMfu9YMrU!U=FD&^Z`nL3p zoF3h~0#UZ)J|jAV@@8?_uoqWc@Q14{ng5W&qP$7u>9<$7DtbQtRu`hVMrW{Y!+ls9 z&7T?05q>7y_WR7<;Tb{&2_BRnaeN{Uw8*r;{ zE=h$8He-0~q9NJ$OOQQLB1(@%*|JHqb72&c;obe+%=|+G;Hs_*A?fWX$u^^(>=dFN zB241j264JZCAQ?LU|xh2dUM|T@BApJoyKvujLk8#ARG2^{WLTGd}5U+K-u$duy3{@ zTnpX`_pW7uUDps6(lT5T?ayw%qQ=a)^#=TFwNW{zn24R2N_3X-XhumJequ`SdW#Xe z^3*r19R182=X#?}8xm1&cmi1t;n@AB1&bBk*o=;NW;1vHu$ujje^Sf?Gu!{LUxq(1 zbLMjMyU8NtHEhE16GUr=`*- zm1_8Ut|j()?PC`xd!WgcYv3DO$S?3xB4KylfW&5HwB2+AAKl|-7-c8%(luGO=Fw5c z)5i>Vc;utFUq9pZ$%?vaq+`JrapuL{i9~Ai2bgN1P5u*iVyxHn;54p_e|0`QU2MHutF1rv?(?Zga5fY-}0f42Lf^0XURvg#-7 
zwBcjjkrH;lr9L)QrGdyDCyY5j|s+sd33)=yIAuW=mGEKCB{>q?Q0ODWM=M z6~xp$`3C+A1c_p3CjOP0K_k@nG0~CSyjwE}++JGKk>r=`%;Y|{%=aN?Y_q46GvcXU zK`gA4|HgXf#KLjwFtRkpl9|;dz)lskAzN%6n2cN7ae1vNuSL*``Usigw!Gyic&i5& zRWvf;c*=H%f+4oPScC?{8?ZX$4ZA{g3cjswWqqb;lP^zWVHKl54u)G{+5wKAwZxQc zbPeWmLeof$>@74)N@n-Qhof}PRy*qu3GfN(!TgERgfUjd*6pFF<3iX|CZUWhw~wlc zm<$iIr0AfG7_6yq!{TrLwDnH|?ljLK2Vc1nhswEVH0vo&IC25}l5fL5?r*_}>yvHu zjR+B0P~)?RZVD47-i3RZ=Q;D30{f-(zusY%-gv_9mv$mcvt8k$<21%CaU+z=E~PL3 z7GlKdQzXQr0afmn@%z-Zz_D=z)aB*iR&zKh&>u(r{&O_oh%;$OeSlBriqZ+YO5vO4 z1F+Eh1{U|$fPl#j2w0Gh>B4H{T&%n8)tP2w<-v3+oT|gc95T?hGJ*;!4f5vAN~0I_ z_md2HY3kZ6jA;+L*t>!+@Xq_$RN(MyOo)~uhxfOjO!qdLSAU7EkhW&y7#+4-G@YEh z`T*2df5(IpPpY}$A6m8R5cwaYuwHsS$v6`Q+7%Ww)N~L8U2n2^XOoz6u8TSK+67ql z!k!lGl^|)72grP@Xeg32tU@HCWcN)*NM$9w=YCX5h^A=LuOQfZy3gxm*c-Ufk7+PWOnP(tXQAXeB zF!WNC5pmsFg^}s6S#5~{CgeXADtu0exK9zn%j?C+G6xMZ%*_jbW`^>vPK#yU4863? zu;g-TTo0*@VK+>_Gs0XCdkm{?g;5732im<|gibS2MDch&)EUZ?pwm0SMZuihlXfKU z+6!5i-~t-2WJ1MTAh-{iee(Tl4JChOombH|wjqswT zSrgGC`#wMZ$^uH}*wb15XW81&Sn9WX3qAVrKI>5$Oz*il!WK^#+;ZzSHZ+g0Q?2qq zHM5hGuG#b@qK7}_y~1nre4?FSveN>YX_JlP2X z_FKrn{Mk5p)l3Lu7B+aFDS+H_+1RH!jW+qNg|K81a@#urjT4VEDkh55_Vp;QnPX7o z{IDmNWP)Ljx+TdteiqO8zUO+$LU2c8Ip@>l7@RPHF8^UiTd%F5&Bbr<(3@`d!M8Au z;bcxUH_RnK)#!$Kd2E?;9?eU8iWkESF{hvyCB8MXZ6RrB zdSwgG+_wr;JJwAT2}A-Qu$PGXG(Y z&0Ofnx(xnnC(_^R%whXmW!~gjo8iyN(?=JRFzm3U&Dh(5*j?ON2AKt7OYU(O8tcc$<_A*Z0-gPN;}@* zF|$0XailQhViJ&Wm%*6o~!iVq7fpMW!dw{bKzop&5WiZK>+<>o7^V5M^63ndtTay634ho_;e3K_8wo z0bECs%g_wcV50#W|MfGERtj*lSuyHg^#&g;7cv(RKWh2XJ-2vJ@@%BRZ-+kcE!4RG zi!Ckg-Azk;;^Az;Hyh|}Vu#YbAv|ItnLd64H6`uAA?Oir+su4o-*$)Daz>5BdL1U~ z#+$+EWju3vz!w*tv7tVD{$a%hd8#yJ2f68a5!1p1X!_VGu#{+n$`iU|$)mf>`&3uZ zBKP>S`j^!!E;OL*lSo=4+Q{#)04nDfhweSKjDqHIeDOw`1iRbQHIb=U@MaEAEJu>= zHP8jO3CZ+(IFHkTU&5oU=P=HSPao&bhTA8K2uct-J=%;|mH&a4vL-|@@e=dmH;bF9 zuYzxpI2+>R&AuHMVgJrLOg&s9=<~8>F#WKV=9bHWzq|yg&Kd!OB{^``NuJI)x(vyM zEGSJ?CU3NF!SmgER3`K&Tc-RUTC1<33QuqInoe4g=zqf0pPQR{+B|9ck_)I^5D9O( zx0CrEnMkXDV4REi>TwT}R;ke=A8wa2rk(8sT_H*(f2m44RLu zp>C@`gTU3T#4&aR3l+@aNbeu~D8+SRm8M|UOpCBVDR zBz%b;CX+PgK(Z{yzyHMU*ZGJGrz7}wUdFbO>GU(_1N66k11=&7w35q*5A-{+Ppz$p zaAFQ=x0;F9@?6PGp%h%QS)G1tJ4_@#7{lOr0i(3rgf)ydLCrby_})*=!SZPqo;sz7 z|9RQq-FaguRhxkc+s-hX1Sj$AwY0EYCGU#n6niSIG#^}&4VageibTrp4)hlCS=|*a zFxKk`_ts`(qse>Lo$GF0>!-uMC|L@7MHkW(#aZZF`iyn+^aBqr3mG``5UdjGsTX`Y zfj(RHp5q68#`P&n>6xS|Hocxlyp6p$E_WG}YP@FaRk9$s@GkECY0k#!b-}&nqgehu zjTJf^4k1S`K&XcTIWW8dC1-H{w=q+;%UT!SzT|kWo$9bcr4gcLhST3$UPHRl5SzW% z6g-<25(TB%^oi14ypcPV6l)DYOxGuNP60RnGg?HQW0mq>c3_)g7FGy7h40nzvUdkR(TI zJW$}BAC0)b8jm;bp@>s&XwF+VuNjQja!W-LDMI?~Q3?!VfUzbnXhF1>{U<4%9wQA@7R^ zK{9=a^;&-n*N^RiO_g>y@0JJ^>pIEgFMAFlceUx=LI>({9P=zES=O@M4V`C3F?Dy> zfoc3Yd~(Z&=2wf*m&TEtZzhiXU9*rlFKxmtiesSSQ3Hl#98+X&Grh_23R2nQKq&gG|^J5mn%rZ`%;|>LGZpImJ3$OF(J;Q{bf~!Dh{mB8~~);d-D4 zj9hSnC{8@jIeim#x2+;J#fB(b_78Pcjrn0Jvc&RLBIt66 z6CjV>kMJ{Vt#MxedZJq7#hgl*L_frFEQLRPFfCw?tx~b^gd_N0k z?<|59-)8U+y5D8?UR_K76Xx^pbNbu8bLsF;eiGdgbsEMJxtz88dkB{BAXT5wp!Y^` zkX8&L0CcJ5w5-1uCaed}3^hO>KE&x84RXWeAFy~2^~Ic5$QB18x;b_z-b9Kw>rvJh92 zfwd7Puq!Vb`!4sxPm|YpVf#^3sMe$|p_^&+n;^{Pm|eSIDVrsa53cA_g$ooT59+Zs--S>`Mw-2K)fA>$c#!~`0a){S zDgE4FY-{i@m@WI~fM-5baXymeG;LD`Ugy|?Q+7^=nR8uf?;UC*krISb84EG@%|*0Z z(*!dX{blG;7l`ft#g3V=SiH;|4GWb?h*vYdm3$8Z!V}2mXR%oB!@aQP3+Wl>NGi2G z8lC$lQ+?-0SY@S4@)j8pQ>}Q`CdU)QzZufUBe`JQGM$#{-NB1>hUDVj-|V98MW|+@ z2s@5xL1Wh;s<2F&Zivf;*UxXFw{SWBl+=OyQ+ULqvmHO&c!ts!J~8#GbC}`=PbfEE zKz|G#f;*zYBttI}tWJ*G+K1dmrOyT=Zul468QTRRF6)7-VgSn@Q_&|%2O_`9;P6p( z^33ZTGx^(IpUuN%Hg>intL5;*`4iChTZ;I| zY7lxOles+GihWozg?Kx1*w2SwnaYSH{F3iS+Qc`4DYr9p-*<)Kr*Q0qV~t?oJdHSU 
zKFOit0=B#H8P-Icg{_-hVc#`%T=__eJ{J7S?eZLmnQ$9RIS;&)wm#RZ7YGl}F=W>x zH@=_l7j~m$Ht1Q&<5=NqEPfCT|6Q;l7C-NSaF`GK{*N?2G=s|sw<$C4@5*BLiDVcG zYJ%q-u~-pv4^y60L-F2kAeJ``<=iHdr;T}qeoy2Ii`Z@E6nORR&W8%FS3^EQmPuVvRKN)p}0F1W}1INr^? zgR)C6ByOD;C4qTGnKtodj3s(iPIa(tLc-5R(3Vk!1QgkV3^KDg)+^_^6PTMB6K#@nsSk^Wq+A1Z7PFZ z>kDCB(0*L8-x9xYI`#G$Qe?NL328OI2ernB=(IOI4y5GmgD$yjXjbF1>rb=Hl9p4G-AvD_%;;>r%fgC3T@cwsYF%zp4>P4 zIv(sVfNOnRkK{!z|9Q%sNSj5HtgqtqQ^h0P?_GmpVGE(}l^j~vD$$Fb*VqdQkHGm= z31hG_33HG1uwJP<;K%EijGyWWu+nNolWEJ~Zwx~j)T5kje)|N zRM-)|jY`->qq#X^P3%N4G82T#R|aImya0GIx|MNi5rgYG3ZPpP&bk*pfP&fmaOd+M z7^?OFBl9i%*!T0XbAlivm{$omtW&67&TFp6O952XdDPWRj%Yn?W@Ji#!?IHsz=~r= z@p^wT%k<*d_RS`6II#)YCnDsKgD?$XEvZNOS+sfn4KIz_5rNO$wuTS?;lo2*4(qED zc{l?BodQ%eVAZyf_-iG{uPd4I zbc_3SaCaMGyZgJb|6h4S>dW8wYS3;}5Lalm@!V2jG5xC8*f6^1mhD!In#6#L$=z zPMee=b4L(_KD>scDIX_vcQVue4siTRNl?AJm(#-zL!;3;er|0$<941q?@A2GkoN=> zaja*Gm8;pV?iX-imnI4GNP+hX!f+=onk?o`r6z_)nG-8LFx=J(eb=4jEwpTC_`E(A zbcz@{vRsEQNsMN_!_MOdu?O(#z$;keWrwLcnq)9;KO?u?0B)ZVf~<`%81==Q@sPL; zGt*I;UN^46)+0Be?|d9fUvxFJ;dFl8VGD4us-SC!k1-2UlQ2H35^8b>;AegV3V9`C zjf)0c3f%%@%caP?(K2TLJZrwxU3GH&*$?KgXfWh(d-axmXF>l1m-P}l29{<|S&glG zh~$fTM2Pc+xkM`A?qvJ@Q$uBo?P4;;x{#T7hY=^*T#D(_1UDDw5=MO|Y9LIt1 zG>qKY3pz%PC|A9hwg#}Y-Z5t?ADaLHV*eV3{k}uo92@$iI02>$++rg3 zT;k~;l>;mON^+pXnrgg%&$tyx!*GN>y)SqXZXYdW;)f=Xh(Bqdbn7(8gcmdRTc!|W zv*XOMR3m!AbsZj0TY#_ojWA}E2bI(Jvgbp6&|}dUyZz-ZIG;O(JZg(!hsP%4{O>D> zhTjOx`fS`GMm$V z{9ND;rzh?&+sYWj5`3(F0|iGHqvFQvjHFpEqsYzq5$}TVj=U~r=p6wmrz-H9ILQ7} z$wr3_1`Mw)6N-%$*~BS2Fr9yz5gJM7pLke;!3MeD{*z-{bNSG1;x8eaH;0~4QG>;w zpEMNT>%?bDU)k2xS!}|(X3$9&0$$T~e){p7Y}XV4xM%E!@);^{VbeI{x-$_E?@D5K zw8cY_Ln@3=&WH7tg>#>0GsEkTKn1t=9CXlvOyw$&2+Xp*`dR>&9tG&m7-jBy1u#2z z)!|5hAuG0z@-vcLF}J#tua|a*S7P#>X-{>AxP$i_K1LdVd1ejo)VE-C9*F0=b5CO5 z>G_~mcDq5Px(uqSzCcG_7dvy>0?upi!YGtnWt@HV(bun&xo*b?YmZ@Q6ZymxoUY~z zIB|?oOA+Ra_;GN|TgdTw+nH~g-(h9LZB~B+cc%7aAm>tt_$4vCEpB%pJjxWVaX=dg;?udmNmVYe9C!S~m7n0?9y-^CfRW*-dj z*Mf((t8cnMWk5e$R9pcIbHku@LJbsNFJzLo?1VtWQgF>mz^tr#{FRf(3pl8abuZP4 z)lYkzGGl;!@VpjR6}9u_{w2c3u*syLJq7AIqVd=4!|1?-!=>6hkh|B;$Z9%}3%};W zea_#f9B$p9In|N>z0?a*-=BoQz&QT<702=B`&`s6nulvLSFpy(Lbiujox@{OL*VT4 zPpoD2Wmq7#9ECiRF^&Di>WZnO!Q%w>)7C#2TAvL6@4(>+XFzoQOXf@BPsUMXHnS)! 
z0!3GxfLrggz*^c8&VIgxa~~SB6H*m$_;MUm$MKg|+?j`Vy+z2iy5f%|!O)bqgSDz| z0PRSwTYJ|-%;v{n-^EAB8%W}>`JRqrx?;=(Wnl(O2H;+Zw{6V*blzgqYv^#L1|P)s z;OfOv@TfEl0yv#YY05k%%Y70Y@9PKDa_3ktiugHe9=;eJ$Er;3uHGw;>&$Ao`+W+A z>)e8_z#6=^YmL#Dcot%;ebf4J*@i`0Z5>*!&spY|xejHtJs}{7SNC zybas=>Gvk$UxS0-`SvKzo%D*0+MI==tAt^yW+ZcupNJmYA2Kpa$5|i8(~#<=K(A*$ z!OiRKAj}~X9$gxMaE*=3>5M64*R|j5=D`kF`Rp(RyPH5!`9Pk(9vb=OAu6`mOetd%2_n#EDaa@_9&~>Pj zT*tfa%ys&%`N_P}Ka4s#&Jg&t6lO`K@eNIMA@u51#&=o_rd|(YUZ`9{JJDyb$KeP7iBxF`w$hhw&1ZAC3sP7g^I&TkQ2TTgBKlT?(j6B|KJ$5S6{;G zMk4t0TUvunNjGbJ4@rTzC6T9jb^K>Q; zF8hw4fYMFdFG*z_`)(@s*XJ@!3&(>#h>TuPG@Gu;b%?}XU^nadKxs@h$Bp>G&b8Zx zUj2uW`gekM{X8)5Q^BjcJT_{A5%ajP9v%n^f%TLUpi16oexLyEUt(a(E*D6W)I}-v zHryxh7g}pKqe0M4xZc5OR5piE=HqX+Y1> z2&>k59BP!K!T;n3tlYE?lJ#FPt9;F2_W+k0d?|}Rm#49hGWX-d=OfI5Ntt}6X(qdI zmJsc~AH&?sH-?f&9M|XBUnYpz2R|HMqs6N$V4A9jJ$DrGY_}Xa>6rlfQGFD4iW{+%V zdAU4khIgR~0t+^xv7tKSBz}?a(+Ty)(NkxMd9}QN6}mHIh;)C!I&^JEHYjU z9rp#;cb(y2)qIJu$nUcKeoh9jp5KKxOs_!gG}d;jF#+*)^*FO95;v|0W&T;zu$I1_ z*gY)?Rp-^Cp~Pv{{I4J?-HHNN>!%I=eixzXSutGE%3$gZPhs+}Qt0kEhkZSUOkTGR z(nXexOKTC^v{(t6GYF_GuYn=YNSt;k8?OACN8DP?$*0OboA)j!Vc?q`UQh4l>2Kaf z%lJTMa(k1?gK8kGEl(%k`VFVJ&e@$Z3(#~!0gN8jV#fK(^wS1A{2*ov$ql3U;%Og< zt!;;f@D6DH?-Ew_+utXeld+Z^%%Wb%Cz^*VhDLzj=uZiVbm!FwQn_| zW(>vft9qDqOOBRaz6|qt9Vjf~!m3pY8CONko7%fw4x_$g3+T=-|3!8txGag}2qAWtis{ zz3_N_HOk+!A=)4O>93JDr1!GK4e2z|7^mXDqUi= z1wcOFIIP`K1##AMSaY=s5L1hW7Euurcbh?@F;~{=iU@tc&HQh+7;>FK9P@PlZrJ1Z z0Br>&>GriVSvfq0b$erY+S+lj(3400j4k99qa^u!_A=w1?EofGvUKlYC9AS)6+>nA zqoCj(VqWpinS%?g@4>(<5xSx3F`V@t<{6sorW?mU zGyb>4SYaB8iJ$&}5bqt+@;(u?xgBh+@nrfp&kM^wctO{pPPES5g zpqsD!0fn!p*l%U|>~lo}nBMRP-J&~C$wG$ih_yexl`Vud+f9J9&hE%+(7%jH0WS@*{qkx_~O%)6y_dHG6K@~fU6?g-2x3{r> zmo~t8$s2I?tTg%lU6|-ET7qYT2+1lxil@Wk@b`ge@@cpLq)*IX;y1^`>9CX7p=iiX ziWF_dN|fv3=e|RqYD1 zzt9%9Sqnk#3@13p?7|OL@vyOIj0qL&#YY;qpfYa_)PI?Zvuat+)6vIo8B(Q;=u5UM zW+Q3d=0{IEbJ;Gg>pr%%l)HzH@p6eh5Lo%6p(53svr+!(%6ATHGZ%^vp%g0$!FKy_IG>~w8_LElDLlPAJLOUxf5))!s|cJ6E@w*T^Jq(E8~zyNy3e}DV7*!;Q(Rn)rXo9_gF8c7#3C_&dl^=X z72vXe6Co}_pEUm6if?mj!DV&~J=-1+OJ6ax@lF!*8@570tr7XNR+|;y;|tF`=Tna& zIcUvYh`%+5V2RFF)Vwl+104U=qb`uioimMEFIkWGHdw(g%Lp1UFd0q46Upak_3V&v z4ERLtA(eKCm>n+1Iy>aU&7hg|xSTM_F_}UZTsg*WdbOXZhbi*5DHN z0(7fIaJm$rlip1`p=a`S@_mNG?eGu^5kxmLLz-!z4 z*d^h6piP!XiVo!B1Glws(ol`AlG=`TKg7VsFbVctdIM# zoGJu;qGfnz+DW_{@eiJwSTQc*91JL1j;)_`g4$a+z|c)Yk~`}JO1BU2m0Oiz;mMD< zd`c3^tj@=ISI>aj_cR=RmkjB4SidPWle|B&p`!8zO?ZAVpI~apFL82l27WpfOFeiQ@Q7E$q!CVz0x=#t;%~uEG zFRKXC;Y4&)yD<)d){A;$ZLB|RTqR3w8q%1j5})CPfFvBrn85jqqCg~HANw(nDC+dY+Md zoW(}u`}qyf*lAA{ue=3w$t}3#CYPZ(wFlU|T5w8z#CI+@4O43p_=D{&B#dN2N!}#v zNqL81<&t#t@@g<$5=*mvy`f4~0+<7b(X`x^1bgO@zRSh9oBPc^KUTvtT*gI6;T4Ky zEF&i#CqvJgSJ-nThLvdFjLwcf*hTp@>|cur*3Ewu-kbTn z@HGk~*j|nXT;;fNwfWBXMYiM>e0+Zk#({~@nof;F*q4WunxZmqccE1o9|36zkhKL%{c+{MXc zzDg=Vxok$g>=HSt)5_&2#bC%=ffbgGgld^692AiT(~H$SP5TmTJo6StR2Ytt%%kgC zKVg-$B*cX*rc00Sg{zm$h+|VSU3{|+$9(bu+bbGw)%0+F6(Q2=P!EkMGGHs6N#jCA z*}Ux@^uOpCU_8=@Qt2G~%#M&8I*^(fkD@;XX_%@xnNz0Crf4XW z<`qW?JA9Ej_+&Y|e^i2GhK#^jX>ZcP@nQVK59331jw_lH3bVhSBKfr~aJRGwL{=^% zasjI3`;=l-`nDVft%pS*)?;1?#)}CDZqv%Q9!41L-7f>{Dui z-dkVL_XET2eXZ!~|1^n9y(n1ymjhor4d}lcmoaGdR$S^ej!GwbG)M$V5;+rN>h&s##Q_C+T)hz; zUVi2>{=qQcGllLDQo`pd_keeA1w{6%;L^COxY;a_CaqadN)7es_iskj`IbESdv*@Y z(`Im!T_#VyO`CpnpUAdcaU=gxY1r$xf_fji%IY<(;yj+Nu(?WxD$B6oee^85S@bN3 zvEEJ@L(K7*w9_6Y-^M)dG<>mKE@C-vEu>l5wF0~0SmP0ltP(26`V6=BRKwa zVc*Y^!7IcPJGVO$yYWwu_V+m2nk*r@B{3wTbsyaHl!WnWcX%?{10w34;)JGX+mw+8 z*m-*j-K{SH0-we(N#qumY&ZhN52Q)CYXmdCL713%{DH_lY0MFt!2j(31W)J3K-Q@; zW|^!8j;1Yy0u>dSbgdcloZDb>{vDjyYy(}{(cB#;5whlS{@JX#*yAih#HOAl$s#kM 
zCLk1==eNQ*afTm7bs*O>0_g`A(yN!Auop#)=(u?tF0i@>8%~?i(~D9tT4g=4kL!k- zQC(V;^?(tIQ-W)krO2mud+B2yAMJO%WRpf)(7a2SOig%*-7)WAHETpG(l3)05C9KO zXE4Jt7qDvoaq>>y1bdHd1DtL_ETr6_GguED#MOy$PXs-;^)aC8cix3HYnTr<5&Xpu z7T_bEC`}r$APM_AVEa*Tm|@Tef!2jgq~9X4Ozjhs^8mNcAwJl>~R)jBjAc?nU8AW zQpDj-2L78>gE5jEgJRKAypqG+85^{zu#pL>y*US8e9Gba`^9+T@o)Cr0d>;zVvN6| z&jya3`UhFVQ=$IMPyEK1!D}Nuc>OpAoVeda_e%!YZ+;3Ts=1gt=^tOfFCNX~zM<4X zQSu?d3Y13-LFBY4o*q*o5x&Os*bHO*b*BjDT#ds;n~&h(gI93oL~)v$WCwqah>_(v z%ShUtIPjZhMr;x)pzFkEl-jkG_Ino4aP55b(c_+d^J!QwG#NhGT7v1Ti8SllRUF@} zj@Cw76T5^>!_#WQ`qy|9p=53C92nb>61TB zL{u;q3pNN6oBVi^7#Kn~CCj5t!Z!$WY66c}_P9VskPNYMO*OPdQcalGo z442O!>tm#mo|#I%2pN)IuV6S3p+wui9fl{G5z8j36LFBT4679AxgfmXS?M3vzPf?&13nS>VSM#W$Ni^~AOh-R7nW+K$uYXg#)RZKvA7rXmCq0LJ3Nk-%^>0PnNCdXcuuI062 z<8%d5ki3@}s&%G+4ISvA&+jn!tuh|zbjR|}vskoFi1&Eseptw9QPoGT0;6z@)iuy3 z`_$zbxOWLwikFg)i`=QCx+8t9JfAg)=|a1}Nig%^52hn@DK>H*=mG0Nj5;Ae`rK=& z$@lqWleHG{yx|7BXFSLJ`9^%zfi5gDTTBXl>}kk;C$>pp5GKu)p%YM*t*B~6ubyaV zNK+s(wY}i|LxaLu0iqdhgOPk5nf1h!i7)Shn5bcPKi7vh>S2iIDaRw7ISRi;wCGsS z3Np(j68cI+h+cmMyH7?R{TLzNvhgZT3tUl>an|~V8iL5C#8VZDc z0b9_HdXZVPTX4q5OZcQUfvvvq9qxqsaQpXJ%sbIAUX#*8D3*)l<``3YYwQKwmpY04 z2CFeueL9(y*2hept4B4LU&I`PYn*37hR7(q#Lgkk>-A|H_U3X3T;wdy8L_bAcI`I!T3w%Ax^{53Cjg;jWpk@xW z$Mwip!~I0rrUC=@b>NngV6y$qB9c7sBGip&(4J)?RA7xR$&XNHS2ilr?MFXD&2TG( z-Ps71Ni4MNX=01r*N5XUnadE@GTsTB>GBv2{^Hfsh>*V>y{uP@_d-r%BbTiX zZE&Jdmj#gb&zeLea$I@Wdu+}6=`?oTDDJTzWZy}D=Cq<)m=SQ6jmw`yTNc<+y6PTe z&Q>QCU1DTsFNMPT4e-9wl`dJG&CL69ipx^`z}uPs*xka@sdU(3IzHKyCa9Sa{~ru3 zGrWsw$w9oBSw+kgCl~xRbt#_n2_gF)DH4lwANjvV6A5I^B1*z4m{FU++|m)Gd!HA9 zikv25Xu)PcSTmBrr$(Na}}cZ_OSJ9VrP&L>PG$CCFqcOApP&W5dEt3 zi~SNTM2}hCpc3KT`r*k<3Zx2D0B;T7~e;y{o>=l7AZn{mh;VL zpTb|o!LY;ZGhEYHg2%1@v7wKoF~+PPMHXn1@0o8I$2+@e*&{PDn(_p#kM%=NW(pa< zr2+?xlRzUhket-i1S1gXA`FHzm1+*Lrlx}3#`^n zDf;eQ8nZsKoh{aWtC|@8hz_tMqYdzZ?-(+XI&l+=6B2M^MuF z6dkuHBN@AXqD9F`Y_eB}ltb-kTYHxo4yi|v5`9{vzKd3F5hHsyh%?%yl~}Pl5OqDp zsJ|d$Lt_oJzY->bKgDR}b5Xd#v29MyRVR$qK{)(z3JtN#rbovT>FI9+=&;EcW^z3R zcVu5PX6s~d|JSKB==>>Y9b8Iv)68k%wCnJq;vIi)RW1JDJl4M~B}q|G5}dxSPERZm zr_wGfNYVO5WQ}t81Tn)5|>!Y7ryc2R?#1P9%#hBQz3WTKU;mgsT@b^|ENu4iD zY|2k#=FCv+n{*q_R7jH0J(Ous3L)vq-wdNP~L^jbuMkKSh^ZwPaK`)A#1klE0BOf8BdF(P|M5d>KU! zjU1#u_feRAsTc~jMq}yDdT4XiMnQQ28duT@-*cMq%gd9HnJvQ_ob2a)x#UgSmmMK4 z-Dc$TOB2$OAiy%(uC}HBzJo!84e9fCMYxg9zvEy_Pg_RwvOUJJdTunHj2H#GZ_#A= zqRBKpa}duA`_nTchhVe8Exy=eYnKK=oMC3_CNg1SUz70pK9`kBk9J#7YB{FhVpY&QFjxOZpjV(MX z`F9d-vYbSBm$t% ztTl*bj31VhB#tpYbHIdt;tivIS`}mc>I_}AUB25x3-n=f^FjJ)mJ9mx@^G-g53e?*;tdd^x;yreb;C~~!OMs>@)}`|F6e|p ziRHAWzZFX^@gVu2J=J;aN)HJ#q;>ckG+Gp*`Lr7bt5sK>taXa`u7@0SP>@2V3N0cGV?s24|f8)?wU4vT_3zB;^ z`m|dtmGtOkLhqZ6H00eq`1QJ!G1s#~hq?qN{qYk>u9-#-F0`YMSykE_ z0(|-^2p!Ll6CHC!8ttIWeEt~AW-(KV@(gz(>L|r`O5*(YKO$)aa}?tKwqk7ZGmG~Y-!gQJ8X)GfrmOD@VxCa z&X2vF%e|UI;lKiPyP-jx1MSJ_q${we{va04{s7bMmF#n@yD)ekwMqLuj(CUK*20A%1 zT17qRvUv-ex~Ya4pP)!LDINipV=Eyqyb8Sxoyh1y9eQy`Hom`jh(67=BOym!@RVgA zUW~9uFG-efAsCJ~E^%zH@gMxNXP;c&tdh>t9Vhi2mYw{f_l#< z$UT@&#I{M&zT{o>o0L465mZF?tkVUTPuc8>3dBDivdlEjD>hmECa~=xq&-Il{FXdG z`+wrhn>mu;(kn@ZWJJlk?J~q6Yy;dJ_of<~Vo2b7Nus>D2psw}ao6VuC^~-?{H;-< zF%C<~mfposUCMP?DQMF2kP7@{RY=r(<3TQaI{8;S#!Fw>&$2IO)3MYh*fw?(E(|!) z=li|jlvOH>ZZYMsdS-aU>IK^Lbi$kVmDY==N5iN4N<@3fCx!`0MfWv#!AHlE^%QKu z)!T+(wih7+hf_f2^$eo^P=FEvQR2LKCM{~?*k=ppAWwz!9oIhvvm-}IR{aFJ^;jc! 
z_YvjJMmaisqYZW(j$?0IRf4!r2|V8Kj4?)1^sWC#JT+$>v{#=;`E*xXZCFGeUKXMA zB9DUB?P46fol2{VTi73C!AyBkD}L>c1qIV=`ag!wJ08m~4C6{>L}V+YvWt-QKKCI% zG78a9Q7M(u9+ZkAdn*zeqLe5q3h%j((vWBu8EukE6it=i^RGYfe%^J?HNIC9y(2Ji z)gVqA<5*~cJ-ejqM+TL9g28;IJ58rNjv7f(+@M72nRDW9z zt3HJZ7pzF;e#>8i%2&(TfwCq%)vYNEIXjEFmW#3?p%X^yPUf|tN|@#HUEmyE1eb1V z;Fw+FOl_7pD>v!qG@fo}{R);iZSPd}FD4H?bJd8Or8jsenG)q^d~fIbEH?jv208Jz z2FC|2Ad?beagWMAXdBsqi`~ZB<*4pv<_)V!l-N@k&u6I5PU^(P_8eT@(F=0B3UQSA zXyP^X5o~DL4~DHPUkt3PY%n;u&5L`2kgb)dt-8Yt7b?627ERtdA%DZ@$SpqG%{XxJ5 zF1MAJ7KdorvBdS+2)>i2g*Gw``1!mvd0Q-niNY;xOi!Vmuj5jDzfcrLPd`cBR|-+Q zJ{^uZ^@DmfzkTW>3%5!}FhyQ_c=ebAG5;YHc2KedmYE6YN7Vw zU$FA%WjyFTm1p-};PTwA)AM?>_>4_B9R8xp-mOz4KXYSo<(z%2PA?kRs&c$w6bI4C zVXXh2J{J2lVv4c?NoiP5+7FzEs~eZoc+YiM(c(g8FW|R7+N1Gp;T>8VX^iVOi7=}R zYJy#t$FqsH|F{dSRd6Zr9+-I_C-2m!v!=NRh?e3Aa&B8avRo0iqcMuS=C_tqUys4C zmUiF^;gB-35B=^}!OjiwY?&s%Puh8qeYpMs{s}yp{&fO9@=Kt|$(#H7`5Y|q=5z6J zBiS#*OYrILCU($e44K%|4_Ah!lKRn?YEa@h!?yLLHF zm7NAVh4M63@+MlB2XhyRTqgG~f9J| zM>3i;@C>aj;$;wFEDIe~Px0s=Dcts5JcZ zVF$O=I+*W=#FHBJV|aU<1iAa2-vwjySZQt{(F;#U1K-!sv}h3tYg|T-7Bl$CcfyxG zDM$L!n;i%p%bJXgNXB1B8Wr&YmdH4eh{_g-O5tw3p-uMd-d@R7bCbclqgJ%~n`XjiUUXFiqtr+aoCuyHl zNkGkBF7>7&OHh3wxT^ghSu|}c6UJ`^`^{>QYq*qs^e*L^cH}TmStU|%=^d}vo57~M z3N(FEMbD5SYTlfW4?RVh?~hT;W2GJ&Sy)PEp5MuBTN6fJ<#FWrvT*2Am`K;ni6-d> zuEV`!vXFYZ6~)f!GaZW-kk~Q}t^K!f#dEtjyPh%Bl-CM#_)UaKlQ&~5lVzG#Wvtcf z2stx108)E)LiqP0Oxd~)Y`Y4DP#DFP`6csxNJYZZm$1-hT{Nv|HK)^9f~GaPq;JUw zC^dAV&R(BkM1U%>ep^g^lr+e=xIB1bsRi|t?I7IXN`h(#d$pwt$61%ayx0tG!z^t$ zR>1aNkjxTw*7etr_B;Ag&@yN9R+n;{ zlO*w*m_ECC<0y@vyb*o`C8KxC2_iFgF&Vou6QaMl<3hekcH&?Q{wauI=3Du_?k~5w z(pDXIGcz0O6AnVfdTm@j&y)oGISD!0S9yP`0QHvqhcyQQ4KwGF;nloPm8d~(;A}D= zxr$9cei*FmmEg|Nk96nS%iNze@$6|UzvVq&is^m&h4-T*nVEMv^gbE_e~m11HR3oC zJ+6-BYSyG|+7M@Nw2ofYwgH>+RJ`6G;*L&TCQQ6KgH7~LBm3kk(5`7U9<5Pj1$EBsg2jDiyJQS$oaV%$ zDtP8UpNZA&Edt+1%VGONOS0|SHBkB($W}_pqw3AY%*byo7g{GlLL!YI-l?4KX`x^= zq(wrxyLh0C=YdryGNTQhICp0T?5v+o&bxku$`7f~By{5Mcuin^QGf+o(;#6@16&&5 zb-{1GlMepP}eEgnIuD$zsEysuN5|Ihy%m_#ru(!{kA?DH3}-4Gv7 z%B1-Y!Kibf_gNR-CTO#J=f6Onga#3ldjXY?N`#Bc7lUM$1#7raO&x!!ll#6#?Dnt% zZVqpT-VdUrCGZsZv~DJ8yA4SB$w++NB+aSj1X0bio)GhSKY4sTmRwvsk?gNBz)z*- zEO_oAwEWbLeHkXCXxJEC2F`<+=rl6paJ#L!n;lmmlS#(C9EB$R26SE0R9d+PF}%Z^ zosYT;?oF~RUX5qPXezMArH^@yq6iNq=rMtxIa^U~$34<{fGaJxlkyc0`OIq~TQro! zu4rel25Skn$omcC6h&ZIrDVsZ3;LA1^t`uCh)+FlVH$wv7joxs>2_0SU!G-)6+}d_)Om{Pa zQ@ftRS+6oWcCi83pQKMC6z8x_uZJ;hsyFPsE5fdAR6&&(+DWVP9F6m|sn; zyOTib-AZD3^#GbEpTVjATCA|}GdE?6BKv$dl4wg`z$YDg#OC;A9O*F~*Xo?ZG2`Z- z!^NBQ`h6W{l@~y>aWucfWx~!&ujG8>T;Z-( z9rOshsn!N>c1y1a%lw9iX9#NK`TGU z;tAW9ueP)N?eLP2#sSFcbULrP$6M8S*Zr2^EZ_*~@wc3rt3n>SJ?ZQs7y7 zbDoJHq(2Rot>45{f1ZGaK^~BjSOeRfHo;HxO-wi=42ZHjM)$qKE8T@K=j(gmiV87- zGlm%}V?p8y&v(qUfqcIh(tLd!D?j=J-*x6wjUo-^)msj8jKgvMn^!n*V*x0 z1j`=U6HS2-W-AHd>?RX3@fP3tTV4V4Ctkzkz)Pra=*Wimh(pg!864SKgIWLCl68Ks zIFW^$X-~sH?qXUK%yIn*i;N$^(N<|ftnZW0ZQa8_lB?cm6*Gi7|Hpun@BG>K_k{|qfacYF=Ev-Zp9u) zl>5^FMbBgLVe)U-=KchdGR1K4aa!pqoP8oYZyUX=PnU+i7TIq+Fx;iOnJ3ydT;HQ&&M zH}}H%iy5|SC8j}6;(64YK45o0d<@Z$Q(@XiTH)UKEs+0rE*?2p1>YPMSs9=C_;b9U zeodn6UNfJ=F*?I%qE$(+LjsljqQ};Z{>(*g7viWa{xrAr9jTj=1Xq8_LyAi+KBy>w z&j#W|(=%L9T4GOPLLbuJdPDv>>41X0$N0^=Dz0n~-+wG~W_NW)k;G(CW~zG|)s8G? 